Apply consistent code formatting across the repo. Add clang-format and pre-commit hooks.

This commit is contained in:
Rob Armstrong 2025-03-27 10:30:07 -07:00
parent 2cd58fbc9a
commit ceab6e8bcc
782 changed files with 107230 additions and 106548 deletions

49
.clang-format Normal file
View File

@ -0,0 +1,49 @@
---
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: Consecutive
AlignConsecutiveDeclarations: Consecutive
AlignConsecutiveMacros: Consecutive
AlignEscapedNewlines: Left
AlignOperands: AlignAfterOperator
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: false
BinPackArguments: false
BinPackParameters: false
BraceWrapping:
  AfterClass: true
  AfterControlStatement: false
  AfterExternBlock: true
  AfterFunction: true
  AfterStruct: true
  AfterUnion: true
  BeforeCatch: true
  BeforeElse: true
  IndentBraces: false
BreakBeforeBraces: Custom
BreakBeforeBraces: Custom
BreakBeforeConceptDeclarations: true
BreakBeforeBinaryOperators: NonAssignment
BreakBeforeTernaryOperators: true
BreakConstructorInitializers: BeforeComma
BreakInheritanceList: BeforeComma
ColumnLimit: 120
DerivePointerAlignment: false
FixNamespaceComments: true
IncludeCategories:
  - Regex: '^<.*>'
    Priority: 1
  - Regex: '^".*"'
    Priority: 2
SortIncludes: true
IncludeBlocks: Regroup
IndentWidth: 4
MaxEmptyLinesToKeep: 2
PointerAlignment: Right
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
Standard: c++17
TabWidth: 4
UseTab: Never
...

100
.pre-commit-config.yaml Normal file
View File

@ -0,0 +1,100 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
ci:
  autofix_commit_msg: |
    [pre-commit.ci] auto code formatting
  autofix_prs: false
  autoupdate_branch: ''
  autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate'
  autoupdate_schedule: quarterly
  skip: []
  submodules: false
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
      - id: end-of-file-fixer
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$
          )
      - id: mixed-line-ending
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$
          )
      - id: trailing-whitespace
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$
          )
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v19.1.6
    hooks:
      - id: clang-format
        types_or: [file]
        files: |
          (?x)^(
            ^.*\.c$|
            ^.*\.cpp$|
            ^.*\.cu$|
            ^.*\.cuh$|
            ^.*\.cxx$|
            ^.*\.h$|
            ^.*\.hpp$|
            ^.*\.inl$|
            ^.*\.mm$
          )
        exclude: |
          (?x)^(
            Common/.*
          )
        args: ["-fallback-style=none", "-style=file", "-i"]

View File

@ -31,10 +31,10 @@
*/ */
// system includes // system includes
#include <algorithm>
#include <cstdio> #include <cstdio>
#include <ctime> #include <ctime>
#include <vector> #include <vector>
#include <algorithm>
#ifdef USE_PTHREADS #ifdef USE_PTHREADS
#include <pthread.h> #include <pthread.h>
#else #else
@ -58,15 +58,25 @@ double drand48() { return double(rand()) / RAND_MAX; }
const char *sSDKname = "UnifiedMemoryStreams"; const char *sSDKname = "UnifiedMemoryStreams";
// simple task // simple task
template <typename T> template <typename T> struct Task
struct Task { {
unsigned int size, id; unsigned int size, id;
T *data; T *data;
T *result; T *result;
T *vector; T *vector;
Task() : size(0), id(0), data(NULL), result(NULL), vector(NULL){}; Task()
Task(unsigned int s) : size(s), id(0), data(NULL), result(NULL) { : size(0)
, id(0)
, data(NULL)
, result(NULL)
, vector(NULL) {};
Task(unsigned int s)
: size(s)
, id(0)
, data(NULL)
, result(NULL)
{
// allocate unified memory -- the operation performed in this example will // allocate unified memory -- the operation performed in this example will
// be a DGEMV // be a DGEMV
checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size)); checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
@ -75,7 +85,8 @@ struct Task {
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
} }
~Task() { ~Task()
{
// ensure all memory is deallocated // ensure all memory is deallocated
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaFree(data)); checkCudaErrors(cudaFree(data));
@ -83,7 +94,8 @@ struct Task {
checkCudaErrors(cudaFree(vector)); checkCudaErrors(cudaFree(vector));
} }
void allocate(const unsigned int s, const unsigned int unique_id) { void allocate(const unsigned int s, const unsigned int unique_id)
{
// allocate unified memory outside of constructor // allocate unified memory outside of constructor
id = unique_id; id = unique_id;
size = s; size = s;
@ -105,7 +117,8 @@ struct Task {
}; };
#ifdef USE_PTHREADS #ifdef USE_PTHREADS
struct threadData_t { struct threadData_t
{
int tid; int tid;
Task<double> *TaskListPtr; Task<double> *TaskListPtr;
cudaStream_t *streams; cudaStream_t *streams;
@ -117,8 +130,8 @@ typedef struct threadData_t threadData;
#endif #endif
// simple host dgemv: assume data is in row-major format and square // simple host dgemv: assume data is in row-major format and square
template <typename T> template <typename T> void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result)
void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result) { {
// rows // rows
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
result[i] *= beta; result[i] *= beta;
@ -131,7 +144,8 @@ void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result) {
// execute a single task on either host or device depending on size // execute a single task on either host or device depending on size
#ifdef USE_PTHREADS #ifdef USE_PTHREADS
void *execute(void *inpArgs) { void *execute(void *inpArgs)
{
threadData *dataPtr = (threadData *)inpArgs; threadData *dataPtr = (threadData *)inpArgs;
cudaStream_t *stream = dataPtr->streams; cudaStream_t *stream = dataPtr->streams;
cublasHandle_t *handle = dataPtr->handles; cublasHandle_t *handle = dataPtr->handles;
@ -142,92 +156,75 @@ void *execute(void *inpArgs) {
if (t.size < 100) { if (t.size < 100) {
// perform on host // perform on host
printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);
t.size);
// attach managed memory to a (dummy) stream to allow host access while // attach managed memory to a (dummy) stream to allow host access while
// the device is running // the device is running
checkCudaErrors( checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost)); checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
checkCudaErrors( checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
checkCudaErrors(
cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
// necessary to ensure Async cudaStreamAttachMemAsync calls have finished // necessary to ensure Async cudaStreamAttachMemAsync calls have finished
checkCudaErrors(cudaStreamSynchronize(stream[0])); checkCudaErrors(cudaStreamSynchronize(stream[0]));
// call the host operation // call the host operation
gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result); gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
} else { }
else {
// perform on device // perform on device
printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
t.size);
double one = 1.0; double one = 1.0;
double zero = 0.0; double zero = 0.0;
// attach managed memory to my stream // attach managed memory to my stream
checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1])); checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
cudaMemAttachSingle)); checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
cudaMemAttachSingle));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0,
cudaMemAttachSingle));
// call the device operation // call the device operation
checkCudaErrors(cublasDgemv(handle[tid + 1], CUBLAS_OP_N, t.size, t.size, checkCudaErrors(cublasDgemv(
&one, t.data, t.size, t.vector, 1, &zero, handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
t.result, 1));
} }
} }
pthread_exit(NULL); pthread_exit(NULL);
} }
#else #else
template <typename T> template <typename T> void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, int tid)
void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, {
int tid) {
if (t.size < 100) { if (t.size < 100) {
// perform on host // perform on host
printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);
t.size);
// attach managed memory to a (dummy) stream to allow host access while the // attach managed memory to a (dummy) stream to allow host access while the
// device is running // device is running
checkCudaErrors( checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost)); checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
checkCudaErrors( checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
checkCudaErrors(
cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
// necessary to ensure Async cudaStreamAttachMemAsync calls have finished // necessary to ensure Async cudaStreamAttachMemAsync calls have finished
checkCudaErrors(cudaStreamSynchronize(stream[0])); checkCudaErrors(cudaStreamSynchronize(stream[0]));
// call the host operation // call the host operation
gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result); gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
} else { }
else {
// perform on device // perform on device
printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
t.size);
double one = 1.0; double one = 1.0;
double zero = 0.0; double zero = 0.0;
// attach managed memory to my stream // attach managed memory to my stream
checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1])); checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
cudaMemAttachSingle)); checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
cudaMemAttachSingle));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0,
cudaMemAttachSingle));
// call the device operation // call the device operation
checkCudaErrors(cublasDgemv(handle[tid + 1], CUBLAS_OP_N, t.size, t.size, checkCudaErrors(cublasDgemv(
&one, t.data, t.size, t.vector, 1, &zero, handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
t.result, 1));
} }
} }
#endif #endif
// populate a list of tasks with random sizes // populate a list of tasks with random sizes
template <typename T> template <typename T> void initialise_tasks(std::vector<Task<T>> &TaskList)
void initialise_tasks(std::vector<Task<T> > &TaskList) { {
for (unsigned int i = 0; i < TaskList.size(); i++) { for (unsigned int i = 0; i < TaskList.size(); i++) {
// generate random size // generate random size
int size; int size;
@ -236,7 +233,8 @@ void initialise_tasks(std::vector<Task<T> > &TaskList) {
} }
} }
int main(int argc, char **argv) { int main(int argc, char **argv)
{
// set device // set device
cudaDeviceProp device_prop; cudaDeviceProp device_prop;
int dev_id = findCudaDevice(argc, (const char **)argv); int dev_id = findCudaDevice(argc, (const char **)argv);
@ -276,7 +274,7 @@ int main(int argc, char **argv) {
// create list of N tasks // create list of N tasks
unsigned int N = 40; unsigned int N = 40;
std::vector<Task<double> > TaskList(N); std::vector<Task<double>> TaskList(N);
initialise_tasks(TaskList); initialise_tasks(TaskList);
printf("Executing tasks on host / device\n"); printf("Executing tasks on host / device\n");
@ -294,19 +292,17 @@ int main(int argc, char **argv) {
if ((TaskList.size() / nthreads) == 0) { if ((TaskList.size() / nthreads) == 0) {
InputToThreads[i].taskSize = (TaskList.size() / nthreads); InputToThreads[i].taskSize = (TaskList.size() / nthreads);
InputToThreads[i].TaskListPtr = InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
&TaskList[i * (TaskList.size() / nthreads)]; }
} else { else {
if (i == nthreads - 1) { if (i == nthreads - 1) {
InputToThreads[i].taskSize = InputToThreads[i].taskSize = (TaskList.size() / nthreads) + (TaskList.size() % nthreads);
(TaskList.size() / nthreads) + (TaskList.size() % nthreads);
InputToThreads[i].TaskListPtr = InputToThreads[i].TaskListPtr =
&TaskList[i * (TaskList.size() / nthreads) + &TaskList[i * (TaskList.size() / nthreads) + (TaskList.size() % nthreads)];
(TaskList.size() % nthreads)]; }
} else { else {
InputToThreads[i].taskSize = (TaskList.size() / nthreads); InputToThreads[i].taskSize = (TaskList.size() / nthreads);
InputToThreads[i].TaskListPtr = InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
&TaskList[i * (TaskList.size() / nthreads)];
} }
} }
@ -334,7 +330,7 @@ int main(int argc, char **argv) {
} }
// Free TaskList // Free TaskList
std::vector<Task<double> >().swap(TaskList); std::vector<Task<double>>().swap(TaskList);
printf("All Done!\n"); printf("All Done!\n");
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);

View File

@ -38,19 +38,21 @@
#include <stdio.h> #include <stdio.h>
// includes CUDA Runtime // includes CUDA Runtime
#include <cuda_runtime.h>
#include <cuda_profiler_api.h> #include <cuda_profiler_api.h>
#include <cuda_runtime.h>
// includes, project // includes, project
#include <helper_cuda.h> #include <helper_cuda.h>
#include <helper_functions.h> // helper utility functions #include <helper_functions.h> // helper utility functions
__global__ void increment_kernel(int *g_data, int inc_value) { __global__ void increment_kernel(int *g_data, int inc_value)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x; int idx = blockIdx.x * blockDim.x + threadIdx.x;
g_data[idx] = g_data[idx] + inc_value; g_data[idx] = g_data[idx] + inc_value;
} }
bool correct_output(int *data, const int n, const int x) { bool correct_output(int *data, const int n, const int x)
{
for (int i = 0; i < n; i++) for (int i = 0; i < n; i++)
if (data[i] != x) { if (data[i] != x) {
printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x); printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
@ -60,7 +62,8 @@ bool correct_output(int *data, const int n, const int x) {
return true; return true;
} }
int main(int argc, char *argv[]) { int main(int argc, char *argv[])
{
int devID; int devID;
cudaDeviceProp deviceProps; cudaDeviceProp deviceProps;
@ -126,8 +129,7 @@ int main(int argc, char *argv[]) {
// print the cpu and gpu times // print the cpu and gpu times
printf("time spent executing by the GPU: %.2f\n", gpu_time); printf("time spent executing by the GPU: %.2f\n", gpu_time);
printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer)); printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
printf("CPU executed %lu iterations while waiting for GPU to finish\n", printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);
counter);
// check the output for correctness // check the output for correctness
bool bFinalResults = correct_output(a, n, value); bool bFinalResults = correct_output(a, n, value);

View File

@ -48,15 +48,16 @@
// This kernel computes a standard parallel reduction and evaluates the // This kernel computes a standard parallel reduction and evaluates the
// time it takes to do that for each block. The timing results are stored // time it takes to do that for each block. The timing results are stored
// in device memory. // in device memory.
__global__ static void timedReduction(const float *input, float *output, __global__ static void timedReduction(const float *input, float *output, clock_t *timer)
clock_t *timer) { {
// __shared__ float shared[2 * blockDim.x]; // __shared__ float shared[2 * blockDim.x];
extern __shared__ float shared[]; extern __shared__ float shared[];
const int tid = threadIdx.x; const int tid = threadIdx.x;
const int bid = blockIdx.x; const int bid = blockIdx.x;
if (tid == 0) timer[bid] = clock(); if (tid == 0)
timer[bid] = clock();
// Copy input. // Copy input.
shared[tid] = input[tid]; shared[tid] = input[tid];
@ -77,11 +78,13 @@ __global__ static void timedReduction(const float *input, float *output,
} }
// Write result. // Write result.
if (tid == 0) output[bid] = shared[0]; if (tid == 0)
output[bid] = shared[0];
__syncthreads(); __syncthreads();
if (tid == 0) timer[bid + gridDim.x] = clock(); if (tid == 0)
timer[bid + gridDim.x] = clock();
} }
#define NUM_BLOCKS 64 #define NUM_BLOCKS 64
@ -104,7 +107,8 @@ __global__ static void timedReduction(const float *input, float *output,
// the memory. With more than 32 the speed scales linearly. // the memory. With more than 32 the speed scales linearly.
// Start the main CUDA Sample here // Start the main CUDA Sample here
int main(int argc, char **argv) { int main(int argc, char **argv)
{
printf("CUDA Clock sample\n"); printf("CUDA Clock sample\n");
// This will pick the best possible CUDA capable device // This will pick the best possible CUDA capable device
@ -121,20 +125,15 @@ int main(int argc, char **argv) {
input[i] = (float)i; input[i] = (float)i;
} }
checkCudaErrors( checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS)); checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
checkCudaErrors( checkCudaErrors(cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));
cudaMemcpyHostToDevice));
timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>( timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);
dinput, doutput, dtimer);
checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));
cudaMemcpyDeviceToHost));
checkCudaErrors(cudaFree(dinput)); checkCudaErrors(cudaFree(dinput));
checkCudaErrors(cudaFree(doutput)); checkCudaErrors(cudaFree(doutput));

View File

@ -34,12 +34,11 @@
*/ */
// System includes // System includes
#include <stdio.h>
#include <stdint.h>
#include <assert.h> #include <assert.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <nvrtc_helper.h> #include <nvrtc_helper.h>
#include <stdint.h>
#include <stdio.h>
// helper functions and utilities to work with CUDA // helper functions and utilities to work with CUDA
#include <helper_functions.h> #include <helper_functions.h>
@ -71,7 +70,8 @@
// Start the main CUDA Sample here // Start the main CUDA Sample here
int main(int argc, char **argv) { int main(int argc, char **argv)
{
printf("CUDA Clock sample\n"); printf("CUDA Clock sample\n");
typedef long clock_t; typedef long clock_t;
@ -106,17 +106,20 @@ int main(int argc, char **argv) {
void *arr[] = {(void *)&dinput, (void *)&doutput, (void *)&dtimer}; void *arr[] = {(void *)&dinput, (void *)&doutput, (void *)&dtimer};
checkCudaErrors(cuLaunchKernel( checkCudaErrors(cuLaunchKernel(kernel_addr,
kernel_addr, cudaGridSize.x, cudaGridSize.y, cudaGridSize.x,
cudaGridSize.y,
cudaGridSize.z, /* grid dim */ cudaGridSize.z, /* grid dim */
cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, /* block dim */ cudaBlockSize.x,
sizeof(float) * 2 * NUM_THREADS, 0, /* shared mem, stream */ cudaBlockSize.y,
cudaBlockSize.z, /* block dim */
sizeof(float) * 2 * NUM_THREADS,
0, /* shared mem, stream */
&arr[0], /* arguments */ &arr[0], /* arguments */
0)); 0));
checkCudaErrors(cuCtxSynchronize()); checkCudaErrors(cuCtxSynchronize());
checkCudaErrors( checkCudaErrors(cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
checkCudaErrors(cuMemFree(dinput)); checkCudaErrors(cuMemFree(dinput));
checkCudaErrors(cuMemFree(doutput)); checkCudaErrors(cuMemFree(doutput));
checkCudaErrors(cuMemFree(dtimer)); checkCudaErrors(cuMemFree(dtimer));

View File

@ -37,15 +37,16 @@
// time it takes to do that for each block. The timing results are stored // time it takes to do that for each block. The timing results are stored
// in device memory. // in device memory.
extern "C" __global__ void timedReduction(const float *input, float *output, extern "C" __global__ void timedReduction(const float *input, float *output, clock_t *timer)
clock_t *timer) { {
// __shared__ float shared[2 * blockDim.x]; // __shared__ float shared[2 * blockDim.x];
extern __shared__ float shared[]; extern __shared__ float shared[];
const int tid = threadIdx.x; const int tid = threadIdx.x;
const int bid = blockIdx.x; const int bid = blockIdx.x;
if (tid == 0) timer[bid] = clock(); if (tid == 0)
timer[bid] = clock();
// Copy input. // Copy input.
shared[tid] = input[tid]; shared[tid] = input[tid];
@ -66,9 +67,11 @@ extern "C" __global__ void timedReduction(const float *input, float *output,
} }
// Write result. // Write result.
if (tid == 0) output[bid] = shared[0]; if (tid == 0)
output[bid] = shared[0];
__syncthreads(); __syncthreads();
if (tid == 0) timer[bid + gridDim.x] = clock(); if (tid == 0)
timer[bid + gridDim.x] = clock();
} }

View File

@ -37,20 +37,24 @@
using namespace std; using namespace std;
// a simple kernel that simply increments each array element by b // a simple kernel that simply increments each array element by b
__global__ void kernelAddConstant(int *g_a, const int b) { __global__ void kernelAddConstant(int *g_a, const int b)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x; int idx = blockIdx.x * blockDim.x + threadIdx.x;
g_a[idx] += b; g_a[idx] += b;
} }
// a predicate that checks whether each array element is set to its index plus b // a predicate that checks whether each array element is set to its index plus b
int correctResult(int *data, const int n, const int b) { int correctResult(int *data, const int n, const int b)
{
for (int i = 0; i < n; i++) for (int i = 0; i < n; i++)
if (data[i] != i + b) return 0; if (data[i] != i + b)
return 0;
return 1; return 1;
} }
int main(int argc, char *argv[]) { int main(int argc, char *argv[])
{
int num_gpus = 0; // number of CUDA GPUs int num_gpus = 0; // number of CUDA GPUs
printf("%s Starting...\n\n", argv[0]); printf("%s Starting...\n\n", argv[0]);
@ -93,7 +97,8 @@ int main(int argc, char *argv[]) {
return 1; return 1;
} }
for (unsigned int i = 0; i < n; i++) a[i] = i; for (unsigned int i = 0; i < n; i++)
a[i] = i;
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// run as many CPU threads as there are CUDA devices // run as many CPU threads as there are CUDA devices
@ -105,8 +110,7 @@ int main(int argc, char *argv[]) {
// Recall that all variables declared inside an "omp parallel" scope are // Recall that all variables declared inside an "omp parallel" scope are
// local to each CPU thread // local to each CPU thread
// //
omp_set_num_threads( omp_set_num_threads(num_gpus); // create as many CPU threads as there are CUDA devices
num_gpus); // create as many CPU threads as there are CUDA devices
// omp_set_num_threads(2*num_gpus);// create twice as many CPU threads as there // omp_set_num_threads(2*num_gpus);// create twice as many CPU threads as there
// are CUDA devices // are CUDA devices
#pragma omp parallel #pragma omp parallel
@ -116,31 +120,23 @@ int main(int argc, char *argv[]) {
// set and check the CUDA device for this CPU thread // set and check the CUDA device for this CPU thread
int gpu_id = -1; int gpu_id = -1;
checkCudaErrors(cudaSetDevice( checkCudaErrors(
cpu_thread_id % cudaSetDevice(cpu_thread_id % num_gpus)); // "% num_gpus" allows more CPU threads than GPU devices
num_gpus)); // "% num_gpus" allows more CPU threads than GPU devices
checkCudaErrors(cudaGetDevice(&gpu_id)); checkCudaErrors(cudaGetDevice(&gpu_id));
printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id, printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id, num_cpu_threads, gpu_id);
num_cpu_threads, gpu_id);
int *d_a = int *d_a = 0; // pointer to memory on the device associated with this CPU thread
0; // pointer to memory on the device associated with this CPU thread int *sub_a = a + cpu_thread_id * n / num_cpu_threads; // pointer to this CPU thread's portion of data
int *sub_a =
a +
cpu_thread_id * n /
num_cpu_threads; // pointer to this CPU thread's portion of data
unsigned int nbytes_per_kernel = nbytes / num_cpu_threads; unsigned int nbytes_per_kernel = nbytes / num_cpu_threads;
dim3 gpu_threads(128); // 128 threads per block dim3 gpu_threads(128); // 128 threads per block
dim3 gpu_blocks(n / (gpu_threads.x * num_cpu_threads)); dim3 gpu_blocks(n / (gpu_threads.x * num_cpu_threads));
checkCudaErrors(cudaMalloc((void **)&d_a, nbytes_per_kernel)); checkCudaErrors(cudaMalloc((void **)&d_a, nbytes_per_kernel));
checkCudaErrors(cudaMemset(d_a, 0, nbytes_per_kernel)); checkCudaErrors(cudaMemset(d_a, 0, nbytes_per_kernel));
checkCudaErrors( checkCudaErrors(cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice));
cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice));
kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b); kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b);
checkCudaErrors( checkCudaErrors(cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost));
cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaFree(d_a)); checkCudaErrors(cudaFree(d_a));
} }
printf("---------------------------\n"); printf("---------------------------\n");
@ -153,7 +149,8 @@ int main(int argc, char *argv[]) {
// //
bool bResult = correctResult(a, n, b); bool bResult = correctResult(a, n, b);
if (a) free(a); // free CPU memory if (a)
free(a); // free CPU memory
exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE); exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
} }

View File

@ -25,17 +25,18 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#include "cuda_fp16.h"
#include "helper_cuda.h"
#include <cstdio> #include <cstdio>
#include <cstdlib> #include <cstdlib>
#include <ctime> #include <ctime>
#include "cuda_fp16.h"
#include "helper_cuda.h"
#define NUM_OF_BLOCKS 128 #define NUM_OF_BLOCKS 128
#define NUM_OF_THREADS 128 #define NUM_OF_THREADS 128
__forceinline__ __device__ void reduceInShared_intrinsics(half2 *const v) { __forceinline__ __device__ void reduceInShared_intrinsics(half2 *const v)
{
if (threadIdx.x < 64) if (threadIdx.x < 64)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 64]); v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 64]);
__syncthreads(); __syncthreads();
@ -59,27 +60,34 @@ __forceinline__ __device__ void reduceInShared_intrinsics(half2 *const v) {
__syncthreads(); __syncthreads();
} }
__forceinline__ __device__ void reduceInShared_native(half2 *const v) { __forceinline__ __device__ void reduceInShared_native(half2 *const v)
if (threadIdx.x < 64) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 64]; {
if (threadIdx.x < 64)
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 64];
__syncthreads(); __syncthreads();
if (threadIdx.x < 32) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 32]; if (threadIdx.x < 32)
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 32];
__syncthreads(); __syncthreads();
if (threadIdx.x < 16) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 16]; if (threadIdx.x < 16)
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 16];
__syncthreads(); __syncthreads();
if (threadIdx.x < 8) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 8]; if (threadIdx.x < 8)
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 8];
__syncthreads(); __syncthreads();
if (threadIdx.x < 4) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 4]; if (threadIdx.x < 4)
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 4];
__syncthreads(); __syncthreads();
if (threadIdx.x < 2) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 2]; if (threadIdx.x < 2)
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 2];
__syncthreads(); __syncthreads();
if (threadIdx.x < 1) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 1]; if (threadIdx.x < 1)
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 1];
__syncthreads(); __syncthreads();
} }
__global__ void scalarProductKernel_intrinsics(half2 const *const a, __global__ void
half2 const *const b, scalarProductKernel_intrinsics(half2 const *const a, half2 const *const b, float *const results, size_t const size)
float *const results, {
size_t const size) {
const int stride = gridDim.x * blockDim.x; const int stride = gridDim.x * blockDim.x;
__shared__ half2 shArray[NUM_OF_THREADS]; __shared__ half2 shArray[NUM_OF_THREADS];
@ -101,10 +109,9 @@ __global__ void scalarProductKernel_intrinsics(half2 const *const a,
} }
} }
__global__ void scalarProductKernel_native(half2 const *const a, __global__ void
half2 const *const b, scalarProductKernel_native(half2 const *const a, half2 const *const b, float *const results, size_t const size)
float *const results, {
size_t const size) {
const int stride = gridDim.x * blockDim.x; const int stride = gridDim.x * blockDim.x;
__shared__ half2 shArray[NUM_OF_THREADS]; __shared__ half2 shArray[NUM_OF_THREADS];
@ -126,7 +133,8 @@ __global__ void scalarProductKernel_native(half2 const *const a,
} }
} }
void generateInput(half2 *a, size_t size) { void generateInput(half2 *a, size_t size)
{
for (size_t i = 0; i < size; ++i) { for (size_t i = 0; i < size; ++i) {
half2 temp; half2 temp;
temp.x = static_cast<float>(rand() % 4); temp.x = static_cast<float>(rand() % 4);
@ -135,7 +143,8 @@ void generateInput(half2 *a, size_t size) {
} }
} }
int main(int argc, char *argv[]) { int main(int argc, char *argv[])
{
srand((unsigned int)time(NULL)); srand((unsigned int)time(NULL));
size_t size = NUM_OF_BLOCKS * NUM_OF_THREADS * 16; size_t size = NUM_OF_BLOCKS * NUM_OF_THREADS * 16;
@ -151,8 +160,7 @@ int main(int argc, char *argv[]) {
checkCudaErrors(cudaGetDeviceProperties(&devProp, devID)); checkCudaErrors(cudaGetDeviceProperties(&devProp, devID));
if (devProp.major < 5 || (devProp.major == 5 && devProp.minor < 3)) { if (devProp.major < 5 || (devProp.major == 5 && devProp.minor < 3)) {
printf( printf("ERROR: fp16ScalarProduct requires GPU devices with compute SM 5.3 or "
"ERROR: fp16ScalarProduct requires GPU devices with compute SM 5.3 or "
"higher.\n"); "higher.\n");
return EXIT_WAIVED; return EXIT_WAIVED;
} }
@ -162,23 +170,17 @@ int main(int argc, char *argv[]) {
checkCudaErrors(cudaMalloc((void **)&devVec[i], size * sizeof *devVec[i])); checkCudaErrors(cudaMalloc((void **)&devVec[i], size * sizeof *devVec[i]));
} }
checkCudaErrors( checkCudaErrors(cudaMallocHost((void **)&results, NUM_OF_BLOCKS * sizeof *results));
cudaMallocHost((void **)&results, NUM_OF_BLOCKS * sizeof *results)); checkCudaErrors(cudaMalloc((void **)&devResults, NUM_OF_BLOCKS * sizeof *devResults));
checkCudaErrors(
cudaMalloc((void **)&devResults, NUM_OF_BLOCKS * sizeof *devResults));
for (int i = 0; i < 2; ++i) { for (int i = 0; i < 2; ++i) {
generateInput(vec[i], size); generateInput(vec[i], size);
checkCudaErrors(cudaMemcpy(devVec[i], vec[i], size * sizeof *vec[i], checkCudaErrors(cudaMemcpy(devVec[i], vec[i], size * sizeof *vec[i], cudaMemcpyHostToDevice));
cudaMemcpyHostToDevice));
} }
scalarProductKernel_native<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>( scalarProductKernel_native<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(devVec[0], devVec[1], devResults, size);
devVec[0], devVec[1], devResults, size);
checkCudaErrors(cudaMemcpy(results, devResults, checkCudaErrors(cudaMemcpy(results, devResults, NUM_OF_BLOCKS * sizeof *results, cudaMemcpyDeviceToHost));
NUM_OF_BLOCKS * sizeof *results,
cudaMemcpyDeviceToHost));
float result_native = 0; float result_native = 0;
for (int i = 0; i < NUM_OF_BLOCKS; ++i) { for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
@ -186,12 +188,9 @@ int main(int argc, char *argv[]) {
} }
printf("Result native operators\t: %f \n", result_native); printf("Result native operators\t: %f \n", result_native);
scalarProductKernel_intrinsics<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>( scalarProductKernel_intrinsics<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(devVec[0], devVec[1], devResults, size);
devVec[0], devVec[1], devResults, size);
checkCudaErrors(cudaMemcpy(results, devResults, checkCudaErrors(cudaMemcpy(results, devResults, NUM_OF_BLOCKS * sizeof *results, cudaMemcpyDeviceToHost));
NUM_OF_BLOCKS * sizeof *results,
cudaMemcpyDeviceToHost));
float result_intrinsics = 0; float result_intrinsics = 0;
for (int i = 0; i < NUM_OF_BLOCKS; ++i) { for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
@ -199,9 +198,7 @@ int main(int argc, char *argv[]) {
} }
printf("Result intrinsics\t: %f \n", result_intrinsics); printf("Result intrinsics\t: %f \n", result_intrinsics);
printf("&&&& fp16ScalarProduct %s\n", printf("&&&& fp16ScalarProduct %s\n", (fabs(result_intrinsics - result_native) < 0.00001) ? "PASSED" : "FAILED");
(fabs(result_intrinsics - result_native) < 0.00001) ? "PASSED"
: "FAILED");
for (int i = 0; i < 2; ++i) { for (int i = 0; i < 2; ++i) {
checkCudaErrors(cudaFree(devVec[i])); checkCudaErrors(cudaFree(devVec[i]));

View File

@ -40,24 +40,23 @@
*/ */
// System includes // System includes
#include <stdio.h>
#include <assert.h> #include <assert.h>
#include <stdio.h>
// CUDA runtime // CUDA runtime
#include <cuda_runtime.h>
#include <cuda_profiler_api.h> #include <cuda_profiler_api.h>
#include <cuda_runtime.h>
// Helper functions and utilities to work with CUDA // Helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h> #include <helper_cuda.h>
#include <helper_functions.h>
/** /**
* Matrix multiplication (CUDA Kernel) on the device: C = A * B * Matrix multiplication (CUDA Kernel) on the device: C = A * B
* wA is A's width and wB is B's width * wA is A's width and wB is B's width
*/ */
template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A, template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
float *B, int wA, {
int wB) {
// Block index // Block index
int bx = blockIdx.x; int bx = blockIdx.x;
int by = blockIdx.y; int by = blockIdx.y;
@ -87,9 +86,7 @@ template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A,
// Loop over all the sub-matrices of A and B // Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix // required to compute the block sub-matrix
for (int a = aBegin, b = bBegin; for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
a <= aEnd;
a += aStep, b += bStep) {
// Declaration of the shared memory array As used to // Declaration of the shared memory array As used to
// store the sub-matrix of A // store the sub-matrix of A
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
@ -128,7 +125,8 @@ template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A,
C[c + wB * ty + tx] = Csub; C[c + wB * ty + tx] = Csub;
} }
void ConstantInit(float *data, int size, float val) { void ConstantInit(float *data, int size, float val)
{
for (int i = 0; i < size; ++i) { for (int i = 0; i < size; ++i) {
data[i] = val; data[i] = val;
} }
@ -137,9 +135,8 @@ void ConstantInit(float *data, int size, float val) {
/** /**
* Run a simple test of matrix multiplication using CUDA * Run a simple test of matrix multiplication using CUDA
*/ */
int MatrixMultiply(int argc, char **argv, int MatrixMultiply(int argc, char **argv, int block_size, const dim3 &dimsA, const dim3 &dimsB)
int block_size, const dim3 &dimsA, {
const dim3 &dimsB) {
// Allocate host memory for matrices A and B // Allocate host memory for matrices A and B
unsigned int size_A = dimsA.x * dimsA.y; unsigned int size_A = dimsA.x * dimsA.y;
unsigned int mem_size_A = sizeof(float) * size_A; unsigned int mem_size_A = sizeof(float) * size_A;
@ -181,10 +178,8 @@ int MatrixMultiply(int argc, char **argv,
checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
// copy host memory to device // copy host memory to device
checkCudaErrors( checkCudaErrors(cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));
checkCudaErrors(
cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));
// Setup execution parameters // Setup execution parameters
dim3 threads(block_size, block_size); dim3 threads(block_size, block_size);
@ -195,11 +190,10 @@ int MatrixMultiply(int argc, char **argv,
// Performs warmup operation using matrixMul CUDA kernel // Performs warmup operation using matrixMul CUDA kernel
if (block_size == 16) { if (block_size == 16) {
MatrixMulCUDA<16> MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
<<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x); }
} else { else {
MatrixMulCUDA<32> MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
<<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
} }
printf("done\n"); printf("done\n");
@ -213,11 +207,10 @@ int MatrixMultiply(int argc, char **argv,
for (int j = 0; j < nIter; j++) { for (int j = 0; j < nIter; j++) {
if (block_size == 16) { if (block_size == 16) {
MatrixMulCUDA<16> MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
<<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x); }
} else { else {
MatrixMulCUDA<32> MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
<<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
} }
} }
@ -232,19 +225,18 @@ int MatrixMultiply(int argc, char **argv,
// Compute and print the performance // Compute and print the performance
float msecPerMatrixMul = msecTotal / nIter; float msecPerMatrixMul = msecTotal / nIter;
double flopsPerMatrixMul = 2.0 * static_cast<double>(dimsA.x) * double flopsPerMatrixMul =
static_cast<double>(dimsA.y) * 2.0 * static_cast<double>(dimsA.x) * static_cast<double>(dimsA.y) * static_cast<double>(dimsB.x);
static_cast<double>(dimsB.x); double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
double gigaFlops = printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
(flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
printf(
"Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
" WorkgroupSize= %u threads/block\n", " WorkgroupSize= %u threads/block\n",
gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threads.x * threads.y); gigaFlops,
msecPerMatrixMul,
flopsPerMatrixMul,
threads.x * threads.y);
// Copy result from device to host // Copy result from device to host
checkCudaErrors( checkCudaErrors(cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
checkCudaErrors(cudaStreamSynchronize(stream)); checkCudaErrors(cudaStreamSynchronize(stream));
printf("Checking computed result for correctness: "); printf("Checking computed result for correctness: ");
@ -261,8 +253,7 @@ int MatrixMultiply(int argc, char **argv,
double rel_err = abs_err / abs_val / dot_length; double rel_err = abs_err / abs_val / dot_length;
if (rel_err > eps) { if (rel_err > eps) {
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
i, h_C[i], dimsA.x * valB, eps);
correct = false; correct = false;
} }
} }
@ -278,13 +269,13 @@ int MatrixMultiply(int argc, char **argv,
checkCudaErrors(cudaFree(d_C)); checkCudaErrors(cudaFree(d_C));
checkCudaErrors(cudaEventDestroy(start)); checkCudaErrors(cudaEventDestroy(start));
checkCudaErrors(cudaEventDestroy(stop)); checkCudaErrors(cudaEventDestroy(stop));
printf( printf("\nNOTE: The CUDA Samples are not meant for performance "
"\nNOTE: The CUDA Samples are not meant for performance "
"measurements. Results may vary when GPU Boost is enabled.\n"); "measurements. Results may vary when GPU Boost is enabled.\n");
if (correct) { if (correct) {
return EXIT_SUCCESS; return EXIT_SUCCESS;
} else { }
else {
return EXIT_FAILURE; return EXIT_FAILURE;
} }
} }
@ -293,15 +284,15 @@ int MatrixMultiply(int argc, char **argv,
/** /**
* Program main * Program main
*/ */
int main(int argc, char **argv) { int main(int argc, char **argv)
{
printf("[Matrix Multiply Using CUDA] - Starting...\n"); printf("[Matrix Multiply Using CUDA] - Starting...\n");
if (checkCmdLineFlag(argc, (const char **)argv, "help") || if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) {
checkCmdLineFlag(argc, (const char **)argv, "?")) {
printf("Usage -device=n (n >= 0 for deviceID)\n"); printf("Usage -device=n (n >= 0 for deviceID)\n");
printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n"); printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n"); printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
printf(" Note: Outer matrix dimensions of A & B matrices" \ printf(" Note: Outer matrix dimensions of A & B matrices"
" must be equal.\n"); " must be equal.\n");
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
@ -337,13 +328,11 @@ int main(int argc, char **argv) {
} }
if (dimsA.x != dimsB.y) { if (dimsA.x != dimsB.y) {
printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
dimsA.x, dimsB.y);
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);
dimsB.x, dimsB.y);
checkCudaErrors(cudaProfilerStart()); checkCudaErrors(cudaProfilerStart());
int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB); int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB);

View File

@ -46,23 +46,23 @@
// includes, system // includes, system
#include <builtin_types.h> #include <builtin_types.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <cstring> #include <cstring>
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// includes, project, CUDA // includes, project, CUDA
#include <cstring>
#include <cuda.h> #include <cuda.h>
#include <helper_cuda_drvapi.h> #include <helper_cuda_drvapi.h>
#include <helper_image.h> #include <helper_image.h>
#include <helper_string.h> #include <helper_string.h>
#include <helper_timer.h> #include <helper_timer.h>
#include <cstring>
#include <iostream> #include <iostream>
#include <string> #include <string>
#include "matrixMul.h" #include "matrixMul.h"
@ -71,11 +71,9 @@
void runTest(int argc, char **argv); void runTest(int argc, char **argv);
void randomInit(float *, int); void randomInit(float *, int);
extern "C" void computeGold(float *, const float *, const float *, unsigned int, extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
unsigned int, unsigned int);
static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size);
int *blk_size);
#ifndef FATBIN_FILE #ifndef FATBIN_FILE
#define FATBIN_FILE "matrixMul_kernel64.fatbin" #define FATBIN_FILE "matrixMul_kernel64.fatbin"
@ -91,7 +89,8 @@ size_t totalGlobalMem;
const char *sSDKsample = "matrixMulDrv (Driver API)"; const char *sSDKsample = "matrixMulDrv (Driver API)";
void constantInit(float *data, int size, float val) { void constantInit(float *data, int size, float val)
{
for (int i = 0; i < size; ++i) { for (int i = 0; i < size; ++i) {
data[i] = val; data[i] = val;
} }
@ -100,7 +99,8 @@ void constantInit(float *data, int size, float val) {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
{
printf("[ %s ]\n", sSDKsample); printf("[ %s ]\n", sSDKsample);
runTest(argc, argv); runTest(argc, argv);
@ -109,7 +109,8 @@ int main(int argc, char **argv) {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA //! Run a simple test for CUDA
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) { void runTest(int argc, char **argv)
{
// initialize CUDA // initialize CUDA
CUfunction matrixMul = NULL; CUfunction matrixMul = NULL;
int block_size = 0; int block_size = 0;
@ -172,10 +173,19 @@ void runTest(int argc, char **argv) {
size_t Matrix_Width_B = (size_t)WB; size_t Matrix_Width_B = (size_t)WB;
void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B}; void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
// new CUDA 4.0 Driver API Kernel launch call // new CUDA 4.0 Driver API Kernel launch call
checkCudaErrors(cuLaunchKernel( checkCudaErrors(cuLaunchKernel(matrixMul,
matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z, grid.x,
2 * block_size * block_size * sizeof(float), NULL, args, NULL)); grid.y,
} else { grid.z,
block.x,
block.y,
block.z,
2 * block_size * block_size * sizeof(float),
NULL,
args,
NULL));
}
else {
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
// Launching (advanced method) // Launching (advanced method)
int offset = 0; int offset = 0;
@ -198,14 +208,20 @@ void runTest(int argc, char **argv) {
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_B; *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_B;
offset += sizeof(Matrix_Width_B); offset += sizeof(Matrix_Width_B);
void *kernel_launch_config[5] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, void *kernel_launch_config[5] = {
CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END};
CU_LAUNCH_PARAM_END};
// new CUDA 4.0 Driver API Kernel launch call // new CUDA 4.0 Driver API Kernel launch call
checkCudaErrors(cuLaunchKernel( checkCudaErrors(cuLaunchKernel(matrixMul,
matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z, grid.x,
2 * block_size * block_size * sizeof(float), NULL, NULL, grid.y,
grid.z,
block.x,
block.y,
block.z,
2 * block_size * block_size * sizeof(float),
NULL,
NULL,
reinterpret_cast<void **>(&kernel_launch_config))); reinterpret_cast<void **>(&kernel_launch_config)));
} }
@ -222,8 +238,7 @@ void runTest(int argc, char **argv) {
for (int i = 0; i < static_cast<int>(WC * HC); i++) { for (int i = 0; i < static_cast<int>(WC * HC); i++) {
if (fabs(h_C[i] - (WA * valB)) > 1e-5) { if (fabs(h_C[i] - (WA * valB)) > 1e-5) {
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, h_C[i], WA * valB);
h_C[i], WA * valB);
correct = false; correct = false;
} }
} }
@ -244,14 +259,15 @@ void runTest(int argc, char **argv) {
} }
// Allocates a matrix with random float entries. // Allocates a matrix with random float entries.
void randomInit(float *data, int size) { void randomInit(float *data, int size)
{
for (int i = 0; i < size; ++i) { for (int i = 0; i < size; ++i) {
data[i] = rand() / static_cast<float>(RAND_MAX); data[i] = rand() / static_cast<float>(RAND_MAX);
} }
} }
static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size)
int *blk_size) { {
CUfunction cuFunction = 0; CUfunction cuFunction = 0;
int major = 0, minor = 0; int major = 0, minor = 0;
char deviceName[100]; char deviceName[100];
@ -259,16 +275,13 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
cuDevice = findCudaDeviceDRV(argc, (const char **)argv); cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
// get compute capabilities and the devicename // get compute capabilities and the devicename
checkCudaErrors(cuDeviceGetAttribute( checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)); checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
checkCudaErrors(cuDeviceGetAttribute(
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice)); checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
printf("> GPU Device has SM %d.%d compute capability\n", major, minor); printf("> GPU Device has SM %d.%d compute capability\n", major, minor);
checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, cuDevice)); checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, cuDevice));
printf(" Total amount of global memory: %llu bytes\n", printf(" Total amount of global memory: %llu bytes\n", (long long unsigned int)totalGlobalMem);
(long long unsigned int)totalGlobalMem);
checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice)); checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));
@ -278,7 +291,8 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) { if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} else { }
else {
printf("> initCUDA loading module: <%s>\n", module_path.c_str()); printf("> initCUDA loading module: <%s>\n", module_path.c_str());
} }
@ -291,8 +305,7 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str())); checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));
// select the suitable kernel function // select the suitable kernel function
const char *kernels[] = {"matrixMul_bs32_64bit", "matrixMul_bs16_64bit", const char *kernels[] = {"matrixMul_bs32_64bit", "matrixMul_bs16_64bit", "matrixMul_bs8_64bit"};
"matrixMul_bs8_64bit"};
int idx = 0; int idx = 0;
int block_size = 32; int block_size = 32;
@ -302,12 +315,12 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, kernels[idx])); checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, kernels[idx]));
checkCudaErrors(cuOccupancyMaxPotentialBlockSize( checkCudaErrors(cuOccupancyMaxPotentialBlockSize(
&blocksPerGrid, &threadsPerBlock, cuFunction, 0, &blocksPerGrid, &threadsPerBlock, cuFunction, 0, 2 * block_size * block_size * sizeof(float), 0));
2 * block_size * block_size * sizeof(float), 0));
if (block_size * block_size <= threadsPerBlock) { if (block_size * block_size <= threadsPerBlock) {
printf("> %d block size selected\n", block_size); printf("> %d block size selected\n", block_size);
break; break;
} else { }
else {
block_size /= 2; block_size /= 2;
} }
idx++; idx++;

View File

@ -42,8 +42,8 @@
//! wA is A's width and wB is B's width //! wA is A's width and wB is B's width
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <int block_size, typename size_type> template <int block_size, typename size_type>
__device__ void matrixMul(float *C, float *A, float *B, size_type wA, __device__ void matrixMul(float *C, float *A, float *B, size_type wA, size_type wB)
size_type wB) { {
// Block index // Block index
size_type bx = blockIdx.x; size_type bx = blockIdx.x;
size_type by = blockIdx.y; size_type by = blockIdx.y;
@ -96,7 +96,8 @@ __device__ void matrixMul(float *C, float *A, float *B, size_type wA,
// of the block sub-matrix // of the block sub-matrix
#pragma unroll #pragma unroll
for (size_type k = 0; k < block_size; ++k) Csub += AS(ty, k) * BS(k, tx); for (size_type k = 0; k < block_size; ++k)
Csub += AS(ty, k) * BS(k, tx);
// Synchronize to make sure that the preceding // Synchronize to make sure that the preceding
// computation is done before loading two new // computation is done before loading two new
@ -111,16 +112,16 @@ __device__ void matrixMul(float *C, float *A, float *B, size_type wA,
} }
// C wrappers around our template kernel // C wrappers around our template kernel
extern "C" __global__ void matrixMul_bs8_64bit(float *C, float *A, float *B, extern "C" __global__ void matrixMul_bs8_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
size_t wA, size_t wB) { {
matrixMul<8, size_t>(C, A, B, wA, wB); matrixMul<8, size_t>(C, A, B, wA, wB);
} }
extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B, extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
size_t wA, size_t wB) { {
matrixMul<16, size_t>(C, A, B, wA, wB); matrixMul<16, size_t>(C, A, B, wA, wB);
} }
extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B, extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
size_t wA, size_t wB) { {
matrixMul<32, size_t>(C, A, B, wA, wB); matrixMul<32, size_t>(C, A, B, wA, wB);
} }

View File

@ -15,13 +15,14 @@
// With these flags defined, this source file will dynamically // With these flags defined, this source file will dynamically
// load the corresponding functions. Disabled by default. // load the corresponding functions. Disabled by default.
//#define CUDA_INIT_D3D9 // #define CUDA_INIT_D3D9
//#define CUDA_INIT_D3D10 // #define CUDA_INIT_D3D10
//#define CUDA_INIT_D3D11 // #define CUDA_INIT_D3D11
//#define CUDA_INIT_OPENGL // #define CUDA_INIT_OPENGL
#include "cuda_drvapi_dynlink.h"
#include <stdio.h> #include <stdio.h>
#include "cuda_drvapi_dynlink.h"
tcuInit *_cuInit; tcuInit *_cuInit;
tcuDriverGetVersion *cuDriverGetVersion; tcuDriverGetVersion *cuDriverGetVersion;
@ -239,8 +240,7 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
{ {
*pInstance = LoadLibrary(__CudaLibName); *pInstance = LoadLibrary(__CudaLibName);
if (*pInstance == NULL) if (*pInstance == NULL) {
{
printf("LoadLibrary \"%s\" failed!\n", __CudaLibName); printf("LoadLibrary \"%s\" failed!\n", __CudaLibName);
return CUDA_ERROR_UNKNOWN; return CUDA_ERROR_UNKNOWN;
} }
@ -251,35 +251,32 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
#define GET_PROC_EX(name, alias, required) \ #define GET_PROC_EX(name, alias, required) \
alias = (t##name *)GetProcAddress(CudaDrvLib, #name); \ alias = (t##name *)GetProcAddress(CudaDrvLib, #name); \
if (alias == NULL && required) { \ if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \ printf("Failed to find required function \"%s\" in %s\n", #name, __CudaLibName); \
#name, __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \ return CUDA_ERROR_UNKNOWN; \
} }
#define GET_PROC_EX_V2(name, alias, required) \ #define GET_PROC_EX_V2(name, alias, required) \
alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v2));\ alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v2)); \
if (alias == NULL && required) { \ if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \ printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v2), __CudaLibName); \
STRINGIFY(name##_v2), __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \ return CUDA_ERROR_UNKNOWN; \
} }
#define GET_PROC_EX_V3(name, alias, required) \ #define GET_PROC_EX_V3(name, alias, required) \
alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v3));\ alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v3)); \
if (alias == NULL && required) { \ if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \ printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v3), __CudaLibName); \
STRINGIFY(name##_v3), __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \ return CUDA_ERROR_UNKNOWN; \
} }
#elif defined(__unix__) || defined (__QNX__) || defined(__APPLE__) || defined(__MACOSX) #elif defined(__unix__) || defined(__QNX__) || defined(__APPLE__) || defined(__MACOSX)
#include <dlfcn.h> #include <dlfcn.h>
#if defined(__APPLE__) || defined(__MACOSX) #if defined(__APPLE__) || defined(__MACOSX)
static char __CudaLibName[] = "/usr/local/cuda/lib/libcuda.dylib"; static char __CudaLibName[] = "/usr/local/cuda/lib/libcuda.dylib";
#elif defined(__ANDROID__) #elif defined(__ANDROID__)
#if defined (__aarch64__) #if defined(__aarch64__)
static char __CudaLibName[] = "/system/vendor/lib64/libcuda.so"; static char __CudaLibName[] = "/system/vendor/lib64/libcuda.so";
#elif defined(__arm__) #elif defined(__arm__)
static char __CudaLibName[] = "/system/vendor/lib/libcuda.so"; static char __CudaLibName[] = "/system/vendor/lib/libcuda.so";
@ -294,8 +291,7 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
{ {
*pInstance = dlopen(__CudaLibName, RTLD_NOW); *pInstance = dlopen(__CudaLibName, RTLD_NOW);
if (*pInstance == NULL) if (*pInstance == NULL) {
{
printf("dlopen \"%s\" failed!\n", __CudaLibName); printf("dlopen \"%s\" failed!\n", __CudaLibName);
return CUDA_ERROR_UNKNOWN; return CUDA_ERROR_UNKNOWN;
} }
@ -306,24 +302,21 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
#define GET_PROC_EX(name, alias, required) \ #define GET_PROC_EX(name, alias, required) \
alias = (t##name *)dlsym(CudaDrvLib, #name); \ alias = (t##name *)dlsym(CudaDrvLib, #name); \
if (alias == NULL && required) { \ if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \ printf("Failed to find required function \"%s\" in %s\n", #name, __CudaLibName); \
#name, __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \ return CUDA_ERROR_UNKNOWN; \
} }
#define GET_PROC_EX_V2(name, alias, required) \ #define GET_PROC_EX_V2(name, alias, required) \
alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v2)); \ alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v2)); \
if (alias == NULL && required) { \ if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \ printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v2), __CudaLibName); \
STRINGIFY(name##_v2), __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \ return CUDA_ERROR_UNKNOWN; \
} }
#define GET_PROC_EX_V3(name, alias, required) \ #define GET_PROC_EX_V3(name, alias, required) \
alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v3)); \ alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v3)); \
if (alias == NULL && required) { \ if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \ printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v3), __CudaLibName); \
STRINGIFY(name##_v3), __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \ return CUDA_ERROR_UNKNOWN; \
} }
@ -337,13 +330,13 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
if (CUDA_SUCCESS != result) { \ if (CUDA_SUCCESS != result) { \
return result; \ return result; \
} \ } \
} while(0) } while (0)
#define GET_PROC_REQUIRED(name) GET_PROC_EX(name,name,1) #define GET_PROC_REQUIRED(name) GET_PROC_EX(name, name, 1)
#define GET_PROC_OPTIONAL(name) GET_PROC_EX(name,name,0) #define GET_PROC_OPTIONAL(name) GET_PROC_EX(name, name, 0)
#define GET_PROC(name) GET_PROC_REQUIRED(name) #define GET_PROC(name) GET_PROC_REQUIRED(name)
#define GET_PROC_V2(name) GET_PROC_EX_V2(name,name,1) #define GET_PROC_V2(name) GET_PROC_EX_V2(name, name, 1)
#define GET_PROC_V3(name) GET_PROC_EX_V3(name,name,1) #define GET_PROC_V3(name) GET_PROC_EX_V3(name, name, 1)
CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion) CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
{ {
@ -359,8 +352,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
// available since 2.2. if not present, version 1.0 is assumed // available since 2.2. if not present, version 1.0 is assumed
GET_PROC_OPTIONAL(cuDriverGetVersion); GET_PROC_OPTIONAL(cuDriverGetVersion);
if (cuDriverGetVersion) if (cuDriverGetVersion) {
{
CHECKED_CALL(cuDriverGetVersion(&driverVer)); CHECKED_CALL(cuDriverGetVersion(&driverVer));
} }
@ -428,24 +420,21 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC(cuStreamDestroy); GET_PROC(cuStreamDestroy);
// These are CUDA 5.0 new functions // These are CUDA 5.0 new functions
if (driverVer >= 5000) if (driverVer >= 5000) {
{
GET_PROC(cuMipmappedArrayCreate); GET_PROC(cuMipmappedArrayCreate);
GET_PROC(cuMipmappedArrayDestroy); GET_PROC(cuMipmappedArrayDestroy);
GET_PROC(cuMipmappedArrayGetLevel); GET_PROC(cuMipmappedArrayGetLevel);
} }
// These are CUDA 4.2 new functions // These are CUDA 4.2 new functions
if (driverVer >= 4020) if (driverVer >= 4020) {
{
GET_PROC(cuFuncSetSharedMemConfig); GET_PROC(cuFuncSetSharedMemConfig);
GET_PROC(cuCtxGetSharedMemConfig); GET_PROC(cuCtxGetSharedMemConfig);
GET_PROC(cuCtxSetSharedMemConfig); GET_PROC(cuCtxSetSharedMemConfig);
} }
// These are CUDA 4.1 new functions // These are CUDA 4.1 new functions
if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) {
{
GET_PROC(cuDeviceGetByPCIBusId); GET_PROC(cuDeviceGetByPCIBusId);
GET_PROC(cuDeviceGetPCIBusId); GET_PROC(cuDeviceGetPCIBusId);
GET_PROC(cuIpcGetEventHandle); GET_PROC(cuIpcGetEventHandle);
@ -456,8 +445,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
} }
// These could be _v2 interfaces // These could be _v2 interfaces
if (cudaVersion >= 4000 && __CUDA_API_VERSION >= 4000) if (cudaVersion >= 4000 && __CUDA_API_VERSION >= 4000) {
{
GET_PROC_V2(cuCtxDestroy); GET_PROC_V2(cuCtxDestroy);
GET_PROC_V2(cuCtxPopCurrent); GET_PROC_V2(cuCtxPopCurrent);
GET_PROC_V2(cuCtxPushCurrent); GET_PROC_V2(cuCtxPushCurrent);
@ -465,8 +453,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC_V2(cuEventDestroy); GET_PROC_V2(cuEventDestroy);
} }
if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) {
{
GET_PROC_V2(cuDeviceTotalMem); GET_PROC_V2(cuDeviceTotalMem);
GET_PROC_V2(cuCtxCreate); GET_PROC_V2(cuCtxCreate);
GET_PROC_V2(cuModuleGetGlobal); GET_PROC_V2(cuModuleGetGlobal);
@ -507,17 +494,14 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC_V2(cuTexRefSetAddress); GET_PROC_V2(cuTexRefSetAddress);
GET_PROC_V2(cuTexRefGetAddress); GET_PROC_V2(cuTexRefGetAddress);
if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) {
{
GET_PROC_V3(cuTexRefSetAddress2D); GET_PROC_V3(cuTexRefSetAddress2D);
} }
else else {
{
GET_PROC_V2(cuTexRefSetAddress2D); GET_PROC_V2(cuTexRefSetAddress2D);
} }
} }
else else {
{
// versions earlier than 3020 // versions earlier than 3020
GET_PROC(cuDeviceTotalMem); GET_PROC(cuDeviceTotalMem);
GET_PROC(cuCtxCreate); GET_PROC(cuCtxCreate);
@ -562,8 +546,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
} }
// The following functions are specific to CUDA versions // The following functions are specific to CUDA versions
if (driverVer >= 4000) if (driverVer >= 4000) {
{
GET_PROC(cuCtxSetCurrent); GET_PROC(cuCtxSetCurrent);
GET_PROC(cuCtxGetCurrent); GET_PROC(cuCtxGetCurrent);
GET_PROC(cuMemHostRegister); GET_PROC(cuMemHostRegister);
@ -574,8 +557,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC(cuProfilerStop); GET_PROC(cuProfilerStop);
} }
if (driverVer >= 3010) if (driverVer >= 3010) {
{
GET_PROC(cuModuleGetSurfRef); GET_PROC(cuModuleGetSurfRef);
GET_PROC(cuSurfRefSetArray); GET_PROC(cuSurfRefSetArray);
GET_PROC(cuSurfRefGetArray); GET_PROC(cuSurfRefGetArray);
@ -583,8 +565,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC(cuCtxGetLimit); GET_PROC(cuCtxGetLimit);
} }
if (driverVer >= 3000) if (driverVer >= 3000) {
{
GET_PROC(cuMemcpyDtoDAsync); GET_PROC(cuMemcpyDtoDAsync);
GET_PROC(cuFuncSetCacheConfig); GET_PROC(cuFuncSetCacheConfig);
#ifdef CUDA_INIT_D3D11 #ifdef CUDA_INIT_D3D11
@ -595,12 +576,10 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC(cuGraphicsUnregisterResource); GET_PROC(cuGraphicsUnregisterResource);
GET_PROC(cuGraphicsSubResourceGetMappedArray); GET_PROC(cuGraphicsSubResourceGetMappedArray);
if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) {
{
GET_PROC_V2(cuGraphicsResourceGetMappedPointer); GET_PROC_V2(cuGraphicsResourceGetMappedPointer);
} }
else else {
{
GET_PROC(cuGraphicsResourceGetMappedPointer); GET_PROC(cuGraphicsResourceGetMappedPointer);
} }
@ -610,8 +589,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC(cuGetExportTable); GET_PROC(cuGetExportTable);
} }
if (driverVer >= 2030) if (driverVer >= 2030) {
{
GET_PROC(cuMemHostGetFlags); GET_PROC(cuMemHostGetFlags);
#ifdef CUDA_INIT_D3D10 #ifdef CUDA_INIT_D3D10
GET_PROC(cuD3D10GetDevice); GET_PROC(cuD3D10GetDevice);
@ -624,17 +602,16 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
#endif #endif
} }
if (driverVer >= 2010) if (driverVer >= 2010) {
{
GET_PROC(cuModuleLoadDataEx); GET_PROC(cuModuleLoadDataEx);
GET_PROC(cuModuleLoadFatBinary); GET_PROC(cuModuleLoadFatBinary);
#ifdef CUDA_INIT_OPENGL #ifdef CUDA_INIT_OPENGL
GET_PROC(cuGLCtxCreate); GET_PROC(cuGLCtxCreate);
GET_PROC(cuGraphicsGLRegisterBuffer); GET_PROC(cuGraphicsGLRegisterBuffer);
GET_PROC(cuGraphicsGLRegisterImage); GET_PROC(cuGraphicsGLRegisterImage);
# ifdef WIN32 #ifdef WIN32
GET_PROC(cuWGLGetDevice); GET_PROC(cuWGLGetDevice);
# endif #endif
#endif #endif
#ifdef CUDA_INIT_D3D9 #ifdef CUDA_INIT_D3D9
GET_PROC(cuD3D9GetDevice); GET_PROC(cuD3D9GetDevice);

View File

@ -14,21 +14,17 @@
#ifndef HELPER_CUDA_DRVAPI_H #ifndef HELPER_CUDA_DRVAPI_H
#define HELPER_CUDA_DRVAPI_H #define HELPER_CUDA_DRVAPI_H
#include <helper_string.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <helper_string.h>
#ifndef MAX #ifndef MAX
#define MAX(a, b) (a > b ? a : b) #define MAX(a, b) (a > b ? a : b)
#endif #endif
#ifndef HELPER_CUDA_DRVAPI_H #ifndef HELPER_CUDA_DRVAPI_H
inline int ftoi(float value) { inline int ftoi(float value) { return (value >= 0 ? static_cast<int>(value + 0.5) : static_cast<int>(value - 0.5)); }
return (value >= 0 ? static_cast<int>(value + 0.5)
: static_cast<int>(value - 0.5));
}
#endif #endif
#ifndef EXIT_WAIVED #ifndef EXIT_WAIVED
@ -47,39 +43,43 @@ inline int ftoi(float value) {
#define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__) #define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)
// These are the inline versions for all of the SDK helper functions // These are the inline versions for all of the SDK helper functions
inline void __checkCudaErrors(CUresult err, const char *file, const int line) { inline void __checkCudaErrors(CUresult err, const char *file, const int line)
{
if (CUDA_SUCCESS != err) { if (CUDA_SUCCESS != err) {
const char *errorStr = NULL; const char *errorStr = NULL;
cuGetErrorString(err, &errorStr); cuGetErrorString(err, &errorStr);
fprintf(stderr, fprintf(stderr,
"checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, " "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, "
"line %i.\n", "line %i.\n",
err, errorStr, file, line); err,
errorStr,
file,
line);
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
} }
#endif #endif
// This function wraps the CUDA Driver API into a template function // This function wraps the CUDA Driver API into a template function
template <class T> template <class T> inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, {
int device) {
checkCudaErrors(cuDeviceGetAttribute(attribute, device_attribute, device)); checkCudaErrors(cuDeviceGetAttribute(attribute, device_attribute, device));
} }
#endif #endif
// Beginning of GPU Architecture definitions // Beginning of GPU Architecture definitions
inline int _ConvertSMVer2CoresDRV(int major, int minor) { inline int _ConvertSMVer2CoresDRV(int major, int minor)
{
// Defines for GPU Architecture types (using the SM version to determine the # // Defines for GPU Architecture types (using the SM version to determine the #
// of cores per SM // of cores per SM
typedef struct { typedef struct
{
int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM
// minor version // minor version
int Cores; int Cores;
} sSMtoCores; } sSMtoCores;
sSMtoCores nGpuArchCoresPerSM[] = { sSMtoCores nGpuArchCoresPerSM[] = {{0x30, 192},
{0x30, 192},
{0x32, 192}, {0x32, 192},
{0x35, 192}, {0x35, 192},
{0x37, 192}, {0x37, 192},
@ -110,16 +110,18 @@ inline int _ConvertSMVer2CoresDRV(int major, int minor) {
// If we don't find the values, we default use the previous one to run // If we don't find the values, we default use the previous one to run
// properly // properly
printf( printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n",
"MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", major,
major, minor, nGpuArchCoresPerSM[index - 1].Cores); minor,
nGpuArchCoresPerSM[index - 1].Cores);
return nGpuArchCoresPerSM[index - 1].Cores; return nGpuArchCoresPerSM[index - 1].Cores;
} }
// end of GPU Architecture definitions // end of GPU Architecture definitions
#ifdef __cuda_cuda_h__ #ifdef __cuda_cuda_h__
// General GPU Device CUDA Initialization // General GPU Device CUDA Initialization
inline int gpuDeviceInitDRV(int ARGC, const char **ARGV) { inline int gpuDeviceInitDRV(int ARGC, const char **ARGV)
{
int cuDevice = 0; int cuDevice = 0;
int deviceCount = 0; int deviceCount = 0;
checkCudaErrors(cuInit(0, __CUDA_API_VERSION)); checkCudaErrors(cuInit(0, __CUDA_API_VERSION));
@ -140,11 +142,8 @@ inline int gpuDeviceInitDRV(int ARGC, const char **ARGV) {
if (dev > deviceCount - 1) { if (dev > deviceCount - 1) {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
deviceCount); fprintf(stderr, ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
fprintf(stderr,
">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n",
dev);
fprintf(stderr, "\n"); fprintf(stderr, "\n");
return -dev; return -dev;
} }
@ -171,7 +170,8 @@ inline int gpuDeviceInitDRV(int ARGC, const char **ARGV) {
} }
// This function returns the best GPU based on performance // This function returns the best GPU based on performance
inline int gpuGetMaxGflopsDeviceIdDRV() { inline int gpuGetMaxGflopsDeviceIdDRV()
{
CUdevice current_device = 0; CUdevice current_device = 0;
CUdevice max_perf_device = 0; CUdevice max_perf_device = 0;
int device_count = 0; int device_count = 0;
@ -187,8 +187,7 @@ inline int gpuGetMaxGflopsDeviceIdDRV() {
checkCudaErrors(cuDeviceGetCount(&device_count)); checkCudaErrors(cuDeviceGetCount(&device_count));
if (device_count == 0) { if (device_count == 0) {
fprintf(stderr, fprintf(stderr, "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n");
"gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
@ -196,36 +195,31 @@ inline int gpuGetMaxGflopsDeviceIdDRV() {
current_device = 0; current_device = 0;
while (current_device < device_count) { while (current_device < device_count) {
checkCudaErrors(cuDeviceGetAttribute( checkCudaErrors(
&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, current_device));
current_device)); checkCudaErrors(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device));
checkCudaErrors(cuDeviceGetAttribute( checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device)); checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
checkCudaErrors(cuDeviceGetAttribute(
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
checkCudaErrors(cuDeviceGetAttribute(
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
int computeMode; int computeMode;
getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);
current_device);
if (computeMode != CU_COMPUTEMODE_PROHIBITED) { if (computeMode != CU_COMPUTEMODE_PROHIBITED) {
if (major == 9999 && minor == 9999) { if (major == 9999 && minor == 9999) {
sm_per_multiproc = 1; sm_per_multiproc = 1;
} else { }
else {
sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor); sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
} }
unsigned long long compute_perf = unsigned long long compute_perf = (unsigned long long)(multiProcessorCount * sm_per_multiproc * clockRate);
(unsigned long long)(multiProcessorCount * sm_per_multiproc *
clockRate);
if (compute_perf > max_compute_perf) { if (compute_perf > max_compute_perf) {
max_compute_perf = compute_perf; max_compute_perf = compute_perf;
max_perf_device = current_device; max_perf_device = current_device;
} }
} else { }
else {
devices_prohibited++; devices_prohibited++;
} }
@ -243,7 +237,8 @@ inline int gpuGetMaxGflopsDeviceIdDRV() {
} }
// General initialization call to pick the best CUDA Device // General initialization call to pick the best CUDA Device
inline CUdevice findCudaDeviceDRV(int argc, const char **argv) { inline CUdevice findCudaDeviceDRV(int argc, const char **argv)
{
CUdevice cuDevice; CUdevice cuDevice;
int devID = 0; int devID = 0;
@ -255,7 +250,8 @@ inline CUdevice findCudaDeviceDRV(int argc, const char **argv) {
printf("exiting...\n"); printf("exiting...\n");
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
} }
} else { }
else {
// Otherwise pick the device with highest Gflops/s // Otherwise pick the device with highest Gflops/s
char name[100]; char name[100];
devID = gpuGetMaxGflopsDeviceIdDRV(); devID = gpuGetMaxGflopsDeviceIdDRV();
@ -269,7 +265,8 @@ inline CUdevice findCudaDeviceDRV(int argc, const char **argv) {
return cuDevice; return cuDevice;
} }
inline CUdevice findIntegratedGPUDrv() { inline CUdevice findIntegratedGPUDrv()
{
CUdevice current_device = 0; CUdevice current_device = 0;
int device_count = 0; int device_count = 0;
int devices_prohibited = 0; int devices_prohibited = 0;
@ -286,28 +283,22 @@ inline CUdevice findIntegratedGPUDrv() {
// Find the integrated GPU which is compute capable // Find the integrated GPU which is compute capable
while (current_device < device_count) { while (current_device < device_count) {
int computeMode = -1; int computeMode = -1;
checkCudaErrors(cuDeviceGetAttribute( checkCudaErrors(cuDeviceGetAttribute(&isIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, current_device));
&isIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, current_device)); checkCudaErrors(cuDeviceGetAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device));
checkCudaErrors(cuDeviceGetAttribute(
&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device));
// If GPU is integrated and is not running on Compute Mode prohibited use // If GPU is integrated and is not running on Compute Mode prohibited use
// that // that
if (isIntegrated && (computeMode != CU_COMPUTEMODE_PROHIBITED)) { if (isIntegrated && (computeMode != CU_COMPUTEMODE_PROHIBITED)) {
int major = 0, minor = 0; int major = 0, minor = 0;
char deviceName[256]; char deviceName[256];
checkCudaErrors(cuDeviceGetAttribute( checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
current_device));
checkCudaErrors(cuDeviceGetAttribute(
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
current_device));
checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device)); checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device));
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", current_device, deviceName, major, minor);
current_device, deviceName, major, minor);
return current_device; return current_device;
} else { }
else {
devices_prohibited++; devices_prohibited++;
} }
@ -323,35 +314,31 @@ inline CUdevice findIntegratedGPUDrv() {
} }
// General check for CUDA GPU SM Capabilities // General check for CUDA GPU SM Capabilities
inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, int devID)
int devID) { {
CUdevice cuDevice; CUdevice cuDevice;
char name[256]; char name[256];
int major = 0, minor = 0; int major = 0, minor = 0;
checkCudaErrors(cuDeviceGet(&cuDevice, devID)); checkCudaErrors(cuDeviceGet(&cuDevice, devID));
checkCudaErrors(cuDeviceGetName(name, 100, cuDevice)); checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
checkCudaErrors(cuDeviceGetAttribute( checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)); checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
checkCudaErrors(cuDeviceGetAttribute(
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
if ((major > major_version) || if ((major > major_version) || (major == major_version && minor >= minor_version)) {
(major == major_version && minor >= minor_version)) { printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name, major, minor);
printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name,
major, minor);
return true; return true;
} else { }
printf( else {
"No GPU device was found that can support CUDA compute capability " printf("No GPU device was found that can support CUDA compute capability "
"%d.%d.\n", "%d.%d.\n",
major_version, minor_version); major_version,
minor_version);
return false; return false;
} }
} }
#endif #endif
// end of CUDA Helper Functions // end of CUDA Helper Functions
#endif // HELPER_CUDA_DRVAPI_H #endif // HELPER_CUDA_DRVAPI_H

View File

@ -43,10 +43,10 @@
*/ */
// includes, system // includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h> #include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// includes, CUDA // includes, CUDA
#include "cuda_drvapi_dynlink.h" #include "cuda_drvapi_dynlink.h"
@ -60,7 +60,7 @@
extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int); extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
#if defined _MSC_VER #if defined _MSC_VER
#pragma warning (disable : 4312) #pragma warning(disable : 4312)
#endif #endif
@ -78,8 +78,7 @@ static const char *sSDKsample = "matrixMulDynlinkJIT (CUDA dynamic linking)";
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void randomInit(float *data, size_t size) void randomInit(float *data, size_t size)
{ {
for (size_t i = 0; i < size; ++i) for (size_t i = 0; i < size; ++i) {
{
data[i] = rand() / (float)RAND_MAX; data[i] = rand() / (float)RAND_MAX;
} }
} }
@ -100,18 +99,14 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
checkCudaErrors(cuInit(0, __CUDA_API_VERSION)); checkCudaErrors(cuInit(0, __CUDA_API_VERSION));
// This assumes that the user is attempting to specify a explicit device -device=n // This assumes that the user is attempting to specify a explicit device -device=n
if (argc > 1) if (argc > 1) {
{
bool bFound = false; bool bFound = false;
for (int param=0; param < argc; param++) for (int param = 0; param < argc; param++) {
{ if (!strncmp(argv[param], "-device", 7)) {
if (!strncmp(argv[param], "-device", 7)) int i = (int)strlen(argv[1]);
{
int i=(int)strlen(argv[1]);
while (argv[1][i] != '=') while (argv[1][i] != '=') {
{
i--; i--;
} }
@ -128,16 +123,15 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
int deviceCount = 0; int deviceCount = 0;
checkCudaErrors(cuDeviceGetCount(&deviceCount)); checkCudaErrors(cuDeviceGetCount(&deviceCount));
if (deviceCount == 0) if (deviceCount == 0) {
{
fprintf(stderr, "No devices supporting CUDA detected, exiting...\n"); fprintf(stderr, "No devices supporting CUDA detected, exiting...\n");
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
} }
if (devID < 0) devID = 0; if (devID < 0)
devID = 0;
if (devID > deviceCount -1) if (devID > deviceCount - 1) {
{
fprintf(stderr, "initCUDA (Device=%d) invalid GPU device. %d GPU device(s) detected.\n\n", devID, deviceCount); fprintf(stderr, "initCUDA (Device=%d) invalid GPU device. %d GPU device(s) detected.\n\n", devID, deviceCount);
status = CUDA_ERROR_NOT_FOUND; status = CUDA_ERROR_NOT_FOUND;
@ -159,8 +153,7 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
// create context for picked device // create context for picked device
status = cuCtxCreate(&g_cuContext, 0, cuDevice); status = cuCtxCreate(&g_cuContext, 0, cuDevice);
if (CUDA_SUCCESS != status) if (CUDA_SUCCESS != status) {
{
cuCtxDestroy(g_cuContext); cuCtxDestroy(g_cuContext);
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
} }
@ -191,31 +184,31 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
printf("> Compiling CUDA module\n"); printf("> Compiling CUDA module\n");
#if defined(_WIN64) || defined(__LP64__) #if defined(_WIN64) || defined(__LP64__)
status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals); status =
cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
#else #else
status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals); status =
cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
#endif #endif
printf("> PTX JIT log:\n%s\n", jitLogBuffer); printf("> PTX JIT log:\n%s\n", jitLogBuffer);
delete [] jitOptions; delete[] jitOptions;
delete [] jitOptVals; delete[] jitOptVals;
delete [] jitLogBuffer; delete[] jitLogBuffer;
} }
if (CUDA_SUCCESS != status) if (CUDA_SUCCESS != status) {
{
printf("Error while compiling PTX\n"); printf("Error while compiling PTX\n");
cuCtxDestroy(g_cuContext); cuCtxDestroy(g_cuContext);
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
// retrieve CUDA function from the compiled module // retrieve CUDA function from the compiled module
status = cuModuleGetFunction(&cuFunction, cuModule, status = cuModuleGetFunction(
(block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit"); &cuFunction, cuModule, (block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit");
if (CUDA_SUCCESS != status) if (CUDA_SUCCESS != status) {
{
cuCtxDestroy(g_cuContext); cuCtxDestroy(g_cuContext);
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
@ -246,8 +239,8 @@ int main(int argc, char **argv)
size_t size_B = WB * HB; size_t size_B = WB * HB;
size_t mem_size_B = sizeof(float) * size_B; size_t mem_size_B = sizeof(float) * size_B;
float *h_A = (float *) malloc(mem_size_A); float *h_A = (float *)malloc(mem_size_A);
float *h_B = (float *) malloc(mem_size_B); float *h_B = (float *)malloc(mem_size_B);
// initialize host memory // initialize host memory
randomInit(h_A, size_A); randomInit(h_A, size_A);
@ -271,19 +264,17 @@ int main(int argc, char **argv)
checkCudaErrors(cuMemAlloc(&d_C, mem_size_C)); checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
// allocate mem for the result on host side // allocate mem for the result on host side
float *h_C = (float *) malloc(mem_size_C); float *h_C = (float *)malloc(mem_size_C);
#if __CUDA_API_VERSION >= 4000 #if __CUDA_API_VERSION >= 4000
{ {
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method) // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method)
int Matrix_Width_A = WA; int Matrix_Width_A = WA;
int Matrix_Width_B = WB; int Matrix_Width_B = WB;
void *args[5] = { &d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B }; void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
checkCudaErrors(cuLaunchKernel(matrixMul, (WC/block_size), (HC/block_size), 1, checkCudaErrors(cuLaunchKernel(
block_size , block_size , 1, matrixMul, (WC / block_size), (HC / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
0,
NULL, args, NULL));
} }
#else // __CUDA_API_VERSION <= 3020 #else // __CUDA_API_VERSION <= 3020
{ {
@ -312,7 +303,7 @@ int main(int argc, char **argv)
checkCudaErrors(cuParamSetSize(matrixMul, offset)); checkCudaErrors(cuParamSetSize(matrixMul, offset));
checkCudaErrors(cuFuncSetBlockShape(matrixMul, block_size, block_size, 1)); checkCudaErrors(cuFuncSetBlockShape(matrixMul, block_size, block_size, 1));
checkCudaErrors(cuFuncSetSharedSize(matrixMul, 2*block_size*block_size*sizeof(float))); checkCudaErrors(cuFuncSetSharedSize(matrixMul, 2 * block_size * block_size * sizeof(float)));
// set execution configuration for the CUDA kernel // set execution configuration for the CUDA kernel
checkCudaErrors(cuLaunchGrid(matrixMul, WC / block_size, HC / block_size)); checkCudaErrors(cuLaunchGrid(matrixMul, WC / block_size, HC / block_size));
@ -322,19 +313,18 @@ int main(int argc, char **argv)
checkCudaErrors(cuCtxSynchronize()); checkCudaErrors(cuCtxSynchronize());
// copy result from device to host // copy result from device to host
checkCudaErrors(cuMemcpyDtoH((void *) h_C, d_C, mem_size_C)); checkCudaErrors(cuMemcpyDtoH((void *)h_C, d_C, mem_size_C));
// compute reference solution // compute reference solution
float *reference = (float *) malloc(mem_size_C); float *reference = (float *)malloc(mem_size_C);
computeGold(reference, h_A, h_B, HA, WA, WB); computeGold(reference, h_A, h_B, HA, WA, WB);
// check result // check result
float diff=0.0f; float diff = 0.0f;
for (unsigned int i=0; i<size_C; i++) for (unsigned int i = 0; i < size_C; i++) {
{
float tmp = reference[i] - h_C[i]; float tmp = reference[i] - h_C[i];
diff += tmp*tmp; diff += tmp * tmp;
} }
int res = (diff / (float)size_C < 1e-6f); int res = (diff / (float)size_C < 1e-6f);
@ -349,7 +339,7 @@ int main(int argc, char **argv)
checkCudaErrors(cuMemFree(d_C)); checkCudaErrors(cuMemFree(d_C));
checkCudaErrors(cuCtxDestroy(g_cuContext)); checkCudaErrors(cuCtxDestroy(g_cuContext));
printf("Test run %s\n", (1==res) ? "success!" : "failed!"); printf("Test run %s\n", (1 == res) ? "success!" : "failed!");
exit((1 == res) ? EXIT_SUCCESS : EXIT_FAILURE); exit((1 == res) ? EXIT_SUCCESS : EXIT_FAILURE);
} }

View File

@ -28,8 +28,7 @@
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// export C interface // export C interface
extern "C" extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set //! Compute reference data set
@ -40,16 +39,13 @@ void computeGold(float *, const float *, const float *, unsigned int, unsigned i
//! @param hA height of matrix A //! @param hA height of matrix A
//! @param wB width of matrix B //! @param wB width of matrix B
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void void computeGold(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
computeGold(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
{ {
for (unsigned int i = 0; i < hA; ++i) for (unsigned int i = 0; i < hA; ++i)
for (unsigned int j = 0; j < wB; ++j) for (unsigned int j = 0; j < wB; ++j) {
{
double sum = 0; double sum = 0;
for (unsigned int k = 0; k < wA; ++k) for (unsigned int k = 0; k < wA; ++k) {
{
double a = A[i * wA + k]; double a = A[i * wA + k];
double b = B[k * wB + j]; double b = B[k * wB + j];
sum += a * b; sum += a * b;

View File

@ -32,7 +32,8 @@
#define __matrixMul_kernel_32_ptxdump_h__ #define __matrixMul_kernel_32_ptxdump_h__
#if defined __cplusplus #if defined __cplusplus
extern "C" { extern "C"
{
#endif #endif
extern unsigned char matrixMul_kernel_32_ptxdump[25784]; extern unsigned char matrixMul_kernel_32_ptxdump[25784];

View File

@ -32,7 +32,8 @@
#define __matrixMul_kernel_64_ptxdump_h__ #define __matrixMul_kernel_64_ptxdump_h__
#if defined __cplusplus #if defined __cplusplus
extern "C" { extern "C"
{
#endif #endif
extern unsigned char matrixMul_kernel_64_ptxdump[26489]; extern unsigned char matrixMul_kernel_64_ptxdump[26489];

View File

@ -42,17 +42,19 @@
*/ */
// System includes // System includes
#include <stdio.h>
#include <assert.h> #include <assert.h>
#include <stdio.h>
// CUDA runtime // CUDA runtime
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include "nvrtc_helper.h" #include "nvrtc_helper.h"
// Helper functions and utilities to work with CUDA // Helper functions and utilities to work with CUDA
#include <helper_functions.h> #include <helper_functions.h>
void constantInit(float *data, int size, float val) { void constantInit(float *data, int size, float val)
{
for (int i = 0; i < size; ++i) { for (int i = 0; i < size; ++i) {
data[i] = val; data[i] = val;
} }
@ -61,8 +63,8 @@ void constantInit(float *data, int size, float val) {
/** /**
* Run a simple test of matrix multiplication using CUDA * Run a simple test of matrix multiplication using CUDA
*/ */
int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
dim3 &dimsB) { {
// Allocate host memory for matrices A and B // Allocate host memory for matrices A and B
unsigned int size_A = dimsA.x * dimsA.y; unsigned int size_A = dimsA.x * dimsA.y;
unsigned int mem_size_A = sizeof(float) * size_A; unsigned int mem_size_A = sizeof(float) * size_A;
@ -114,24 +116,27 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA,
CUfunction kernel_addr; CUfunction kernel_addr;
if (block_size == 16) { if (block_size == 16) {
checkCudaErrors( checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16"));
cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16")); }
} else { else {
checkCudaErrors( checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"));
cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"));
} }
void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x, void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x, (void *)&dimsB.x};
(void *)&dimsB.x};
// Execute the kernel // Execute the kernel
int nIter = 300; int nIter = 300;
for (int j = 0; j < nIter; j++) { for (int j = 0; j < nIter; j++) {
checkCudaErrors( checkCudaErrors(cuLaunchKernel(kernel_addr,
cuLaunchKernel(kernel_addr, grid.x, grid.y, grid.z, /* grid dim */ grid.x,
threads.x, threads.y, threads.z, /* block dim */ grid.y,
0, 0, /* shared mem, stream */ grid.z, /* grid dim */
threads.x,
threads.y,
threads.z, /* block dim */
0,
0, /* shared mem, stream */
&arr[0], /* arguments */ &arr[0], /* arguments */
0)); 0));
@ -157,16 +162,14 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA,
double rel_err = abs_err / abs_val / dot_length; double rel_err = abs_err / abs_val / dot_length;
if (rel_err > eps) { if (rel_err > eps) {
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
h_C[i], dimsA.x * valB, eps);
correct = false; correct = false;
} }
} }
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL"); printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
printf( printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
"\nNOTE: The CUDA Samples are not meant for performance measurements. "
"Results may vary when GPU Boost is enabled.\n"); "Results may vary when GPU Boost is enabled.\n");
// Clean up memory // Clean up memory
@ -180,7 +183,8 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA,
if (correct) { if (correct) {
return EXIT_SUCCESS; return EXIT_SUCCESS;
} else { }
else {
return EXIT_FAILURE; return EXIT_FAILURE;
} }
} }
@ -189,16 +193,15 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA,
* Program main * Program main
*/ */
int main(int argc, char **argv) { int main(int argc, char **argv)
{
printf("[Matrix Multiply Using CUDA] - Starting...\n"); printf("[Matrix Multiply Using CUDA] - Starting...\n");
if (checkCmdLineFlag(argc, (const char **)argv, "help") || if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) {
checkCmdLineFlag(argc, (const char **)argv, "?")) {
printf("Usage -device=n (n >= 0 for deviceID)\n"); printf("Usage -device=n (n >= 0 for deviceID)\n");
printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n"); printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n"); printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
printf( printf(" Note: Outer matrix dimensions of A & B matrices must be equal.\n");
" Note: Outer matrix dimensions of A & B matrices must be equal.\n");
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
} }
@ -234,13 +237,11 @@ int main(int argc, char **argv) {
} }
if (dimsA.x != dimsB.y) { if (dimsA.x != dimsB.y) {
printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
dimsA.x, dimsB.y);
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);
dimsB.y);
int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB); int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);

View File

@ -48,11 +48,10 @@
#include <cooperative_groups.h> #include <cooperative_groups.h>
template <int BLOCK_SIZE> template <int BLOCK_SIZE> __device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
__device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB) { {
// Handle to thread block group // Handle to thread block group
cooperative_groups::thread_block cta = cooperative_groups::thread_block cta = cooperative_groups::this_thread_block();
cooperative_groups::this_thread_block();
// Block index // Block index
int bx = blockIdx.x; int bx = blockIdx.x;
int by = blockIdx.y; int by = blockIdx.y;
@ -120,12 +119,12 @@ __device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB) {
C[c + wB * ty + tx] = Csub; C[c + wB * ty + tx] = Csub;
} }
extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B, extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B, int wA, int wB)
int wA, int wB) { {
matrixMulCUDA<16>(C, A, B, wA, wB); matrixMulCUDA<16>(C, A, B, wA, wB);
} }
extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B, extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B, int wA, int wB)
int wA, int wB) { {
matrixMulCUDA<32>(C, A, B, wA, wB); matrixMulCUDA<32>(C, A, B, wA, wB);
} }

View File

@ -28,12 +28,13 @@
#include <cooperative_groups.h> #include <cooperative_groups.h>
namespace cg = cooperative_groups; namespace cg = cooperative_groups;
#include <helper_cuda.h>
#include <assert.h> #include <assert.h>
#include <helper_cuda.h>
#include "mergeSort_common.h" #include "mergeSort_common.h"
inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB, inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB, uint &valB, uint arrowDir)
uint &valB, uint arrowDir) { {
uint t; uint t;
if ((keyA > keyB) == arrowDir) { if ((keyA > keyB) == arrowDir) {
@ -46,9 +47,9 @@ inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB,
} }
} }
__global__ void bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal, __global__ void
uint *d_SrcKey, uint *d_SrcVal, bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength, uint sortDir)
uint arrayLength, uint sortDir) { {
// Handle to thread block group // Handle to thread block group
cg::thread_block cta = cg::this_thread_block(); cg::thread_block cta = cg::this_thread_block();
// Shared memory storage for one or more short vectors // Shared memory storage for one or more short vectors
@ -62,10 +63,8 @@ __global__ void bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
s_key[threadIdx.x + 0] = d_SrcKey[0]; s_key[threadIdx.x + 0] = d_SrcKey[0];
s_val[threadIdx.x + 0] = d_SrcVal[0]; s_val[threadIdx.x + 0] = d_SrcVal[0];
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
d_SrcKey[(SHARED_SIZE_LIMIT / 2)]; s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
for (uint size = 2; size < arrayLength; size <<= 1) { for (uint size = 2; size < arrayLength; size <<= 1) {
// Bitonic merge // Bitonic merge
@ -74,8 +73,7 @@ __global__ void bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
for (uint stride = size / 2; stride > 0; stride >>= 1) { for (uint stride = size / 2; stride > 0; stride >>= 1) {
cg::sync(cta); cg::sync(cta);
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], dir);
s_val[pos + stride], dir);
} }
} }
@ -84,26 +82,25 @@ __global__ void bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
for (uint stride = arrayLength / 2; stride > 0; stride >>= 1) { for (uint stride = arrayLength / 2; stride > 0; stride >>= 1) {
cg::sync(cta); cg::sync(cta);
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], sortDir);
s_val[pos + stride], sortDir);
} }
} }
cg::sync(cta); cg::sync(cta);
d_DstKey[0] = s_key[threadIdx.x + 0]; d_DstKey[0] = s_key[threadIdx.x + 0];
d_DstVal[0] = s_val[threadIdx.x + 0]; d_DstVal[0] = s_val[threadIdx.x + 0];
d_DstKey[(SHARED_SIZE_LIMIT / 2)] = d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
d_DstVal[(SHARED_SIZE_LIMIT / 2)] =
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
} }
// Helper function (also used by odd-even merge sort) // Helper function (also used by odd-even merge sort)
extern "C" uint factorRadix2(uint *log2L, uint L) { extern "C" uint factorRadix2(uint *log2L, uint L)
{
if (!L) { if (!L) {
*log2L = 0; *log2L = 0;
return 0; return 0;
} else { }
else {
for (*log2L = 0; (L & 1) == 0; L >>= 1, *log2L++) for (*log2L = 0; (L & 1) == 0; L >>= 1, *log2L++)
; ;
@ -111,10 +108,14 @@ extern "C" uint factorRadix2(uint *log2L, uint L) {
} }
} }
extern "C" void bitonicSortShared(uint *d_DstKey, uint *d_DstVal, extern "C" void bitonicSortShared(uint *d_DstKey,
uint *d_SrcKey, uint *d_SrcVal, uint *d_DstVal,
uint batchSize, uint arrayLength, uint *d_SrcKey,
uint sortDir) { uint *d_SrcVal,
uint batchSize,
uint arrayLength,
uint sortDir)
{
// Nothing to sort // Nothing to sort
if (arrayLength < 2) { if (arrayLength < 2) {
return; return;
@ -131,32 +132,25 @@ extern "C" void bitonicSortShared(uint *d_DstKey, uint *d_DstVal,
assert(arrayLength <= SHARED_SIZE_LIMIT); assert(arrayLength <= SHARED_SIZE_LIMIT);
assert((batchSize * arrayLength) % SHARED_SIZE_LIMIT == 0); assert((batchSize * arrayLength) % SHARED_SIZE_LIMIT == 0);
bitonicSortSharedKernel<<<blockCount, threadCount>>>( bitonicSortSharedKernel<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, sortDir);
d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, sortDir);
getLastCudaError("bitonicSortSharedKernel<<<>>> failed!\n"); getLastCudaError("bitonicSortSharedKernel<<<>>> failed!\n");
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Merge step 3: merge elementary intervals // Merge step 3: merge elementary intervals
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
static inline __host__ __device__ uint iDivUp(uint a, uint b) { static inline __host__ __device__ uint iDivUp(uint a, uint b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); }
return ((a % b) == 0) ? (a / b) : (a / b + 1);
}
static inline __host__ __device__ uint getSampleCount(uint dividend) { static inline __host__ __device__ uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); }
return iDivUp(dividend, SAMPLE_STRIDE);
}
template <uint sortDir> template <uint sortDir>
static inline __device__ void ComparatorExtended(uint &keyA, uint &valA, static inline __device__ void
uint &flagA, uint &keyB, ComparatorExtended(uint &keyA, uint &valA, uint &flagA, uint &keyB, uint &valB, uint &flagB, uint arrowDir)
uint &valB, uint &flagB, {
uint arrowDir) {
uint t; uint t;
if ((!(flagA || flagB) && ((keyA > keyB) == arrowDir)) || if ((!(flagA || flagB) && ((keyA > keyB) == arrowDir)) || ((arrowDir == sortDir) && (flagA == 1))
((arrowDir == sortDir) && (flagA == 1)) || || ((arrowDir != sortDir) && (flagB == 1))) {
((arrowDir != sortDir) && (flagB == 1))) {
t = keyA; t = keyA;
keyA = keyB; keyA = keyB;
keyB = t; keyB = t;
@ -170,9 +164,15 @@ static inline __device__ void ComparatorExtended(uint &keyA, uint &valA,
} }
template <uint sortDir> template <uint sortDir>
__global__ void bitonicMergeElementaryIntervalsKernel( __global__ void bitonicMergeElementaryIntervalsKernel(uint *d_DstKey,
uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint *d_DstVal,
uint *d_LimitsA, uint *d_LimitsB, uint stride, uint N) { uint *d_SrcKey,
uint *d_SrcVal,
uint *d_LimitsA,
uint *d_LimitsB,
uint stride,
uint N)
{
// Handle to thread block group // Handle to thread block group
cg::thread_block cta = cg::this_thread_block(); cg::thread_block cta = cg::this_thread_block();
__shared__ uint s_key[2 * SAMPLE_STRIDE]; __shared__ uint s_key[2 * SAMPLE_STRIDE];
@ -200,10 +200,8 @@ __global__ void bitonicMergeElementaryIntervalsKernel(
startSrcB = d_LimitsB[blockIdx.x]; startSrcB = d_LimitsB[blockIdx.x];
startDst = startSrcA + startSrcB; startDst = startSrcA + startSrcB;
uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA;
: segmentElementsA; uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB;
uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1]
: segmentElementsB;
lenSrcA = endSrcA - startSrcA; lenSrcA = endSrcA - startSrcA;
lenSrcB = endSrcB - startSrcB; lenSrcB = endSrcB - startSrcB;
} }
@ -222,10 +220,8 @@ __global__ void bitonicMergeElementaryIntervalsKernel(
// Prepare for bitonic merge by inversing the ordering // Prepare for bitonic merge by inversing the ordering
if (threadIdx.x < lenSrcB) { if (threadIdx.x < lenSrcB) {
s_key[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = s_key[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcKey[stride + startSrcB + threadIdx.x];
d_SrcKey[stride + startSrcB + threadIdx.x]; s_val[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcVal[stride + startSrcB + threadIdx.x];
s_val[2 * SAMPLE_STRIDE - 1 - threadIdx.x] =
d_SrcVal[stride + startSrcB + threadIdx.x];
s_inf[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = 0; s_inf[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = 0;
} }
@ -233,9 +229,13 @@ __global__ void bitonicMergeElementaryIntervalsKernel(
for (uint stride = SAMPLE_STRIDE; stride > 0; stride >>= 1) { for (uint stride = SAMPLE_STRIDE; stride > 0; stride >>= 1) {
cg::sync(cta); cg::sync(cta);
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
ComparatorExtended<sortDir>(s_key[pos + 0], s_val[pos + 0], s_inf[pos + 0], ComparatorExtended<sortDir>(s_key[pos + 0],
s_key[pos + stride], s_val[pos + stride], s_val[pos + 0],
s_inf[pos + stride], sortDir); s_inf[pos + 0],
s_key[pos + stride],
s_val[pos + stride],
s_inf[pos + stride],
sortDir);
} }
// Store sorted data // Store sorted data
@ -254,26 +254,28 @@ __global__ void bitonicMergeElementaryIntervalsKernel(
} }
} }
extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey, uint *d_DstVal, extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey,
uint *d_SrcKey, uint *d_SrcVal, uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint *d_LimitsA, uint *d_LimitsA,
uint *d_LimitsB, uint stride, uint *d_LimitsB,
uint N, uint sortDir) { uint stride,
uint N,
uint sortDir)
{
uint lastSegmentElements = N % (2 * stride); uint lastSegmentElements = N % (2 * stride);
uint mergePairs = (lastSegmentElements > stride) uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
? getSampleCount(N)
: (N - lastSegmentElements) / SAMPLE_STRIDE;
if (sortDir) { if (sortDir) {
bitonicMergeElementaryIntervalsKernel<1U><<<mergePairs, SAMPLE_STRIDE>>>( bitonicMergeElementaryIntervalsKernel<1U>
d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
N);
getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n"); getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
} else { }
bitonicMergeElementaryIntervalsKernel<0U><<<mergePairs, SAMPLE_STRIDE>>>( else {
d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, bitonicMergeElementaryIntervalsKernel<0U>
N); <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n"); getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
} }
} }

View File

@ -26,17 +26,19 @@
*/ */
#include <assert.h> #include <assert.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_functions.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <cuda_runtime.h>
#include <helper_functions.h>
#include <helper_cuda.h>
#include "mergeSort_common.h" #include "mergeSort_common.h"
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Test driver // Test driver
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
{
uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal; uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal; uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
StopWatchInterface *hTimer = NULL; StopWatchInterface *hTimer = NULL;
@ -75,10 +77,8 @@ int main(int argc, char **argv) {
checkCudaErrors(cudaMalloc((void **)&d_BufVal, N * sizeof(uint))); checkCudaErrors(cudaMalloc((void **)&d_BufVal, N * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_SrcKey, N * sizeof(uint))); checkCudaErrors(cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_SrcVal, N * sizeof(uint))); checkCudaErrors(cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)));
checkCudaErrors( checkCudaErrors(cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice));
cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice));
checkCudaErrors(
cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice));
printf("Initializing GPU merge sort...\n"); printf("Initializing GPU merge sort...\n");
initMergeSort(); initMergeSort();
@ -93,10 +93,8 @@ int main(int argc, char **argv) {
printf("Time: %f ms\n", sdkGetTimerValue(&hTimer)); printf("Time: %f ms\n", sdkGetTimerValue(&hTimer));
printf("Reading back GPU merge sort results...\n"); printf("Reading back GPU merge sort results...\n");
checkCudaErrors( checkCudaErrors(cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost));
cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost)); checkCudaErrors(cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost));
checkCudaErrors(
cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost));
printf("Inspecting the results...\n"); printf("Inspecting the results...\n");
uint keysFlag = validateSortedKeys(h_DstKey, h_SrcKey, 1, N, numValues, DIR); uint keysFlag = validateSortedKeys(h_DstKey, h_SrcKey, 1, N, numValues, DIR);

View File

@ -39,21 +39,19 @@
namespace cg = cooperative_groups; namespace cg = cooperative_groups;
#include <helper_cuda.h> #include <helper_cuda.h>
#include "mergeSort_common.h" #include "mergeSort_common.h"
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Helper functions // Helper functions
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
static inline __host__ __device__ uint iDivUp(uint a, uint b) { static inline __host__ __device__ uint iDivUp(uint a, uint b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); }
return ((a % b) == 0) ? (a / b) : (a / b + 1);
}
static inline __host__ __device__ uint getSampleCount(uint dividend) { static inline __host__ __device__ uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); }
return iDivUp(dividend, SAMPLE_STRIDE);
}
#define W (sizeof(uint) * 8) #define W (sizeof(uint) * 8)
static inline __device__ uint nextPowerOfTwo(uint x) { static inline __device__ uint nextPowerOfTwo(uint x)
{
/* /*
--x; --x;
x |= x >> 1; x |= x >> 1;
@ -66,9 +64,8 @@ static inline __device__ uint nextPowerOfTwo(uint x) {
return 1U << (W - __clz(x - 1)); return 1U << (W - __clz(x - 1));
} }
template <uint sortDir> template <uint sortDir> static inline __device__ uint binarySearchInclusive(uint val, uint *data, uint L, uint stride)
static inline __device__ uint binarySearchInclusive(uint val, uint *data, {
uint L, uint stride) {
if (L == 0) { if (L == 0) {
return 0; return 0;
} }
@ -78,8 +75,7 @@ static inline __device__ uint binarySearchInclusive(uint val, uint *data,
for (; stride > 0; stride >>= 1) { for (; stride > 0; stride >>= 1) {
uint newPos = umin(pos + stride, L); uint newPos = umin(pos + stride, L);
if ((sortDir && (data[newPos - 1] <= val)) || if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))) {
(!sortDir && (data[newPos - 1] >= val))) {
pos = newPos; pos = newPos;
} }
} }
@ -87,9 +83,8 @@ static inline __device__ uint binarySearchInclusive(uint val, uint *data,
return pos; return pos;
} }
template <uint sortDir> template <uint sortDir> static inline __device__ uint binarySearchExclusive(uint val, uint *data, uint L, uint stride)
static inline __device__ uint binarySearchExclusive(uint val, uint *data, {
uint L, uint stride) {
if (L == 0) { if (L == 0) {
return 0; return 0;
} }
@ -99,8 +94,7 @@ static inline __device__ uint binarySearchExclusive(uint val, uint *data,
for (; stride > 0; stride >>= 1) { for (; stride > 0; stride >>= 1) {
uint newPos = umin(pos + stride, L); uint newPos = umin(pos + stride, L);
if ((sortDir && (data[newPos - 1] < val)) || if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))) {
(!sortDir && (data[newPos - 1] > val))) {
pos = newPos; pos = newPos;
} }
} }
@ -112,9 +106,8 @@ static inline __device__ uint binarySearchExclusive(uint val, uint *data,
// Bottom-level merge sort (binary search-based) // Bottom-level merge sort (binary search-based)
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <uint sortDir> template <uint sortDir>
__global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal, __global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength)
uint *d_SrcKey, uint *d_SrcVal, {
uint arrayLength) {
// Handle to thread block group // Handle to thread block group
cg::thread_block cta = cg::this_thread_block(); cg::thread_block cta = cg::this_thread_block();
__shared__ uint s_key[SHARED_SIZE_LIMIT]; __shared__ uint s_key[SHARED_SIZE_LIMIT];
@ -126,10 +119,8 @@ __global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
s_key[threadIdx.x + 0] = d_SrcKey[0]; s_key[threadIdx.x + 0] = d_SrcKey[0];
s_val[threadIdx.x + 0] = d_SrcVal[0]; s_val[threadIdx.x + 0] = d_SrcVal[0];
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
d_SrcKey[(SHARED_SIZE_LIMIT / 2)]; s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
for (uint stride = 1; stride < arrayLength; stride <<= 1) { for (uint stride = 1; stride < arrayLength; stride <<= 1) {
uint lPos = threadIdx.x & (stride - 1); uint lPos = threadIdx.x & (stride - 1);
@ -141,12 +132,8 @@ __global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
uint valA = baseVal[lPos + 0]; uint valA = baseVal[lPos + 0];
uint keyB = baseKey[lPos + stride]; uint keyB = baseKey[lPos + stride];
uint valB = baseVal[lPos + stride]; uint valB = baseVal[lPos + stride];
uint posA = uint posA = binarySearchExclusive<sortDir>(keyA, baseKey + stride, stride, stride) + lPos;
binarySearchExclusive<sortDir>(keyA, baseKey + stride, stride, stride) + uint posB = binarySearchInclusive<sortDir>(keyB, baseKey + 0, stride, stride) + lPos;
lPos;
uint posB =
binarySearchInclusive<sortDir>(keyB, baseKey + 0, stride, stride) +
lPos;
cg::sync(cta); cg::sync(cta);
baseKey[posA] = keyA; baseKey[posA] = keyA;
@ -158,15 +145,18 @@ __global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
cg::sync(cta); cg::sync(cta);
d_DstKey[0] = s_key[threadIdx.x + 0]; d_DstKey[0] = s_key[threadIdx.x + 0];
d_DstVal[0] = s_val[threadIdx.x + 0]; d_DstVal[0] = s_val[threadIdx.x + 0];
d_DstKey[(SHARED_SIZE_LIMIT / 2)] = d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
d_DstVal[(SHARED_SIZE_LIMIT / 2)] =
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
} }
static void mergeSortShared(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, static void mergeSortShared(uint *d_DstKey,
uint *d_SrcVal, uint batchSize, uint arrayLength, uint *d_DstVal,
uint sortDir) { uint *d_SrcKey,
uint *d_SrcVal,
uint batchSize,
uint arrayLength,
uint sortDir)
{
if (arrayLength < 2) { if (arrayLength < 2) {
return; return;
} }
@ -177,12 +167,11 @@ static void mergeSortShared(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey,
uint threadCount = SHARED_SIZE_LIMIT / 2; uint threadCount = SHARED_SIZE_LIMIT / 2;
if (sortDir) { if (sortDir) {
mergeSortSharedKernel<1U><<<blockCount, threadCount>>>( mergeSortSharedKernel<1U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
getLastCudaError("mergeSortShared<1><<<>>> failed\n"); getLastCudaError("mergeSortShared<1><<<>>> failed\n");
} else { }
mergeSortSharedKernel<0U><<<blockCount, threadCount>>>( else {
d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength); mergeSortSharedKernel<0U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
getLastCudaError("mergeSortShared<0><<<>>> failed\n"); getLastCudaError("mergeSortShared<0><<<>>> failed\n");
} }
} }
@ -191,9 +180,9 @@ static void mergeSortShared(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey,
// Merge step 1: generate sample ranks // Merge step 1: generate sample ranks
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <uint sortDir> template <uint sortDir>
__global__ void generateSampleRanksKernel(uint *d_RanksA, uint *d_RanksB, __global__ void
uint *d_SrcKey, uint stride, uint N, generateSampleRanksKernel(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, uint stride, uint N, uint threadCount)
uint threadCount) { {
uint pos = blockIdx.x * blockDim.x + threadIdx.x; uint pos = blockIdx.x * blockDim.x + threadIdx.x;
if (pos >= threadCount) { if (pos >= threadCount) {
@ -214,33 +203,30 @@ __global__ void generateSampleRanksKernel(uint *d_RanksA, uint *d_RanksB,
if (i < segmentSamplesA) { if (i < segmentSamplesA) {
d_RanksA[i] = i * SAMPLE_STRIDE; d_RanksA[i] = i * SAMPLE_STRIDE;
d_RanksB[i] = binarySearchExclusive<sortDir>( d_RanksB[i] = binarySearchExclusive<sortDir>(
d_SrcKey[i * SAMPLE_STRIDE], d_SrcKey + stride, segmentElementsB, d_SrcKey[i * SAMPLE_STRIDE], d_SrcKey + stride, segmentElementsB, nextPowerOfTwo(segmentElementsB));
nextPowerOfTwo(segmentElementsB));
} }
if (i < segmentSamplesB) { if (i < segmentSamplesB) {
d_RanksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE; d_RanksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
d_RanksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive<sortDir>( d_RanksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive<sortDir>(
d_SrcKey[stride + i * SAMPLE_STRIDE], d_SrcKey + 0, segmentElementsA, d_SrcKey[stride + i * SAMPLE_STRIDE], d_SrcKey + 0, segmentElementsA, nextPowerOfTwo(segmentElementsA));
nextPowerOfTwo(segmentElementsA));
} }
} }
static void generateSampleRanks(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, static void generateSampleRanks(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, uint stride, uint N, uint sortDir)
uint stride, uint N, uint sortDir) { {
uint lastSegmentElements = N % (2 * stride); uint lastSegmentElements = N % (2 * stride);
uint threadCount = uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
(lastSegmentElements > stride)
? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
: (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
if (sortDir) { if (sortDir) {
generateSampleRanksKernel<1U><<<iDivUp(threadCount, 256), 256>>>( generateSampleRanksKernel<1U>
d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount); <<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
getLastCudaError("generateSampleRanksKernel<1U><<<>>> failed\n"); getLastCudaError("generateSampleRanksKernel<1U><<<>>> failed\n");
} else { }
generateSampleRanksKernel<0U><<<iDivUp(threadCount, 256), 256>>>( else {
d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount); generateSampleRanksKernel<0U>
<<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
getLastCudaError("generateSampleRanksKernel<0U><<<>>> failed\n"); getLastCudaError("generateSampleRanksKernel<0U><<<>>> failed\n");
} }
} }
@ -248,9 +234,8 @@ static void generateSampleRanks(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey,
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Merge step 2: generate sample ranks and indices // Merge step 2: generate sample ranks and indices
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
__global__ void mergeRanksAndIndicesKernel(uint *d_Limits, uint *d_Ranks, __global__ void mergeRanksAndIndicesKernel(uint *d_Limits, uint *d_Ranks, uint stride, uint N, uint threadCount)
uint stride, uint N, {
uint threadCount) {
uint pos = blockIdx.x * blockDim.x + threadIdx.x; uint pos = blockIdx.x * blockDim.x + threadIdx.x;
if (pos >= threadCount) { if (pos >= threadCount) {
@ -269,36 +254,29 @@ __global__ void mergeRanksAndIndicesKernel(uint *d_Limits, uint *d_Ranks,
if (i < segmentSamplesA) { if (i < segmentSamplesA) {
uint dstPos = binarySearchExclusive<1U>( uint dstPos = binarySearchExclusive<1U>(
d_Ranks[i], d_Ranks + segmentSamplesA, segmentSamplesB, d_Ranks[i], d_Ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB))
nextPowerOfTwo(segmentSamplesB)) + + i;
i;
d_Limits[dstPos] = d_Ranks[i]; d_Limits[dstPos] = d_Ranks[i];
} }
if (i < segmentSamplesB) { if (i < segmentSamplesB) {
uint dstPos = binarySearchInclusive<1U>(d_Ranks[segmentSamplesA + i], uint dstPos = binarySearchInclusive<1U>(
d_Ranks, segmentSamplesA, d_Ranks[segmentSamplesA + i], d_Ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA))
nextPowerOfTwo(segmentSamplesA)) + + i;
i;
d_Limits[dstPos] = d_Ranks[segmentSamplesA + i]; d_Limits[dstPos] = d_Ranks[segmentSamplesA + i];
} }
} }
static void mergeRanksAndIndices(uint *d_LimitsA, uint *d_LimitsB, static void mergeRanksAndIndices(uint *d_LimitsA, uint *d_LimitsB, uint *d_RanksA, uint *d_RanksB, uint stride, uint N)
uint *d_RanksA, uint *d_RanksB, uint stride, {
uint N) {
uint lastSegmentElements = N % (2 * stride); uint lastSegmentElements = N % (2 * stride);
uint threadCount = uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
(lastSegmentElements > stride)
? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
: (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>( mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(d_LimitsA, d_RanksA, stride, N, threadCount);
d_LimitsA, d_RanksA, stride, N, threadCount);
getLastCudaError("mergeRanksAndIndicesKernel(A)<<<>>> failed\n"); getLastCudaError("mergeRanksAndIndicesKernel(A)<<<>>> failed\n");
mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>( mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(d_LimitsB, d_RanksB, stride, N, threadCount);
d_LimitsB, d_RanksB, stride, N, threadCount);
getLastCudaError("mergeRanksAndIndicesKernel(B)<<<>>> failed\n"); getLastCudaError("mergeRanksAndIndicesKernel(B)<<<>>> failed\n");
} }
@ -306,24 +284,30 @@ static void mergeRanksAndIndices(uint *d_LimitsA, uint *d_LimitsB,
// Merge step 3: merge elementary intervals // Merge step 3: merge elementary intervals
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <uint sortDir> template <uint sortDir>
inline __device__ void merge(uint *dstKey, uint *dstVal, uint *srcAKey, inline __device__ void merge(uint *dstKey,
uint *srcAVal, uint *srcBKey, uint *srcBVal, uint *dstVal,
uint lenA, uint nPowTwoLenA, uint lenB, uint *srcAKey,
uint nPowTwoLenB, cg::thread_block cta) { uint *srcAVal,
uint *srcBKey,
uint *srcBVal,
uint lenA,
uint nPowTwoLenA,
uint lenB,
uint nPowTwoLenB,
cg::thread_block cta)
{
uint keyA, valA, keyB, valB, dstPosA, dstPosB; uint keyA, valA, keyB, valB, dstPosA, dstPosB;
if (threadIdx.x < lenA) { if (threadIdx.x < lenA) {
keyA = srcAKey[threadIdx.x]; keyA = srcAKey[threadIdx.x];
valA = srcAVal[threadIdx.x]; valA = srcAVal[threadIdx.x];
dstPosA = binarySearchExclusive<sortDir>(keyA, srcBKey, lenB, nPowTwoLenB) + dstPosA = binarySearchExclusive<sortDir>(keyA, srcBKey, lenB, nPowTwoLenB) + threadIdx.x;
threadIdx.x;
} }
if (threadIdx.x < lenB) { if (threadIdx.x < lenB) {
keyB = srcBKey[threadIdx.x]; keyB = srcBKey[threadIdx.x];
valB = srcBVal[threadIdx.x]; valB = srcBVal[threadIdx.x];
dstPosB = binarySearchInclusive<sortDir>(keyB, srcAKey, lenA, nPowTwoLenA) + dstPosB = binarySearchInclusive<sortDir>(keyB, srcAKey, lenA, nPowTwoLenA) + threadIdx.x;
threadIdx.x;
} }
cg::sync(cta); cg::sync(cta);
@ -340,10 +324,15 @@ inline __device__ void merge(uint *dstKey, uint *dstVal, uint *srcAKey,
} }
template <uint sortDir> template <uint sortDir>
__global__ void mergeElementaryIntervalsKernel(uint *d_DstKey, uint *d_DstVal, __global__ void mergeElementaryIntervalsKernel(uint *d_DstKey,
uint *d_SrcKey, uint *d_SrcVal, uint *d_DstVal,
uint *d_LimitsA, uint *d_LimitsB, uint *d_SrcKey,
uint stride, uint N) { uint *d_SrcVal,
uint *d_LimitsA,
uint *d_LimitsB,
uint stride,
uint N)
{
// Handle to thread block group // Handle to thread block group
cg::thread_block cta = cg::this_thread_block(); cg::thread_block cta = cg::this_thread_block();
__shared__ uint s_key[2 * SAMPLE_STRIDE]; __shared__ uint s_key[2 * SAMPLE_STRIDE];
@ -368,10 +357,8 @@ __global__ void mergeElementaryIntervalsKernel(uint *d_DstKey, uint *d_DstVal,
startSrcA = d_LimitsA[blockIdx.x]; startSrcA = d_LimitsA[blockIdx.x];
startSrcB = d_LimitsB[blockIdx.x]; startSrcB = d_LimitsB[blockIdx.x];
uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA;
: segmentElementsA; uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB;
uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1]
: segmentElementsB;
lenSrcA = endSrcA - startSrcA; lenSrcA = endSrcA - startSrcA;
lenSrcB = endSrcB - startSrcB; lenSrcB = endSrcB - startSrcB;
startDstA = startSrcA + startSrcB; startDstA = startSrcA + startSrcB;
@ -387,17 +374,23 @@ __global__ void mergeElementaryIntervalsKernel(uint *d_DstKey, uint *d_DstVal,
} }
if (threadIdx.x < lenSrcB) { if (threadIdx.x < lenSrcB) {
s_key[threadIdx.x + SAMPLE_STRIDE] = s_key[threadIdx.x + SAMPLE_STRIDE] = d_SrcKey[stride + startSrcB + threadIdx.x];
d_SrcKey[stride + startSrcB + threadIdx.x]; s_val[threadIdx.x + SAMPLE_STRIDE] = d_SrcVal[stride + startSrcB + threadIdx.x];
s_val[threadIdx.x + SAMPLE_STRIDE] =
d_SrcVal[stride + startSrcB + threadIdx.x];
} }
// Merge data in shared memory // Merge data in shared memory
cg::sync(cta); cg::sync(cta);
merge<sortDir>(s_key, s_val, s_key + 0, s_val + 0, s_key + SAMPLE_STRIDE, merge<sortDir>(s_key,
s_val + SAMPLE_STRIDE, lenSrcA, SAMPLE_STRIDE, lenSrcB, s_val,
SAMPLE_STRIDE, cta); s_key + 0,
s_val + 0,
s_key + SAMPLE_STRIDE,
s_val + SAMPLE_STRIDE,
lenSrcA,
SAMPLE_STRIDE,
lenSrcB,
SAMPLE_STRIDE,
cta);
// Store merged data // Store merged data
cg::sync(cta); cg::sync(cta);
@ -413,63 +406,77 @@ __global__ void mergeElementaryIntervalsKernel(uint *d_DstKey, uint *d_DstVal,
} }
} }
/**
 * Host-side launcher for merge step 3 (merge elementary intervals).
 *
 * Launches mergeElementaryIntervalsKernel with one block of SAMPLE_STRIDE
 * threads per merge pair. The kernel takes the sort direction as a template
 * argument, so the runtime flag selects between the <1U> and <0U>
 * instantiations.
 *
 * @param d_DstKey   destination keys (device pointer)
 * @param d_DstVal   destination values (device pointer)
 * @param d_SrcKey   source keys (device pointer)
 * @param d_SrcVal   source values (device pointer)
 * @param d_LimitsA  interval limits for the first segment of each pair
 * @param d_LimitsB  interval limits for the second segment of each pair
 * @param stride     length of one sorted segment at this merge stage
 * @param N          total number of elements
 * @param sortDir    non-zero selects the <1U> kernel, zero the <0U> kernel
 */
static void mergeElementaryIntervals(uint *d_DstKey,
                                     uint *d_DstVal,
                                     uint *d_SrcKey,
                                     uint *d_SrcVal,
                                     uint *d_LimitsA,
                                     uint *d_LimitsB,
                                     uint  stride,
                                     uint  N,
                                     uint  sortDir)
{
    // Elements left over after the last complete pair of stride-sized segments.
    uint lastSegmentElements = N % (2 * stride);

    // A tail longer than one segment still forms a pair and is merged, so the
    // whole input contributes samples. A shorter tail is excluded here; the
    // caller copies it through unchanged.
    uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;

    if (sortDir) {
        mergeElementaryIntervalsKernel<1U>
            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
        getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
    }
    else {
        mergeElementaryIntervalsKernel<0U>
            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
        getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
    }
}
// Bitonic-sort entry points, declared with C linkage. Their definitions are
// not part of this section of the file.
extern "C" void bitonicSortShared(uint *d_DstKey,
                                  uint *d_DstVal,
                                  uint *d_SrcKey,
                                  uint *d_SrcVal,
                                  uint  batchSize,
                                  uint  arrayLength,
                                  uint  sortDir);

extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey,
                                                uint *d_DstVal,
                                                uint *d_SrcKey,
                                                uint *d_SrcVal,
                                                uint *d_LimitsA,
                                                uint *d_LimitsB,
                                                uint  stride,
                                                uint  N,
                                                uint  sortDir);

// Device scratch buffers for sample ranks and interval limits. Each one is
// allocated with MAX_SAMPLE_COUNT elements by initMergeSort() and released by
// closeMergeSort().
static uint *d_RanksA, *d_RanksB, *d_LimitsA, *d_LimitsB;

// Capacity, in uint elements, of each scratch buffer above.
static const uint MAX_SAMPLE_COUNT = 32768;
/**
 * Allocates the four device scratch buffers used by the merge stages
 * (d_RanksA, d_RanksB, d_LimitsA, d_LimitsB), each MAX_SAMPLE_COUNT uints.
 * Each allocation result is passed to checkCudaErrors. Pair with
 * closeMergeSort().
 */
extern "C" void initMergeSort(void)
{
    // All four buffers have the same size; compute it once.
    const size_t bufferBytes = MAX_SAMPLE_COUNT * sizeof(uint);

    checkCudaErrors(cudaMalloc((void **)&d_RanksA, bufferBytes));
    checkCudaErrors(cudaMalloc((void **)&d_RanksB, bufferBytes));
    checkCudaErrors(cudaMalloc((void **)&d_LimitsA, bufferBytes));
    checkCudaErrors(cudaMalloc((void **)&d_LimitsB, bufferBytes));
}
/**
 * Releases the device scratch buffers allocated by initMergeSort().
 * Each cudaFree result is passed to checkCudaErrors.
 */
extern "C" void closeMergeSort(void)
{
    checkCudaErrors(cudaFree(d_RanksA));
    checkCudaErrors(cudaFree(d_RanksB));
    checkCudaErrors(cudaFree(d_LimitsB));
    checkCudaErrors(cudaFree(d_LimitsA));
}
extern "C" void mergeSort(uint *d_DstKey, uint *d_DstVal, uint *d_BufKey, extern "C" void mergeSort(uint *d_DstKey,
uint *d_BufVal, uint *d_SrcKey, uint *d_SrcVal, uint *d_DstVal,
uint N, uint sortDir) { uint *d_BufKey,
uint *d_BufVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint N,
uint sortDir)
{
uint stageCount = 0; uint stageCount = 0;
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++) for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
@ -482,7 +489,8 @@ extern "C" void mergeSort(uint *d_DstKey, uint *d_DstVal, uint *d_BufKey,
ival = d_BufVal; ival = d_BufVal;
okey = d_DstKey; okey = d_DstKey;
oval = d_DstVal; oval = d_DstVal;
} else { }
else {
ikey = d_DstKey; ikey = d_DstKey;
ival = d_DstVal; ival = d_DstVal;
okey = d_BufKey; okey = d_BufKey;
@ -491,8 +499,7 @@ extern "C" void mergeSort(uint *d_DstKey, uint *d_DstVal, uint *d_BufKey,
assert(N <= (SAMPLE_STRIDE * MAX_SAMPLE_COUNT)); assert(N <= (SAMPLE_STRIDE * MAX_SAMPLE_COUNT));
assert(N % SHARED_SIZE_LIMIT == 0); assert(N % SHARED_SIZE_LIMIT == 0);
mergeSortShared(ikey, ival, d_SrcKey, d_SrcVal, N / SHARED_SIZE_LIMIT, mergeSortShared(ikey, ival, d_SrcKey, d_SrcVal, N / SHARED_SIZE_LIMIT, SHARED_SIZE_LIMIT, sortDir);
SHARED_SIZE_LIMIT, sortDir);
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) { for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
uint lastSegmentElements = N % (2 * stride); uint lastSegmentElements = N % (2 * stride);
@ -504,18 +511,19 @@ extern "C" void mergeSort(uint *d_DstKey, uint *d_DstVal, uint *d_BufKey,
mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, N); mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, N);
// Merge elementary intervals // Merge elementary intervals
mergeElementaryIntervals(okey, oval, ikey, ival, d_LimitsA, d_LimitsB, mergeElementaryIntervals(okey, oval, ikey, ival, d_LimitsA, d_LimitsB, stride, N, sortDir);
stride, N, sortDir);
if (lastSegmentElements <= stride) { if (lastSegmentElements <= stride) {
// Last merge segment consists of a single array which just needs to be // Last merge segment consists of a single array which just needs to be
// passed through // passed through
checkCudaErrors(cudaMemcpy( checkCudaErrors(cudaMemcpy(okey + (N - lastSegmentElements),
okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements), ikey + (N - lastSegmentElements),
lastSegmentElements * sizeof(uint), cudaMemcpyDeviceToDevice)); lastSegmentElements * sizeof(uint),
checkCudaErrors(cudaMemcpy( cudaMemcpyDeviceToDevice));
oval + (N - lastSegmentElements), ival + (N - lastSegmentElements), checkCudaErrors(cudaMemcpy(oval + (N - lastSegmentElements),
lastSegmentElements * sizeof(uint), cudaMemcpyDeviceToDevice)); ival + (N - lastSegmentElements),
lastSegmentElements * sizeof(uint),
cudaMemcpyDeviceToDevice));
} }
uint *t; uint *t;

View File

@ -36,14 +36,12 @@ typedef unsigned int uint;
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Extensive sort validation routine // Extensive sort validation routine
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, extern "C" uint
uint arrayLength, uint numValues, validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint sortDir);
uint sortDir);
extern "C" void fillValues(uint *val, uint N); extern "C" void fillValues(uint *val, uint N);
extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength);
uint batchSize, uint arrayLength);
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// CUDA merge sort // CUDA merge sort
@ -52,13 +50,11 @@ extern "C" void initMergeSort(void);
extern "C" void closeMergeSort(void); extern "C" void closeMergeSort(void);
extern "C" void mergeSort(uint *dstKey, uint *dstVal, uint *bufKey, extern "C" void
uint *bufVal, uint *srcKey, uint *srcVal, uint N, mergeSort(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir);
uint sortDir);
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// CPU "emulation" // CPU "emulation"
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, extern "C" void
uint *bufVal, uint *srcKey, uint *srcVal, uint N, mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir);
uint sortDir);

View File

@ -29,19 +29,20 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include "mergeSort_common.h" #include "mergeSort_common.h"
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Helper functions // Helper functions
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
static void checkOrder(uint *data, uint N, uint sortDir) { static void checkOrder(uint *data, uint N, uint sortDir)
{
if (N <= 1) { if (N <= 1) {
return; return;
} }
for (uint i = 0; i < N - 1; i++) for (uint i = 0; i < N - 1; i++)
if ((sortDir && (data[i] > data[i + 1])) || if ((sortDir && (data[i] > data[i + 1])) || (!sortDir && (data[i] < data[i + 1]))) {
(!sortDir && (data[i] < data[i + 1]))) {
fprintf(stderr, "checkOrder() failed!!!\n"); fprintf(stderr, "checkOrder() failed!!!\n");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
@ -49,12 +50,13 @@ static void checkOrder(uint *data, uint N, uint sortDir) {
// Returns the smaller of two unsigned values (returns a when they are equal).
static uint umin(uint a, uint b) { return (a <= b) ? a : b; }
// Number of SAMPLE_STRIDE-sized samples needed to cover `dividend` elements,
// i.e. dividend / SAMPLE_STRIDE rounded up.
static uint getSampleCount(uint dividend)
{
    return ((dividend % SAMPLE_STRIDE) != 0) ? (dividend / SAMPLE_STRIDE + 1) : (dividend / SAMPLE_STRIDE);
}
static uint nextPowerOfTwo(uint x) { static uint nextPowerOfTwo(uint x)
{
--x; --x;
x |= x >> 1; x |= x >> 1;
x |= x >> 2; x |= x >> 2;
@ -64,7 +66,8 @@ static uint nextPowerOfTwo(uint x) {
return ++x; return ++x;
} }
static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir) { static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir)
{
if (L == 0) { if (L == 0) {
return 0; return 0;
} }
@ -74,8 +77,7 @@ static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir) {
for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) { for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
uint newPos = umin(pos + stride, L); uint newPos = umin(pos + stride, L);
if ((sortDir && (data[newPos - 1] <= val)) || if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))) {
(!sortDir && (data[newPos - 1] >= val))) {
pos = newPos; pos = newPos;
} }
} }
@ -83,7 +85,8 @@ static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir) {
return pos; return pos;
} }
static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir) { static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir)
{
if (L == 0) { if (L == 0) {
return 0; return 0;
} }
@ -93,8 +96,7 @@ static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir) {
for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) { for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
uint newPos = umin(pos + stride, L); uint newPos = umin(pos + stride, L);
if ((sortDir && (data[newPos - 1] < val)) || if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))) {
(!sortDir && (data[newPos - 1] > val))) {
pos = newPos; pos = newPos;
} }
} }
@ -105,12 +107,10 @@ static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir) {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Merge step 1: find sample ranks in each segment // Merge step 1: find sample ranks in each segment
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
static void generateSampleRanks(uint *ranksA, uint *ranksB, uint *srcKey, static void generateSampleRanks(uint *ranksA, uint *ranksB, uint *srcKey, uint stride, uint N, uint sortDir)
uint stride, uint N, uint sortDir) { {
uint lastSegmentElements = N % (2 * stride); uint lastSegmentElements = N % (2 * stride);
uint sampleCount = uint sampleCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
(lastSegmentElements > stride)
? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
: (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
for (uint pos = 0; pos < sampleCount; pos++) { for (uint pos = 0; pos < sampleCount; pos++) {
@ -124,17 +124,14 @@ static void generateSampleRanks(uint *ranksA, uint *ranksB, uint *srcKey,
if (i < nA) { if (i < nA) {
ranksA[(segmentBase + 0) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE; ranksA[(segmentBase + 0) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
ranksB[(segmentBase + 0) / SAMPLE_STRIDE + i] = ranksB[(segmentBase + 0) / SAMPLE_STRIDE + i] = binarySearchExclusive(
binarySearchExclusive(srcKey[segmentBase + i * SAMPLE_STRIDE], srcKey[segmentBase + i * SAMPLE_STRIDE], srcKey + segmentBase + stride, lenB, sortDir);
srcKey + segmentBase + stride, lenB, sortDir);
} }
if (i < nB) { if (i < nB) {
ranksB[(segmentBase + stride) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE; ranksB[(segmentBase + stride) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
ranksA[(segmentBase + stride) / SAMPLE_STRIDE + i] = ranksA[(segmentBase + stride) / SAMPLE_STRIDE + i] = binarySearchInclusive(
binarySearchInclusive( srcKey[segmentBase + stride + i * SAMPLE_STRIDE], srcKey + segmentBase, lenA, sortDir);
srcKey[segmentBase + stride + i * SAMPLE_STRIDE],
srcKey + segmentBase, lenA, sortDir);
} }
} }
} }
@ -142,12 +139,10 @@ static void generateSampleRanks(uint *ranksA, uint *ranksB, uint *srcKey,
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Merge step 2: merge ranks and indices to derive elementary intervals // Merge step 2: merge ranks and indices to derive elementary intervals
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride, static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride, uint N)
uint N) { {
uint lastSegmentElements = N % (2 * stride); uint lastSegmentElements = N % (2 * stride);
uint sampleCount = uint sampleCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
(lastSegmentElements > stride)
? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
: (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
for (uint pos = 0; pos < sampleCount; pos++) { for (uint pos = 0; pos < sampleCount; pos++) {
@ -161,23 +156,20 @@ static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride,
if (i < nA) { if (i < nA) {
uint dstPosA = uint dstPosA =
binarySearchExclusive(ranks[(segmentBase + 0) / SAMPLE_STRIDE + i], binarySearchExclusive(
ranks + (segmentBase + stride) / SAMPLE_STRIDE, ranks[(segmentBase + 0) / SAMPLE_STRIDE + i], ranks + (segmentBase + stride) / SAMPLE_STRIDE, nB, 1)
nB, 1) + + i;
i;
assert(dstPosA < nA + nB); assert(dstPosA < nA + nB);
limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = ranks[(segmentBase + 0) / SAMPLE_STRIDE + i];
ranks[(segmentBase + 0) / SAMPLE_STRIDE + i];
} }
if (i < nB) { if (i < nB) {
uint dstPosA = binarySearchInclusive( uint dstPosA =
ranks[(segmentBase + stride) / SAMPLE_STRIDE + i], binarySearchInclusive(
ranks + (segmentBase + 0) / SAMPLE_STRIDE, nA, 1) + ranks[(segmentBase + stride) / SAMPLE_STRIDE + i], ranks + (segmentBase + 0) / SAMPLE_STRIDE, nA, 1)
i; + i;
assert(dstPosA < nA + nB); assert(dstPosA < nA + nB);
limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = ranks[(segmentBase + stride) / SAMPLE_STRIDE + i];
ranks[(segmentBase + stride) / SAMPLE_STRIDE + i];
} }
} }
} }
@ -185,9 +177,16 @@ static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride,
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Merge step 3: merge elementary intervals (each interval is <= SAMPLE_STRIDE) // Merge step 3: merge elementary intervals (each interval is <= SAMPLE_STRIDE)
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
static void merge(uint *dstKey, uint *dstVal, uint *srcAKey, uint *srcAVal, static void merge(uint *dstKey,
uint *srcBKey, uint *srcBVal, uint lenA, uint lenB, uint *dstVal,
uint sortDir) { uint *srcAKey,
uint *srcAVal,
uint *srcBKey,
uint *srcBVal,
uint lenA,
uint lenB,
uint sortDir)
{
checkOrder(srcAKey, lenA, sortDir); checkOrder(srcAKey, lenA, sortDir);
checkOrder(srcBKey, lenB, sortDir); checkOrder(srcBKey, lenB, sortDir);
@ -206,13 +205,18 @@ static void merge(uint *dstKey, uint *dstVal, uint *srcAKey, uint *srcAVal,
} }
} }
static void mergeElementaryIntervals(uint *dstKey, uint *dstVal, uint *srcKey, static void mergeElementaryIntervals(uint *dstKey,
uint *srcVal, uint *limitsA, uint *limitsB, uint *dstVal,
uint stride, uint N, uint sortDir) { uint *srcKey,
uint *srcVal,
uint *limitsA,
uint *limitsB,
uint stride,
uint N,
uint sortDir)
{
uint lastSegmentElements = N % (2 * stride); uint lastSegmentElements = N % (2 * stride);
uint mergePairs = (lastSegmentElements > stride) uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
? getSampleCount(N)
: (N - lastSegmentElements) / SAMPLE_STRIDE;
for (uint pos = 0; pos < mergePairs; pos++) { for (uint pos = 0; pos < mergePairs; pos++) {
uint i = pos & ((2 * stride) / SAMPLE_STRIDE - 1); uint i = pos & ((2 * stride) / SAMPLE_STRIDE - 1);
@ -240,15 +244,18 @@ static void mergeElementaryIntervals(uint *dstKey, uint *dstVal, uint *srcKey,
(srcKey + segmentBase + 0) + startPosA, (srcKey + segmentBase + 0) + startPosA,
(srcVal + segmentBase + 0) + startPosA, (srcVal + segmentBase + 0) + startPosA,
(srcKey + segmentBase + stride) + startPosB, (srcKey + segmentBase + stride) + startPosB,
(srcVal + segmentBase + stride) + startPosB, endPosA - startPosA, (srcVal + segmentBase + stride) + startPosB,
endPosB - startPosB, sortDir); endPosA - startPosA,
endPosB - startPosB,
sortDir);
} }
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Naive bubble sort
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
static void bubbleSort(uint *key, uint *val, uint N, uint sortDir) { static void bubbleSort(uint *key, uint *val, uint N, uint sortDir)
{
if (N <= 1) { if (N <= 1) {
return; return;
} }
@ -278,9 +285,9 @@ static void bubbleSort(uint *key, uint *val, uint N, uint sortDir) {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Interface function // Interface function
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, extern "C" void
uint *bufVal, uint *srcKey, uint *srcVal, uint N, mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir)
uint sortDir) { {
uint *ikey, *ival, *okey, *oval; uint *ikey, *ival, *okey, *oval;
uint stageCount = 0; uint stageCount = 0;
@ -292,7 +299,8 @@ extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey,
ival = bufVal; ival = bufVal;
okey = dstKey; okey = dstKey;
oval = dstVal; oval = dstVal;
} else { }
else {
ikey = dstKey; ikey = dstKey;
ival = dstVal; ival = dstVal;
okey = bufKey; okey = bufKey;
@ -304,8 +312,7 @@ extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey,
memcpy(ival, srcVal, N * sizeof(uint)); memcpy(ival, srcVal, N * sizeof(uint));
for (uint pos = 0; pos < N; pos += SHARED_SIZE_LIMIT) { for (uint pos = 0; pos < N; pos += SHARED_SIZE_LIMIT) {
bubbleSort(ikey + pos, ival + pos, umin(SHARED_SIZE_LIMIT, N - pos), bubbleSort(ikey + pos, ival + pos, umin(SHARED_SIZE_LIMIT, N - pos), sortDir);
sortDir);
} }
printf("Merge...\n"); printf("Merge...\n");
@ -329,16 +336,15 @@ extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey,
mergeRanksAndIndices(limitsB, ranksB, stride, N); mergeRanksAndIndices(limitsB, ranksB, stride, N);
// Merge elementary intervals // Merge elementary intervals
mergeElementaryIntervals(okey, oval, ikey, ival, limitsA, limitsB, stride, mergeElementaryIntervals(okey, oval, ikey, ival, limitsA, limitsB, stride, N, sortDir);
N, sortDir);
if (lastSegmentElements <= stride) { if (lastSegmentElements <= stride) {
// Last merge segment consists of a single array which just needs to be // Last merge segment consists of a single array which just needs to be
// passed through // passed through
memcpy(okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements), memcpy(
lastSegmentElements * sizeof(uint)); okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements), lastSegmentElements * sizeof(uint));
memcpy(oval + (N - lastSegmentElements), ival + (N - lastSegmentElements), memcpy(
lastSegmentElements * sizeof(uint)); oval + (N - lastSegmentElements), ival + (N - lastSegmentElements), lastSegmentElements * sizeof(uint));
} }
uint *t; uint *t;

View File

@ -29,14 +29,15 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include "mergeSort_common.h" #include "mergeSort_common.h"
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Validate sorted keys array (check for integrity and proper order) // Validate sorted keys array (check for integrity and proper order)
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, extern "C" uint
uint arrayLength, uint numValues, validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint sortDir)
uint sortDir) { {
uint *srcHist; uint *srcHist;
uint *resHist; uint *resHist;
@ -51,8 +52,7 @@ extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize,
int flag = 1; int flag = 1;
for (uint j = 0; j < batchSize; for (uint j = 0; j < batchSize; j++, srcKey += arrayLength, resKey += arrayLength) {
j++, srcKey += arrayLength, resKey += arrayLength) {
// Build histograms for keys arrays // Build histograms for keys arrays
memset(srcHist, 0, numValues * sizeof(uint)); memset(srcHist, 0, numValues * sizeof(uint));
memset(resHist, 0, numValues * sizeof(uint)); memset(resHist, 0, numValues * sizeof(uint));
@ -61,11 +61,9 @@ extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize,
if ((srcKey[i] < numValues) && (resKey[i] < numValues)) { if ((srcKey[i] < numValues) && (resKey[i] < numValues)) {
srcHist[srcKey[i]]++; srcHist[srcKey[i]]++;
resHist[resKey[i]]++; resHist[resKey[i]]++;
} else { }
fprintf( else {
stderr, fprintf(stderr, "***Set %u source/result key arrays are not limited properly***\n", j);
"***Set %u source/result key arrays are not limited properly***\n",
j);
flag = 0; flag = 0;
goto brk; goto brk;
} }
@ -74,18 +72,15 @@ extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize,
// Compare the histograms // Compare the histograms
for (uint i = 0; i < numValues; i++) for (uint i = 0; i < numValues; i++)
if (srcHist[i] != resHist[i]) { if (srcHist[i] != resHist[i]) {
fprintf(stderr, fprintf(stderr, "***Set %u source/result keys histograms do not match***\n", j);
"***Set %u source/result keys histograms do not match***\n", j);
flag = 0; flag = 0;
goto brk; goto brk;
} }
// Finally check the ordering // Finally check the ordering
for (uint i = 0; i < arrayLength - 1; i++) for (uint i = 0; i < arrayLength - 1; i++)
if ((sortDir && (resKey[i] > resKey[i + 1])) || if ((sortDir && (resKey[i] > resKey[i + 1])) || (!sortDir && (resKey[i] < resKey[i + 1]))) {
(!sortDir && (resKey[i] < resKey[i + 1]))) { fprintf(stderr, "***Set %u result key array is not ordered properly***\n", j);
fprintf(stderr,
"***Set %u result key array is not ordered properly***\n", j);
flag = 0; flag = 0;
goto brk; goto brk;
} }
@ -95,7 +90,8 @@ brk:
free(resHist); free(resHist);
free(srcHist); free(srcHist);
if (flag) printf("OK\n"); if (flag)
printf("OK\n");
return flag; return flag;
} }
@ -103,30 +99,30 @@ brk:
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Value validation / stability check routines // Value validation / stability check routines
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Fills val[0..N-1] with the identity sequence 0, 1, ..., N-1, so that each
// value records the original position of its element.
extern "C" void fillValues(uint *val, uint N)
{
    for (uint i = 0; i < N; i++)
        val[i] = i;
}
/**
 * Checks that sorted values still refer to their keys, and reports whether
 * the sort was stable.
 *
 * For each of the batchSize arrays of arrayLength elements:
 *  - correctness: resKey[j] must equal srcKey[resVal[j]]. srcKey is NOT
 *    advanced per batch, so the values are used as indices from the start of
 *    srcKey. NOTE(review): this presumably relies on values produced by
 *    fillValues() over the whole input; confirm against the caller. Values
 *    are not range-checked before indexing srcKey.
 *  - stability: neighbouring equal keys must have non-decreasing values.
 *
 * Prints the outcome of both checks to stdout.
 *
 * @return 1 if every key matches its source key, 0 otherwise. The stability
 *         result is only printed; it does not affect the return value.
 */
extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength)
{
    int correctFlag = 1, stableFlag = 1;

    printf("...inspecting keys and values array: ");

    for (uint i = 0; i < batchSize; i++, resKey += arrayLength, resVal += arrayLength) {
        for (uint j = 0; j < arrayLength; j++) {
            if (resKey[j] != srcKey[resVal[j]])
                correctFlag = 0;

            // Equal neighbouring keys with decreasing values mean the sort
            // reordered equal elements.
            if ((j + 1 < arrayLength) && (resKey[j] == resKey[j + 1]) && (resVal[j] > resVal[j + 1]))
                stableFlag = 0;
        }
    }

    printf(correctFlag ? "OK\n" : "***corrupted!!!***\n");
    printf(stableFlag ? "...stability property: stable!\n" : "...stability property: NOT stable\n");

    return correctFlag;
}

View File

@ -29,9 +29,9 @@
#include <stdio.h> #include <stdio.h>
// Includes CUDA // Includes CUDA
#include <cuda_runtime.h>
#include <cuda/barrier>
#include <cooperative_groups.h> #include <cooperative_groups.h>
#include <cuda/barrier>
#include <cuda_runtime.h>
// Utilities and timing functions // Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h #include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h
@ -43,9 +43,11 @@ namespace cg = cooperative_groups;
#if __CUDA_ARCH__ >= 700 #if __CUDA_ARCH__ >= 700
template <bool writeSquareRoot> template <bool writeSquareRoot>
__device__ void reduceBlockData( __device__ void reduceBlockData(cuda::barrier<cuda::thread_scope_block> &barrier,
cuda::barrier<cuda::thread_scope_block> &barrier, cg::thread_block_tile<32> &tile32,
cg::thread_block_tile<32> &tile32, double &threadSum, double *result) { double &threadSum,
double *result)
{
extern __shared__ double tmp[]; extern __shared__ double tmp[];
#pragma unroll #pragma unroll
@ -62,9 +64,7 @@ __device__ void reduceBlockData(
// The warp 0 will perform last round of reduction // The warp 0 will perform last round of reduction
if (tile32.meta_group_rank() == 0) { if (tile32.meta_group_rank() == 0) {
double beta = tile32.thread_rank() < tile32.meta_group_size() double beta = tile32.thread_rank() < tile32.meta_group_size() ? tmp[tile32.thread_rank()] : 0.0;
? tmp[tile32.thread_rank()]
: 0.0;
#pragma unroll #pragma unroll
for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
@ -81,8 +81,8 @@ __device__ void reduceBlockData(
} }
#endif #endif
__global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB, __global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB, double *partialResults, int size)
double *partialResults, int size) { {
#if __CUDA_ARCH__ >= 700 #if __CUDA_ARCH__ >= 700
#pragma diag_suppress static_var_with_dynamic_init #pragma diag_suppress static_var_with_dynamic_init
cg::thread_block cta = cg::this_thread_block(); cg::thread_block cta = cg::this_thread_block();
@ -105,8 +105,7 @@ __global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB,
// Each thread block performs reduction of partial dotProducts and writes to // Each thread block performs reduction of partial dotProducts and writes to
// global mem. // global mem.
reduceBlockData<false>(barrier, tile32, threadSum, reduceBlockData<false>(barrier, tile32, threadSum, &partialResults[blockIdx.x]);
&partialResults[blockIdx.x]);
cg::sync(grid); cg::sync(grid);
@ -137,15 +136,15 @@ int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId);
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
{
printf("%s starting...\n", argv[0]); printf("%s starting...\n", argv[0]);
// This will pick the best possible CUDA capable device // This will pick the best possible CUDA capable device
int dev = findCudaDevice(argc, (const char **)argv); int dev = findCudaDevice(argc, (const char **)argv);
int major = 0; int major = 0;
checkCudaErrors( checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
// Arrive-Wait Barrier require a GPU of Volta (SM7X) architecture or higher. // Arrive-Wait Barrier require a GPU of Volta (SM7X) architecture or higher.
if (major < 7) { if (major < 7) {
@ -154,12 +153,10 @@ int main(int argc, char **argv) {
} }
int supportsCooperativeLaunch = 0; int supportsCooperativeLaunch = 0;
checkCudaErrors(cudaDeviceGetAttribute(&supportsCooperativeLaunch, checkCudaErrors(cudaDeviceGetAttribute(&supportsCooperativeLaunch, cudaDevAttrCooperativeLaunch, dev));
cudaDevAttrCooperativeLaunch, dev));
if (!supportsCooperativeLaunch) { if (!supportsCooperativeLaunch) {
printf( printf("\nSelected GPU (%d) does not support Cooperative Kernel Launch, "
"\nSelected GPU (%d) does not support Cooperative Kernel Launch, "
"Waiving the run\n", "Waiving the run\n",
dev); dev);
exit(EXIT_WAIVED); exit(EXIT_WAIVED);
@ -171,7 +168,8 @@ int main(int argc, char **argv) {
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
} }
int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId) { int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId)
{
float *vecA, *d_vecA; float *vecA, *d_vecA;
float *vecB, *d_vecB; float *vecB, *d_vecB;
double *d_partialResults; double *d_partialResults;
@ -191,16 +189,14 @@ int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId) {
cudaStream_t stream; cudaStream_t stream;
checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
checkCudaErrors(cudaMemcpyAsync(d_vecA, vecA, sizeof(float) * size, checkCudaErrors(cudaMemcpyAsync(d_vecA, vecA, sizeof(float) * size, cudaMemcpyHostToDevice, stream));
cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_vecB, vecB, sizeof(float) * size, cudaMemcpyHostToDevice, stream));
checkCudaErrors(cudaMemcpyAsync(d_vecB, vecB, sizeof(float) * size,
cudaMemcpyHostToDevice, stream));
// Kernel configuration, where a one-dimensional // Kernel configuration, where a one-dimensional
// grid and one-dimensional blocks are configured. // grid and one-dimensional blocks are configured.
int minGridSize = 0, blockSize = 0; int minGridSize = 0, blockSize = 0;
checkCudaErrors(cudaOccupancyMaxPotentialBlockSize( checkCudaErrors(
&minGridSize, &blockSize, (void *)normVecByDotProductAWBarrier, 0, size)); cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)normVecByDotProductAWBarrier, 0, size));
int smemSize = ((blockSize / 32) + 1) * sizeof(double); int smemSize = ((blockSize / 32) + 1) * sizeof(double);
@ -209,28 +205,24 @@ int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId) {
&numBlocksPerSm, normVecByDotProductAWBarrier, blockSize, smemSize)); &numBlocksPerSm, normVecByDotProductAWBarrier, blockSize, smemSize));
int multiProcessorCount = 0; int multiProcessorCount = 0;
checkCudaErrors(cudaDeviceGetAttribute( checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId));
&multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId));
minGridSize = multiProcessorCount * numBlocksPerSm; minGridSize = multiProcessorCount * numBlocksPerSm;
checkCudaErrors(cudaMalloc(&d_partialResults, minGridSize * sizeof(double))); checkCudaErrors(cudaMalloc(&d_partialResults, minGridSize * sizeof(double)));
printf( printf("Launching normVecByDotProductAWBarrier kernel with numBlocks = %d "
"Launching normVecByDotProductAWBarrier kernel with numBlocks = %d "
"blockSize = %d\n", "blockSize = %d\n",
minGridSize, blockSize); minGridSize,
blockSize);
dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1); dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1);
void *kernelArgs[] = {(void *)&d_vecA, (void *)&d_vecB, void *kernelArgs[] = {(void *)&d_vecA, (void *)&d_vecB, (void *)&d_partialResults, (void *)&size};
(void *)&d_partialResults, (void *)&size};
checkCudaErrors( checkCudaErrors(cudaLaunchCooperativeKernel(
cudaLaunchCooperativeKernel((void *)normVecByDotProductAWBarrier, dimGrid, (void *)normVecByDotProductAWBarrier, dimGrid, dimBlock, kernelArgs, smemSize, stream));
dimBlock, kernelArgs, smemSize, stream));
checkCudaErrors(cudaMemcpyAsync(vecA, d_vecA, sizeof(float) * size, checkCudaErrors(cudaMemcpyAsync(vecA, d_vecA, sizeof(float) * size, cudaMemcpyDeviceToHost, stream));
cudaMemcpyDeviceToHost, stream));
checkCudaErrors(cudaStreamSynchronize(stream)); checkCudaErrors(cudaStreamSynchronize(stream));
float expectedResult = (baseVal / sqrt(size * baseVal * baseVal)); float expectedResult = (baseVal / sqrt(size * baseVal * baseVal));
@ -239,7 +231,8 @@ int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId) {
if ((vecA[i] - expectedResult) > 0.00001) { if ((vecA[i] - expectedResult) > 0.00001) {
printf("mismatch at i = %d\n", i); printf("mismatch at i = %d\n", i);
break; break;
} else { }
else {
matches++; matches++;
} }
} }

View File

@ -34,8 +34,8 @@
#endif #endif
// Includes, system // Includes, system
#include <stdio.h>
#include <cassert> #include <cassert>
#include <stdio.h>
// Includes CUDA // Includes CUDA
#include <cuda_runtime.h> #include <cuda_runtime.h>
@ -58,7 +58,8 @@ bool testResult = true;
//! Tests assert function. //! Tests assert function.
//! Thread whose id > N will print assertion failed error message. //! Thread whose id > N will print assertion failed error message.
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
__global__ void testKernel(int N) { __global__ void testKernel(int N)
{
int gtid = blockIdx.x * blockDim.x + threadIdx.x; int gtid = blockIdx.x * blockDim.x + threadIdx.x;
assert(gtid < N); assert(gtid < N);
} }
@ -70,17 +71,18 @@ void runTest(int argc, char **argv);
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
{
printf("%s starting...\n", sampleName); printf("%s starting...\n", sampleName);
runTest(argc, argv); runTest(argc, argv);
printf("%s completed, returned %s\n", sampleName, printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
testResult ? "OK" : "ERROR!");
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
} }
void runTest(int argc, char **argv) { void runTest(int argc, char **argv)
{
int Nblocks = 2; int Nblocks = 2;
int Nthreads = 32; int Nthreads = 32;
cudaError_t error; cudaError_t error;
@ -94,7 +96,8 @@ void runTest(int argc, char **argv) {
if (!strcasecmp(OS_System_Type.sysname, "Darwin")) { if (!strcasecmp(OS_System_Type.sysname, "Darwin")) {
printf("simpleAssert is not current supported on Mac OSX\n\n"); printf("simpleAssert is not current supported on Mac OSX\n\n");
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
} else { }
else {
printf("OS Info: <%s>\n\n", OS_System_Type.version); printf("OS Info: <%s>\n\n", OS_System_Type.version);
} }
@ -118,8 +121,7 @@ void runTest(int argc, char **argv) {
// Check for errors and failed asserts in asynchronous kernel launch. // Check for errors and failed asserts in asynchronous kernel launch.
if (error == cudaErrorAssert) { if (error == cudaErrorAssert) {
printf( printf("Device assert failed as expected, "
"Device assert failed as expected, "
"CUDA error message is: %s\n\n", "CUDA error message is: %s\n\n",
cudaGetErrorString(error)); cudaGetErrorString(error));
} }

View File

@ -34,11 +34,12 @@
#endif #endif
// Includes, system // Includes, system
#include <stdio.h>
#include <cassert> #include <cassert>
#include <stdio.h>
// Includes CUDA // Includes CUDA
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include "nvrtc_helper.h" #include "nvrtc_helper.h"
// Utilities and timing functions // Utilities and timing functions
@ -58,7 +59,8 @@ void runTest(int argc, char **argv);
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
{
printf("%s starting...\n", sampleName); printf("%s starting...\n", sampleName);
runTest(argc, argv); runTest(argc, argv);
@ -66,7 +68,8 @@ int main(int argc, char **argv) {
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
} }
void runTest(int argc, char **argv) { void runTest(int argc, char **argv)
{
int Nblocks = 2; int Nblocks = 2;
int Nthreads = 32; int Nthreads = 32;
@ -91,10 +94,15 @@ void runTest(int argc, char **argv) {
int count = 60; int count = 60;
void *args[] = {(void *)&count}; void *args[] = {(void *)&count};
checkCudaErrors(cuLaunchKernel( checkCudaErrors(cuLaunchKernel(kernel_addr,
kernel_addr, dimGrid.x, dimGrid.y, dimGrid.z, /* grid dim */ dimGrid.x,
dimBlock.x, dimBlock.y, dimBlock.z, /* block dim */ dimGrid.y,
0, 0, /* shared mem, stream */ dimGrid.z, /* grid dim */
dimBlock.x,
dimBlock.y,
dimBlock.z, /* block dim */
0,
0, /* shared mem, stream */
&args[0], /* arguments */ &args[0], /* arguments */
0)); 0));

View File

@ -32,7 +32,8 @@
//! Thread whose id > N will print assertion failed error message. //! Thread whose id > N will print assertion failed error message.
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
extern "C" __global__ void testKernel(int N) { extern "C" __global__ void testKernel(int N)
{
int gtid = blockIdx.x * blockDim.x + threadIdx.x; int gtid = blockIdx.x * blockDim.x + threadIdx.x;
assert(gtid < N); assert(gtid < N);
} }

View File

@ -30,10 +30,10 @@
*/ */
// includes, system // includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h> #include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32 #ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN #define WINDOWS_LEAN_AND_MEAN
@ -68,20 +68,21 @@ extern "C" bool computeGold(int *gpuData, const int len);
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
{
printf("%s starting...\n", sampleName); printf("%s starting...\n", sampleName);
runTest(argc, argv); runTest(argc, argv);
printf("%s completed, returned %s\n", sampleName, printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
testResult ? "OK" : "ERROR!");
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA //! Run a simple test for CUDA
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) { void runTest(int argc, char **argv)
{
cudaStream_t stream; cudaStream_t stream;
// This will pick the best possible CUDA capable device // This will pick the best possible CUDA capable device
findCudaDevice(argc, (const char **)argv); findCudaDevice(argc, (const char **)argv);
@ -100,7 +101,8 @@ void runTest(int argc, char **argv) {
checkCudaErrors(cudaMallocHost(&hOData, memSize)); checkCudaErrors(cudaMallocHost(&hOData, memSize));
// initialize the memory // initialize the memory
for (unsigned int i = 0; i < numData; i++) hOData[i] = 0; for (unsigned int i = 0; i < numData; i++)
hOData[i] = 0;
// To make the AND and XOR tests generate something other than 0... // To make the AND and XOR tests generate something other than 0...
hOData[8] = hOData[10] = 0xff; hOData[8] = hOData[10] = 0xff;
@ -110,15 +112,13 @@ void runTest(int argc, char **argv) {
int *dOData; int *dOData;
checkCudaErrors(cudaMalloc((void **)&dOData, memSize)); checkCudaErrors(cudaMalloc((void **)&dOData, memSize));
// copy host memory to device to initialize to zero // copy host memory to device to initialize to zero
checkCudaErrors( checkCudaErrors(cudaMemcpyAsync(dOData, hOData, memSize, cudaMemcpyHostToDevice, stream));
cudaMemcpyAsync(dOData, hOData, memSize, cudaMemcpyHostToDevice, stream));
// execute the kernel // execute the kernel
testKernel<<<numBlocks, numThreads, 0, stream>>>(dOData); testKernel<<<numBlocks, numThreads, 0, stream>>>(dOData);
// Copy result from device to host // Copy result from device to host
checkCudaErrors( checkCudaErrors(cudaMemcpyAsync(hOData, dOData, memSize, cudaMemcpyDeviceToHost, stream));
cudaMemcpyAsync(hOData, dOData, memSize, cudaMemcpyDeviceToHost, stream));
checkCudaErrors(cudaStreamSynchronize(stream)); checkCudaErrors(cudaStreamSynchronize(stream));
sdkStopTimer(&timer); sdkStopTimer(&timer);

View File

@ -42,7 +42,8 @@ extern "C" int computeGold(int *gpuData, const int len);
//! @param idata input data as provided to device //! @param idata input data as provided to device
//! @param len number of elements in reference / idata //! @param len number of elements in reference / idata
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int computeGold(int *gpuData, const int len) { int computeGold(int *gpuData, const int len)
{
int val = 0; int val = 0;
for (int i = 0; i < len; ++i) { for (int i = 0; i < len; ++i) {

View File

@ -35,7 +35,8 @@
//! @param g_idata input data in global memory //! @param g_idata input data in global memory
//! @param g_odata output data in global memory //! @param g_odata output data in global memory
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
__global__ void testKernel(int *g_odata) { __global__ void testKernel(int *g_odata)
{
// access thread id // access thread id
const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x; const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

View File

@ -30,10 +30,10 @@
*/ */
// includes, system // includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h> #include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32 #ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN #define WINDOWS_LEAN_AND_MEAN
@ -64,13 +64,13 @@ extern "C" bool computeGold(int *gpuData, const int len);
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
{
printf("%s starting...\n", sampleName); printf("%s starting...\n", sampleName);
runTest(argc, argv); runTest(argc, argv);
printf("%s completed, returned %s\n", sampleName, printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
testResult ? "OK" : "ERROR!");
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
} }
@ -79,7 +79,8 @@ int main(int argc, char **argv) {
//! Run a simple test for CUDA //! Run a simple test for CUDA
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) { void runTest(int argc, char **argv)
{
int dev = 0; int dev = 0;
char *cubin, *kernel_file; char *cubin, *kernel_file;
@ -106,7 +107,8 @@ void runTest(int argc, char **argv) {
int *hOData = (int *)malloc(memSize); int *hOData = (int *)malloc(memSize);
// initialize the memory // initialize the memory
for (unsigned int i = 0; i < numData; i++) hOData[i] = 0; for (unsigned int i = 0; i < numData; i++)
hOData[i] = 0;
// To make the AND and XOR tests generate something other than 0... // To make the AND and XOR tests generate something other than 0...
hOData[8] = hOData[10] = 0xff; hOData[8] = hOData[10] = 0xff;
@ -121,11 +123,15 @@ void runTest(int argc, char **argv) {
dim3 cudaGridSize(numBlocks, 1, 1); dim3 cudaGridSize(numBlocks, 1, 1);
void *arr[] = {(void *)&dOData}; void *arr[] = {(void *)&dOData};
checkCudaErrors(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y, checkCudaErrors(cuLaunchKernel(kernel_addr,
cudaGridSize.x,
cudaGridSize.y,
cudaGridSize.z, /* grid dim */ cudaGridSize.z, /* grid dim */
cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.x,
cudaBlockSize.y,
cudaBlockSize.z, /* block dim */ cudaBlockSize.z, /* block dim */
0, 0, /* shared mem, stream */ 0,
0, /* shared mem, stream */
&arr[0], /* arguments */ &arr[0], /* arguments */
0)); 0));

View File

@ -43,7 +43,8 @@ extern "C" int computeGold(int *gpuData, const int len);
//! @param len number of elements in reference / idata //! @param len number of elements in reference / idata
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int computeGold(int *gpuData, const int len) { int computeGold(int *gpuData, const int len)
{
int val = 0; int val = 0;
for (int i = 0; i < len; ++i) { for (int i = 0; i < len; ++i) {

View File

@ -36,7 +36,8 @@
//! @param g_odata output data in global memory //! @param g_odata output data in global memory
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
extern "C" __global__ void testKernel(int *g_odata) { extern "C" __global__ void testKernel(int *g_odata)
{
// access thread id // access thread id
const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x; const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

View File

@ -26,10 +26,10 @@
*/ */
// includes, system // includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h> #include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// includes CUDA // includes CUDA
#include <cuda_runtime.h> #include <cuda_runtime.h>
@ -42,7 +42,8 @@
// declaration, forward // declaration, forward
void runTest(int argc, char **argv); void runTest(int argc, char **argv);
cudaAccessPolicyWindow initAccessPolicyWindow(void) { cudaAccessPolicyWindow initAccessPolicyWindow(void)
{
cudaAccessPolicyWindow accessPolicyWindow = {0}; cudaAccessPolicyWindow accessPolicyWindow = {0};
accessPolicyWindow.base_ptr = (void *)0; accessPolicyWindow.base_ptr = (void *)0;
accessPolicyWindow.num_bytes = 0; accessPolicyWindow.num_bytes = 0;
@ -60,8 +61,8 @@ cudaAccessPolicyWindow initAccessPolicyWindow(void) {
//! @param bigDataSize input bigData size //! @param bigDataSize input bigData size
//! @param hitcount how many data access are done within block //! @param hitcount how many data access are done within block
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
static __global__ void kernCacheSegmentTest(int *data, int dataSize, int *trash, static __global__ void kernCacheSegmentTest(int *data, int dataSize, int *trash, int bigDataSize, int hitCount)
int bigDataSize, int hitCount) { {
__shared__ unsigned int hit; __shared__ unsigned int hit;
int row = blockIdx.y * blockDim.y + threadIdx.y; int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x; int col = blockIdx.x * blockDim.x + threadIdx.x;
@ -82,9 +83,9 @@ static __global__ void kernCacheSegmentTest(int *data, int dataSize, int *trash,
if ((tID % 2) == 0) { if ((tID % 2) == 0) {
data[psRand % dataSize] = data[psRand % dataSize] + data[idx % dataSize]; data[psRand % dataSize] = data[psRand % dataSize] + data[idx % dataSize];
} else { }
trash[psRand % bigDataSize] = else {
trash[psRand % bigDataSize] + trash[idx % bigDataSize]; trash[psRand % bigDataSize] = trash[psRand % bigDataSize] + trash[idx % bigDataSize];
} }
atomicAdd(&hit, 1); atomicAdd(&hit, 1);
@ -98,7 +99,8 @@ int main(int argc, char **argv) { runTest(argc, argv); }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA //! Run a simple test for CUDA
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) { void runTest(int argc, char **argv)
{
bool bTestResult = true; bool bTestResult = true;
cudaAccessPolicyWindow accessPolicyWindow; cudaAccessPolicyWindow accessPolicyWindow;
cudaDeviceProp deviceProp; cudaDeviceProp deviceProp;
@ -127,8 +129,7 @@ void runTest(int argc, char **argv) {
// Make sure device the l2 optimization // Make sure device the l2 optimization
if (deviceProp.persistingL2CacheMaxSize == 0) { if (deviceProp.persistingL2CacheMaxSize == 0) {
printf( printf("Waiving execution as device %d does not support persisting L2 "
"Waiving execution as device %d does not support persisting L2 "
"Caching\n", "Caching\n",
devID); devID);
exit(EXIT_WAIVED); exit(EXIT_WAIVED);
@ -139,8 +140,7 @@ void runTest(int argc, char **argv) {
// Set the amount of l2 cache that will be persisting to maximum the device // Set the amount of l2 cache that will be persisting to maximum the device
// can support // can support
checkCudaErrors(cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, checkCudaErrors(cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, deviceProp.persistingL2CacheMaxSize));
deviceProp.persistingL2CacheMaxSize));
// Stream attribute to set // Stream attribute to set
streamAttrID = cudaStreamAttributeAccessPolicyWindow; streamAttrID = cudaStreamAttributeAccessPolicyWindow;
@ -155,8 +155,7 @@ void runTest(int argc, char **argv) {
// Allocate data // Allocate data
checkCudaErrors(cudaMallocHost(&dataHostPointer, dataSize * sizeof(int))); checkCudaErrors(cudaMallocHost(&dataHostPointer, dataSize * sizeof(int)));
checkCudaErrors( checkCudaErrors(cudaMallocHost(&bigDataHostPointer, bigDataSize * sizeof(int)));
cudaMallocHost(&bigDataHostPointer, bigDataSize * sizeof(int)));
for (int i = 0; i < bigDataSize; ++i) { for (int i = 0; i < bigDataSize; ++i) {
if (i < dataSize) { if (i < dataSize) {
@ -166,16 +165,12 @@ void runTest(int argc, char **argv) {
bigDataHostPointer[bigDataSize - i - 1] = i; bigDataHostPointer[bigDataSize - i - 1] = i;
} }
checkCudaErrors(cudaMalloc((void **)&dataDevicePointer, dataSize * sizeof(int)));
checkCudaErrors(cudaMalloc((void **)&bigDataDevicePointer, bigDataSize * sizeof(int)));
checkCudaErrors( checkCudaErrors(
cudaMalloc((void **)&dataDevicePointer, dataSize * sizeof(int))); cudaMemcpyAsync(dataDevicePointer, dataHostPointer, dataSize * sizeof(int), cudaMemcpyHostToDevice, stream));
checkCudaErrors( checkCudaErrors(cudaMemcpyAsync(
cudaMalloc((void **)&bigDataDevicePointer, bigDataSize * sizeof(int))); bigDataDevicePointer, bigDataHostPointer, bigDataSize * sizeof(int), cudaMemcpyHostToDevice, stream));
checkCudaErrors(cudaMemcpyAsync(dataDevicePointer, dataHostPointer,
dataSize * sizeof(int),
cudaMemcpyHostToDevice, stream));
checkCudaErrors(cudaMemcpyAsync(bigDataDevicePointer, bigDataHostPointer,
bigDataSize * sizeof(int),
cudaMemcpyHostToDevice, stream));
// Make a window for the buffer of interest // Make a window for the buffer of interest
accessPolicyWindow.base_ptr = (void *)dataDevicePointer; accessPolicyWindow.base_ptr = (void *)dataDevicePointer;
@ -186,8 +181,7 @@ void runTest(int argc, char **argv) {
streamAttrValue.accessPolicyWindow = accessPolicyWindow; streamAttrValue.accessPolicyWindow = accessPolicyWindow;
// Assign window to stream // Assign window to stream
checkCudaErrors( checkCudaErrors(cudaStreamSetAttribute(stream, streamAttrID, &streamAttrValue));
cudaStreamSetAttribute(stream, streamAttrID, &streamAttrValue));
// Demote any previous persisting lines // Demote any previous persisting lines
checkCudaErrors(cudaCtxResetPersistingL2Cache()); checkCudaErrors(cudaCtxResetPersistingL2Cache());

View File

@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in [Dependencies]() section above are installed. Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## References (for more details) ## References (for more details)

View File

@ -50,8 +50,8 @@
#endif #endif
// CUDA includes // CUDA includes
#include <cuda_runtime.h>
#include <cuda_gl_interop.h> #include <cuda_gl_interop.h>
#include <cuda_runtime.h>
// CUDA utilities and system includes // CUDA utilities and system includes
#include <helper_cuda.h> #include <helper_cuda.h>
@ -124,8 +124,7 @@ StopWatchInterface *timer = NULL;
GLuint shDraw; GLuint shDraw;
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes, extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes, unsigned int *g_odata, int imgw);
unsigned int *g_odata, int imgw);
// Forward declarations // Forward declarations
void runStdProgram(int argc, char **argv); void runStdProgram(int argc, char **argv);
@ -140,8 +139,7 @@ void createPBO(GLuint *pbo, struct cudaGraphicsResource **pbo_resource);
void deletePBO(GLuint *pbo); void deletePBO(GLuint *pbo);
#endif #endif
void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x, void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x, unsigned int size_y);
unsigned int size_y);
void deleteTexture(GLuint *tex); void deleteTexture(GLuint *tex);
// rendering callbacks // rendering callbacks
@ -155,7 +153,8 @@ void mainMenu(int i);
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Create PBO //! Create PBO
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void createPBO(GLuint *pbo, struct cudaGraphicsResource **pbo_resource) { void createPBO(GLuint *pbo, struct cudaGraphicsResource **pbo_resource)
{
// set up vertex data parameter // set up vertex data parameter
num_texels = image_width * image_height; num_texels = image_width * image_height;
num_values = num_texels * 4; num_values = num_texels * 4;
@ -171,33 +170,32 @@ void createPBO(GLuint *pbo, struct cudaGraphicsResource **pbo_resource) {
glBindBuffer(GL_ARRAY_BUFFER, 0); glBindBuffer(GL_ARRAY_BUFFER, 0);
// register this buffer object with CUDA // register this buffer object with CUDA
checkCudaErrors(cudaGraphicsGLRegisterBuffer(pbo_resource, *pbo, checkCudaErrors(cudaGraphicsGLRegisterBuffer(pbo_resource, *pbo, cudaGraphicsMapFlagsNone));
cudaGraphicsMapFlagsNone));
SDK_CHECK_ERROR_GL(); SDK_CHECK_ERROR_GL();
} }
void deletePBO(GLuint *pbo) { void deletePBO(GLuint *pbo)
{
glDeleteBuffers(1, pbo); glDeleteBuffers(1, pbo);
SDK_CHECK_ERROR_GL(); SDK_CHECK_ERROR_GL();
*pbo = 0; *pbo = 0;
} }
#endif #endif
const GLenum fbo_targets[] = { const GLenum fbo_targets[] = {GL_COLOR_ATTACHMENT0_EXT,
GL_COLOR_ATTACHMENT0_EXT, GL_COLOR_ATTACHMENT1_EXT, GL_COLOR_ATTACHMENT1_EXT,
GL_COLOR_ATTACHMENT2_EXT, GL_COLOR_ATTACHMENT3_EXT}; GL_COLOR_ATTACHMENT2_EXT,
GL_COLOR_ATTACHMENT3_EXT};
#ifndef USE_TEXSUBIMAGE2D #ifndef USE_TEXSUBIMAGE2D
static const char *glsl_drawtex_vertshader_src = static const char *glsl_drawtex_vertshader_src = "void main(void)\n"
"void main(void)\n"
"{\n" "{\n"
" gl_Position = gl_Vertex;\n" " gl_Position = gl_Vertex;\n"
" gl_TexCoord[0].xy = gl_MultiTexCoord0.xy;\n" " gl_TexCoord[0].xy = gl_MultiTexCoord0.xy;\n"
"}\n"; "}\n";
static const char *glsl_drawtex_fragshader_src = static const char *glsl_drawtex_fragshader_src = "#version 130\n"
"#version 130\n"
"uniform usampler2D texImage;\n" "uniform usampler2D texImage;\n"
"void main()\n" "void main()\n"
"{\n" "{\n"
@ -227,15 +225,15 @@ static const char *glsl_draw_fragshader_src =
#endif #endif
// copy image and process using CUDA // copy image and process using CUDA
void generateCUDAImage() { void generateCUDAImage()
{
// run the Cuda kernel // run the Cuda kernel
unsigned int *out_data; unsigned int *out_data;
#ifdef USE_TEXSUBIMAGE2D #ifdef USE_TEXSUBIMAGE2D
checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_dest_resource, 0)); checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_dest_resource, 0));
size_t num_bytes; size_t num_bytes;
checkCudaErrors(cudaGraphicsResourceGetMappedPointer( checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&out_data, &num_bytes, cuda_pbo_dest_resource));
(void **)&out_data, &num_bytes, cuda_pbo_dest_resource));
// printf("CUDA mapped pointer of pbo_out: May access %ld bytes, expected %d\n", // printf("CUDA mapped pointer of pbo_out: May access %ld bytes, expected %d\n",
// num_bytes, size_tex_data); // num_bytes, size_tex_data);
#else #else
@ -258,8 +256,7 @@ void generateCUDAImage() {
glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo_dest); glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo_dest);
glBindTexture(GL_TEXTURE_2D, tex_cudaResult); glBindTexture(GL_TEXTURE_2D, tex_cudaResult);
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, image_width, image_height, GL_RGBA, glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, image_width, image_height, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
GL_UNSIGNED_BYTE, NULL);
SDK_CHECK_ERROR_GL(); SDK_CHECK_ERROR_GL();
glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, 0); glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, 0);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
@ -268,21 +265,20 @@ void generateCUDAImage() {
// map buffer objects to get CUDA device pointers // map buffer objects to get CUDA device pointers
cudaArray *texture_ptr; cudaArray *texture_ptr;
checkCudaErrors(cudaGraphicsMapResources(1, &cuda_tex_result_resource, 0)); checkCudaErrors(cudaGraphicsMapResources(1, &cuda_tex_result_resource, 0));
checkCudaErrors(cudaGraphicsSubResourceGetMappedArray( checkCudaErrors(cudaGraphicsSubResourceGetMappedArray(&texture_ptr, cuda_tex_result_resource, 0, 0));
&texture_ptr, cuda_tex_result_resource, 0, 0));
int num_texels = image_width * image_height; int num_texels = image_width * image_height;
int num_values = num_texels * 4; int num_values = num_texels * 4;
int size_tex_data = sizeof(GLubyte) * num_values; int size_tex_data = sizeof(GLubyte) * num_values;
checkCudaErrors(cudaMemcpyToArray(texture_ptr, 0, 0, cuda_dest_resource, checkCudaErrors(cudaMemcpyToArray(texture_ptr, 0, 0, cuda_dest_resource, size_tex_data, cudaMemcpyDeviceToDevice));
size_tex_data, cudaMemcpyDeviceToDevice));
checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_tex_result_resource, 0)); checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_tex_result_resource, 0));
#endif #endif
} }
// display image to the screen as textured quad // display image to the screen as textured quad
void displayImage(GLuint texture) { void displayImage(GLuint texture)
{
glBindTexture(GL_TEXTURE_2D, texture); glBindTexture(GL_TEXTURE_2D, texture);
glEnable(GL_TEXTURE_2D); glEnable(GL_TEXTURE_2D);
glDisable(GL_DEPTH_TEST); glDisable(GL_DEPTH_TEST);
@ -332,7 +328,8 @@ void displayImage(GLuint texture) {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Display callback //! Display callback
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void display() { void display()
{
sdkStartTimer(&timer); sdkStartTimer(&timer);
if (enable_cuda) { if (enable_cuda) {
@ -358,9 +355,7 @@ void display() {
sprintf(currentOutputPPM, "kilt.ppm"); sprintf(currentOutputPPM, "kilt.ppm");
g_CheckRender->savePPM(currentOutputPPM, true, NULL); g_CheckRender->savePPM(currentOutputPPM, true, NULL);
if (!g_CheckRender->PPMvsPPM(currentOutputPPM, if (!g_CheckRender->PPMvsPPM(currentOutputPPM, sdkFindFilePath(ref_file, pArgv[0]), MAX_EPSILON, 0.30f)) {
sdkFindFilePath(ref_file, pArgv[0]),
MAX_EPSILON, 0.30f)) {
g_TotalErrors++; g_TotalErrors++;
} }
@ -374,8 +369,7 @@ void display() {
if (++fpsCount == fpsLimit) { if (++fpsCount == fpsLimit) {
char cTitle[256]; char cTitle[256];
float fps = 1000.0f / sdkGetAverageTimerValue(&timer); float fps = 1000.0f / sdkGetAverageTimerValue(&timer);
sprintf(cTitle, "CUDA GL Post Processing (%d x %d): %.1f fps", window_width, sprintf(cTitle, "CUDA GL Post Processing (%d x %d): %.1f fps", window_width, window_height, fps);
window_height, fps);
glutSetWindowTitle(cTitle); glutSetWindowTitle(cTitle);
// printf("%s\n", cTitle); // printf("%s\n", cTitle);
fpsCount = 0; fpsCount = 0;
@ -384,7 +378,8 @@ void display() {
} }
} }
void timerEvent(int value) { void timerEvent(int value)
{
glutPostRedisplay(); glutPostRedisplay();
glutTimerFunc(REFRESH_DELAY, timerEvent, 0); glutTimerFunc(REFRESH_DELAY, timerEvent, 0);
} }
@ -392,7 +387,8 @@ void timerEvent(int value) {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Keyboard events handler //! Keyboard events handler
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void keyboard(unsigned char key, int /*x*/, int /*y*/) { void keyboard(unsigned char key, int /*x*/, int /*y*/)
{
switch (key) { switch (key) {
case (27): case (27):
Cleanup(EXIT_SUCCESS); Cleanup(EXIT_SUCCESS);
@ -404,7 +400,8 @@ void keyboard(unsigned char key, int /*x*/, int /*y*/) {
if (enable_cuda) { if (enable_cuda) {
glClearColorIuiEXT(128, 128, 128, 255); glClearColorIuiEXT(128, 128, 128, 255);
} else { }
else {
glClearColor(0.5, 0.5, 0.5, 1.0); glClearColor(0.5, 0.5, 0.5, 1.0);
} }
@ -413,7 +410,8 @@ void keyboard(unsigned char key, int /*x*/, int /*y*/) {
} }
} }
void reshape(int w, int h) { void reshape(int w, int h)
{
window_width = w; window_width = w;
window_height = h; window_height = h;
} }
@ -423,8 +421,8 @@ void mainMenu(int i) { keyboard((unsigned char)i, 0, 0); }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! //!
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x, void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x, unsigned int size_y)
unsigned int size_y) { {
// create a texture // create a texture
glGenTextures(1, tex_cudaResult); glGenTextures(1, tex_cudaResult);
glBindTexture(GL_TEXTURE_2D, *tex_cudaResult); glBindTexture(GL_TEXTURE_2D, *tex_cudaResult);
@ -436,24 +434,22 @@ void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x,
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
#ifdef USE_TEXSUBIMAGE2D #ifdef USE_TEXSUBIMAGE2D
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, size_x, size_y, 0, GL_RGBA, glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, size_x, size_y, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
GL_UNSIGNED_BYTE, NULL);
SDK_CHECK_ERROR_GL(); SDK_CHECK_ERROR_GL();
#else #else
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8UI_EXT, size_x, size_y, 0, glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8UI_EXT, size_x, size_y, 0, GL_RGBA_INTEGER_EXT, GL_UNSIGNED_BYTE, NULL);
GL_RGBA_INTEGER_EXT, GL_UNSIGNED_BYTE, NULL);
SDK_CHECK_ERROR_GL(); SDK_CHECK_ERROR_GL();
// register this texture with CUDA // register this texture with CUDA
checkCudaErrors(cudaGraphicsGLRegisterImage( checkCudaErrors(cudaGraphicsGLRegisterImage(
&cuda_tex_result_resource, *tex_cudaResult, GL_TEXTURE_2D, &cuda_tex_result_resource, *tex_cudaResult, GL_TEXTURE_2D, cudaGraphicsMapFlagsWriteDiscard));
cudaGraphicsMapFlagsWriteDiscard));
#endif #endif
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! //!
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void deleteTexture(GLuint *tex) { void deleteTexture(GLuint *tex)
{
glDeleteTextures(1, tex); glDeleteTextures(1, tex);
SDK_CHECK_ERROR_GL(); SDK_CHECK_ERROR_GL();
@ -463,7 +459,8 @@ void deleteTexture(GLuint *tex) {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
{
#if defined(__linux__) #if defined(__linux__)
char *Xstatus = getenv("DISPLAY"); char *Xstatus = getenv("DISPLAY");
if (Xstatus == NULL) { if (Xstatus == NULL) {
@ -487,8 +484,7 @@ int main(int argc, char **argv) {
if (checkCmdLineFlag(argc, (const char **)argv, "device")) { if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
printf("[%s]\n", argv[0]); printf("[%s]\n", argv[0]);
printf(" Does not explicitly support -device=n\n"); printf(" Does not explicitly support -device=n\n");
printf( printf(" This sample requires OpenGL. Only -file=<reference> are "
" This sample requires OpenGL. Only -file=<reference> are "
"supported\n"); "supported\n");
printf("exiting...\n"); printf("exiting...\n");
exit(EXIT_WAIVED); exit(EXIT_WAIVED);
@ -497,7 +493,8 @@ int main(int argc, char **argv) {
if (ref_file) { if (ref_file) {
printf("(Test with OpenGL verification)\n"); printf("(Test with OpenGL verification)\n");
runStdProgram(argc, argv); runStdProgram(argc, argv);
} else { }
else {
printf("(Interactive OpenGL Demo)\n"); printf("(Interactive OpenGL Demo)\n");
runStdProgram(argc, argv); runStdProgram(argc, argv);
} }
@ -508,7 +505,8 @@ int main(int argc, char **argv) {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! //!
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void FreeResource() { void FreeResource()
{
sdkDeleteTimer(&timer); sdkDeleteTimer(&timer);
// unregister this buffer object with CUDA // unregister this buffer object with CUDA
@ -530,18 +528,18 @@ void FreeResource() {
printf("simpleCUDA2GL Exiting...\n"); printf("simpleCUDA2GL Exiting...\n");
} }
void Cleanup(int iExitCode) { void Cleanup(int iExitCode)
{
FreeResource(); FreeResource();
printf("PPM Images are %s\n", printf("PPM Images are %s\n", (iExitCode == EXIT_SUCCESS) ? "Matching" : "Not Matching");
(iExitCode == EXIT_SUCCESS) ? "Matching" : "Not Matching");
exit(iExitCode); exit(iExitCode);
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! //!
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
GLuint compileGLSLprogram(const char *vertex_shader_src, GLuint compileGLSLprogram(const char *vertex_shader_src, const char *fragment_shader_src)
const char *fragment_shader_src) { {
GLuint v, f, p = 0; GLuint v, f, p = 0;
p = glCreateProgram(); p = glCreateProgram();
@ -556,14 +554,15 @@ GLuint compileGLSLprogram(const char *vertex_shader_src,
glGetShaderiv(v, GL_COMPILE_STATUS, &compiled); glGetShaderiv(v, GL_COMPILE_STATUS, &compiled);
if (!compiled) { if (!compiled) {
//#ifdef NV_REPORT_COMPILE_ERRORS // #ifdef NV_REPORT_COMPILE_ERRORS
char temp[256] = ""; char temp[256] = "";
glGetShaderInfoLog(v, 256, NULL, temp); glGetShaderInfoLog(v, 256, NULL, temp);
printf("Vtx Compile failed:\n%s\n", temp); printf("Vtx Compile failed:\n%s\n", temp);
//#endif // #endif
glDeleteShader(v); glDeleteShader(v);
return 0; return 0;
} else { }
else {
glAttachShader(p, v); glAttachShader(p, v);
} }
} }
@ -578,14 +577,15 @@ GLuint compileGLSLprogram(const char *vertex_shader_src,
glGetShaderiv(f, GL_COMPILE_STATUS, &compiled); glGetShaderiv(f, GL_COMPILE_STATUS, &compiled);
if (!compiled) { if (!compiled) {
//#ifdef NV_REPORT_COMPILE_ERRORS // #ifdef NV_REPORT_COMPILE_ERRORS
char temp[256] = ""; char temp[256] = "";
glGetShaderInfoLog(f, 256, NULL, temp); glGetShaderInfoLog(f, 256, NULL, temp);
printf("frag Compile failed:\n%s\n", temp); printf("frag Compile failed:\n%s\n", temp);
//#endif // #endif
glDeleteShader(f); glDeleteShader(f);
return 0; return 0;
} else { }
else {
glAttachShader(p, f); glAttachShader(p, f);
} }
} }
@ -611,7 +611,8 @@ GLuint compileGLSLprogram(const char *vertex_shader_src,
//! Allocate the "render target" of CUDA //! Allocate the "render target" of CUDA
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
#ifndef USE_TEXSUBIMAGE2D #ifndef USE_TEXSUBIMAGE2D
void initCUDABuffers() { void initCUDABuffers()
{
// set up vertex data parameter // set up vertex data parameter
num_texels = image_width * image_height; num_texels = image_width * image_height;
num_values = num_texels * 4; num_values = num_texels * 4;
@ -625,7 +626,8 @@ void initCUDABuffers() {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! //!
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void initGLBuffers() { void initGLBuffers()
{
// create pbo // create pbo
#ifdef USE_TEXSUBIMAGE2D #ifdef USE_TEXSUBIMAGE2D
createPBO(&pbo_dest, &cuda_pbo_dest_resource); createPBO(&pbo_dest, &cuda_pbo_dest_resource);
@ -636,8 +638,7 @@ void initGLBuffers() {
shDraw = compileGLSLprogram(NULL, glsl_draw_fragshader_src); shDraw = compileGLSLprogram(NULL, glsl_draw_fragshader_src);
#ifndef USE_TEXSUBIMAGE2D #ifndef USE_TEXSUBIMAGE2D
shDrawTex = compileGLSLprogram(glsl_drawtex_vertshader_src, shDrawTex = compileGLSLprogram(glsl_drawtex_vertshader_src, glsl_drawtex_fragshader_src);
glsl_drawtex_fragshader_src);
#endif #endif
SDK_CHECK_ERROR_GL(); SDK_CHECK_ERROR_GL();
} }
@ -645,7 +646,8 @@ void initGLBuffers() {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Run standard demo loop with or without GL verification //! Run standard demo loop with or without GL verification
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void runStdProgram(int argc, char **argv) { void runStdProgram(int argc, char **argv)
{
// First initialize OpenGL context, so we can properly set the GL for CUDA. // First initialize OpenGL context, so we can properly set the GL for CUDA.
// This is necessary in order to achieve optimal performance with OpenGL/CUDA // This is necessary in order to achieve optimal performance with OpenGL/CUDA
// interop. // interop.
@ -683,8 +685,7 @@ void runStdProgram(int argc, char **argv) {
g_CheckRender->EnableQAReadback(true); g_CheckRender->EnableQAReadback(true);
} }
printf( printf("\n"
"\n"
"\tControls\n" "\tControls\n"
"\t(right click mouse button for Menu)\n" "\t(right click mouse button for Menu)\n"
"\t[esc] - Quit\n\n"); "\t[esc] - Quit\n\n");
@ -699,7 +700,8 @@ void runStdProgram(int argc, char **argv) {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Initialize GL //! Initialize GL
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
bool initGL(int *argc, char **argv) { bool initGL(int *argc, char **argv)
{
// Create GL context // Create GL context
glutInit(argc, argv); glutInit(argc, argv);
glutInitDisplayMode(GLUT_RGBA | GLUT_ALPHA | GLUT_DOUBLE | GLUT_DEPTH); glutInitDisplayMode(GLUT_RGBA | GLUT_ALPHA | GLUT_DOUBLE | GLUT_DEPTH);
@ -707,8 +709,8 @@ bool initGL(int *argc, char **argv) {
iGLUTWindowHandle = glutCreateWindow("CUDA OpenGL post-processing"); iGLUTWindowHandle = glutCreateWindow("CUDA OpenGL post-processing");
// initialize necessary OpenGL extensions // initialize necessary OpenGL extensions
if (!isGLVersionSupported(2, 0) || if (!isGLVersionSupported(2, 0)
!areGLExtensionsSupported("GL_ARB_pixel_buffer_object " || !areGLExtensionsSupported("GL_ARB_pixel_buffer_object "
"GL_EXT_framebuffer_object")) { "GL_EXT_framebuffer_object")) {
printf("ERROR: Support for necessary OpenGL extensions missing."); printf("ERROR: Support for necessary OpenGL extensions missing.");
fflush(stderr); fflush(stderr);
@ -729,8 +731,7 @@ bool initGL(int *argc, char **argv) {
// projection // projection
glMatrixMode(GL_PROJECTION); glMatrixMode(GL_PROJECTION);
glLoadIdentity(); glLoadIdentity();
gluPerspective(60.0, (GLfloat)window_width / (GLfloat)window_height, 0.1f, gluPerspective(60.0, (GLfloat)window_width / (GLfloat)window_height, 0.1f, 10.0f);
10.0f);
glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); glPolygonMode(GL_FRONT_AND_BACK, GL_FILL);

View File

@ -35,14 +35,16 @@ __device__ float clamp(float x, float a, float b) { return max(a, min(b, x)); }
__device__ int clamp(int x, int a, int b) { return max(a, min(b, x)); } __device__ int clamp(int x, int a, int b) { return max(a, min(b, x)); }
// convert floating point rgb color to 8-bit integer // convert floating point rgb color to 8-bit integer
__device__ int rgbToInt(float r, float g, float b) { __device__ int rgbToInt(float r, float g, float b)
{
r = clamp(r, 0.0f, 255.0f); r = clamp(r, 0.0f, 255.0f);
g = clamp(g, 0.0f, 255.0f); g = clamp(g, 0.0f, 255.0f);
b = clamp(b, 0.0f, 255.0f); b = clamp(b, 0.0f, 255.0f);
return (int(b) << 16) | (int(g) << 8) | int(r); return (int(b) << 16) | (int(g) << 8) | int(r);
} }
__global__ void cudaProcess(unsigned int *g_odata, int imgw) { __global__ void cudaProcess(unsigned int *g_odata, int imgw)
{
extern __shared__ uchar4 sdata[]; extern __shared__ uchar4 sdata[];
int tx = threadIdx.x; int tx = threadIdx.x;
@ -56,7 +58,7 @@ __global__ void cudaProcess(unsigned int *g_odata, int imgw) {
g_odata[y * imgw + x] = rgbToInt(c4.z, c4.y, c4.x); g_odata[y * imgw + x] = rgbToInt(c4.z, c4.y, c4.x);
} }
extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes, extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes, unsigned int *g_odata, int imgw)
unsigned int *g_odata, int imgw) { {
cudaProcess<<<grid, block, sbytes>>>(g_odata, imgw); cudaProcess<<<grid, block, sbytes>>>(g_odata, imgw);
} }

View File

@ -29,18 +29,21 @@
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
// Create thread // Create thread
CUTThread cutStartThread(CUT_THREADROUTINE func, void *data) { CUTThread cutStartThread(CUT_THREADROUTINE func, void *data)
{
return CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, data, 0, NULL); return CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, data, 0, NULL);
} }
// Wait for thread to finish // Wait for thread to finish
void cutEndThread(CUTThread thread) { void cutEndThread(CUTThread thread)
{
WaitForSingleObject(thread, INFINITE); WaitForSingleObject(thread, INFINITE);
CloseHandle(thread); CloseHandle(thread);
} }
// Wait for multiple threads // Wait for multiple threads
void cutWaitForThreads(const CUTThread *threads, int num) { void cutWaitForThreads(const CUTThread *threads, int num)
{
WaitForMultipleObjects(num, threads, true, INFINITE); WaitForMultipleObjects(num, threads, true, INFINITE);
for (int i = 0; i < num; i++) { for (int i = 0; i < num; i++) {
@ -49,7 +52,8 @@ void cutWaitForThreads(const CUTThread *threads, int num) {
} }
// Create barrier. // Create barrier.
CUTBarrier cutCreateBarrier(int releaseCount) { CUTBarrier cutCreateBarrier(int releaseCount)
{
CUTBarrier barrier; CUTBarrier barrier;
InitializeCriticalSection(&barrier.criticalSection); InitializeCriticalSection(&barrier.criticalSection);
@ -61,7 +65,8 @@ CUTBarrier cutCreateBarrier(int releaseCount) {
} }
// Increment barrier. (execution continues) // Increment barrier. (execution continues)
void cutIncrementBarrier(CUTBarrier *barrier) { void cutIncrementBarrier(CUTBarrier *barrier)
{
int myBarrierCount; int myBarrierCount;
EnterCriticalSection(&barrier->criticalSection); EnterCriticalSection(&barrier->criticalSection);
myBarrierCount = ++barrier->count; myBarrierCount = ++barrier->count;
@ -73,16 +78,15 @@ void cutIncrementBarrier(CUTBarrier *barrier) {
} }
// Wait for barrier release. // Wait for barrier release.
void cutWaitForBarrier(CUTBarrier *barrier) { void cutWaitForBarrier(CUTBarrier *barrier) { WaitForSingleObject(barrier->barrierEvent, INFINITE); }
WaitForSingleObject(barrier->barrierEvent, INFINITE);
}
// Destroy barrier // Destroy barrier
void cutDestroyBarrier(CUTBarrier *barrier) {} void cutDestroyBarrier(CUTBarrier *barrier) {}
#else #else
// Create thread // Create thread
CUTThread cutStartThread(CUT_THREADROUTINE func, void *data) { CUTThread cutStartThread(CUT_THREADROUTINE func, void *data)
{
pthread_t thread; pthread_t thread;
pthread_create(&thread, NULL, func, data); pthread_create(&thread, NULL, func, data);
return thread; return thread;
@ -92,14 +96,16 @@ CUTThread cutStartThread(CUT_THREADROUTINE func, void *data) {
void cutEndThread(CUTThread thread) { pthread_join(thread, NULL); } void cutEndThread(CUTThread thread) { pthread_join(thread, NULL); }
// Wait for multiple threads // Wait for multiple threads
void cutWaitForThreads(const CUTThread *threads, int num) { void cutWaitForThreads(const CUTThread *threads, int num)
{
for (int i = 0; i < num; i++) { for (int i = 0; i < num; i++) {
cutEndThread(threads[i]); cutEndThread(threads[i]);
} }
} }
// Create barrier. // Create barrier.
CUTBarrier cutCreateBarrier(int releaseCount) { CUTBarrier cutCreateBarrier(int releaseCount)
{
CUTBarrier barrier; CUTBarrier barrier;
barrier.count = 0; barrier.count = 0;
@ -112,7 +118,8 @@ CUTBarrier cutCreateBarrier(int releaseCount) {
} }
// Increment barrier. (execution continues) // Increment barrier. (execution continues)
void cutIncrementBarrier(CUTBarrier *barrier) { void cutIncrementBarrier(CUTBarrier *barrier)
{
int myBarrierCount; int myBarrierCount;
pthread_mutex_lock(&barrier->mutex); pthread_mutex_lock(&barrier->mutex);
myBarrierCount = ++barrier->count; myBarrierCount = ++barrier->count;
@ -124,7 +131,8 @@ void cutIncrementBarrier(CUTBarrier *barrier) {
} }
// Wait for barrier release. // Wait for barrier release.
void cutWaitForBarrier(CUTBarrier *barrier) { void cutWaitForBarrier(CUTBarrier *barrier)
{
pthread_mutex_lock(&barrier->mutex); pthread_mutex_lock(&barrier->mutex);
while (barrier->count < barrier->releaseCount) { while (barrier->count < barrier->releaseCount) {
@ -135,7 +143,8 @@ void cutWaitForBarrier(CUTBarrier *barrier) {
} }
// Destroy barrier // Destroy barrier
void cutDestroyBarrier(CUTBarrier *barrier) { void cutDestroyBarrier(CUTBarrier *barrier)
{
pthread_mutex_destroy(&barrier->mutex); pthread_mutex_destroy(&barrier->mutex);
pthread_cond_destroy(&barrier->conditionVariable); pthread_cond_destroy(&barrier->conditionVariable);
} }

View File

@ -37,7 +37,8 @@
typedef HANDLE CUTThread; typedef HANDLE CUTThread;
typedef unsigned(WINAPI *CUT_THREADROUTINE)(void *); typedef unsigned(WINAPI *CUT_THREADROUTINE)(void *);
struct CUTBarrier { struct CUTBarrier
{
CRITICAL_SECTION criticalSection; CRITICAL_SECTION criticalSection;
HANDLE barrierEvent; HANDLE barrierEvent;
int releaseCount; int releaseCount;
@ -57,7 +58,8 @@ typedef void *(*CUT_THREADROUTINE)(void *);
#define CUT_THREADPROC void * #define CUT_THREADPROC void *
#define CUT_THREADEND return 0 #define CUT_THREADEND return 0
struct CUTBarrier { struct CUTBarrier
{
pthread_mutex_t mutex; pthread_mutex_t mutex;
pthread_cond_t conditionVariable; pthread_cond_t conditionVariable;
int releaseCount; int releaseCount;
@ -67,29 +69,30 @@ struct CUTBarrier {
#endif #endif
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C"
{
#endif #endif
// Create thread. // Create thread.
CUTThread cutStartThread(CUT_THREADROUTINE, void *data); CUTThread cutStartThread(CUT_THREADROUTINE, void *data);
// Wait for thread to finish. // Wait for thread to finish.
void cutEndThread(CUTThread thread); void cutEndThread(CUTThread thread);
// Wait for multiple threads. // Wait for multiple threads.
void cutWaitForThreads(const CUTThread *threads, int num); void cutWaitForThreads(const CUTThread *threads, int num);
// Create barrier. // Create barrier.
CUTBarrier cutCreateBarrier(int releaseCount); CUTBarrier cutCreateBarrier(int releaseCount);
// Increment barrier. (execution continues) // Increment barrier. (execution continues)
void cutIncrementBarrier(CUTBarrier *barrier); void cutIncrementBarrier(CUTBarrier *barrier);
// Wait for barrier release. // Wait for barrier release.
void cutWaitForBarrier(CUTBarrier *barrier); void cutWaitForBarrier(CUTBarrier *barrier);
// Destroy barrier // Destroy barrier
void cutDestroyBarrier(CUTBarrier *barrier); void cutDestroyBarrier(CUTBarrier *barrier);
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"

View File

@ -43,8 +43,8 @@
#include <stdio.h> #include <stdio.h>
// helper functions and utilities to work with CUDA // helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h> #include <helper_cuda.h>
#include <helper_functions.h>
#include "multithreading.h" #include "multithreading.h"
@ -53,10 +53,10 @@ const int N_elements_per_workload = 100000;
CUTBarrier thread_barrier; CUTBarrier thread_barrier;
void CUDART_CB myStreamCallback(cudaStream_t event, cudaError_t status, void CUDART_CB myStreamCallback(cudaStream_t event, cudaError_t status, void *data);
void *data);
struct heterogeneous_workload { struct heterogeneous_workload
{
int id; int id;
int cudaDeviceID; int cudaDeviceID;
@ -67,13 +67,16 @@ struct heterogeneous_workload {
bool success; bool success;
}; };
__global__ void incKernel(int *data, int N) { __global__ void incKernel(int *data, int N)
{
int i = blockIdx.x * blockDim.x + threadIdx.x; int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N) data[i]++; if (i < N)
data[i]++;
} }
CUT_THREADPROC launch(void *void_arg) { CUT_THREADPROC launch(void *void_arg)
{
heterogeneous_workload *workload = (heterogeneous_workload *)void_arg; heterogeneous_workload *workload = (heterogeneous_workload *)void_arg;
// Select GPU for this CPU thread // Select GPU for this CPU thread
@ -81,11 +84,8 @@ CUT_THREADPROC launch(void *void_arg) {
// Allocate Resources // Allocate Resources
checkCudaErrors(cudaStreamCreate(&workload->stream)); checkCudaErrors(cudaStreamCreate(&workload->stream));
checkCudaErrors( checkCudaErrors(cudaMalloc(&workload->d_data, N_elements_per_workload * sizeof(int)));
cudaMalloc(&workload->d_data, N_elements_per_workload * sizeof(int))); checkCudaErrors(cudaHostAlloc(&workload->h_data, N_elements_per_workload * sizeof(int), cudaHostAllocPortable));
checkCudaErrors(cudaHostAlloc(&workload->h_data,
N_elements_per_workload * sizeof(int),
cudaHostAllocPortable));
// CPU thread generates data // CPU thread generates data
for (int i = 0; i < N_elements_per_workload; ++i) { for (int i = 0; i < N_elements_per_workload; ++i) {
@ -97,25 +97,28 @@ CUT_THREADPROC launch(void *void_arg) {
dim3 block(512); dim3 block(512);
dim3 grid((N_elements_per_workload + block.x - 1) / block.x); dim3 grid((N_elements_per_workload + block.x - 1) / block.x);
checkCudaErrors(cudaMemcpyAsync(workload->d_data, workload->h_data, checkCudaErrors(cudaMemcpyAsync(workload->d_data,
workload->h_data,
N_elements_per_workload * sizeof(int), N_elements_per_workload * sizeof(int),
cudaMemcpyHostToDevice, workload->stream)); cudaMemcpyHostToDevice,
incKernel<<<grid, block, 0, workload->stream>>>(workload->d_data, workload->stream));
N_elements_per_workload); incKernel<<<grid, block, 0, workload->stream>>>(workload->d_data, N_elements_per_workload);
checkCudaErrors(cudaMemcpyAsync(workload->h_data, workload->d_data, checkCudaErrors(cudaMemcpyAsync(workload->h_data,
workload->d_data,
N_elements_per_workload * sizeof(int), N_elements_per_workload * sizeof(int),
cudaMemcpyDeviceToHost, workload->stream)); cudaMemcpyDeviceToHost,
workload->stream));
// New in CUDA 5.0: Add a CPU callback which is called once all currently // New in CUDA 5.0: Add a CPU callback which is called once all currently
// pending operations in the CUDA stream have finished // pending operations in the CUDA stream have finished
checkCudaErrors( checkCudaErrors(cudaStreamAddCallback(workload->stream, myStreamCallback, workload, 0));
cudaStreamAddCallback(workload->stream, myStreamCallback, workload, 0));
CUT_THREADEND; CUT_THREADEND;
// CPU thread end of life, GPU continues to process data... // CPU thread end of life, GPU continues to process data...
} }
CUT_THREADPROC postprocess(void *void_arg) { CUT_THREADPROC postprocess(void *void_arg)
{
heterogeneous_workload *workload = (heterogeneous_workload *)void_arg; heterogeneous_workload *workload = (heterogeneous_workload *)void_arg;
// ... GPU is done with processing, continue on new CPU thread... // ... GPU is done with processing, continue on new CPU thread...
@ -140,8 +143,8 @@ CUT_THREADPROC postprocess(void *void_arg) {
CUT_THREADEND; CUT_THREADEND;
} }
void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status, void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status, void *data)
void *data) { {
// Check status of GPU after stream operations are done // Check status of GPU after stream operations are done
checkCudaErrors(status); checkCudaErrors(status);
@ -149,7 +152,8 @@ void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status,
cutStartThread(postprocess, data); cutStartThread(postprocess, data);
} }
int main(int argc, char **argv) { int main(int argc, char **argv)
{
int N_gpus, max_gpus = 0; int N_gpus, max_gpus = 0;
int gpuInfo[32]; // assume a maximum of 32 GPUs in a system configuration int gpuInfo[32]; // assume a maximum of 32 GPUs in a system configuration
@ -168,10 +172,8 @@ int main(int argc, char **argv) {
cudaSetDevice(devid); cudaSetDevice(devid);
cudaGetDeviceProperties(&deviceProp, devid); cudaGetDeviceProperties(&deviceProp, devid);
SMversion = deviceProp.major << 4 + deviceProp.minor; SMversion = deviceProp.major << 4 + deviceProp.minor;
printf("GPU[%d] %s supports SM %d.%d", devid, deviceProp.name, printf("GPU[%d] %s supports SM %d.%d", devid, deviceProp.name, deviceProp.major, deviceProp.minor);
deviceProp.major, deviceProp.minor); printf(", %s GPU Callback Functions\n", (SMversion >= 0x11) ? "capable" : "NOT capable");
printf(", %s GPU Callback Functions\n",
(SMversion >= 0x11) ? "capable" : "NOT capable");
if (SMversion >= 0x11) { if (SMversion >= 0x11) {
gpuInfo[max_gpus++] = devid; gpuInfo[max_gpus++] = devid;
@ -181,8 +183,7 @@ int main(int argc, char **argv) {
printf("%d GPUs available to run Callback Functions\n", max_gpus); printf("%d GPUs available to run Callback Functions\n", max_gpus);
heterogeneous_workload *workloads; heterogeneous_workload *workloads;
workloads = (heterogeneous_workload *)malloc(N_workloads * workloads = (heterogeneous_workload *)malloc(N_workloads * sizeof(heterogeneous_workload));
sizeof(heterogeneous_workload));
; ;
thread_barrier = cutCreateBarrier(N_workloads); thread_barrier = cutCreateBarrier(N_workloads);

View File

@ -38,8 +38,8 @@
* *
*/ */
#include <stdio.h>
#include <cooperative_groups.h> #include <cooperative_groups.h>
#include <stdio.h>
using namespace cooperative_groups; using namespace cooperative_groups;
@ -49,7 +49,8 @@ using namespace cooperative_groups;
* calculates the sum of val across the group g. The workspace array, x, * calculates the sum of val across the group g. The workspace array, x,
* must be large enough to contain g.size() integers. * must be large enough to contain g.size() integers.
*/ */
__device__ int sumReduction(thread_group g, int *x, int val) { __device__ int sumReduction(thread_group g, int *x, int val)
{
// rank of this thread in the group // rank of this thread in the group
int lane = g.thread_rank(); int lane = g.thread_rank();
@ -85,7 +86,8 @@ __device__ int sumReduction(thread_group g, int *x, int val) {
* *
* Creates cooperative groups and performs reductions * Creates cooperative groups and performs reductions
*/ */
__global__ void cgkernel() { __global__ void cgkernel()
{
// threadBlockGroup includes all threads in the block // threadBlockGroup includes all threads in the block
thread_block threadBlockGroup = this_thread_block(); thread_block threadBlockGroup = this_thread_block();
int threadBlockGroupSize = threadBlockGroup.size(); int threadBlockGroupSize = threadBlockGroup.size();
@ -107,24 +109,22 @@ __global__ void cgkernel() {
// master thread in group prints out result // master thread in group prints out result
if (threadBlockGroup.thread_rank() == 0) { if (threadBlockGroup.thread_rank() == 0) {
printf( printf(" Sum of all ranks 0..%d in threadBlockGroup is %d (expected %d)\n\n",
" Sum of all ranks 0..%d in threadBlockGroup is %d (expected %d)\n\n", (int)threadBlockGroup.size() - 1,
(int)threadBlockGroup.size() - 1, output, expectedOutput); output,
expectedOutput);
printf(" Now creating %d groups, each of size 16 threads:\n\n", printf(" Now creating %d groups, each of size 16 threads:\n\n", (int)threadBlockGroup.size() / 16);
(int)threadBlockGroup.size() / 16);
} }
threadBlockGroup.sync(); threadBlockGroup.sync();
// each tiledPartition16 group includes 16 threads // each tiledPartition16 group includes 16 threads
thread_block_tile<16> tiledPartition16 = thread_block_tile<16> tiledPartition16 = tiled_partition<16>(threadBlockGroup);
tiled_partition<16>(threadBlockGroup);
// This offset allows each group to have its own unique area in the workspace // This offset allows each group to have its own unique area in the workspace
// array // array
int workspaceOffset = int workspaceOffset = threadBlockGroup.thread_rank() - tiledPartition16.thread_rank();
threadBlockGroup.thread_rank() - tiledPartition16.thread_rank();
// input to reduction, for each thread, is its rank in the group // input to reduction, for each thread, is its rank in the group
input = tiledPartition16.thread_rank(); input = tiledPartition16.thread_rank();
@ -138,10 +138,10 @@ __global__ void cgkernel() {
// each master thread prints out result // each master thread prints out result
if (tiledPartition16.thread_rank() == 0) if (tiledPartition16.thread_rank() == 0)
printf( printf(" Sum of all ranks 0..15 in this tiledPartition16 group is %d "
" Sum of all ranks 0..15 in this tiledPartition16 group is %d "
"(expected %d)\n", "(expected %d)\n",
output, expectedOutput); output,
expectedOutput);
return; return;
} }
@ -149,7 +149,8 @@ __global__ void cgkernel() {
/** /**
* Host main routine * Host main routine
*/ */
int main() { int main()
{
// Error code to check return values for CUDA calls // Error code to check return values for CUDA calls
cudaError_t err; cudaError_t err;
@ -166,8 +167,7 @@ int main() {
err = cudaDeviceSynchronize(); err = cudaDeviceSynchronize();
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, "Failed to launch kernel (error code %s)!\n", fprintf(stderr, "Failed to launch kernel (error code %s)!\n", cudaGetErrorString(err));
cudaGetErrorString(err));
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }

View File

@ -26,27 +26,27 @@
*/ */
/* /*
* This sample demonstrates how to use texture fetches from layered 2D textures * This sample demonstrates how to use texture fetches from layered 2D textures
* in CUDA C * in CUDA C
* *
* This sample first generates a 3D input data array for the layered texture * This sample first generates a 3D input data array for the layered texture
* and the expected output. Then it starts CUDA C kernels, one for each layer, * and the expected output. Then it starts CUDA C kernels, one for each layer,
* which fetch their layer's texture data (using normalized texture coordinates) * which fetch their layer's texture data (using normalized texture coordinates)
* transform it to the expected output, and write it to a 3D output data array. * transform it to the expected output, and write it to a 3D output data array.
*/ */
// includes, system // includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h> #include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// includes CUDA // includes CUDA
#include <cuda_runtime.h> #include <cuda_runtime.h>
// helper functions and utilities to work with CUDA // helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h> #include <helper_cuda.h>
#include <helper_functions.h>
static const char *sSDKname = "simpleCubemapTexture"; static const char *sSDKname = "simpleCubemapTexture";
@ -56,8 +56,8 @@ static const char *sSDKname = "simpleCubemapTexture";
//! Transform a cubemap face of a linear buffer using cubemap texture lookups //! Transform a cubemap face of a linear buffer using cubemap texture lookups
//! @param g_odata output data in global memory //! @param g_odata output data in global memory
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
__global__ void transformKernel(float *g_odata, int width, __global__ void transformKernel(float *g_odata, int width, cudaTextureObject_t tex)
cudaTextureObject_t tex) { {
// calculate this thread's data point // calculate this thread's data point
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -110,15 +110,15 @@ __global__ void transformKernel(float *g_odata, int width,
} }
// read from texture, do expected transformation and write to global memory // read from texture, do expected transformation and write to global memory
g_odata[face * width * width + y * width + x] = g_odata[face * width * width + y * width + x] = -texCubemap<float>(tex, cx, cy, cz);
-texCubemap<float>(tex, cx, cy, cz);
} }
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
{
// use command-line specified CUDA device, otherwise use device with highest // use command-line specified CUDA device, otherwise use device with highest
// Gflops/s // Gflops/s
int devID = findCudaDevice(argc, (const char **)argv); int devID = findCudaDevice(argc, (const char **)argv);
@ -129,13 +129,11 @@ int main(int argc, char **argv) {
cudaDeviceProp deviceProps; cudaDeviceProp deviceProps;
checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount);
deviceProps.multiProcessorCount);
printf("SM %d.%d\n", deviceProps.major, deviceProps.minor); printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);
if (deviceProps.major < 2) { if (deviceProps.major < 2) {
printf( printf("%s requires SM 2.0 or higher for support of Texture Arrays. Test "
"%s requires SM 2.0 or higher for support of Texture Arrays. Test "
"will exit... \n", "will exit... \n",
sSDKname); sSDKname);
@ -157,8 +155,7 @@ int main(int argc, char **argv) {
for (unsigned int layer = 0; layer < num_layers; layer++) { for (unsigned int layer = 0; layer < num_layers; layer++) {
for (int i = 0; i < (int)(cubemap_size); i++) { for (int i = 0; i < (int)(cubemap_size); i++) {
h_data_ref[layer * cubemap_size + i] = h_data_ref[layer * cubemap_size + i] = -h_data[layer * cubemap_size + i] + layer;
-h_data[layer * cubemap_size + i] + layer;
} }
} }
@ -167,19 +164,16 @@ int main(int argc, char **argv) {
checkCudaErrors(cudaMalloc((void **)&d_data, size)); checkCudaErrors(cudaMalloc((void **)&d_data, size));
// allocate array and copy image data // allocate array and copy image data
cudaChannelFormatDesc channelDesc = cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray *cu_3darray; cudaArray *cu_3darray;
// checkCudaErrors(cudaMalloc3DArray( &cu_3darray, &channelDesc, // checkCudaErrors(cudaMalloc3DArray( &cu_3darray, &channelDesc,
// make_cudaExtent(width, height, num_layers), cudaArrayLayered )); // make_cudaExtent(width, height, num_layers), cudaArrayLayered ));
checkCudaErrors(cudaMalloc3DArray(&cu_3darray, &channelDesc, checkCudaErrors(
make_cudaExtent(width, width, num_faces), cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, width, num_faces), cudaArrayCubemap));
cudaArrayCubemap));
cudaMemcpy3DParms myparms = {0}; cudaMemcpy3DParms myparms = {0};
myparms.srcPos = make_cudaPos(0, 0, 0); myparms.srcPos = make_cudaPos(0, 0, 0);
myparms.dstPos = make_cudaPos(0, 0, 0); myparms.dstPos = make_cudaPos(0, 0, 0);
myparms.srcPtr = myparms.srcPtr = make_cudaPitchedPtr(h_data, width * sizeof(float), width, width);
make_cudaPitchedPtr(h_data, width * sizeof(float), width, width);
myparms.dstArray = cu_3darray; myparms.dstArray = cu_3darray;
myparms.extent = make_cudaExtent(width, width, num_faces); myparms.extent = make_cudaExtent(width, width, num_faces);
myparms.kind = cudaMemcpyHostToDevice; myparms.kind = cudaMemcpyHostToDevice;
@ -207,10 +201,12 @@ int main(int argc, char **argv) {
dim3 dimBlock(8, 8, 1); dim3 dimBlock(8, 8, 1);
dim3 dimGrid(width / dimBlock.x, width / dimBlock.y, 1); dim3 dimGrid(width / dimBlock.x, width / dimBlock.y, 1);
printf( printf("Covering Cubemap data array of %d~3 x %d: Grid size is %d x %d, each "
"Covering Cubemap data array of %d~3 x %d: Grid size is %d x %d, each "
"block has 8 x 8 threads\n", "block has 8 x 8 threads\n",
width, num_layers, dimGrid.x, dimGrid.y); width,
num_layers,
dimGrid.x,
dimGrid.y);
transformKernel<<<dimGrid, dimBlock>>>(d_data, width, transformKernel<<<dimGrid, dimBlock>>>(d_data, width,
tex); // warmup (for better timing) tex); // warmup (for better timing)
@ -233,8 +229,7 @@ int main(int argc, char **argv) {
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&timer); sdkStopTimer(&timer);
printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer)); printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
printf("%.2f Mtexlookups/sec\n", printf("%.2f Mtexlookups/sec\n", (cubemap_size / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
(cubemap_size / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
sdkDeleteTimer(&timer); sdkDeleteTimer(&timer);
// allocate mem for the result on host side // allocate mem for the result on host side
@ -245,14 +240,13 @@ int main(int argc, char **argv) {
// write regression file if necessary // write regression file if necessary
if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
// write file for regression test // write file for regression test
sdkWriteFile<float>("./data/regression.dat", h_odata, width * width, 0.0f, sdkWriteFile<float>("./data/regression.dat", h_odata, width * width, 0.0f, false);
false); }
} else { else {
printf("Comparing kernel output to expected data\n"); printf("Comparing kernel output to expected data\n");
#define MIN_EPSILON_ERROR 5e-3f #define MIN_EPSILON_ERROR 5e-3f
bResult = bResult = compareData(h_odata, h_data_ref, cubemap_size, MIN_EPSILON_ERROR, 0.0f);
compareData(h_odata, h_data_ref, cubemap_size, MIN_EPSILON_ERROR, 0.0f);
} }
// cleanup memory // cleanup memory

View File

@ -33,12 +33,12 @@
*/ */
// Includes // Includes
#include <cstring>
#include <cuda.h> #include <cuda.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <iostream>
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#include <cstring>
#include <iostream>
// includes, project // includes, project
#include <helper_cuda.h> #include <helper_cuda.h>
@ -66,11 +66,10 @@ int CleanupNoFailure(CUcontext &cuContext);
void RandomInit(float *, int); void RandomInit(float *, int);
bool findModulePath(const char *, string &, char **, ostringstream &); bool findModulePath(const char *, string &, char **, ostringstream &);
static void check(CUresult result, char const *const func, static void check(CUresult result, char const *const func, const char *const file, int const line)
const char *const file, int const line) { {
if (result) { if (result) {
fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line, fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line, static_cast<unsigned int>(result), func);
static_cast<unsigned int>(result), func);
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
} }
@ -78,7 +77,8 @@ static void check(CUresult result, char const *const func,
#define checkCudaDrvErrors(val) check((val), #val, __FILE__, __LINE__) #define checkCudaDrvErrors(val) check((val), #val, __FILE__, __LINE__)
// Host code // Host code
int main(int argc, char **argv) { int main(int argc, char **argv)
{
printf("simpleDrvRuntime..\n"); printf("simpleDrvRuntime..\n");
int N = 50000, devID = 0; int N = 50000, devID = 0;
size_t size = N * sizeof(float); size_t size = N * sizeof(float);
@ -100,7 +100,8 @@ int main(int argc, char **argv) {
if (!findModulePath(FATBIN_FILE, module_path, argv, fatbin)) { if (!findModulePath(FATBIN_FILE, module_path, argv, fatbin)) {
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} else { }
else {
printf("> initCUDA loading module: <%s>\n", module_path.c_str()); printf("> initCUDA loading module: <%s>\n", module_path.c_str());
} }
@ -113,8 +114,7 @@ int main(int argc, char **argv) {
checkCudaDrvErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str())); checkCudaDrvErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));
// Get function handle from module // Get function handle from module
checkCudaDrvErrors( checkCudaDrvErrors(cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));
cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));
// Allocate input vectors h_A and h_B in host memory // Allocate input vectors h_A and h_B in host memory
checkCudaErrors(cudaMallocHost(&h_A, size)); checkCudaErrors(cudaMallocHost(&h_A, size));
@ -133,10 +133,8 @@ int main(int argc, char **argv) {
cudaStream_t stream; cudaStream_t stream;
checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
// Copy vectors from host memory to device memory // Copy vectors from host memory to device memory
checkCudaErrors( checkCudaErrors(cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, stream));
cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice, stream));
checkCudaErrors(
cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice, stream));
int threadsPerBlock = 256; int threadsPerBlock = 256;
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
@ -144,14 +142,12 @@ int main(int argc, char **argv) {
void *args[] = {&d_A, &d_B, &d_C, &N}; void *args[] = {&d_A, &d_B, &d_C, &N};
// Launch the CUDA kernel // Launch the CUDA kernel
checkCudaDrvErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, checkCudaDrvErrors(
threadsPerBlock, 1, 1, 0, stream, args, cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, stream, args, NULL));
NULL));
// Copy result from device memory to host memory // Copy result from device memory to host memory
// h_C contains the result in host memory // h_C contains the result in host memory
checkCudaErrors( checkCudaErrors(cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, stream));
cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, stream));
checkCudaErrors(cudaStreamSynchronize(stream)); checkCudaErrors(cudaStreamSynchronize(stream));
// Verify result // Verify result
int i; int i;
@ -171,7 +167,8 @@ int main(int argc, char **argv) {
exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE); exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE);
} }
int CleanupNoFailure(CUcontext &cuContext) { int CleanupNoFailure(CUcontext &cuContext)
{
// Free device memory // Free device memory
checkCudaErrors(cudaFree(d_A)); checkCudaErrors(cudaFree(d_A));
checkCudaErrors(cudaFree(d_B)); checkCudaErrors(cudaFree(d_B));
@ -195,19 +192,21 @@ int CleanupNoFailure(CUcontext &cuContext) {
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }
// Allocates an array with random float entries. // Allocates an array with random float entries.
void RandomInit(float *data, int n) { void RandomInit(float *data, int n)
{
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
data[i] = rand() / (float)RAND_MAX; data[i] = rand() / (float)RAND_MAX;
} }
} }
bool inline findModulePath(const char *module_file, string &module_path, bool inline findModulePath(const char *module_file, string &module_path, char **argv, ostringstream &ostrm)
char **argv, ostringstream &ostrm) { {
char *actual_path = sdkFindFilePath(module_file, argv[0]); char *actual_path = sdkFindFilePath(module_file, argv[0]);
if (actual_path) { if (actual_path) {
module_path = actual_path; module_path = actual_path;
} else { }
else {
printf("> findModulePath file not found: <%s> \n", module_file); printf("> findModulePath file not found: <%s> \n", module_file);
return false; return false;
} }
@ -215,7 +214,8 @@ bool inline findModulePath(const char *module_file, string &module_path,
if (module_path.empty()) { if (module_path.empty()) {
printf("> findModulePath could not find file: <%s> \n", module_file); printf("> findModulePath could not find file: <%s> \n", module_file);
return false; return false;
} else { }
else {
printf("> findModulePath found file at <%s>\n", module_path.c_str()); printf("> findModulePath found file at <%s>\n", module_path.c_str());
if (module_path.rfind("fatbin") != string::npos) { if (module_path.rfind("fatbin") != string::npos) {
ifstream fileIn(module_path.c_str(), ios::binary); ifstream fileIn(module_path.c_str(), ios::binary);

View File

@ -34,9 +34,10 @@
*/ */
// Device code // Device code
extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, float *C, int N)
float *C, int N) { {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N) C[i] = A[i] + B[i]; if (i < N)
C[i] = A[i] + B[i];
} }

View File

@ -44,7 +44,8 @@ const char *sSDKsample = "hyperQ";
// This subroutine does no real work but runs for at least the specified number // This subroutine does no real work but runs for at least the specified number
// of clock ticks. // of clock ticks.
__device__ void clock_block(clock_t *d_o, clock_t clock_count) { __device__ void clock_block(clock_t *d_o, clock_t clock_count)
{
unsigned int start_clock = (unsigned int)clock(); unsigned int start_clock = (unsigned int)clock();
clock_t clock_offset = 0; clock_t clock_offset = 0;
@ -71,15 +72,12 @@ __device__ void clock_block(clock_t *d_o, clock_t clock_count) {
// We create two identical kernels calling clock_block(), we create two so that // We create two identical kernels calling clock_block(), we create two so that
// we can identify dependencies in the profile timeline ("kernel_B" is always // we can identify dependencies in the profile timeline ("kernel_B" is always
// dependent on "kernel_A" in the same stream). // dependent on "kernel_A" in the same stream).
__global__ void kernel_A(clock_t *d_o, clock_t clock_count) { __global__ void kernel_A(clock_t *d_o, clock_t clock_count) { clock_block(d_o, clock_count); }
clock_block(d_o, clock_count); __global__ void kernel_B(clock_t *d_o, clock_t clock_count) { clock_block(d_o, clock_count); }
}
__global__ void kernel_B(clock_t *d_o, clock_t clock_count) {
clock_block(d_o, clock_count);
}
// Single-warp reduction kernel (note: for simplicity, this is not optimized) // Single-warp reduction kernel (note: for simplicity, this is not optimized)
__global__ void sum(clock_t *d_clocks, int N) { __global__ void sum(clock_t *d_clocks, int N)
{
// Handle to thread block group // Handle to thread block group
cg::thread_block cta = cg::this_thread_block(); cg::thread_block cta = cg::this_thread_block();
__shared__ clock_t s_clocks[32]; __shared__ clock_t s_clocks[32];
@ -106,7 +104,8 @@ __global__ void sum(clock_t *d_clocks, int N) {
} }
} }
int main(int argc, char **argv) { int main(int argc, char **argv)
{
int nstreams = 32; // One stream for each pair of kernels int nstreams = 32; // One stream for each pair of kernels
float kernel_time = 10; // Time each kernel should run in ms float kernel_time = 10; // Time each kernel should run in ms
float elapsed_time; float elapsed_time;
@ -131,18 +130,20 @@ int main(int argc, char **argv) {
// HyperQ is available in devices of Compute Capability 3.5 and higher // HyperQ is available in devices of Compute Capability 3.5 and higher
if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5)) { if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5)) {
if (deviceProp.concurrentKernels == 0) { if (deviceProp.concurrentKernels == 0) {
printf( printf("> GPU does not support concurrent kernel execution (SM 3.5 or "
"> GPU does not support concurrent kernel execution (SM 3.5 or "
"higher required)\n"); "higher required)\n");
printf(" CUDA kernel runs will be serialized\n"); printf(" CUDA kernel runs will be serialized\n");
} else { }
else {
printf("> GPU does not support HyperQ\n"); printf("> GPU does not support HyperQ\n");
printf(" CUDA kernel runs will have limited concurrency\n"); printf(" CUDA kernel runs will have limited concurrency\n");
} }
} }
printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n", printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount); deviceProp.major,
deviceProp.minor,
deviceProp.multiProcessorCount);
// Allocate host memory for the output (reduced to a single value) // Allocate host memory for the output (reduced to a single value)
clock_t *a = 0; clock_t *a = 0;
@ -153,8 +154,7 @@ int main(int argc, char **argv) {
checkCudaErrors(cudaMalloc((void **)&d_a, 2 * nstreams * sizeof(clock_t))); checkCudaErrors(cudaMalloc((void **)&d_a, 2 * nstreams * sizeof(clock_t)));
// Allocate and initialize an array of stream handles // Allocate and initialize an array of stream handles
cudaStream_t *streams = cudaStream_t *streams = (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));
(cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));
for (int i = 0; i < nstreams; i++) { for (int i = 0; i < nstreams; i++) {
checkCudaErrors(cudaStreamCreate(&(streams[i]))); checkCudaErrors(cudaStreamCreate(&(streams[i])));
@ -203,15 +203,15 @@ int main(int argc, char **argv) {
checkCudaErrors(cudaEventSynchronize(stop_event)); checkCudaErrors(cudaEventSynchronize(stop_event));
checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event)); checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
printf( printf("Expected time for serial execution of %d sets of kernels is between "
"Expected time for serial execution of %d sets of kernels is between "
"approx. %.3fs and %.3fs\n", "approx. %.3fs and %.3fs\n",
nstreams, (nstreams + 1) * kernel_time / 1000.0f, nstreams,
(nstreams + 1) * kernel_time / 1000.0f,
2 * nstreams * kernel_time / 1000.0f); 2 * nstreams * kernel_time / 1000.0f);
printf( printf("Expected time for fully concurrent execution of %d sets of kernels is "
"Expected time for fully concurrent execution of %d sets of kernels is "
"approx. %.3fs\n", "approx. %.3fs\n",
nstreams, 2 * kernel_time / 1000.0f); nstreams,
2 * kernel_time / 1000.0f);
printf("Measured time for sample = %.3fs\n", elapsed_time / 1000.0f); printf("Measured time for sample = %.3fs\n", elapsed_time / 1000.0f);
bool bTestResult = (a[0] >= total_clocks); bool bTestResult = (a[0] >= total_clocks);

View File

@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in [Dependencies]() section above are installed. Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## References (for more details) ## References (for more details)

View File

@ -32,6 +32,7 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <vector> #include <vector>
#include "helper_cuda.h" #include "helper_cuda.h"
#include "helper_multiprocess.h" #include "helper_multiprocess.h"
static const char shmName[] = "simpleIPCshm"; static const char shmName[] = "simpleIPCshm";
@ -49,7 +50,8 @@ static const char shmName[] = "simpleIPCshm";
#error Unsupported system #error Unsupported system
#endif #endif
typedef struct shmStruct_st { typedef struct shmStruct_st
{
size_t nprocesses; size_t nprocesses;
int barrier; int barrier;
int sense; int sense;
@ -58,15 +60,16 @@ typedef struct shmStruct_st {
cudaIpcEventHandle_t eventHandle[MAX_DEVICES]; cudaIpcEventHandle_t eventHandle[MAX_DEVICES];
} shmStruct; } shmStruct;
__global__ void simpleKernel(char *ptr, int sz, char val) { __global__ void simpleKernel(char *ptr, int sz, char val)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x; int idx = blockIdx.x * blockDim.x + threadIdx.x;
for (; idx < sz; idx += (gridDim.x * blockDim.x)) { for (; idx < sz; idx += (gridDim.x * blockDim.x)) {
ptr[idx] = val; ptr[idx] = val;
} }
} }
static void barrierWait(volatile int *barrier, volatile int *sense, static void barrierWait(volatile int *barrier, volatile int *sense, unsigned int n)
unsigned int n) { {
int count; int count;
// Check-in // Check-in
@ -84,7 +87,8 @@ static void barrierWait(volatile int *barrier, volatile int *sense,
; ;
} }
static void childProcess(int id) { static void childProcess(int id)
{
volatile shmStruct *shm = NULL; volatile shmStruct *shm = NULL;
cudaStream_t stream; cudaStream_t stream;
sharedMemoryInfo info; sharedMemoryInfo info;
@ -108,8 +112,7 @@ static void childProcess(int id) {
checkCudaErrors(cudaSetDevice(shm->devices[id])); checkCudaErrors(cudaSetDevice(shm->devices[id]));
checkCudaErrors(cudaGetDeviceProperties(&prop, shm->devices[id])); checkCudaErrors(cudaGetDeviceProperties(&prop, shm->devices[id]));
checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor( checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks, simpleKernel, threads, 0));
&blocks, simpleKernel, threads, 0));
blocks *= prop.multiProcessorCount; blocks *= prop.multiProcessorCount;
// Open and track all the allocations and events created in the master // Open and track all the allocations and events created in the master
@ -121,10 +124,8 @@ static void childProcess(int id) {
// Notice, we don't need to explicitly enable peer access for // Notice, we don't need to explicitly enable peer access for
// allocations on other devices. // allocations on other devices.
checkCudaErrors( checkCudaErrors(
cudaIpcOpenMemHandle(&ptr, *(cudaIpcMemHandle_t *)&shm->memHandle[i], cudaIpcOpenMemHandle(&ptr, *(cudaIpcMemHandle_t *)&shm->memHandle[i], cudaIpcMemLazyEnablePeerAccess));
cudaIpcMemLazyEnablePeerAccess)); checkCudaErrors(cudaIpcOpenEventHandle(&event, *(cudaIpcEventHandle_t *)&shm->eventHandle[i]));
checkCudaErrors(cudaIpcOpenEventHandle(
&event, *(cudaIpcEventHandle_t *)&shm->eventHandle[i]));
ptrs.push_back(ptr); ptrs.push_back(ptr);
events.push_back(event); events.push_back(event);
@ -141,8 +142,7 @@ static void childProcess(int id) {
// Wait for the buffer to be accessed to be ready // Wait for the buffer to be accessed to be ready
checkCudaErrors(cudaStreamWaitEvent(stream, events[bufferId], 0)); checkCudaErrors(cudaStreamWaitEvent(stream, events[bufferId], 0));
// Push a simple kernel on it // Push a simple kernel on it
simpleKernel<<<blocks, threads, 0, stream>>>((char *)ptrs[bufferId], simpleKernel<<<blocks, threads, 0, stream>>>((char *)ptrs[bufferId], DATA_SIZE, id);
DATA_SIZE, id);
checkCudaErrors(cudaGetLastError()); checkCudaErrors(cudaGetLastError());
// Signal that this buffer is ready for the next consumer // Signal that this buffer is ready for the next consumer
checkCudaErrors(cudaEventRecord(events[bufferId], stream)); checkCudaErrors(cudaEventRecord(events[bufferId], stream));
@ -158,8 +158,7 @@ static void childProcess(int id) {
// Now wait for my buffer to be ready so I can copy it locally and verify it // Now wait for my buffer to be ready so I can copy it locally and verify it
checkCudaErrors(cudaStreamWaitEvent(stream, events[id], 0)); checkCudaErrors(cudaStreamWaitEvent(stream, events[id], 0));
checkCudaErrors(cudaMemcpyAsync(&verification_buffer[0], ptrs[id], DATA_SIZE, checkCudaErrors(cudaMemcpyAsync(&verification_buffer[0], ptrs[id], DATA_SIZE, cudaMemcpyDeviceToHost, stream));
cudaMemcpyDeviceToHost, stream));
// And wait for all the queued up work to complete // And wait for all the queued up work to complete
checkCudaErrors(cudaStreamSynchronize(stream)); checkCudaErrors(cudaStreamSynchronize(stream));
@ -169,8 +168,11 @@ static void childProcess(int id) {
char compareId = (char)((id + 1) % procCount); char compareId = (char)((id + 1) % procCount);
for (unsigned long long j = 0; j < DATA_SIZE; j++) { for (unsigned long long j = 0; j < DATA_SIZE; j++) {
if (verification_buffer[j] != compareId) { if (verification_buffer[j] != compareId) {
printf("Process %d: Verification mismatch at %lld: %d != %d\n", id, j, printf("Process %d: Verification mismatch at %lld: %d != %d\n",
(int)verification_buffer[j], (int)compareId); id,
j,
(int)verification_buffer[j],
(int)compareId);
} }
} }
@ -185,7 +187,8 @@ static void childProcess(int id) {
printf("Process %d complete!\n", id); printf("Process %d complete!\n", id);
} }
static void parentProcess(char *app) { static void parentProcess(char *app)
{
sharedMemoryInfo info; sharedMemoryInfo info;
int devCount, i; int devCount, i;
volatile shmStruct *shm = NULL; volatile shmStruct *shm = NULL;
@ -219,17 +222,14 @@ static void parentProcess(char *app) {
// This sample requires two processes accessing each device, so we need // This sample requires two processes accessing each device, so we need
// to ensure exclusive or prohibited mode is not set // to ensure exclusive or prohibited mode is not set
if (prop.computeMode != cudaComputeModeDefault) { if (prop.computeMode != cudaComputeModeDefault) {
printf("Device %d is in an unsupported compute mode for this sample\n", printf("Device %d is in an unsupported compute mode for this sample\n", i);
i);
continue; continue;
} }
for (int j = 0; j < shm->nprocesses; j++) { for (int j = 0; j < shm->nprocesses; j++) {
int canAccessPeerIJ, canAccessPeerJI; int canAccessPeerIJ, canAccessPeerJI;
checkCudaErrors( checkCudaErrors(cudaDeviceCanAccessPeer(&canAccessPeerJI, shm->devices[j], i));
cudaDeviceCanAccessPeer(&canAccessPeerJI, shm->devices[j], i)); checkCudaErrors(cudaDeviceCanAccessPeer(&canAccessPeerIJ, i, shm->devices[j]));
checkCudaErrors(
cudaDeviceCanAccessPeer(&canAccessPeerIJ, i, shm->devices[j]));
if (!canAccessPeerIJ || !canAccessPeerJI) { if (!canAccessPeerIJ || !canAccessPeerJI) {
allPeers = false; allPeers = false;
break; break;
@ -246,10 +246,11 @@ static void parentProcess(char *app) {
checkCudaErrors(cudaDeviceEnablePeerAccess(i, 0)); checkCudaErrors(cudaDeviceEnablePeerAccess(i, 0));
} }
shm->devices[shm->nprocesses++] = i; shm->devices[shm->nprocesses++] = i;
if (shm->nprocesses >= MAX_DEVICES) break; if (shm->nprocesses >= MAX_DEVICES)
} else { break;
printf( }
"Device %d is not peer capable with some other selected peers, " else {
printf("Device %d is not peer capable with some other selected peers, "
"skipping\n", "skipping\n",
i); i);
} }
@ -268,12 +269,9 @@ static void parentProcess(char *app) {
checkCudaErrors(cudaSetDevice(shm->devices[i])); checkCudaErrors(cudaSetDevice(shm->devices[i]));
checkCudaErrors(cudaMalloc(&ptr, DATA_SIZE)); checkCudaErrors(cudaMalloc(&ptr, DATA_SIZE));
checkCudaErrors( checkCudaErrors(cudaIpcGetMemHandle((cudaIpcMemHandle_t *)&shm->memHandle[i], ptr));
cudaIpcGetMemHandle((cudaIpcMemHandle_t *)&shm->memHandle[i], ptr)); checkCudaErrors(cudaEventCreate(&event, cudaEventDisableTiming | cudaEventInterprocess));
checkCudaErrors(cudaEventCreate( checkCudaErrors(cudaIpcGetEventHandle((cudaIpcEventHandle_t *)&shm->eventHandle[i], event));
&event, cudaEventDisableTiming | cudaEventInterprocess));
checkCudaErrors(cudaIpcGetEventHandle(
(cudaIpcEventHandle_t *)&shm->eventHandle[i], event));
ptrs.push_back(ptr); ptrs.push_back(ptr);
events.push_back(event); events.push_back(event);
@ -314,14 +312,16 @@ static void parentProcess(char *app) {
sharedMemoryClose(&info); sharedMemoryClose(&info);
} }
int main(int argc, char **argv) { int main(int argc, char **argv)
{
#if defined(__arm__) || defined(__aarch64__) #if defined(__arm__) || defined(__aarch64__)
printf("Not supported on ARM\n"); printf("Not supported on ARM\n");
return EXIT_WAIVED; return EXIT_WAIVED;
#else #else
if (argc == 1) { if (argc == 1) {
parentProcess(argv[0]); parentProcess(argv[0]);
} else { }
else {
childProcess(atoi(argv[1])); childProcess(atoi(argv[1]));
} }
return EXIT_SUCCESS; return EXIT_SUCCESS;

View File

@ -26,20 +26,20 @@
*/ */
/* /*
* This sample demonstrates how to use texture fetches from layered 2D textures * This sample demonstrates how to use texture fetches from layered 2D textures
* in CUDA C * in CUDA C
* *
* This sample first generates a 3D input data array for the layered texture * This sample first generates a 3D input data array for the layered texture
* and the expected output. Then it starts CUDA C kernels, one for each layer, * and the expected output. Then it starts CUDA C kernels, one for each layer,
* which fetch their layer's texture data (using normalized texture coordinates) * which fetch their layer's texture data (using normalized texture coordinates)
* transform it to the expected output, and write it to a 3D output data array. * transform it to the expected output, and write it to a 3D output data array.
*/ */
// includes, system // includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h> #include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// includes, kernels // includes, kernels
#include <cuda_runtime.h> #include <cuda_runtime.h>
@ -54,8 +54,8 @@ static const char *sSDKname = "simpleLayeredTexture";
//! Transform a layer of a layered 2D texture using texture lookups //! Transform a layer of a layered 2D texture using texture lookups
//! @param g_odata output data in global memory //! @param g_odata output data in global memory
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
__global__ void transformKernel(float *g_odata, int width, int height, __global__ void transformKernel(float *g_odata, int width, int height, int layer, cudaTextureObject_t tex)
int layer, cudaTextureObject_t tex) { {
// calculate this thread's data point // calculate this thread's data point
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -67,14 +67,14 @@ __global__ void transformKernel(float *g_odata, int width, int height,
float v = (y + 0.5f) / (float)height; float v = (y + 0.5f) / (float)height;
// read from texture, do expected transformation and write to global memory // read from texture, do expected transformation and write to global memory
g_odata[layer * width * height + y * width + x] = g_odata[layer * width * height + y * width + x] = -tex2DLayered<float>(tex, u, v, layer) + layer;
-tex2DLayered<float>(tex, u, v, layer) + layer;
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
{
printf("[%s] - Starting...\n", sSDKname); printf("[%s] - Starting...\n", sSDKname);
// use command-line specified CUDA device, otherwise use device with highest // use command-line specified CUDA device, otherwise use device with highest
@ -87,8 +87,7 @@ int main(int argc, char **argv) {
cudaDeviceProp deviceProps; cudaDeviceProp deviceProps;
checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount);
deviceProps.multiProcessorCount);
printf("SM %d.%d\n", deviceProps.major, deviceProps.minor); printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);
// generate input data for layered texture // generate input data for layered texture
@ -106,8 +105,7 @@ int main(int argc, char **argv) {
for (unsigned int layer = 0; layer < num_layers; layer++) for (unsigned int layer = 0; layer < num_layers; layer++)
for (int i = 0; i < (int)(width * height); i++) { for (int i = 0; i < (int)(width * height); i++) {
h_data_ref[layer * width * height + i] = h_data_ref[layer * width * height + i] = -h_data[layer * width * height + i] + layer;
-h_data[layer * width * height + i] + layer;
} }
// allocate device memory for result // allocate device memory for result
@ -115,17 +113,14 @@ int main(int argc, char **argv) {
checkCudaErrors(cudaMalloc((void **)&d_data, size)); checkCudaErrors(cudaMalloc((void **)&d_data, size));
// allocate array and copy image data // allocate array and copy image data
cudaChannelFormatDesc channelDesc = cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray *cu_3darray; cudaArray *cu_3darray;
checkCudaErrors(cudaMalloc3DArray(&cu_3darray, &channelDesc, checkCudaErrors(
make_cudaExtent(width, height, num_layers), cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, height, num_layers), cudaArrayLayered));
cudaArrayLayered));
cudaMemcpy3DParms myparms = {0}; cudaMemcpy3DParms myparms = {0};
myparms.srcPos = make_cudaPos(0, 0, 0); myparms.srcPos = make_cudaPos(0, 0, 0);
myparms.dstPos = make_cudaPos(0, 0, 0); myparms.dstPos = make_cudaPos(0, 0, 0);
myparms.srcPtr = myparms.srcPtr = make_cudaPitchedPtr(h_data, width * sizeof(float), width, height);
make_cudaPitchedPtr(h_data, width * sizeof(float), width, height);
myparms.dstArray = cu_3darray; myparms.dstArray = cu_3darray;
myparms.extent = make_cudaExtent(width, height, num_layers); myparms.extent = make_cudaExtent(width, height, num_layers);
myparms.kind = cudaMemcpyHostToDevice; myparms.kind = cudaMemcpyHostToDevice;
@ -152,10 +147,12 @@ int main(int argc, char **argv) {
dim3 dimBlock(8, 8, 1); dim3 dimBlock(8, 8, 1);
dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1); dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
printf( printf("Covering 2D data array of %d x %d: Grid size is %d x %d, each block has "
"Covering 2D data array of %d x %d: Grid size is %d x %d, each block has "
"8 x 8 threads\n", "8 x 8 threads\n",
width, height, dimGrid.x, dimGrid.y); width,
height,
dimGrid.x,
dimGrid.y);
transformKernel<<<dimGrid, dimBlock>>>(d_data, width, height, 0, transformKernel<<<dimGrid, dimBlock>>>(d_data, width, height, 0,
tex); // warmup (for better timing) tex); // warmup (for better timing)
@ -171,8 +168,7 @@ int main(int argc, char **argv) {
// execute the kernel // execute the kernel
for (unsigned int layer = 0; layer < num_layers; layer++) for (unsigned int layer = 0; layer < num_layers; layer++)
transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, height, layer, transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, height, layer, tex);
tex);
// check if kernel execution generated an error // check if kernel execution generated an error
getLastCudaError("Kernel execution failed"); getLastCudaError("Kernel execution failed");
@ -180,9 +176,7 @@ int main(int argc, char **argv) {
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&timer); sdkStopTimer(&timer);
printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer)); printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
printf("%.2f Mtexlookups/sec\n", printf("%.2f Mtexlookups/sec\n", (width * height * num_layers / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
(width * height * num_layers / (sdkGetTimerValue(&timer) / 1000.0f) /
1e6));
sdkDeleteTimer(&timer); sdkDeleteTimer(&timer);
// allocate mem for the result on host side // allocate mem for the result on host side
@ -193,14 +187,13 @@ int main(int argc, char **argv) {
// write regression file if necessary // write regression file if necessary
if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
// write file for regression test // write file for regression test
sdkWriteFile<float>("./data/regression.dat", h_odata, width * height, 0.0f, sdkWriteFile<float>("./data/regression.dat", h_odata, width * height, 0.0f, false);
false); }
} else { else {
printf("Comparing kernel output to expected data\n"); printf("Comparing kernel output to expected data\n");
#define MIN_EPSILON_ERROR 5e-3f #define MIN_EPSILON_ERROR 5e-3f
bResult = compareData(h_odata, h_data_ref, width * height * num_layers, bResult = compareData(h_odata, h_data_ref, width * height * num_layers, MIN_EPSILON_ERROR, 0.0f);
MIN_EPSILON_ERROR, 0.0f);
} }
// cleanup memory // cleanup memory

View File

@ -26,15 +26,15 @@
*/ */
/* Simple example demonstrating how to use MPI with CUDA /* Simple example demonstrating how to use MPI with CUDA
* *
* Generate some random numbers on one node. * Generate some random numbers on one node.
* Dispatch them to all nodes. * Dispatch them to all nodes.
* Compute their square root on each node's GPU. * Compute their square root on each node's GPU.
* Compute the average of the results using MPI. * Compute the average of the results using MPI.
* *
* simpleMPI.cpp: main program, compiled with mpicxx on linux/Mac platforms * simpleMPI.cpp: main program, compiled with mpicxx on linux/Mac platforms
* on Windows, please download the Microsoft HPC Pack SDK 2008 * on Windows, please download the Microsoft HPC Pack SDK 2008
*/ */
// MPI include // MPI include
#include <mpi.h> #include <mpi.h>
@ -42,8 +42,8 @@
// System includes // System includes
#include <iostream> #include <iostream>
using std::cout;
using std::cerr; using std::cerr;
using std::cout;
using std::endl; using std::endl;
// User include // User include
@ -58,7 +58,8 @@ using std::endl;
// Host code // Host code
// No CUDA here, only MPI // No CUDA here, only MPI
int main(int argc, char *argv[]) { int main(int argc, char *argv[])
{
// Dimensions of the dataset // Dimensions of the dataset
int blockSize = 256; int blockSize = 256;
int gridSize = 10000; int gridSize = 10000;
@ -87,8 +88,8 @@ int main(int argc, char *argv[]) {
float *dataNode = new float[dataSizePerNode]; float *dataNode = new float[dataSizePerNode];
// Dispatch a portion of the input data to each node // Dispatch a portion of the input data to each node
MPI_CHECK(MPI_Scatter(dataRoot, dataSizePerNode, MPI_FLOAT, dataNode, MPI_CHECK(
dataSizePerNode, MPI_FLOAT, 0, MPI_COMM_WORLD)); MPI_Scatter(dataRoot, dataSizePerNode, MPI_FLOAT, dataNode, dataSizePerNode, MPI_FLOAT, 0, MPI_COMM_WORLD));
if (commRank == 0) { if (commRank == 0) {
// No need for root data any more // No need for root data any more
@ -102,8 +103,7 @@ int main(int argc, char *argv[]) {
float sumNode = sum(dataNode, dataSizePerNode); float sumNode = sum(dataNode, dataSizePerNode);
float sumRoot; float sumRoot;
MPI_CHECK( MPI_CHECK(MPI_Reduce(&sumNode, &sumRoot, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD));
MPI_Reduce(&sumNode, &sumRoot, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD));
if (commRank == 0) { if (commRank == 0) {
float average = sumRoot / dataSizeTotal; float average = sumRoot / dataSizeTotal;
@ -122,7 +122,8 @@ int main(int argc, char *argv[]) {
} }
// Shut down MPI cleanly if something goes wrong // Shut down MPI cleanly if something goes wrong
void my_abort(int err) { void my_abort(int err)
{
cout << "Test FAILED\n"; cout << "Test FAILED\n";
MPI_Abort(MPI_COMM_WORLD, err); MPI_Abort(MPI_COMM_WORLD, err);
} }

View File

@ -26,14 +26,14 @@
*/ */
/* Simple example demonstrating how to use MPI with CUDA /* Simple example demonstrating how to use MPI with CUDA
* *
* Generate some random numbers on one node. * Generate some random numbers on one node.
* Dispatch them to all nodes. * Dispatch them to all nodes.
* Compute their square root on each node's GPU. * Compute their square root on each node's GPU.
* Compute the average of the results using MPI. * Compute the average of the results using MPI.
* *
* simpleMPI.cu: GPU part, compiled with nvcc * simpleMPI.cu: GPU part, compiled with nvcc
*/ */
#include <iostream> #include <iostream>
using std::cerr; using std::cerr;
@ -51,13 +51,15 @@ using std::endl;
// Device code // Device code
// Very simple GPU Kernel that computes square roots of input numbers // Very simple GPU Kernel that computes square roots of input numbers
__global__ void simpleMPIKernel(float *input, float *output) { __global__ void simpleMPIKernel(float *input, float *output)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x; int tid = blockIdx.x * blockDim.x + threadIdx.x;
output[tid] = sqrt(input[tid]); output[tid] = sqrt(input[tid]);
} }
// Initialize an array with random data (between 0 and 1) // Initialize an array with random data (between 0 and 1)
void initData(float *data, int dataSize) { void initData(float *data, int dataSize)
{
for (int i = 0; i < dataSize; i++) { for (int i = 0; i < dataSize; i++) {
data[i] = (float)rand() / RAND_MAX; data[i] = (float)rand() / RAND_MAX;
} }
@ -65,7 +67,8 @@ void initData(float *data, int dataSize) {
// CUDA computation on each node // CUDA computation on each node
// No MPI here, only CUDA // No MPI here, only CUDA
void computeGPU(float *hostData, int blockSize, int gridSize) { void computeGPU(float *hostData, int blockSize, int gridSize)
{
int dataSize = blockSize * gridSize; int dataSize = blockSize * gridSize;
// Allocate data on GPU memory // Allocate data on GPU memory
@ -76,22 +79,21 @@ void computeGPU(float *hostData, int blockSize, int gridSize) {
CUDA_CHECK(cudaMalloc((void **)&deviceOutputData, dataSize * sizeof(float))); CUDA_CHECK(cudaMalloc((void **)&deviceOutputData, dataSize * sizeof(float)));
// Copy to GPU memory // Copy to GPU memory
CUDA_CHECK(cudaMemcpy(deviceInputData, hostData, dataSize * sizeof(float), CUDA_CHECK(cudaMemcpy(deviceInputData, hostData, dataSize * sizeof(float), cudaMemcpyHostToDevice));
cudaMemcpyHostToDevice));
// Run kernel // Run kernel
simpleMPIKernel<<<gridSize, blockSize>>>(deviceInputData, deviceOutputData); simpleMPIKernel<<<gridSize, blockSize>>>(deviceInputData, deviceOutputData);
// Copy data back to CPU memory // Copy data back to CPU memory
CUDA_CHECK(cudaMemcpy(hostData, deviceOutputData, dataSize * sizeof(float), CUDA_CHECK(cudaMemcpy(hostData, deviceOutputData, dataSize * sizeof(float), cudaMemcpyDeviceToHost));
cudaMemcpyDeviceToHost));
// Free GPU memory // Free GPU memory
CUDA_CHECK(cudaFree(deviceInputData)); CUDA_CHECK(cudaFree(deviceInputData));
CUDA_CHECK(cudaFree(deviceOutputData)); CUDA_CHECK(cudaFree(deviceOutputData));
} }
float sum(float *data, int size) { float sum(float *data, int size)
{
float accum = 0.f; float accum = 0.f;
for (int i = 0; i < size; i++) { for (int i = 0; i < size; i++) {

View File

@ -26,19 +26,20 @@
*/ */
/* Simple example demonstrating how to use MPI with CUDA /* Simple example demonstrating how to use MPI with CUDA
* *
* Generate some random numbers on one node. * Generate some random numbers on one node.
* Dispatch them to all nodes. * Dispatch them to all nodes.
* Compute their square root on each node's GPU. * Compute their square root on each node's GPU.
* Compute the average of the results using MPI. * Compute the average of the results using MPI.
* *
* simpleMPI.h: common header file * simpleMPI.h: common header file
*/ */
// Forward declarations // Forward declarations
extern "C" { extern "C"
void initData(float *data, int dataSize); {
void computeGPU(float *hostData, int blockSize, int gridSize); void initData(float *data, int dataSize);
float sum(float *data, int size); void computeGPU(float *hostData, int blockSize, int gridSize);
void my_abort(int err); float sum(float *data, int size);
void my_abort(int err);
} }

View File

@ -38,7 +38,7 @@
* *
* Elapsed times are averaged over nreps repetitions (10 by default). * Elapsed times are averaged over nreps repetitions (10 by default).
* *
*/ */
const char *sSDKname = "simpleMultiCopy"; const char *sSDKname = "simpleMultiCopy";
@ -55,7 +55,8 @@ const char *sSDKname = "simpleMultiCopy";
// includes, kernels // includes, kernels
// Declare the CUDA kernels here and main() code that is needed to launch // Declare the CUDA kernels here and main() code that is needed to launch
// Compute workload on the system // Compute workload on the system
__global__ void incKernel(int *g_out, int *g_in, int N, int inner_reps) { __global__ void incKernel(int *g_out, int *g_in, int N, int inner_reps)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x; int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < N) { if (idx < N) {
@ -68,7 +69,7 @@ __global__ void incKernel(int *g_out, int *g_in, int N, int inner_reps) {
#define STREAM_COUNT 4 #define STREAM_COUNT 4
// Uncomment to simulate data source/sink IO times // Uncomment to simulate data source/sink IO times
//#define SIMULATE_IO // #define SIMULATE_IO
int *h_data_source; int *h_data_source;
int *h_data_sink; int *h_data_sink;
@ -102,7 +103,8 @@ bool test();
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char *argv[]) { int main(int argc, char *argv[])
{
int cuda_device = 0; int cuda_device = 0;
float scale_factor; float scale_factor;
cudaDeviceProp deviceProp; cudaDeviceProp deviceProp;
@ -115,7 +117,8 @@ int main(int argc, char *argv[]) {
if (cuda_device < 0) { if (cuda_device < 0) {
printf("Invalid command line parameters\n"); printf("Invalid command line parameters\n");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} else { }
else {
printf("cuda_device = %d\n", cuda_device); printf("cuda_device = %d\n", cuda_device);
cuda_device = gpuDeviceInit(cuda_device); cuda_device = gpuDeviceInit(cuda_device);
@ -124,7 +127,8 @@ int main(int argc, char *argv[]) {
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
} }
} }
} else { }
else {
// Otherwise pick the device with the highest Gflops/s // Otherwise pick the device with the highest Gflops/s
cuda_device = gpuGetMaxGflopsDeviceId(); cuda_device = gpuGetMaxGflopsDeviceId();
checkCudaErrors(cudaSetDevice(cuda_device)); checkCudaErrors(cudaSetDevice(cuda_device));
@ -133,22 +137,23 @@ int main(int argc, char *argv[]) {
} }
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device)); checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n", deviceProp.name, printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n",
deviceProp.name,
deviceProp.multiProcessorCount, deviceProp.multiProcessorCount,
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
deviceProp.multiProcessorCount);
// Anything that is less than 32 Cores will have scaled down workload // Anything that is less than 32 Cores will have scaled down workload
scale_factor = scale_factor =
max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)),
(float)deviceProp.multiProcessorCount)),
1.0f); 1.0f);
N = (int)((float)N / scale_factor); N = (int)((float)N / scale_factor);
printf("> Device name: %s\n", deviceProp.name); printf("> Device name: %s\n", deviceProp.name);
printf("> CUDA Capability %d.%d hardware with %d multi-processors\n", printf("> CUDA Capability %d.%d hardware with %d multi-processors\n",
deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount); deviceProp.major,
deviceProp.minor,
deviceProp.multiProcessorCount);
printf("> scale_factor = %.2f\n", 1.0f / scale_factor); printf("> scale_factor = %.2f\n", 1.0f / scale_factor);
printf("> array_size = %d\n\n", N); printf("> array_size = %d\n\n", N);
@ -165,13 +170,11 @@ int main(int argc, char *argv[]) {
h_data_sink = (int *)malloc(memsize); h_data_sink = (int *)malloc(memsize);
for (int i = 0; i < STREAM_COUNT; ++i) { for (int i = 0; i < STREAM_COUNT; ++i) {
checkCudaErrors( checkCudaErrors(cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault));
cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault));
checkCudaErrors(cudaMalloc(&d_data_in[i], memsize)); checkCudaErrors(cudaMalloc(&d_data_in[i], memsize));
checkCudaErrors(cudaMemset(d_data_in[i], 0, memsize)); checkCudaErrors(cudaMemset(d_data_in[i], 0, memsize));
checkCudaErrors( checkCudaErrors(cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault));
cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault));
checkCudaErrors(cudaMalloc(&d_data_out[i], memsize)); checkCudaErrors(cudaMalloc(&d_data_out[i], memsize));
checkCudaErrors(cudaStreamCreate(&stream[i])); checkCudaErrors(cudaStreamCreate(&stream[i]));
@ -190,8 +193,7 @@ int main(int argc, char *argv[]) {
// Time copies and kernel // Time copies and kernel
cudaEventRecord(start, 0); cudaEventRecord(start, 0);
checkCudaErrors(cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize, checkCudaErrors(cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize, cudaMemcpyHostToDevice, 0));
cudaMemcpyHostToDevice, 0));
cudaEventRecord(stop, 0); cudaEventRecord(stop, 0);
cudaEventSynchronize(stop); cudaEventSynchronize(stop);
@ -199,8 +201,7 @@ int main(int argc, char *argv[]) {
cudaEventElapsedTime(&memcpy_h2d_time, start, stop); cudaEventElapsedTime(&memcpy_h2d_time, start, stop);
cudaEventRecord(start, 0); cudaEventRecord(start, 0);
checkCudaErrors(cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize, checkCudaErrors(cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize, cudaMemcpyDeviceToHost, 0));
cudaMemcpyDeviceToHost, 0));
cudaEventRecord(stop, 0); cudaEventRecord(stop, 0);
cudaEventSynchronize(stop); cudaEventSynchronize(stop);
@ -217,35 +218,27 @@ int main(int argc, char *argv[]) {
printf("\n"); printf("\n");
printf("Relevant properties of this CUDA device\n"); printf("Relevant properties of this CUDA device\n");
printf( printf("(%s) Can overlap one CPU<>GPU data transfer with GPU kernel execution "
"(%s) Can overlap one CPU<>GPU data transfer with GPU kernel execution "
"(device property \"deviceOverlap\")\n", "(device property \"deviceOverlap\")\n",
deviceProp.deviceOverlap ? "X" : " "); deviceProp.deviceOverlap ? "X" : " ");
// printf("(%s) Can execute several GPU kernels simultaneously (compute // printf("(%s) Can execute several GPU kernels simultaneously (compute
// capability >= 2.0)\n", deviceProp.major >= 2 ? "X": " "); // capability >= 2.0)\n", deviceProp.major >= 2 ? "X": " ");
printf( printf("(%s) Can overlap two CPU<>GPU data transfers with GPU kernel execution\n"
"(%s) Can overlap two CPU<>GPU data transfers with GPU kernel execution\n"
" (Compute Capability >= 2.0 AND (Tesla product OR Quadro " " (Compute Capability >= 2.0 AND (Tesla product OR Quadro "
"4000/5000/6000/K5000)\n", "4000/5000/6000/K5000)\n",
(deviceProp.major >= 2 && deviceProp.asyncEngineCount > 1) ? "X" : " "); (deviceProp.major >= 2 && deviceProp.asyncEngineCount > 1) ? "X" : " ");
printf("\n"); printf("\n");
printf("Measured timings (throughput):\n"); printf("Measured timings (throughput):\n");
printf(" Memcpy host to device\t: %f ms (%f GB/s)\n", memcpy_h2d_time, printf(" Memcpy host to device\t: %f ms (%f GB/s)\n", memcpy_h2d_time, (memsize * 1e-6) / memcpy_h2d_time);
(memsize * 1e-6) / memcpy_h2d_time); printf(" Memcpy device to host\t: %f ms (%f GB/s)\n", memcpy_d2h_time, (memsize * 1e-6) / memcpy_d2h_time);
printf(" Memcpy device to host\t: %f ms (%f GB/s)\n", memcpy_d2h_time, printf(" Kernel\t\t\t: %f ms (%f GB/s)\n", kernel_time, (inner_reps * memsize * 2e-6) / kernel_time);
(memsize * 1e-6) / memcpy_d2h_time);
printf(" Kernel\t\t\t: %f ms (%f GB/s)\n", kernel_time,
(inner_reps * memsize * 2e-6) / kernel_time);
printf("\n"); printf("\n");
printf( printf("Theoretical limits for speedup gained from overlapped data "
"Theoretical limits for speedup gained from overlapped data "
"transfers:\n"); "transfers:\n");
printf("No overlap at all (transfer-kernel-transfer): %f ms \n", printf("No overlap at all (transfer-kernel-transfer): %f ms \n", memcpy_h2d_time + memcpy_d2h_time + kernel_time);
memcpy_h2d_time + memcpy_d2h_time + kernel_time); printf("Compute can overlap with one transfer: %f ms\n", max((memcpy_h2d_time + memcpy_d2h_time), kernel_time));
printf("Compute can overlap with one transfer: %f ms\n",
max((memcpy_h2d_time + memcpy_d2h_time), kernel_time));
printf("Compute can overlap with both data transfers: %f ms\n", printf("Compute can overlap with both data transfers: %f ms\n",
max(max(memcpy_h2d_time, memcpy_d2h_time), kernel_time)); max(max(memcpy_h2d_time, memcpy_d2h_time), kernel_time));
@ -254,18 +247,13 @@ int main(int argc, char *argv[]) {
float overlap_time = processWithStreams(STREAM_COUNT); float overlap_time = processWithStreams(STREAM_COUNT);
printf("\nAverage measured timings over %d repetitions:\n", nreps); printf("\nAverage measured timings over %d repetitions:\n", nreps);
printf(" Avg. time when execution fully serialized\t: %f ms\n", printf(" Avg. time when execution fully serialized\t: %f ms\n", serial_time / nreps);
serial_time / nreps); printf(" Avg. time when overlapped using %d streams\t: %f ms\n", STREAM_COUNT, overlap_time / nreps);
printf(" Avg. time when overlapped using %d streams\t: %f ms\n", STREAM_COUNT, printf(" Avg. speedup gained (serialized - overlapped)\t: %f ms\n", (serial_time - overlap_time) / nreps);
overlap_time / nreps);
printf(" Avg. speedup gained (serialized - overlapped)\t: %f ms\n",
(serial_time - overlap_time) / nreps);
printf("\nMeasured throughput:\n"); printf("\nMeasured throughput:\n");
printf(" Fully serialized execution\t\t: %f GB/s\n", printf(" Fully serialized execution\t\t: %f GB/s\n", (nreps * (memsize * 2e-6)) / serial_time);
(nreps * (memsize * 2e-6)) / serial_time); printf(" Overlapped using %d streams\t\t: %f GB/s\n", STREAM_COUNT, (nreps * (memsize * 2e-6)) / overlap_time);
printf(" Overlapped using %d streams\t\t: %f GB/s\n", STREAM_COUNT,
(nreps * (memsize * 2e-6)) / overlap_time);
// Verify the results, we will use the results for final output // Verify the results, we will use the results for final output
bool bResults = test(); bool bResults = test();
@ -293,7 +281,8 @@ int main(int argc, char *argv[]) {
exit(bResults ? EXIT_SUCCESS : EXIT_FAILURE); exit(bResults ? EXIT_SUCCESS : EXIT_FAILURE);
} }
float processWithStreams(int streams_used) { float processWithStreams(int streams_used)
{
int current_stream = 0; int current_stream = 0;
float time; float time;
@ -326,17 +315,17 @@ float processWithStreams(int streams_used) {
d_data_out[current_stream], d_data_in[current_stream], N, inner_reps); d_data_out[current_stream], d_data_in[current_stream], N, inner_reps);
// Upload next frame // Upload next frame
checkCudaErrors( checkCudaErrors(cudaMemcpyAsync(
cudaMemcpyAsync(d_data_in[next_stream], h_data_in[next_stream], memsize, d_data_in[next_stream], h_data_in[next_stream], memsize, cudaMemcpyHostToDevice, stream[next_stream]));
cudaMemcpyHostToDevice, stream[next_stream]));
// Download current frame // Download current frame
checkCudaErrors(cudaMemcpyAsync( checkCudaErrors(cudaMemcpyAsync(h_data_out[current_stream],
h_data_out[current_stream], d_data_out[current_stream], memsize, d_data_out[current_stream],
cudaMemcpyDeviceToHost, stream[current_stream])); memsize,
cudaMemcpyDeviceToHost,
stream[current_stream]));
checkCudaErrors( checkCudaErrors(cudaEventRecord(cycleDone[current_stream], stream[current_stream]));
cudaEventRecord(cycleDone[current_stream], stream[current_stream]));
current_stream = next_stream; current_stream = next_stream;
} }
@ -350,7 +339,8 @@ float processWithStreams(int streams_used) {
return time; return time;
} }
void init() { void init()
{
for (int i = 0; i < N; ++i) { for (int i = 0; i < N; ++i) {
h_data_source[i] = 0; h_data_source[i] = 0;
} }
@ -360,7 +350,8 @@ void init() {
} }
} }
bool test() { bool test()
{
bool passed = true; bool passed = true;
for (int j = 0; j < STREAM_COUNT; ++j) { for (int j = 0; j < STREAM_COUNT; ++j) {

View File

@ -37,15 +37,15 @@
*/ */
// System includes // System includes
#include <stdio.h>
#include <assert.h> #include <assert.h>
#include <stdio.h>
// CUDA runtime // CUDA runtime
#include <cuda_runtime.h> #include <cuda_runtime.h>
// helper functions and utilities to work with CUDA // helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h> #include <helper_cuda.h>
#include <helper_functions.h>
#ifndef MAX #ifndef MAX
#define MAX(a, b) (a > b ? a : b) #define MAX(a, b) (a > b ? a : b)
@ -64,12 +64,14 @@ const int DATA_N = 1048576 * 32;
// Refer to the 'reduction' CUDA Sample describing // Refer to the 'reduction' CUDA Sample describing
// reduction optimization strategies // reduction optimization strategies
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
__global__ static void reduceKernel(float *d_Result, float *d_Input, int N) { __global__ static void reduceKernel(float *d_Result, float *d_Input, int N)
{
const int tid = blockIdx.x * blockDim.x + threadIdx.x; const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int threadN = gridDim.x * blockDim.x; const int threadN = gridDim.x * blockDim.x;
float sum = 0; float sum = 0;
for (int pos = tid; pos < N; pos += threadN) sum += d_Input[pos]; for (int pos = tid; pos < N; pos += threadN)
sum += d_Input[pos];
d_Result[tid] = sum; d_Result[tid] = sum;
} }
@ -77,7 +79,8 @@ __global__ static void reduceKernel(float *d_Result, float *d_Input, int N) {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
{
// Solver config // Solver config
TGPUplan plan[MAX_GPU_COUNT]; TGPUplan plan[MAX_GPU_COUNT];
@ -129,14 +132,10 @@ int main(int argc, char **argv) {
checkCudaErrors(cudaSetDevice(i)); checkCudaErrors(cudaSetDevice(i));
checkCudaErrors(cudaStreamCreate(&plan[i].stream)); checkCudaErrors(cudaStreamCreate(&plan[i].stream));
// Allocate memory // Allocate memory
checkCudaErrors( checkCudaErrors(cudaMalloc((void **)&plan[i].d_Data, plan[i].dataN * sizeof(float)));
cudaMalloc((void **)&plan[i].d_Data, plan[i].dataN * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&plan[i].d_Sum, ACCUM_N * sizeof(float)));
checkCudaErrors( checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Sum_from_device, ACCUM_N * sizeof(float)));
cudaMalloc((void **)&plan[i].d_Sum, ACCUM_N * sizeof(float))); checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Data, plan[i].dataN * sizeof(float)));
checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Sum_from_device,
ACCUM_N * sizeof(float)));
checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Data,
plan[i].dataN * sizeof(float)));
for (j = 0; j < plan[i].dataN; j++) { for (j = 0; j < plan[i].dataN; j++) {
plan[i].h_Data[j] = (float)rand() / (float)RAND_MAX; plan[i].h_Data[j] = (float)rand() / (float)RAND_MAX;
@ -158,19 +157,16 @@ int main(int argc, char **argv) {
checkCudaErrors(cudaSetDevice(i)); checkCudaErrors(cudaSetDevice(i));
// Copy input data from CPU // Copy input data from CPU
checkCudaErrors(cudaMemcpyAsync(plan[i].d_Data, plan[i].h_Data, checkCudaErrors(cudaMemcpyAsync(
plan[i].dataN * sizeof(float), plan[i].d_Data, plan[i].h_Data, plan[i].dataN * sizeof(float), cudaMemcpyHostToDevice, plan[i].stream));
cudaMemcpyHostToDevice, plan[i].stream));
// Perform GPU computations // Perform GPU computations
reduceKernel<<<BLOCK_N, THREAD_N, 0, plan[i].stream>>>( reduceKernel<<<BLOCK_N, THREAD_N, 0, plan[i].stream>>>(plan[i].d_Sum, plan[i].d_Data, plan[i].dataN);
plan[i].d_Sum, plan[i].d_Data, plan[i].dataN);
getLastCudaError("reduceKernel() execution failed.\n"); getLastCudaError("reduceKernel() execution failed.\n");
// Read back GPU results // Read back GPU results
checkCudaErrors(cudaMemcpyAsync(plan[i].h_Sum_from_device, plan[i].d_Sum, checkCudaErrors(cudaMemcpyAsync(
ACCUM_N * sizeof(float), plan[i].h_Sum_from_device, plan[i].d_Sum, ACCUM_N * sizeof(float), cudaMemcpyDeviceToHost, plan[i].stream));
cudaMemcpyDeviceToHost, plan[i].stream));
} }
// Process GPU results // Process GPU results

View File

@ -37,7 +37,8 @@
#ifndef SIMPLEMULTIGPU_H #ifndef SIMPLEMULTIGPU_H
#define SIMPLEMULTIGPU_H #define SIMPLEMULTIGPU_H
typedef struct { typedef struct
{
// Host-side input data // Host-side input data
int dataN; int dataN;
float *h_Data; float *h_Data;
@ -56,7 +57,6 @@ typedef struct {
} TGPUplan; } TGPUplan;
extern "C" void launch_reduceKernel(float *d_Result, float *d_Input, int N, extern "C" void launch_reduceKernel(float *d_Result, float *d_Input, int N, int BLOCK_N, int THREAD_N, cudaStream_t &s);
int BLOCK_N, int THREAD_N, cudaStream_t &s);
#endif #endif

View File

@ -25,8 +25,8 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#include <iostream>
#include <helper_cuda.h> // helper functions for CUDA error check #include <helper_cuda.h> // helper functions for CUDA error check
#include <iostream>
const int manualBlockSize = 32; const int manualBlockSize = 32;
@ -38,7 +38,8 @@ const int manualBlockSize = 32;
// execution configuration, including anything the launch configurator // execution configuration, including anything the launch configurator
// API suggests. // API suggests.
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
__global__ void square(int *array, int arrayCount) { __global__ void square(int *array, int arrayCount)
{
extern __shared__ int dynamicSmem[]; extern __shared__ int dynamicSmem[];
int idx = threadIdx.x + blockIdx.x * blockDim.x; int idx = threadIdx.x + blockIdx.x * blockDim.x;
@ -58,8 +59,8 @@ __global__ void square(int *array, int arrayCount) {
// This wrapper routine computes the occupancy of kernel, and reports // This wrapper routine computes the occupancy of kernel, and reports
// it in terms of active warps / maximum warps per SM. // it in terms of active warps / maximum warps per SM.
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
static double reportPotentialOccupancy(void *kernel, int blockSize, static double reportPotentialOccupancy(void *kernel, int blockSize, size_t dynamicSMem)
size_t dynamicSMem) { {
int device; int device;
cudaDeviceProp prop; cudaDeviceProp prop;
@ -72,8 +73,7 @@ static double reportPotentialOccupancy(void *kernel, int blockSize,
checkCudaErrors(cudaGetDevice(&device)); checkCudaErrors(cudaGetDevice(&device));
checkCudaErrors(cudaGetDeviceProperties(&prop, device)); checkCudaErrors(cudaGetDeviceProperties(&prop, device));
checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor( checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, kernel, blockSize, dynamicSMem));
&numBlocks, kernel, blockSize, dynamicSMem));
activeWarps = numBlocks * blockSize / prop.warpSize; activeWarps = numBlocks * blockSize / prop.warpSize;
maxWarps = prop.maxThreadsPerMultiProcessor / prop.warpSize; maxWarps = prop.maxThreadsPerMultiProcessor / prop.warpSize;
@ -99,7 +99,8 @@ static double reportPotentialOccupancy(void *kernel, int blockSize,
// This function configures the launch based on the "automatic" // This function configures the launch based on the "automatic"
// argument, records the runtime, and reports occupancy and runtime. // argument, records the runtime, and reports occupancy and runtime.
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
static int launchConfig(int *array, int arrayCount, bool automatic) { static int launchConfig(int *array, int arrayCount, bool automatic)
{
int blockSize; int blockSize;
int minGridSize; int minGridSize;
int gridSize; int gridSize;
@ -116,14 +117,13 @@ static int launchConfig(int *array, int arrayCount, bool automatic) {
checkCudaErrors(cudaEventCreate(&end)); checkCudaErrors(cudaEventCreate(&end));
if (automatic) { if (automatic) {
checkCudaErrors(cudaOccupancyMaxPotentialBlockSize( checkCudaErrors(
&minGridSize, &blockSize, (void *)square, dynamicSMemUsage, cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)square, dynamicSMemUsage, arrayCount));
arrayCount));
std::cout << "Suggested block size: " << blockSize << std::endl std::cout << "Suggested block size: " << blockSize << std::endl
<< "Minimum grid size for maximum occupancy: " << minGridSize << "Minimum grid size for maximum occupancy: " << minGridSize << std::endl;
<< std::endl; }
} else { else {
// This block size is too small. Given limited number of // This block size is too small. Given limited number of
// active blocks per multiprocessor, the number of active // active blocks per multiprocessor, the number of active
// threads will be limited, and thus unable to achieve maximum // threads will be limited, and thus unable to achieve maximum
@ -146,11 +146,9 @@ static int launchConfig(int *array, int arrayCount, bool automatic) {
// Calculate occupancy // Calculate occupancy
// //
potentialOccupancy = potentialOccupancy = reportPotentialOccupancy((void *)square, blockSize, dynamicSMemUsage);
reportPotentialOccupancy((void *)square, blockSize, dynamicSMemUsage);
std::cout << "Potential occupancy: " << potentialOccupancy * 100 << "%" std::cout << "Potential occupancy: " << potentialOccupancy * 100 << "%" << std::endl;
<< std::endl;
// Report elapsed time // Report elapsed time
// //
@ -166,7 +164,8 @@ static int launchConfig(int *array, int arrayCount, bool automatic) {
// The test generates an array and squares it with a CUDA kernel, then // The test generates an array and squares it with a CUDA kernel, then
// verifies the result. // verifies the result.
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
static int test(bool automaticLaunchConfig, const int count = 1000000) { static int test(bool automaticLaunchConfig, const int count = 1000000)
{
int *array; int *array;
int *dArray; int *dArray;
int size = count * sizeof(int); int size = count * sizeof(int);
@ -193,8 +192,7 @@ static int test(bool automaticLaunchConfig, const int count = 1000000) {
// //
for (int i = 0; i < count; i += 1) { for (int i = 0; i < count; i += 1) {
if (array[i] != i * i) { if (array[i] != i * i) {
std::cout << "element " << i << " expected " << i * i << " actual " std::cout << "element " << i << " expected " << i * i << " actual " << array[i] << std::endl;
<< array[i] << std::endl;
return 1; return 1;
} }
} }
@ -210,13 +208,13 @@ static int test(bool automaticLaunchConfig, const int count = 1000000) {
// automatically configured launch, and reports the occupancy and // automatically configured launch, and reports the occupancy and
// performance. // performance.
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main() { int main()
{
int status; int status;
std::cout << "starting Simple Occupancy" << std::endl << std::endl; std::cout << "starting Simple Occupancy" << std::endl << std::endl;
std::cout << "[ Manual configuration with " << manualBlockSize std::cout << "[ Manual configuration with " << manualBlockSize << " threads per block ]" << std::endl;
<< " threads per block ]" << std::endl;
status = test(false); status = test(false);
if (status) { if (status) {

View File

@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in [Dependencies]() section above are installed. Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## References (for more details) ## References (for more details)

View File

@ -31,8 +31,8 @@
*/ */
// includes, system // includes, system
#include <stdlib.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h>
// CUDA includes // CUDA includes
#include <cuda_runtime.h> #include <cuda_runtime.h>
@ -41,7 +41,8 @@
#include <helper_cuda.h> #include <helper_cuda.h>
#include <helper_functions.h> // helper for shared that are common to CUDA Samples #include <helper_functions.h> // helper for shared that are common to CUDA Samples
__global__ void SimpleKernel(float *src, float *dst) { __global__ void SimpleKernel(float *src, float *dst)
{
// Just a dummy kernel, doing enough for us to verify that everything // Just a dummy kernel, doing enough for us to verify that everything
// worked // worked
const int idx = blockIdx.x * blockDim.x + threadIdx.x; const int idx = blockIdx.x * blockDim.x + threadIdx.x;
@ -50,12 +51,12 @@ __global__ void SimpleKernel(float *src, float *dst) {
inline bool IsAppBuiltAs64() { return sizeof(void *) == 8; } inline bool IsAppBuiltAs64() { return sizeof(void *) == 8; }
int main(int argc, char **argv) { int main(int argc, char **argv)
{
printf("[%s] - Starting...\n", argv[0]); printf("[%s] - Starting...\n", argv[0]);
if (!IsAppBuiltAs64()) { if (!IsAppBuiltAs64()) {
printf( printf("%s is only supported with on 64-bit OSs and the application must be "
"%s is only supported with on 64-bit OSs and the application must be "
"built as a 64-bit target. Test is being waived.\n", "built as a 64-bit target. Test is being waived.\n",
argv[0]); argv[0]);
exit(EXIT_WAIVED); exit(EXIT_WAIVED);
@ -68,8 +69,7 @@ int main(int argc, char **argv) {
printf("CUDA-capable device count: %i\n", gpu_n); printf("CUDA-capable device count: %i\n", gpu_n);
if (gpu_n < 2) { if (gpu_n < 2) {
printf( printf("Two or more GPUs with Peer-to-Peer access capability are required for "
"Two or more GPUs with Peer-to-Peer access capability are required for "
"%s.\n", "%s.\n",
argv[0]); argv[0]);
printf("Waiving test.\n"); printf("Waiving test.\n");
@ -97,8 +97,12 @@ int main(int argc, char **argv) {
continue; continue;
} }
checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, i, j)); checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, i, j));
printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[i].name, printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
i, prop[j].name, j, can_access_peer ? "Yes" : "No"); prop[i].name,
i,
prop[j].name,
j,
can_access_peer ? "Yes" : "No");
if (can_access_peer && p2pCapableGPUs[0] == -1) { if (can_access_peer && p2pCapableGPUs[0] == -1) {
p2pCapableGPUs[0] = i; p2pCapableGPUs[0] = i;
p2pCapableGPUs[1] = j; p2pCapableGPUs[1] = j;
@ -107,12 +111,10 @@ int main(int argc, char **argv) {
} }
if (p2pCapableGPUs[0] == -1 || p2pCapableGPUs[1] == -1) { if (p2pCapableGPUs[0] == -1 || p2pCapableGPUs[1] == -1) {
printf( printf("Two or more GPUs with Peer-to-Peer access capability are required for "
"Two or more GPUs with Peer-to-Peer access capability are required for "
"%s.\n", "%s.\n",
argv[0]); argv[0]);
printf( printf("Peer to Peer access is not available amongst GPUs in the system, "
"Peer to Peer access is not available amongst GPUs in the system, "
"waiving test.\n"); "waiving test.\n");
exit(EXIT_WAIVED); exit(EXIT_WAIVED);
@ -123,8 +125,7 @@ int main(int argc, char **argv) {
gpuid[1] = p2pCapableGPUs[1]; gpuid[1] = p2pCapableGPUs[1];
// Enable peer access // Enable peer access
printf("Enabling peer access between GPU%d and GPU%d...\n", gpuid[0], printf("Enabling peer access between GPU%d and GPU%d...\n", gpuid[0], gpuid[1]);
gpuid[1]);
checkCudaErrors(cudaSetDevice(gpuid[0])); checkCudaErrors(cudaSetDevice(gpuid[0]));
checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[1], 0)); checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[1], 0));
checkCudaErrors(cudaSetDevice(gpuid[1])); checkCudaErrors(cudaSetDevice(gpuid[1]));
@ -132,8 +133,8 @@ int main(int argc, char **argv) {
// Allocate buffers // Allocate buffers
const size_t buf_size = 1024 * 1024 * 16 * sizeof(float); const size_t buf_size = 1024 * 1024 * 16 * sizeof(float);
printf("Allocating buffers (%iMB on GPU%d, GPU%d and CPU Host)...\n", printf(
int(buf_size / 1024 / 1024), gpuid[0], gpuid[1]); "Allocating buffers (%iMB on GPU%d, GPU%d and CPU Host)...\n", int(buf_size / 1024 / 1024), gpuid[0], gpuid[1]);
checkCudaErrors(cudaSetDevice(gpuid[0])); checkCudaErrors(cudaSetDevice(gpuid[0]));
float *g0; float *g0;
checkCudaErrors(cudaMalloc(&g0, buf_size)); checkCudaErrors(cudaMalloc(&g0, buf_size));
@ -141,8 +142,7 @@ int main(int argc, char **argv) {
float *g1; float *g1;
checkCudaErrors(cudaMalloc(&g1, buf_size)); checkCudaErrors(cudaMalloc(&g1, buf_size));
float *h0; float *h0;
checkCudaErrors( checkCudaErrors(cudaMallocHost(&h0, buf_size)); // Automatically portable with UVA
cudaMallocHost(&h0, buf_size)); // Automatically portable with UVA
// Create CUDA event handles // Create CUDA event handles
printf("Creating event handles...\n"); printf("Creating event handles...\n");
@ -161,7 +161,8 @@ int main(int argc, char **argv) {
// Ping-pong copy between GPUs // Ping-pong copy between GPUs
if (i % 2 == 0) { if (i % 2 == 0) {
checkCudaErrors(cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault)); checkCudaErrors(cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault));
} else { }
else {
checkCudaErrors(cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault)); checkCudaErrors(cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault));
} }
} }
@ -170,9 +171,9 @@ int main(int argc, char **argv) {
checkCudaErrors(cudaEventSynchronize(stop_event)); checkCudaErrors(cudaEventSynchronize(stop_event));
checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event)); checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
printf("cudaMemcpyPeer / cudaMemcpy between GPU%d and GPU%d: %.2fGB/s\n", printf("cudaMemcpyPeer / cudaMemcpy between GPU%d and GPU%d: %.2fGB/s\n",
gpuid[0], gpuid[1], gpuid[0],
(1.0f / (time_memcpy / 1000.0f)) * ((100.0f * buf_size)) / 1024.0f / gpuid[1],
1024.0f / 1024.0f); (1.0f / (time_memcpy / 1000.0f)) * ((100.0f * buf_size)) / 1024.0f / 1024.0f / 1024.0f);
// Prepare host buffer and copy to GPU 0 // Prepare host buffer and copy to GPU 0
printf("Preparing host buffer and memcpy to GPU%d...\n", gpuid[0]); printf("Preparing host buffer and memcpy to GPU%d...\n", gpuid[0]);
@ -190,10 +191,11 @@ int main(int argc, char **argv) {
// Run kernel on GPU 1, reading input from the GPU 0 buffer, writing // Run kernel on GPU 1, reading input from the GPU 0 buffer, writing
// output to the GPU 1 buffer // output to the GPU 1 buffer
printf( printf("Run kernel on GPU%d, taking source data from GPU%d and writing to "
"Run kernel on GPU%d, taking source data from GPU%d and writing to "
"GPU%d...\n", "GPU%d...\n",
gpuid[1], gpuid[0], gpuid[1]); gpuid[1],
gpuid[0],
gpuid[1]);
checkCudaErrors(cudaSetDevice(gpuid[1])); checkCudaErrors(cudaSetDevice(gpuid[1]));
SimpleKernel<<<blocks, threads>>>(g0, g1); SimpleKernel<<<blocks, threads>>>(g0, g1);
@ -201,10 +203,11 @@ int main(int argc, char **argv) {
// Run kernel on GPU 0, reading input from the GPU 1 buffer, writing // Run kernel on GPU 0, reading input from the GPU 1 buffer, writing
// output to the GPU 0 buffer // output to the GPU 0 buffer
printf( printf("Run kernel on GPU%d, taking source data from GPU%d and writing to "
"Run kernel on GPU%d, taking source data from GPU%d and writing to "
"GPU%d...\n", "GPU%d...\n",
gpuid[0], gpuid[1], gpuid[0]); gpuid[0],
gpuid[1],
gpuid[0]);
checkCudaErrors(cudaSetDevice(gpuid[0])); checkCudaErrors(cudaSetDevice(gpuid[0]));
SimpleKernel<<<blocks, threads>>>(g1, g0); SimpleKernel<<<blocks, threads>>>(g1, g0);
@ -220,8 +223,7 @@ int main(int argc, char **argv) {
// Re-generate input data and apply 2x '* 2.0f' computation of both // Re-generate input data and apply 2x '* 2.0f' computation of both
// kernel runs // kernel runs
if (h0[i] != float(i % 4096) * 2.0f * 2.0f) { if (h0[i] != float(i % 4096) * 2.0f * 2.0f) {
printf("Verification error @ element %i: val = %f, ref = %f\n", i, h0[i], printf("Verification error @ element %i: val = %f, ref = %f\n", i, h0[i], (float(i % 4096) * 2.0f * 2.0f));
(float(i % 4096) * 2.0f * 2.0f));
if (error_count++ > 10) { if (error_count++ > 10) {
break; break;
@ -253,7 +255,8 @@ int main(int argc, char **argv) {
if (error_count != 0) { if (error_count != 0) {
printf("Test failed!\n"); printf("Test failed!\n");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} else { }
else {
printf("Test passed\n"); printf("Test passed\n");
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
} }

View File

@ -26,16 +26,16 @@
*/ */
/* pitchLinearTexture /* pitchLinearTexture
* *
* This example demonstrates how to use textures bound to pitch linear memory. * This example demonstrates how to use textures bound to pitch linear memory.
* It performs a shift of matrix elements using wrap addressing mode (aka * It performs a shift of matrix elements using wrap addressing mode (aka
* periodic boundary conditions) on two arrays, a pitch linear and a CUDA array, * periodic boundary conditions) on two arrays, a pitch linear and a CUDA array,
* in order to highlight the differences in using each. * in order to highlight the differences in using each.
* *
* Textures binding to pitch linear memory is a new feature in CUDA 2.2, * Textures binding to pitch linear memory is a new feature in CUDA 2.2,
* and allows use of texture features such as wrap addressing mode and * and allows use of texture features such as wrap addressing mode and
* filtering which are not possible with textures bound to regular linear memory * filtering which are not possible with textures bound to regular linear memory
*/ */
// includes, system // includes, system
#include <stdio.h> #include <stdio.h>
@ -70,29 +70,26 @@ bool bTestResult = true;
//! Shifts matrix elements using pitch linear array //! Shifts matrix elements using pitch linear array
//! @param odata output data in global memory //! @param odata output data in global memory
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
__global__ void shiftPitchLinear(float *odata, int pitch, int width, int height, __global__ void
int shiftX, int shiftY, shiftPitchLinear(float *odata, int pitch, int width, int height, int shiftX, int shiftY, cudaTextureObject_t texRefPL)
cudaTextureObject_t texRefPL) { {
int xid = blockIdx.x * blockDim.x + threadIdx.x; int xid = blockIdx.x * blockDim.x + threadIdx.x;
int yid = blockIdx.y * blockDim.y + threadIdx.y; int yid = blockIdx.y * blockDim.y + threadIdx.y;
odata[yid * pitch + xid] = tex2D<float>( odata[yid * pitch + xid] = tex2D<float>(texRefPL, (xid + shiftX) / (float)width, (yid + shiftY) / (float)height);
texRefPL, (xid + shiftX) / (float)width, (yid + shiftY) / (float)height);
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Shifts matrix elements using regular array //! Shifts matrix elements using regular array
//! @param odata output data in global memory //! @param odata output data in global memory
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
__global__ void shiftArray(float *odata, int pitch, int width, int height, __global__ void
int shiftX, int shiftY, shiftArray(float *odata, int pitch, int width, int height, int shiftX, int shiftY, cudaTextureObject_t texRefArray)
cudaTextureObject_t texRefArray) { {
int xid = blockIdx.x * blockDim.x + threadIdx.x; int xid = blockIdx.x * blockDim.x + threadIdx.x;
int yid = blockIdx.y * blockDim.y + threadIdx.y; int yid = blockIdx.y * blockDim.y + threadIdx.y;
odata[yid * pitch + xid] = odata[yid * pitch + xid] = tex2D<float>(texRefArray, (xid + shiftX) / (float)width, (yid + shiftY) / (float)height);
tex2D<float>(texRefArray, (xid + shiftX) / (float)width,
(yid + shiftY) / (float)height);
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@ -102,20 +99,21 @@ void runTest(int argc, char **argv);
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
{
printf("%s starting...\n\n", sSDKsample); printf("%s starting...\n\n", sSDKsample);
runTest(argc, argv); runTest(argc, argv);
printf("%s completed, returned %s\n", sSDKsample, printf("%s completed, returned %s\n", sSDKsample, bTestResult ? "OK" : "ERROR!");
bTestResult ? "OK" : "ERROR!");
exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA //! Run a simple test for CUDA
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) { void runTest(int argc, char **argv)
{
// Set array size // Set array size
const int nx = 2048; const int nx = 2048;
const int ny = 2048; const int ny = 2048;
@ -154,8 +152,7 @@ void runTest(int argc, char **argv) {
float *d_idataPL; float *d_idataPL;
size_t d_pitchBytes; size_t d_pitchBytes;
checkCudaErrors(cudaMallocPitch((void **)&d_idataPL, &d_pitchBytes, checkCudaErrors(cudaMallocPitch((void **)&d_idataPL, &d_pitchBytes, nx * sizeof(float), ny));
nx * sizeof(float), ny));
// Array input data // Array input data
cudaArray *d_idataArray; cudaArray *d_idataArray;
@ -165,20 +162,17 @@ void runTest(int argc, char **argv) {
// Pitch linear output data // Pitch linear output data
float *d_odata; float *d_odata;
checkCudaErrors(cudaMallocPitch((void **)&d_odata, &d_pitchBytes, checkCudaErrors(cudaMallocPitch((void **)&d_odata, &d_pitchBytes, nx * sizeof(float), ny));
nx * sizeof(float), ny));
// Copy host data to device // Copy host data to device
// Pitch linear // Pitch linear
size_t h_pitchBytes = nx * sizeof(float); size_t h_pitchBytes = nx * sizeof(float);
checkCudaErrors(cudaMemcpy2D(d_idataPL, d_pitchBytes, h_idata, h_pitchBytes, checkCudaErrors(
nx * sizeof(float), ny, cudaMemcpyHostToDevice)); cudaMemcpy2D(d_idataPL, d_pitchBytes, h_idata, h_pitchBytes, nx * sizeof(float), ny, cudaMemcpyHostToDevice));
// Array // Array
checkCudaErrors(cudaMemcpyToArray(d_idataArray, 0, 0, h_idata, checkCudaErrors(cudaMemcpyToArray(d_idataArray, 0, 0, h_idata, nx * ny * sizeof(float), cudaMemcpyHostToDevice));
nx * ny * sizeof(float),
cudaMemcpyHostToDevice));
cudaTextureObject_t texRefPL; cudaTextureObject_t texRefPL;
cudaTextureObject_t texRefArray; cudaTextureObject_t texRefArray;
@ -210,8 +204,7 @@ void runTest(int argc, char **argv) {
texDescr.addressMode[0] = cudaAddressModeWrap; texDescr.addressMode[0] = cudaAddressModeWrap;
texDescr.addressMode[1] = cudaAddressModeWrap; texDescr.addressMode[1] = cudaAddressModeWrap;
texDescr.readMode = cudaReadModeElementType; texDescr.readMode = cudaReadModeElementType;
checkCudaErrors( checkCudaErrors(cudaCreateTextureObject(&texRefArray, &texRes, &texDescr, NULL));
cudaCreateTextureObject(&texRefArray, &texRes, &texDescr, NULL));
// Reference calculation // Reference calculation
for (int j = 0; j < ny; ++j) { for (int j = 0; j < ny; ++j) {
@ -224,15 +217,13 @@ void runTest(int argc, char **argv) {
} }
// Run ShiftPitchLinear kernel // Run ShiftPitchLinear kernel
checkCudaErrors( checkCudaErrors(cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny));
cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny));
checkCudaErrors(cudaEventRecord(start, 0)); checkCudaErrors(cudaEventRecord(start, 0));
for (int i = 0; i < NUM_REPS; ++i) { for (int i = 0; i < NUM_REPS; ++i) {
shiftPitchLinear<<<dimGrid, dimBlock>>>(d_odata, shiftPitchLinear<<<dimGrid, dimBlock>>>(
(int)(d_pitchBytes / sizeof(float)), d_odata, (int)(d_pitchBytes / sizeof(float)), nx, ny, x_shift, y_shift, texRefPL);
nx, ny, x_shift, y_shift, texRefPL);
} }
checkCudaErrors(cudaEventRecord(stop, 0)); checkCudaErrors(cudaEventRecord(stop, 0));
@ -241,8 +232,8 @@ void runTest(int argc, char **argv) {
checkCudaErrors(cudaEventElapsedTime(&timePL, start, stop)); checkCudaErrors(cudaEventElapsedTime(&timePL, start, stop));
// Check results // Check results
checkCudaErrors(cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes, checkCudaErrors(
nx * sizeof(float), ny, cudaMemcpyDeviceToHost)); cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes, nx * sizeof(float), ny, cudaMemcpyDeviceToHost));
bool res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f); bool res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f);
@ -254,14 +245,12 @@ void runTest(int argc, char **argv) {
} }
// Run ShiftArray kernel // Run ShiftArray kernel
checkCudaErrors( checkCudaErrors(cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny));
cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny));
checkCudaErrors(cudaEventRecord(start, 0)); checkCudaErrors(cudaEventRecord(start, 0));
for (int i = 0; i < NUM_REPS; ++i) { for (int i = 0; i < NUM_REPS; ++i) {
shiftArray<<<dimGrid, dimBlock>>>(d_odata, shiftArray<<<dimGrid, dimBlock>>>(
(int)(d_pitchBytes / sizeof(float)), nx, d_odata, (int)(d_pitchBytes / sizeof(float)), nx, ny, x_shift, y_shift, texRefArray);
ny, x_shift, y_shift, texRefArray);
} }
checkCudaErrors(cudaEventRecord(stop, 0)); checkCudaErrors(cudaEventRecord(stop, 0));
@ -270,8 +259,8 @@ void runTest(int argc, char **argv) {
checkCudaErrors(cudaEventElapsedTime(&timeArray, start, stop)); checkCudaErrors(cudaEventElapsedTime(&timeArray, start, stop));
// Check results // Check results
checkCudaErrors(cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes, checkCudaErrors(
nx * sizeof(float), ny, cudaMemcpyDeviceToHost)); cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes, nx * sizeof(float), ny, cudaMemcpyDeviceToHost));
res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f); res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f);
if (res == false) { if (res == false) {
@ -279,21 +268,18 @@ void runTest(int argc, char **argv) {
bTestResult = false; bTestResult = false;
} }
float bandwidthPL = float bandwidthPL = 2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) / (timePL / NUM_REPS);
2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) / (timePL / NUM_REPS); float bandwidthArray = 2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) / (timeArray / NUM_REPS);
float bandwidthArray = 2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) /
(timeArray / NUM_REPS);
printf("\nBandwidth (GB/s) for pitch linear: %.2e; for array: %.2e\n", printf("\nBandwidth (GB/s) for pitch linear: %.2e; for array: %.2e\n", bandwidthPL, bandwidthArray);
bandwidthPL, bandwidthArray);
float fetchRatePL = nx * ny / 1.e+6f / (timePL / (1000.0f * NUM_REPS)); float fetchRatePL = nx * ny / 1.e+6f / (timePL / (1000.0f * NUM_REPS));
float fetchRateArray = nx * ny / 1.e+6f / (timeArray / (1000.0f * NUM_REPS)); float fetchRateArray = nx * ny / 1.e+6f / (timeArray / (1000.0f * NUM_REPS));
printf( printf("\nTexture fetch rate (Mpix/s) for pitch linear: "
"\nTexture fetch rate (Mpix/s) for pitch linear: "
"%.2e; for array: %.2e\n\n", "%.2e; for array: %.2e\n\n",
fetchRatePL, fetchRateArray); fetchRatePL,
fetchRateArray);
// Cleanup // Cleanup
free(h_idata); free(h_idata);

View File

@ -26,28 +26,30 @@
*/ */
// System includes // System includes
#include <stdio.h>
#include <assert.h> #include <assert.h>
#include <stdio.h>
// CUDA runtime // CUDA runtime
#include <cuda_runtime.h> #include <cuda_runtime.h>
// helper functions and utilities to work with CUDA // helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h> #include <helper_cuda.h>
#include <helper_functions.h>
#ifndef MAX #ifndef MAX
#define MAX(a, b) (a > b ? a : b) #define MAX(a, b) (a > b ? a : b)
#endif #endif
__global__ void testKernel(int val) { __global__ void testKernel(int val)
printf("[%d, %d]:\t\tValue is:%d\n", blockIdx.y * gridDim.x + blockIdx.x, {
threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + printf("[%d, %d]:\t\tValue is:%d\n",
threadIdx.x, blockIdx.y * gridDim.x + blockIdx.x,
threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x,
val); val);
} }
int main(int argc, char **argv) { int main(int argc, char **argv)
{
int devID; int devID;
cudaDeviceProp props; cudaDeviceProp props;
@ -57,8 +59,7 @@ int main(int argc, char **argv) {
// Get GPU information // Get GPU information
checkCudaErrors(cudaGetDevice(&devID)); checkCudaErrors(cudaGetDevice(&devID));
checkCudaErrors(cudaGetDeviceProperties(&props, devID)); checkCudaErrors(cudaGetDeviceProperties(&props, devID));
printf("Device %d: \"%s\" with Compute capability %d.%d\n", devID, props.name, printf("Device %d: \"%s\" with Compute capability %d.%d\n", devID, props.name, props.major, props.minor);
props.major, props.minor);
printf("printf() is called. Output:\n\n"); printf("printf() is called. Output:\n\n");

View File

@ -44,28 +44,29 @@
* *
* Elapsed times are averaged over nreps repetitions (10 by default). * Elapsed times are averaged over nreps repetitions (10 by default).
* *
*/ */
const char *sSDKsample = "simpleStreams"; const char *sSDKsample = "simpleStreams";
const char *sEventSyncMethod[] = {"cudaEventDefault", "cudaEventBlockingSync", const char *sEventSyncMethod[] = {"cudaEventDefault", "cudaEventBlockingSync", "cudaEventDisableTiming", NULL};
"cudaEventDisableTiming", NULL};
const char *sDeviceSyncMethod[] = { const char *sDeviceSyncMethod[] = {"cudaDeviceScheduleAuto",
"cudaDeviceScheduleAuto", "cudaDeviceScheduleSpin", "cudaDeviceScheduleSpin",
"cudaDeviceScheduleYield", "INVALID", "cudaDeviceScheduleYield",
"cudaDeviceScheduleBlockingSync", NULL}; "INVALID",
"cudaDeviceScheduleBlockingSync",
NULL};
// System includes // System includes
#include <stdio.h>
#include <assert.h> #include <assert.h>
#include <stdio.h>
// CUDA runtime // CUDA runtime
#include <cuda_runtime.h> #include <cuda_runtime.h>
// helper functions and utilities to work with CUDA // helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h> #include <helper_cuda.h>
#include <helper_functions.h>
#ifndef WIN32 #ifndef WIN32
#include <sys/mman.h> // for mmap() / munmap() #include <sys/mman.h> // for mmap() / munmap()
@ -75,7 +76,8 @@ const char *sDeviceSyncMethod[] = {
#define MEMORY_ALIGNMENT 4096 #define MEMORY_ALIGNMENT 4096
#define ALIGN_UP(x, size) (((size_t)x + (size - 1)) & (~(size - 1))) #define ALIGN_UP(x, size) (((size_t)x + (size - 1)) & (~(size - 1)))
__global__ void init_array(int *g_data, int *factor, int num_iterations) { __global__ void init_array(int *g_data, int *factor, int num_iterations)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x; int idx = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = 0; i < num_iterations; i++) { for (int i = 0; i < num_iterations; i++) {
@ -83,7 +85,8 @@ __global__ void init_array(int *g_data, int *factor, int num_iterations) {
} }
} }
bool correct_data(int *a, const int n, const int c) { bool correct_data(int *a, const int n, const int c)
{
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
if (a[i] != c) { if (a[i] != c) {
printf("%d: %d %d\n", i, a[i], c); printf("%d: %d %d\n", i, a[i], c);
@ -94,51 +97,45 @@ bool correct_data(int *a, const int n, const int c) {
return true; return true;
} }
inline void AllocateHostMemory(bool bPinGenericMemory, int **pp_a, inline void AllocateHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nbytes)
int **ppAligned_a, int nbytes) { {
#if CUDART_VERSION >= 4000 #if CUDART_VERSION >= 4000
#if !defined(__arm__) && !defined(__aarch64__) #if !defined(__arm__) && !defined(__aarch64__)
if (bPinGenericMemory) { if (bPinGenericMemory) {
// allocate a generic page-aligned chunk of system memory // allocate a generic page-aligned chunk of system memory
#ifdef WIN32 #ifdef WIN32
printf( printf("> VirtualAlloc() allocating %4.2f Mbytes of (generic page-aligned "
"> VirtualAlloc() allocating %4.2f Mbytes of (generic page-aligned "
"system memory)\n", "system memory)\n",
(float)nbytes / 1048576.0f); (float)nbytes / 1048576.0f);
*pp_a = (int *)VirtualAlloc(NULL, (nbytes + MEMORY_ALIGNMENT), *pp_a = (int *)VirtualAlloc(NULL, (nbytes + MEMORY_ALIGNMENT), MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
#else #else
printf( printf("> mmap() allocating %4.2f Mbytes (generic page-aligned system "
"> mmap() allocating %4.2f Mbytes (generic page-aligned system "
"memory)\n", "memory)\n",
(float)nbytes / 1048576.0f); (float)nbytes / 1048576.0f);
*pp_a = (int *)mmap(NULL, (nbytes + MEMORY_ALIGNMENT), *pp_a = (int *)mmap(NULL, (nbytes + MEMORY_ALIGNMENT), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
#endif #endif
*ppAligned_a = (int *)ALIGN_UP(*pp_a, MEMORY_ALIGNMENT); *ppAligned_a = (int *)ALIGN_UP(*pp_a, MEMORY_ALIGNMENT);
printf( printf("> cudaHostRegister() registering %4.2f Mbytes of generic allocated "
"> cudaHostRegister() registering %4.2f Mbytes of generic allocated "
"system memory\n", "system memory\n",
(float)nbytes / 1048576.0f); (float)nbytes / 1048576.0f);
// pin allocate memory // pin allocate memory
checkCudaErrors( checkCudaErrors(cudaHostRegister(*ppAligned_a, nbytes, cudaHostRegisterMapped));
cudaHostRegister(*ppAligned_a, nbytes, cudaHostRegisterMapped)); }
} else else
#endif #endif
#endif #endif
{ {
printf("> cudaMallocHost() allocating %4.2f Mbytes of system memory\n", printf("> cudaMallocHost() allocating %4.2f Mbytes of system memory\n", (float)nbytes / 1048576.0f);
(float)nbytes / 1048576.0f);
// allocate host memory (pinned is required for achieve asynchronicity) // allocate host memory (pinned is required for achieve asynchronicity)
checkCudaErrors(cudaMallocHost((void **)pp_a, nbytes)); checkCudaErrors(cudaMallocHost((void **)pp_a, nbytes));
*ppAligned_a = *pp_a; *ppAligned_a = *pp_a;
} }
} }
inline void FreeHostMemory(bool bPinGenericMemory, int **pp_a, inline void FreeHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nbytes)
int **ppAligned_a, int nbytes) { {
#if CUDART_VERSION >= 4000 #if CUDART_VERSION >= 4000
#if !defined(__arm__) && !defined(__aarch64__) #if !defined(__arm__) && !defined(__aarch64__)
// CUDA 4.0 support pinning of generic host memory // CUDA 4.0 support pinning of generic host memory
@ -150,7 +147,8 @@ inline void FreeHostMemory(bool bPinGenericMemory, int **pp_a,
#else #else
munmap(*pp_a, nbytes); munmap(*pp_a, nbytes);
#endif #endif
} else }
else
#endif #endif
#endif #endif
{ {
@ -158,26 +156,24 @@ inline void FreeHostMemory(bool bPinGenericMemory, int **pp_a,
} }
} }
static const char *sSyncMethod[] = { static const char *sSyncMethod[] = {"0 (Automatic Blocking)",
"0 (Automatic Blocking)",
"1 (Spin Blocking)", "1 (Spin Blocking)",
"2 (Yield Blocking)", "2 (Yield Blocking)",
"3 (Undefined Blocking Method)", "3 (Undefined Blocking Method)",
"4 (Blocking Sync Event) = low CPU utilization", "4 (Blocking Sync Event) = low CPU utilization",
NULL}; NULL};
void printHelp() { void printHelp()
{
printf("Usage: %s [options below]\n", sSDKsample); printf("Usage: %s [options below]\n", sSDKsample);
printf("\t--sync_method=n for CPU/GPU synchronization\n"); printf("\t--sync_method=n for CPU/GPU synchronization\n");
printf("\t n=%s\n", sSyncMethod[0]); printf("\t n=%s\n", sSyncMethod[0]);
printf("\t n=%s\n", sSyncMethod[1]); printf("\t n=%s\n", sSyncMethod[1]);
printf("\t n=%s\n", sSyncMethod[2]); printf("\t n=%s\n", sSyncMethod[2]);
printf("\t <Default> n=%s\n", sSyncMethod[4]); printf("\t <Default> n=%s\n", sSyncMethod[4]);
printf( printf("\t--use_generic_memory (default) use generic page-aligned for system "
"\t--use_generic_memory (default) use generic page-aligned for system "
"memory\n"); "memory\n");
printf( printf("\t--use_cuda_malloc_host (optional) use cudaMallocHost to allocate "
"\t--use_cuda_malloc_host (optional) use cudaMallocHost to allocate "
"system memory\n"); "system memory\n");
} }
@ -187,7 +183,8 @@ void printHelp() {
#define DEFAULT_PINNED_GENERIC_MEMORY true #define DEFAULT_PINNED_GENERIC_MEMORY true
#endif #endif
int main(int argc, char **argv) { int main(int argc, char **argv)
{
int cuda_device = 0; int cuda_device = 0;
int nstreams = 4; // number of streams for CUDA calls int nstreams = 4; // number of streams for CUDA calls
int nreps = 10; // number of times each experiment is repeated int nreps = 10; // number of times each experiment is repeated
@ -199,10 +196,8 @@ int main(int argc, char **argv) {
// allocate generic memory and pin it laster instead of using cudaHostAlloc() // allocate generic memory and pin it laster instead of using cudaHostAlloc()
bool bPinGenericMemory = bool bPinGenericMemory = DEFAULT_PINNED_GENERIC_MEMORY; // we want this to be the default behavior
DEFAULT_PINNED_GENERIC_MEMORY; // we want this to be the default behavior int device_sync_method = cudaDeviceBlockingSync; // by default we use BlockingSync
int device_sync_method =
cudaDeviceBlockingSync; // by default we use BlockingSync
int niterations; // number of iterations for the loop inside the kernel int niterations; // number of iterations for the loop inside the kernel
@ -213,20 +208,18 @@ int main(int argc, char **argv) {
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }
if ((device_sync_method = getCmdLineArgumentInt(argc, (const char **)argv, if ((device_sync_method = getCmdLineArgumentInt(argc, (const char **)argv, "sync_method")) >= 0) {
"sync_method")) >= 0) { if (device_sync_method == 0 || device_sync_method == 1 || device_sync_method == 2 || device_sync_method == 4) {
if (device_sync_method == 0 || device_sync_method == 1 || printf("Device synchronization method set to = %s\n", sSyncMethod[device_sync_method]);
device_sync_method == 2 || device_sync_method == 4) {
printf("Device synchronization method set to = %s\n",
sSyncMethod[device_sync_method]);
printf("Setting reps to 100 to demonstrate steady state\n"); printf("Setting reps to 100 to demonstrate steady state\n");
nreps = 100; nreps = 100;
} else { }
printf("Invalid command line option sync_method=\"%d\"\n", else {
device_sync_method); printf("Invalid command line option sync_method=\"%d\"\n", device_sync_method);
return EXIT_FAILURE; return EXIT_FAILURE;
} }
} else { }
else {
printHelp(); printHelp();
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }
@ -252,16 +245,13 @@ int main(int argc, char **argv) {
checkCudaErrors(cudaGetDeviceCount(&num_devices)); checkCudaErrors(cudaGetDeviceCount(&num_devices));
if (0 == num_devices) { if (0 == num_devices) {
printf( printf("your system does not have a CUDA capable device, waiving test...\n");
"your system does not have a CUDA capable device, waiving test...\n");
return EXIT_WAIVED; return EXIT_WAIVED;
} }
// check if the command-line chosen device ID is within range, exit if not // check if the command-line chosen device ID is within range, exit if not
if (cuda_device >= num_devices) { if (cuda_device >= num_devices) {
printf( printf("cuda_device=%d is invalid, must choose device ID between 0 and %d\n", cuda_device, num_devices - 1);
"cuda_device=%d is invalid, must choose device ID between 0 and %d\n",
cuda_device, num_devices - 1);
return EXIT_FAILURE; return EXIT_FAILURE;
} }
@ -276,12 +266,10 @@ int main(int argc, char **argv) {
// Check if GPU can map host memory (Generic Method), if not then we override // Check if GPU can map host memory (Generic Method), if not then we override
// bPinGenericMemory to be false // bPinGenericMemory to be false
if (bPinGenericMemory) { if (bPinGenericMemory) {
printf("Device: <%s> canMapHostMemory: %s\n", deviceProp.name, printf("Device: <%s> canMapHostMemory: %s\n", deviceProp.name, deviceProp.canMapHostMemory ? "Yes" : "No");
deviceProp.canMapHostMemory ? "Yes" : "No");
if (deviceProp.canMapHostMemory == 0) { if (deviceProp.canMapHostMemory == 0) {
printf( printf("Using cudaMallocHost, CUDA device does not support mapping of "
"Using cudaMallocHost, CUDA device does not support mapping of "
"generic host memory\n"); "generic host memory\n");
bPinGenericMemory = false; bPinGenericMemory = false;
} }
@ -289,27 +277,22 @@ int main(int argc, char **argv) {
// Anything that is less than 32 Cores will have scaled down workload // Anything that is less than 32 Cores will have scaled down workload
scale_factor = scale_factor =
max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)),
(float)deviceProp.multiProcessorCount)),
1.0f); 1.0f);
n = (int)rint((float)n / scale_factor); n = (int)rint((float)n / scale_factor);
printf("> CUDA Capable: SM %d.%d hardware\n", deviceProp.major, printf("> CUDA Capable: SM %d.%d hardware\n", deviceProp.major, deviceProp.minor);
deviceProp.minor);
printf("> %d Multiprocessor(s) x %d (Cores/Multiprocessor) = %d (Cores)\n", printf("> %d Multiprocessor(s) x %d (Cores/Multiprocessor) = %d (Cores)\n",
deviceProp.multiProcessorCount, deviceProp.multiProcessorCount,
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
deviceProp.multiProcessorCount);
printf("> scale_factor = %1.4f\n", 1.0f / scale_factor); printf("> scale_factor = %1.4f\n", 1.0f / scale_factor);
printf("> array_size = %d\n\n", n); printf("> array_size = %d\n\n", n);
// enable use of blocking sync, to reduce CPU usage // enable use of blocking sync, to reduce CPU usage
printf("> Using CPU/GPU Device Synchronization method (%s)\n", printf("> Using CPU/GPU Device Synchronization method (%s)\n", sDeviceSyncMethod[device_sync_method]);
sDeviceSyncMethod[device_sync_method]); checkCudaErrors(cudaSetDeviceFlags(device_sync_method | (bPinGenericMemory ? cudaDeviceMapHost : 0)));
checkCudaErrors(cudaSetDeviceFlags(
device_sync_method | (bPinGenericMemory ? cudaDeviceMapHost : 0)));
// allocate host memory // allocate host memory
int c = 5; // value to which the array will be initialized int c = 5; // value to which the array will be initialized
@ -332,8 +315,7 @@ int main(int argc, char **argv) {
printf("\nStarting Test\n"); printf("\nStarting Test\n");
// allocate and initialize an array of stream handles // allocate and initialize an array of stream handles
cudaStream_t *streams = cudaStream_t *streams = (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));
(cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));
for (int i = 0; i < nstreams; i++) { for (int i = 0; i < nstreams; i++) {
checkCudaErrors(cudaStreamCreate(&(streams[i]))); checkCudaErrors(cudaStreamCreate(&(streams[i])));
@ -342,9 +324,7 @@ int main(int argc, char **argv) {
// create CUDA event handles // create CUDA event handles
// use blocking sync // use blocking sync
cudaEvent_t start_event, stop_event; cudaEvent_t start_event, stop_event;
int eventflags = int eventflags = ((device_sync_method == cudaDeviceBlockingSync) ? cudaEventBlockingSync : cudaEventDefault);
((device_sync_method == cudaDeviceBlockingSync) ? cudaEventBlockingSync
: cudaEventDefault);
checkCudaErrors(cudaEventCreateWithFlags(&start_event, eventflags)); checkCudaErrors(cudaEventCreateWithFlags(&start_event, eventflags));
checkCudaErrors(cudaEventCreateWithFlags(&stop_event, eventflags)); checkCudaErrors(cudaEventCreateWithFlags(&stop_event, eventflags));
@ -354,11 +334,9 @@ int main(int argc, char **argv) {
// ensure that all previous // ensure that all previous
// CUDA calls have // CUDA calls have
// completed // completed
checkCudaErrors(cudaMemcpyAsync(hAligned_a, d_a, nbytes, checkCudaErrors(cudaMemcpyAsync(hAligned_a, d_a, nbytes, cudaMemcpyDeviceToHost, streams[0]));
cudaMemcpyDeviceToHost, streams[0]));
checkCudaErrors(cudaEventRecord(stop_event, 0)); checkCudaErrors(cudaEventRecord(stop_event, 0));
checkCudaErrors(cudaEventSynchronize( checkCudaErrors(cudaEventSynchronize(stop_event)); // block until the event is actually recorded
stop_event)); // block until the event is actually recorded
checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event)); checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
printf("memcopy:\t%.2f\n", time_memcpy); printf("memcopy:\t%.2f\n", time_memcpy);
@ -380,8 +358,7 @@ int main(int argc, char **argv) {
for (int k = 0; k < nreps; k++) { for (int k = 0; k < nreps; k++) {
init_array<<<blocks, threads>>>(d_a, d_c, niterations); init_array<<<blocks, threads>>>(d_a, d_c, niterations);
checkCudaErrors( checkCudaErrors(cudaMemcpy(hAligned_a, d_a, nbytes, cudaMemcpyDeviceToHost));
cudaMemcpy(hAligned_a, d_a, nbytes, cudaMemcpyDeviceToHost));
} }
checkCudaErrors(cudaEventRecord(stop_event, 0)); checkCudaErrors(cudaEventRecord(stop_event, 0));
@ -395,16 +372,14 @@ int main(int argc, char **argv) {
blocks = dim3(n / (nstreams * threads.x), 1); blocks = dim3(n / (nstreams * threads.x), 1);
memset(hAligned_a, 255, memset(hAligned_a, 255,
nbytes); // set host memory bits to all 1s, for testing correctness nbytes); // set host memory bits to all 1s, for testing correctness
checkCudaErrors(cudaMemset( checkCudaErrors(cudaMemset(d_a, 0, nbytes)); // set device memory to all 0s, for testing correctness
d_a, 0, nbytes)); // set device memory to all 0s, for testing correctness
checkCudaErrors(cudaEventRecord(start_event, 0)); checkCudaErrors(cudaEventRecord(start_event, 0));
for (int k = 0; k < nreps; k++) { for (int k = 0; k < nreps; k++) {
// asynchronously launch nstreams kernels, each operating on its own portion // asynchronously launch nstreams kernels, each operating on its own portion
// of data // of data
for (int i = 0; i < nstreams; i++) { for (int i = 0; i < nstreams; i++) {
init_array<<<blocks, threads, 0, streams[i]>>>(d_a + i * n / nstreams, init_array<<<blocks, threads, 0, streams[i]>>>(d_a + i * n / nstreams, d_c, niterations);
d_c, niterations);
} }
// asynchronously launch nstreams memcopies. Note that memcopy in stream x // asynchronously launch nstreams memcopies. Note that memcopy in stream x
@ -413,8 +388,10 @@ int main(int argc, char **argv) {
// completed // completed
for (int i = 0; i < nstreams; i++) { for (int i = 0; i < nstreams; i++) {
checkCudaErrors(cudaMemcpyAsync(hAligned_a + i * n / nstreams, checkCudaErrors(cudaMemcpyAsync(hAligned_a + i * n / nstreams,
d_a + i * n / nstreams, nbytes / nstreams, d_a + i * n / nstreams,
cudaMemcpyDeviceToHost, streams[i])); nbytes / nstreams,
cudaMemcpyDeviceToHost,
streams[i]));
} }
} }

View File

@ -34,10 +34,10 @@
*/ */
// Includes, system // Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h> #include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32 #ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN #define WINDOWS_LEAN_AND_MEAN
@ -73,23 +73,22 @@ static const char *sampleName = "simpleSurfaceWrite";
//! Write to a cuArray (texture data source) using surface writes //! Write to a cuArray (texture data source) using surface writes
//! @param gIData input data in global memory //! @param gIData input data in global memory
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
__global__ void surfaceWriteKernel(float *gIData, int width, int height, __global__ void surfaceWriteKernel(float *gIData, int width, int height, cudaSurfaceObject_t outputSurface)
cudaSurfaceObject_t outputSurface) { {
// calculate surface coordinates // calculate surface coordinates
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
// read from global memory and write to cuarray (via surface reference) // read from global memory and write to cuarray (via surface reference)
surf2Dwrite(gIData[y * width + x], outputSurface, x * 4, y, surf2Dwrite(gIData[y * width + x], outputSurface, x * 4, y, cudaBoundaryModeTrap);
cudaBoundaryModeTrap);
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Transform an image using texture lookups //! Transform an image using texture lookups
//! @param gOData output data in global memory //! @param gOData output data in global memory
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
__global__ void transformKernel(float *gOData, int width, int height, __global__ void transformKernel(float *gOData, int width, int height, float theta, cudaTextureObject_t tex)
float theta, cudaTextureObject_t tex) { {
// calculate normalized texture coordinates // calculate normalized texture coordinates
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -111,29 +110,29 @@ __global__ void transformKernel(float *gOData, int width, int height,
// Declaration, forward // Declaration, forward
void runTest(int argc, char **argv); void runTest(int argc, char **argv);
extern "C" void computeGold(float *reference, float *idata, extern "C" void computeGold(float *reference, float *idata, const unsigned int len);
const unsigned int len);
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
{
printf("%s starting...\n", sampleName); printf("%s starting...\n", sampleName);
// Process command-line arguments // Process command-line arguments
if (argc > 1) { if (argc > 1) {
if (checkCmdLineFlag(argc, (const char **)argv, "input")) { if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
getCmdLineArgumentString(argc, (const char **)argv, "input", getCmdLineArgumentString(argc, (const char **)argv, "input", (char **)&imageFilename);
(char **)&imageFilename);
if (checkCmdLineFlag(argc, (const char **)argv, "reference")) { if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
getCmdLineArgumentString(argc, (const char **)argv, "reference", getCmdLineArgumentString(argc, (const char **)argv, "reference", (char **)&refFilename);
(char **)&refFilename); }
} else { else {
printf("-input flag should be used with -reference flag"); printf("-input flag should be used with -reference flag");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
} else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) { }
else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
printf("-reference flag should be used with -input flag"); printf("-reference flag should be used with -input flag");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
@ -141,15 +140,15 @@ int main(int argc, char **argv) {
runTest(argc, argv); runTest(argc, argv);
printf("%s completed, returned %s\n", sampleName, printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
testResult ? "OK" : "ERROR!");
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA //! Run a simple test for CUDA
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) { void runTest(int argc, char **argv)
{
// Use command-line specified CUDA device, // Use command-line specified CUDA device,
// otherwise use device with highest Gflops/s // otherwise use device with highest Gflops/s
int devID = findCudaDevice(argc, (const char **)argv); int devID = findCudaDevice(argc, (const char **)argv);
@ -159,7 +158,9 @@ void runTest(int argc, char **argv) {
checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
printf("CUDA device [%s] has %d Multi-Processors, SM %d.%d\n", printf("CUDA device [%s] has %d Multi-Processors, SM %d.%d\n",
deviceProps.name, deviceProps.multiProcessorCount, deviceProps.major, deviceProps.name,
deviceProps.multiProcessorCount,
deviceProps.major,
deviceProps.minor); deviceProps.minor);
// Load image from disk // Load image from disk
@ -193,11 +194,9 @@ void runTest(int argc, char **argv) {
checkCudaErrors(cudaMalloc((void **)&dData, size)); checkCudaErrors(cudaMalloc((void **)&dData, size));
// Allocate array and copy image data // Allocate array and copy image data
cudaChannelFormatDesc channelDesc = cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray *cuArray; cudaArray *cuArray;
checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height, checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height, cudaArraySurfaceLoadStore));
cudaArraySurfaceLoadStore));
dim3 dimBlock(8, 8, 1); dim3 dimBlock(8, 8, 1);
dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1); dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
@ -211,11 +210,9 @@ void runTest(int argc, char **argv) {
checkCudaErrors(cudaCreateSurfaceObject(&outputSurface, &surfRes)); checkCudaErrors(cudaCreateSurfaceObject(&outputSurface, &surfRes));
#if 1 #if 1
checkCudaErrors(cudaMemcpy(dData, hData, size, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(dData, hData, size, cudaMemcpyHostToDevice));
surfaceWriteKernel<<<dimGrid, dimBlock>>>(dData, width, height, surfaceWriteKernel<<<dimGrid, dimBlock>>>(dData, width, height, outputSurface);
outputSurface);
#else // This is what differs from the example simpleTexture #else // This is what differs from the example simpleTexture
checkCudaErrors( checkCudaErrors(cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice));
cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice));
#endif #endif
cudaTextureObject_t tex; cudaTextureObject_t tex;
@ -254,8 +251,7 @@ void runTest(int argc, char **argv) {
cudaDeviceSynchronize(); cudaDeviceSynchronize();
sdkStopTimer(&timer); sdkStopTimer(&timer);
printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
printf("%.2f Mpixels/sec\n", printf("%.2f Mpixels/sec\n", (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
(width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
sdkDeleteTimer(&timer); sdkDeleteTimer(&timer);
// Allocate mem for the result on host side // Allocate mem for the result on host side
@ -272,9 +268,9 @@ void runTest(int argc, char **argv) {
// Write regression file if necessary // Write regression file if necessary
if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
// Write file for regression test // Write file for regression test
sdkWriteFile<float>("./data/regression.dat", hOData, width * height, 0.0f, sdkWriteFile<float>("./data/regression.dat", hOData, width * height, 0.0f, false);
false); }
} else { else {
// We need to reload the data from disk, // We need to reload the data from disk,
// because it is inverted upon output // because it is inverted upon output
sdkLoadPGM(outputFilename, &hOData, &width, &height); sdkLoadPGM(outputFilename, &hOData, &width, &height);
@ -282,8 +278,7 @@ void runTest(int argc, char **argv) {
printf("Comparing files\n"); printf("Comparing files\n");
printf("\toutput: <%s>\n", outputFilename); printf("\toutput: <%s>\n", outputFilename);
printf("\treference: <%s>\n", refPath); printf("\treference: <%s>\n", refPath);
testResult = testResult = compareData(hOData, hDataRef, width * height, MIN_EPSILON_ERROR, 0.0f);
compareData(hOData, hDataRef, width * height, MIN_EPSILON_ERROR, 0.0f);
} }
checkCudaErrors(cudaDestroySurfaceObject(outputSurface)); checkCudaErrors(cudaDestroySurfaceObject(outputSurface));

View File

@ -68,10 +68,11 @@
// this // this
// struct by putting an undefined symbol in the function body so it won't // struct by putting an undefined symbol in the function body so it won't
// compile. // compile.
template <typename T> template <typename T> struct SharedMemory
struct SharedMemory { {
// Ensure that we won't compile any un-specialized types // Ensure that we won't compile any un-specialized types
__device__ T *getPointer() { __device__ T *getPointer()
{
extern __device__ void error(void); extern __device__ void error(void);
error(); error();
return NULL; return NULL;
@ -82,89 +83,100 @@ struct SharedMemory {
// int, uint, char, uchar, short, ushort, long, ulong, bool, float, and double // int, uint, char, uchar, short, ushort, long, ulong, bool, float, and double
// One could also specialize it for user-defined types. // One could also specialize it for user-defined types.
template <> template <> struct SharedMemory<int>
struct SharedMemory<int> { {
__device__ int *getPointer() { __device__ int *getPointer()
{
extern __shared__ int s_int[]; extern __shared__ int s_int[];
return s_int; return s_int;
} }
}; };
template <> template <> struct SharedMemory<unsigned int>
struct SharedMemory<unsigned int> { {
__device__ unsigned int *getPointer() { __device__ unsigned int *getPointer()
{
extern __shared__ unsigned int s_uint[]; extern __shared__ unsigned int s_uint[];
return s_uint; return s_uint;
} }
}; };
template <> template <> struct SharedMemory<char>
struct SharedMemory<char> { {
__device__ char *getPointer() { __device__ char *getPointer()
{
extern __shared__ char s_char[]; extern __shared__ char s_char[];
return s_char; return s_char;
} }
}; };
template <> template <> struct SharedMemory<unsigned char>
struct SharedMemory<unsigned char> { {
__device__ unsigned char *getPointer() { __device__ unsigned char *getPointer()
{
extern __shared__ unsigned char s_uchar[]; extern __shared__ unsigned char s_uchar[];
return s_uchar; return s_uchar;
} }
}; };
template <> template <> struct SharedMemory<short>
struct SharedMemory<short> { {
__device__ short *getPointer() { __device__ short *getPointer()
{
extern __shared__ short s_short[]; extern __shared__ short s_short[];
return s_short; return s_short;
} }
}; };
template <> template <> struct SharedMemory<unsigned short>
struct SharedMemory<unsigned short> { {
__device__ unsigned short *getPointer() { __device__ unsigned short *getPointer()
{
extern __shared__ unsigned short s_ushort[]; extern __shared__ unsigned short s_ushort[];
return s_ushort; return s_ushort;
} }
}; };
template <> template <> struct SharedMemory<long>
struct SharedMemory<long> { {
__device__ long *getPointer() { __device__ long *getPointer()
{
extern __shared__ long s_long[]; extern __shared__ long s_long[];
return s_long; return s_long;
} }
}; };
template <> template <> struct SharedMemory<unsigned long>
struct SharedMemory<unsigned long> { {
__device__ unsigned long *getPointer() { __device__ unsigned long *getPointer()
{
extern __shared__ unsigned long s_ulong[]; extern __shared__ unsigned long s_ulong[];
return s_ulong; return s_ulong;
} }
}; };
template <> template <> struct SharedMemory<bool>
struct SharedMemory<bool> { {
__device__ bool *getPointer() { __device__ bool *getPointer()
{
extern __shared__ bool s_bool[]; extern __shared__ bool s_bool[];
return s_bool; return s_bool;
} }
}; };
template <> template <> struct SharedMemory<float>
struct SharedMemory<float> { {
__device__ float *getPointer() { __device__ float *getPointer()
{
extern __shared__ float s_float[]; extern __shared__ float s_float[];
return s_float; return s_float;
} }
}; };
template <> template <> struct SharedMemory<double>
struct SharedMemory<double> { {
__device__ double *getPointer() { __device__ double *getPointer()
{
extern __shared__ double s_double[]; extern __shared__ double s_double[];
return s_double; return s_double;
} }

View File

@ -26,23 +26,23 @@
*/ */
/* This sample is a templatized version of the template project. /* This sample is a templatized version of the template project.
* It also shows how to correctly templatize dynamically allocated shared * It also shows how to correctly templatize dynamically allocated shared
* memory arrays. * memory arrays.
* Host code. * Host code.
*/ */
// System includes // System includes
#include <stdio.h>
#include <assert.h> #include <assert.h>
#include <string.h>
#include <math.h> #include <math.h>
#include <stdio.h>
#include <string.h>
// CUDA runtime // CUDA runtime
#include <cuda_runtime.h> #include <cuda_runtime.h>
// helper functions and utilities to work with CUDA // helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h> #include <helper_cuda.h>
#include <helper_functions.h>
#ifndef MAX #ifndef MAX
#define MAX(a, b) (a > b ? a : b) #define MAX(a, b) (a > b ? a : b)
@ -58,8 +58,8 @@ int g_TotalFailures = 0;
//! @param g_idata input data in global memory //! @param g_idata input data in global memory
//! @param g_odata output data in global memory //! @param g_odata output data in global memory
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <class T> template <class T> __global__ void testKernel(T *g_idata, T *g_odata)
__global__ void testKernel(T *g_idata, T *g_odata) { {
// Shared mem size is determined by the host app at run time // Shared mem size is determined by the host app at run time
SharedMemory<T> smem; SharedMemory<T> smem;
T *sdata = smem.getPointer(); T *sdata = smem.getPointer();
@ -83,11 +83,10 @@ __global__ void testKernel(T *g_idata, T *g_odata) {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// declaration, forward // declaration, forward
template <class T> template <class T> void runTest(int argc, char **argv, int len);
void runTest(int argc, char **argv, int len);
template <class T> template <class T> void computeGold(T *reference, T *idata, const unsigned int len)
void computeGold(T *reference, T *idata, const unsigned int len) { {
const T T_len = static_cast<T>(len); const T T_len = static_cast<T>(len);
for (unsigned int i = 0; i < len; ++i) { for (unsigned int i = 0; i < len; ++i) {
@ -98,7 +97,8 @@ void computeGold(T *reference, T *idata, const unsigned int len) {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
{
printf("> runTest<float,32>\n"); printf("> runTest<float,32>\n");
runTest<float>(argc, argv, 32); runTest<float>(argc, argv, 32);
printf("> runTest<int,64>\n"); printf("> runTest<int,64>\n");
@ -114,60 +114,63 @@ int main(int argc, char **argv) {
// functions for different types. // functions for different types.
// Here's the generic wrapper for cutCompare* // Here's the generic wrapper for cutCompare*
template <class T> template <class T> class ArrayComparator
class ArrayComparator { {
public: public:
bool compare(const T *reference, T *data, unsigned int len) { bool compare(const T *reference, T *data, unsigned int len)
fprintf(stderr, {
"Error: no comparison function implemented for this type\n"); fprintf(stderr, "Error: no comparison function implemented for this type\n");
return false; return false;
} }
}; };
// Here's the specialization for ints: // Here's the specialization for ints:
template <> template <> class ArrayComparator<int>
class ArrayComparator<int> { {
public: public:
bool compare(const int *reference, int *data, unsigned int len) { bool compare(const int *reference, int *data, unsigned int len)
{
return compareData(reference, data, len, 0.15f, 0.0f); return compareData(reference, data, len, 0.15f, 0.0f);
} }
}; };
// Here's the specialization for floats: // Here's the specialization for floats:
template <> template <> class ArrayComparator<float>
class ArrayComparator<float> { {
public: public:
bool compare(const float *reference, float *data, unsigned int len) { bool compare(const float *reference, float *data, unsigned int len)
{
return compareData(reference, data, len, 0.15f, 0.15f); return compareData(reference, data, len, 0.15f, 0.15f);
} }
}; };
// Here's the generic wrapper for cutWriteFile* // Here's the generic wrapper for cutWriteFile*
template <class T> template <class T> class ArrayFileWriter
class ArrayFileWriter { {
public: public:
bool write(const char *filename, T *data, unsigned int len, float epsilon) { bool write(const char *filename, T *data, unsigned int len, float epsilon)
fprintf(stderr, {
"Error: no file write function implemented for this type\n"); fprintf(stderr, "Error: no file write function implemented for this type\n");
return false; return false;
} }
}; };
// Here's the specialization for ints: // Here's the specialization for ints:
template <> template <> class ArrayFileWriter<int>
class ArrayFileWriter<int> { {
public: public:
bool write(const char *filename, int *data, unsigned int len, float epsilon) { bool write(const char *filename, int *data, unsigned int len, float epsilon)
{
return sdkWriteFile(filename, data, len, epsilon, false); return sdkWriteFile(filename, data, len, epsilon, false);
} }
}; };
// Here's the specialization for floats: // Here's the specialization for floats:
template <> template <> class ArrayFileWriter<float>
class ArrayFileWriter<float> { {
public: public:
bool write(const char *filename, float *data, unsigned int len, bool write(const char *filename, float *data, unsigned int len, float epsilon)
float epsilon) { {
return sdkWriteFile(filename, data, len, epsilon, false); return sdkWriteFile(filename, data, len, epsilon, false);
} }
}; };
@ -175,8 +178,8 @@ class ArrayFileWriter<float> {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA //! Run a simple test for CUDA
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <class T> template <class T> void runTest(int argc, char **argv, int len)
void runTest(int argc, char **argv, int len) { {
int devID; int devID;
cudaDeviceProp deviceProps; cudaDeviceProp deviceProps;
@ -184,8 +187,7 @@ void runTest(int argc, char **argv, int len) {
// get number of SMs on this GPU // get number of SMs on this GPU
checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount);
deviceProps.multiProcessorCount);
// create and start timer // create and start timer
StopWatchInterface *timer = NULL; StopWatchInterface *timer = NULL;
@ -209,8 +211,7 @@ void runTest(int argc, char **argv, int len) {
T *d_idata; T *d_idata;
checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size)); checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size));
// copy host memory to device // copy host memory to device
checkCudaErrors( checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));
cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));
// allocate device memory for result // allocate device memory for result
T *d_odata; T *d_odata;
@ -229,8 +230,7 @@ void runTest(int argc, char **argv, int len) {
// allocate mem for the result on host side // allocate mem for the result on host side
T *h_odata = (T *)malloc(mem_size); T *h_odata = (T *)malloc(mem_size);
// copy result from device to host // copy result from device to host
checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(T) * num_threads, checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(T) * num_threads, cudaMemcpyDeviceToHost));
cudaMemcpyDeviceToHost));
sdkStopTimer(&timer); sdkStopTimer(&timer);
printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
@ -247,7 +247,8 @@ void runTest(int argc, char **argv, int len) {
if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
// write file for regression test // write file for regression test
writer.write("./data/regression.dat", h_odata, num_threads, 0.0f); writer.write("./data/regression.dat", h_odata, num_threads, 0.0f);
} else { }
else {
// custom output handling when no regression test running // custom output handling when no regression test running
// in this case check if the result is equivalent to the expected solution // in this case check if the result is equivalent to the expected solution
bool res = comparator.compare(reference, h_odata, num_threads); bool res = comparator.compare(reference, h_odata, num_threads);

View File

@ -34,10 +34,10 @@
*/ */
// Includes, system // Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h> #include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32 #ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN #define WINDOWS_LEAN_AND_MEAN
@ -73,8 +73,8 @@ bool testResult = true;
//! Transform an image using texture lookups //! Transform an image using texture lookups
//! @param outputData output data in global memory //! @param outputData output data in global memory
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
__global__ void transformKernel(float *outputData, int width, int height, __global__ void transformKernel(float *outputData, int width, int height, float theta, cudaTextureObject_t tex)
float theta, cudaTextureObject_t tex) { {
// calculate normalized texture coordinates // calculate normalized texture coordinates
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -98,23 +98,24 @@ void runTest(int argc, char **argv);
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
{
printf("%s starting...\n", sampleName); printf("%s starting...\n", sampleName);
// Process command-line arguments // Process command-line arguments
if (argc > 1) { if (argc > 1) {
if (checkCmdLineFlag(argc, (const char **)argv, "input")) { if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
getCmdLineArgumentString(argc, (const char **)argv, "input", getCmdLineArgumentString(argc, (const char **)argv, "input", (char **)&imageFilename);
(char **)&imageFilename);
if (checkCmdLineFlag(argc, (const char **)argv, "reference")) { if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
getCmdLineArgumentString(argc, (const char **)argv, "reference", getCmdLineArgumentString(argc, (const char **)argv, "reference", (char **)&refFilename);
(char **)&refFilename); }
} else { else {
printf("-input flag should be used with -reference flag"); printf("-input flag should be used with -reference flag");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
} else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) { }
else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
printf("-reference flag should be used with -input flag"); printf("-reference flag should be used with -input flag");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
@ -122,15 +123,15 @@ int main(int argc, char **argv) {
runTest(argc, argv); runTest(argc, argv);
printf("%s completed, returned %s\n", sampleName, printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
testResult ? "OK" : "ERROR!");
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA //! Run a simple test for CUDA
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) { void runTest(int argc, char **argv)
{
int devID = findCudaDevice(argc, (const char **)argv); int devID = findCudaDevice(argc, (const char **)argv);
// load image from disk // load image from disk
@ -164,12 +165,10 @@ void runTest(int argc, char **argv) {
checkCudaErrors(cudaMalloc((void **)&dData, size)); checkCudaErrors(cudaMalloc((void **)&dData, size));
// Allocate array and copy image data // Allocate array and copy image data
cudaChannelFormatDesc channelDesc = cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray *cuArray; cudaArray *cuArray;
checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height)); checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height));
checkCudaErrors( checkCudaErrors(cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice));
cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice));
cudaTextureObject_t tex; cudaTextureObject_t tex;
cudaResourceDesc texRes; cudaResourceDesc texRes;
@ -209,8 +208,7 @@ void runTest(int argc, char **argv) {
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&timer); sdkStopTimer(&timer);
printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
printf("%.2f Mpixels/sec\n", printf("%.2f Mpixels/sec\n", (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
(width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
sdkDeleteTimer(&timer); sdkDeleteTimer(&timer);
// Allocate mem for the result on host side // Allocate mem for the result on host side
@ -228,9 +226,9 @@ void runTest(int argc, char **argv) {
// Write regression file if necessary // Write regression file if necessary
if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
// Write file for regression test // Write file for regression test
sdkWriteFile<float>("./data/regression.dat", hOutputData, width * height, sdkWriteFile<float>("./data/regression.dat", hOutputData, width * height, 0.0f, false);
0.0f, false); }
} else { else {
// We need to reload the data from disk, // We need to reload the data from disk,
// because it is inverted upon output // because it is inverted upon output
sdkLoadPGM(outputFilename, &hOutputData, &width, &height); sdkLoadPGM(outputFilename, &hOutputData, &width, &height);
@ -239,8 +237,7 @@ void runTest(int argc, char **argv) {
printf("\toutput: <%s>\n", outputFilename); printf("\toutput: <%s>\n", outputFilename);
printf("\treference: <%s>\n", refPath); printf("\treference: <%s>\n", refPath);
testResult = compareData(hOutputData, hDataRef, width * height, testResult = compareData(hOutputData, hDataRef, width * height, MAX_EPSILON_ERROR, 0.15f);
MAX_EPSILON_ERROR, 0.15f);
} }
checkCudaErrors(cudaDestroyTextureObject(tex)); checkCudaErrors(cudaDestroyTextureObject(tex));

View File

@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in the [Dependencies]() section above are installed. Make sure the dependencies mentioned in the [Dependencies]() section above are installed.
## References (for more details) ## References (for more details)

View File

@ -32,11 +32,11 @@
using 3D texture lookups. using 3D texture lookups.
*/ */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <helper_gl.h> #include <helper_gl.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#if defined(__APPLE__) || defined(MACOSX) #if defined(__APPLE__) || defined(MACOSX)
#pragma clang diagnostic ignored "-Wdeprecated-declarations" #pragma clang diagnostic ignored "-Wdeprecated-declarations"
@ -49,9 +49,9 @@
#endif #endif
// includes, cuda // includes, cuda
#include <vector_types.h>
#include <cuda_runtime.h>
#include <cuda_gl_interop.h> #include <cuda_gl_interop.h>
#include <cuda_runtime.h>
#include <vector_types.h>
// CUDA utilities and system includes // CUDA utilities and system includes
#include <helper_cuda.h> #include <helper_cuda.h>
@ -76,8 +76,7 @@ const dim3 gridSize(width / blockSize.x, height / blockSize.y);
float w = 0.5; // texture coordinate in z float w = 0.5; // texture coordinate in z
GLuint pbo; // OpenGL pixel buffer object GLuint pbo; // OpenGL pixel buffer object
struct cudaGraphicsResource struct cudaGraphicsResource *cuda_pbo_resource; // CUDA Graphics Resource (to transfer PBO)
*cuda_pbo_resource; // CUDA Graphics Resource (to transfer PBO)
bool linearFiltering = true; bool linearFiltering = true;
bool animate = true; bool animate = true;
@ -105,13 +104,13 @@ char **pArgv = NULL;
extern "C" void cleanup(); extern "C" void cleanup();
extern "C" void setTextureFilterMode(bool bLinearFilter); extern "C" void setTextureFilterMode(bool bLinearFilter);
extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize); extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize);
extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, float w);
uint imageW, uint imageH, float w);
extern void cleanupCuda(); extern void cleanupCuda();
void loadVolumeData(char *exec_path); void loadVolumeData(char *exec_path);
void computeFPS() { void computeFPS()
{
frameCount++; frameCount++;
fpsCount++; fpsCount++;
@ -129,13 +128,13 @@ void computeFPS() {
} }
// render image using CUDA // render image using CUDA
void render() { void render()
{
// map PBO to get CUDA device pointer // map PBO to get CUDA device pointer
g_GraphicsMapFlag++; g_GraphicsMapFlag++;
checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
size_t num_bytes; size_t num_bytes;
checkCudaErrors(cudaGraphicsResourceGetMappedPointer( checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, cuda_pbo_resource));
(void **)&d_output, &num_bytes, cuda_pbo_resource));
// printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes); // printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes);
// call CUDA kernel, writing results to PBO // call CUDA kernel, writing results to PBO
@ -150,7 +149,8 @@ void render() {
} }
// display results using OpenGL (called by GLUT) // display results using OpenGL (called by GLUT)
void display() { void display()
{
sdkStartTimer(&timer); sdkStartTimer(&timer);
render(); render();
@ -172,14 +172,16 @@ void display() {
computeFPS(); computeFPS();
} }
void idle() { void idle()
{
if (animate) { if (animate) {
w += 0.01f; w += 0.01f;
glutPostRedisplay(); glutPostRedisplay();
} }
} }
void keyboard(unsigned char key, int x, int y) { void keyboard(unsigned char key, int x, int y)
{
switch (key) { switch (key) {
case 27: case 27:
#if defined(__APPLE__) || defined(MACOSX) #if defined(__APPLE__) || defined(MACOSX)
@ -216,7 +218,8 @@ void keyboard(unsigned char key, int x, int y) {
glutPostRedisplay(); glutPostRedisplay();
} }
void reshape(int x, int y) { void reshape(int x, int y)
{
glViewport(0, 0, x, y); glViewport(0, 0, x, y);
glMatrixMode(GL_MODELVIEW); glMatrixMode(GL_MODELVIEW);
@ -227,7 +230,8 @@ void reshape(int x, int y) {
glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
} }
void cleanup() { void cleanup()
{
sdkDeleteTimer(&timer); sdkDeleteTimer(&timer);
// add extra check to unmap the resource before unregistering it // add extra check to unmap the resource before unregistering it
@ -242,21 +246,21 @@ void cleanup() {
cleanupCuda(); cleanupCuda();
} }
void initGLBuffers() { void initGLBuffers()
{
// create pixel buffer object // create pixel buffer object
glGenBuffers(1, &pbo); glGenBuffers(1, &pbo);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(GLubyte) * 4, glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(GLubyte) * 4, 0, GL_STREAM_DRAW_ARB);
0, GL_STREAM_DRAW_ARB);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
// register this buffer object with CUDA // register this buffer object with CUDA
checkCudaErrors(cudaGraphicsGLRegisterBuffer( checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard));
&cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard));
} }
// Load raw data from disk // Load raw data from disk
uchar *loadRawFile(const char *filename, size_t size) { uchar *loadRawFile(const char *filename, size_t size)
{
FILE *fp = fopen(filename, "rb"); FILE *fp = fopen(filename, "rb");
if (!fp) { if (!fp) {
@ -273,7 +277,8 @@ uchar *loadRawFile(const char *filename, size_t size) {
return data; return data;
} }
void initGL(int *argc, char **argv) { void initGL(int *argc, char **argv)
{
// initialize GLUT callback functions // initialize GLUT callback functions
glutInit(argc, argv); glutInit(argc, argv);
glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE); glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE);
@ -284,16 +289,15 @@ void initGL(int *argc, char **argv) {
glutReshapeFunc(reshape); glutReshapeFunc(reshape);
glutIdleFunc(idle); glutIdleFunc(idle);
if (!isGLVersionSupported(2, 0) || if (!isGLVersionSupported(2, 0) || !areGLExtensionsSupported("GL_ARB_pixel_buffer_object")) {
!areGLExtensionsSupported("GL_ARB_pixel_buffer_object")) {
fprintf(stderr, "Required OpenGL extensions are missing."); fprintf(stderr, "Required OpenGL extensions are missing.");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
} }
void runAutoTest(const char *ref_file, char *exec_path) { void runAutoTest(const char *ref_file, char *exec_path)
checkCudaErrors( {
cudaMalloc((void **)&d_output, width * height * sizeof(GLubyte) * 4)); checkCudaErrors(cudaMalloc((void **)&d_output, width * height * sizeof(GLubyte) * 4));
// render the volumeData // render the volumeData
render_kernel(gridSize, blockSize, d_output, width, height, w); render_kernel(gridSize, blockSize, d_output, width, height, w);
@ -302,15 +306,15 @@ void runAutoTest(const char *ref_file, char *exec_path) {
getLastCudaError("render_kernel failed"); getLastCudaError("render_kernel failed");
void *h_output = malloc(width * height * sizeof(GLubyte) * 4); void *h_output = malloc(width * height * sizeof(GLubyte) * 4);
checkCudaErrors(cudaMemcpy(h_output, d_output, checkCudaErrors(cudaMemcpy(h_output, d_output, width * height * sizeof(GLubyte) * 4, cudaMemcpyDeviceToHost));
width * height * sizeof(GLubyte) * 4, sdkDumpBin(h_output, width * height * sizeof(GLubyte) * 4, "simpleTexture3D.bin");
cudaMemcpyDeviceToHost));
sdkDumpBin(h_output, width * height * sizeof(GLubyte) * 4,
"simpleTexture3D.bin");
bool bTestResult = sdkCompareBin2BinFloat( bool bTestResult = sdkCompareBin2BinFloat("simpleTexture3D.bin",
"simpleTexture3D.bin", sdkFindFilePath(ref_file, exec_path), sdkFindFilePath(ref_file, exec_path),
width * height, MAX_EPSILON_ERROR, THRESHOLD, exec_path); width * height,
MAX_EPSILON_ERROR,
THRESHOLD,
exec_path);
checkCudaErrors(cudaFree(d_output)); checkCudaErrors(cudaFree(d_output));
free(h_output); free(h_output);
@ -321,13 +325,13 @@ void runAutoTest(const char *ref_file, char *exec_path) {
exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
} }
void loadVolumeData(char *exec_path) { void loadVolumeData(char *exec_path)
{
// load volume data // load volume data
const char *path = sdkFindFilePath(volumeFilename, exec_path); const char *path = sdkFindFilePath(volumeFilename, exec_path);
if (path == NULL) { if (path == NULL) {
fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n", fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n", volumeFilename);
volumeFilename);
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
@ -343,7 +347,8 @@ void loadVolumeData(char *exec_path) {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
{
pArgc = &argc; pArgc = &argc;
pArgv = argv; pArgv = argv;
@ -367,7 +372,8 @@ int main(int argc, char **argv) {
if (ref_file) { if (ref_file) {
loadVolumeData(argv[0]); loadVolumeData(argv[0]);
runAutoTest(ref_file, argv[0]); runAutoTest(ref_file, argv[0]);
} else { }
else {
initGL(&argc, argv); initGL(&argc, argv);
// OpenGL buffers // OpenGL buffers
@ -376,8 +382,7 @@ int main(int argc, char **argv) {
loadVolumeData(argv[0]); loadVolumeData(argv[0]);
} }
printf( printf("Press space to toggle animation\n"
"Press space to toggle animation\n"
"Press '+' and '-' to change displayed slice\n"); "Press '+' and '-' to change displayed slice\n");
#if defined(__APPLE__) || defined(MACOSX) #if defined(__APPLE__) || defined(MACOSX)

View File

@ -28,13 +28,12 @@
#ifndef _SIMPLETEXTURE3D_KERNEL_CU_ #ifndef _SIMPLETEXTURE3D_KERNEL_CU_
#define _SIMPLETEXTURE3D_KERNEL_CU_ #define _SIMPLETEXTURE3D_KERNEL_CU_
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <helper_cuda.h> #include <helper_cuda.h>
#include <helper_math.h> #include <helper_math.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef unsigned int uint; typedef unsigned int uint;
typedef unsigned char uchar; typedef unsigned char uchar;
@ -42,8 +41,8 @@ typedef unsigned char uchar;
cudaArray *d_volumeArray = 0; cudaArray *d_volumeArray = 0;
cudaTextureObject_t tex; // 3D texture cudaTextureObject_t tex; // 3D texture
__global__ void d_render(uint *d_output, uint imageW, uint imageH, float w, __global__ void d_render(uint *d_output, uint imageW, uint imageH, float w, cudaTextureObject_t texObj)
cudaTextureObject_t texObj) { {
uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
@ -59,7 +58,8 @@ __global__ void d_render(uint *d_output, uint imageW, uint imageH, float w,
} }
} }
extern "C" void setTextureFilterMode(bool bLinearFilter) { extern "C" void setTextureFilterMode(bool bLinearFilter)
{
if (tex) { if (tex) {
checkCudaErrors(cudaDestroyTextureObject(tex)); checkCudaErrors(cudaDestroyTextureObject(tex));
} }
@ -73,8 +73,7 @@ extern "C" void setTextureFilterMode(bool bLinearFilter) {
memset(&texDescr, 0, sizeof(cudaTextureDesc)); memset(&texDescr, 0, sizeof(cudaTextureDesc));
texDescr.normalizedCoords = true; texDescr.normalizedCoords = true;
texDescr.filterMode = texDescr.filterMode = bLinearFilter ? cudaFilterModeLinear : cudaFilterModePoint;
bLinearFilter ? cudaFilterModeLinear : cudaFilterModePoint;
; ;
texDescr.addressMode[0] = cudaAddressModeWrap; texDescr.addressMode[0] = cudaAddressModeWrap;
texDescr.addressMode[1] = cudaAddressModeWrap; texDescr.addressMode[1] = cudaAddressModeWrap;
@ -84,7 +83,8 @@ extern "C" void setTextureFilterMode(bool bLinearFilter) {
checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL)); checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
} }
extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize) { extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize)
{
// create 3D array // create 3D array
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>(); cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();
checkCudaErrors(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize)); checkCudaErrors(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));
@ -92,8 +92,7 @@ extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize) {
// copy data to 3D array // copy data to 3D array
cudaMemcpy3DParms copyParams = {0}; cudaMemcpy3DParms copyParams = {0};
copyParams.srcPtr = copyParams.srcPtr =
make_cudaPitchedPtr((void *)h_volume, volumeSize.width * sizeof(uchar), make_cudaPitchedPtr((void *)h_volume, volumeSize.width * sizeof(uchar), volumeSize.width, volumeSize.height);
volumeSize.width, volumeSize.height);
copyParams.dstArray = d_volumeArray; copyParams.dstArray = d_volumeArray;
copyParams.extent = volumeSize; copyParams.extent = volumeSize;
copyParams.kind = cudaMemcpyHostToDevice; copyParams.kind = cudaMemcpyHostToDevice;
@ -121,12 +120,13 @@ extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize) {
checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL)); checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
} }
extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, float w)
uint imageW, uint imageH, float w) { {
d_render<<<gridSize, blockSize>>>(d_output, imageW, imageH, w, tex); d_render<<<gridSize, blockSize>>>(d_output, imageW, imageH, w, tex);
} }
void cleanupCuda() { void cleanupCuda()
{
if (tex) { if (tex) {
checkCudaErrors(cudaDestroyTextureObject(tex)); checkCudaErrors(cudaDestroyTextureObject(tex));
} }

View File

@ -26,29 +26,29 @@
*/ */
/* /*
* This sample demonstrates how to use texture fetches in CUDA * This sample demonstrates how to use texture fetches in CUDA
* *
* This sample takes an input PGM image (image_filename) and generates * This sample takes an input PGM image (image_filename) and generates
* an output PGM image (image_filename_out). This CUDA kernel performs * an output PGM image (image_filename_out). This CUDA kernel performs
* a simple 2D transform (rotation) on the texture coordinates (u,v). * a simple 2D transform (rotation) on the texture coordinates (u,v).
* The results between simpleTexture and simpleTextureDrv are identical. * The results between simpleTexture and simpleTextureDrv are identical.
* The main difference is the implementation. simpleTextureDrv makes calls * The main difference is the implementation. simpleTextureDrv makes calls
* to the CUDA driver API and demonstrates how to use cuModuleLoad to load * to the CUDA driver API and demonstrates how to use cuModuleLoad to load
* the CUDA ptx (*.ptx) kernel just prior to kernel launch. * the CUDA ptx (*.ptx) kernel just prior to kernel launch.
* *
*/ */
// includes, system // includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <iostream>
#include <cstring> #include <cstring>
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// includes, CUDA // includes, CUDA
#include <cuda.h>
#include <builtin_types.h> #include <builtin_types.h>
#include <cuda.h>
// includes, project // includes, project
#include <helper_cuda_drvapi.h> #include <helper_cuda_drvapi.h>
#include <helper_functions.h> #include <helper_functions.h>
@ -65,8 +65,7 @@ float angle = 0.5f; // angle to rotate image by (in radians)
// declaration, forward // declaration, forward
void runTest(int argc, char **argv); void runTest(int argc, char **argv);
extern "C" void computeGold(float *reference, float *idata, extern "C" void computeGold(float *reference, float *idata, const unsigned int len);
const unsigned int len);
static CUresult initCUDA(int argc, char **argv, CUfunction *); static CUresult initCUDA(int argc, char **argv, CUfunction *);
@ -84,7 +83,8 @@ CUdevice cuDevice;
CUcontext cuContext; CUcontext cuContext;
CUmodule cuModule; CUmodule cuModule;
void showHelp() { void showHelp()
{
printf("\n> [%s] Command line options\n", sSDKsample); printf("\n> [%s] Command line options\n", sSDKsample);
printf("\t-device=n (where n=0,1,2.... for the GPU device)\n\n"); printf("\t-device=n (where n=0,1,2.... for the GPU device)\n\n");
} }
@ -92,7 +92,8 @@ void showHelp() {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
{
if (checkCmdLineFlag(argc, (const char **)argv, "help")) { if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
showHelp(); showHelp();
return 0; return 0;
@ -104,7 +105,8 @@ int main(int argc, char **argv) {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA //! Run a simple test for CUDA
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) { void runTest(int argc, char **argv)
{
bool bTestResults = true; bool bTestResults = true;
// initialize CUDA // initialize CUDA
@ -191,18 +193,17 @@ void runTest(int argc, char **argv) {
// Launching (simpler method) // Launching (simpler method)
void *args[5] = {&d_data, &width, &height, &angle, &TexObject}; void *args[5] = {&d_data, &width, &height, &angle, &TexObject};
checkCudaErrors(cuLaunchKernel(transform, (width / block_size), checkCudaErrors(cuLaunchKernel(
(height / block_size), 1, block_size, transform, (width / block_size), (height / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
block_size, 1, 0, NULL, args, NULL));
checkCudaErrors(cuCtxSynchronize()); checkCudaErrors(cuCtxSynchronize());
sdkCreateTimer(&timer); sdkCreateTimer(&timer);
sdkStartTimer(&timer); sdkStartTimer(&timer);
// launch kernel again for performance measurement // launch kernel again for performance measurement
checkCudaErrors(cuLaunchKernel(transform, (width / block_size), checkCudaErrors(cuLaunchKernel(
(height / block_size), 1, block_size, transform, (width / block_size), (height / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
block_size, 1, 0, NULL, args, NULL)); }
} else { else {
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
// Launching (advanced method) // Launching (advanced method)
int offset = 0; int offset = 0;
@ -222,29 +223,43 @@ void runTest(int argc, char **argv) {
*((CUtexObject *)&argBuffer[offset]) = TexObject; *((CUtexObject *)&argBuffer[offset]) = TexObject;
offset += sizeof(TexObject); offset += sizeof(TexObject);
void *kernel_launch_config[5] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, void *kernel_launch_config[5] = {
CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END};
CU_LAUNCH_PARAM_END};
// new CUDA 4.0 Driver API Kernel launch call (warmup) // new CUDA 4.0 Driver API Kernel launch call (warmup)
checkCudaErrors(cuLaunchKernel( checkCudaErrors(cuLaunchKernel(transform,
transform, (width / block_size), (height / block_size), 1, block_size, (width / block_size),
block_size, 1, 0, NULL, NULL, (void **)&kernel_launch_config)); (height / block_size),
1,
block_size,
block_size,
1,
0,
NULL,
NULL,
(void **)&kernel_launch_config));
checkCudaErrors(cuCtxSynchronize()); checkCudaErrors(cuCtxSynchronize());
sdkCreateTimer(&timer); sdkCreateTimer(&timer);
sdkStartTimer(&timer); sdkStartTimer(&timer);
// launch kernel again for performance measurement // launch kernel again for performance measurement
checkCudaErrors(cuLaunchKernel( checkCudaErrors(cuLaunchKernel(transform,
transform, (width / block_size), (height / block_size), 1, block_size, (width / block_size),
block_size, 1, 0, 0, NULL, (void **)&kernel_launch_config)); (height / block_size),
1,
block_size,
block_size,
1,
0,
0,
NULL,
(void **)&kernel_launch_config));
} }
checkCudaErrors(cuCtxSynchronize()); checkCudaErrors(cuCtxSynchronize());
sdkStopTimer(&timer); sdkStopTimer(&timer);
printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
printf("%.2f Mpixels/sec\n", printf("%.2f Mpixels/sec\n", (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
(width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
sdkDeleteTimer(&timer); sdkDeleteTimer(&timer);
// allocate mem for the result on host side // allocate mem for the result on host side
@ -262,17 +277,16 @@ void runTest(int argc, char **argv) {
// write regression file if necessary // write regression file if necessary
if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
// write file for regression test // write file for regression test
sdkWriteFile<float>("./data/regression.dat", h_odata, width * height, 0.0f, sdkWriteFile<float>("./data/regression.dat", h_odata, width * height, 0.0f, false);
false); }
} else { else {
// We need to reload the data from disk, because it is inverted upon output // We need to reload the data from disk, because it is inverted upon output
sdkLoadPGM(output_filename, &h_odata, &width, &height); sdkLoadPGM(output_filename, &h_odata, &width, &height);
printf("Comparing files\n"); printf("Comparing files\n");
printf("\toutput: <%s>\n", output_filename); printf("\toutput: <%s>\n", output_filename);
printf("\treference: <%s>\n", ref_path); printf("\treference: <%s>\n", ref_path);
bTestResults = compareData(h_odata, h_data_ref, width * height, bTestResults = compareData(h_odata, h_data_ref, width * height, MIN_EPSILON_ERROR, 0.15f);
MIN_EPSILON_ERROR, 0.15f);
} }
// cleanup memory // cleanup memory
@ -293,7 +307,8 @@ void runTest(int argc, char **argv) {
//! kernel function. After the module is loaded, cuModuleGetFunction //! kernel function. After the module is loaded, cuModuleGetFunction
//! retrieves the CUDA function pointer "cuFunction" //! retrieves the CUDA function pointer "cuFunction"
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
static CUresult initCUDA(int argc, char **argv, CUfunction *transform) { static CUresult initCUDA(int argc, char **argv, CUfunction *transform)
{
CUfunction cuFunction = 0; CUfunction cuFunction = 0;
int major = 0, minor = 0, devID = 0; int major = 0, minor = 0, devID = 0;
char deviceName[100]; char deviceName[100];
@ -302,10 +317,8 @@ static CUresult initCUDA(int argc, char **argv, CUfunction *transform) {
cuDevice = findCudaDeviceDRV(argc, (const char **)argv); cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
// get compute capabilities and the devicename // get compute capabilities and the devicename
checkCudaErrors(cuDeviceGetAttribute( checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)); checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
checkCudaErrors(cuDeviceGetAttribute(
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice)); checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
printf("> GPU Device has SM %d.%d compute capability\n", major, minor); printf("> GPU Device has SM %d.%d compute capability\n", major, minor);
@ -316,7 +329,8 @@ static CUresult initCUDA(int argc, char **argv, CUfunction *transform) {
if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) { if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} else { }
else {
printf("> initCUDA loading module: <%s>\n", module_path.c_str()); printf("> initCUDA loading module: <%s>\n", module_path.c_str());
} }
@ -328,8 +342,7 @@ static CUresult initCUDA(int argc, char **argv, CUfunction *transform) {
// Create module from binary file (FATBIN) // Create module from binary file (FATBIN)
checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str())); checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));
checkCudaErrors( checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, "transformKernel"));
cuModuleGetFunction(&cuFunction, cuModule, "transformKernel"));
*transform = cuFunction; *transform = cuFunction;

View File

@ -33,9 +33,8 @@
//! Transform an image using texture lookups //! Transform an image using texture lookups
//! @param g_odata output data in global memory //! @param g_odata output data in global memory
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
extern "C" __global__ void transformKernel(float *g_odata, int width, extern "C" __global__ void transformKernel(float *g_odata, int width, int height, float theta, CUtexObject tex)
int height, float theta, {
CUtexObject tex) {
// calculate normalized texture coordinates // calculate normalized texture coordinates
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

View File

@ -53,7 +53,8 @@ static const char *sSDKsample = "[simpleVoteIntrinsics]\0";
#include "simpleVote_kernel.cuh" #include "simpleVote_kernel.cuh"
// Generate the test pattern for Tests 1 and 2 // Generate the test pattern for Tests 1 and 2
void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size) { void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)
{
// For testing VOTE.Any (all of these threads will return 0) // For testing VOTE.Any (all of these threads will return 0)
for (int i = 0; i < size / 4; i++) { for (int i = 0; i < size / 4; i++) {
VOTE_PATTERN[i] = 0x00000000; VOTE_PATTERN[i] = 0x00000000;
@ -75,8 +76,8 @@ void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size) {
} }
} }
int checkErrors1(unsigned int *h_result, int start, int end, int warp_size, int checkErrors1(unsigned int *h_result, int start, int end, int warp_size, const char *voteType)
const char *voteType) { {
int i, sum = 0; int i, sum = 0;
for (sum = 0, i = start; i < end; i++) { for (sum = 0, i = start; i < end; i++) {
@ -96,8 +97,8 @@ int checkErrors1(unsigned int *h_result, int start, int end, int warp_size,
return (sum > 0); return (sum > 0);
} }
int checkErrors2(unsigned int *h_result, int start, int end, int warp_size, int checkErrors2(unsigned int *h_result, int start, int end, int warp_size, const char *voteType)
const char *voteType) { {
int i, sum = 0; int i, sum = 0;
for (sum = 0, i = start; i < end; i++) { for (sum = 0, i = start; i < end; i++) {
@ -118,49 +119,42 @@ int checkErrors2(unsigned int *h_result, int start, int end, int warp_size,
} }
// Verification code for Kernel #1 // Verification code for Kernel #1
int checkResultsVoteAnyKernel1(unsigned int *h_result, int size, int checkResultsVoteAnyKernel1(unsigned int *h_result, int size, int warp_size)
int warp_size) { {
int error_count = 0; int error_count = 0;
error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4, error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
warp_size, "Vote.Any"); error_count += checkErrors2(
error_count += h_result, VOTE_DATA_GROUP * warp_size / 4, 2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
checkErrors2(h_result, VOTE_DATA_GROUP * warp_size / 4, error_count += checkErrors2(
2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any"); h_result, 2 * VOTE_DATA_GROUP * warp_size / 4, 3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
error_count += error_count += checkErrors2(
checkErrors2(h_result, 2 * VOTE_DATA_GROUP * warp_size / 4, h_result, 3 * VOTE_DATA_GROUP * warp_size / 4, 4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
error_count +=
checkErrors2(h_result, 3 * VOTE_DATA_GROUP * warp_size / 4,
4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
printf((error_count == 0) ? "\tOK\n" : "\tERROR\n"); printf((error_count == 0) ? "\tOK\n" : "\tERROR\n");
return error_count; return error_count;
} }
// Verification code for Kernel #2 // Verification code for Kernel #2
int checkResultsVoteAllKernel2(unsigned int *h_result, int size, int checkResultsVoteAllKernel2(unsigned int *h_result, int size, int warp_size)
int warp_size) { {
int error_count = 0; int error_count = 0;
error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4, error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
warp_size, "Vote.All"); error_count += checkErrors1(
error_count += h_result, VOTE_DATA_GROUP * warp_size / 4, 2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
checkErrors1(h_result, VOTE_DATA_GROUP * warp_size / 4, error_count += checkErrors1(
2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All"); h_result, 2 * VOTE_DATA_GROUP * warp_size / 4, 3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
error_count += error_count += checkErrors2(
checkErrors1(h_result, 2 * VOTE_DATA_GROUP * warp_size / 4, h_result, 3 * VOTE_DATA_GROUP * warp_size / 4, 4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
error_count +=
checkErrors2(h_result, 3 * VOTE_DATA_GROUP * warp_size / 4,
4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
printf((error_count == 0) ? "\tOK\n" : "\tERROR\n"); printf((error_count == 0) ? "\tOK\n" : "\tERROR\n");
return error_count; return error_count;
} }
// Verification code for Kernel #3 // Verification code for Kernel #3
int checkResultsVoteAnyKernel3(bool *hinfo, int size) { int checkResultsVoteAnyKernel3(bool *hinfo, int size)
{
int i, error_count = 0; int i, error_count = 0;
for (i = 0; i < size * 3; i++) { for (i = 0; i < size * 3; i++) {
@ -198,7 +192,8 @@ int checkResultsVoteAnyKernel3(bool *hinfo, int size) {
return error_count; return error_count;
} }
int main(int argc, char **argv) { int main(int argc, char **argv)
{
unsigned int *h_input, *h_result; unsigned int *h_input, *h_result;
unsigned int *d_input, *d_result; unsigned int *d_input, *d_result;
@ -216,24 +211,20 @@ int main(int argc, char **argv) {
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
// Statistics about the GPU device // Statistics about the GPU device
printf( printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
"> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n", deviceProp.multiProcessorCount,
deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor); deviceProp.major,
deviceProp.minor);
h_input = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size * h_input = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size * sizeof(unsigned int));
sizeof(unsigned int)); h_result = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size * sizeof(unsigned int));
h_result = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size *
sizeof(unsigned int));
checkCudaErrors( checkCudaErrors(
cudaMalloc(reinterpret_cast<void **>(&d_input), cudaMalloc(reinterpret_cast<void **>(&d_input), VOTE_DATA_GROUP * warp_size * sizeof(unsigned int)));
VOTE_DATA_GROUP * warp_size * sizeof(unsigned int)));
checkCudaErrors( checkCudaErrors(
cudaMalloc(reinterpret_cast<void **>(&d_result), cudaMalloc(reinterpret_cast<void **>(&d_result), VOTE_DATA_GROUP * warp_size * sizeof(unsigned int)));
VOTE_DATA_GROUP * warp_size * sizeof(unsigned int)));
genVoteTestPattern(h_input, VOTE_DATA_GROUP * warp_size); genVoteTestPattern(h_input, VOTE_DATA_GROUP * warp_size);
checkCudaErrors(cudaMemcpy(d_input, h_input, checkCudaErrors(
VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), cudaMemcpy(d_input, h_input, VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), cudaMemcpyHostToDevice));
cudaMemcpyHostToDevice));
// Start of Vote Any Test Kernel #1 // Start of Vote Any Test Kernel #1
printf("[VOTE Kernel Test 1/3]\n"); printf("[VOTE Kernel Test 1/3]\n");
@ -242,16 +233,13 @@ int main(int argc, char **argv) {
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
dim3 gridBlock(1, 1); dim3 gridBlock(1, 1);
dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1); dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1);
VoteAnyKernel1<<<gridBlock, threadBlock>>>(d_input, d_result, VoteAnyKernel1<<<gridBlock, threadBlock>>>(d_input, d_result, VOTE_DATA_GROUP * warp_size);
VOTE_DATA_GROUP * warp_size);
getLastCudaError("VoteAnyKernel() execution failed\n"); getLastCudaError("VoteAnyKernel() execution failed\n");
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
} }
checkCudaErrors(cudaMemcpy(h_result, d_result, checkCudaErrors(
VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), cudaMemcpy(h_result, d_result, VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), cudaMemcpyDeviceToHost));
cudaMemcpyDeviceToHost)); error_count[0] += checkResultsVoteAnyKernel1(h_result, VOTE_DATA_GROUP * warp_size, warp_size);
error_count[0] += checkResultsVoteAnyKernel1(
h_result, VOTE_DATA_GROUP * warp_size, warp_size);
// Start of Vote All Test Kernel #2 // Start of Vote All Test Kernel #2
printf("\n[VOTE Kernel Test 2/3]\n"); printf("\n[VOTE Kernel Test 2/3]\n");
@ -260,23 +248,18 @@ int main(int argc, char **argv) {
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
dim3 gridBlock(1, 1); dim3 gridBlock(1, 1);
dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1); dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1);
VoteAllKernel2<<<gridBlock, threadBlock>>>(d_input, d_result, VoteAllKernel2<<<gridBlock, threadBlock>>>(d_input, d_result, VOTE_DATA_GROUP * warp_size);
VOTE_DATA_GROUP * warp_size);
getLastCudaError("VoteAllKernel() execution failed\n"); getLastCudaError("VoteAllKernel() execution failed\n");
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
} }
checkCudaErrors(cudaMemcpy(h_result, d_result, checkCudaErrors(
VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), cudaMemcpy(h_result, d_result, VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), cudaMemcpyDeviceToHost));
cudaMemcpyDeviceToHost)); error_count[1] += checkResultsVoteAllKernel2(h_result, VOTE_DATA_GROUP * warp_size, warp_size);
error_count[1] += checkResultsVoteAllKernel2(
h_result, VOTE_DATA_GROUP * warp_size, warp_size);
// Second Vote Kernel Test #3 (both Any/All) // Second Vote Kernel Test #3 (both Any/All)
hinfo = reinterpret_cast<bool *>(calloc(warp_size * 3 * 3, sizeof(bool))); hinfo = reinterpret_cast<bool *>(calloc(warp_size * 3 * 3, sizeof(bool)));
cudaMalloc(reinterpret_cast<void **>(&dinfo), cudaMalloc(reinterpret_cast<void **>(&dinfo), warp_size * 3 * 3 * sizeof(bool));
warp_size * 3 * 3 * sizeof(bool)); cudaMemcpy(dinfo, hinfo, warp_size * 3 * 3 * sizeof(bool), cudaMemcpyHostToDevice);
cudaMemcpy(dinfo, hinfo, warp_size * 3 * 3 * sizeof(bool),
cudaMemcpyHostToDevice);
printf("\n[VOTE Kernel Test 3/3]\n"); printf("\n[VOTE Kernel Test 3/3]\n");
printf("\tRunning <<Vote.Any>> kernel3 ...\n"); printf("\tRunning <<Vote.Any>> kernel3 ...\n");
@ -286,8 +269,7 @@ int main(int argc, char **argv) {
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
} }
cudaMemcpy(hinfo, dinfo, warp_size * 3 * 3 * sizeof(bool), cudaMemcpy(hinfo, dinfo, warp_size * 3 * 3 * sizeof(bool), cudaMemcpyDeviceToHost);
cudaMemcpyDeviceToHost);
error_count[2] = checkResultsVoteAnyKernel3(hinfo, warp_size * 3); error_count[2] = checkResultsVoteAnyKernel3(hinfo, warp_size * 3);
@ -303,7 +285,5 @@ int main(int argc, char **argv) {
printf("\tShutting down...\n"); printf("\tShutting down...\n");
return (error_count[0] == 0 && error_count[1] == 0 && error_count[2] == 0) return (error_count[0] == 0 && error_count[1] == 0 && error_count[2] == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
? EXIT_SUCCESS
: EXIT_FAILURE;
} }

View File

@ -38,8 +38,8 @@
// If ANY one of the threads (within the warp) of the predicated condition // If ANY one of the threads (within the warp) of the predicated condition
// returns a non-zero value, then all threads within this warp will return a // returns a non-zero value, then all threads within this warp will return a
// non-zero value // non-zero value
__global__ void VoteAnyKernel1(unsigned int *input, unsigned int *result, __global__ void VoteAnyKernel1(unsigned int *input, unsigned int *result, int size)
int size) { {
int tx = threadIdx.x; int tx = threadIdx.x;
int mask = 0xffffffff; int mask = 0xffffffff;
@ -50,8 +50,8 @@ __global__ void VoteAnyKernel1(unsigned int *input, unsigned int *result,
// If ALL of the threads (within the warp) of the predicated condition returns // If ALL of the threads (within the warp) of the predicated condition returns
// a non-zero value, then all threads within this warp will return a non-zero // a non-zero value, then all threads within this warp will return a non-zero
// value // value
__global__ void VoteAllKernel2(unsigned int *input, unsigned int *result, __global__ void VoteAllKernel2(unsigned int *input, unsigned int *result, int size)
int size) { {
int tx = threadIdx.x; int tx = threadIdx.x;
int mask = 0xffffffff; int mask = 0xffffffff;
@ -60,7 +60,8 @@ __global__ void VoteAllKernel2(unsigned int *input, unsigned int *result,
// Kernel #3 is a directed test for the across-the-warp vote(all) intrinsic. // Kernel #3 is a directed test for the across-the-warp vote(all) intrinsic.
// This kernel will test for conditions across warps, and within half warps // This kernel will test for conditions across warps, and within half warps
__global__ void VoteAnyKernel3(bool *info, int warp_size) { __global__ void VoteAnyKernel3(bool *info, int warp_size)
{
int tx = threadIdx.x; int tx = threadIdx.x;
unsigned int mask = 0xffffffff; unsigned int mask = 0xffffffff;
bool *offs = info + (tx * 3); bool *offs = info + (tx * 3);

View File

@ -41,7 +41,8 @@
#endif #endif
/* Add two vectors on the GPU */ /* Add two vectors on the GPU */
__global__ void vectorAddGPU(float *a, float *b, float *c, int N) { __global__ void vectorAddGPU(float *a, float *b, float *c, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x; int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < N) { if (idx < N) {
@ -57,7 +58,8 @@ bool bPinGenericMemory = false;
#define MEMORY_ALIGNMENT 4096 #define MEMORY_ALIGNMENT 4096
#define ALIGN_UP(x, size) (((size_t)x + (size - 1)) & (~(size - 1))) #define ALIGN_UP(x, size) (((size_t)x + (size - 1)) & (~(size - 1)))
int main(int argc, char **argv) { int main(int argc, char **argv)
{
int n, nelem, deviceCount; int n, nelem, deviceCount;
int idev = 0; // use default device 0 int idev = 0; // use default device 0
char *device = NULL; char *device = NULL;
@ -73,8 +75,7 @@ int main(int argc, char **argv) {
printf("Usage: simpleZeroCopy [OPTION]\n\n"); printf("Usage: simpleZeroCopy [OPTION]\n\n");
printf("Options:\n"); printf("Options:\n");
printf(" --device=[device #] Specify the device to be used\n"); printf(" --device=[device #] Specify the device to be used\n");
printf( printf(" --use_generic_memory (optional) use generic page-aligned for system "
" --use_generic_memory (optional) use generic page-aligned for system "
"memory\n"); "memory\n");
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }
@ -85,9 +86,7 @@ int main(int argc, char **argv) {
idev = atoi(device); idev = atoi(device);
if (idev >= deviceCount || idev < 0) { if (idev >= deviceCount || idev < 0) {
fprintf(stderr, fprintf(stderr, "Device number %d is invalid, will use default CUDA device 0.\n", idev);
"Device number %d is invalid, will use default CUDA device 0.\n",
idev);
idev = 0; idev = 0;
} }
} }
@ -108,7 +107,8 @@ int main(int argc, char **argv) {
if (bPinGenericMemory) { if (bPinGenericMemory) {
printf("> Using Generic System Paged Memory (malloc)\n"); printf("> Using Generic System Paged Memory (malloc)\n");
} else { }
else {
printf("> Using CUDA Host Allocated (cudaHostAlloc)\n"); printf("> Using CUDA Host Allocated (cudaHostAlloc)\n");
} }
@ -122,8 +122,7 @@ int main(int argc, char **argv) {
#if CUDART_VERSION >= 2020 #if CUDART_VERSION >= 2020
if (!deviceProp.canMapHostMemory) { if (!deviceProp.canMapHostMemory) {
fprintf(stderr, "Device %d does not support mapping CPU host memory!\n", fprintf(stderr, "Device %d does not support mapping CPU host memory!\n", idev);
idev);
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
} }
@ -133,7 +132,9 @@ int main(int argc, char **argv) {
fprintf(stderr, fprintf(stderr,
"CUDART version %d.%d does not support " "CUDART version %d.%d does not support "
"<cudaDeviceProp.canMapHostMemory> field\n", "<cudaDeviceProp.canMapHostMemory> field\n",
, CUDART_VERSION / 1000, (CUDART_VERSION % 100) / 10); ,
CUDART_VERSION / 1000,
(CUDART_VERSION % 100) / 10);
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
#endif #endif
@ -141,10 +142,10 @@ int main(int argc, char **argv) {
#if CUDART_VERSION < 4000 #if CUDART_VERSION < 4000
if (bPinGenericMemory) { if (bPinGenericMemory) {
fprintf( fprintf(stderr,
stderr,
"CUDART version %d.%d does not support <cudaHostRegister> function\n", "CUDART version %d.%d does not support <cudaHostRegister> function\n",
CUDART_VERSION / 1000, (CUDART_VERSION % 100) / 10); CUDART_VERSION / 1000,
(CUDART_VERSION % 100) / 10);
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
} }
@ -172,7 +173,8 @@ int main(int argc, char **argv) {
checkCudaErrors(cudaHostRegister(b, bytes, cudaHostRegisterMapped)); checkCudaErrors(cudaHostRegister(b, bytes, cudaHostRegisterMapped));
checkCudaErrors(cudaHostRegister(c, bytes, cudaHostRegisterMapped)); checkCudaErrors(cudaHostRegister(c, bytes, cudaHostRegisterMapped));
#endif #endif
} else { }
else {
#if CUDART_VERSION >= 2020 #if CUDART_VERSION >= 2020
flags = cudaHostAllocMapped; flags = cudaHostAllocMapped;
checkCudaErrors(cudaHostAlloc((void **)&a, bytes, flags)); checkCudaErrors(cudaHostAlloc((void **)&a, bytes, flags));
@ -235,7 +237,8 @@ int main(int argc, char **argv) {
free(b_UA); free(b_UA);
free(c_UA); free(c_UA);
#endif #endif
} else { }
else {
#if CUDART_VERSION >= 2020 #if CUDART_VERSION >= 2020
checkCudaErrors(cudaFreeHost(a)); checkCudaErrors(cudaFreeHost(a));
checkCudaErrors(cudaFreeHost(b)); checkCudaErrors(cudaFreeHost(b));

View File

@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in [Dependencies]() section above are installed. Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## References (for more details) ## References (for more details)

View File

@ -29,19 +29,20 @@
* memory. * memory.
*/ */
#include <cstdio>
#include <ctime>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <helper_cuda.h> #include <helper_cuda.h>
#include <math.h> #include <math.h>
#include <stdint.h> #include <stdint.h>
#include <cstdio>
#include <ctime>
#define min(a, b) (a) < (b) ? (a) : (b) #define min(a, b) (a) < (b) ? (a) : (b)
#define max(a, b) (a) > (b) ? (a) : (b) #define max(a, b) (a) > (b) ? (a) : (b)
#define LOOP_NUM 50 #define LOOP_NUM 50
__global__ void atomicKernel(int *atom_arr) { __global__ void atomicKernel(int *atom_arr)
{
unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x; unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = 0; i < LOOP_NUM; i++) { for (int i = 0; i < LOOP_NUM; i++) {
@ -79,7 +80,8 @@ __global__ void atomicKernel(int *atom_arr) {
} }
} }
void atomicKernel_CPU(int *atom_arr, int no_of_threads) { void atomicKernel_CPU(int *atom_arr, int no_of_threads)
{
for (int i = no_of_threads; i < 2 * no_of_threads; i++) { for (int i = no_of_threads; i < 2 * no_of_threads; i++) {
for (int j = 0; j < LOOP_NUM; j++) { for (int j = 0; j < LOOP_NUM; j++) {
// Atomic addition // Atomic addition
@ -92,23 +94,20 @@ void atomicKernel_CPU(int *atom_arr, int no_of_threads) {
int old, expected; int old, expected;
do { do {
expected = atom_arr[2]; expected = atom_arr[2];
old = __sync_val_compare_and_swap(&atom_arr[2], expected, old = __sync_val_compare_and_swap(&atom_arr[2], expected, max(expected, i));
max(expected, i));
} while (old != expected); } while (old != expected);
// Atomic minimum // Atomic minimum
do { do {
expected = atom_arr[3]; expected = atom_arr[3];
old = __sync_val_compare_and_swap(&atom_arr[3], expected, old = __sync_val_compare_and_swap(&atom_arr[3], expected, min(expected, i));
min(expected, i));
} while (old != expected); } while (old != expected);
// Atomic increment (modulo 17+1) // Atomic increment (modulo 17+1)
int limit = 17; int limit = 17;
do { do {
expected = atom_arr[4]; expected = atom_arr[4];
old = __sync_val_compare_and_swap( old = __sync_val_compare_and_swap(&atom_arr[4], expected, (expected >= limit) ? 0 : expected + 1);
&atom_arr[4], expected, (expected >= limit) ? 0 : expected + 1);
} while (old != expected); } while (old != expected);
// Atomic decrement // Atomic decrement
@ -116,8 +115,7 @@ void atomicKernel_CPU(int *atom_arr, int no_of_threads) {
do { do {
expected = atom_arr[5]; expected = atom_arr[5];
old = __sync_val_compare_and_swap( old = __sync_val_compare_and_swap(
&atom_arr[5], expected, &atom_arr[5], expected, ((expected == 0) || (expected > limit)) ? limit : expected - 1);
((expected == 0) || (expected > limit)) ? limit : expected - 1);
} while (old != expected); } while (old != expected);
// Atomic compare-and-swap // Atomic compare-and-swap
@ -145,7 +143,8 @@ void atomicKernel_CPU(int *atom_arr, int no_of_threads) {
//! @param idata input data as provided to device //! @param idata input data as provided to device
//! @param len number of elements in reference / idata //! @param len number of elements in reference / idata
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int verify(int *testData, const int len) { int verify(int *testData, const int len)
{
int val = 0; int val = 0;
for (int i = 0; i < len * LOOP_NUM; ++i) { for (int i = 0; i < len * LOOP_NUM; ++i) {
@ -275,7 +274,8 @@ int verify(int *testData, const int len) {
return true; return true;
} }
int main(int argc, char **argv) { int main(int argc, char **argv)
{
// set device // set device
cudaDeviceProp device_prop; cudaDeviceProp device_prop;
int dev_id = findCudaDevice(argc, (const char **)argv); int dev_id = findCudaDevice(argc, (const char **)argv);
@ -296,8 +296,7 @@ int main(int argc, char **argv) {
} }
if (device_prop.major < 6) { if (device_prop.major < 6) {
printf( printf("%s: requires a minimum CUDA compute 6.0 capability, waiving "
"%s: requires a minimum CUDA compute 6.0 capability, waiving "
"testing.\n", "testing.\n",
argv[0]); argv[0]);
exit(EXIT_WAIVED); exit(EXIT_WAIVED);
@ -312,12 +311,14 @@ int main(int argc, char **argv) {
if (device_prop.pageableMemoryAccess) { if (device_prop.pageableMemoryAccess) {
printf("CAN access pageable memory\n"); printf("CAN access pageable memory\n");
atom_arr = (int *)malloc(sizeof(int) * numData); atom_arr = (int *)malloc(sizeof(int) * numData);
} else { }
else {
printf("CANNOT access pageable memory\n"); printf("CANNOT access pageable memory\n");
checkCudaErrors(cudaMallocManaged(&atom_arr, sizeof(int) * numData)); checkCudaErrors(cudaMallocManaged(&atom_arr, sizeof(int) * numData));
} }
for (unsigned int i = 0; i < numData; i++) atom_arr[i] = 0; for (unsigned int i = 0; i < numData; i++)
atom_arr[i] = 0;
// To make the AND and XOR tests generate something other than 0... // To make the AND and XOR tests generate something other than 0...
atom_arr[7] = atom_arr[9] = 0xff; atom_arr[7] = atom_arr[9] = 0xff;
@ -332,11 +333,11 @@ int main(int argc, char **argv) {
if (device_prop.pageableMemoryAccess) { if (device_prop.pageableMemoryAccess) {
free(atom_arr); free(atom_arr);
} else { }
else {
cudaFree(atom_arr); cudaFree(atom_arr);
} }
printf("systemWideAtomics completed, returned %s \n", printf("systemWideAtomics completed, returned %s \n", testResult ? "OK" : "ERROR!");
testResult ? "OK" : "ERROR!");
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
} }

View File

@ -31,10 +31,10 @@
*/ */
// includes, system // includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h> #include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// includes CUDA // includes CUDA
#include <cuda_runtime.h> #include <cuda_runtime.h>
@ -47,15 +47,15 @@
// declaration, forward // declaration, forward
void runTest(int argc, char **argv); void runTest(int argc, char **argv);
extern "C" void computeGold(float *reference, float *idata, extern "C" void computeGold(float *reference, float *idata, const unsigned int len);
const unsigned int len);
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Simple test kernel for device functionality //! Simple test kernel for device functionality
//! @param g_idata input data in global memory //! @param g_idata input data in global memory
//! @param g_odata output data in global memory //! @param g_odata output data in global memory
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
__global__ void testKernel(float *g_idata, float *g_odata) { __global__ void testKernel(float *g_idata, float *g_odata)
{
// shared memory // shared memory
// the size is determined by the host application // the size is determined by the host application
extern __shared__ float sdata[]; extern __shared__ float sdata[];
@ -85,7 +85,8 @@ int main(int argc, char **argv) { runTest(argc, argv); }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA //! Run a simple test for CUDA
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) { void runTest(int argc, char **argv)
{
bool bTestResult = true; bool bTestResult = true;
printf("%s Starting...\n\n", argv[0]); printf("%s Starting...\n\n", argv[0]);
@ -113,8 +114,7 @@ void runTest(int argc, char **argv) {
float *d_idata; float *d_idata;
checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size)); checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size));
// copy host memory to device // copy host memory to device
checkCudaErrors( checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));
cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));
// allocate device memory for result // allocate device memory for result
float *d_odata; float *d_odata;
@ -133,8 +133,7 @@ void runTest(int argc, char **argv) {
// allocate mem for the result on host side // allocate mem for the result on host side
float *h_odata = (float *)malloc(mem_size); float *h_odata = (float *)malloc(mem_size);
// copy result from device to host // copy result from device to host
checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(float) * num_threads, checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(float) * num_threads, cudaMemcpyDeviceToHost));
cudaMemcpyDeviceToHost));
sdkStopTimer(&timer); sdkStopTimer(&timer);
printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
@ -148,7 +147,8 @@ void runTest(int argc, char **argv) {
if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
// write file for regression test // write file for regression test
sdkWriteFile("./data/regression.dat", h_odata, num_threads, 0.0f, false); sdkWriteFile("./data/regression.dat", h_odata, num_threads, 0.0f, false);
} else { }
else {
// custom output handling when no regression test running // custom output handling when no regression test running
// in this case check if the result is equivalent to the expected solution // in this case check if the result is equivalent to the expected solution
bTestResult = compareData(reference, h_odata, num_threads, 0.0f, 0.0f); bTestResult = compareData(reference, h_odata, num_threads, 0.0f, 0.0f);

View File

@ -26,8 +26,7 @@
*/ */
// export C interface // export C interface
extern "C" void computeGold(float *reference, float *idata, extern "C" void computeGold(float *reference, float *idata, const unsigned int len);
const unsigned int len);
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set //! Compute reference data set
@ -36,7 +35,8 @@ extern "C" void computeGold(float *reference, float *idata,
//! @param idata input data as provided to device //! @param idata input data as provided to device
//! @param len number of elements in reference / idata //! @param len number of elements in reference / idata
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void computeGold(float *reference, float *idata, const unsigned int len) { void computeGold(float *reference, float *idata, const unsigned int len)
{
const float f_len = static_cast<float>(len); const float f_len = static_cast<float>(len);
for (unsigned int i = 0; i < len; ++i) { for (unsigned int i = 0; i < len; ++i) {

View File

@ -37,7 +37,6 @@
// For the CUDA runtime routines (prefixed with "cuda_") // For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <helper_cuda.h> #include <helper_cuda.h>
/** /**
* CUDA Kernel Device code * CUDA Kernel Device code
@ -45,8 +44,8 @@
* Computes the vector addition of A and B into C. The 3 vectors have the same * Computes the vector addition of A and B into C. The 3 vectors have the same
* number of elements numElements. * number of elements numElements.
*/ */
__global__ void vectorAdd(const float *A, const float *B, float *C, __global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
int numElements) { {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements) { if (i < numElements) {
@ -57,7 +56,8 @@ __global__ void vectorAdd(const float *A, const float *B, float *C,
/** /**
* Host main routine * Host main routine
*/ */
int main(void) { int main(void)
{
// Error code to check return values for CUDA calls // Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess; cudaError_t err = cudaSuccess;
@ -92,8 +92,7 @@ int main(void) {
err = cudaMalloc((void **)&d_A, size); err = cudaMalloc((void **)&d_A, size);
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
cudaGetErrorString(err));
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
@ -102,8 +101,7 @@ int main(void) {
err = cudaMalloc((void **)&d_B, size); err = cudaMalloc((void **)&d_B, size);
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
cudaGetErrorString(err));
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
@ -112,8 +110,7 @@ int main(void) {
err = cudaMalloc((void **)&d_C, size); err = cudaMalloc((void **)&d_C, size);
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
cudaGetErrorString(err));
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
@ -124,32 +121,26 @@ int main(void) {
err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice); err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
"Failed to copy vector A from host to device (error code %s)!\n",
cudaGetErrorString(err));
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice); err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
"Failed to copy vector B from host to device (error code %s)!\n",
cudaGetErrorString(err));
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
// Launch the Vector Add CUDA Kernel // Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256; int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
threadsPerBlock);
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements); vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
err = cudaGetLastError(); err = cudaGetLastError();
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
cudaGetErrorString(err));
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
@ -159,9 +150,7 @@ int main(void) {
err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost); err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
"Failed to copy vector C from device to host (error code %s)!\n",
cudaGetErrorString(err));
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
@ -179,24 +168,21 @@ int main(void) {
err = cudaFree(d_A); err = cudaFree(d_A);
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, "Failed to free device vector A (error code %s)!\n", fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
cudaGetErrorString(err));
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
err = cudaFree(d_B); err = cudaFree(d_B);
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, "Failed to free device vector B (error code %s)!\n", fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
cudaGetErrorString(err));
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
err = cudaFree(d_C); err = cudaFree(d_C);
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, "Failed to free device vector C (error code %s)!\n", fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
cudaGetErrorString(err));
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }

View File

@ -34,11 +34,11 @@
*/ */
// Includes // Includes
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <cstring> #include <cstring>
#include <cuda.h> #include <cuda.h>
#include <iostream>
#include <stdio.h>
#include <string.h>
// includes, project // includes, project
#include <helper_cuda_drvapi.h> #include <helper_cuda_drvapi.h>
@ -72,7 +72,8 @@ bool findModulePath(const char *, string &, char **, string &);
#endif #endif
// Host code // Host code
int main(int argc, char **argv) { int main(int argc, char **argv)
{
printf("Vector Addition (Driver API)\n"); printf("Vector Addition (Driver API)\n");
int N = 50000, devID = 0; int N = 50000, devID = 0;
size_t size = N * sizeof(float); size_t size = N * sizeof(float);
@ -91,7 +92,8 @@ int main(int argc, char **argv) {
if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) { if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} else { }
else {
printf("> initCUDA loading module: <%s>\n", module_path.c_str()); printf("> initCUDA loading module: <%s>\n", module_path.c_str());
} }
@ -104,8 +106,7 @@ int main(int argc, char **argv) {
checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str())); checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));
// Get function handle from module // Get function handle from module
checkCudaErrors( checkCudaErrors(cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));
cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));
// Allocate input vectors h_A and h_B in host memory // Allocate input vectors h_A and h_B in host memory
h_A = (float *)malloc(size); h_A = (float *)malloc(size);
@ -139,9 +140,9 @@ int main(int argc, char **argv) {
void *args[] = {&d_A, &d_B, &d_C, &N}; void *args[] = {&d_A, &d_B, &d_C, &N};
// Launch the CUDA kernel // Launch the CUDA kernel
checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, args, NULL));
threadsPerBlock, 1, 1, 0, NULL, args, NULL)); }
} else { else {
// This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel // This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel
// Launch (advanced method) // Launch (advanced method)
int offset = 0; int offset = 0;
@ -160,9 +161,8 @@ int main(int argc, char **argv) {
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
// Launch the CUDA kernel // Launch the CUDA kernel
checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, checkCudaErrors(
threadsPerBlock, 1, 1, 0, NULL, NULL, cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, NULL, argBuffer));
argBuffer));
} }
#ifdef _DEBUG #ifdef _DEBUG
@ -190,7 +190,8 @@ int main(int argc, char **argv) {
exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE); exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE);
} }
int CleanupNoFailure() { int CleanupNoFailure()
{
// Free device memory // Free device memory
checkCudaErrors(cuMemFree(d_A)); checkCudaErrors(cuMemFree(d_A));
checkCudaErrors(cuMemFree(d_B)); checkCudaErrors(cuMemFree(d_B));
@ -214,7 +215,8 @@ int CleanupNoFailure() {
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }
// Allocates an array with random float entries. // Allocates an array with random float entries.
void RandomInit(float *data, int n) { void RandomInit(float *data, int n)
{
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
data[i] = rand() / (float)RAND_MAX; data[i] = rand() / (float)RAND_MAX;
} }

View File

@ -33,9 +33,10 @@
*/ */
// Device code
//
// Element-wise vector addition: C[idx] = A[idx] + B[idx].
// Every thread derives a single flat index from the x components of its
// block and thread coordinates and handles exactly that one element.
// extern "C" keeps the symbol name unmangled.
extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, float *C, int N)
{
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;

    // Threads whose index is at or beyond N have no element to process.
    if (idx >= N) {
        return;
    }

    C[idx] = A[idx] + B[idx];
}

View File

@ -30,4 +30,3 @@ cuMemcpyDtoH, cuDeviceCanAccessPeer, cuModuleGetFunction, cuMemSetAccess, cuMemR
Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## References (for more details) ## References (for more details)

View File

@ -29,10 +29,13 @@
// Rounds x up to the nearest multiple of y.
//
// Returns x unchanged when it is already a multiple of y. A zero y has no
// meaningful multiple, so x is returned as-is instead of dividing by zero.
// Working from the remainder avoids the intermediate sum x + y - 1, which
// can wrap around for large x and yield a result smaller than x.
static size_t round_up(size_t x, size_t y)
{
    if (y == 0) {
        return x;
    }

    const size_t remainder = x % y;
    return (remainder == 0) ? x : x + (y - remainder);
}
CUresult simpleMallocMultiDeviceMmap( CUresult simpleMallocMultiDeviceMmap(CUdeviceptr *dptr,
CUdeviceptr *dptr, size_t *allocationSize, size_t size, size_t *allocationSize,
size_t size,
const std::vector<CUdevice> &residentDevices, const std::vector<CUdevice> &residentDevices,
const std::vector<CUdevice> &mappingDevices, size_t align) { const std::vector<CUdevice> &mappingDevices,
size_t align)
{
CUresult status = CUDA_SUCCESS; CUresult status = CUDA_SUCCESS;
size_t min_granularity = 0; size_t min_granularity = 0;
size_t stripeSize; size_t stripeSize;
@ -53,8 +56,7 @@ CUresult simpleMallocMultiDeviceMmap(
// get the minimum granularity for residentDevices[idx] // get the minimum granularity for residentDevices[idx]
prop.location.id = residentDevices[idx]; prop.location.id = residentDevices[idx];
status = cuMemGetAllocationGranularity(&granularity, &prop, status = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
CU_MEM_ALLOC_GRANULARITY_MINIMUM);
if (status != CUDA_SUCCESS) { if (status != CUDA_SUCCESS) {
goto done; goto done;
} }
@ -70,8 +72,7 @@ CUresult simpleMallocMultiDeviceMmap(
// get the minimum granularity for mappingDevices[idx] // get the minimum granularity for mappingDevices[idx]
prop.location.id = mappingDevices[idx]; prop.location.id = mappingDevices[idx];
status = cuMemGetAllocationGranularity(&granularity, &prop, status = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
CU_MEM_ALLOC_GRANULARITY_MINIMUM);
if (status != CUDA_SUCCESS) { if (status != CUDA_SUCCESS) {
goto done; goto done;
} }
@ -121,8 +122,7 @@ CUresult simpleMallocMultiDeviceMmap(
// Since we do not need to make any other mappings of this memory or export // Since we do not need to make any other mappings of this memory or export
// it, we no longer need and can release the allocationHandle. The // it, we no longer need and can release the allocationHandle. The
// allocation will be kept live until it is unmapped. // allocation will be kept live until it is unmapped.
status = cuMemMap(*dptr + (stripeSize * idx), stripeSize, 0, status = cuMemMap(*dptr + (stripeSize * idx), stripeSize, 0, allocationHandle, 0);
allocationHandle, 0);
// the handle needs to be released even if the mapping failed. // the handle needs to be released even if the mapping failed.
status2 = cuMemRelease(allocationHandle); status2 = cuMemRelease(allocationHandle);
@ -157,8 +157,7 @@ CUresult simpleMallocMultiDeviceMmap(
} }
// Apply the access descriptors to the whole VA range. // Apply the access descriptors to the whole VA range.
status = cuMemSetAccess(*dptr, size, &accessDescriptors[0], status = cuMemSetAccess(*dptr, size, &accessDescriptors[0], accessDescriptors.size());
accessDescriptors.size());
if (status != CUDA_SUCCESS) { if (status != CUDA_SUCCESS) {
goto done; goto done;
} }
@ -174,7 +173,8 @@ done:
return status; return status;
} }
CUresult simpleFreeMultiDeviceMmap(CUdeviceptr dptr, size_t size) { CUresult simpleFreeMultiDeviceMmap(CUdeviceptr dptr, size_t size)
{
CUresult status = CUDA_SUCCESS; CUresult status = CUDA_SUCCESS;
// Unmap the mapped virtual memory region // Unmap the mapped virtual memory region

View File

@ -63,10 +63,12 @@
//! handle //! handle
//! is not needed after its mappings are set up. //! is not needed after its mappings are set up.
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
CUresult simpleMallocMultiDeviceMmap( CUresult simpleMallocMultiDeviceMmap(CUdeviceptr *dptr,
CUdeviceptr *dptr, size_t *allocationSize, size_t size, size_t *allocationSize,
size_t size,
const std::vector<CUdevice> &residentDevices, const std::vector<CUdevice> &residentDevices,
const std::vector<CUdevice> &mappingDevices, size_t align = 0); const std::vector<CUdevice> &mappingDevices,
size_t align = 0);
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
//! Frees resources allocated by simpleMallocMultiDeviceMmap //! Frees resources allocated by simpleMallocMultiDeviceMmap

View File

@ -36,11 +36,11 @@
*/ */
// Includes // Includes
#include <cstring>
#include <cuda.h> #include <cuda.h>
#include <iostream>
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#include <cstring>
#include <iostream>
// includes, project // includes, project
#include <helper_cuda_drvapi.h> #include <helper_cuda_drvapi.h>
@ -70,13 +70,14 @@ size_t allocationSize = 0;
int CleanupNoFailure(); int CleanupNoFailure();
void RandomInit(float *, int); void RandomInit(float *, int);
//define input fatbin file // define input fatbin file
#ifndef FATBIN_FILE #ifndef FATBIN_FILE
#define FATBIN_FILE "vectorAdd_kernel64.fatbin" #define FATBIN_FILE "vectorAdd_kernel64.fatbin"
#endif #endif
// collect all of the devices whose memory can be mapped from cuDevice. // collect all of the devices whose memory can be mapped from cuDevice.
vector<CUdevice> getBackingDevices(CUdevice cuDevice) { vector<CUdevice> getBackingDevices(CUdevice cuDevice)
{
int num_devices; int num_devices;
checkCudaErrors(cuDeviceGetCount(&num_devices)); checkCudaErrors(cuDeviceGetCount(&num_devices));
@ -100,9 +101,8 @@ vector<CUdevice> getBackingDevices(CUdevice cuDevice) {
// The device needs to support virtual address management for the required // The device needs to support virtual address management for the required
// apis to work // apis to work
checkCudaErrors(cuDeviceGetAttribute( checkCudaErrors(
&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, cuDeviceGetAttribute(&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, cuDevice));
cuDevice));
if (attributeVal == 0) { if (attributeVal == 0) {
continue; continue;
} }
@ -113,7 +113,8 @@ vector<CUdevice> getBackingDevices(CUdevice cuDevice) {
} }
// Host code // Host code
int main(int argc, char **argv) { int main(int argc, char **argv)
{
printf("Vector Addition (Driver API)\n"); printf("Vector Addition (Driver API)\n");
int N = 50000; int N = 50000;
size_t size = N * sizeof(float); size_t size = N * sizeof(float);
@ -125,11 +126,9 @@ int main(int argc, char **argv) {
cuDevice = findCudaDeviceDRV(argc, (const char **)argv); cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
// Check that the selected device supports virtual address management // Check that the selected device supports virtual address management
checkCudaErrors(cuDeviceGetAttribute( checkCudaErrors(
&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, cuDeviceGetAttribute(&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, cuDevice));
cuDevice)); printf("Device %d VIRTUAL ADDRESS MANAGEMENT SUPPORTED = %d.\n", cuDevice, attributeVal);
printf("Device %d VIRTUAL ADDRESS MANAGEMENT SUPPORTED = %d.\n", cuDevice,
attributeVal);
if (attributeVal == 0) { if (attributeVal == 0) {
printf("Device %d doesn't support VIRTUAL ADDRESS MANAGEMENT.\n", cuDevice); printf("Device %d doesn't support VIRTUAL ADDRESS MANAGEMENT.\n", cuDevice);
exit(EXIT_WAIVED); exit(EXIT_WAIVED);
@ -152,17 +151,14 @@ int main(int argc, char **argv) {
std::ostringstream fatbin; std::ostringstream fatbin;
if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
{
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
else else {
{
printf("> initCUDA loading module: <%s>\n", module_path.c_str()); printf("> initCUDA loading module: <%s>\n", module_path.c_str());
} }
if (!fatbin.str().size()) if (!fatbin.str().size()) {
{
printf("fatbin file empty. exiting..\n"); printf("fatbin file empty. exiting..\n");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
@ -204,13 +200,10 @@ int main(int argc, char **argv) {
int threadsPerBlock = 256; int threadsPerBlock = 256;
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
void *args[] = { &d_A, &d_B, &d_C, &N }; void *args[] = {&d_A, &d_B, &d_C, &N};
// Launch the CUDA kernel // Launch the CUDA kernel
checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, args, NULL));
threadsPerBlock, 1, 1,
0,
NULL, args, NULL));
// Copy result from device memory to host memory // Copy result from device memory to host memory
// h_C contains the result in host memory // h_C contains the result in host memory
@ -219,20 +212,18 @@ int main(int argc, char **argv) {
// Verify result // Verify result
int i; int i;
for (i = 0; i < N; ++i) for (i = 0; i < N; ++i) {
{
float sum = h_A[i] + h_B[i]; float sum = h_A[i] + h_B[i];
if (fabs(h_C[i] - sum) > 1e-7f) if (fabs(h_C[i] - sum) > 1e-7f) {
{
break; break;
} }
} }
CleanupNoFailure(); CleanupNoFailure();
printf("%s\n", (i==N) ? "Result = PASS" : "Result = FAIL"); printf("%s\n", (i == N) ? "Result = PASS" : "Result = FAIL");
exit((i==N) ? EXIT_SUCCESS : EXIT_FAILURE); exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE);
} }
int CleanupNoFailure() int CleanupNoFailure()
@ -243,18 +234,15 @@ int CleanupNoFailure()
checkCudaErrors(simpleFreeMultiDeviceMmap(d_C, allocationSize)); checkCudaErrors(simpleFreeMultiDeviceMmap(d_C, allocationSize));
// Free host memory // Free host memory
if (h_A) if (h_A) {
{
free(h_A); free(h_A);
} }
if (h_B) if (h_B) {
{
free(h_B); free(h_B);
} }
if (h_C) if (h_C) {
{
free(h_C); free(h_C);
} }
@ -265,8 +253,7 @@ int CleanupNoFailure()
// Fills the first n entries of data with pseudo-random floats.
// Each entry is the next rand() result divided by RAND_MAX (as a float),
// so values fall in [0, 1]. No seeding happens here; the sequence follows
// whatever state rand() currently has. Nothing is written when n <= 0.
void RandomInit(float *data, int n)
{
    const float scale = (float)RAND_MAX;

    int idx = 0;
    while (idx < n) {
        data[idx] = rand() / scale;
        ++idx;
    }
}

View File

@ -34,9 +34,10 @@
*/ */
// Device code
//
// Adds two float vectors element by element, storing A[pos] + B[pos] in
// C[pos]. Each thread computes one position from blockDim.x, blockIdx.x and
// threadIdx.x; positions at or past N are skipped so that a launch with
// more threads than elements never touches memory beyond the N entries.
extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, float *C, int N)
{
    const int blockStart = blockDim.x * blockIdx.x;
    const int pos        = blockStart + threadIdx.x;

    const bool inRange = pos < N;
    if (inRange) {
        C[pos] = A[pos] + B[pos];
    }
}

View File

@ -33,8 +33,8 @@
* of the programming guide with some additions like error checking. * of the programming guide with some additions like error checking.
*/ */
#include <stdio.h>
#include <cmath> #include <cmath>
#include <stdio.h>
// For the CUDA runtime routines (prefixed with "cuda_") // For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda.h> #include <cuda.h>
@ -42,13 +42,13 @@
// helper functions and utilities to work with CUDA // helper functions and utilities to work with CUDA
#include <helper_functions.h> #include <helper_functions.h>
#include <nvrtc_helper.h> #include <nvrtc_helper.h>
/** /**
* Host main routine * Host main routine
*/ */
int main(int argc, char **argv) { int main(int argc, char **argv)
{
char *cubin, *kernel_file; char *cubin, *kernel_file;
size_t cubinSize; size_t cubinSize;
kernel_file = sdkFindFilePath("vectorAdd_kernel.cu", argv[0]); kernel_file = sdkFindFilePath("vectorAdd_kernel.cu", argv[0]);
@ -105,19 +105,23 @@ int main(int argc, char **argv) {
// Launch the Vector Add CUDA Kernel // Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256; int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
threadsPerBlock);
dim3 cudaBlockSize(threadsPerBlock, 1, 1); dim3 cudaBlockSize(threadsPerBlock, 1, 1);
dim3 cudaGridSize(blocksPerGrid, 1, 1); dim3 cudaGridSize(blocksPerGrid, 1, 1);
void *arr[] = {reinterpret_cast<void *>(&d_A), reinterpret_cast<void *>(&d_B), void *arr[] = {reinterpret_cast<void *>(&d_A),
reinterpret_cast<void *>(&d_B),
reinterpret_cast<void *>(&d_C), reinterpret_cast<void *>(&d_C),
reinterpret_cast<void *>(&numElements)}; reinterpret_cast<void *>(&numElements)};
checkCudaErrors(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y, checkCudaErrors(cuLaunchKernel(kernel_addr,
cudaGridSize.x,
cudaGridSize.y,
cudaGridSize.z, /* grid dim */ cudaGridSize.z, /* grid dim */
cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.x,
cudaBlockSize.y,
cudaBlockSize.z, /* block dim */ cudaBlockSize.z, /* block dim */
0, 0, /* shared mem, stream */ 0,
0, /* shared mem, stream */
&arr[0], /* arguments */ &arr[0], /* arguments */
0)); 0));
checkCudaErrors(cuCtxSynchronize()); checkCudaErrors(cuCtxSynchronize());

View File

@ -32,8 +32,8 @@
* number of elements numElements. * number of elements numElements.
*/ */
extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C, extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
int numElements) { {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements) { if (i < numElements) {

View File

@ -39,12 +39,10 @@
#include <cuda_runtime.h> #include <cuda_runtime.h>
// includes // includes
#include <cassert>
#include <cuda.h>
#include <helper_cuda.h> // helper functions for CUDA error checking and initialization #include <helper_cuda.h> // helper functions for CUDA error checking and initialization
#include <helper_functions.h> // helper for shared functions common to CUDA Samples #include <helper_functions.h> // helper for shared functions common to CUDA Samples
#include <cuda.h>
#include <cassert>
#include <iostream> #include <iostream>
#include <memory> #include <memory>
@ -83,8 +81,7 @@ enum memcpyKind { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE };
enum printMode { USER_READABLE, CSV }; enum printMode { USER_READABLE, CSV };
enum memoryMode { PINNED, PAGEABLE }; enum memoryMode { PINNED, PAGEABLE };
const char *sMemoryCopyKind[] = {"Device to Host", "Host to Device", const char *sMemoryCopyKind[] = {"Device to Host", "Host to Device", "Device to Device", NULL};
"Device to Device", NULL};
const char *sMemoryMode[] = {"PINNED", "PAGEABLE", NULL}; const char *sMemoryMode[] = {"PINNED", "PAGEABLE", NULL};
@ -97,36 +94,62 @@ char **pArgv = NULL;
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// declaration, forward // declaration, forward
int runTest(const int argc, const char **argv); int runTest(const int argc, const char **argv);
void testBandwidth(unsigned int start, unsigned int end, unsigned int increment, void testBandwidth(unsigned int start,
testMode mode, memcpyKind kind, printMode printmode, unsigned int end,
memoryMode memMode, int startDevice, int endDevice, bool wc); unsigned int increment,
void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode, testMode mode,
memoryMode memMode, int startDevice, int endDevice, memcpyKind kind,
printMode printmode,
memoryMode memMode,
int startDevice,
int endDevice,
bool wc); bool wc);
void testBandwidthRange(unsigned int start, unsigned int end, void testBandwidthQuick(unsigned int size,
unsigned int increment, memcpyKind kind, memcpyKind kind,
printMode printmode, memoryMode memMode, printMode printmode,
int startDevice, int endDevice, bool wc); memoryMode memMode,
void testBandwidthShmoo(memcpyKind kind, printMode printmode, int startDevice,
memoryMode memMode, int startDevice, int endDevice, int endDevice,
bool wc); bool wc);
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, void testBandwidthRange(unsigned int start,
unsigned int end,
unsigned int increment,
memcpyKind kind,
printMode printmode,
memoryMode memMode,
int startDevice,
int endDevice,
bool wc); bool wc);
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, void testBandwidthShmoo(memcpyKind kind,
printMode printmode,
memoryMode memMode,
int startDevice,
int endDevice,
bool wc); bool wc);
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc);
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc);
float testDeviceToDeviceTransfer(unsigned int memSize); float testDeviceToDeviceTransfer(unsigned int memSize);
void printResultsReadable(unsigned int *memSizes, double *bandwidths, void printResultsReadable(unsigned int *memSizes,
unsigned int count, memcpyKind kind, double *bandwidths,
memoryMode memMode, int iNumDevs, bool wc); unsigned int count,
void printResultsCSV(unsigned int *memSizes, double *bandwidths, memcpyKind kind,
unsigned int count, memcpyKind kind, memoryMode memMode, memoryMode memMode,
int iNumDevs, bool wc); int iNumDevs,
bool wc);
void printResultsCSV(unsigned int *memSizes,
double *bandwidths,
unsigned int count,
memcpyKind kind,
memoryMode memMode,
int iNumDevs,
bool wc);
void printHelp(void); void printHelp(void);
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
{
pArgc = &argc; pArgc = &argc;
pArgv = argv; pArgv = argv;
@ -144,8 +167,7 @@ int main(int argc, char **argv) {
// finish // finish
printf("%s\n", (iRetVal == 0) ? "Result = PASS" : "Result = FAIL"); printf("%s\n", (iRetVal == 0) ? "Result = PASS" : "Result = FAIL");
printf( printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
"\nNOTE: The CUDA Samples are not meant for performance measurements. "
"Results may vary when GPU Boost is enabled.\n"); "Results may vary when GPU Boost is enabled.\n");
free(flush_buf); free(flush_buf);
@ -156,7 +178,8 @@ int main(int argc, char **argv) {
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// Parse args, run the appropriate tests // Parse args, run the appropriate tests
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
int runTest(const int argc, const char **argv) { int runTest(const int argc, const char **argv)
{
int start = DEFAULT_SIZE; int start = DEFAULT_SIZE;
int end = DEFAULT_SIZE; int end = DEFAULT_SIZE;
int startDevice = 0; int startDevice = 0;
@ -186,14 +209,17 @@ int runTest(const int argc, const char **argv) {
if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr)) { if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr)) {
if (strcmp(memModeStr, "pageable") == 0) { if (strcmp(memModeStr, "pageable") == 0) {
memMode = PAGEABLE; memMode = PAGEABLE;
} else if (strcmp(memModeStr, "pinned") == 0) { }
else if (strcmp(memModeStr, "pinned") == 0) {
memMode = PINNED; memMode = PINNED;
} else { }
else {
printf("Invalid memory mode - valid modes are pageable or pinned\n"); printf("Invalid memory mode - valid modes are pageable or pinned\n");
printf("See --help for more information\n"); printf("See --help for more information\n");
return -1000; return -1000;
} }
} else { }
else {
// default - pinned memory // default - pinned memory
memMode = PINNED; memMode = PINNED;
} }
@ -203,8 +229,7 @@ int runTest(const int argc, const char **argv) {
cudaError_t error_id = cudaGetDeviceCount(&deviceCount); cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
if (error_id != cudaSuccess) { if (error_id != cudaSuccess) {
printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
cudaGetErrorString(error_id));
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
@ -214,19 +239,19 @@ int runTest(const int argc, const char **argv) {
} }
if (strcmp(device, "all") == 0) { if (strcmp(device, "all") == 0) {
printf( printf("\n!!!!!Cumulative Bandwidth to be computed from all the devices "
"\n!!!!!Cumulative Bandwidth to be computed from all the devices "
"!!!!!!\n\n"); "!!!!!!\n\n");
startDevice = 0; startDevice = 0;
endDevice = deviceCount - 1; endDevice = deviceCount - 1;
} else { }
else {
startDevice = endDevice = atoi(device); startDevice = endDevice = atoi(device);
if (startDevice >= deviceCount || startDevice < 0) { if (startDevice >= deviceCount || startDevice < 0) {
printf( printf("\n!!!!!Invalid GPU number %d given hence default gpu %d will be "
"\n!!!!!Invalid GPU number %d given hence default gpu %d will be "
"used !!!!!\n", "used !!!!!\n",
startDevice, 0); startDevice,
0);
startDevice = endDevice = 0; startDevice = endDevice = 0;
} }
} }
@ -234,8 +259,7 @@ int runTest(const int argc, const char **argv) {
printf("Running on...\n\n"); printf("Running on...\n\n");
for (int currentDevice = startDevice; currentDevice <= endDevice; for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) {
currentDevice++) {
cudaDeviceProp deviceProp; cudaDeviceProp deviceProp;
cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice); cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice);
@ -250,9 +274,9 @@ int runTest(const int argc, const char **argv) {
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
} else { }
printf("cudaGetDeviceProperties returned %d\n-> %s\n", (int)error_id, else {
cudaGetErrorString(error_id)); printf("cudaGetDeviceProperties returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
checkCudaErrors(cudaSetDevice(currentDevice)); checkCudaErrors(cudaSetDevice(currentDevice));
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
@ -264,18 +288,22 @@ int runTest(const int argc, const char **argv) {
if (strcmp(modeStr, "quick") == 0) { if (strcmp(modeStr, "quick") == 0) {
printf(" Quick Mode\n\n"); printf(" Quick Mode\n\n");
mode = QUICK_MODE; mode = QUICK_MODE;
} else if (strcmp(modeStr, "shmoo") == 0) { }
else if (strcmp(modeStr, "shmoo") == 0) {
printf(" Shmoo Mode\n\n"); printf(" Shmoo Mode\n\n");
mode = SHMOO_MODE; mode = SHMOO_MODE;
} else if (strcmp(modeStr, "range") == 0) { }
else if (strcmp(modeStr, "range") == 0) {
printf(" Range Mode\n\n"); printf(" Range Mode\n\n");
mode = RANGE_MODE; mode = RANGE_MODE;
} else { }
else {
printf("Invalid mode - valid modes are quick, range, or shmoo\n"); printf("Invalid mode - valid modes are quick, range, or shmoo\n");
printf("See --help for more information\n"); printf("See --help for more information\n");
return -3000; return -3000;
} }
} else { }
else {
// default mode - quick // default mode - quick
printf(" Quick Mode\n\n"); printf(" Quick Mode\n\n");
mode = QUICK_MODE; mode = QUICK_MODE;
@ -320,7 +348,8 @@ int runTest(const int argc, const char **argv) {
printf("Illegal argument - start must be greater than zero\n"); printf("Illegal argument - start must be greater than zero\n");
return -4000; return -4000;
} }
} else { }
else {
printf("Must specify a starting size in range mode\n"); printf("Must specify a starting size in range mode\n");
printf("See --help for more information\n"); printf("See --help for more information\n");
return -5000; return -5000;
@ -338,7 +367,8 @@ int runTest(const int argc, const char **argv) {
printf("Illegal argument - start is greater than end\n"); printf("Illegal argument - start is greater than end\n");
return -7000; return -7000;
} }
} else { }
else {
printf("Must specify an end size in range mode.\n"); printf("Must specify an end size in range mode.\n");
printf("See --help for more information\n"); printf("See --help for more information\n");
return -8000; return -8000;
@ -351,7 +381,8 @@ int runTest(const int argc, const char **argv) {
printf("Illegal argument - increment must be greater than zero\n"); printf("Illegal argument - increment must be greater than zero\n");
return -9000; return -9000;
} }
} else { }
else {
printf("Must specify an increment in user mode\n"); printf("Must specify an increment in user mode\n");
printf("See --help for more information\n"); printf("See --help for more information\n");
return -10000; return -10000;
@ -359,21 +390,42 @@ int runTest(const int argc, const char **argv) {
} }
if (htod) { if (htod) {
testBandwidth((unsigned int)start, (unsigned int)end, testBandwidth((unsigned int)start,
(unsigned int)increment, mode, HOST_TO_DEVICE, printmode, (unsigned int)end,
memMode, startDevice, endDevice, wc); (unsigned int)increment,
mode,
HOST_TO_DEVICE,
printmode,
memMode,
startDevice,
endDevice,
wc);
} }
if (dtoh) { if (dtoh) {
testBandwidth((unsigned int)start, (unsigned int)end, testBandwidth((unsigned int)start,
(unsigned int)increment, mode, DEVICE_TO_HOST, printmode, (unsigned int)end,
memMode, startDevice, endDevice, wc); (unsigned int)increment,
mode,
DEVICE_TO_HOST,
printmode,
memMode,
startDevice,
endDevice,
wc);
} }
if (dtod) { if (dtod) {
testBandwidth((unsigned int)start, (unsigned int)end, testBandwidth((unsigned int)start,
(unsigned int)increment, mode, DEVICE_TO_DEVICE, printmode, (unsigned int)end,
memMode, startDevice, endDevice, wc); (unsigned int)increment,
mode,
DEVICE_TO_DEVICE,
printmode,
memMode,
startDevice,
endDevice,
wc);
} }
// Ensure that we reset all CUDA Devices in question // Ensure that we reset all CUDA Devices in question
@ -387,19 +439,24 @@ int runTest(const int argc, const char **argv) {
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// Run a bandwidth test // Run a bandwidth test
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
void testBandwidth(unsigned int start, unsigned int end, unsigned int increment, void testBandwidth(unsigned int start,
testMode mode, memcpyKind kind, printMode printmode, unsigned int end,
memoryMode memMode, int startDevice, int endDevice, unsigned int increment,
bool wc) { testMode mode,
memcpyKind kind,
printMode printmode,
memoryMode memMode,
int startDevice,
int endDevice,
bool wc)
{
switch (mode) { switch (mode) {
case QUICK_MODE: case QUICK_MODE:
testBandwidthQuick(DEFAULT_SIZE, kind, printmode, memMode, startDevice, testBandwidthQuick(DEFAULT_SIZE, kind, printmode, memMode, startDevice, endDevice, wc);
endDevice, wc);
break; break;
case RANGE_MODE: case RANGE_MODE:
testBandwidthRange(start, end, increment, kind, printmode, memMode, testBandwidthRange(start, end, increment, kind, printmode, memMode, startDevice, endDevice, wc);
startDevice, endDevice, wc);
break; break;
case SHMOO_MODE: case SHMOO_MODE:
@ -414,20 +471,30 @@ void testBandwidth(unsigned int start, unsigned int end, unsigned int increment,
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
// Run a quick mode bandwidth test // Run a quick mode bandwidth test
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode, void testBandwidthQuick(unsigned int size,
memoryMode memMode, int startDevice, int endDevice, memcpyKind kind,
bool wc) { printMode printmode,
testBandwidthRange(size, size, DEFAULT_INCREMENT, kind, printmode, memMode, memoryMode memMode,
startDevice, endDevice, wc); int startDevice,
int endDevice,
bool wc)
{
testBandwidthRange(size, size, DEFAULT_INCREMENT, kind, printmode, memMode, startDevice, endDevice, wc);
} }
/////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////
// Run a range mode bandwidth test // Run a range mode bandwidth test
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
void testBandwidthRange(unsigned int start, unsigned int end, void testBandwidthRange(unsigned int start,
unsigned int increment, memcpyKind kind, unsigned int end,
printMode printmode, memoryMode memMode, unsigned int increment,
int startDevice, int endDevice, bool wc) { memcpyKind kind,
printMode printmode,
memoryMode memMode,
int startDevice,
int endDevice,
bool wc)
{
// count the number of copies we're going to run // count the number of copies we're going to run
unsigned int count = 1 + ((end - start) / increment); unsigned int count = 1 + ((end - start) / increment);
@ -441,8 +508,7 @@ void testBandwidthRange(unsigned int start, unsigned int end,
} }
// Use the device asked by the user // Use the device asked by the user
for (int currentDevice = startDevice; currentDevice <= endDevice; for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) {
currentDevice++) {
cudaSetDevice(currentDevice); cudaSetDevice(currentDevice);
// run each of the copies // run each of the copies
@ -467,11 +533,10 @@ void testBandwidthRange(unsigned int start, unsigned int end,
// print results // print results
if (printmode == CSV) { if (printmode == CSV) {
printResultsCSV(memSizes, bandwidths, count, kind, memMode, printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
(1 + endDevice - startDevice), wc); }
} else { else {
printResultsReadable(memSizes, bandwidths, count, kind, memMode, printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
(1 + endDevice - startDevice), wc);
} }
// clean up // clean up
@ -482,18 +547,21 @@ void testBandwidthRange(unsigned int start, unsigned int end,
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// Intense shmoo mode - covers a large range of values with varying increments // Intense shmoo mode - covers a large range of values with varying increments
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
void testBandwidthShmoo(memcpyKind kind, printMode printmode, void testBandwidthShmoo(memcpyKind kind,
memoryMode memMode, int startDevice, int endDevice, printMode printmode,
bool wc) { memoryMode memMode,
int startDevice,
int endDevice,
bool wc)
{
// count the number of copies to make // count the number of copies to make
unsigned int count = unsigned int count = 1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB)
1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB) + + ((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB)
((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB) + + ((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB)
((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB) + + ((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB)
((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB) + + ((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB)
((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB) + + ((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB)
((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB) + + ((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB);
((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB);
unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int)); unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int));
double *bandwidths = (double *)malloc(count * sizeof(double)); double *bandwidths = (double *)malloc(count * sizeof(double));
@ -505,8 +573,7 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
} }
// Use the device asked by the user // Use the device asked by the user
for (int currentDevice = startDevice; currentDevice <= endDevice; for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) {
currentDevice++) {
cudaSetDevice(currentDevice); cudaSetDevice(currentDevice);
// Run the shmoo // Run the shmoo
int iteration = 0; int iteration = 0;
@ -515,17 +582,23 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
while (memSize <= SHMOO_MEMSIZE_MAX) { while (memSize <= SHMOO_MEMSIZE_MAX) {
if (memSize < SHMOO_LIMIT_20KB) { if (memSize < SHMOO_LIMIT_20KB) {
memSize += SHMOO_INCREMENT_1KB; memSize += SHMOO_INCREMENT_1KB;
} else if (memSize < SHMOO_LIMIT_50KB) { }
else if (memSize < SHMOO_LIMIT_50KB) {
memSize += SHMOO_INCREMENT_2KB; memSize += SHMOO_INCREMENT_2KB;
} else if (memSize < SHMOO_LIMIT_100KB) { }
else if (memSize < SHMOO_LIMIT_100KB) {
memSize += SHMOO_INCREMENT_10KB; memSize += SHMOO_INCREMENT_10KB;
} else if (memSize < SHMOO_LIMIT_1MB) { }
else if (memSize < SHMOO_LIMIT_1MB) {
memSize += SHMOO_INCREMENT_100KB; memSize += SHMOO_INCREMENT_100KB;
} else if (memSize < SHMOO_LIMIT_16MB) { }
else if (memSize < SHMOO_LIMIT_16MB) {
memSize += SHMOO_INCREMENT_1MB; memSize += SHMOO_INCREMENT_1MB;
} else if (memSize < SHMOO_LIMIT_32MB) { }
else if (memSize < SHMOO_LIMIT_32MB) {
memSize += SHMOO_INCREMENT_2MB; memSize += SHMOO_INCREMENT_2MB;
} else { }
else {
memSize += SHMOO_INCREMENT_4MB; memSize += SHMOO_INCREMENT_4MB;
} }
@ -533,18 +606,15 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
switch (kind) { switch (kind) {
case DEVICE_TO_HOST: case DEVICE_TO_HOST:
bandwidths[iteration] += bandwidths[iteration] += testDeviceToHostTransfer(memSizes[iteration], memMode, wc);
testDeviceToHostTransfer(memSizes[iteration], memMode, wc);
break; break;
case HOST_TO_DEVICE: case HOST_TO_DEVICE:
bandwidths[iteration] += bandwidths[iteration] += testHostToDeviceTransfer(memSizes[iteration], memMode, wc);
testHostToDeviceTransfer(memSizes[iteration], memMode, wc);
break; break;
case DEVICE_TO_DEVICE: case DEVICE_TO_DEVICE:
bandwidths[iteration] += bandwidths[iteration] += testDeviceToDeviceTransfer(memSizes[iteration]);
testDeviceToDeviceTransfer(memSizes[iteration]);
break; break;
} }
@ -558,11 +628,10 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
printf("\n"); printf("\n");
if (CSV == printmode) { if (CSV == printmode) {
printResultsCSV(memSizes, bandwidths, count, kind, memMode, printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
(1 + endDevice - startDevice), wc); }
} else { else {
printResultsReadable(memSizes, bandwidths, count, kind, memMode, printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
(1 + endDevice - startDevice), wc);
} }
// clean up // clean up
@ -573,8 +642,8 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// test the bandwidth of a device to host memcopy of a specific size // test the bandwidth of a device to host memcopy of a specific size
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc)
bool wc) { {
StopWatchInterface *timer = NULL; StopWatchInterface *timer = NULL;
float elapsedTimeInMs = 0.0f; float elapsedTimeInMs = 0.0f;
float bandwidthInGBs = 0.0f; float bandwidthInGBs = 0.0f;
@ -590,15 +659,14 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
if (PINNED == memMode) { if (PINNED == memMode) {
// pinned memory mode - use special function to get OS-pinned memory // pinned memory mode - use special function to get OS-pinned memory
#if CUDART_VERSION >= 2020 #if CUDART_VERSION >= 2020
checkCudaErrors(cudaHostAlloc((void **)&h_idata, memSize, checkCudaErrors(cudaHostAlloc((void **)&h_idata, memSize, (wc) ? cudaHostAllocWriteCombined : 0));
(wc) ? cudaHostAllocWriteCombined : 0)); checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, (wc) ? cudaHostAllocWriteCombined : 0));
checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize,
(wc) ? cudaHostAllocWriteCombined : 0));
#else #else
checkCudaErrors(cudaMallocHost((void **)&h_idata, memSize)); checkCudaErrors(cudaMallocHost((void **)&h_idata, memSize));
checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize)); checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
#endif #endif
} else { }
else {
// pageable memory mode - use malloc // pageable memory mode - use malloc
h_idata = (unsigned char *)malloc(memSize); h_idata = (unsigned char *)malloc(memSize);
h_odata = (unsigned char *)malloc(memSize); h_odata = (unsigned char *)malloc(memSize);
@ -619,16 +687,15 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
checkCudaErrors(cudaMalloc((void **)&d_idata, memSize)); checkCudaErrors(cudaMalloc((void **)&d_idata, memSize));
// initialize the device memory // initialize the device memory
checkCudaErrors( checkCudaErrors(cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));
cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));
// copy data from GPU to Host // copy data from GPU to Host
if (PINNED == memMode) { if (PINNED == memMode) {
if (bDontUseGPUTiming) sdkStartTimer(&timer); if (bDontUseGPUTiming)
sdkStartTimer(&timer);
checkCudaErrors(cudaEventRecord(start, 0)); checkCudaErrors(cudaEventRecord(start, 0));
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
checkCudaErrors(cudaMemcpyAsync(h_odata, d_idata, memSize, checkCudaErrors(cudaMemcpyAsync(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost, 0));
cudaMemcpyDeviceToHost, 0));
} }
checkCudaErrors(cudaEventRecord(stop, 0)); checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
@ -638,12 +705,12 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
elapsedTimeInMs = sdkGetTimerValue(&timer); elapsedTimeInMs = sdkGetTimerValue(&timer);
sdkResetTimer(&timer); sdkResetTimer(&timer);
} }
} else { }
else {
elapsedTimeInMs = 0; elapsedTimeInMs = 0;
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
sdkStartTimer(&timer); sdkStartTimer(&timer);
checkCudaErrors( checkCudaErrors(cudaMemcpy(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost));
cudaMemcpy(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost));
sdkStopTimer(&timer); sdkStopTimer(&timer);
elapsedTimeInMs += sdkGetTimerValue(&timer); elapsedTimeInMs += sdkGetTimerValue(&timer);
sdkResetTimer(&timer); sdkResetTimer(&timer);
@ -663,7 +730,8 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
if (PINNED == memMode) { if (PINNED == memMode) {
checkCudaErrors(cudaFreeHost(h_idata)); checkCudaErrors(cudaFreeHost(h_idata));
checkCudaErrors(cudaFreeHost(h_odata)); checkCudaErrors(cudaFreeHost(h_odata));
} else { }
else {
free(h_idata); free(h_idata);
free(h_odata); free(h_odata);
} }
@ -676,8 +744,8 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
//! test the bandwidth of a host to device memcopy of a specific size //! test the bandwidth of a host to device memcopy of a specific size
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc)
bool wc) { {
StopWatchInterface *timer = NULL; StopWatchInterface *timer = NULL;
float elapsedTimeInMs = 0.0f; float elapsedTimeInMs = 0.0f;
float bandwidthInGBs = 0.0f; float bandwidthInGBs = 0.0f;
@ -692,13 +760,13 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
if (PINNED == memMode) { if (PINNED == memMode) {
#if CUDART_VERSION >= 2020 #if CUDART_VERSION >= 2020
// pinned memory mode - use special function to get OS-pinned memory // pinned memory mode - use special function to get OS-pinned memory
checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, (wc) ? cudaHostAllocWriteCombined : 0));
(wc) ? cudaHostAllocWriteCombined : 0));
#else #else
// pinned memory mode - use special function to get OS-pinned memory // pinned memory mode - use special function to get OS-pinned memory
checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize)); checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
#endif #endif
} else { }
else {
// pageable memory mode - use malloc // pageable memory mode - use malloc
h_odata = (unsigned char *)malloc(memSize); h_odata = (unsigned char *)malloc(memSize);
@ -732,11 +800,11 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
// copy host memory to device memory // copy host memory to device memory
if (PINNED == memMode) { if (PINNED == memMode) {
if (bDontUseGPUTiming) sdkStartTimer(&timer); if (bDontUseGPUTiming)
sdkStartTimer(&timer);
checkCudaErrors(cudaEventRecord(start, 0)); checkCudaErrors(cudaEventRecord(start, 0));
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
checkCudaErrors(cudaMemcpyAsync(d_idata, h_odata, memSize, checkCudaErrors(cudaMemcpyAsync(d_idata, h_odata, memSize, cudaMemcpyHostToDevice, 0));
cudaMemcpyHostToDevice, 0));
} }
checkCudaErrors(cudaEventRecord(stop, 0)); checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
@ -746,12 +814,12 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
elapsedTimeInMs = sdkGetTimerValue(&timer); elapsedTimeInMs = sdkGetTimerValue(&timer);
sdkResetTimer(&timer); sdkResetTimer(&timer);
} }
} else { }
else {
elapsedTimeInMs = 0; elapsedTimeInMs = 0;
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
sdkStartTimer(&timer); sdkStartTimer(&timer);
checkCudaErrors( checkCudaErrors(cudaMemcpy(d_idata, h_odata, memSize, cudaMemcpyHostToDevice));
cudaMemcpy(d_idata, h_odata, memSize, cudaMemcpyHostToDevice));
sdkStopTimer(&timer); sdkStopTimer(&timer);
elapsedTimeInMs += sdkGetTimerValue(&timer); elapsedTimeInMs += sdkGetTimerValue(&timer);
sdkResetTimer(&timer); sdkResetTimer(&timer);
@ -770,7 +838,8 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
if (PINNED == memMode) { if (PINNED == memMode) {
checkCudaErrors(cudaFreeHost(h_odata)); checkCudaErrors(cudaFreeHost(h_odata));
} else { }
else {
free(h_odata); free(h_odata);
} }
@ -784,7 +853,8 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
//! test the bandwidth of a device to device memcopy of a specific size //! test the bandwidth of a device to device memcopy of a specific size
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
float testDeviceToDeviceTransfer(unsigned int memSize) { float testDeviceToDeviceTransfer(unsigned int memSize)
{
StopWatchInterface *timer = NULL; StopWatchInterface *timer = NULL;
float elapsedTimeInMs = 0.0f; float elapsedTimeInMs = 0.0f;
float bandwidthInGBs = 0.0f; float bandwidthInGBs = 0.0f;
@ -814,16 +884,14 @@ float testDeviceToDeviceTransfer(unsigned int memSize) {
checkCudaErrors(cudaMalloc((void **)&d_odata, memSize)); checkCudaErrors(cudaMalloc((void **)&d_odata, memSize));
// initialize memory // initialize memory
checkCudaErrors( checkCudaErrors(cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));
cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));
// run the memcopy // run the memcopy
sdkStartTimer(&timer); sdkStartTimer(&timer);
checkCudaErrors(cudaEventRecord(start, 0)); checkCudaErrors(cudaEventRecord(start, 0));
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
checkCudaErrors( checkCudaErrors(cudaMemcpy(d_odata, d_idata, memSize, cudaMemcpyDeviceToDevice));
cudaMemcpy(d_odata, d_idata, memSize, cudaMemcpyDeviceToDevice));
} }
checkCudaErrors(cudaEventRecord(stop, 0)); checkCudaErrors(cudaEventRecord(stop, 0));
@ -860,9 +928,14 @@ float testDeviceToDeviceTransfer(unsigned int memSize) {
///////////////////////////////////////////////////////// /////////////////////////////////////////////////////////
// print results in an easily read format // print results in an easily read format
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
void printResultsReadable(unsigned int *memSizes, double *bandwidths, void printResultsReadable(unsigned int *memSizes,
unsigned int count, memcpyKind kind, double *bandwidths,
memoryMode memMode, int iNumDevs, bool wc) { unsigned int count,
memcpyKind kind,
memoryMode memMode,
int iNumDevs,
bool wc)
{
printf(" %s Bandwidth, %i Device(s)\n", sMemoryCopyKind[kind], iNumDevs); printf(" %s Bandwidth, %i Device(s)\n", sMemoryCopyKind[kind], iNumDevs);
printf(" %s Memory Transfers\n", sMemoryMode[memMode]); printf(" %s Memory Transfers\n", sMemoryMode[memMode]);
@ -874,35 +947,41 @@ void printResultsReadable(unsigned int *memSizes, double *bandwidths,
unsigned int i; unsigned int i;
for (i = 0; i < (count - 1); i++) { for (i = 0; i < (count - 1); i++) {
printf(" %u\t\t\t%s%.1f\n", memSizes[i], printf(" %u\t\t\t%s%.1f\n", memSizes[i], (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
(memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
} }
printf(" %u\t\t\t%s%.1f\n\n", memSizes[i], printf(" %u\t\t\t%s%.1f\n\n", memSizes[i], (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
(memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
} }
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// print results in a database format // print results in a database format
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
void printResultsCSV(unsigned int *memSizes, double *bandwidths, void printResultsCSV(unsigned int *memSizes,
unsigned int count, memcpyKind kind, memoryMode memMode, double *bandwidths,
int iNumDevs, bool wc) { unsigned int count,
memcpyKind kind,
memoryMode memMode,
int iNumDevs,
bool wc)
{
std::string sConfig; std::string sConfig;
// log config information // log config information
if (kind == DEVICE_TO_DEVICE) { if (kind == DEVICE_TO_DEVICE) {
sConfig += "D2D"; sConfig += "D2D";
} else { }
else {
if (kind == DEVICE_TO_HOST) { if (kind == DEVICE_TO_HOST) {
sConfig += "D2H"; sConfig += "D2H";
} else if (kind == HOST_TO_DEVICE) { }
else if (kind == HOST_TO_DEVICE) {
sConfig += "H2D"; sConfig += "H2D";
} }
if (memMode == PAGEABLE) { if (memMode == PAGEABLE) {
sConfig += "-Paged"; sConfig += "-Paged";
} else if (memMode == PINNED) { }
else if (memMode == PINNED) {
sConfig += "-Pinned"; sConfig += "-Pinned";
if (wc) { if (wc) {
@ -916,27 +995,28 @@ void printResultsCSV(unsigned int *memSizes, double *bandwidths,
for (i = 0; i < count; i++) { for (i = 0; i < count; i++) {
dSeconds = (double)memSizes[i] / (bandwidths[i] * (double)(1e9)); dSeconds = (double)memSizes[i] / (bandwidths[i] * (double)(1e9));
printf( printf("bandwidthTest-%s, Bandwidth = %.1f GB/s, Time = %.5f s, Size = %u "
"bandwidthTest-%s, Bandwidth = %.1f GB/s, Time = %.5f s, Size = %u "
"bytes, NumDevsUsed = %d\n", "bytes, NumDevsUsed = %d\n",
sConfig.c_str(), bandwidths[i], dSeconds, memSizes[i], iNumDevs); sConfig.c_str(),
bandwidths[i],
dSeconds,
memSizes[i],
iNumDevs);
} }
} }
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// Print help screen // Print help screen
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
void printHelp(void) { void printHelp(void)
{
printf("Usage: bandwidthTest [OPTION]...\n"); printf("Usage: bandwidthTest [OPTION]...\n");
printf( printf("Test the bandwidth for device to host, host to device, and device to "
"Test the bandwidth for device to host, host to device, and device to "
"device transfers\n"); "device transfers\n");
printf("\n"); printf("\n");
printf( printf("Example: measure the bandwidth of device to host pinned memory copies "
"Example: measure the bandwidth of device to host pinned memory copies "
"in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n"); "in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n");
printf( printf("./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 "
"./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 "
"--increment=1024 --dtoh\n"); "--increment=1024 --dtoh\n");
printf("\n"); printf("\n");

View File

@ -32,7 +32,6 @@
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <helper_cuda.h> #include <helper_cuda.h>
#include <iostream> #include <iostream>
#include <memory> #include <memory>
#include <string> #include <string>
@ -46,16 +45,13 @@ char **pArgv = NULL;
#include <cuda.h> #include <cuda.h>
// This function wraps the CUDA Driver API into a template function // This function wraps the CUDA Driver API into a template function
template <class T> template <class T> inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, {
int device) {
CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device); CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);
if (CUDA_SUCCESS != error) { if (CUDA_SUCCESS != error) {
fprintf( fprintf(
stderr, stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n", error, __FILE__, __LINE__);
"cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n",
error, __FILE__, __LINE__);
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
@ -66,20 +62,19 @@ inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute,
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
{
pArgc = &argc; pArgc = &argc;
pArgv = argv; pArgv = argv;
printf("%s Starting...\n\n", argv[0]); printf("%s Starting...\n\n", argv[0]);
printf( printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");
" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");
int deviceCount = 0; int deviceCount = 0;
cudaError_t error_id = cudaGetDeviceCount(&deviceCount); cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
if (error_id != cudaSuccess) { if (error_id != cudaSuccess) {
printf("cudaGetDeviceCount returned %d\n-> %s\n", printf("cudaGetDeviceCount returned %d\n-> %s\n", static_cast<int>(error_id), cudaGetErrorString(error_id));
static_cast<int>(error_id), cudaGetErrorString(error_id));
printf("Result = FAIL\n"); printf("Result = FAIL\n");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
@ -87,7 +82,8 @@ int main(int argc, char **argv) {
// This function call returns 0 if there are no CUDA capable devices. // This function call returns 0 if there are no CUDA capable devices.
if (deviceCount == 0) { if (deviceCount == 0) {
printf("There are no available device(s) that support CUDA\n"); printf("There are no available device(s) that support CUDA\n");
} else { }
else {
printf("Detected %d CUDA Capable device(s)\n", deviceCount); printf("Detected %d CUDA Capable device(s)\n", deviceCount);
} }
@ -104,20 +100,23 @@ int main(int argc, char **argv) {
cudaDriverGetVersion(&driverVersion); cudaDriverGetVersion(&driverVersion);
cudaRuntimeGetVersion(&runtimeVersion); cudaRuntimeGetVersion(&runtimeVersion);
printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n",
driverVersion / 1000, (driverVersion % 100) / 10, driverVersion / 1000,
runtimeVersion / 1000, (runtimeVersion % 100) / 10); (driverVersion % 100) / 10,
printf(" CUDA Capability Major/Minor version number: %d.%d\n", runtimeVersion / 1000,
deviceProp.major, deviceProp.minor); (runtimeVersion % 100) / 10);
printf(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor);
char msg[256]; char msg[256];
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
sprintf_s(msg, sizeof(msg), sprintf_s(msg,
sizeof(msg),
" Total amount of global memory: %.0f MBytes " " Total amount of global memory: %.0f MBytes "
"(%llu bytes)\n", "(%llu bytes)\n",
static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f), static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f),
(unsigned long long)deviceProp.totalGlobalMem); (unsigned long long)deviceProp.totalGlobalMem);
#else #else
snprintf(msg, sizeof(msg), snprintf(msg,
sizeof(msg),
" Total amount of global memory: %.0f MBytes " " Total amount of global memory: %.0f MBytes "
"(%llu bytes)\n", "(%llu bytes)\n",
static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f), static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f),
@ -128,121 +127,100 @@ int main(int argc, char **argv) {
printf(" (%03d) Multiprocessors, (%03d) CUDA Cores/MP: %d CUDA Cores\n", printf(" (%03d) Multiprocessors, (%03d) CUDA Cores/MP: %d CUDA Cores\n",
deviceProp.multiProcessorCount, deviceProp.multiProcessorCount,
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
deviceProp.multiProcessorCount); printf(" GPU Max Clock rate: %.0f MHz (%0.2f "
printf(
" GPU Max Clock rate: %.0f MHz (%0.2f "
"GHz)\n", "GHz)\n",
deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f); deviceProp.clockRate * 1e-3f,
deviceProp.clockRate * 1e-6f);
#if CUDART_VERSION >= 5000 #if CUDART_VERSION >= 5000
// This is supported in CUDA 5.0 (runtime API device properties) // This is supported in CUDA 5.0 (runtime API device properties)
printf(" Memory Clock rate: %.0f Mhz\n", printf(" Memory Clock rate: %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);
deviceProp.memoryClockRate * 1e-3f); printf(" Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth);
printf(" Memory Bus Width: %d-bit\n",
deviceProp.memoryBusWidth);
if (deviceProp.l2CacheSize) { if (deviceProp.l2CacheSize) {
printf(" L2 Cache Size: %d bytes\n", printf(" L2 Cache Size: %d bytes\n", deviceProp.l2CacheSize);
deviceProp.l2CacheSize);
} }
#else #else
// This only available in CUDA 4.0-4.2 (but these were only exposed in the // This only available in CUDA 4.0-4.2 (but these were only exposed in the
// CUDA Driver API) // CUDA Driver API)
int memoryClock; int memoryClock;
getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
dev); printf(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f);
printf(" Memory Clock rate: %.0f Mhz\n",
memoryClock * 1e-3f);
int memBusWidth; int memBusWidth;
getCudaAttribute<int>(&memBusWidth, getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev); printf(" Memory Bus Width: %d-bit\n", memBusWidth);
printf(" Memory Bus Width: %d-bit\n",
memBusWidth);
int L2CacheSize; int L2CacheSize;
getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev); getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
if (L2CacheSize) { if (L2CacheSize) {
printf(" L2 Cache Size: %d bytes\n", printf(" L2 Cache Size: %d bytes\n", L2CacheSize);
L2CacheSize);
} }
#endif #endif
printf( printf(" Maximum Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d, "
" Maximum Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d, "
"%d), 3D=(%d, %d, %d)\n", "%d), 3D=(%d, %d, %d)\n",
deviceProp.maxTexture1D, deviceProp.maxTexture2D[0], deviceProp.maxTexture1D,
deviceProp.maxTexture2D[1], deviceProp.maxTexture3D[0], deviceProp.maxTexture2D[0],
deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]); deviceProp.maxTexture2D[1],
printf( deviceProp.maxTexture3D[0],
" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n", deviceProp.maxTexture3D[1],
deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]); deviceProp.maxTexture3D[2]);
printf( printf(" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n",
" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d " deviceProp.maxTexture1DLayered[0],
deviceProp.maxTexture1DLayered[1]);
printf(" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d "
"layers\n", "layers\n",
deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[0],
deviceProp.maxTexture2DLayered[1],
deviceProp.maxTexture2DLayered[2]); deviceProp.maxTexture2DLayered[2]);
printf(" Total amount of constant memory: %zu bytes\n", printf(" Total amount of constant memory: %zu bytes\n", deviceProp.totalConstMem);
deviceProp.totalConstMem); printf(" Total amount of shared memory per block: %zu bytes\n", deviceProp.sharedMemPerBlock);
printf(" Total amount of shared memory per block: %zu bytes\n", printf(" Total shared memory per multiprocessor: %zu bytes\n", deviceProp.sharedMemPerMultiprocessor);
deviceProp.sharedMemPerBlock); printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
printf(" Total shared memory per multiprocessor: %zu bytes\n", printf(" Warp size: %d\n", deviceProp.warpSize);
deviceProp.sharedMemPerMultiprocessor); printf(" Maximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor);
printf(" Total number of registers available per block: %d\n", printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
deviceProp.regsPerBlock);
printf(" Warp size: %d\n",
deviceProp.warpSize);
printf(" Maximum number of threads per multiprocessor: %d\n",
deviceProp.maxThreadsPerMultiProcessor);
printf(" Maximum number of threads per block: %d\n",
deviceProp.maxThreadsPerBlock);
printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n", printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[0],
deviceProp.maxThreadsDim[1],
deviceProp.maxThreadsDim[2]); deviceProp.maxThreadsDim[2]);
printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n", printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n",
deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[0],
deviceProp.maxGridSize[1],
deviceProp.maxGridSize[2]); deviceProp.maxGridSize[2]);
printf(" Maximum memory pitch: %zu bytes\n", printf(" Maximum memory pitch: %zu bytes\n", deviceProp.memPitch);
deviceProp.memPitch); printf(" Texture alignment: %zu bytes\n", deviceProp.textureAlignment);
printf(" Texture alignment: %zu bytes\n", printf(" Concurrent copy and kernel execution: %s with %d copy "
deviceProp.textureAlignment);
printf(
" Concurrent copy and kernel execution: %s with %d copy "
"engine(s)\n", "engine(s)\n",
(deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount); (deviceProp.deviceOverlap ? "Yes" : "No"),
deviceProp.asyncEngineCount);
printf(" Run time limit on kernels: %s\n", printf(" Run time limit on kernels: %s\n",
deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No"); deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
printf(" Integrated GPU sharing Host Memory: %s\n", printf(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No");
deviceProp.integrated ? "Yes" : "No"); printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
printf(" Support host page-locked memory mapping: %s\n", printf(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
deviceProp.canMapHostMemory ? "Yes" : "No"); printf(" Device has ECC support: %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
printf(" Alignment requirement for Surfaces: %s\n",
deviceProp.surfaceAlignment ? "Yes" : "No");
printf(" Device has ECC support: %s\n",
deviceProp.ECCEnabled ? "Enabled" : "Disabled");
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n", printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n",
deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
: "WDDM (Windows Display Driver Model)");
#endif #endif
printf(" Device supports Unified Addressing (UVA): %s\n", printf(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
deviceProp.unifiedAddressing ? "Yes" : "No"); printf(" Device supports Managed Memory: %s\n", deviceProp.managedMemory ? "Yes" : "No");
printf(" Device supports Managed Memory: %s\n",
deviceProp.managedMemory ? "Yes" : "No");
printf(" Device supports Compute Preemption: %s\n", printf(" Device supports Compute Preemption: %s\n",
deviceProp.computePreemptionSupported ? "Yes" : "No"); deviceProp.computePreemptionSupported ? "Yes" : "No");
printf(" Supports Cooperative Kernel Launch: %s\n", printf(" Supports Cooperative Kernel Launch: %s\n", deviceProp.cooperativeLaunch ? "Yes" : "No");
deviceProp.cooperativeLaunch ? "Yes" : "No");
printf(" Supports MultiDevice Co-op Kernel Launch: %s\n", printf(" Supports MultiDevice Co-op Kernel Launch: %s\n",
deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No"); deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No");
printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n", printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n",
deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID); deviceProp.pciDomainID,
deviceProp.pciBusID,
deviceProp.pciDeviceID);
const char *sComputeMode[] = { const char *sComputeMode[] = {"Default (multiple host threads can use ::cudaSetDevice() with device "
"Default (multiple host threads can use ::cudaSetDevice() with device "
"simultaneously)", "simultaneously)",
"Exclusive (only one host thread in one process is able to use " "Exclusive (only one host thread in one process is able to use "
"::cudaSetDevice() with this device)", "::cudaSetDevice() with this device)",
@ -250,7 +228,8 @@ int main(int argc, char **argv) {
"device)", "device)",
"Exclusive Process (many threads in one process is able to use " "Exclusive Process (many threads in one process is able to use "
"::cudaSetDevice() with this device)", "::cudaSetDevice() with this device)",
"Unknown", NULL}; "Unknown",
NULL};
printf(" Compute Mode:\n"); printf(" Compute Mode:\n");
printf(" < %s >\n", sComputeMode[deviceProp.computeMode]); printf(" < %s >\n", sComputeMode[deviceProp.computeMode]);
} }
@ -286,10 +265,12 @@ int main(int argc, char **argv) {
if (gpuid[i] == gpuid[j]) { if (gpuid[i] == gpuid[j]) {
continue; continue;
} }
checkCudaErrors( checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
prop[gpuid[i]].name, gpuid[i], prop[gpuid[j]].name, gpuid[j], prop[gpuid[i]].name,
gpuid[i],
prop[gpuid[j]].name,
gpuid[j],
can_access_peer ? "Yes" : "No"); can_access_peer ? "Yes" : "No");
} }
} }
@ -306,22 +287,18 @@ int main(int argc, char **argv) {
// driver version // driver version
sProfileString += ", CUDA Driver Version = "; sProfileString += ", CUDA Driver Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
sprintf_s(cTemp, 10, "%d.%d", driverVersion / 1000, sprintf_s(cTemp, 10, "%d.%d", driverVersion / 1000, (driverVersion % 100) / 10);
(driverVersion % 100) / 10);
#else #else
snprintf(cTemp, sizeof(cTemp), "%d.%d", driverVersion / 1000, snprintf(cTemp, sizeof(cTemp), "%d.%d", driverVersion / 1000, (driverVersion % 100) / 10);
(driverVersion % 100) / 10);
#endif #endif
sProfileString += cTemp; sProfileString += cTemp;
// Runtime version // Runtime version
sProfileString += ", CUDA Runtime Version = "; sProfileString += ", CUDA Runtime Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
sprintf_s(cTemp, 10, "%d.%d", runtimeVersion / 1000, sprintf_s(cTemp, 10, "%d.%d", runtimeVersion / 1000, (runtimeVersion % 100) / 10);
(runtimeVersion % 100) / 10);
#else #else
snprintf(cTemp, sizeof(cTemp), "%d.%d", runtimeVersion / 1000, snprintf(cTemp, sizeof(cTemp), "%d.%d", runtimeVersion / 1000, (runtimeVersion % 100) / 10);
(runtimeVersion % 100) / 10);
#endif #endif
sProfileString += cTemp; sProfileString += cTemp;

View File

@ -30,17 +30,17 @@
*/ */
// includes, system // includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <cuda.h> #include <cuda.h>
#include <helper_cuda_drvapi.h> #include <helper_cuda_drvapi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
{
CUdevice dev; CUdevice dev;
int major = 0, minor = 0; int major = 0, minor = 0;
int deviceCount = 0; int deviceCount = 0;
@ -58,15 +58,14 @@ int main(int argc, char **argv) {
// This function call returns 0 if there are no CUDA capable devices. // This function call returns 0 if there are no CUDA capable devices.
if (deviceCount == 0) { if (deviceCount == 0) {
printf("There are no available device(s) that support CUDA\n"); printf("There are no available device(s) that support CUDA\n");
} else { }
else {
printf("Detected %d CUDA Capable device(s)\n", deviceCount); printf("Detected %d CUDA Capable device(s)\n", deviceCount);
} }
for (dev = 0; dev < deviceCount; ++dev) { for (dev = 0; dev < deviceCount; ++dev) {
checkCudaErrors(cuDeviceGetAttribute( checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev));
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev)); checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev));
checkCudaErrors(cuDeviceGetAttribute(
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev));
checkCudaErrors(cuDeviceGetName(deviceName, 256, dev)); checkCudaErrors(cuDeviceGetName(deviceName, 256, dev));
@ -75,9 +74,9 @@ int main(int argc, char **argv) {
int driverVersion = 0; int driverVersion = 0;
checkCudaErrors(cuDriverGetVersion(&driverVersion)); checkCudaErrors(cuDriverGetVersion(&driverVersion));
printf(" CUDA Driver Version: %d.%d\n", printf(" CUDA Driver Version: %d.%d\n",
driverVersion / 1000, (driverVersion % 100) / 10); driverVersion / 1000,
printf(" CUDA Capability Major/Minor version number: %d.%d\n", major, (driverVersion % 100) / 10);
minor); printf(" CUDA Capability Major/Minor version number: %d.%d\n", major, minor);
size_t totalGlobalMem; size_t totalGlobalMem;
checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, dev)); checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, dev));
@ -91,231 +90,169 @@ int main(int argc, char **argv) {
printf("%s", msg); printf("%s", msg);
int multiProcessorCount; int multiProcessorCount;
getCudaAttribute<int>(&multiProcessorCount, getCudaAttribute<int>(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
printf(" (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n", printf(" (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n",
multiProcessorCount, _ConvertSMVer2CoresDRV(major, minor), multiProcessorCount,
_ConvertSMVer2CoresDRV(major, minor),
_ConvertSMVer2CoresDRV(major, minor) * multiProcessorCount); _ConvertSMVer2CoresDRV(major, minor) * multiProcessorCount);
int clockRate; int clockRate;
getCudaAttribute<int>(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev); getCudaAttribute<int>(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
printf( printf(" GPU Max Clock rate: %.0f MHz (%0.2f "
" GPU Max Clock rate: %.0f MHz (%0.2f "
"GHz)\n", "GHz)\n",
clockRate * 1e-3f, clockRate * 1e-6f); clockRate * 1e-3f,
clockRate * 1e-6f);
int memoryClock; int memoryClock;
getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
dev); printf(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f);
printf(" Memory Clock rate: %.0f Mhz\n",
memoryClock * 1e-3f);
int memBusWidth; int memBusWidth;
getCudaAttribute<int>(&memBusWidth, getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev); printf(" Memory Bus Width: %d-bit\n", memBusWidth);
printf(" Memory Bus Width: %d-bit\n",
memBusWidth);
int L2CacheSize; int L2CacheSize;
getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev); getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
if (L2CacheSize) { if (L2CacheSize) {
printf(" L2 Cache Size: %d bytes\n", printf(" L2 Cache Size: %d bytes\n", L2CacheSize);
L2CacheSize);
} }
int maxTex1D, maxTex2D[2], maxTex3D[3]; int maxTex1D, maxTex2D[2], maxTex3D[3];
getCudaAttribute<int>(&maxTex1D, getCudaAttribute<int>(&maxTex1D, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, dev);
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, dev); getCudaAttribute<int>(&maxTex2D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, dev);
getCudaAttribute<int>(&maxTex2D[0], getCudaAttribute<int>(&maxTex2D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, dev);
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, dev); getCudaAttribute<int>(&maxTex3D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, dev);
getCudaAttribute<int>(&maxTex2D[1], getCudaAttribute<int>(&maxTex3D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, dev);
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, dev); getCudaAttribute<int>(&maxTex3D[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, dev);
getCudaAttribute<int>(&maxTex3D[0], printf(" Max Texture Dimension Sizes 1D=(%d) 2D=(%d, %d) "
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, dev);
getCudaAttribute<int>(&maxTex3D[1],
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, dev);
getCudaAttribute<int>(&maxTex3D[2],
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, dev);
printf(
" Max Texture Dimension Sizes 1D=(%d) 2D=(%d, %d) "
"3D=(%d, %d, %d)\n", "3D=(%d, %d, %d)\n",
maxTex1D, maxTex2D[0], maxTex2D[1], maxTex3D[0], maxTex3D[1], maxTex1D,
maxTex2D[0],
maxTex2D[1],
maxTex3D[0],
maxTex3D[1],
maxTex3D[2]); maxTex3D[2]);
int maxTex1DLayered[2]; int maxTex1DLayered[2];
getCudaAttribute<int>(&maxTex1DLayered[0], getCudaAttribute<int>(&maxTex1DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH, dev);
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH, getCudaAttribute<int>(&maxTex1DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS, dev);
dev); printf(" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n",
getCudaAttribute<int>(&maxTex1DLayered[1], maxTex1DLayered[0],
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS, maxTex1DLayered[1]);
dev);
printf(
" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n",
maxTex1DLayered[0], maxTex1DLayered[1]);
int maxTex2DLayered[3]; int maxTex2DLayered[3];
getCudaAttribute<int>(&maxTex2DLayered[0], getCudaAttribute<int>(&maxTex2DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH, dev);
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH, getCudaAttribute<int>(&maxTex2DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT, dev);
dev); getCudaAttribute<int>(&maxTex2DLayered[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS, dev);
getCudaAttribute<int>(&maxTex2DLayered[1], printf(" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d "
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT,
dev);
getCudaAttribute<int>(&maxTex2DLayered[2],
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS,
dev);
printf(
" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d "
"layers\n", "layers\n",
maxTex2DLayered[0], maxTex2DLayered[1], maxTex2DLayered[2]); maxTex2DLayered[0],
maxTex2DLayered[1],
maxTex2DLayered[2]);
int totalConstantMemory; int totalConstantMemory;
getCudaAttribute<int>(&totalConstantMemory, getCudaAttribute<int>(&totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev);
CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev); printf(" Total amount of constant memory: %u bytes\n", totalConstantMemory);
printf(" Total amount of constant memory: %u bytes\n",
totalConstantMemory);
int sharedMemPerBlock; int sharedMemPerBlock;
getCudaAttribute<int>(&sharedMemPerBlock, getCudaAttribute<int>(&sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev);
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev); printf(" Total amount of shared memory per block: %u bytes\n", sharedMemPerBlock);
printf(" Total amount of shared memory per block: %u bytes\n",
sharedMemPerBlock);
int regsPerBlock; int regsPerBlock;
getCudaAttribute<int>(&regsPerBlock, getCudaAttribute<int>(&regsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev); printf(" Total number of registers available per block: %d\n", regsPerBlock);
printf(" Total number of registers available per block: %d\n",
regsPerBlock);
int warpSize; int warpSize;
getCudaAttribute<int>(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev); getCudaAttribute<int>(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
printf(" Warp size: %d\n", warpSize); printf(" Warp size: %d\n", warpSize);
int maxThreadsPerMultiProcessor; int maxThreadsPerMultiProcessor;
getCudaAttribute<int>(&maxThreadsPerMultiProcessor, getCudaAttribute<int>(&maxThreadsPerMultiProcessor, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, printf(" Maximum number of threads per multiprocessor: %d\n", maxThreadsPerMultiProcessor);
dev);
printf(" Maximum number of threads per multiprocessor: %d\n",
maxThreadsPerMultiProcessor);
int maxThreadsPerBlock; int maxThreadsPerBlock;
getCudaAttribute<int>(&maxThreadsPerBlock, getCudaAttribute<int>(&maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev); printf(" Maximum number of threads per block: %d\n", maxThreadsPerBlock);
printf(" Maximum number of threads per block: %d\n",
maxThreadsPerBlock);
int blockDim[3]; int blockDim[3];
getCudaAttribute<int>(&blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, getCudaAttribute<int>(&blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev);
dev); getCudaAttribute<int>(&blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev);
getCudaAttribute<int>(&blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, getCudaAttribute<int>(&blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev);
dev); printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n", blockDim[0], blockDim[1], blockDim[2]);
getCudaAttribute<int>(&blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,
dev);
printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
blockDim[0], blockDim[1], blockDim[2]);
int gridDim[3]; int gridDim[3];
getCudaAttribute<int>(&gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev); getCudaAttribute<int>(&gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev);
getCudaAttribute<int>(&gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev); getCudaAttribute<int>(&gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev);
getCudaAttribute<int>(&gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev); getCudaAttribute<int>(&gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev);
printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n", printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n", gridDim[0], gridDim[1], gridDim[2]);
gridDim[0], gridDim[1], gridDim[2]);
int textureAlign; int textureAlign;
getCudaAttribute<int>(&textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, getCudaAttribute<int>(&textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev);
dev); printf(" Texture alignment: %u bytes\n", textureAlign);
printf(" Texture alignment: %u bytes\n",
textureAlign);
int memPitch; int memPitch;
getCudaAttribute<int>(&memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev); getCudaAttribute<int>(&memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev);
printf(" Maximum memory pitch: %u bytes\n", printf(" Maximum memory pitch: %u bytes\n", memPitch);
memPitch);
int gpuOverlap; int gpuOverlap;
getCudaAttribute<int>(&gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev); getCudaAttribute<int>(&gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
int asyncEngineCount; int asyncEngineCount;
getCudaAttribute<int>(&asyncEngineCount, getCudaAttribute<int>(&asyncEngineCount, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev); printf(" Concurrent copy and kernel execution: %s with %d copy "
printf(
" Concurrent copy and kernel execution: %s with %d copy "
"engine(s)\n", "engine(s)\n",
(gpuOverlap ? "Yes" : "No"), asyncEngineCount); (gpuOverlap ? "Yes" : "No"),
asyncEngineCount);
int kernelExecTimeoutEnabled; int kernelExecTimeoutEnabled;
getCudaAttribute<int>(&kernelExecTimeoutEnabled, getCudaAttribute<int>(&kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev);
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev); printf(" Run time limit on kernels: %s\n", kernelExecTimeoutEnabled ? "Yes" : "No");
printf(" Run time limit on kernels: %s\n",
kernelExecTimeoutEnabled ? "Yes" : "No");
int integrated; int integrated;
getCudaAttribute<int>(&integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev); getCudaAttribute<int>(&integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
printf(" Integrated GPU sharing Host Memory: %s\n", printf(" Integrated GPU sharing Host Memory: %s\n", integrated ? "Yes" : "No");
integrated ? "Yes" : "No");
int canMapHostMemory; int canMapHostMemory;
getCudaAttribute<int>(&canMapHostMemory, getCudaAttribute<int>(&canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev); printf(" Support host page-locked memory mapping: %s\n", canMapHostMemory ? "Yes" : "No");
printf(" Support host page-locked memory mapping: %s\n",
canMapHostMemory ? "Yes" : "No");
int concurrentKernels; int concurrentKernels;
getCudaAttribute<int>(&concurrentKernels, getCudaAttribute<int>(&concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev); printf(" Concurrent kernel execution: %s\n", concurrentKernels ? "Yes" : "No");
printf(" Concurrent kernel execution: %s\n",
concurrentKernels ? "Yes" : "No");
int surfaceAlignment; int surfaceAlignment;
getCudaAttribute<int>(&surfaceAlignment, getCudaAttribute<int>(&surfaceAlignment, CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, dev);
CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, dev); printf(" Alignment requirement for Surfaces: %s\n", surfaceAlignment ? "Yes" : "No");
printf(" Alignment requirement for Surfaces: %s\n",
surfaceAlignment ? "Yes" : "No");
int eccEnabled; int eccEnabled;
getCudaAttribute<int>(&eccEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev); getCudaAttribute<int>(&eccEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev);
printf(" Device has ECC support: %s\n", printf(" Device has ECC support: %s\n", eccEnabled ? "Enabled" : "Disabled");
eccEnabled ? "Enabled" : "Disabled");
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
int tccDriver; int tccDriver;
getCudaAttribute<int>(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev); getCudaAttribute<int>(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev);
printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n", printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n",
tccDriver ? "TCC (Tesla Compute Cluster Driver)" tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
: "WDDM (Windows Display Driver Model)");
#endif #endif
int unifiedAddressing; int unifiedAddressing;
getCudaAttribute<int>(&unifiedAddressing, getCudaAttribute<int>(&unifiedAddressing, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev); printf(" Device supports Unified Addressing (UVA): %s\n", unifiedAddressing ? "Yes" : "No");
printf(" Device supports Unified Addressing (UVA): %s\n",
unifiedAddressing ? "Yes" : "No");
int managedMemory; int managedMemory;
getCudaAttribute<int>(&managedMemory, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, getCudaAttribute<int>(&managedMemory, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, dev);
dev); printf(" Device supports Managed Memory: %s\n", managedMemory ? "Yes" : "No");
printf(" Device supports Managed Memory: %s\n",
managedMemory ? "Yes" : "No");
int computePreemption; int computePreemption;
getCudaAttribute<int>(&computePreemption, getCudaAttribute<int>(&computePreemption, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, dev);
CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, printf(" Device supports Compute Preemption: %s\n", computePreemption ? "Yes" : "No");
dev);
printf(" Device supports Compute Preemption: %s\n",
computePreemption ? "Yes" : "No");
int cooperativeLaunch; int cooperativeLaunch;
getCudaAttribute<int>(&cooperativeLaunch, getCudaAttribute<int>(&cooperativeLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, dev);
CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, dev); printf(" Supports Cooperative Kernel Launch: %s\n", cooperativeLaunch ? "Yes" : "No");
printf(" Supports Cooperative Kernel Launch: %s\n",
cooperativeLaunch ? "Yes" : "No");
int cooperativeMultiDevLaunch; int cooperativeMultiDevLaunch;
getCudaAttribute<int>(&cooperativeMultiDevLaunch, getCudaAttribute<int>(&cooperativeMultiDevLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH, dev);
CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH, printf(" Supports MultiDevice Co-op Kernel Launch: %s\n", cooperativeMultiDevLaunch ? "Yes" : "No");
dev);
printf(" Supports MultiDevice Co-op Kernel Launch: %s\n",
cooperativeMultiDevLaunch ? "Yes" : "No");
int pciDomainID, pciBusID, pciDeviceID; int pciDomainID, pciBusID, pciDeviceID;
getCudaAttribute<int>(&pciDomainID, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev); getCudaAttribute<int>(&pciDomainID, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev);
getCudaAttribute<int>(&pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev); getCudaAttribute<int>(&pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev);
getCudaAttribute<int>(&pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev); getCudaAttribute<int>(&pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev);
printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n", printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n", pciDomainID, pciBusID, pciDeviceID);
pciDomainID, pciBusID, pciDeviceID);
const char *sComputeMode[] = { const char *sComputeMode[] = {"Default (multiple host threads can use ::cudaSetDevice() with device "
"Default (multiple host threads can use ::cudaSetDevice() with device "
"simultaneously)", "simultaneously)",
"Exclusive (only one host thread in one process is able to use " "Exclusive (only one host thread in one process is able to use "
"::cudaSetDevice() with this device)", "::cudaSetDevice() with this device)",
@ -323,7 +260,8 @@ int main(int argc, char **argv) {
"device)", "device)",
"Exclusive Process (many threads in one process is able to use " "Exclusive Process (many threads in one process is able to use "
"::cudaSetDevice() with this device)", "::cudaSetDevice() with this device)",
"Unknown", NULL}; "Unknown",
NULL};
int computeMode; int computeMode;
getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev); getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
@ -338,10 +276,8 @@ int main(int argc, char **argv) {
int tccDriver = 0; int tccDriver = 0;
for (int i = 0; i < deviceCount; i++) { for (int i = 0; i < deviceCount; i++) {
checkCudaErrors(cuDeviceGetAttribute( checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, i));
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, i)); checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, i));
checkCudaErrors(cuDeviceGetAttribute(
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, i));
getCudaAttribute<int>(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, i); getCudaAttribute<int>(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, i);
// Only boards based on Fermi or later can support P2P // Only boards based on Fermi or later can support P2P
@ -367,14 +303,15 @@ int main(int argc, char **argv) {
if (gpuid[i] == gpuid[j]) { if (gpuid[i] == gpuid[j]) {
continue; continue;
} }
checkCudaErrors( checkCudaErrors(cuDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
cuDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
checkCudaErrors(cuDeviceGetName(deviceName0, 256, gpuid[i])); checkCudaErrors(cuDeviceGetName(deviceName0, 256, gpuid[i]));
checkCudaErrors(cuDeviceGetName(deviceName1, 256, gpuid[j])); checkCudaErrors(cuDeviceGetName(deviceName1, 256, gpuid[j]));
printf( printf("> Peer-to-Peer (P2P) access from %s (GPU%d) -> %s (GPU%d) : "
"> Peer-to-Peer (P2P) access from %s (GPU%d) -> %s (GPU%d) : "
"%s\n", "%s\n",
deviceName0, gpuid[i], deviceName1, gpuid[j], deviceName0,
gpuid[i],
deviceName1,
gpuid[j],
can_access_peer ? "Yes" : "No"); can_access_peer ? "Yes" : "No");
} }
} }

View File

@ -30,4 +30,3 @@ cudaGetDeviceCount, cudaDeviceGetAttribute
Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## References (for more details) ## References (for more details)

View File

@ -37,32 +37,30 @@
#include <helper_cuda.h> #include <helper_cuda.h>
#include <helper_functions.h> // helper for shared that are common to CUDA Samples #include <helper_functions.h> // helper for shared that are common to CUDA Samples
int main(int argc, char **argv) { int main(int argc, char **argv)
{
int deviceCount = 0; int deviceCount = 0;
checkCudaErrors(cudaGetDeviceCount(&deviceCount)); checkCudaErrors(cudaGetDeviceCount(&deviceCount));
// Enumerates Device <-> Device links // Enumerates Device <-> Device links
for (int device1 = 0; device1 < deviceCount; device1++) { for (int device1 = 0; device1 < deviceCount; device1++) {
for (int device2 = 0; device2 < deviceCount; device2++) { for (int device2 = 0; device2 < deviceCount; device2++) {
if (device1 == device2) continue; if (device1 == device2)
continue;
int perfRank = 0; int perfRank = 0;
int atomicSupported = 0; int atomicSupported = 0;
int accessSupported = 0; int accessSupported = 0;
checkCudaErrors(cudaDeviceGetP2PAttribute( checkCudaErrors(
&accessSupported, cudaDevP2PAttrAccessSupported, device1, device2)); cudaDeviceGetP2PAttribute(&accessSupported, cudaDevP2PAttrAccessSupported, device1, device2));
checkCudaErrors(cudaDeviceGetP2PAttribute( checkCudaErrors(cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2));
&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2)); checkCudaErrors(
checkCudaErrors(cudaDeviceGetP2PAttribute( cudaDeviceGetP2PAttribute(&atomicSupported, cudaDevP2PAttrNativeAtomicSupported, device1, device2));
&atomicSupported, cudaDevP2PAttrNativeAtomicSupported, device1,
device2));
if (accessSupported) { if (accessSupported) {
std::cout << "GPU" << device1 << " <-> GPU" << device2 << ":" std::cout << "GPU" << device1 << " <-> GPU" << device2 << ":" << std::endl;
<< std::endl; std::cout << " * Atomic Supported: " << (atomicSupported ? "yes" : "no") << std::endl;
std::cout << " * Atomic Supported: "
<< (atomicSupported ? "yes" : "no") << std::endl;
std::cout << " * Perf Rank: " << perfRank << std::endl; std::cout << " * Perf Rank: " << perfRank << std::endl;
} }
} }
@ -71,11 +69,9 @@ int main(int argc, char **argv) {
// Enumerates Device <-> Host links // Enumerates Device <-> Host links
for (int device = 0; device < deviceCount; device++) { for (int device = 0; device < deviceCount; device++) {
int atomicSupported = 0; int atomicSupported = 0;
checkCudaErrors(cudaDeviceGetAttribute( checkCudaErrors(cudaDeviceGetAttribute(&atomicSupported, cudaDevAttrHostNativeAtomicSupported, device));
&atomicSupported, cudaDevAttrHostNativeAtomicSupported, device));
std::cout << "GPU" << device << " <-> CPU:" << std::endl; std::cout << "GPU" << device << " <-> CPU:" << std::endl;
std::cout << " * Atomic Supported: " << (atomicSupported ? "yes" : "no") std::cout << " * Atomic Supported: " << (atomicSupported ? "yes" : "no") << std::endl;
<< std::endl;
} }
return 0; return 0;

View File

@ -37,4 +37,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in [Dependencies]() section above are installed. Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## References (for more details) ## References (for more details)

View File

@ -29,12 +29,14 @@
// DESCRIPTION: Simple CUDA consumer rendering sample app // DESCRIPTION: Simple CUDA consumer rendering sample app
// //
#include <cuda_runtime.h>
#include "cuda_consumer.h" #include "cuda_consumer.h"
#include "eglstrm_common.h"
#include <cuda_runtime.h>
#include <math.h> #include <math.h>
#include <unistd.h> #include <unistd.h>
#include "eglstrm_common.h"
#if defined(EXTENSION_LIST) #if defined(EXTENSION_LIST)
EXTENSION_LIST(EXTLST_EXTERN) EXTENSION_LIST(EXTLST_EXTERN)
#endif #endif
@ -47,19 +49,22 @@ static int count_rel = 0;
static double rel_time[25000] = {0}, total_time_rel = 0; static double rel_time[25000] = {0}, total_time_rel = 0;
void acquireApiStat(void); void acquireApiStat(void);
void acquireApiStat(void) { void acquireApiStat(void)
{
int i = 0; int i = 0;
double min = 10000000, max = 0; double min = 10000000, max = 0;
double average_launch_time = 0, standard_deviation = 0; double average_launch_time = 0, standard_deviation = 0;
if (count_acq == 0) return; if (count_acq == 0)
return;
// lets compute the standard deviation // lets compute the standard deviation
min = max = acquire_time[1]; min = max = acquire_time[1];
average_launch_time = (total_time_acq - acquire_time[0]) / count_acq; average_launch_time = (total_time_acq - acquire_time[0]) / count_acq;
for (i = 1; i < count_acq; i++) { for (i = 1; i < count_acq; i++) {
standard_deviation += (acquire_time[i] - average_launch_time) * standard_deviation += (acquire_time[i] - average_launch_time) * (acquire_time[i] - average_launch_time);
(acquire_time[i] - average_launch_time); if (acquire_time[i] < min)
if (acquire_time[i] < min) min = acquire_time[i]; min = acquire_time[i];
if (acquire_time[i] > max) max = acquire_time[i]; if (acquire_time[i] > max)
max = acquire_time[i];
} }
standard_deviation = sqrt(standard_deviation / count_acq); standard_deviation = sqrt(standard_deviation / count_acq);
printf("acquire Avg: %lf\n", average_launch_time); printf("acquire Avg: %lf\n", average_launch_time);
@ -70,10 +75,11 @@ void acquireApiStat(void) {
min = max = rel_time[1]; min = max = rel_time[1];
average_launch_time = (total_time_rel - rel_time[0]) / count_rel; average_launch_time = (total_time_rel - rel_time[0]) / count_rel;
for (i = 1; i < count_rel; i++) { for (i = 1; i < count_rel; i++) {
standard_deviation += (rel_time[i] - average_launch_time) * standard_deviation += (rel_time[i] - average_launch_time) * (rel_time[i] - average_launch_time);
(rel_time[i] - average_launch_time); if (rel_time[i] < min)
if (rel_time[i] < min) min = rel_time[i]; min = rel_time[i];
if (rel_time[i] > max) max = rel_time[i]; if (rel_time[i] > max)
max = rel_time[i];
} }
standard_deviation = sqrt(standard_deviation / count_rel); standard_deviation = sqrt(standard_deviation / count_rel);
printf("release Avg: %lf\n", average_launch_time); printf("release Avg: %lf\n", average_launch_time);
@ -81,8 +87,8 @@ void acquireApiStat(void) {
printf("release min: %lf\n", min); printf("release min: %lf\n", min);
printf("release max: %lf\n", max); printf("release max: %lf\n", max);
} }
CUresult cudaConsumerAcquireFrame(test_cuda_consumer_s *cudaConsumer, CUresult cudaConsumerAcquireFrame(test_cuda_consumer_s *cudaConsumer, int frameNumber)
int frameNumber) { {
CUresult cuStatus = CUDA_SUCCESS; CUresult cuStatus = CUDA_SUCCESS;
CUeglFrame cudaEgl; CUeglFrame cudaEgl;
struct timespec start, end; struct timespec start, end;
@ -95,8 +101,7 @@ CUresult cudaConsumerAcquireFrame(test_cuda_consumer_s *cudaConsumer,
} }
while (1) { while (1) {
if (!eglQueryStreamKHR(cudaConsumer->eglDisplay, cudaConsumer->eglStream, if (!eglQueryStreamKHR(cudaConsumer->eglDisplay, cudaConsumer->eglStream, EGL_STREAM_STATE_KHR, &streamState)) {
EGL_STREAM_STATE_KHR, &streamState)) {
printf("Cuda Consumer: eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n"); printf("Cuda Consumer: eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n");
cuStatus = CUDA_ERROR_UNKNOWN; cuStatus = CUDA_ERROR_UNKNOWN;
goto done; goto done;
@ -115,33 +120,35 @@ CUresult cudaConsumerAcquireFrame(test_cuda_consumer_s *cudaConsumer,
getTime(&start); getTime(&start);
} }
cuStatus = cuStatus =
cuEGLStreamConsumerAcquireFrame(&(cudaConsumer->cudaConn), &cudaResource, cuEGLStreamConsumerAcquireFrame(&(cudaConsumer->cudaConn), &cudaResource, &cudaConsumer->consCudaStream, 16000);
&cudaConsumer->consCudaStream, 16000);
if (cudaConsumer->profileAPI) { if (cudaConsumer->profileAPI) {
getTime(&end); getTime(&end);
curTime = TIME_DIFF(end, start); curTime = TIME_DIFF(end, start);
acquire_time[count_acq++] = curTime; acquire_time[count_acq++] = curTime;
if (count_acq == 25000) count_acq = 0; if (count_acq == 25000)
count_acq = 0;
total_time_acq += curTime; total_time_acq += curTime;
} }
if (cuStatus == CUDA_SUCCESS) { if (cuStatus == CUDA_SUCCESS) {
CUdeviceptr pDevPtr = 0; CUdeviceptr pDevPtr = 0;
cudaError_t err; cudaError_t err;
cuStatus = cuStatus = cuGraphicsResourceGetMappedEglFrame(&cudaEgl, cudaResource, 0, 0);
cuGraphicsResourceGetMappedEglFrame(&cudaEgl, cudaResource, 0, 0);
if (cuStatus != CUDA_SUCCESS) { if (cuStatus != CUDA_SUCCESS) {
printf("Cuda get resource failed with %d\n", cuStatus); printf("Cuda get resource failed with %d\n", cuStatus);
goto done; goto done;
} }
pDevPtr = (CUdeviceptr)cudaEgl.frame.pPitch[0]; pDevPtr = (CUdeviceptr)cudaEgl.frame.pPitch[0];
err = cudaConsumer_filter(cudaConsumer->consCudaStream, (char *)pDevPtr, err = cudaConsumer_filter(cudaConsumer->consCudaStream,
WIDTH * 4, HEIGHT, PROD_DATA + frameNumber, (char *)pDevPtr,
CONS_DATA + frameNumber, frameNumber); WIDTH * 4,
HEIGHT,
PROD_DATA + frameNumber,
CONS_DATA + frameNumber,
frameNumber);
if (err != cudaSuccess) { if (err != cudaSuccess) {
printf("Cuda Consumer: kernel failed with: %s\n", printf("Cuda Consumer: kernel failed with: %s\n", cudaGetErrorString(err));
cudaGetErrorString(err));
goto done; goto done;
} }
} }
@ -150,8 +157,8 @@ done:
return cuStatus; return cuStatus;
} }
CUresult cudaConsumerReleaseFrame(test_cuda_consumer_s *cudaConsumer, CUresult cudaConsumerReleaseFrame(test_cuda_consumer_s *cudaConsumer, int frameNumber)
int frameNumber) { {
CUresult cuStatus = CUDA_SUCCESS; CUresult cuStatus = CUDA_SUCCESS;
struct timespec start, end; struct timespec start, end;
double curTime; double curTime;
@ -163,13 +170,13 @@ CUresult cudaConsumerReleaseFrame(test_cuda_consumer_s *cudaConsumer,
if (cudaConsumer->profileAPI) { if (cudaConsumer->profileAPI) {
getTime(&start); getTime(&start);
} }
cuStatus = cuEGLStreamConsumerReleaseFrame( cuStatus = cuEGLStreamConsumerReleaseFrame(&cudaConsumer->cudaConn, cudaResource, &cudaConsumer->consCudaStream);
&cudaConsumer->cudaConn, cudaResource, &cudaConsumer->consCudaStream);
if (cudaConsumer->profileAPI) { if (cudaConsumer->profileAPI) {
getTime(&end); getTime(&end);
curTime = TIME_DIFF(end, start); curTime = TIME_DIFF(end, start);
rel_time[count_rel++] = curTime; rel_time[count_rel++] = curTime;
if (count_rel == 25000) count_rel = 0; if (count_rel == 25000)
count_rel = 0;
total_time_rel += curTime; total_time_rel += curTime;
} }
if (cuStatus != CUDA_SUCCESS) { if (cuStatus != CUDA_SUCCESS) {
@ -181,7 +188,8 @@ done:
return cuStatus; return cuStatus;
} }
CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer) { CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer)
{
CUdevice device; CUdevice device;
CUresult status = CUDA_SUCCESS; CUresult status = CUDA_SUCCESS;
@ -190,34 +198,31 @@ CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer) {
return status; return status;
} }
if (CUDA_SUCCESS != if (CUDA_SUCCESS != (status = cuDeviceGet(&device, cudaConsumer->cudaDevId))) {
(status = cuDeviceGet(&device, cudaConsumer->cudaDevId))) {
printf("failed to get CUDA device\n"); printf("failed to get CUDA device\n");
return status; return status;
} }
if (CUDA_SUCCESS != if (CUDA_SUCCESS != (status = cuCtxCreate(&cudaConsumer->context, 0, device))) {
(status = cuCtxCreate(&cudaConsumer->context, 0, device))) {
printf("failed to create CUDA context\n"); printf("failed to create CUDA context\n");
return status; return status;
} }
int major = 0, minor = 0; int major = 0, minor = 0;
char deviceName[256]; char deviceName[256];
cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
device); cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
device);
cuDeviceGetName(deviceName, 256, device); cuDeviceGetName(deviceName, 256, device);
printf( printf("CUDA Consumer on GPU Device %d: \"%s\" with compute capability "
"CUDA Consumer on GPU Device %d: \"%s\" with compute capability "
"%d.%d\n\n", "%d.%d\n\n",
device, deviceName, major, minor); device,
deviceName,
major,
minor);
cuCtxPopCurrent(&cudaConsumer->context); cuCtxPopCurrent(&cudaConsumer->context);
if (major < 6) { if (major < 6) {
printf( printf("EGLStream_CUDA_CrossGPU requires SM 6.0 or higher arch GPU. "
"EGLStream_CUDA_CrossGPU requires SM 6.0 or higher arch GPU. "
"Exiting...\n"); "Exiting...\n");
exit(2); // EXIT_WAIVED exit(2); // EXIT_WAIVED
} }
@ -225,8 +230,8 @@ CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer) {
return status; return status;
} }
CUresult cuda_consumer_init(test_cuda_consumer_s *cudaConsumer, CUresult cuda_consumer_init(test_cuda_consumer_s *cudaConsumer, TestArgs *args)
TestArgs *args) { {
CUresult status = CUDA_SUCCESS; CUresult status = CUDA_SUCCESS;
int bufferSize; int bufferSize;
@ -250,7 +255,8 @@ done:
return status; return status;
} }
CUresult cuda_consumer_Deinit(test_cuda_consumer_s *cudaConsumer) { CUresult cuda_consumer_Deinit(test_cuda_consumer_s *cudaConsumer)
{
if (cudaConsumer->pCudaCopyMem) { if (cudaConsumer->pCudaCopyMem) {
free(cudaConsumer->pCudaCopyMem); free(cudaConsumer->pCudaCopyMem);
} }

View File

@ -32,15 +32,17 @@
#ifndef _CUDA_CONSUMER_H_ #ifndef _CUDA_CONSUMER_H_
#define _CUDA_CONSUMER_H_ #define _CUDA_CONSUMER_H_
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include "cudaEGL.h" #include "cudaEGL.h"
#include "eglstrm_common.h" #include "eglstrm_common.h"
#include <cuda_runtime.h>
#include <cuda.h>
typedef struct _test_cuda_consumer_s { typedef struct _test_cuda_consumer_s
{
CUcontext context; CUcontext context;
CUeglStreamConnection cudaConn; CUeglStreamConnection cudaConn;
int cudaDevId; int cudaDevId;
@ -58,8 +60,12 @@ CUresult cuda_consumer_Deinit(test_cuda_consumer_s *cudaConsumer);
CUresult cudaConsumerAcquireFrame(test_cuda_consumer_s *data, int frameNumber); CUresult cudaConsumerAcquireFrame(test_cuda_consumer_s *data, int frameNumber);
CUresult cudaConsumerReleaseFrame(test_cuda_consumer_s *data, int frameNumber); CUresult cudaConsumerReleaseFrame(test_cuda_consumer_s *data, int frameNumber);
CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer); CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer);
cudaError_t cudaConsumer_filter(CUstream cStream, char *pSrc, int width, cudaError_t cudaConsumer_filter(CUstream cStream,
int height, char expectedVal, char newVal, char *pSrc,
int width,
int height,
char expectedVal,
char newVal,
int frameNumber); int frameNumber);
cudaError_t cudaGetValueMismatch(void); cudaError_t cudaGetValueMismatch(void);

Some files were not shown because too many files have changed in this diff Show More