Mirror of https://github.com/NVIDIA/cuda-samples.git
Synced 2025-11-04 15:47:50 +08:00
Commit eddc6fd7e1: Merge branch 'master' into cuda_a_dev
.clang-format (new file, 49 lines)

@@ -0,0 +1,49 @@
+---
+AccessModifierOffset: -4
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: Consecutive
+AlignConsecutiveDeclarations: Consecutive
+AlignConsecutiveMacros: Consecutive
+AlignEscapedNewlines: Left
+AlignOperands: AlignAfterOperator
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: false
+BinPackArguments: false
+BinPackParameters: false
+BraceWrapping:
+    AfterClass: true
+    AfterControlStatement: false
+    AfterExternBlock: true
+    AfterFunction: true
+    AfterStruct: true
+    AfterUnion: true
+    BeforeCatch: true
+    BeforeElse: true
+    IndentBraces: false
+BreakBeforeBraces: Custom
+BreakBeforeConceptDeclarations: true
+BreakBeforeBinaryOperators: NonAssignment
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializers: BeforeComma
+BreakInheritanceList: BeforeComma
+ColumnLimit: 120
+DerivePointerAlignment: false
+FixNamespaceComments: true
+IncludeCategories:
+  - Regex:           '^<.*>'
+    Priority:        1
+  - Regex:           '^".*"'
+    Priority:        2
+SortIncludes: true
+IncludeBlocks: Regroup
+IndentWidth: 4
+MaxEmptyLinesToKeep: 2
+PointerAlignment: Right
+SortUsingDeclarations: true
+SpaceAfterCStyleCast: false
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: ControlStatements
+Standard: c++17
+TabWidth: 4
+UseTab: Never
+...
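To illustrate the style this file enforces — braces on their own line after functions and aggregates, 4-space indents, pointers bound to the right, and consecutive declarations and assignments aligned in columns — here is a small hypothetical C++ snippet (not part of the commit) laid out the way clang-format would emit it under this configuration:

// Hypothetical example, formatted per the .clang-format above.
struct DeviceBuffer
{
    float       *data; // AlignConsecutiveDeclarations pads names into a column
    unsigned int size; // PointerAlignment: Right binds '*' to the name
};

static int scaleAll(int *values, int count, int factor)
{
    int total  = 0; // AlignConsecutiveAssignments pads '=' into a column
    int scaled = 0;

    for (int i = 0; i < count; i++) { // AfterControlStatement: false keeps '{' here
        scaled    = values[i] * factor;
        values[i] = scaled;
        total     = total + scaled;
    }
    return total;
}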
.pre-commit-config.yaml (new file, 100 lines)

@@ -0,0 +1,100 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+ci:
+    autofix_commit_msg: |
+      [pre-commit.ci] auto code formatting
+    autofix_prs: false
+    autoupdate_branch: ''
+    autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate'
+    autoupdate_schedule: quarterly
+    skip: []
+    submodules: false
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
+    hooks:
+      - id: end-of-file-fixer
+        exclude: |
+          (?x)^(
+            .*\.raw$|
+            .*\.bin$|
+            .*\.dat$|
+            .*\.nv12$|
+            data/.*|
+            Common/.*
+          )
+        files: |
+          (?x)^(
+            .*\.txt$|
+            .*\.md$|
+            .*\.cpp$|
+            .*\.cxx$|
+            .*\.hpp$|
+            .*\.h$|
+            .*\.cu$|
+            .*\.cuh$
+          )
+      - id: mixed-line-ending
+        exclude: |
+          (?x)^(
+            .*\.raw$|
+            .*\.bin$|
+            .*\.dat$|
+            .*\.nv12$|
+            data/.*|
+            Common/.*
+          )
+        files: |
+          (?x)^(
+            .*\.txt$|
+            .*\.md$|
+            .*\.cpp$|
+            .*\.cxx$|
+            .*\.hpp$|
+            .*\.h$|
+            .*\.cu$|
+            .*\.cuh$
+          )
+      - id: trailing-whitespace
+        exclude: |
+          (?x)^(
+            .*\.raw$|
+            .*\.bin$|
+            .*\.dat$|
+            .*\.nv12$|
+            data/.*|
+            Common/.*
+          )
+        files: |
+          (?x)^(
+            .*\.txt$|
+            .*\.md$|
+            .*\.cpp$|
+            .*\.cxx$|
+            .*\.hpp$|
+            .*\.h$|
+            .*\.cu$|
+            .*\.cuh$
+          )
+  - repo: https://github.com/pre-commit/mirrors-clang-format
+    rev: v19.1.6
+    hooks:
+      - id: clang-format
+        types_or: [file]
+        files: |
+          (?x)^(
+            ^.*\.c$|
+            ^.*\.cpp$|
+            ^.*\.cu$|
+            ^.*\.cuh$|
+            ^.*\.cxx$|
+            ^.*\.h$|
+            ^.*\.hpp$|
+            ^.*\.inl$|
+            ^.*\.mm$
+          )
+        exclude: |
+          (?x)^(
+            Common/.*
+          )
+        args: ["-fallback-style=none", "-style=file", "-i"]
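For context (standard pre-commit usage, not shown in the commit): `pre-commit install` registers these hooks in a clone's `.git/hooks`, and `pre-commit run --all-files` applies them to the whole tree — the three whitespace fixers plus clang-format v19.1.6, which with `-style=file -i` rewrites C/C++/CUDA sources in place according to the `.clang-format` file added above, skipping the excluded `Common/` and data paths.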
README.md
@@ -1,6 +1,6 @@
 # CUDA Samples
 
-Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 12.8](https://developer.nvidia.com/cuda-downloads).
+Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 12.9](https://developer.nvidia.com/cuda-downloads).
 
 ## Release Notes
UnifiedMemoryStreams.cu
@@ -31,10 +31,10 @@
  */
 
 // system includes
+#include <algorithm>
 #include <cstdio>
 #include <ctime>
 #include <vector>
-#include <algorithm>
 #ifdef USE_PTHREADS
 #include <pthread.h>
 #else
@@ -51,291 +51,287 @@
 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 // SRAND48 and DRAND48 don't exist on windows, but these are the equivalent
 // functions
-void srand48(long seed) { srand((unsigned int)seed); }
+void   srand48(long seed) { srand((unsigned int)seed); }
 double drand48() { return double(rand()) / RAND_MAX; }
 #endif
 
 const char *sSDKname = "UnifiedMemoryStreams";
 
 // simple task
-template <typename T>
-struct Task {
-  unsigned int size, id;
-  T *data;
-  T *result;
-  T *vector;
+template <typename T> struct Task
+{
+    unsigned int size, id;
+    T           *data;
+    T           *result;
+    T           *vector;
 
-  Task() : size(0), id(0), data(NULL), result(NULL), vector(NULL){};
-  Task(unsigned int s) : size(s), id(0), data(NULL), result(NULL) {
-    // allocate unified memory -- the operation performed in this example will
-    // be a DGEMV
-    checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
-    checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
-    checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
-    checkCudaErrors(cudaDeviceSynchronize());
-  }
-
-  ~Task() {
-    // ensure all memory is deallocated
-    checkCudaErrors(cudaDeviceSynchronize());
-    checkCudaErrors(cudaFree(data));
-    checkCudaErrors(cudaFree(result));
-    checkCudaErrors(cudaFree(vector));
-  }
-
-  void allocate(const unsigned int s, const unsigned int unique_id) {
-    // allocate unified memory outside of constructor
-    id = unique_id;
-    size = s;
-    checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
-    checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
-    checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
-    checkCudaErrors(cudaDeviceSynchronize());
-
-    // populate data with random elements
-    for (unsigned int i = 0; i < size * size; i++) {
-      data[i] = drand48();
-    }
-
-    for (unsigned int i = 0; i < size; i++) {
-      result[i] = 0.;
-      vector[i] = drand48();
-    }
-  }
+    Task()
+        : size(0)
+        , id(0)
+        , data(NULL)
+        , result(NULL)
+        , vector(NULL) {};
+    Task(unsigned int s)
+        : size(s)
+        , id(0)
+        , data(NULL)
+        , result(NULL)
+    {
+        // allocate unified memory -- the operation performed in this example will
+        // be a DGEMV
+        checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
+        checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
+        checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
+        checkCudaErrors(cudaDeviceSynchronize());
+    }
+
+    ~Task()
+    {
+        // ensure all memory is deallocated
+        checkCudaErrors(cudaDeviceSynchronize());
+        checkCudaErrors(cudaFree(data));
+        checkCudaErrors(cudaFree(result));
+        checkCudaErrors(cudaFree(vector));
+    }
+
+    void allocate(const unsigned int s, const unsigned int unique_id)
+    {
+        // allocate unified memory outside of constructor
+        id   = unique_id;
+        size = s;
+        checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
+        checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
+        checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
+        checkCudaErrors(cudaDeviceSynchronize());
+
+        // populate data with random elements
+        for (unsigned int i = 0; i < size * size; i++) {
+            data[i] = drand48();
+        }
+
+        for (unsigned int i = 0; i < size; i++) {
+            result[i] = 0.;
+            vector[i] = drand48();
+        }
+    }
 };
 
 #ifdef USE_PTHREADS
-struct threadData_t {
-  int tid;
-  Task<double> *TaskListPtr;
-  cudaStream_t *streams;
-  cublasHandle_t *handles;
-  int taskSize;
+struct threadData_t
+{
+    int             tid;
+    Task<double>   *TaskListPtr;
+    cudaStream_t   *streams;
+    cublasHandle_t *handles;
+    int             taskSize;
 };
 
 typedef struct threadData_t threadData;
 #endif
 
 // simple host dgemv: assume data is in row-major format and square
-template <typename T>
-void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result) {
-  // rows
-  for (int i = 0; i < n; i++) {
-    result[i] *= beta;
-
-    for (int j = 0; j < n; j++) {
-      result[i] += A[i * n + j] * x[j];
-    }
-  }
+template <typename T> void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result)
+{
+    // rows
+    for (int i = 0; i < n; i++) {
+        result[i] *= beta;
+
+        for (int j = 0; j < n; j++) {
+            result[i] += A[i * n + j] * x[j];
+        }
+    }
 }
 
 // execute a single task on either host or device depending on size
 #ifdef USE_PTHREADS
-void *execute(void *inpArgs) {
-  threadData *dataPtr = (threadData *)inpArgs;
-  cudaStream_t *stream = dataPtr->streams;
-  cublasHandle_t *handle = dataPtr->handles;
-  int tid = dataPtr->tid;
-
-  for (int i = 0; i < dataPtr->taskSize; i++) {
-    Task<double> &t = dataPtr->TaskListPtr[i];
-
-    if (t.size < 100) {
-      // perform on host
-      printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid,
-             t.size);
-
-      // attach managed memory to a (dummy) stream to allow host access while
-      // the device is running
-      checkCudaErrors(
-          cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
-      checkCudaErrors(
-          cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
-      checkCudaErrors(
-          cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
-      // necessary to ensure Async cudaStreamAttachMemAsync calls have finished
-      checkCudaErrors(cudaStreamSynchronize(stream[0]));
-      // call the host operation
-      gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
-    } else {
-      // perform on device
-      printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid,
-             t.size);
-      double one = 1.0;
-      double zero = 0.0;
-
-      // attach managed memory to my stream
-      checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
-      checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0,
-                                               cudaMemAttachSingle));
-      checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0,
-                                               cudaMemAttachSingle));
-      checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0,
-                                               cudaMemAttachSingle));
-      // call the device operation
-      checkCudaErrors(cublasDgemv(handle[tid + 1], CUBLAS_OP_N, t.size, t.size,
-                                  &one, t.data, t.size, t.vector, 1, &zero,
-                                  t.result, 1));
-    }
-  }
-
-  pthread_exit(NULL);
+void *execute(void *inpArgs)
+{
+    threadData     *dataPtr = (threadData *)inpArgs;
+    cudaStream_t   *stream  = dataPtr->streams;
+    cublasHandle_t *handle  = dataPtr->handles;
+    int             tid     = dataPtr->tid;
+
+    for (int i = 0; i < dataPtr->taskSize; i++) {
+        Task<double> &t = dataPtr->TaskListPtr[i];
+
+        if (t.size < 100) {
+            // perform on host
+            printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);
+
+            // attach managed memory to a (dummy) stream to allow host access while
+            // the device is running
+            checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
+            checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
+            checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
+            // necessary to ensure Async cudaStreamAttachMemAsync calls have finished
+            checkCudaErrors(cudaStreamSynchronize(stream[0]));
+            // call the host operation
+            gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
+        }
+        else {
+            // perform on device
+            printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
+            double one  = 1.0;
+            double zero = 0.0;
+
+            // attach managed memory to my stream
+            checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
+            checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
+            checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
+            checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
+            // call the device operation
+            checkCudaErrors(cublasDgemv(
+                handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
+        }
+    }
+
+    pthread_exit(NULL);
 }
 #else
-template <typename T>
-void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream,
-             int tid) {
-  if (t.size < 100) {
-    // perform on host
-    printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid,
-           t.size);
-
-    // attach managed memory to a (dummy) stream to allow host access while the
-    // device is running
-    checkCudaErrors(
-        cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
-    checkCudaErrors(
-        cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
-    checkCudaErrors(
-        cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
-    // necessary to ensure Async cudaStreamAttachMemAsync calls have finished
-    checkCudaErrors(cudaStreamSynchronize(stream[0]));
-    // call the host operation
-    gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
-  } else {
-    // perform on device
-    printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid,
-           t.size);
-    double one = 1.0;
-    double zero = 0.0;
-
-    // attach managed memory to my stream
-    checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
-    checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0,
-                                             cudaMemAttachSingle));
-    checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0,
-                                             cudaMemAttachSingle));
-    checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0,
-                                             cudaMemAttachSingle));
-    // call the device operation
-    checkCudaErrors(cublasDgemv(handle[tid + 1], CUBLAS_OP_N, t.size, t.size,
-                                &one, t.data, t.size, t.vector, 1, &zero,
-                                t.result, 1));
-  }
+template <typename T> void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, int tid)
+{
+    if (t.size < 100) {
+        // perform on host
+        printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);
+
+        // attach managed memory to a (dummy) stream to allow host access while the
+        // device is running
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
+        // necessary to ensure Async cudaStreamAttachMemAsync calls have finished
+        checkCudaErrors(cudaStreamSynchronize(stream[0]));
+        // call the host operation
+        gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
+    }
+    else {
+        // perform on device
+        printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
+        double one  = 1.0;
+        double zero = 0.0;
+
+        // attach managed memory to my stream
+        checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
+        // call the device operation
+        checkCudaErrors(cublasDgemv(
+            handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
+    }
 }
 #endif
 
 // populate a list of tasks with random sizes
-template <typename T>
-void initialise_tasks(std::vector<Task<T> > &TaskList) {
-  for (unsigned int i = 0; i < TaskList.size(); i++) {
-    // generate random size
-    int size;
-    size = std::max((int)(drand48() * 1000.0), 64);
-    TaskList[i].allocate(size, i);
-  }
+template <typename T> void initialise_tasks(std::vector<Task<T>> &TaskList)
+{
+    for (unsigned int i = 0; i < TaskList.size(); i++) {
+        // generate random size
+        int size;
+        size = std::max((int)(drand48() * 1000.0), 64);
+        TaskList[i].allocate(size, i);
+    }
 }
 
-int main(int argc, char **argv) {
-  // set device
-  cudaDeviceProp device_prop;
-  int dev_id = findCudaDevice(argc, (const char **)argv);
-  checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));
-
-  if (!device_prop.managedMemory) {
-    // This samples requires being run on a device that supports Unified Memory
-    fprintf(stderr, "Unified Memory not supported on this device\n");
-
-    exit(EXIT_WAIVED);
-  }
-
-  if (device_prop.computeMode == cudaComputeModeProhibited) {
-    // This sample requires being run with a default or process exclusive mode
-    fprintf(stderr,
-            "This sample requires a device in either default or process "
-            "exclusive mode\n");
-
-    exit(EXIT_WAIVED);
-  }
-
-  // randomise task sizes
-  int seed = (int)time(NULL);
-  srand48(seed);
-
-  // set number of threads
-  const int nthreads = 4;
-
-  // number of streams = number of threads
-  cudaStream_t *streams = new cudaStream_t[nthreads + 1];
-  cublasHandle_t *handles = new cublasHandle_t[nthreads + 1];
-
-  for (int i = 0; i < nthreads + 1; i++) {
-    checkCudaErrors(cudaStreamCreate(&streams[i]));
-    checkCudaErrors(cublasCreate(&handles[i]));
-  }
-
-  // create list of N tasks
-  unsigned int N = 40;
-  std::vector<Task<double> > TaskList(N);
-  initialise_tasks(TaskList);
-
-  printf("Executing tasks on host / device\n");
+int main(int argc, char **argv)
+{
+    // set device
+    cudaDeviceProp device_prop;
+    int            dev_id = findCudaDevice(argc, (const char **)argv);
+    checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));
+
+    if (!device_prop.managedMemory) {
+        // This samples requires being run on a device that supports Unified Memory
+        fprintf(stderr, "Unified Memory not supported on this device\n");
+
+        exit(EXIT_WAIVED);
+    }
+
+    if (device_prop.computeMode == cudaComputeModeProhibited) {
+        // This sample requires being run with a default or process exclusive mode
+        fprintf(stderr,
+                "This sample requires a device in either default or process "
+                "exclusive mode\n");
+
+        exit(EXIT_WAIVED);
+    }
+
+    // randomise task sizes
+    int seed = (int)time(NULL);
+    srand48(seed);
+
+    // set number of threads
+    const int nthreads = 4;
+
+    // number of streams = number of threads
+    cudaStream_t   *streams = new cudaStream_t[nthreads + 1];
+    cublasHandle_t *handles = new cublasHandle_t[nthreads + 1];
+
+    for (int i = 0; i < nthreads + 1; i++) {
+        checkCudaErrors(cudaStreamCreate(&streams[i]));
+        checkCudaErrors(cublasCreate(&handles[i]));
+    }
+
+    // create list of N tasks
+    unsigned int              N = 40;
+    std::vector<Task<double>> TaskList(N);
+    initialise_tasks(TaskList);
+
+    printf("Executing tasks on host / device\n");
 
 // run through all tasks using threads and streams
 #ifdef USE_PTHREADS
-  pthread_t threads[nthreads];
-  threadData *InputToThreads = new threadData[nthreads];
-
-  for (int i = 0; i < nthreads; i++) {
-    checkCudaErrors(cudaSetDevice(dev_id));
-    InputToThreads[i].tid = i;
-    InputToThreads[i].streams = streams;
-    InputToThreads[i].handles = handles;
-
-    if ((TaskList.size() / nthreads) == 0) {
-      InputToThreads[i].taskSize = (TaskList.size() / nthreads);
-      InputToThreads[i].TaskListPtr =
-          &TaskList[i * (TaskList.size() / nthreads)];
-    } else {
-      if (i == nthreads - 1) {
-        InputToThreads[i].taskSize =
-            (TaskList.size() / nthreads) + (TaskList.size() % nthreads);
-        InputToThreads[i].TaskListPtr =
-            &TaskList[i * (TaskList.size() / nthreads) +
-                      (TaskList.size() % nthreads)];
-      } else {
-        InputToThreads[i].taskSize = (TaskList.size() / nthreads);
-        InputToThreads[i].TaskListPtr =
-            &TaskList[i * (TaskList.size() / nthreads)];
-      }
-    }
-
-    pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
-  }
-  for (int i = 0; i < nthreads; i++) {
-    pthread_join(threads[i], NULL);
-  }
+    pthread_t   threads[nthreads];
+    threadData *InputToThreads = new threadData[nthreads];
+
+    for (int i = 0; i < nthreads; i++) {
+        checkCudaErrors(cudaSetDevice(dev_id));
+        InputToThreads[i].tid     = i;
+        InputToThreads[i].streams = streams;
+        InputToThreads[i].handles = handles;
+
+        if ((TaskList.size() / nthreads) == 0) {
+            InputToThreads[i].taskSize    = (TaskList.size() / nthreads);
+            InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
+        }
+        else {
+            if (i == nthreads - 1) {
+                InputToThreads[i].taskSize = (TaskList.size() / nthreads) + (TaskList.size() % nthreads);
+                InputToThreads[i].TaskListPtr =
+                    &TaskList[i * (TaskList.size() / nthreads) + (TaskList.size() % nthreads)];
+            }
+            else {
+                InputToThreads[i].taskSize    = (TaskList.size() / nthreads);
+                InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
+            }
+        }
+
+        pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
+    }
+    for (int i = 0; i < nthreads; i++) {
+        pthread_join(threads[i], NULL);
+    }
 #else
-  omp_set_num_threads(nthreads);
+    omp_set_num_threads(nthreads);
 #pragma omp parallel for schedule(dynamic)
-  for (int i = 0; i < TaskList.size(); i++) {
-    checkCudaErrors(cudaSetDevice(dev_id));
-    int tid = omp_get_thread_num();
-    execute(TaskList[i], handles, streams, tid);
-  }
+    for (int i = 0; i < TaskList.size(); i++) {
+        checkCudaErrors(cudaSetDevice(dev_id));
+        int tid = omp_get_thread_num();
+        execute(TaskList[i], handles, streams, tid);
+    }
 #endif
 
-  cudaDeviceSynchronize();
+    cudaDeviceSynchronize();
 
-  // Destroy CUDA Streams, cuBlas handles
-  for (int i = 0; i < nthreads + 1; i++) {
-    cudaStreamDestroy(streams[i]);
-    cublasDestroy(handles[i]);
-  }
+    // Destroy CUDA Streams, cuBlas handles
+    for (int i = 0; i < nthreads + 1; i++) {
+        cudaStreamDestroy(streams[i]);
+        cublasDestroy(handles[i]);
+    }
 
-  // Free TaskList
-  std::vector<Task<double> >().swap(TaskList);
+    // Free TaskList
+    std::vector<Task<double>>().swap(TaskList);
 
-  printf("All Done!\n");
-  exit(EXIT_SUCCESS);
+    printf("All Done!\n");
+    exit(EXIT_SUCCESS);
 }
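The core idiom the hunk above reformats but does not change is worth isolating. The following is a minimal, hypothetical sketch (not repository code; error checking omitted) of the Unified Memory pattern UnifiedMemoryStreams relies on — allocate managed memory, attach it to the host before CPU access, and attach it to a single stream before device work:

// Minimal sketch of the Unified Memory + stream-association idiom used above.
// Assumes a device with managed memory support; error handling omitted for brevity.
#include <cuda_runtime.h>

int main()
{
    float       *vec = nullptr;
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cudaMallocManaged(&vec, 1024 * sizeof(float));

    // Associate the allocation with the host so the CPU may touch it
    // while other streams keep the device busy.
    cudaStreamAttachMemAsync(stream, vec, 0, cudaMemAttachHost);
    cudaStreamSynchronize(stream); // attachment must complete before host access

    for (int i = 0; i < 1024; i++)
        vec[i] = 1.0f; // safe host access

    // Re-associate with a single stream before launching device work on it.
    cudaStreamAttachMemAsync(stream, vec, 0, cudaMemAttachSingle);

    cudaStreamDestroy(stream);
    cudaFree(vec);
    return 0;
}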
asyncAPI.cu
@@ -38,105 +38,107 @@
 #include <stdio.h>
 
 // includes CUDA Runtime
-#include <cuda_runtime.h>
 #include <cuda_profiler_api.h>
+#include <cuda_runtime.h>
 
 // includes, project
 #include <helper_cuda.h>
-#include <helper_functions.h>  // helper utility functions
+#include <helper_functions.h> // helper utility functions
 
-__global__ void increment_kernel(int *g_data, int inc_value) {
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  g_data[idx] = g_data[idx] + inc_value;
+__global__ void increment_kernel(int *g_data, int inc_value)
+{
+    int idx     = blockIdx.x * blockDim.x + threadIdx.x;
+    g_data[idx] = g_data[idx] + inc_value;
 }
 
-bool correct_output(int *data, const int n, const int x) {
-  for (int i = 0; i < n; i++)
-    if (data[i] != x) {
-      printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
-      return false;
-    }
-
-  return true;
-}
-
-int main(int argc, char *argv[]) {
-  int devID;
-  cudaDeviceProp deviceProps;
-
-  printf("[%s] - Starting...\n", argv[0]);
-
-  // This will pick the best possible CUDA capable device
-  devID = findCudaDevice(argc, (const char **)argv);
-
-  // get device name
-  checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
-  printf("CUDA device [%s]\n", deviceProps.name);
-
-  int n = 16 * 1024 * 1024;
-  int nbytes = n * sizeof(int);
-  int value = 26;
-
-  // allocate host memory
-  int *a = 0;
-  checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
-  memset(a, 0, nbytes);
-
-  // allocate device memory
-  int *d_a = 0;
-  checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
-  checkCudaErrors(cudaMemset(d_a, 255, nbytes));
-
-  // set kernel launch configuration
-  dim3 threads = dim3(512, 1);
-  dim3 blocks = dim3(n / threads.x, 1);
-
-  // create cuda event handles
-  cudaEvent_t start, stop;
-  checkCudaErrors(cudaEventCreate(&start));
-  checkCudaErrors(cudaEventCreate(&stop));
-
-  StopWatchInterface *timer = NULL;
-  sdkCreateTimer(&timer);
-  sdkResetTimer(&timer);
-
-  checkCudaErrors(cudaDeviceSynchronize());
-  float gpu_time = 0.0f;
-
-  // asynchronously issue work to the GPU (all to stream 0)
-  checkCudaErrors(cudaProfilerStart());
-  sdkStartTimer(&timer);
-  cudaEventRecord(start, 0);
-  cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
-  increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
-  cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
-  cudaEventRecord(stop, 0);
-  sdkStopTimer(&timer);
-  checkCudaErrors(cudaProfilerStop());
-
-  // have CPU do some work while waiting for stage 1 to finish
-  unsigned long int counter = 0;
-
-  while (cudaEventQuery(stop) == cudaErrorNotReady) {
-    counter++;
-  }
-
-  checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
-
-  // print the cpu and gpu times
-  printf("time spent executing by the GPU: %.2f\n", gpu_time);
-  printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
-  printf("CPU executed %lu iterations while waiting for GPU to finish\n",
-         counter);
-
-  // check the output for correctness
-  bool bFinalResults = correct_output(a, n, value);
-
-  // release resources
-  checkCudaErrors(cudaEventDestroy(start));
-  checkCudaErrors(cudaEventDestroy(stop));
-  checkCudaErrors(cudaFreeHost(a));
-  checkCudaErrors(cudaFree(d_a));
-
-  exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
+bool correct_output(int *data, const int n, const int x)
+{
+    for (int i = 0; i < n; i++)
+        if (data[i] != x) {
+            printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
+            return false;
+        }
+
+    return true;
+}
+
+int main(int argc, char *argv[])
+{
+    int            devID;
+    cudaDeviceProp deviceProps;
+
+    printf("[%s] - Starting...\n", argv[0]);
+
+    // This will pick the best possible CUDA capable device
+    devID = findCudaDevice(argc, (const char **)argv);
+
+    // get device name
+    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
+    printf("CUDA device [%s]\n", deviceProps.name);
+
+    int n      = 16 * 1024 * 1024;
+    int nbytes = n * sizeof(int);
+    int value  = 26;
+
+    // allocate host memory
+    int *a = 0;
+    checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
+    memset(a, 0, nbytes);
+
+    // allocate device memory
+    int *d_a = 0;
+    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
+    checkCudaErrors(cudaMemset(d_a, 255, nbytes));
+
+    // set kernel launch configuration
+    dim3 threads = dim3(512, 1);
+    dim3 blocks  = dim3(n / threads.x, 1);
+
+    // create cuda event handles
+    cudaEvent_t start, stop;
+    checkCudaErrors(cudaEventCreate(&start));
+    checkCudaErrors(cudaEventCreate(&stop));
+
+    StopWatchInterface *timer = NULL;
+    sdkCreateTimer(&timer);
+    sdkResetTimer(&timer);
+
+    checkCudaErrors(cudaDeviceSynchronize());
+    float gpu_time = 0.0f;
+
+    // asynchronously issue work to the GPU (all to stream 0)
+    checkCudaErrors(cudaProfilerStart());
+    sdkStartTimer(&timer);
+    cudaEventRecord(start, 0);
+    cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
+    increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
+    cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
+    cudaEventRecord(stop, 0);
+    sdkStopTimer(&timer);
+    checkCudaErrors(cudaProfilerStop());
+
+    // have CPU do some work while waiting for stage 1 to finish
+    unsigned long int counter = 0;
+
+    while (cudaEventQuery(stop) == cudaErrorNotReady) {
+        counter++;
+    }
+
+    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
+
+    // print the cpu and gpu times
+    printf("time spent executing by the GPU: %.2f\n", gpu_time);
+    printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
+    printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);
+
+    // check the output for correctness
+    bool bFinalResults = correct_output(a, n, value);
+
+    // release resources
+    checkCudaErrors(cudaEventDestroy(start));
+    checkCudaErrors(cudaEventDestroy(stop));
+    checkCudaErrors(cudaFreeHost(a));
+    checkCudaErrors(cudaFree(d_a));
+
+    exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
 }
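The structure asyncAPI times is easier to see outside the reformatting noise. A minimal, hypothetical sketch (not repository code) of the same async-issue-then-poll idiom: record an event after enqueuing work on stream 0, then let the CPU count iterations until cudaEventQuery reports the event complete:

// Minimal sketch of the async-issue + event-poll idiom shown in the diff above.
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0); // all work goes to stream 0
    // ... enqueue async copies and kernel launches here ...
    cudaEventRecord(stop, 0);

    unsigned long iterations = 0;
    while (cudaEventQuery(stop) == cudaErrorNotReady)
        iterations++; // the CPU stays free to do real work meanwhile

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("GPU time: %.2f ms after %lu CPU polls\n", ms, iterations);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return 0;
}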
clock.cu
@@ -48,43 +48,46 @@
 // This kernel computes a standard parallel reduction and evaluates the
 // time it takes to do that for each block. The timing results are stored
 // in device memory.
-__global__ static void timedReduction(const float *input, float *output,
-                                      clock_t *timer) {
-  // __shared__ float shared[2 * blockDim.x];
-  extern __shared__ float shared[];
-
-  const int tid = threadIdx.x;
-  const int bid = blockIdx.x;
-
-  if (tid == 0) timer[bid] = clock();
-
-  // Copy input.
-  shared[tid] = input[tid];
-  shared[tid + blockDim.x] = input[tid + blockDim.x];
-
-  // Perform reduction to find minimum.
-  for (int d = blockDim.x; d > 0; d /= 2) {
-    __syncthreads();
-
-    if (tid < d) {
-      float f0 = shared[tid];
-      float f1 = shared[tid + d];
-
-      if (f1 < f0) {
-        shared[tid] = f1;
-      }
-    }
-  }
-
-  // Write result.
-  if (tid == 0) output[bid] = shared[0];
-
-  __syncthreads();
-
-  if (tid == 0) timer[bid + gridDim.x] = clock();
+__global__ static void timedReduction(const float *input, float *output, clock_t *timer)
+{
+    // __shared__ float shared[2 * blockDim.x];
+    extern __shared__ float shared[];
+
+    const int tid = threadIdx.x;
+    const int bid = blockIdx.x;
+
+    if (tid == 0)
+        timer[bid] = clock();
+
+    // Copy input.
+    shared[tid]              = input[tid];
+    shared[tid + blockDim.x] = input[tid + blockDim.x];
+
+    // Perform reduction to find minimum.
+    for (int d = blockDim.x; d > 0; d /= 2) {
+        __syncthreads();
+
+        if (tid < d) {
+            float f0 = shared[tid];
+            float f1 = shared[tid + d];
+
+            if (f1 < f0) {
+                shared[tid] = f1;
+            }
+        }
+    }
+
+    // Write result.
+    if (tid == 0)
+        output[bid] = shared[0];
+
+    __syncthreads();
+
+    if (tid == 0)
+        timer[bid + gridDim.x] = clock();
 }
 
-#define NUM_BLOCKS 64
+#define NUM_BLOCKS  64
 #define NUM_THREADS 256
 
 // It's interesting to change the number of blocks and the number of threads to
@@ -104,50 +107,46 @@ __global__ static void timedReduction(const float *input, float *output,
 // the memory. With more than 32 the speed scales linearly.
 
 // Start the main CUDA Sample here
-int main(int argc, char **argv) {
-  printf("CUDA Clock sample\n");
-
-  // This will pick the best possible CUDA capable device
-  int dev = findCudaDevice(argc, (const char **)argv);
-
-  float *dinput = NULL;
-  float *doutput = NULL;
-  clock_t *dtimer = NULL;
-
-  clock_t timer[NUM_BLOCKS * 2];
-  float input[NUM_THREADS * 2];
-
-  for (int i = 0; i < NUM_THREADS * 2; i++) {
-    input[i] = (float)i;
-  }
-
-  checkCudaErrors(
-      cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
-  checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
-  checkCudaErrors(
-      cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
-
-  checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2,
-                             cudaMemcpyHostToDevice));
-
-  timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(
-      dinput, doutput, dtimer);
-
-  checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2,
-                             cudaMemcpyDeviceToHost));
-
-  checkCudaErrors(cudaFree(dinput));
-  checkCudaErrors(cudaFree(doutput));
-  checkCudaErrors(cudaFree(dtimer));
-
-  long double avgElapsedClocks = 0;
-
-  for (int i = 0; i < NUM_BLOCKS; i++) {
-    avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
-  }
-
-  avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
-  printf("Average clocks/block = %Lf\n", avgElapsedClocks);
-
-  return EXIT_SUCCESS;
+int main(int argc, char **argv)
+{
+    printf("CUDA Clock sample\n");
+
+    // This will pick the best possible CUDA capable device
+    int dev = findCudaDevice(argc, (const char **)argv);
+
+    float   *dinput  = NULL;
+    float   *doutput = NULL;
+    clock_t *dtimer  = NULL;
+
+    clock_t timer[NUM_BLOCKS * 2];
+    float   input[NUM_THREADS * 2];
+
+    for (int i = 0; i < NUM_THREADS * 2; i++) {
+        input[i] = (float)i;
+    }
+
+    checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
+    checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
+    checkCudaErrors(cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
+
+    checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));
+
+    timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);
+
+    checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));
+
+    checkCudaErrors(cudaFree(dinput));
+    checkCudaErrors(cudaFree(doutput));
+    checkCudaErrors(cudaFree(dtimer));
+
+    long double avgElapsedClocks = 0;
+
+    for (int i = 0; i < NUM_BLOCKS; i++) {
+        avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
+    }
+
+    avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
+    printf("Average clocks/block = %Lf\n", avgElapsedClocks);
+
+    return EXIT_SUCCESS;
 }
@ -34,12 +34,11 @@
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
// System includes
 | 
			
		||||
#include <stdio.h>
 | 
			
		||||
#include <stdint.h>
 | 
			
		||||
#include <assert.h>
 | 
			
		||||
 | 
			
		||||
#include <cuda_runtime.h>
 | 
			
		||||
#include <nvrtc_helper.h>
 | 
			
		||||
#include <stdint.h>
 | 
			
		||||
#include <stdio.h>
 | 
			
		||||
 | 
			
		||||
// helper functions and utilities to work with CUDA
 | 
			
		||||
#include <helper_functions.h>
 | 
			
		||||
@ -71,64 +70,68 @@
 | 
			
		||||
 | 
			
		||||
// Start the main CUDA Sample here
 | 
			
		||||
 | 
			
		||||
int main(int argc, char **argv) {
 | 
			
		||||
  printf("CUDA Clock sample\n");
 | 
			
		||||
int main(int argc, char **argv)
 | 
			
		||||
{
 | 
			
		||||
    printf("CUDA Clock sample\n");
 | 
			
		||||
 | 
			
		||||
  typedef long clock_t;
 | 
			
		||||
    typedef long clock_t;
 | 
			
		||||
 | 
			
		||||
  clock_t timer[NUM_BLOCKS * 2];
 | 
			
		||||
    clock_t timer[NUM_BLOCKS * 2];
 | 
			
		||||
 | 
			
		||||
  float input[NUM_THREADS * 2];
 | 
			
		||||
    float input[NUM_THREADS * 2];
 | 
			
		||||
 | 
			
		||||
  for (int i = 0; i < NUM_THREADS * 2; i++) {
 | 
			
		||||
    input[i] = (float)i;
 | 
			
		||||
  }
 | 
			
		||||
    for (int i = 0; i < NUM_THREADS * 2; i++) {
 | 
			
		||||
        input[i] = (float)i;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  char *cubin, *kernel_file;
 | 
			
		||||
  size_t cubinSize;
 | 
			
		||||
    char  *cubin, *kernel_file;
 | 
			
		||||
    size_t cubinSize;
 | 
			
		||||
 | 
			
		||||
  kernel_file = sdkFindFilePath("clock_kernel.cu", argv[0]);
 | 
			
		||||
  compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);
 | 
			
		||||
    kernel_file = sdkFindFilePath("clock_kernel.cu", argv[0]);
 | 
			
		||||
    compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);
 | 
			
		||||
 | 
			
		||||
  CUmodule module = loadCUBIN(cubin, argc, argv);
 | 
			
		||||
  CUfunction kernel_addr;
 | 
			
		||||
    CUmodule   module = loadCUBIN(cubin, argc, argv);
 | 
			
		||||
    CUfunction kernel_addr;
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "timedReduction"));
 | 
			
		||||
    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "timedReduction"));
 | 
			
		||||
 | 
			
		||||
  dim3 cudaBlockSize(NUM_THREADS, 1, 1);
 | 
			
		||||
  dim3 cudaGridSize(NUM_BLOCKS, 1, 1);
 | 
			
		||||
    dim3 cudaBlockSize(NUM_THREADS, 1, 1);
    dim3 cudaGridSize(NUM_BLOCKS, 1, 1);

  CUdeviceptr dinput, doutput, dtimer;
  checkCudaErrors(cuMemAlloc(&dinput, sizeof(float) * NUM_THREADS * 2));
  checkCudaErrors(cuMemAlloc(&doutput, sizeof(float) * NUM_BLOCKS));
  checkCudaErrors(cuMemAlloc(&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
  checkCudaErrors(cuMemcpyHtoD(dinput, input, sizeof(float) * NUM_THREADS * 2));
    CUdeviceptr dinput, doutput, dtimer;
    checkCudaErrors(cuMemAlloc(&dinput, sizeof(float) * NUM_THREADS * 2));
    checkCudaErrors(cuMemAlloc(&doutput, sizeof(float) * NUM_BLOCKS));
    checkCudaErrors(cuMemAlloc(&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
    checkCudaErrors(cuMemcpyHtoD(dinput, input, sizeof(float) * NUM_THREADS * 2));

  void *arr[] = {(void *)&dinput, (void *)&doutput, (void *)&dtimer};
    void *arr[] = {(void *)&dinput, (void *)&doutput, (void *)&dtimer};

  checkCudaErrors(cuLaunchKernel(
      kernel_addr, cudaGridSize.x, cudaGridSize.y,
      cudaGridSize.z,                                    /* grid dim */
      cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, /* block dim */
      sizeof(float) * 2 * NUM_THREADS, 0, /* shared mem, stream */
      &arr[0],                            /* arguments */
      0));
    checkCudaErrors(cuLaunchKernel(kernel_addr,
                                   cudaGridSize.x,
                                   cudaGridSize.y,
                                   cudaGridSize.z, /* grid dim */
                                   cudaBlockSize.x,
                                   cudaBlockSize.y,
                                   cudaBlockSize.z, /* block dim */
                                   sizeof(float) * 2 * NUM_THREADS,
                                   0,       /* shared mem, stream */
                                   &arr[0], /* arguments */
                                   0));

  checkCudaErrors(cuCtxSynchronize());
  checkCudaErrors(
      cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
  checkCudaErrors(cuMemFree(dinput));
  checkCudaErrors(cuMemFree(doutput));
  checkCudaErrors(cuMemFree(dtimer));
    checkCudaErrors(cuCtxSynchronize());
    checkCudaErrors(cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
    checkCudaErrors(cuMemFree(dinput));
    checkCudaErrors(cuMemFree(doutput));
    checkCudaErrors(cuMemFree(dtimer));

  long double avgElapsedClocks = 0;
    long double avgElapsedClocks = 0;

  for (int i = 0; i < NUM_BLOCKS; i++) {
    avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
  }
    for (int i = 0; i < NUM_BLOCKS; i++) {
        avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
    }

  avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
  printf("Average clocks/block = %Lf\n", avgElapsedClocks);
    avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
    printf("Average clocks/block = %Lf\n", avgElapsedClocks);

  return EXIT_SUCCESS;
    return EXIT_SUCCESS;
}

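The argument block in this host code follows the driver-API launch convention: the array holds a pointer to each kernel parameter, in declaration order, and cuLaunchKernel reads the values through those pointers. Below is a minimal sketch of that convention in isolation, not the sample itself; the PTX file name is a placeholder and error checking is omitted to keep the shape of the call visible.

// Hedged sketch of the driver-API launch convention, assuming a placeholder
// module file name of "clock_kernel.ptx".
#include <cuda.h>
#include <ctime>

int main()
{
    CUdevice   dev;
    CUcontext  ctx;
    CUmodule   mod;
    CUfunction fn;

    cuInit(0);
    cuDeviceGet(&dev, 0);
    cuCtxCreate(&ctx, 0, dev);
    cuModuleLoad(&mod, "clock_kernel.ptx");          // placeholder module name
    cuModuleGetFunction(&fn, mod, "timedReduction"); // extern "C" symbol, so the name is unmangled

    CUdeviceptr dinput, doutput, dtimer;
    cuMemAlloc(&dinput, sizeof(float) * 64 * 2); // one block of 64 threads reads 128 floats
    cuMemAlloc(&doutput, sizeof(float) * 1);
    cuMemAlloc(&dtimer, sizeof(clock_t) * 1 * 2);

    // kernelParams is an array of POINTERS to the arguments, in declaration order.
    void *args[] = {&dinput, &doutput, &dtimer};

    // 1x1x1 grid of 64x1x1 threads, 2*64 floats of dynamic shared memory, default stream.
    cuLaunchKernel(fn, 1, 1, 1, 64, 1, 1, sizeof(float) * 2 * 64, 0, args, 0);
    cuCtxSynchronize();

    cuMemFree(dinput);
    cuMemFree(doutput);
    cuMemFree(dtimer);
    cuCtxDestroy(ctx);
    return 0;
}
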
@@ -37,38 +37,41 @@
// time it takes to do that for each block. The timing results are stored
// in device memory.

extern "C" __global__ void timedReduction(const float *input, float *output,
                                          clock_t *timer) {
  // __shared__ float shared[2 * blockDim.x];
  extern __shared__ float shared[];
extern "C" __global__ void timedReduction(const float *input, float *output, clock_t *timer)
{
    // __shared__ float shared[2 * blockDim.x];
    extern __shared__ float shared[];

  const int tid = threadIdx.x;
  const int bid = blockIdx.x;
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

  if (tid == 0) timer[bid] = clock();
    if (tid == 0)
        timer[bid] = clock();

  // Copy input.
  shared[tid] = input[tid];
  shared[tid + blockDim.x] = input[tid + blockDim.x];
    // Copy input.
    shared[tid]              = input[tid];
    shared[tid + blockDim.x] = input[tid + blockDim.x];

  // Perform reduction to find minimum.
  for (int d = blockDim.x; d > 0; d /= 2) {
    __syncthreads();

    if (tid < d) {
      float f0 = shared[tid];
      float f1 = shared[tid + d];

      if (f1 < f0) {
        shared[tid] = f1;
      }
    }
  }

  // Write result.
  if (tid == 0) output[bid] = shared[0];
    // Perform reduction to find minimum.
    for (int d = blockDim.x; d > 0; d /= 2) {
        __syncthreads();

        if (tid < d) {
            float f0 = shared[tid];
            float f1 = shared[tid + d];

            if (f1 < f0) {
                shared[tid] = f1;
            }
        }
    }

    // Write result.
    if (tid == 0)
        output[bid] = shared[0];

  __syncthreads();
    __syncthreads();

  if (tid == 0) timer[bid + gridDim.x] = clock();
    if (tid == 0)
        timer[bid + gridDim.x] = clock();
}

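Every block of timedReduction reduces the same 2 * NUM_THREADS input values, so each output element should equal the global minimum of the input array. A hedged host-side check of that invariant follows; the macro values are assumptions mirroring the sample's usual configuration.

// Hedged host-side reference for timedReduction; 64 blocks and 256 threads
// are assumed values, not taken from this diff.
#define NUM_BLOCKS  64
#define NUM_THREADS 256

bool checkOutput(const float *input, const float *output)
{
    // Minimum over the whole input array.
    float expected = input[0];
    for (int i = 1; i < NUM_THREADS * 2; ++i)
        if (input[i] < expected)
            expected = input[i];

    // Every block wrote its own copy of that minimum.
    for (int b = 0; b < NUM_BLOCKS; ++b)
        if (output[b] != expected)
            return false;
    return true;
}
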
@@ -32,128 +32,125 @@

#include <helper_cuda.h>
#include <omp.h>
#include <stdio.h>  // stdio functions are used since C++ streams aren't necessarily thread safe
#include <stdio.h> // stdio functions are used since C++ streams aren't necessarily thread safe

using namespace std;

// a simple kernel that simply increments each array element by b
__global__ void kernelAddConstant(int *g_a, const int b) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  g_a[idx] += b;
__global__ void kernelAddConstant(int *g_a, const int b)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    g_a[idx] += b;
}

// a predicate that checks whether each array element is set to its index plus b
int correctResult(int *data, const int n, const int b) {
  for (int i = 0; i < n; i++)
    if (data[i] != i + b) return 0;
int correctResult(int *data, const int n, const int b)
{
    for (int i = 0; i < n; i++)
        if (data[i] != i + b)
            return 0;

  return 1;
    return 1;
}

int main(int argc, char *argv[]) {
  int num_gpus = 0;  // number of CUDA GPUs
int main(int argc, char *argv[])
{
    int num_gpus = 0; // number of CUDA GPUs

  printf("%s Starting...\n\n", argv[0]);
    printf("%s Starting...\n\n", argv[0]);

  /////////////////////////////////////////////////////////////////
  // determine the number of CUDA capable GPUs
  //
  cudaGetDeviceCount(&num_gpus);
    /////////////////////////////////////////////////////////////////
    // determine the number of CUDA capable GPUs
    //
    cudaGetDeviceCount(&num_gpus);

  if (num_gpus < 1) {
    printf("no CUDA capable devices were detected\n");
    return 1;
  }
    if (num_gpus < 1) {
        printf("no CUDA capable devices were detected\n");
        return 1;
    }

  /////////////////////////////////////////////////////////////////
  // display CPU and GPU configuration
  //
  printf("number of host CPUs:\t%d\n", omp_get_num_procs());
  printf("number of CUDA devices:\t%d\n", num_gpus);
    /////////////////////////////////////////////////////////////////
    // display CPU and GPU configuration
    //
    printf("number of host CPUs:\t%d\n", omp_get_num_procs());
    printf("number of CUDA devices:\t%d\n", num_gpus);

  for (int i = 0; i < num_gpus; i++) {
    cudaDeviceProp dprop;
    cudaGetDeviceProperties(&dprop, i);
    printf("   %d: %s\n", i, dprop.name);
  }
    for (int i = 0; i < num_gpus; i++) {
        cudaDeviceProp dprop;
        cudaGetDeviceProperties(&dprop, i);
        printf("   %d: %s\n", i, dprop.name);
    }

  printf("---------------------------\n");
    printf("---------------------------\n");

  /////////////////////////////////////////////////////////////////
  // initialize data
  //
  unsigned int n = num_gpus * 8192;
  unsigned int nbytes = n * sizeof(int);
  int *a = 0;  // pointer to data on the CPU
  int b = 3;   // value by which the array is incremented
  a = (int *)malloc(nbytes);
    /////////////////////////////////////////////////////////////////
    // initialize data
    //
    unsigned int n      = num_gpus * 8192;
    unsigned int nbytes = n * sizeof(int);
    int         *a      = 0; // pointer to data on the CPU
    int          b      = 3; // value by which the array is incremented
    a                   = (int *)malloc(nbytes);

  if (0 == a) {
    printf("couldn't allocate CPU memory\n");
    return 1;
  }
    if (0 == a) {
        printf("couldn't allocate CPU memory\n");
        return 1;
    }

  for (unsigned int i = 0; i < n; i++) a[i] = i;
    for (unsigned int i = 0; i < n; i++)
        a[i] = i;

  ////////////////////////////////////////////////////////////////
  // run as many CPU threads as there are CUDA devices
  //   each CPU thread controls a different device, processing its
  //   portion of the data.  It's possible to use more CPU threads
  //   than there are CUDA devices, in which case several CPU
  //   threads will be allocating resources and launching kernels
  //   on the same device.  For example, try omp_set_num_threads(2*num_gpus);
  //   Recall that all variables declared inside an "omp parallel" scope are
  //   local to each CPU thread
  //
  omp_set_num_threads(
      num_gpus);  // create as many CPU threads as there are CUDA devices
    ////////////////////////////////////////////////////////////////
    // run as many CPU threads as there are CUDA devices
    //   each CPU thread controls a different device, processing its
    //   portion of the data.  It's possible to use more CPU threads
    //   than there are CUDA devices, in which case several CPU
    //   threads will be allocating resources and launching kernels
    //   on the same device.  For example, try omp_set_num_threads(2*num_gpus);
    //   Recall that all variables declared inside an "omp parallel" scope are
    //   local to each CPU thread
    //
    omp_set_num_threads(num_gpus); // create as many CPU threads as there are CUDA devices
// omp_set_num_threads(2*num_gpus);// create twice as many CPU threads as there
// are CUDA devices
#pragma omp parallel
  {
    unsigned int cpu_thread_id = omp_get_thread_num();
    unsigned int num_cpu_threads = omp_get_num_threads();
    {
        unsigned int cpu_thread_id   = omp_get_thread_num();
        unsigned int num_cpu_threads = omp_get_num_threads();

    // set and check the CUDA device for this CPU thread
    int gpu_id = -1;
    checkCudaErrors(cudaSetDevice(
        cpu_thread_id %
        num_gpus));  // "% num_gpus" allows more CPU threads than GPU devices
    checkCudaErrors(cudaGetDevice(&gpu_id));
    printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id,
           num_cpu_threads, gpu_id);
        // set and check the CUDA device for this CPU thread
        int gpu_id = -1;
        checkCudaErrors(
            cudaSetDevice(cpu_thread_id % num_gpus)); // "% num_gpus" allows more CPU threads than GPU devices
        checkCudaErrors(cudaGetDevice(&gpu_id));
        printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id, num_cpu_threads, gpu_id);

    int *d_a =
        0;  // pointer to memory on the device associated with this CPU thread
    int *sub_a =
        a +
        cpu_thread_id * n /
            num_cpu_threads;  // pointer to this CPU thread's portion of data
    unsigned int nbytes_per_kernel = nbytes / num_cpu_threads;
    dim3 gpu_threads(128);  // 128 threads per block
    dim3 gpu_blocks(n / (gpu_threads.x * num_cpu_threads));
        int         *d_a   = 0; // pointer to memory on the device associated with this CPU thread
        int         *sub_a = a + cpu_thread_id * n / num_cpu_threads; // pointer to this CPU thread's portion of data
        unsigned int nbytes_per_kernel = nbytes / num_cpu_threads;
        dim3         gpu_threads(128); // 128 threads per block
        dim3         gpu_blocks(n / (gpu_threads.x * num_cpu_threads));

    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes_per_kernel));
    checkCudaErrors(cudaMemset(d_a, 0, nbytes_per_kernel));
    checkCudaErrors(
        cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice));
    kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b);
        checkCudaErrors(cudaMalloc((void **)&d_a, nbytes_per_kernel));
        checkCudaErrors(cudaMemset(d_a, 0, nbytes_per_kernel));
        checkCudaErrors(cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice));
        kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b);

    checkCudaErrors(
        cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaFree(d_a));
  }
  printf("---------------------------\n");
        checkCudaErrors(cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost));
        checkCudaErrors(cudaFree(d_a));
    }
    printf("---------------------------\n");

  if (cudaSuccess != cudaGetLastError())
    printf("%s\n", cudaGetErrorString(cudaGetLastError()));
    if (cudaSuccess != cudaGetLastError())
        printf("%s\n", cudaGetErrorString(cudaGetLastError()));

  ////////////////////////////////////////////////////////////////
  // check the result
  //
  bool bResult = correctResult(a, n, b);
    ////////////////////////////////////////////////////////////////
    // check the result
    //
    bool bResult = correctResult(a, n, b);

  if (a) free(a);  // free CPU memory
    if (a)
        free(a); // free CPU memory

  exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
    exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

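The sample above binds one OpenMP thread to each CUDA device before any runtime calls are made on it. A minimal standalone sketch of just that binding step, under the assumption that it is compiled with nvcc and OpenMP enabled:

// Hedged sketch of the one-CPU-thread-per-GPU pattern, stripped of the data
// movement; compile with: nvcc -Xcompiler -fopenmp
#include <cuda_runtime.h>
#include <omp.h>
#include <stdio.h>

int main()
{
    int num_gpus = 0;
    cudaGetDeviceCount(&num_gpus);
    if (num_gpus < 1)
        return 1;

    omp_set_num_threads(num_gpus);
#pragma omp parallel
    {
        int tid = omp_get_thread_num();
        cudaSetDevice(tid % num_gpus); // the modulo tolerates more threads than GPUs
        int dev = -1;
        cudaGetDevice(&dev);
        printf("thread %d -> device %d\n", tid, dev);
    }
    return 0;
}

Setting the device once at the top of the parallel region keeps every later runtime call issued by that thread on its chosen GPU.
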
@@ -25,191 +25,188 @@
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "cuda_fp16.h"
#include "helper_cuda.h"

#include <cstdio>
#include <cstdlib>
#include <ctime>

#define NUM_OF_BLOCKS 128
#include "cuda_fp16.h"
#include "helper_cuda.h"

#define NUM_OF_BLOCKS  128
#define NUM_OF_THREADS 128

__forceinline__ __device__ void reduceInShared_intrinsics(half2 *const v) {
  if (threadIdx.x < 64)
    v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 64]);
  __syncthreads();
  if (threadIdx.x < 32)
    v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 32]);
  __syncthreads();
  if (threadIdx.x < 16)
    v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 16]);
  __syncthreads();
  if (threadIdx.x < 8)
    v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 8]);
  __syncthreads();
  if (threadIdx.x < 4)
    v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 4]);
  __syncthreads();
  if (threadIdx.x < 2)
    v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 2]);
  __syncthreads();
  if (threadIdx.x < 1)
    v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 1]);
  __syncthreads();
__forceinline__ __device__ void reduceInShared_intrinsics(half2 *const v)
{
    if (threadIdx.x < 64)
        v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 64]);
    __syncthreads();
    if (threadIdx.x < 32)
        v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 32]);
    __syncthreads();
    if (threadIdx.x < 16)
        v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 16]);
    __syncthreads();
    if (threadIdx.x < 8)
        v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 8]);
    __syncthreads();
    if (threadIdx.x < 4)
        v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 4]);
    __syncthreads();
    if (threadIdx.x < 2)
        v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 2]);
    __syncthreads();
    if (threadIdx.x < 1)
        v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 1]);
    __syncthreads();
}

__forceinline__ __device__ void reduceInShared_native(half2 *const v) {
  if (threadIdx.x < 64) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 64];
  __syncthreads();
  if (threadIdx.x < 32) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 32];
  __syncthreads();
  if (threadIdx.x < 16) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 16];
  __syncthreads();
  if (threadIdx.x < 8) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 8];
  __syncthreads();
  if (threadIdx.x < 4) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 4];
  __syncthreads();
  if (threadIdx.x < 2) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 2];
  __syncthreads();
  if (threadIdx.x < 1) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 1];
  __syncthreads();
__forceinline__ __device__ void reduceInShared_native(half2 *const v)
{
    if (threadIdx.x < 64)
        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 64];
    __syncthreads();
    if (threadIdx.x < 32)
        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 32];
    __syncthreads();
    if (threadIdx.x < 16)
        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 16];
    __syncthreads();
    if (threadIdx.x < 8)
        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 8];
    __syncthreads();
    if (threadIdx.x < 4)
        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 4];
    __syncthreads();
    if (threadIdx.x < 2)
        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 2];
    __syncthreads();
    if (threadIdx.x < 1)
        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 1];
    __syncthreads();
}

__global__ void scalarProductKernel_intrinsics(half2 const *const a,
                                               half2 const *const b,
                                               float *const results,
                                               size_t const size) {
  const int stride = gridDim.x * blockDim.x;
  __shared__ half2 shArray[NUM_OF_THREADS];
__global__ void
scalarProductKernel_intrinsics(half2 const *const a, half2 const *const b, float *const results, size_t const size)
{
    const int        stride = gridDim.x * blockDim.x;
    __shared__ half2 shArray[NUM_OF_THREADS];

  shArray[threadIdx.x] = __float2half2_rn(0.f);
  half2 value = __float2half2_rn(0.f);
    shArray[threadIdx.x] = __float2half2_rn(0.f);
    half2 value          = __float2half2_rn(0.f);

  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < size; i += stride) {
    value = __hfma2(a[i], b[i], value);
  }
    for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < size; i += stride) {
        value = __hfma2(a[i], b[i], value);
    }

  shArray[threadIdx.x] = value;
  __syncthreads();
  reduceInShared_intrinsics(shArray);
    shArray[threadIdx.x] = value;
    __syncthreads();
    reduceInShared_intrinsics(shArray);

  if (threadIdx.x == 0) {
    half2 result = shArray[0];
    float f_result = __low2float(result) + __high2float(result);
    results[blockIdx.x] = f_result;
  }
    if (threadIdx.x == 0) {
        half2 result        = shArray[0];
        float f_result      = __low2float(result) + __high2float(result);
        results[blockIdx.x] = f_result;
    }
}

__global__ void scalarProductKernel_native(half2 const *const a,
                                           half2 const *const b,
                                           float *const results,
                                           size_t const size) {
  const int stride = gridDim.x * blockDim.x;
  __shared__ half2 shArray[NUM_OF_THREADS];
__global__ void
scalarProductKernel_native(half2 const *const a, half2 const *const b, float *const results, size_t const size)
{
    const int        stride = gridDim.x * blockDim.x;
    __shared__ half2 shArray[NUM_OF_THREADS];

  half2 value(0.f, 0.f);
  shArray[threadIdx.x] = value;
    half2 value(0.f, 0.f);
    shArray[threadIdx.x] = value;

  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < size; i += stride) {
    value = a[i] * b[i] + value;
  }
    for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < size; i += stride) {
        value = a[i] * b[i] + value;
    }

  shArray[threadIdx.x] = value;
  __syncthreads();
  reduceInShared_native(shArray);
    shArray[threadIdx.x] = value;
    __syncthreads();
    reduceInShared_native(shArray);

  if (threadIdx.x == 0) {
    half2 result = shArray[0];
    float f_result = (float)result.y + (float)result.x;
    results[blockIdx.x] = f_result;
  }
    if (threadIdx.x == 0) {
        half2 result        = shArray[0];
        float f_result      = (float)result.y + (float)result.x;
        results[blockIdx.x] = f_result;
    }
}

void generateInput(half2 *a, size_t size) {
  for (size_t i = 0; i < size; ++i) {
    half2 temp;
    temp.x = static_cast<float>(rand() % 4);
    temp.y = static_cast<float>(rand() % 2);
    a[i] = temp;
  }
void generateInput(half2 *a, size_t size)
{
    for (size_t i = 0; i < size; ++i) {
        half2 temp;
        temp.x = static_cast<float>(rand() % 4);
        temp.y = static_cast<float>(rand() % 2);
        a[i]   = temp;
    }
}

int main(int argc, char *argv[]) {
  srand((unsigned int)time(NULL));
  size_t size = NUM_OF_BLOCKS * NUM_OF_THREADS * 16;
int main(int argc, char *argv[])
{
    srand((unsigned int)time(NULL));
    size_t size = NUM_OF_BLOCKS * NUM_OF_THREADS * 16;

  half2 *vec[2];
  half2 *devVec[2];
    half2 *vec[2];
    half2 *devVec[2];

  float *results;
  float *devResults;
    float *results;
    float *devResults;

  int devID = findCudaDevice(argc, (const char **)argv);
    int devID = findCudaDevice(argc, (const char **)argv);

  cudaDeviceProp devProp;
  checkCudaErrors(cudaGetDeviceProperties(&devProp, devID));
    cudaDeviceProp devProp;
    checkCudaErrors(cudaGetDeviceProperties(&devProp, devID));

  if (devProp.major < 5 || (devProp.major == 5 && devProp.minor < 3)) {
    printf(
        "ERROR: fp16ScalarProduct requires GPU devices with compute SM 5.3 or "
        "higher.\n");
    return EXIT_WAIVED;
  }
    if (devProp.major < 5 || (devProp.major == 5 && devProp.minor < 3)) {
        printf("ERROR: fp16ScalarProduct requires GPU devices with compute SM 5.3 or "
               "higher.\n");
        return EXIT_WAIVED;
    }

  for (int i = 0; i < 2; ++i) {
    checkCudaErrors(cudaMallocHost((void **)&vec[i], size * sizeof *vec[i]));
    checkCudaErrors(cudaMalloc((void **)&devVec[i], size * sizeof *devVec[i]));
  }
    for (int i = 0; i < 2; ++i) {
        checkCudaErrors(cudaMallocHost((void **)&vec[i], size * sizeof *vec[i]));
        checkCudaErrors(cudaMalloc((void **)&devVec[i], size * sizeof *devVec[i]));
    }

  checkCudaErrors(
      cudaMallocHost((void **)&results, NUM_OF_BLOCKS * sizeof *results));
  checkCudaErrors(
      cudaMalloc((void **)&devResults, NUM_OF_BLOCKS * sizeof *devResults));
    checkCudaErrors(cudaMallocHost((void **)&results, NUM_OF_BLOCKS * sizeof *results));
    checkCudaErrors(cudaMalloc((void **)&devResults, NUM_OF_BLOCKS * sizeof *devResults));

  for (int i = 0; i < 2; ++i) {
    generateInput(vec[i], size);
    checkCudaErrors(cudaMemcpy(devVec[i], vec[i], size * sizeof *vec[i],
                               cudaMemcpyHostToDevice));
  }
    for (int i = 0; i < 2; ++i) {
        generateInput(vec[i], size);
        checkCudaErrors(cudaMemcpy(devVec[i], vec[i], size * sizeof *vec[i], cudaMemcpyHostToDevice));
    }

  scalarProductKernel_native<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(
      devVec[0], devVec[1], devResults, size);
    scalarProductKernel_native<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(devVec[0], devVec[1], devResults, size);

  checkCudaErrors(cudaMemcpy(results, devResults,
                             NUM_OF_BLOCKS * sizeof *results,
                             cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(results, devResults, NUM_OF_BLOCKS * sizeof *results, cudaMemcpyDeviceToHost));

  float result_native = 0;
  for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
    result_native += results[i];
  }
  printf("Result native operators\t: %f \n", result_native);
    float result_native = 0;
    for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
        result_native += results[i];
    }
    printf("Result native operators\t: %f \n", result_native);

  scalarProductKernel_intrinsics<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(
      devVec[0], devVec[1], devResults, size);
    scalarProductKernel_intrinsics<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(devVec[0], devVec[1], devResults, size);

  checkCudaErrors(cudaMemcpy(results, devResults,
                             NUM_OF_BLOCKS * sizeof *results,
                             cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(results, devResults, NUM_OF_BLOCKS * sizeof *results, cudaMemcpyDeviceToHost));

  float result_intrinsics = 0;
  for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
    result_intrinsics += results[i];
  }
  printf("Result intrinsics\t: %f \n", result_intrinsics);
    float result_intrinsics = 0;
    for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
        result_intrinsics += results[i];
    }
    printf("Result intrinsics\t: %f \n", result_intrinsics);

  printf("&&&& fp16ScalarProduct %s\n",
         (fabs(result_intrinsics - result_native) < 0.00001) ? "PASSED"
                                                             : "FAILED");
    printf("&&&& fp16ScalarProduct %s\n", (fabs(result_intrinsics - result_native) < 0.00001) ? "PASSED" : "FAILED");

  for (int i = 0; i < 2; ++i) {
    checkCudaErrors(cudaFree(devVec[i]));
    checkCudaErrors(cudaFreeHost(vec[i]));
  }
    for (int i = 0; i < 2; ++i) {
        checkCudaErrors(cudaFree(devVec[i]));
        checkCudaErrors(cudaFreeHost(vec[i]));
    }

  checkCudaErrors(cudaFree(devResults));
  checkCudaErrors(cudaFreeHost(results));
    checkCudaErrors(cudaFree(devResults));
    checkCudaErrors(cudaFreeHost(results));

  return EXIT_SUCCESS;
    return EXIT_SUCCESS;
}

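Each half2 packs two fp16 values, so the kernels above compute a dot product over 2 * size scalars in a grid-stride loop starting at threadIdx.x + blockDim.x * blockIdx.x, with the per-block partial sums added on the host. A hedged fp32 reference for that sum follows; it assumes the public x/y members of half2 and the host-callable __half2float conversion from cuda_fp16.h.

// Hedged fp32 reference for the half2 dot product; not part of the sample.
#include "cuda_fp16.h"
#include <cstddef>

float scalarProductHost(const half2 *a, const half2 *b, size_t size)
{
    float sum = 0.f;
    for (size_t i = 0; i < size; ++i) {
        // Each half2 contributes its low and high fp16 lanes.
        sum += __half2float(a[i].x) * __half2float(b[i].x);
        sum += __half2float(a[i].y) * __half2float(b[i].y);
    }
    return sum;
}
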
@@ -40,314 +40,303 @@
 */

// System includes
#include <stdio.h>
#include <assert.h>
#include <stdio.h>

// CUDA runtime
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>

// Helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>
#include <helper_functions.h>

/**
 * Matrix multiplication (CUDA Kernel) on the device: C = A * B
 * wA is A's width and wB is B's width
 */
template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A,
    float *B, int wA,
    int wB) {
  // Block index
  int bx = blockIdx.x;
  int by = blockIdx.y;
template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;

  // Thread index
  int tx = threadIdx.x;
  int ty = threadIdx.y;
    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

  // Index of the first sub-matrix of A processed by the block
  int aBegin = wA * BLOCK_SIZE * by;
    // Index of the first sub-matrix of A processed by the block
    int aBegin = wA * BLOCK_SIZE * by;

  // Index of the last sub-matrix of A processed by the block
  int aEnd   = aBegin + wA - 1;
    // Index of the last sub-matrix of A processed by the block
    int aEnd = aBegin + wA - 1;

  // Step size used to iterate through the sub-matrices of A
  int aStep  = BLOCK_SIZE;
    // Step size used to iterate through the sub-matrices of A
    int aStep = BLOCK_SIZE;

  // Index of the first sub-matrix of B processed by the block
  int bBegin = BLOCK_SIZE * bx;
    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;

  // Step size used to iterate through the sub-matrices of B
  int bStep  = BLOCK_SIZE * wB;
    // Step size used to iterate through the sub-matrices of B
    int bStep = BLOCK_SIZE * wB;

  // Csub is used to store the element of the block sub-matrix
  // that is computed by the thread
  float Csub = 0;
    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;

  // Loop over all the sub-matrices of A and B
  // required to compute the block sub-matrix
  for (int a = aBegin, b = bBegin;
       a <= aEnd;
       a += aStep, b += bStep) {
    // Declaration of the shared memory array As used to
    // store the sub-matrix of A
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
        // Declaration of the shared memory array As used to
        // store the sub-matrix of A
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

    // Declaration of the shared memory array Bs used to
    // store the sub-matrix of B
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
        // Declaration of the shared memory array Bs used to
        // store the sub-matrix of B
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    // Load the matrices from device memory
    // to shared memory; each thread loads
    // one element of each matrix
    As[ty][tx] = A[a + wA * ty + tx];
    Bs[ty][tx] = B[b + wB * ty + tx];
        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        As[ty][tx] = A[a + wA * ty + tx];
        Bs[ty][tx] = B[b + wB * ty + tx];

    // Synchronize to make sure the matrices are loaded
    __syncthreads();
        // Synchronize to make sure the matrices are loaded
        __syncthreads();

    // Multiply the two matrices together;
    // each thread computes one element
    // of the block sub-matrix
        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
#pragma unroll

    for (int k = 0; k < BLOCK_SIZE; ++k) {
      Csub += As[ty][k] * Bs[k][tx];
    }
        for (int k = 0; k < BLOCK_SIZE; ++k) {
            Csub += As[ty][k] * Bs[k][tx];
        }

    // Synchronize to make sure that the preceding
    // computation is done before loading two new
    // sub-matrices of A and B in the next iteration
    __syncthreads();
  }
        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }

  // Write the block sub-matrix to device memory;
  // each thread writes one element
  int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
  C[c + wB * ty + tx] = Csub;
    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int c               = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
}

void ConstantInit(float *data, int size, float val) {
  for (int i = 0; i < size; ++i) {
    data[i] = val;
  }
void ConstantInit(float *data, int size, float val)
{
    for (int i = 0; i < size; ++i) {
        data[i] = val;
    }
}

/**
 * Run a simple test of matrix multiplication using CUDA
 */
int MatrixMultiply(int argc, char **argv,
                   int block_size, const dim3 &dimsA,
                   const dim3 &dimsB) {
  // Allocate host memory for matrices A and B
  unsigned int size_A = dimsA.x * dimsA.y;
  unsigned int mem_size_A = sizeof(float) * size_A;
  float *h_A;
  checkCudaErrors(cudaMallocHost(&h_A, mem_size_A));
  unsigned int size_B = dimsB.x * dimsB.y;
  unsigned int mem_size_B = sizeof(float) * size_B;
  float *h_B;
  checkCudaErrors(cudaMallocHost(&h_B, mem_size_B));
  cudaStream_t stream;
int MatrixMultiply(int argc, char **argv, int block_size, const dim3 &dimsA, const dim3 &dimsB)
{
    // Allocate host memory for matrices A and B
    unsigned int size_A     = dimsA.x * dimsA.y;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float       *h_A;
    checkCudaErrors(cudaMallocHost(&h_A, mem_size_A));
    unsigned int size_B     = dimsB.x * dimsB.y;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float       *h_B;
    checkCudaErrors(cudaMallocHost(&h_B, mem_size_B));
    cudaStream_t stream;

  // Initialize host memory
  const float valB = 0.01f;
  ConstantInit(h_A, size_A, 1.0f);
  ConstantInit(h_B, size_B, valB);
    // Initialize host memory
    const float valB = 0.01f;
    ConstantInit(h_A, size_A, 1.0f);
    ConstantInit(h_B, size_B, valB);

  // Allocate device memory
  float *d_A, *d_B, *d_C;
    // Allocate device memory
    float *d_A, *d_B, *d_C;

  // Allocate host matrix C
  dim3 dimsC(dimsB.x, dimsA.y, 1);
  unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
  float *h_C;
  checkCudaErrors(cudaMallocHost(&h_C, mem_size_C));
    // Allocate host matrix C
    dim3         dimsC(dimsB.x, dimsA.y, 1);
    unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
    float       *h_C;
    checkCudaErrors(cudaMallocHost(&h_C, mem_size_C));

  if (h_C == NULL) {
    fprintf(stderr, "Failed to allocate host matrix C!\n");
    exit(EXIT_FAILURE);
  }
    if (h_C == NULL) {
        fprintf(stderr, "Failed to allocate host matrix C!\n");
        exit(EXIT_FAILURE);
    }

  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A));
  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B));
  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C));
  // Allocate CUDA events that we'll use for timing
  cudaEvent_t start, stop;
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));
    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A));
    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B));
    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C));
    // Allocate CUDA events that we'll use for timing
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

  // copy host memory to device
  checkCudaErrors(
      cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
  checkCudaErrors(
      cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));
    // copy host memory to device
    checkCudaErrors(cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));

  // Setup execution parameters
  dim3 threads(block_size, block_size);
  dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
    // Setup execution parameters
    dim3 threads(block_size, block_size);
    dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);

  // Create and start timer
  printf("Computing result using CUDA Kernel...\n");
    // Create and start timer
    printf("Computing result using CUDA Kernel...\n");

  // Performs warmup operation using matrixMul CUDA kernel
  if (block_size == 16) {
    MatrixMulCUDA<16>
        <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
  } else {
    MatrixMulCUDA<32>
        <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
  }

  printf("done\n");
  checkCudaErrors(cudaStreamSynchronize(stream));

  // Record the start event
  checkCudaErrors(cudaEventRecord(start, stream));

  // Execute the kernel
  int nIter = 300;

  for (int j = 0; j < nIter; j++) {
    if (block_size == 16) {
      MatrixMulCUDA<16>
          <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    } else {
      MatrixMulCUDA<32>
          <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }
  }

  // Record the stop event
  checkCudaErrors(cudaEventRecord(stop, stream));

  // Wait for the stop event to complete
  checkCudaErrors(cudaEventSynchronize(stop));

  float msecTotal = 0.0f;
  checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));

  // Compute and print the performance
  float msecPerMatrixMul = msecTotal / nIter;
  double flopsPerMatrixMul = 2.0 * static_cast<double>(dimsA.x) *
                             static_cast<double>(dimsA.y) *
                             static_cast<double>(dimsB.x);
  double gigaFlops =
      (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
  printf(
      "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
      " WorkgroupSize= %u threads/block\n",
      gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threads.x * threads.y);

  // Copy result from device to host
  checkCudaErrors(
      cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));

  printf("Checking computed result for correctness: ");
  bool correct = true;

  // test relative error by the formula
  //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
  double eps = 1.e-6;  // machine zero

  for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++) {
    double abs_err = fabs(h_C[i] - (dimsA.x * valB));
    double dot_length = dimsA.x;
    double abs_val = fabs(h_C[i]);
    double rel_err = abs_err / abs_val / dot_length;

    if (rel_err > eps) {
      printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n",
             i, h_C[i], dimsA.x * valB, eps);
      correct = false;
    }
  }

  printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

  // Clean up memory
  checkCudaErrors(cudaFreeHost(h_A));
  checkCudaErrors(cudaFreeHost(h_B));
  checkCudaErrors(cudaFreeHost(h_C));
  checkCudaErrors(cudaFree(d_A));
  checkCudaErrors(cudaFree(d_B));
  checkCudaErrors(cudaFree(d_C));
  checkCudaErrors(cudaEventDestroy(start));
  checkCudaErrors(cudaEventDestroy(stop));
  printf(
      "\nNOTE: The CUDA Samples are not meant for performance "
      "measurements. Results may vary when GPU Boost is enabled.\n");

  if (correct) {
    return EXIT_SUCCESS;
  } else {
    return EXIT_FAILURE;
  }
    // Performs warmup operation using matrixMul CUDA kernel
    if (block_size == 16) {
        MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }
    else {
        MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }

    printf("done\n");
    checkCudaErrors(cudaStreamSynchronize(stream));

    // Record the start event
    checkCudaErrors(cudaEventRecord(start, stream));

    // Execute the kernel
    int nIter = 300;

    for (int j = 0; j < nIter; j++) {
        if (block_size == 16) {
            MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
        else {
            MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
    }

    // Record the stop event
    checkCudaErrors(cudaEventRecord(stop, stream));

    // Wait for the stop event to complete
    checkCudaErrors(cudaEventSynchronize(stop));

    float msecTotal = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));

    // Compute and print the performance
    float  msecPerMatrixMul = msecTotal / nIter;
    double flopsPerMatrixMul =
        2.0 * static_cast<double>(dimsA.x) * static_cast<double>(dimsA.y) * static_cast<double>(dimsB.x);
    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
           " WorkgroupSize= %u threads/block\n",
           gigaFlops,
           msecPerMatrixMul,
           flopsPerMatrixMul,
           threads.x * threads.y);

    // Copy result from device to host
    checkCudaErrors(cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));

    printf("Checking computed result for correctness: ");
    bool correct = true;

    // test relative error by the formula
    //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
    double eps = 1.e-6; // machine zero

    for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++) {
        double abs_err    = fabs(h_C[i] - (dimsA.x * valB));
        double dot_length = dimsA.x;
        double abs_val    = fabs(h_C[i]);
        double rel_err    = abs_err / abs_val / dot_length;

        if (rel_err > eps) {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
            correct = false;
        }
    }

    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    // Clean up memory
    checkCudaErrors(cudaFreeHost(h_A));
    checkCudaErrors(cudaFreeHost(h_B));
    checkCudaErrors(cudaFreeHost(h_C));
    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_B));
    checkCudaErrors(cudaFree(d_C));
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    printf("\nNOTE: The CUDA Samples are not meant for performance "
           "measurements. Results may vary when GPU Boost is enabled.\n");

    if (correct) {
        return EXIT_SUCCESS;
    }
    else {
        return EXIT_FAILURE;
    }
}


/**
 * Program main
 */
int main(int argc, char **argv) {
  printf("[Matrix Multiply Using CUDA] - Starting...\n");
int main(int argc, char **argv)
{
    printf("[Matrix Multiply Using CUDA] - Starting...\n");

  if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
      checkCmdLineFlag(argc, (const char **)argv, "?")) {
    printf("Usage -device=n (n >= 0 for deviceID)\n");
    printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
    printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
    printf("  Note: Outer matrix dimensions of A & B matrices" \
           " must be equal.\n");
    if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) {
        printf("Usage -device=n (n >= 0 for deviceID)\n");
        printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
        printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
        printf("  Note: Outer matrix dimensions of A & B matrices"
               " must be equal.\n");

    exit(EXIT_SUCCESS);
  }
        exit(EXIT_SUCCESS);
    }

  // This will pick the best possible CUDA capable device, otherwise
  // override the device ID based on input provided at the command line
  int dev = findCudaDevice(argc, (const char **)argv);
    // This will pick the best possible CUDA capable device, otherwise
    // override the device ID based on input provided at the command line
    int dev = findCudaDevice(argc, (const char **)argv);

  int block_size = 32;
    int block_size = 32;

  dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
  dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);
    dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
    dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);

  // width of Matrix A
  if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
    dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
  }
    // width of Matrix A
    if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
        dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
    }

  // height of Matrix A
  if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
    dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
  }
    // height of Matrix A
    if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
        dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
    }

  // width of Matrix B
  if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
    dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
  }
    // width of Matrix B
    if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
        dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
    }

  // height of Matrix B
  if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
    dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
  }
    // height of Matrix B
    if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
        dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
    }

  if (dimsA.x != dimsB.y) {
    printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
           dimsA.x, dimsB.y);
    exit(EXIT_FAILURE);
  }
    if (dimsA.x != dimsB.y) {
        printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
        exit(EXIT_FAILURE);
    }

  printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y,
         dimsB.x, dimsB.y);
    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);

  checkCudaErrors(cudaProfilerStart());
  int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB);
  checkCudaErrors(cudaProfilerStop());
    checkCudaErrors(cudaProfilerStart());
    int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB);
    checkCudaErrors(cudaProfilerStop());

  exit(matrix_result);
    exit(matrix_result);
}

@@ -30,11 +30,11 @@

// Matrix dimensions
// (chosen as multiples of the thread block size for simplicity)
#define WA (4 * block_size)  // Matrix A width
#define HA (6 * block_size)  // Matrix A height
#define WB (4 * block_size)  // Matrix B width
#define HB WA                // Matrix B height
#define WC WB                // Matrix C width
#define HC HA                // Matrix C height
#define WA (4 * block_size) // Matrix A width
#define HA (6 * block_size) // Matrix A height
#define WB (4 * block_size) // Matrix B width
#define HB WA               // Matrix B height
#define WC WB               // Matrix C width
#define HC HA               // Matrix C height

#endif  // _MATRIXMUL_H_
#endif // _MATRIXMUL_H_

@@ -46,23 +46,23 @@

// includes, system
#include <builtin_types.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <cstring>
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes, project, CUDA
#include <cstring>
#include <cuda.h>
#include <helper_cuda_drvapi.h>
#include <helper_image.h>
#include <helper_string.h>
#include <helper_timer.h>

#include <cstring>
#include <iostream>
#include <string>

#include "matrixMul.h"


@@ -71,11 +71,9 @@
void runTest(int argc, char **argv);
void randomInit(float *, int);

extern "C" void computeGold(float *, const float *, const float *, unsigned int,
                            unsigned int, unsigned int);
extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);

static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
                    int *blk_size);
static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size);

#ifndef FATBIN_FILE
#define FATBIN_FILE "matrixMul_kernel64.fatbin"
@@ -84,237 +82,252 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
////////////////////////////////////////////////////////////////////////////////
// Globals
////////////////////////////////////////////////////////////////////////////////
CUdevice cuDevice;
CUdevice  cuDevice;
CUcontext cuContext;
CUmodule cuModule;
size_t totalGlobalMem;
CUmodule  cuModule;
size_t    totalGlobalMem;

const char *sSDKsample = "matrixMulDrv (Driver API)";

void constantInit(float *data, int size, float val) {
  for (int i = 0; i < size; ++i) {
    data[i] = val;
  }
void constantInit(float *data, int size, float val)
{
    for (int i = 0; i < size; ++i) {
        data[i] = val;
    }
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  printf("[ %s ]\n", sSDKsample);
int main(int argc, char **argv)
{
    printf("[ %s ]\n", sSDKsample);

  runTest(argc, argv);
    runTest(argc, argv);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) {
  // initialize CUDA
  CUfunction matrixMul = NULL;
  int block_size = 0;
void runTest(int argc, char **argv)
{
    // initialize CUDA
    CUfunction matrixMul  = NULL;
    int        block_size = 0;

  initCUDA(argc, argv, &matrixMul, &block_size);
    initCUDA(argc, argv, &matrixMul, &block_size);

  // set seed for rand()
  srand(2006);
    // set seed for rand()
    srand(2006);

  // allocate host memory for matrices A and B
  unsigned int size_A = WA * HA;
  unsigned int mem_size_A = sizeof(float) * size_A;
  float *h_A = reinterpret_cast<float *>(malloc(mem_size_A));
  unsigned int size_B = WB * HB;
  unsigned int mem_size_B = sizeof(float) * size_B;
  float *h_B = reinterpret_cast<float *>(malloc(mem_size_B));
    // allocate host memory for matrices A and B
    unsigned int size_A     = WA * HA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float       *h_A        = reinterpret_cast<float *>(malloc(mem_size_A));
    unsigned int size_B     = WB * HB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float       *h_B        = reinterpret_cast<float *>(malloc(mem_size_B));

  // initialize host memory
  const float valB = 0.01f;
  constantInit(h_A, size_A, 1.0f);
  constantInit(h_B, size_B, valB);
    // initialize host memory
    const float valB = 0.01f;
    constantInit(h_A, size_A, 1.0f);
    constantInit(h_B, size_B, valB);

  // allocate device memory
  CUdeviceptr d_A;
  checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
  CUdeviceptr d_B;
  checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
    // allocate device memory
    CUdeviceptr d_A;
    checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
    CUdeviceptr d_B;
    checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));

  // copy host memory to device
  checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
  checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));
    // copy host memory to device
    checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
    checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));

  // allocate device memory for result
  size_t size_C = WC * HC;
  size_t mem_size_C = sizeof(float) * size_C;
    // allocate device memory for result
    size_t size_C     = WC * HC;
    size_t mem_size_C = sizeof(float) * size_C;

  CUdeviceptr d_C;
  checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
    CUdeviceptr d_C;
    checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));

  // allocate mem for the result on host side
  float *h_C = reinterpret_cast<float *>(malloc(mem_size_C));
    // allocate mem for the result on host side
    float *h_C = reinterpret_cast<float *>(malloc(mem_size_C));

  // create and start timer
  StopWatchInterface *timer = NULL;
  sdkCreateTimer(&timer);
    // create and start timer
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);

  // start the timer
  sdkStartTimer(&timer);
    // start the timer
    sdkStartTimer(&timer);

  // There are two ways to launch CUDA kernels via the Driver API.
  // In this CUDA Sample, we illustrate both ways to pass parameters
  // and specify parameters.  By default we use the simpler method.
  dim3 block(block_size, block_size, 1);
  dim3 grid(WC / block_size, HC / block_size, 1);
    // There are two ways to launch CUDA kernels via the Driver API.
    // In this CUDA Sample, we illustrate both ways to pass parameters
    // and specify parameters.  By default we use the simpler method.
    dim3 block(block_size, block_size, 1);
    dim3 grid(WC / block_size, HC / block_size, 1);

  if (1) {
    // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
    // Launching (simpler method)
    size_t Matrix_Width_A = (size_t)WA;
    size_t Matrix_Width_B = (size_t)WB;
    void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
    // new CUDA 4.0 Driver API Kernel launch call
    checkCudaErrors(cuLaunchKernel(
        matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
        2 * block_size * block_size * sizeof(float), NULL, args, NULL));
  } else {
    // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
    // Launching (advanced method)
    int offset = 0;
    char argBuffer[256];

    // pass in launch parameters (not actually de-referencing CUdeviceptr).
    // CUdeviceptr is storing the value of the parameters
    *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_C;
    offset += sizeof(d_C);
    *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_A;
    offset += sizeof(d_A);
    *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_B;
    offset += sizeof(d_B);

    size_t Matrix_Width_A = (size_t)WA;
    size_t Matrix_Width_B = (size_t)WB;

    *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_A;
    offset += sizeof(Matrix_Width_A);
    *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_B;
    offset += sizeof(Matrix_Width_B);

    void *kernel_launch_config[5] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
                                     CU_LAUNCH_PARAM_BUFFER_SIZE, &offset,
                                     CU_LAUNCH_PARAM_END};

    // new CUDA 4.0 Driver API Kernel launch call
    checkCudaErrors(cuLaunchKernel(
        matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
        2 * block_size * block_size * sizeof(float), NULL, NULL,
        reinterpret_cast<void **>(&kernel_launch_config)));
  }

  // copy result from device to host
  checkCudaErrors(cuMemcpyDtoH(reinterpret_cast<void *>(h_C), d_C, mem_size_C));

  // stop and destroy timer
  sdkStopTimer(&timer);
  printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
  sdkDeleteTimer(&timer);

  printf("Checking computed result for correctness: ");
  bool correct = true;

  for (int i = 0; i < static_cast<int>(WC * HC); i++) {
    if (fabs(h_C[i] - (WA * valB)) > 1e-5) {
      printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i,
             h_C[i], WA * valB);
      correct = false;
    if (1) {
        // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
        // Launching (simpler method)
        size_t Matrix_Width_A = (size_t)WA;
        size_t Matrix_Width_B = (size_t)WB;
        void  *args[5]        = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
        // new CUDA 4.0 Driver API Kernel launch call
        checkCudaErrors(cuLaunchKernel(matrixMul,
                                       grid.x,
                                       grid.y,
                                       grid.z,
                                       block.x,
                                       block.y,
                                       block.z,
                                       2 * block_size * block_size * sizeof(float),
                                       NULL,
                                       args,
                                       NULL));
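
        // Note (editor): in this kernelParams form, each entry of args[] is a
        // host pointer to one kernel argument; the driver reads each value
        // using the size and alignment it knows from the kernel's signature,
        // so no manual packing is required.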
    }
  }
    else {
        // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
        // Launching (advanced method)
        int  offset = 0;
        char argBuffer[256];

  printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
        // pass in launch parameters (not actually de-referencing CUdeviceptr).
        // CUdeviceptr is storing the value of the parameters
        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_C;
        offset += sizeof(d_C);
        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_A;
        offset += sizeof(d_A);
        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_B;
        offset += sizeof(d_B);

  printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
         "Results may vary when GPU Boost is enabled.\n");
        size_t Matrix_Width_A = (size_t)WA;
        size_t Matrix_Width_B = (size_t)WB;

  // clean up memory
  free(h_A);
  free(h_B);
  free(h_C);
  checkCudaErrors(cuMemFree(d_A));
  checkCudaErrors(cuMemFree(d_B));
  checkCudaErrors(cuMemFree(d_C));
  checkCudaErrors(cuCtxDestroy(cuContext));
        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_A;
        offset += sizeof(Matrix_Width_A);
        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_B;
        offset += sizeof(Matrix_Width_B);

        void *kernel_launch_config[5] = {
            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END};
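
        // Note (editor): this "extra" options form hands the driver one packed
        // byte buffer instead of per-argument pointers: CU_LAUNCH_PARAM_BUFFER_POINTER
        // names the buffer, CU_LAUNCH_PARAM_BUFFER_SIZE gives its length in bytes,
        // and CU_LAUNCH_PARAM_END terminates the list, leaving layout and
        // alignment of every argument to the caller.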

        // new CUDA 4.0 Driver API Kernel launch call
        checkCudaErrors(cuLaunchKernel(matrixMul,
                                       grid.x,
                                       grid.y,
                                       grid.z,
                                       block.x,
                                       block.y,
                                       block.z,
                                       2 * block_size * block_size * sizeof(float),
                                       NULL,
                                       NULL,
                                       reinterpret_cast<void **>(&kernel_launch_config)));
    }

    // copy result from device to host
    checkCudaErrors(cuMemcpyDtoH(reinterpret_cast<void *>(h_C), d_C, mem_size_C));

    // stop and destroy timer
    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    printf("Checking computed result for correctness: ");
    bool correct = true;

    for (int i = 0; i < static_cast<int>(WC * HC); i++) {
        if (fabs(h_C[i] - (WA * valB)) > 1e-5) {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, h_C[i], WA * valB);
            correct = false;
        }
    }
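
    // Note (editor): with h_A filled with 1.0f and h_B with valB, every element
    // of C is a dot product of WA ones with WA copies of valB, so the expected
    // value is exactly WA * valB for the whole matrix.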

    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
           "Results may vary when GPU Boost is enabled.\n");

    // clean up memory
    free(h_A);
    free(h_B);
    free(h_C);
    checkCudaErrors(cuMemFree(d_A));
    checkCudaErrors(cuMemFree(d_B));
    checkCudaErrors(cuMemFree(d_C));
    checkCudaErrors(cuCtxDestroy(cuContext));
}

// Allocates a matrix with random float entries.
void randomInit(float *data, int size) {
  for (int i = 0; i < size; ++i) {
    data[i] = rand() / static_cast<float>(RAND_MAX);
  }
}

static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
                    int *blk_size) {
  CUfunction cuFunction = 0;
  int major = 0, minor = 0;
  char deviceName[100];

  cuDevice = findCudaDeviceDRV(argc, (const char **)argv);

  // get compute capabilities and the devicename
  checkCudaErrors(cuDeviceGetAttribute(
      &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
  checkCudaErrors(cuDeviceGetAttribute(
      &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
  checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
  printf("> GPU Device has SM %d.%d compute capability\n", major, minor);

  checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, cuDevice));
  printf("  Total amount of global memory:     %llu bytes\n",
         (long long unsigned int)totalGlobalMem);

  checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));

  // first search for the module path before we load the results
  std::string module_path;
  std::ostringstream fatbin;

  if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
    exit(EXIT_FAILURE);
  } else {
    printf("> initCUDA loading module: <%s>\n", module_path.c_str());
  }

  if (!fatbin.str().size()) {
    printf("fatbin file empty. exiting..\n");
    exit(EXIT_FAILURE);
  }

  // Create module from binary file (FATBIN)
  checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

  // select the suitable kernel function
  const char *kernels[] = {"matrixMul_bs32_64bit", "matrixMul_bs16_64bit",
                           "matrixMul_bs8_64bit"};

  int idx = 0;
  int block_size = 32;
  while (idx < 3) {
    int threadsPerBlock = 0;
    int blocksPerGrid = 0;

    checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, kernels[idx]));
    checkCudaErrors(cuOccupancyMaxPotentialBlockSize(
        &blocksPerGrid, &threadsPerBlock, cuFunction, 0,
        2 * block_size * block_size * sizeof(float), 0));
    if (block_size * block_size <= threadsPerBlock) {
      printf("> %d block size selected\n", block_size);
      break;
    } else {
      block_size /= 2;
void randomInit(float *data, int size)
{
    for (int i = 0; i < size; ++i) {
        data[i] = rand() / static_cast<float>(RAND_MAX);
    }
    idx++;
  }

  *pMatrixMul = cuFunction;
  *blk_size = block_size;

  return 0;
}

static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size)
{
    CUfunction cuFunction = 0;
    int        major = 0, minor = 0;
    char       deviceName[100];

    cuDevice = findCudaDeviceDRV(argc, (const char **)argv);

    // get compute capabilities and the devicename
    checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
    checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
    checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
    printf("> GPU Device has SM %d.%d compute capability\n", major, minor);

    checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, cuDevice));
    printf("  Total amount of global memory:     %llu bytes\n", (long long unsigned int)totalGlobalMem);

    checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));

    // first search for the module path before we load the results
    std::string        module_path;
    std::ostringstream fatbin;

    if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
        exit(EXIT_FAILURE);
    }
    else {
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }

    if (!fatbin.str().size()) {
        printf("fatbin file empty. exiting..\n");
        exit(EXIT_FAILURE);
    }

    // Create module from binary file (FATBIN)
    checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

    // select the suitable kernel function
    const char *kernels[] = {"matrixMul_bs32_64bit", "matrixMul_bs16_64bit", "matrixMul_bs8_64bit"};

    int idx        = 0;
    int block_size = 32;
    while (idx < 3) {
        int threadsPerBlock = 0;
        int blocksPerGrid   = 0;

        checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, kernels[idx]));
        checkCudaErrors(cuOccupancyMaxPotentialBlockSize(
            &blocksPerGrid, &threadsPerBlock, cuFunction, 0, 2 * block_size * block_size * sizeof(float), 0));
        if (block_size * block_size <= threadsPerBlock) {
            printf("> %d block size selected\n", block_size);
            break;
        }
        else {
            block_size /= 2;
        }
        idx++;
    }
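
    // Note (editor): the loop above walks the candidate kernels from the largest
    // tile (32x32 = 1024 threads) down to 8x8 and keeps the first block size whose
    // block_size * block_size thread count fits within the occupancy-suggested
    // threadsPerBlock for this device.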

    *pMatrixMul = cuFunction;
    *blk_size   = block_size;

    return 0;
}

@@ -42,86 +42,87 @@
//! wA is A's width and wB is B's width
////////////////////////////////////////////////////////////////////////////////
template <int block_size, typename size_type>
__device__ void matrixMul(float *C, float *A, float *B, size_type wA,
                          size_type wB) {
  // Block index
  size_type bx = blockIdx.x;
  size_type by = blockIdx.y;
__device__ void matrixMul(float *C, float *A, float *B, size_type wA, size_type wB)
{
    // Block index
    size_type bx = blockIdx.x;
    size_type by = blockIdx.y;

  // Thread index
  size_type tx = threadIdx.x;
  size_type ty = threadIdx.y;
    // Thread index
    size_type tx = threadIdx.x;
    size_type ty = threadIdx.y;

  // Index of the first sub-matrix of A processed by the block
  size_type aBegin = wA * block_size * by;
    // Index of the first sub-matrix of A processed by the block
    size_type aBegin = wA * block_size * by;

  // Index of the last sub-matrix of A processed by the block
  size_type aEnd = aBegin + wA - 1;
    // Index of the last sub-matrix of A processed by the block
    size_type aEnd = aBegin + wA - 1;

  // Step size used to iterate through the sub-matrices of A
  size_type aStep = block_size;
    // Step size used to iterate through the sub-matrices of A
    size_type aStep = block_size;

  // Index of the first sub-matrix of B processed by the block
  size_type bBegin = block_size * bx;
    // Index of the first sub-matrix of B processed by the block
    size_type bBegin = block_size * bx;

  // Step size used to iterate through the sub-matrices of B
  size_type bStep = block_size * wB;
    // Step size used to iterate through the sub-matrices of B
    size_type bStep = block_size * wB;

  // Csub is used to store the element of the block sub-matrix
  // that is computed by the thread
  float Csub = 0;
    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;

  // Loop over all the sub-matrices of A and B
  // required to compute the block sub-matrix
  for (size_type a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
    // Declaration of the shared memory array As used to
    // store the sub-matrix of A
    __shared__ float As[block_size][block_size];
    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (size_type a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
        // Declaration of the shared memory array As used to
        // store the sub-matrix of A
        __shared__ float As[block_size][block_size];

    // Declaration of the shared memory array Bs used to
    // store the sub-matrix of B
    __shared__ float Bs[block_size][block_size];
        // Declaration of the shared memory array Bs used to
        // store the sub-matrix of B
        __shared__ float Bs[block_size][block_size];

    // Load the matrices from device memory
    // to shared memory; each thread loads
    // one element of each matrix
    AS(ty, tx) = A[a + wA * ty + tx];
    BS(ty, tx) = B[b + wB * ty + tx];
        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        AS(ty, tx) = A[a + wA * ty + tx];
        BS(ty, tx) = B[b + wB * ty + tx];

    // Synchronize to make sure the matrices are loaded
    __syncthreads();
        // Synchronize to make sure the matrices are loaded
        __syncthreads();

    // Multiply the two matrices together;
    // each thread computes one element
    // of the block sub-matrix
        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
#pragma unroll

    for (size_type k = 0; k < block_size; ++k) Csub += AS(ty, k) * BS(k, tx);
        for (size_type k = 0; k < block_size; ++k)
            Csub += AS(ty, k) * BS(k, tx);

    // Synchronize to make sure that the preceding
    // computation is done before loading two new
    // sub-matrices of A and B in the next iteration
    __syncthreads();
  }
        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }

  // Write the block sub-matrix to device memory;
  // each thread writes one element
  size_type c = wB * block_size * by + block_size * bx;
  C[c + wB * ty + tx] = Csub;
    // Write the block sub-matrix to device memory;
    // each thread writes one element
    size_type c         = wB * block_size * by + block_size * bx;
    C[c + wB * ty + tx] = Csub;
}
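
// Note (editor): each thread block walks one row of tiles of A and one column
// of tiles of B, staging block_size x block_size sub-matrices in shared memory
// so that every element fetched from global memory is reused block_size times.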

// C wrappers around our template kernel
extern "C" __global__ void matrixMul_bs8_64bit(float *C, float *A, float *B,
                                               size_t wA, size_t wB) {
  matrixMul<8, size_t>(C, A, B, wA, wB);
extern "C" __global__ void matrixMul_bs8_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
{
    matrixMul<8, size_t>(C, A, B, wA, wB);
}
extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B,
                                                size_t wA, size_t wB) {
  matrixMul<16, size_t>(C, A, B, wA, wB);
extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
{
    matrixMul<16, size_t>(C, A, B, wA, wB);
}
extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B,
                                                size_t wA, size_t wB) {
  matrixMul<32, size_t>(C, A, B, wA, wB);
extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
{
    matrixMul<32, size_t>(C, A, B, wA, wB);
}

#endif  // #ifndef _MATRIXMUL_KERNEL_H_
#endif // #ifndef _MATRIXMUL_KERNEL_H_

@@ -15,210 +15,211 @@

// With these flags defined, this source file will dynamically
// load the corresponding functions.  Disabled by default.
//#define CUDA_INIT_D3D9
//#define CUDA_INIT_D3D10
//#define CUDA_INIT_D3D11
//#define CUDA_INIT_OPENGL
// #define CUDA_INIT_D3D9
// #define CUDA_INIT_D3D10
// #define CUDA_INIT_D3D11
// #define CUDA_INIT_OPENGL

#include <stdio.h>
#include "cuda_drvapi_dynlink.h"

tcuInit                               *_cuInit;
tcuDriverGetVersion                   *cuDriverGetVersion;
tcuDeviceGet                          *cuDeviceGet;
tcuDeviceGetCount                     *cuDeviceGetCount;
tcuDeviceGetName                      *cuDeviceGetName;
tcuDeviceComputeCapability            *cuDeviceComputeCapability;
tcuDeviceTotalMem                     *cuDeviceTotalMem;
tcuDeviceGetProperties                *cuDeviceGetProperties;
tcuDeviceGetAttribute                 *cuDeviceGetAttribute;
tcuGetErrorString                     *cuGetErrorString;
tcuCtxCreate                          *cuCtxCreate;
tcuCtxDestroy                         *cuCtxDestroy;
tcuCtxAttach                          *cuCtxAttach;
tcuCtxDetach                          *cuCtxDetach;
tcuCtxPushCurrent                     *cuCtxPushCurrent;
tcuCtxPopCurrent                      *cuCtxPopCurrent;
tcuCtxGetCurrent                      *cuCtxGetCurrent;
tcuCtxSetCurrent                      *cuCtxSetCurrent;
tcuCtxGetDevice                       *cuCtxGetDevice;
tcuCtxSynchronize                     *cuCtxSynchronize;
tcuModuleLoad                         *cuModuleLoad;
tcuModuleLoadData                     *cuModuleLoadData;
tcuModuleLoadDataEx                   *cuModuleLoadDataEx;
tcuModuleLoadFatBinary                *cuModuleLoadFatBinary;
tcuModuleUnload                       *cuModuleUnload;
tcuModuleGetFunction                  *cuModuleGetFunction;
tcuModuleGetGlobal                    *cuModuleGetGlobal;
tcuModuleGetTexRef                    *cuModuleGetTexRef;
tcuModuleGetSurfRef                   *cuModuleGetSurfRef;
tcuMemGetInfo                         *cuMemGetInfo;
tcuMemAlloc                           *cuMemAlloc;
tcuMemAllocPitch                      *cuMemAllocPitch;
tcuMemFree                            *cuMemFree;
tcuMemGetAddressRange                 *cuMemGetAddressRange;
tcuMemAllocHost                       *cuMemAllocHost;
tcuMemFreeHost                        *cuMemFreeHost;
tcuMemHostAlloc                       *cuMemHostAlloc;
tcuMemHostGetFlags                    *cuMemHostGetFlags;
#include <stdio.h>

tcuMemHostGetDevicePointer            *cuMemHostGetDevicePointer;
tcuDeviceGetByPCIBusId                *cuDeviceGetByPCIBusId;
tcuDeviceGetPCIBusId                  *cuDeviceGetPCIBusId;
tcuIpcGetEventHandle                  *cuIpcGetEventHandle;
tcuIpcOpenEventHandle                 *cuIpcOpenEventHandle;
tcuIpcGetMemHandle                    *cuIpcGetMemHandle;
tcuIpcOpenMemHandle                   *cuIpcOpenMemHandle;
tcuIpcCloseMemHandle                  *cuIpcCloseMemHandle;
tcuInit                    *_cuInit;
tcuDriverGetVersion        *cuDriverGetVersion;
tcuDeviceGet               *cuDeviceGet;
tcuDeviceGetCount          *cuDeviceGetCount;
tcuDeviceGetName           *cuDeviceGetName;
tcuDeviceComputeCapability *cuDeviceComputeCapability;
tcuDeviceTotalMem          *cuDeviceTotalMem;
tcuDeviceGetProperties     *cuDeviceGetProperties;
tcuDeviceGetAttribute      *cuDeviceGetAttribute;
tcuGetErrorString          *cuGetErrorString;
tcuCtxCreate               *cuCtxCreate;
tcuCtxDestroy              *cuCtxDestroy;
tcuCtxAttach               *cuCtxAttach;
tcuCtxDetach               *cuCtxDetach;
tcuCtxPushCurrent          *cuCtxPushCurrent;
tcuCtxPopCurrent           *cuCtxPopCurrent;
tcuCtxGetCurrent           *cuCtxGetCurrent;
tcuCtxSetCurrent           *cuCtxSetCurrent;
tcuCtxGetDevice            *cuCtxGetDevice;
tcuCtxSynchronize          *cuCtxSynchronize;
tcuModuleLoad              *cuModuleLoad;
tcuModuleLoadData          *cuModuleLoadData;
tcuModuleLoadDataEx        *cuModuleLoadDataEx;
tcuModuleLoadFatBinary     *cuModuleLoadFatBinary;
tcuModuleUnload            *cuModuleUnload;
tcuModuleGetFunction       *cuModuleGetFunction;
tcuModuleGetGlobal         *cuModuleGetGlobal;
tcuModuleGetTexRef         *cuModuleGetTexRef;
tcuModuleGetSurfRef        *cuModuleGetSurfRef;
tcuMemGetInfo              *cuMemGetInfo;
tcuMemAlloc                *cuMemAlloc;
tcuMemAllocPitch           *cuMemAllocPitch;
tcuMemFree                 *cuMemFree;
tcuMemGetAddressRange      *cuMemGetAddressRange;
tcuMemAllocHost            *cuMemAllocHost;
tcuMemFreeHost             *cuMemFreeHost;
tcuMemHostAlloc            *cuMemHostAlloc;
tcuMemHostGetFlags         *cuMemHostGetFlags;

tcuMemHostRegister                    *cuMemHostRegister;
tcuMemHostUnregister                  *cuMemHostUnregister;
tcuMemcpyHtoD                         *cuMemcpyHtoD;
tcuMemcpyDtoH                         *cuMemcpyDtoH;
tcuMemcpyDtoD                         *cuMemcpyDtoD;
tcuMemcpyDtoA                         *cuMemcpyDtoA;
tcuMemcpyAtoD                         *cuMemcpyAtoD;
tcuMemcpyHtoA                         *cuMemcpyHtoA;
tcuMemcpyAtoH                         *cuMemcpyAtoH;
tcuMemcpyAtoA                         *cuMemcpyAtoA;
tcuMemcpy2D                           *cuMemcpy2D;
tcuMemcpy2DUnaligned                  *cuMemcpy2DUnaligned;
tcuMemcpy3D                           *cuMemcpy3D;
tcuMemcpyHtoDAsync                    *cuMemcpyHtoDAsync;
tcuMemcpyDtoHAsync                    *cuMemcpyDtoHAsync;
tcuMemcpyDtoDAsync                    *cuMemcpyDtoDAsync;
tcuMemcpyHtoAAsync                    *cuMemcpyHtoAAsync;
tcuMemcpyAtoHAsync                    *cuMemcpyAtoHAsync;
tcuMemcpy2DAsync                      *cuMemcpy2DAsync;
tcuMemcpy3DAsync                      *cuMemcpy3DAsync;
tcuMemcpy                             *cuMemcpy;
tcuMemcpyPeer                         *cuMemcpyPeer;
tcuMemsetD8                           *cuMemsetD8;
tcuMemsetD16                          *cuMemsetD16;
tcuMemsetD32                          *cuMemsetD32;
tcuMemsetD2D8                         *cuMemsetD2D8;
tcuMemsetD2D16                        *cuMemsetD2D16;
tcuMemsetD2D32                        *cuMemsetD2D32;
tcuFuncSetBlockShape                  *cuFuncSetBlockShape;
tcuFuncSetSharedSize                  *cuFuncSetSharedSize;
tcuFuncGetAttribute                   *cuFuncGetAttribute;
tcuFuncSetCacheConfig                 *cuFuncSetCacheConfig;
tcuFuncSetSharedMemConfig             *cuFuncSetSharedMemConfig;
tcuLaunchKernel                       *cuLaunchKernel;
tcuArrayCreate                        *cuArrayCreate;
tcuArrayGetDescriptor                 *cuArrayGetDescriptor;
tcuArrayDestroy                       *cuArrayDestroy;
tcuArray3DCreate                      *cuArray3DCreate;
tcuArray3DGetDescriptor               *cuArray3DGetDescriptor;
tcuTexRefCreate                       *cuTexRefCreate;
tcuTexRefDestroy                      *cuTexRefDestroy;
tcuTexRefSetArray                     *cuTexRefSetArray;
tcuTexRefSetAddress                   *cuTexRefSetAddress;
tcuTexRefSetAddress2D                 *cuTexRefSetAddress2D;
tcuTexRefSetFormat                    *cuTexRefSetFormat;
tcuTexRefSetAddressMode               *cuTexRefSetAddressMode;
tcuTexRefSetFilterMode                *cuTexRefSetFilterMode;
tcuTexRefSetFlags                     *cuTexRefSetFlags;
tcuTexRefGetAddress                   *cuTexRefGetAddress;
tcuTexRefGetArray                     *cuTexRefGetArray;
tcuTexRefGetAddressMode               *cuTexRefGetAddressMode;
tcuTexRefGetFilterMode                *cuTexRefGetFilterMode;
tcuTexRefGetFormat                    *cuTexRefGetFormat;
tcuTexRefGetFlags                     *cuTexRefGetFlags;
tcuSurfRefSetArray                    *cuSurfRefSetArray;
tcuSurfRefGetArray                    *cuSurfRefGetArray;
tcuParamSetSize                       *cuParamSetSize;
tcuParamSeti                          *cuParamSeti;
tcuParamSetf                          *cuParamSetf;
tcuParamSetv                          *cuParamSetv;
tcuParamSetTexRef                     *cuParamSetTexRef;
tcuLaunch                             *cuLaunch;
tcuLaunchGrid                         *cuLaunchGrid;
tcuLaunchGridAsync                    *cuLaunchGridAsync;
tcuEventCreate                        *cuEventCreate;
tcuEventRecord                        *cuEventRecord;
tcuEventQuery                         *cuEventQuery;
tcuEventSynchronize                   *cuEventSynchronize;
tcuEventDestroy                       *cuEventDestroy;
tcuEventElapsedTime                   *cuEventElapsedTime;
tcuStreamCreate                       *cuStreamCreate;
tcuStreamWaitEvent                    *cuStreamWaitEvent;
tcuStreamAddCallback                  *cuStreamAddCallback;
tcuStreamQuery                        *cuStreamQuery;
tcuStreamSynchronize                  *cuStreamSynchronize;
tcuStreamDestroy                      *cuStreamDestroy;
tcuGraphicsUnregisterResource         *cuGraphicsUnregisterResource;
tcuGraphicsSubResourceGetMappedArray  *cuGraphicsSubResourceGetMappedArray;
tcuGraphicsResourceGetMappedPointer   *cuGraphicsResourceGetMappedPointer;
tcuGraphicsResourceSetMapFlags        *cuGraphicsResourceSetMapFlags;
tcuGraphicsMapResources               *cuGraphicsMapResources;
tcuGraphicsUnmapResources             *cuGraphicsUnmapResources;
tcuGetExportTable                     *cuGetExportTable;
tcuCtxSetLimit                        *cuCtxSetLimit;
tcuCtxGetLimit                        *cuCtxGetLimit;
tcuCtxGetCacheConfig                  *cuCtxGetCacheConfig;
tcuCtxSetCacheConfig                  *cuCtxSetCacheConfig;
tcuCtxGetSharedMemConfig              *cuCtxGetSharedMemConfig;
tcuCtxSetSharedMemConfig              *cuCtxSetSharedMemConfig;
tcuCtxGetApiVersion                   *cuCtxGetApiVersion;
tcuMemHostGetDevicePointer *cuMemHostGetDevicePointer;
tcuDeviceGetByPCIBusId     *cuDeviceGetByPCIBusId;
tcuDeviceGetPCIBusId       *cuDeviceGetPCIBusId;
tcuIpcGetEventHandle       *cuIpcGetEventHandle;
tcuIpcOpenEventHandle      *cuIpcOpenEventHandle;
tcuIpcGetMemHandle         *cuIpcGetMemHandle;
tcuIpcOpenMemHandle        *cuIpcOpenMemHandle;
tcuIpcCloseMemHandle       *cuIpcCloseMemHandle;

tcuMipmappedArrayCreate               *cuMipmappedArrayCreate;
tcuMipmappedArrayGetLevel             *cuMipmappedArrayGetLevel;
tcuMipmappedArrayDestroy              *cuMipmappedArrayDestroy;
tcuMemHostRegister                   *cuMemHostRegister;
tcuMemHostUnregister                 *cuMemHostUnregister;
tcuMemcpyHtoD                        *cuMemcpyHtoD;
tcuMemcpyDtoH                        *cuMemcpyDtoH;
tcuMemcpyDtoD                        *cuMemcpyDtoD;
tcuMemcpyDtoA                        *cuMemcpyDtoA;
tcuMemcpyAtoD                        *cuMemcpyAtoD;
tcuMemcpyHtoA                        *cuMemcpyHtoA;
tcuMemcpyAtoH                        *cuMemcpyAtoH;
tcuMemcpyAtoA                        *cuMemcpyAtoA;
tcuMemcpy2D                          *cuMemcpy2D;
tcuMemcpy2DUnaligned                 *cuMemcpy2DUnaligned;
tcuMemcpy3D                          *cuMemcpy3D;
tcuMemcpyHtoDAsync                   *cuMemcpyHtoDAsync;
tcuMemcpyDtoHAsync                   *cuMemcpyDtoHAsync;
tcuMemcpyDtoDAsync                   *cuMemcpyDtoDAsync;
tcuMemcpyHtoAAsync                   *cuMemcpyHtoAAsync;
tcuMemcpyAtoHAsync                   *cuMemcpyAtoHAsync;
tcuMemcpy2DAsync                     *cuMemcpy2DAsync;
tcuMemcpy3DAsync                     *cuMemcpy3DAsync;
tcuMemcpy                            *cuMemcpy;
tcuMemcpyPeer                        *cuMemcpyPeer;
tcuMemsetD8                          *cuMemsetD8;
tcuMemsetD16                         *cuMemsetD16;
tcuMemsetD32                         *cuMemsetD32;
tcuMemsetD2D8                        *cuMemsetD2D8;
tcuMemsetD2D16                       *cuMemsetD2D16;
tcuMemsetD2D32                       *cuMemsetD2D32;
tcuFuncSetBlockShape                 *cuFuncSetBlockShape;
tcuFuncSetSharedSize                 *cuFuncSetSharedSize;
tcuFuncGetAttribute                  *cuFuncGetAttribute;
tcuFuncSetCacheConfig                *cuFuncSetCacheConfig;
tcuFuncSetSharedMemConfig            *cuFuncSetSharedMemConfig;
tcuLaunchKernel                      *cuLaunchKernel;
tcuArrayCreate                       *cuArrayCreate;
tcuArrayGetDescriptor                *cuArrayGetDescriptor;
tcuArrayDestroy                      *cuArrayDestroy;
tcuArray3DCreate                     *cuArray3DCreate;
tcuArray3DGetDescriptor              *cuArray3DGetDescriptor;
tcuTexRefCreate                      *cuTexRefCreate;
tcuTexRefDestroy                     *cuTexRefDestroy;
tcuTexRefSetArray                    *cuTexRefSetArray;
tcuTexRefSetAddress                  *cuTexRefSetAddress;
tcuTexRefSetAddress2D                *cuTexRefSetAddress2D;
tcuTexRefSetFormat                   *cuTexRefSetFormat;
tcuTexRefSetAddressMode              *cuTexRefSetAddressMode;
tcuTexRefSetFilterMode               *cuTexRefSetFilterMode;
tcuTexRefSetFlags                    *cuTexRefSetFlags;
tcuTexRefGetAddress                  *cuTexRefGetAddress;
tcuTexRefGetArray                    *cuTexRefGetArray;
tcuTexRefGetAddressMode              *cuTexRefGetAddressMode;
tcuTexRefGetFilterMode               *cuTexRefGetFilterMode;
tcuTexRefGetFormat                   *cuTexRefGetFormat;
tcuTexRefGetFlags                    *cuTexRefGetFlags;
tcuSurfRefSetArray                   *cuSurfRefSetArray;
tcuSurfRefGetArray                   *cuSurfRefGetArray;
tcuParamSetSize                      *cuParamSetSize;
tcuParamSeti                         *cuParamSeti;
tcuParamSetf                         *cuParamSetf;
tcuParamSetv                         *cuParamSetv;
tcuParamSetTexRef                    *cuParamSetTexRef;
tcuLaunch                            *cuLaunch;
tcuLaunchGrid                        *cuLaunchGrid;
tcuLaunchGridAsync                   *cuLaunchGridAsync;
tcuEventCreate                       *cuEventCreate;
tcuEventRecord                       *cuEventRecord;
tcuEventQuery                        *cuEventQuery;
tcuEventSynchronize                  *cuEventSynchronize;
tcuEventDestroy                      *cuEventDestroy;
tcuEventElapsedTime                  *cuEventElapsedTime;
tcuStreamCreate                      *cuStreamCreate;
tcuStreamWaitEvent                   *cuStreamWaitEvent;
tcuStreamAddCallback                 *cuStreamAddCallback;
tcuStreamQuery                       *cuStreamQuery;
tcuStreamSynchronize                 *cuStreamSynchronize;
tcuStreamDestroy                     *cuStreamDestroy;
tcuGraphicsUnregisterResource        *cuGraphicsUnregisterResource;
tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray;
tcuGraphicsResourceGetMappedPointer  *cuGraphicsResourceGetMappedPointer;
tcuGraphicsResourceSetMapFlags       *cuGraphicsResourceSetMapFlags;
tcuGraphicsMapResources              *cuGraphicsMapResources;
tcuGraphicsUnmapResources            *cuGraphicsUnmapResources;
tcuGetExportTable                    *cuGetExportTable;
tcuCtxSetLimit                       *cuCtxSetLimit;
tcuCtxGetLimit                       *cuCtxGetLimit;
tcuCtxGetCacheConfig                 *cuCtxGetCacheConfig;
tcuCtxSetCacheConfig                 *cuCtxSetCacheConfig;
tcuCtxGetSharedMemConfig             *cuCtxGetSharedMemConfig;
tcuCtxSetSharedMemConfig             *cuCtxSetSharedMemConfig;
tcuCtxGetApiVersion                  *cuCtxGetApiVersion;

tcuProfilerStop                       *cuProfilerStop;
tcuMipmappedArrayCreate   *cuMipmappedArrayCreate;
tcuMipmappedArrayGetLevel *cuMipmappedArrayGetLevel;
tcuMipmappedArrayDestroy  *cuMipmappedArrayDestroy;

tcuProfilerStop *cuProfilerStop;
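
// Note (editor): each pointer above shadows the Driver API entry point of the
// same name; cuda_drvapi_dynlink resolves them at runtime from the driver
// library (dlopen / GetProcAddress) rather than linking against libcuda directly.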
 | 
			
		||||
 | 
			
		||||
#ifdef CUDA_INIT_D3D9
// D3D9/CUDA interop (CUDA 1.x compatible API). These functions
// are deprecated; please use the ones below
tcuD3D9Begin                          *cuD3D9Begin;
tcuD3D9End                            *cuD3DEnd;
tcuD3D9RegisterVertexBuffer           *cuD3D9RegisterVertexBuffer;
tcuD3D9MapVertexBuffer                *cuD3D9MapVertexBuffer;
tcuD3D9UnmapVertexBuffer              *cuD3D9UnmapVertexBuffer;
tcuD3D9UnregisterVertexBuffer         *cuD3D9UnregisterVertexBuffer;
tcuD3D9Begin                  *cuD3D9Begin;
tcuD3D9End                    *cuD3DEnd;
tcuD3D9RegisterVertexBuffer   *cuD3D9RegisterVertexBuffer;
tcuD3D9MapVertexBuffer        *cuD3D9MapVertexBuffer;
tcuD3D9UnmapVertexBuffer      *cuD3D9UnmapVertexBuffer;
tcuD3D9UnregisterVertexBuffer *cuD3D9UnregisterVertexBuffer;

// D3D9/CUDA interop (CUDA 2.x compatible)
tcuD3D9GetDirect3DDevice              *cuD3D9GetDirect3DDevice;
tcuD3D9RegisterResource               *cuD3D9RegisterResource;
tcuD3D9UnregisterResource             *cuD3D9UnregisterResource;
tcuD3D9MapResources                   *cuD3D9MapResources;
tcuD3D9UnmapResources                 *cuD3D9UnmapResources;
tcuD3D9ResourceSetMapFlags            *cuD3D9ResourceSetMapFlags;
tcuD3D9ResourceGetSurfaceDimensions   *cuD3D9ResourceGetSurfaceDimensions;
tcuD3D9ResourceGetMappedArray         *cuD3D9ResourceGetMappedArray;
tcuD3D9ResourceGetMappedPointer       *cuD3D9ResourceGetMappedPointer;
tcuD3D9ResourceGetMappedSize          *cuD3D9ResourceGetMappedSize;
tcuD3D9ResourceGetMappedPitch         *cuD3D9ResourceGetMappedPitch;
tcuD3D9GetDirect3DDevice            *cuD3D9GetDirect3DDevice;
tcuD3D9RegisterResource             *cuD3D9RegisterResource;
tcuD3D9UnregisterResource           *cuD3D9UnregisterResource;
tcuD3D9MapResources                 *cuD3D9MapResources;
tcuD3D9UnmapResources               *cuD3D9UnmapResources;
tcuD3D9ResourceSetMapFlags          *cuD3D9ResourceSetMapFlags;
tcuD3D9ResourceGetSurfaceDimensions *cuD3D9ResourceGetSurfaceDimensions;
tcuD3D9ResourceGetMappedArray       *cuD3D9ResourceGetMappedArray;
tcuD3D9ResourceGetMappedPointer     *cuD3D9ResourceGetMappedPointer;
tcuD3D9ResourceGetMappedSize        *cuD3D9ResourceGetMappedSize;
tcuD3D9ResourceGetMappedPitch       *cuD3D9ResourceGetMappedPitch;

// D3D9/CUDA interop (CUDA 2.0+)
tcuD3D9GetDevice                      *cuD3D9GetDevice;
tcuD3D9CtxCreate                      *cuD3D9CtxCreate;
tcuGraphicsD3D9RegisterResource       *cuGraphicsD3D9RegisterResource;
tcuD3D9GetDevice                *cuD3D9GetDevice;
tcuD3D9CtxCreate                *cuD3D9CtxCreate;
tcuGraphicsD3D9RegisterResource *cuGraphicsD3D9RegisterResource;
#endif

#ifdef CUDA_INIT_D3D10
// D3D10/CUDA interop (CUDA 3.0+)
tcuD3D10GetDevice                     *cuD3D10GetDevice;
tcuD3D10CtxCreate                     *cuD3D10CtxCreate;
tcuGraphicsD3D10RegisterResource      *cuGraphicsD3D10RegisterResource;
tcuD3D10GetDevice                *cuD3D10GetDevice;
tcuD3D10CtxCreate                *cuD3D10CtxCreate;
tcuGraphicsD3D10RegisterResource *cuGraphicsD3D10RegisterResource;
#endif


#ifdef CUDA_INIT_D3D11
// D3D11/CUDA interop (CUDA 3.0+)
tcuD3D11GetDevice                     *cuD3D11GetDevice;
tcuD3D11CtxCreate                     *cuD3D11CtxCreate;
tcuGraphicsD3D11RegisterResource      *cuGraphicsD3D11RegisterResource;
tcuD3D11GetDevice                *cuD3D11GetDevice;
tcuD3D11CtxCreate                *cuD3D11CtxCreate;
tcuGraphicsD3D11RegisterResource *cuGraphicsD3D11RegisterResource;
#endif

// GL/CUDA interop
#ifdef CUDA_INIT_OPENGL
tcuGLCtxCreate                        *cuGLCtxCreate;
tcuGraphicsGLRegisterBuffer           *cuGraphicsGLRegisterBuffer;
tcuGraphicsGLRegisterImage            *cuGraphicsGLRegisterImage;
tcuGLCtxCreate              *cuGLCtxCreate;
tcuGraphicsGLRegisterBuffer *cuGraphicsGLRegisterBuffer;
tcuGraphicsGLRegisterImage  *cuGraphicsGLRegisterImage;
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
tcuWGLGetDevice                       *cuWGLGetDevice;
tcuWGLGetDevice *cuWGLGetDevice;
#endif
#endif

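Editor's note on the declarations above: each `tcuX` is a typedef of the corresponding driver entry point's signature, and the matching `*cuX` pointer is filled in at runtime by the GET_PROC machinery in the hunks that follow. A minimal sketch of the pattern, using a hypothetical entry point `cuFoo` (the real typedefs live earlier in cuda_drvapi_dynlink.h):

    // Hedged sketch with a hypothetical name, for illustration only.
    typedef CUresult CUDAAPI tcuFoo(int param); // signature of the driver symbol
    tcuFoo *cuFoo;                              // resolved later via GET_PROC(cuFoo)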
@@ -239,8 +240,7 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
{
    *pInstance = LoadLibrary(__CudaLibName);

    if (*pInstance == NULL)
    {
    if (*pInstance == NULL) {
        printf("LoadLibrary \"%s\" failed!\n", __CudaLibName);
        return CUDA_ERROR_UNKNOWN;
    }
@@ -248,38 +248,35 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
    return CUDA_SUCCESS;
}

#define GET_PROC_EX(name, alias, required)                     \
    alias = (t##name *)GetProcAddress(CudaDrvLib, #name);               \
    if (alias == NULL && required) {                                    \
        printf("Failed to find required function \"%s\" in %s\n",       \
               #name, __CudaLibName);                                  \
        return CUDA_ERROR_UNKNOWN;                                      \
#define GET_PROC_EX(name, alias, required)                                               \
    alias = (t##name *)GetProcAddress(CudaDrvLib, #name);                                \
    if (alias == NULL && required) {                                                     \
        printf("Failed to find required function \"%s\" in %s\n", #name, __CudaLibName); \
        return CUDA_ERROR_UNKNOWN;                                                       \
    }

#define GET_PROC_EX_V2(name, alias, required)                           \
    alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v2));\
    if (alias == NULL && required) {                                    \
        printf("Failed to find required function \"%s\" in %s\n",       \
               STRINGIFY(name##_v2), __CudaLibName);                       \
        return CUDA_ERROR_UNKNOWN;                                      \
#define GET_PROC_EX_V2(name, alias, required)                                                           \
    alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v2));                                \
    if (alias == NULL && required) {                                                                    \
        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v2), __CudaLibName); \
        return CUDA_ERROR_UNKNOWN;                                                                      \
    }

#define GET_PROC_EX_V3(name, alias, required)                           \
    alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v3));\
    if (alias == NULL && required) {                                    \
        printf("Failed to find required function \"%s\" in %s\n",       \
               STRINGIFY(name##_v3), __CudaLibName);                       \
        return CUDA_ERROR_UNKNOWN;                                      \
#define GET_PROC_EX_V3(name, alias, required)                                                           \
    alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v3));                                \
    if (alias == NULL && required) {                                                                    \
        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v3), __CudaLibName); \
        return CUDA_ERROR_UNKNOWN;                                                                      \
    }

#elif defined(__unix__) || defined (__QNX__) || defined(__APPLE__) || defined(__MACOSX)
#elif defined(__unix__) || defined(__QNX__) || defined(__APPLE__) || defined(__MACOSX)

#include <dlfcn.h>

#if defined(__APPLE__) || defined(__MACOSX)
static char __CudaLibName[] = "/usr/local/cuda/lib/libcuda.dylib";
#elif defined(__ANDROID__)
#if defined (__aarch64__)
#if defined(__aarch64__)
static char __CudaLibName[] = "/system/vendor/lib64/libcuda.so";
#elif defined(__arm__)
static char __CudaLibName[] = "/system/vendor/lib/libcuda.so";
@@ -294,8 +291,7 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
{
    *pInstance = dlopen(__CudaLibName, RTLD_NOW);

    if (*pInstance == NULL)
    {
    if (*pInstance == NULL) {
        printf("dlopen \"%s\" failed!\n", __CudaLibName);
        return CUDA_ERROR_UNKNOWN;
    }
@@ -303,52 +299,49 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
    return CUDA_SUCCESS;
}

#define GET_PROC_EX(name, alias, required)                              \
    alias = (t##name *)dlsym(CudaDrvLib, #name);                        \
    if (alias == NULL && required) {                                    \
        printf("Failed to find required function \"%s\" in %s\n",       \
               #name, __CudaLibName);                                  \
        return CUDA_ERROR_UNKNOWN;                                      \
#define GET_PROC_EX(name, alias, required)                                               \
    alias = (t##name *)dlsym(CudaDrvLib, #name);                                         \
    if (alias == NULL && required) {                                                     \
        printf("Failed to find required function \"%s\" in %s\n", #name, __CudaLibName); \
        return CUDA_ERROR_UNKNOWN;                                                       \
    }

#define GET_PROC_EX_V2(name, alias, required)                           \
    alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v2));         \
    if (alias == NULL && required) {                                    \
        printf("Failed to find required function \"%s\" in %s\n",       \
               STRINGIFY(name##_v2), __CudaLibName);                    \
        return CUDA_ERROR_UNKNOWN;                                      \
#define GET_PROC_EX_V2(name, alias, required)                                                           \
    alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v2));                                         \
    if (alias == NULL && required) {                                                                    \
        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v2), __CudaLibName); \
        return CUDA_ERROR_UNKNOWN;                                                                      \
    }

#define GET_PROC_EX_V3(name, alias, required)                           \
    alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v3));         \
    if (alias == NULL && required) {                                    \
        printf("Failed to find required function \"%s\" in %s\n",       \
               STRINGIFY(name##_v3), __CudaLibName);                    \
        return CUDA_ERROR_UNKNOWN;                                      \
#define GET_PROC_EX_V3(name, alias, required)                                                           \
    alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v3));                                         \
    if (alias == NULL && required) {                                                                    \
        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v3), __CudaLibName); \
        return CUDA_ERROR_UNKNOWN;                                                                      \
    }

#else
#error unsupported platform
#endif
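Editor's note: to make the macro mechanics concrete, this is roughly what `GET_PROC_EX(cuStreamCreate, cuStreamCreate, 1)` expands to on the dlsym path above; `t##name` pastes the typedef name and `#name` stringifies the symbol:

    cuStreamCreate = (tcuStreamCreate *)dlsym(CudaDrvLib, "cuStreamCreate");
    if (cuStreamCreate == NULL && 1) {
        printf("Failed to find required function \"%s\" in %s\n", "cuStreamCreate", __CudaLibName);
        return CUDA_ERROR_UNKNOWN;
    }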

#define CHECKED_CALL(call)              \
    do {                                \
        CUresult result = (call);       \
        if (CUDA_SUCCESS != result) {   \
            return result;              \
        }                               \
    } while(0)
#define CHECKED_CALL(call)            \
    do {                              \
        CUresult result = (call);     \
        if (CUDA_SUCCESS != result) { \
            return result;            \
        }                             \
    } while (0)

#define GET_PROC_REQUIRED(name) GET_PROC_EX(name,name,1)
#define GET_PROC_OPTIONAL(name) GET_PROC_EX(name,name,0)
#define GET_PROC_REQUIRED(name) GET_PROC_EX(name, name, 1)
#define GET_PROC_OPTIONAL(name) GET_PROC_EX(name, name, 0)
#define GET_PROC(name)          GET_PROC_REQUIRED(name)
#define GET_PROC_V2(name)       GET_PROC_EX_V2(name,name,1)
#define GET_PROC_V3(name)       GET_PROC_EX_V3(name,name,1)
#define GET_PROC_V2(name)       GET_PROC_EX_V2(name, name, 1)
#define GET_PROC_V3(name)       GET_PROC_EX_V3(name, name, 1)

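Editor's note: CHECKED_CALL uses the classic do { } while (0) wrapper so the macro behaves as a single statement wherever it is used. A small sketch of why that matters:

    // Without the do/while(0) wrapper, the semicolon after the macro call
    // would break an if/else like this one:
    if (useDriver)                                 // hypothetical flag
        CHECKED_CALL(LOAD_LIBRARY(&CudaDrvLib));   // expands to one statement
    else
        printf("skipping driver load\n");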
CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
{
    CUDADRIVER CudaDrvLib;
    int driverVer = 1000;
    int        driverVer = 1000;

    CHECKED_CALL(LOAD_LIBRARY(&CudaDrvLib));

@@ -359,8 +352,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
    // available since 2.2. if not present, version 1.0 is assumed
    GET_PROC_OPTIONAL(cuDriverGetVersion);

    if (cuDriverGetVersion)
    {
    if (cuDriverGetVersion) {
        CHECKED_CALL(cuDriverGetVersion(&driverVer));
    }

@@ -428,24 +420,21 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
    GET_PROC(cuStreamDestroy);

    // These are CUDA 5.0 new functions
    if (driverVer >= 5000)
    {
    if (driverVer >= 5000) {
        GET_PROC(cuMipmappedArrayCreate);
        GET_PROC(cuMipmappedArrayDestroy);
        GET_PROC(cuMipmappedArrayGetLevel);
    }

    // These are CUDA 4.2 new functions
    if (driverVer >= 4020)
    {
    if (driverVer >= 4020) {
        GET_PROC(cuFuncSetSharedMemConfig);
        GET_PROC(cuCtxGetSharedMemConfig);
        GET_PROC(cuCtxSetSharedMemConfig);
    }

    // These are CUDA 4.1 new functions
    if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010)
    {
    if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) {
        GET_PROC(cuDeviceGetByPCIBusId);
        GET_PROC(cuDeviceGetPCIBusId);
        GET_PROC(cuIpcGetEventHandle);
@@ -456,8 +445,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
    }

    // These could be _v2 interfaces
    if (cudaVersion >= 4000 && __CUDA_API_VERSION >= 4000)
    {
    if (cudaVersion >= 4000 && __CUDA_API_VERSION >= 4000) {
        GET_PROC_V2(cuCtxDestroy);
        GET_PROC_V2(cuCtxPopCurrent);
        GET_PROC_V2(cuCtxPushCurrent);
@@ -465,8 +453,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
        GET_PROC_V2(cuEventDestroy);
    }

    if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020)
    {
    if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) {
        GET_PROC_V2(cuDeviceTotalMem);
        GET_PROC_V2(cuCtxCreate);
        GET_PROC_V2(cuModuleGetGlobal);
@@ -507,17 +494,14 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
        GET_PROC_V2(cuTexRefSetAddress);
        GET_PROC_V2(cuTexRefGetAddress);

        if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010)
        {
        if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) {
            GET_PROC_V3(cuTexRefSetAddress2D);
        }
        else
        {
        else {
            GET_PROC_V2(cuTexRefSetAddress2D);
        }
    }
    else
    {
    else {
        // versions earlier than 3020
        GET_PROC(cuDeviceTotalMem);
        GET_PROC(cuCtxCreate);
@@ -562,8 +546,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
    }

    // The following functions are specific to CUDA versions
    if (driverVer >= 4000)
    {
    if (driverVer >= 4000) {
        GET_PROC(cuCtxSetCurrent);
        GET_PROC(cuCtxGetCurrent);
        GET_PROC(cuMemHostRegister);
@@ -574,8 +557,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
        GET_PROC(cuProfilerStop);
    }

    if (driverVer >= 3010)
    {
    if (driverVer >= 3010) {
        GET_PROC(cuModuleGetSurfRef);
        GET_PROC(cuSurfRefSetArray);
        GET_PROC(cuSurfRefGetArray);
@@ -583,8 +565,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
        GET_PROC(cuCtxGetLimit);
    }

    if (driverVer >= 3000)
    {
    if (driverVer >= 3000) {
        GET_PROC(cuMemcpyDtoDAsync);
        GET_PROC(cuFuncSetCacheConfig);
#ifdef CUDA_INIT_D3D11
@@ -595,12 +576,10 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
        GET_PROC(cuGraphicsUnregisterResource);
        GET_PROC(cuGraphicsSubResourceGetMappedArray);

        if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020)
        {
        if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) {
            GET_PROC_V2(cuGraphicsResourceGetMappedPointer);
        }
        else
        {
        else {
            GET_PROC(cuGraphicsResourceGetMappedPointer);
        }

@@ -610,8 +589,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
        GET_PROC(cuGetExportTable);
    }

    if (driverVer >= 2030)
    {
    if (driverVer >= 2030) {
        GET_PROC(cuMemHostGetFlags);
#ifdef CUDA_INIT_D3D10
        GET_PROC(cuD3D10GetDevice);
@@ -624,17 +602,16 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
#endif
    }

    if (driverVer >= 2010)
    {
    if (driverVer >= 2010) {
        GET_PROC(cuModuleLoadDataEx);
        GET_PROC(cuModuleLoadFatBinary);
#ifdef CUDA_INIT_OPENGL
        GET_PROC(cuGLCtxCreate);
        GET_PROC(cuGraphicsGLRegisterBuffer);
        GET_PROC(cuGraphicsGLRegisterImage);
#  ifdef WIN32
#ifdef WIN32
        GET_PROC(cuWGLGetDevice);
#  endif
#endif
#endif
#ifdef CUDA_INIT_D3D9
        GET_PROC(cuD3D9GetDevice);

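Editor's note: taken together, the hunks above implement one init sequence: load the driver library, optionally resolve cuDriverGetVersion, then gate each batch of GET_PROC calls on the reported version. A condensed sketch of that control flow (it abbreviates the real cuInit; the local must be named CudaDrvLib because the GET_PROC macros reference it):

    CUDADRIVER CudaDrvLib;
    int driverVer = 1000;                  // version 1.0 assumed if the query is absent
    CHECKED_CALL(LOAD_LIBRARY(&CudaDrvLib));
    GET_PROC_OPTIONAL(cuDriverGetVersion); // available since CUDA 2.2
    if (cuDriverGetVersion) {
        CHECKED_CALL(cuDriverGetVersion(&driverVer));
    }
    if (driverVer >= 5000) {               // only resolve symbols this driver exports
        GET_PROC(cuMipmappedArrayCreate);
    }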
(File diff suppressed because it is too large)
@@ -14,21 +14,17 @@
#ifndef HELPER_CUDA_DRVAPI_H
#define HELPER_CUDA_DRVAPI_H

#include <helper_string.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <helper_string.h>

#ifndef MAX
#define MAX(a, b) (a > b ? a : b)
#endif

#ifndef HELPER_CUDA_DRVAPI_H
inline int ftoi(float value) {
  return (value >= 0 ? static_cast<int>(value + 0.5)
                     : static_cast<int>(value - 0.5));
}
inline int ftoi(float value) { return (value >= 0 ? static_cast<int>(value + 0.5) : static_cast<int>(value - 0.5)); }
#endif

#ifndef EXIT_WAIVED
@@ -47,311 +43,302 @@ inline int ftoi(float value) {
#define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)

// These are the inline versions for all of the SDK helper functions
inline void __checkCudaErrors(CUresult err, const char *file, const int line) {
  if (CUDA_SUCCESS != err) {
    const char *errorStr = NULL;
    cuGetErrorString(err, &errorStr);
    fprintf(stderr,
            "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, "
            "line %i.\n",
            err, errorStr, file, line);
    exit(EXIT_FAILURE);
  }
inline void __checkCudaErrors(CUresult err, const char *file, const int line)
{
    if (CUDA_SUCCESS != err) {
        const char *errorStr = NULL;
        cuGetErrorString(err, &errorStr);
        fprintf(stderr,
                "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, "
                "line %i.\n",
                err,
                errorStr,
                file,
                line);
        exit(EXIT_FAILURE);
    }
}
#endif

// This function wraps the CUDA Driver API into a template function
template <class T>
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute,
                             int device) {
  checkCudaErrors(cuDeviceGetAttribute(attribute, device_attribute, device));
template <class T> inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
    checkCudaErrors(cuDeviceGetAttribute(attribute, device_attribute, device));
}
#endif

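Editor's note: a short usage sketch for the two helpers above, assuming cuInit has already succeeded:

    int computeMode = 0;
    // getCudaAttribute wraps cuDeviceGetAttribute and, via checkCudaErrors,
    // aborts with a file/line diagnostic on any CUresult other than CUDA_SUCCESS.
    getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, /*device=*/0);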
// Beginning of GPU Architecture definitions
inline int _ConvertSMVer2CoresDRV(int major, int minor) {
  // Defines for GPU Architecture types (using the SM version to determine the #
  // of cores per SM
  typedef struct {
    int SM;  // 0xMm (hexidecimal notation), M = SM Major version, and m = SM
             // minor version
    int Cores;
  } sSMtoCores;
inline int _ConvertSMVer2CoresDRV(int major, int minor)
{
    // Defines for GPU Architecture types (using the SM version to determine the #
    // of cores per SM
    typedef struct
    {
        int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM
                // minor version
        int Cores;
    } sSMtoCores;

  sSMtoCores nGpuArchCoresPerSM[] = {
      {0x30, 192},
      {0x32, 192},
      {0x35, 192},
      {0x37, 192},
      {0x50, 128},
      {0x52, 128},
      {0x53, 128},
      {0x60,  64},
      {0x61, 128},
      {0x62, 128},
      {0x70,  64},
      {0x72,  64},
      {0x75,  64},
      {0x80,  64},
      {0x86, 128},
      {0x87, 128},
      {0x90, 128},
      {-1, -1}};
    sSMtoCores nGpuArchCoresPerSM[] = {{0x30, 192},
                                       {0x32, 192},
                                       {0x35, 192},
                                       {0x37, 192},
                                       {0x50, 128},
                                       {0x52, 128},
                                       {0x53, 128},
                                       {0x60, 64},
                                       {0x61, 128},
                                       {0x62, 128},
                                       {0x70, 64},
                                       {0x72, 64},
                                       {0x75, 64},
                                       {0x80, 64},
                                       {0x86, 128},
                                       {0x87, 128},
                                       {0x90, 128},
                                       {-1, -1}};

  int index = 0;
    int index = 0;

  while (nGpuArchCoresPerSM[index].SM != -1) {
    if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
      return nGpuArchCoresPerSM[index].Cores;
    while (nGpuArchCoresPerSM[index].SM != -1) {
        if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
            return nGpuArchCoresPerSM[index].Cores;
        }

        index++;
    }

    index++;
  }

  // If we don't find the values, we default use the previous one to run
  // properly
  printf(
      "MapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n",
      major, minor, nGpuArchCoresPerSM[index - 1].Cores);
  return nGpuArchCoresPerSM[index - 1].Cores;
    // If we don't find the values, we default use the previous one to run
    // properly
    printf("MapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n",
           major,
           minor,
           nGpuArchCoresPerSM[index - 1].Cores);
    return nGpuArchCoresPerSM[index - 1].Cores;
}
  // end of GPU Architecture definitions
// end of GPU Architecture definitions

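Editor's note: the lookup key packs the compute capability into one byte, major in the high nibble and minor in the low nibble. For example, SM 8.6 gives (8 << 4) + 6 = 0x86, which the table above maps to 128 cores per SM:

    int cores = _ConvertSMVer2CoresDRV(8, 6); // (8 << 4) + 6 == 0x86 -> 128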
#ifdef __cuda_cuda_h__
// General GPU Device CUDA Initialization
inline int gpuDeviceInitDRV(int ARGC, const char **ARGV) {
  int cuDevice = 0;
  int deviceCount = 0;
  checkCudaErrors(cuInit(0, __CUDA_API_VERSION));
inline int gpuDeviceInitDRV(int ARGC, const char **ARGV)
{
    int cuDevice    = 0;
    int deviceCount = 0;
    checkCudaErrors(cuInit(0, __CUDA_API_VERSION));

  checkCudaErrors(cuDeviceGetCount(&deviceCount));
    checkCudaErrors(cuDeviceGetCount(&deviceCount));

  if (deviceCount == 0) {
    fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n");
    exit(EXIT_FAILURE);
  }
    if (deviceCount == 0) {
        fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n");
        exit(EXIT_FAILURE);
    }

  int dev = 0;
  dev = getCmdLineArgumentInt(ARGC, (const char **)ARGV, "device=");
    int dev = 0;
    dev     = getCmdLineArgumentInt(ARGC, (const char **)ARGV, "device=");

  if (dev < 0) {
    dev = 0;
  }
    if (dev < 0) {
        dev = 0;
    }

  if (dev > deviceCount - 1) {
    fprintf(stderr, "\n");
    fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n",
            deviceCount);
    fprintf(stderr,
            ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n",
            dev);
    fprintf(stderr, "\n");
    return -dev;
  }
    if (dev > deviceCount - 1) {
        fprintf(stderr, "\n");
        fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
        fprintf(stderr, ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
        fprintf(stderr, "\n");
        return -dev;
    }

  checkCudaErrors(cuDeviceGet(&cuDevice, dev));
  char name[100];
  checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
    checkCudaErrors(cuDeviceGet(&cuDevice, dev));
    char name[100];
    checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));

  int computeMode;
  getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
    int computeMode;
    getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);

  if (computeMode == CU_COMPUTEMODE_PROHIBITED) {
    fprintf(stderr,
            "Error: device is running in <CU_COMPUTEMODE_PROHIBITED>, no "
            "threads can use this CUDA Device.\n");
    return -1;
  }
    if (computeMode == CU_COMPUTEMODE_PROHIBITED) {
        fprintf(stderr,
                "Error: device is running in <CU_COMPUTEMODE_PROHIBITED>, no "
                "threads can use this CUDA Device.\n");
        return -1;
    }

  if (checkCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == false) {
    printf("gpuDeviceInitDRV() Using CUDA Device [%d]: %s\n", dev, name);
  }
    if (checkCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == false) {
        printf("gpuDeviceInitDRV() Using CUDA Device [%d]: %s\n", dev, name);
    }

  return dev;
    return dev;
}

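Editor's note: gpuDeviceInitDRV reads the device index from a "device=" command-line argument, so a typical call site looks like this sketch:

    int main(int argc, char **argv)
    {
        // Picks the GPU named on the command line (e.g. "-device=1"),
        // falling back to device 0 when the flag is absent or negative.
        int dev = gpuDeviceInitDRV(argc, (const char **)argv);
        return (dev < 0) ? 1 : 0;
    }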
// This function returns the best GPU based on performance
inline int gpuGetMaxGflopsDeviceIdDRV() {
  CUdevice current_device = 0;
  CUdevice max_perf_device = 0;
  int device_count = 0;
  int sm_per_multiproc = 0;
  unsigned long long max_compute_perf = 0;
  int major = 0;
  int minor = 0;
  int multiProcessorCount;
  int clockRate;
  int devices_prohibited = 0;
inline int gpuGetMaxGflopsDeviceIdDRV()
{
    CUdevice           current_device   = 0;
    CUdevice           max_perf_device  = 0;
    int                device_count     = 0;
    int                sm_per_multiproc = 0;
    unsigned long long max_compute_perf = 0;
    int                major            = 0;
    int                minor            = 0;
    int                multiProcessorCount;
    int                clockRate;
    int                devices_prohibited = 0;

  cuInit(0, __CUDA_API_VERSION);
  checkCudaErrors(cuDeviceGetCount(&device_count));
    cuInit(0, __CUDA_API_VERSION);
    checkCudaErrors(cuDeviceGetCount(&device_count));

  if (device_count == 0) {
    fprintf(stderr,
            "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n");
    exit(EXIT_FAILURE);
  }

  // Find the best CUDA capable GPU device
  current_device = 0;

  while (current_device < device_count) {
    checkCudaErrors(cuDeviceGetAttribute(
        &multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
        current_device));
    checkCudaErrors(cuDeviceGetAttribute(
        &clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device));
    checkCudaErrors(cuDeviceGetAttribute(
        &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
    checkCudaErrors(cuDeviceGetAttribute(
        &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));

    int computeMode;
    getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,
                          current_device);

    if (computeMode != CU_COMPUTEMODE_PROHIBITED) {
      if (major == 9999 && minor == 9999) {
        sm_per_multiproc = 1;
      } else {
        sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
      }

      unsigned long long compute_perf =
          (unsigned long long)(multiProcessorCount * sm_per_multiproc *
                               clockRate);

      if (compute_perf > max_compute_perf) {
          max_compute_perf = compute_perf;
          max_perf_device = current_device;
      }
    } else {
      devices_prohibited++;
    if (device_count == 0) {
        fprintf(stderr, "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n");
        exit(EXIT_FAILURE);
    }

    ++current_device;
  }
    // Find the best CUDA capable GPU device
    current_device = 0;

  if (devices_prohibited == device_count) {
    fprintf(stderr,
            "gpuGetMaxGflopsDeviceIdDRV error: all devices have compute mode "
            "prohibited.\n");
    exit(EXIT_FAILURE);
  }
    while (current_device < device_count) {
        checkCudaErrors(
            cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, current_device));
        checkCudaErrors(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device));
        checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
        checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));

  return max_perf_device;
        int computeMode;
        getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);

        if (computeMode != CU_COMPUTEMODE_PROHIBITED) {
            if (major == 9999 && minor == 9999) {
                sm_per_multiproc = 1;
            }
            else {
                sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
            }

            unsigned long long compute_perf = (unsigned long long)(multiProcessorCount * sm_per_multiproc * clockRate);

            if (compute_perf > max_compute_perf) {
                max_compute_perf = compute_perf;
                max_perf_device  = current_device;
            }
        }
        else {
            devices_prohibited++;
        }

        ++current_device;
    }

    if (devices_prohibited == device_count) {
        fprintf(stderr,
                "gpuGetMaxGflopsDeviceIdDRV error: all devices have compute mode "
                "prohibited.\n");
        exit(EXIT_FAILURE);
    }

    return max_perf_device;
}

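Editor's note: the selection heuristic scores each device as multiProcessorCount * coresPerSM * clockRate (clockRate is reported in kHz). For instance, a device with 28 SMs of 128 cores at 1,500,000 kHz scores 28 * 128 * 1500000 = 5,376,000,000, and the highest score wins. Note, as a caution, that the product is formed in int before the cast to unsigned long long, so scores of this size can overflow on 32-bit int; computing in 64 bits first would avoid that:

    unsigned long long compute_perf =
        (unsigned long long)multiProcessorCount * sm_per_multiproc * clockRate;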
// General initialization call to pick the best CUDA Device
inline CUdevice findCudaDeviceDRV(int argc, const char **argv) {
  CUdevice cuDevice;
  int devID = 0;
inline CUdevice findCudaDeviceDRV(int argc, const char **argv)
{
    CUdevice cuDevice;
    int      devID = 0;

  // If the command-line has a device number specified, use it
  if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
    devID = gpuDeviceInitDRV(argc, argv);
    // If the command-line has a device number specified, use it
    if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
        devID = gpuDeviceInitDRV(argc, argv);

    if (devID < 0) {
      printf("exiting...\n");
      exit(EXIT_SUCCESS);
        if (devID < 0) {
            printf("exiting...\n");
            exit(EXIT_SUCCESS);
        }
    }
    else {
        // Otherwise pick the device with highest Gflops/s
        char name[100];
        devID = gpuGetMaxGflopsDeviceIdDRV();
        checkCudaErrors(cuDeviceGet(&cuDevice, devID));
        cuDeviceGetName(name, 100, cuDevice);
        printf("> Using CUDA Device [%d]: %s\n", devID, name);
    }
  } else {
    // Otherwise pick the device with highest Gflops/s
    char name[100];
    devID = gpuGetMaxGflopsDeviceIdDRV();
    checkCudaErrors(cuDeviceGet(&cuDevice, devID));
    cuDeviceGetName(name, 100, cuDevice);
    printf("> Using CUDA Device [%d]: %s\n", devID, name);
  }

  cuDeviceGet(&cuDevice, devID);
    cuDeviceGet(&cuDevice, devID);

  return cuDevice;
    return cuDevice;
}

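Editor's note: a typical call site for the helper above, sketched under the same assumptions as the SDK samples:

    CUdevice cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
    // cuDevice is now either the user-specified device or the highest-Gflops one.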
inline CUdevice findIntegratedGPUDrv() {
  CUdevice current_device = 0;
  int device_count = 0;
  int devices_prohibited = 0;
  int isIntegrated;
inline CUdevice findIntegratedGPUDrv()
{
    CUdevice current_device     = 0;
    int      device_count       = 0;
    int      devices_prohibited = 0;
    int      isIntegrated;

  cuInit(0, __CUDA_API_VERSION);
  checkCudaErrors(cuDeviceGetCount(&device_count));
    cuInit(0, __CUDA_API_VERSION);
    checkCudaErrors(cuDeviceGetCount(&device_count));

  if (device_count == 0) {
    fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
    exit(EXIT_FAILURE);
  }

  // Find the integrated GPU which is compute capable
  while (current_device < device_count) {
    int computeMode = -1;
    checkCudaErrors(cuDeviceGetAttribute(
        &isIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, current_device));
    checkCudaErrors(cuDeviceGetAttribute(
        &computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device));

    // If GPU is integrated and is not running on Compute Mode prohibited use
    // that
    if (isIntegrated && (computeMode != CU_COMPUTEMODE_PROHIBITED)) {
      int major = 0, minor = 0;
      char deviceName[256];
      checkCudaErrors(cuDeviceGetAttribute(
          &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
          current_device));
      checkCudaErrors(cuDeviceGetAttribute(
          &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
          current_device));
      checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device));
      printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
             current_device, deviceName, major, minor);

      return current_device;
    } else {
      devices_prohibited++;
    if (device_count == 0) {
        fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }

    current_device++;
  }
    // Find the integrated GPU which is compute capable
    while (current_device < device_count) {
        int computeMode = -1;
        checkCudaErrors(cuDeviceGetAttribute(&isIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, current_device));
        checkCudaErrors(cuDeviceGetAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device));

  if (devices_prohibited == device_count) {
    fprintf(stderr, "CUDA error: No Integrated CUDA capable GPU found.\n");
    exit(EXIT_FAILURE);
  }
        // If GPU is integrated and is not running on Compute Mode prohibited use
        // that
        if (isIntegrated && (computeMode != CU_COMPUTEMODE_PROHIBITED)) {
            int  major = 0, minor = 0;
            char deviceName[256];
            checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
            checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
            checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device));
            printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", current_device, deviceName, major, minor);

  return -1;
            return current_device;
        }
        else {
            devices_prohibited++;
        }

        current_device++;
    }

    if (devices_prohibited == device_count) {
        fprintf(stderr, "CUDA error: No Integrated CUDA capable GPU found.\n");
        exit(EXIT_FAILURE);
    }

    return -1;
}

// General check for CUDA GPU SM Capabilities
inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version,
                                     int devID) {
  CUdevice cuDevice;
  char name[256];
  int major = 0, minor = 0;
inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, int devID)
{
    CUdevice cuDevice;
    char     name[256];
    int      major = 0, minor = 0;

  checkCudaErrors(cuDeviceGet(&cuDevice, devID));
  checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
  checkCudaErrors(cuDeviceGetAttribute(
      &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
  checkCudaErrors(cuDeviceGetAttribute(
      &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
    checkCudaErrors(cuDeviceGet(&cuDevice, devID));
    checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
    checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
    checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));

  if ((major > major_version) ||
      (major == major_version && minor >= minor_version)) {
    printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name,
           major, minor);
    return true;
  } else {
    printf(
        "No GPU device was found that can support CUDA compute capability "
        "%d.%d.\n",
        major_version, minor_version);
    return false;
  }
    if ((major > major_version) || (major == major_version && minor >= minor_version)) {
        printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name, major, minor);
        return true;
    }
    else {
        printf("No GPU device was found that can support CUDA compute capability "
               "%d.%d.\n",
               major_version,
               minor_version);
        return false;
    }
}
#endif

  // end of CUDA Helper Functions

#endif  // HELPER_CUDA_DRVAPI_H
// end of CUDA Helper Functions

#endif // HELPER_CUDA_DRVAPI_H

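Editor's note: a usage sketch for the capability gate above, e.g. requiring at least SM 3.0 before proceeding (EXIT_WAIVED is the sample convention defined earlier in this header):

    if (!checkCudaCapabilitiesDRV(3, 0, devID)) {
        exit(EXIT_WAIVED); // waive the test rather than fail outright
    }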
@@ -34,8 +34,8 @@
#define WA (4 * block_size) // Matrix A width
#define HA (6 * block_size) // Matrix A height
#define WB (4 * block_size) // Matrix B width
#define HB WA  // Matrix B height
#define WC WB  // Matrix C width
#define HC HA  // Matrix C height
#define HB WA               // Matrix B height
#define WC WB               // Matrix C width
#define HC HA               // Matrix C height

#endif // _MATRIXMUL_H_

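Editor's note: with block_size = 32 (the value initCUDA selects further down), these macros work out to WA = WB = WC = 4 * 32 = 128 and HA = HC = 6 * 32 = 192, with HB = WA = 128; so A is 128 wide by 192 high, B is 128 by 128, and C is 128 by 192, which satisfies the usual matrix-multiply constraint HB = WA.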
@@ -43,10 +43,10 @@
 */

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes, CUDA
#include "cuda_drvapi_dynlink.h"
@@ -60,7 +60,7 @@
extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);

#if defined _MSC_VER
#pragma warning (disable : 4312)
#pragma warning(disable : 4312)
#endif


@@ -68,7 +68,7 @@ extern "C" void computeGold(float *, const float *, const float *, unsigned int,
// Globals
////////////////////////////////////////////////////////////////////////////////
CUcontext g_cuContext;
bool noprompt = false;
bool      noprompt = false;

static const char *sSDKsample = "matrixMulDynlinkJIT (CUDA dynamic linking)";

@@ -78,8 +78,7 @@ static const char *sSDKsample = "matrixMulDynlinkJIT (CUDA dynamic linking)";
////////////////////////////////////////////////////////////////////////////////
void randomInit(float *data, size_t size)
{
    for (size_t i = 0; i < size; ++i)
    {
    for (size_t i = 0; i < size; ++i) {
        data[i] = rand() / (float)RAND_MAX;
    }
}
@ -89,33 +88,29 @@ void randomInit(float *data, size_t size)
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size_out)
 | 
			
		||||
{
 | 
			
		||||
    CUresult status;
 | 
			
		||||
    CUdevice cuDevice;
 | 
			
		||||
    CUmodule cuModule;
 | 
			
		||||
    CUresult   status;
 | 
			
		||||
    CUdevice   cuDevice;
 | 
			
		||||
    CUmodule   cuModule;
 | 
			
		||||
    CUfunction cuFunction;
 | 
			
		||||
    int major, minor, block_size, devID = 0;
 | 
			
		||||
    char deviceName[256];
 | 
			
		||||
    int        major, minor, block_size, devID = 0;
 | 
			
		||||
    char       deviceName[256];
 | 
			
		||||
 | 
			
		||||
    // link to cuda driver dynamically
 | 
			
		||||
    checkCudaErrors(cuInit(0, __CUDA_API_VERSION));
 | 
			
		||||
 | 
			
		||||
    // This assumes that the user is attempting to specify a explicit device -device=n
 | 
			
		||||
    if (argc > 1)
 | 
			
		||||
    {
 | 
			
		||||
    if (argc > 1) {
 | 
			
		||||
        bool bFound = false;
 | 
			
		||||
 | 
			
		||||
        for (int param=0; param < argc; param++)
 | 
			
		||||
        {
 | 
			
		||||
            if (!strncmp(argv[param], "-device", 7))
 | 
			
		||||
            {
 | 
			
		||||
                int i=(int)strlen(argv[1]);
 | 
			
		||||
        for (int param = 0; param < argc; param++) {
 | 
			
		||||
            if (!strncmp(argv[param], "-device", 7)) {
 | 
			
		||||
                int i = (int)strlen(argv[1]);
 | 
			
		||||
 | 
			
		||||
                while (argv[1][i] != '=')
 | 
			
		||||
                {
 | 
			
		||||
                while (argv[1][i] != '=') {
 | 
			
		||||
                    i--;
 | 
			
		||||
                }
 | 
			
		||||
 | 
			
		||||
                devID = atoi(&argv[1][++i]);
 | 
			
		||||
                devID  = atoi(&argv[1][++i]);
 | 
			
		||||
                bFound = true;
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
@ -128,16 +123,15 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
 | 
			
		||||
    int deviceCount = 0;
 | 
			
		||||
    checkCudaErrors(cuDeviceGetCount(&deviceCount));
 | 
			
		||||
 | 
			
		||||
    if (deviceCount == 0)
 | 
			
		||||
    {
 | 
			
		||||
    if (deviceCount == 0) {
 | 
			
		||||
        fprintf(stderr, "No devices supporting CUDA detected, exiting...\n");
 | 
			
		||||
        exit(EXIT_SUCCESS);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (devID < 0) devID = 0;
 | 
			
		||||
    if (devID < 0)
 | 
			
		||||
        devID = 0;
 | 
			
		||||
 | 
			
		||||
    if (devID > deviceCount -1)
 | 
			
		||||
    {
 | 
			
		||||
    if (devID > deviceCount - 1) {
 | 
			
		||||
        fprintf(stderr, "initCUDA (Device=%d) invalid GPU device.  %d GPU device(s) detected.\n\n", devID, deviceCount);
 | 
			
		||||
        status = CUDA_ERROR_NOT_FOUND;
 | 
			
		||||
 | 
			
		||||
@@ -153,14 +147,13 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
     checkCudaErrors(cuDeviceGetName(deviceName, 256, cuDevice));
     printf("> Device %d: \"%s\" with Compute %d.%d capability\n", cuDevice, deviceName, major, minor);

-    block_size = 32;
+    block_size      = 32;
     *block_size_out = block_size;

     // create context for picked device
     status = cuCtxCreate(&g_cuContext, 0, cuDevice);

-    if (CUDA_SUCCESS != status)
-    {
+    if (CUDA_SUCCESS != status) {
         cuCtxDestroy(g_cuContext);
         exit(EXIT_SUCCESS);
     }
@@ -169,53 +162,53 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
     {
         // in this branch we use compilation with parameters
         const unsigned int jitNumOptions = 3;
-        CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
-        void **jitOptVals = new void *[jitNumOptions];
+        CUjit_option      *jitOptions    = new CUjit_option[jitNumOptions];
+        void             **jitOptVals    = new void *[jitNumOptions];

         // set up size of compilation log buffer
-        jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
+        jitOptions[0]        = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
         int jitLogBufferSize = 1024;
-        jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
+        jitOptVals[0]        = (void *)(size_t)jitLogBufferSize;

         // set up pointer to the compilation log buffer
-        jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
+        jitOptions[1]      = CU_JIT_INFO_LOG_BUFFER;
         char *jitLogBuffer = new char[jitLogBufferSize];
-        jitOptVals[1] = jitLogBuffer;
+        jitOptVals[1]      = jitLogBuffer;

         // set up pointer to set the Maximum # of registers for a particular kernel
-        jitOptions[2] = CU_JIT_MAX_REGISTERS;
+        jitOptions[2]   = CU_JIT_MAX_REGISTERS;
         int jitRegCount = 32;
-        jitOptVals[2] = (void *)(size_t)jitRegCount;
+        jitOptVals[2]   = (void *)(size_t)jitRegCount;

         // compile with set parameters
         printf("> Compiling CUDA module\n");

 #if defined(_WIN64) || defined(__LP64__)
-        status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
+        status =
+            cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
 #else
-        status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
+        status =
+            cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
 #endif

         printf("> PTX JIT log:\n%s\n", jitLogBuffer);

-        delete [] jitOptions;
-        delete [] jitOptVals;
-        delete [] jitLogBuffer;
+        delete[] jitOptions;
+        delete[] jitOptVals;
+        delete[] jitLogBuffer;
     }

-    if (CUDA_SUCCESS != status)
-    {
+    if (CUDA_SUCCESS != status) {
         printf("Error while compiling PTX\n");
         cuCtxDestroy(g_cuContext);
         exit(EXIT_FAILURE);
     }

     // retrieve CUDA function from the compiled module
-    status = cuModuleGetFunction(&cuFunction, cuModule,
-                                 (block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit");
+    status = cuModuleGetFunction(
+        &cuFunction, cuModule, (block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit");

-    if (CUDA_SUCCESS != status)
-    {
+    if (CUDA_SUCCESS != status) {
         cuCtxDestroy(g_cuContext);
         exit(EXIT_FAILURE);
     }
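Note on the API reflowed in this hunk: cuModuleLoadDataEx passes JIT options as two parallel arrays, one of CUjit_option keys and one of void * values, with scalar values cast through the pointer slot rather than pointed to. A minimal sketch of the same idiom, assuming ptxImage points at a NUL-terminated PTX string (ptxImage is a stand-in name, not from the sample):

    // Sketch: JIT an in-memory PTX image and capture the info log.
    CUjit_option opts[2]    = {CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, CU_JIT_INFO_LOG_BUFFER};
    char         logBuf[1024];
    void        *optVals[2] = {(void *)(size_t)sizeof(logBuf), (void *)logBuf};
    CUmodule     mod;
    CUresult     rc = cuModuleLoadDataEx(&mod, ptxImage, 2, opts, optVals);
    if (rc != CUDA_SUCCESS)
        fprintf(stderr, "PTX JIT failed:\n%s\n", logBuf);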
@@ -233,21 +226,21 @@ int main(int argc, char **argv)
     printf("[ %s ]\n", sSDKsample);

     // initialize CUDA
-    CUfunction matrixMul = NULL;
-    int block_size = 0;
+    CUfunction matrixMul  = NULL;
+    int        block_size = 0;
     checkCudaErrors(initCUDA(argc, argv, &matrixMul, &block_size));

     // set seed for rand()
     srand(2006);

     // allocate host memory for matrices A and B
-    size_t       size_A = WA * HA;
-    size_t       mem_size_A = sizeof(float) * size_A;
-    size_t       size_B = WB * HB;
-    size_t       mem_size_B = sizeof(float) * size_B;
+    size_t size_A     = WA * HA;
+    size_t mem_size_A = sizeof(float) * size_A;
+    size_t size_B     = WB * HB;
+    size_t mem_size_B = sizeof(float) * size_B;

-    float *h_A = (float *) malloc(mem_size_A);
-    float *h_B = (float *) malloc(mem_size_B);
+    float *h_A = (float *)malloc(mem_size_A);
+    float *h_B = (float *)malloc(mem_size_B);

     // initialize host memory
     randomInit(h_A, size_A);
@@ -264,26 +257,24 @@ int main(int argc, char **argv)
     checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));

     // allocate device memory for result
-    size_t       size_C = WC * HC;
-    size_t       mem_size_C = sizeof(float) * size_C;
+    size_t size_C     = WC * HC;
+    size_t mem_size_C = sizeof(float) * size_C;

     CUdeviceptr d_C;
     checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));

     // allocate mem for the result on host side
-    float *h_C = (float *) malloc(mem_size_C);
+    float *h_C = (float *)malloc(mem_size_C);

 #if __CUDA_API_VERSION >= 4000
     {
         // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method)
-        int Matrix_Width_A = WA;
-        int Matrix_Width_B = WB;
-        void *args[5] = { &d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B };
+        int   Matrix_Width_A = WA;
+        int   Matrix_Width_B = WB;
+        void *args[5]        = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};

-        checkCudaErrors(cuLaunchKernel(matrixMul, (WC/block_size), (HC/block_size), 1,
-                                       block_size     , block_size     , 1,
-                                       0,
-                                       NULL, args, NULL));
+        checkCudaErrors(cuLaunchKernel(
+            matrixMul, (WC / block_size), (HC / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
     }
 #else // __CUDA_API_VERSION <= 3020
     {
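On the launch call reflowed above: cuLaunchKernel takes the grid dimensions, the block dimensions, the dynamic shared-memory size, a stream handle, and a kernelParams array in which each entry is the address of the corresponding kernel argument, so scalars must stay alive until the launch is issued. A minimal sketch with a hypothetical function handle fn taking (float *out, int n); fn and out are assumptions, not names from the sample:

    CUdeviceptr out;          // assumed allocated earlier with cuMemAlloc
    int         n = 1 << 20;  // scalar argument: its *address* goes into the array
    void *params[] = {&out, &n};
    // grid (4096,1,1), block (256,1,1), no dynamic shared memory, default stream
    checkCudaErrors(cuLaunchKernel(fn, 4096, 1, 1, 256, 1, 1, 0, NULL, params, NULL));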
@@ -312,7 +303,7 @@ int main(int argc, char **argv)

         checkCudaErrors(cuParamSetSize(matrixMul, offset));
         checkCudaErrors(cuFuncSetBlockShape(matrixMul, block_size, block_size, 1));
-        checkCudaErrors(cuFuncSetSharedSize(matrixMul, 2*block_size*block_size*sizeof(float)));
+        checkCudaErrors(cuFuncSetSharedSize(matrixMul, 2 * block_size * block_size * sizeof(float)));

         // set execution configuration for the CUDA kernel
         checkCudaErrors(cuLaunchGrid(matrixMul, WC / block_size, HC / block_size));
@@ -322,19 +313,18 @@ int main(int argc, char **argv)
     checkCudaErrors(cuCtxSynchronize());

     // copy result from device to host
-    checkCudaErrors(cuMemcpyDtoH((void *) h_C, d_C, mem_size_C));
+    checkCudaErrors(cuMemcpyDtoH((void *)h_C, d_C, mem_size_C));

     // compute reference solution
-    float *reference = (float *) malloc(mem_size_C);
+    float *reference = (float *)malloc(mem_size_C);
     computeGold(reference, h_A, h_B, HA, WA, WB);

     // check result
-    float diff=0.0f;
+    float diff = 0.0f;

-    for (unsigned int i=0; i<size_C; i++)
-    {
+    for (unsigned int i = 0; i < size_C; i++) {
         float tmp = reference[i] - h_C[i];
-        diff += tmp*tmp;
+        diff += tmp * tmp;
     }

     int res = (diff / (float)size_C < 1e-6f);
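The acceptance test above divides the accumulated squared error by the element count, i.e. it requires the mean squared error to stay below 1e-6, which corresponds to an RMS deviation of about 1e-3 per element. The same check factored into a standalone helper, for reference (the helper name is illustrative, not from the sample):

    // Mean-squared-error acceptance test, equivalent to the loop above.
    static int resultsMatch(const float *ref, const float *got, size_t n)
    {
        float sumSq = 0.0f;
        for (size_t i = 0; i < n; i++) {
            float d = ref[i] - got[i];
            sumSq += d * d;
        }
        return (sumSq / (float)n) < 1e-6f;
    }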
@@ -349,7 +339,7 @@ int main(int argc, char **argv)
     checkCudaErrors(cuMemFree(d_C));
     checkCudaErrors(cuCtxDestroy(g_cuContext));

-    printf("Test run %s\n", (1==res) ? "success!" : "failed!");
+    printf("Test run %s\n", (1 == res) ? "success!" : "failed!");

     exit((1 == res) ? EXIT_SUCCESS : EXIT_FAILURE);
 }
@@ -28,8 +28,7 @@

 ////////////////////////////////////////////////////////////////////////////////
 // export C interface
-extern "C"
-void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
+extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);

 ////////////////////////////////////////////////////////////////////////////////
 //! Compute reference data set
@@ -40,16 +39,13 @@ void computeGold(float *, const float *, const float *, unsigned int, unsigned i
 //! @param hA         height of matrix A
 //! @param wB         width of matrix B
 ////////////////////////////////////////////////////////////////////////////////
-void
-computeGold(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
+void computeGold(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
 {
     for (unsigned int i = 0; i < hA; ++i)
-        for (unsigned int j = 0; j < wB; ++j)
-        {
+        for (unsigned int j = 0; j < wB; ++j) {
             double sum = 0;

-            for (unsigned int k = 0; k < wA; ++k)
-            {
+            for (unsigned int k = 0; k < wA; ++k) {
                 double a = A[i * wA + k];
                 double b = B[k * wB + j];
                 sum += a * b;
(File diff suppressed because it is too large)
@@ -32,7 +32,8 @@
 #define __matrixMul_kernel_32_ptxdump_h__

 #if defined __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

     extern unsigned char matrixMul_kernel_32_ptxdump[25784];
(File diff suppressed because it is too large)
@@ -32,7 +32,8 @@
 #define __matrixMul_kernel_64_ptxdump_h__

 #if defined __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

     extern unsigned char matrixMul_kernel_64_ptxdump[26489];
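The #if defined __cplusplus guard around these ptxdump declarations gives the symbols C linkage, so the generated arrays can be referenced from both C and C++ translation units; only the brace placement changes in this commit. The pattern in isolation, with a hypothetical symbol name and size:

    /* Hypothetical header sketch of the guard pattern used above. */
    #if defined __cplusplus
    extern "C"
    {
    #endif

        extern unsigned char example_kernel_ptxdump[1024]; /* placeholder name and size */

    #if defined __cplusplus
    }
    #endif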
@@ -42,207 +42,208 @@
  */

 // System includes
-#include <stdio.h>
 #include <assert.h>
+#include <stdio.h>

 // CUDA runtime
 #include <cuda_runtime.h>

 #include "nvrtc_helper.h"

 // Helper functions and utilities to work with CUDA
 #include <helper_functions.h>

-void constantInit(float *data, int size, float val) {
-  for (int i = 0; i < size; ++i) {
-    data[i] = val;
-  }
+void constantInit(float *data, int size, float val)
+{
+    for (int i = 0; i < size; ++i) {
+        data[i] = val;
+    }
 }
 /**
  * Run a simple test of matrix multiplication using CUDA
  */
-int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA,
-                   dim3 &dimsB) {
-  // Allocate host memory for matrices A and B
-  unsigned int size_A = dimsA.x * dimsA.y;
-  unsigned int mem_size_A = sizeof(float) * size_A;
-  float *h_A = (float *)malloc(mem_size_A);
-  unsigned int size_B = dimsB.x * dimsB.y;
-  unsigned int mem_size_B = sizeof(float) * size_B;
-  float *h_B = (float *)malloc(mem_size_B);
+int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
+{
+    // Allocate host memory for matrices A and B
+    unsigned int size_A     = dimsA.x * dimsA.y;
+    unsigned int mem_size_A = sizeof(float) * size_A;
+    float       *h_A        = (float *)malloc(mem_size_A);
+    unsigned int size_B     = dimsB.x * dimsB.y;
+    unsigned int mem_size_B = sizeof(float) * size_B;
+    float       *h_B        = (float *)malloc(mem_size_B);

-  // Initialize host memory
-  const float valB = 0.01f;
-  constantInit(h_A, size_A, 1.0f);
-  constantInit(h_B, size_B, valB);
+    // Initialize host memory
+    const float valB = 0.01f;
+    constantInit(h_A, size_A, 1.0f);
+    constantInit(h_B, size_B, valB);

-  // Allocate device memory
-  CUdeviceptr d_A, d_B, d_C;
+    // Allocate device memory
+    CUdeviceptr d_A, d_B, d_C;

-  char *cubin, *kernel_file;
-  size_t cubinSize;
+    char  *cubin, *kernel_file;
+    size_t cubinSize;

-  kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[0]);
-  compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 1);
+    kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[0]);
+    compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 1);

-  CUmodule module = loadCUBIN(cubin, argc, argv);
+    CUmodule module = loadCUBIN(cubin, argc, argv);

-  // Allocate host matrix C
-  dim3 dimsC(dimsB.x, dimsA.y, 1);
-  unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
-  float *h_C = (float *)malloc(mem_size_C);
+    // Allocate host matrix C
+    dim3         dimsC(dimsB.x, dimsA.y, 1);
+    unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
+    float       *h_C        = (float *)malloc(mem_size_C);
-  if (h_C == NULL) {
-    fprintf(stderr, "Failed to allocate host matrix C!\n");
-    exit(EXIT_FAILURE);
-  }

-  checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
-  checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
-  checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));

-  // copy host memory to device
-  checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
-  checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));

-  // Setup execution parameters
-  dim3 threads(block_size, block_size);
-  dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);

-  // Create and start timer
-  printf("Computing result using CUDA Kernel...\n");

-  CUfunction kernel_addr;
-  if (block_size == 16) {
-    checkCudaErrors(
-        cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16"));
-  } else {
-    checkCudaErrors(
-        cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"));
-  }

-  void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x,
-                 (void *)&dimsB.x};

-  // Execute the kernel
-  int nIter = 300;

-  for (int j = 0; j < nIter; j++) {
-    checkCudaErrors(
-        cuLaunchKernel(kernel_addr, grid.x, grid.y, grid.z, /* grid dim */
-                       threads.x, threads.y, threads.z,     /* block dim */
-                       0, 0,    /* shared mem, stream */
-                       &arr[0], /* arguments */
-                       0));

-    checkCudaErrors(cuCtxSynchronize());
-  }

-  // Copy result from device to host
-  checkCudaErrors(cuMemcpyDtoH(h_C, d_C, mem_size_C));

-  printf("Checking computed result for correctness: ");

-  bool correct = true;

-  // test relative error by the formula
-  //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps

-  double eps = 1.e-6;  // machine zero

-  for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++) {
-    double abs_err = fabs(h_C[i] - (dimsA.x * valB));
-    double dot_length = dimsA.x;
-    double abs_val = fabs(h_C[i]);
-    double rel_err = abs_err / abs_val / dot_length;

-    if (rel_err > eps) {
-      printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i,
-             h_C[i], dimsA.x * valB, eps);
-      correct = false;
+    if (h_C == NULL) {
+        fprintf(stderr, "Failed to allocate host matrix C!\n");
+        exit(EXIT_FAILURE);
+    }
-  }

-  printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
+    checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
+    checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
+    checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));

-  printf(
-      "\nNOTE: The CUDA Samples are not meant for performance measurements. "
-      "Results may vary when GPU Boost is enabled.\n");
+    // copy host memory to device
+    checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
+    checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));

-  // Clean up memory
-  free(h_A);
-  free(h_B);
-  free(h_C);
+    // Setup execution parameters
+    dim3 threads(block_size, block_size);
+    dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);

-  checkCudaErrors(cuMemFree(d_A));
-  checkCudaErrors(cuMemFree(d_B));
-  checkCudaErrors(cuMemFree(d_C));
+    // Create and start timer
+    printf("Computing result using CUDA Kernel...\n");

-  if (correct) {
-    return EXIT_SUCCESS;
-  } else {
-    return EXIT_FAILURE;
-  }
+    CUfunction kernel_addr;
+    if (block_size == 16) {
+        checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16"));
+    }
+    else {
+        checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"));
+    }

+    void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x, (void *)&dimsB.x};

+    // Execute the kernel
+    int nIter = 300;

+    for (int j = 0; j < nIter; j++) {
+        checkCudaErrors(cuLaunchKernel(kernel_addr,
+                                       grid.x,
+                                       grid.y,
+                                       grid.z, /* grid dim */
+                                       threads.x,
+                                       threads.y,
+                                       threads.z, /* block dim */
+                                       0,
+                                       0,       /* shared mem, stream */
+                                       &arr[0], /* arguments */
+                                       0));

+        checkCudaErrors(cuCtxSynchronize());
+    }

+    // Copy result from device to host
+    checkCudaErrors(cuMemcpyDtoH(h_C, d_C, mem_size_C));

+    printf("Checking computed result for correctness: ");

+    bool correct = true;

+    // test relative error by the formula
+    //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps

+    double eps = 1.e-6; // machine zero

+    for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++) {
+        double abs_err    = fabs(h_C[i] - (dimsA.x * valB));
+        double dot_length = dimsA.x;
+        double abs_val    = fabs(h_C[i]);
+        double rel_err    = abs_err / abs_val / dot_length;

+        if (rel_err > eps) {
+            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
+            correct = false;
+        }
+    }

+    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

+    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
+           "Results may vary when GPU Boost is enabled.\n");

+    // Clean up memory
+    free(h_A);
+    free(h_B);
+    free(h_C);

+    checkCudaErrors(cuMemFree(d_A));
+    checkCudaErrors(cuMemFree(d_B));
+    checkCudaErrors(cuMemFree(d_C));

+    if (correct) {
+        return EXIT_SUCCESS;
+    }
+    else {
+        return EXIT_FAILURE;
+    }
 }
 /**
  * Program main
  */

-int main(int argc, char **argv) {
-  printf("[Matrix Multiply Using CUDA] - Starting...\n");
+int main(int argc, char **argv)
+{
+    printf("[Matrix Multiply Using CUDA] - Starting...\n");

-  if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
-      checkCmdLineFlag(argc, (const char **)argv, "?")) {
-    printf("Usage -device=n (n >= 0 for deviceID)\n");
-    printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
-    printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
-    printf(
-        "  Note: Outer matrix dimensions of A & B matrices must be equal.\n");
+    if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) {
+        printf("Usage -device=n (n >= 0 for deviceID)\n");
+        printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
+        printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
+        printf("  Note: Outer matrix dimensions of A & B matrices must be equal.\n");

-    exit(EXIT_SUCCESS);
-  }
+        exit(EXIT_SUCCESS);
+    }

-  int block_size = 32;
+    int block_size = 32;

-  // original:
-  dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
-  dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);
+    // original:
+    dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
+    dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);

-  // reduce sizes to avoid running out of memory
-  // dim3 dimsA(32,32, 1);
-  // dim3 dimsB(32,32,1);
+    // reduce sizes to avoid running out of memory
+    // dim3 dimsA(32,32, 1);
+    // dim3 dimsB(32,32,1);

-  // width of Matrix A
-  if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
-    dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
-  }
+    // width of Matrix A
+    if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
+        dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
+    }

-  // height of Matrix A
-  if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
-    dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
-  }
+    // height of Matrix A
+    if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
+        dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
+    }

-  // width of Matrix B
-  if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
-    dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
-  }
+    // width of Matrix B
+    if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
+        dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
+    }

-  // height of Matrix B
-  if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
-    dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
-  }
+    // height of Matrix B
+    if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
+        dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
+    }

-  if (dimsA.x != dimsB.y) {
-    printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
-           dimsA.x, dimsB.y);
-    exit(EXIT_FAILURE);
-  }
+    if (dimsA.x != dimsB.y) {
+        printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
+        exit(EXIT_FAILURE);
+    }

-  printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x,
-         dimsB.y);
+    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);

-  int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);
+    int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);

-  exit(matrix_result);
+    exit(matrix_result);
 }
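compileFileToCUBIN and loadCUBIN used in this sample come from its nvrtc_helper utilities; underneath, they follow the standard NVRTC sequence of creating a program from source, compiling it, extracting the generated image, and handing it to the driver API. A compressed sketch of that sequence, with error handling elided; the --gpu-architecture value is an assumption, and the helper actually extracts a CUBIN rather than PTX on recent toolkits:

    nvrtcProgram prog;
    nvrtcCreateProgram(&prog, src, "matrixMul_kernel.cu", 0, NULL, NULL); // src: kernel source text
    const char *opts[] = {"--gpu-architecture=compute_70"};               // assumed SM 7.0 target
    nvrtcCompileProgram(prog, 1, opts);
    size_t ptxSize;
    nvrtcGetPTXSize(prog, &ptxSize);
    char *ptx = (char *)malloc(ptxSize);
    nvrtcGetPTX(prog, ptx);
    CUmodule module;
    cuModuleLoadData(&module, ptx); // the driver JIT-compiles the PTX for the current device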
@@ -48,84 +48,83 @@

 #include <cooperative_groups.h>

-template <int BLOCK_SIZE>
-__device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB) {
-  // Handle to thread block group
-  cooperative_groups::thread_block cta =
-      cooperative_groups::this_thread_block();
-  // Block index
-  int bx = blockIdx.x;
-  int by = blockIdx.y;
+template <int BLOCK_SIZE> __device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
+{
+    // Handle to thread block group
+    cooperative_groups::thread_block cta = cooperative_groups::this_thread_block();
+    // Block index
+    int bx = blockIdx.x;
+    int by = blockIdx.y;

-  // Thread index
-  int tx = threadIdx.x;
-  int ty = threadIdx.y;
+    // Thread index
+    int tx = threadIdx.x;
+    int ty = threadIdx.y;

-  // Index of the first sub-matrix of A processed by the block
-  int aBegin = wA * BLOCK_SIZE * by;
+    // Index of the first sub-matrix of A processed by the block
+    int aBegin = wA * BLOCK_SIZE * by;

-  // Index of the last sub-matrix of A processed by the block
-  int aEnd = aBegin + wA - 1;
+    // Index of the last sub-matrix of A processed by the block
+    int aEnd = aBegin + wA - 1;

-  // Step size used to iterate through the sub-matrices of A
-  int aStep = BLOCK_SIZE;
+    // Step size used to iterate through the sub-matrices of A
+    int aStep = BLOCK_SIZE;

-  // Index of the first sub-matrix of B processed by the block
-  int bBegin = BLOCK_SIZE * bx;
+    // Index of the first sub-matrix of B processed by the block
+    int bBegin = BLOCK_SIZE * bx;

-  // Step size used to iterate through the sub-matrices of B
-  int bStep = BLOCK_SIZE * wB;
+    // Step size used to iterate through the sub-matrices of B
+    int bStep = BLOCK_SIZE * wB;

-  // Csub is used to store the element of the block sub-matrix
-  // that is computed by the thread
-  float Csub = 0;
+    // Csub is used to store the element of the block sub-matrix
+    // that is computed by the thread
+    float Csub = 0;

-  // Loop over all the sub-matrices of A and B
-  // required to compute the block sub-matrix
-  for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
-    // Declaration of the shared memory array As used to
-    // store the sub-matrix of A
-    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
+    // Loop over all the sub-matrices of A and B
+    // required to compute the block sub-matrix
+    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
+        // Declaration of the shared memory array As used to
+        // store the sub-matrix of A
+        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

-    // Declaration of the shared memory array Bs used to
-    // store the sub-matrix of B
-    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
+        // Declaration of the shared memory array Bs used to
+        // store the sub-matrix of B
+        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

-    // Load the matrices from device memory
-    // to shared memory; each thread loads
-    // one element of each matrix
-    As[ty][tx] = A[a + wA * ty + tx];
-    Bs[ty][tx] = B[b + wB * ty + tx];
+        // Load the matrices from device memory
+        // to shared memory; each thread loads
+        // one element of each matrix
+        As[ty][tx] = A[a + wA * ty + tx];
+        Bs[ty][tx] = B[b + wB * ty + tx];

-    // Synchronize to make sure the matrices are loaded
-    cooperative_groups::sync(cta);
+        // Synchronize to make sure the matrices are loaded
+        cooperative_groups::sync(cta);

 // Multiply the two matrices together;
 // each thread computes one element
 // of the block sub-matrix
 #pragma unroll
-    for (int k = 0; k < BLOCK_SIZE; ++k) {
-      Csub += As[ty][k] * Bs[k][tx];
+        for (int k = 0; k < BLOCK_SIZE; ++k) {
+            Csub += As[ty][k] * Bs[k][tx];
+        }

+        // Synchronize to make sure that the preceding
+        // computation is done before loading two new
+        // sub-matrices of A and B in the next iteration
+        cooperative_groups::sync(cta);
+    }

-    // Synchronize to make sure that the preceding
-    // computation is done before loading two new
-    // sub-matrices of A and B in the next iteration
-    cooperative_groups::sync(cta);
-  }

-  // Write the block sub-matrix to device memory;
-  // each thread writes one element
-  int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
-  C[c + wB * ty + tx] = Csub;
+    // Write the block sub-matrix to device memory;
+    // each thread writes one element
+    int c               = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
+    C[c + wB * ty + tx] = Csub;
 }

-extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B,
-                                                 int wA, int wB) {
-  matrixMulCUDA<16>(C, A, B, wA, wB);
+extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B, int wA, int wB)
+{
+    matrixMulCUDA<16>(C, A, B, wA, wB);
 }

-extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B,
-                                                 int wA, int wB) {
-  matrixMulCUDA<32>(C, A, B, wA, wB);
+extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B, int wA, int wB)
+{
+    matrixMulCUDA<32>(C, A, B, wA, wB);
 }
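The two extern "C" __global__ wrappers exist because the driver API resolves kernels by their linker-visible names, and mangled C++ template instantiations are awkward to pass to cuModuleGetFunction; the wrappers pin an unmangled name to each block size. Each instantiation also statically reserves two BLOCK_SIZE x BLOCK_SIZE float tiles of shared memory per block; a quick check of that budget (helper name is illustrative):

    // Static shared memory per block for matrixMulCUDA<BLOCK_SIZE>:
    //   2 * BLOCK_SIZE * BLOCK_SIZE * sizeof(float)
    //   BLOCK_SIZE = 16 -> 2 * 16 * 16 * 4 = 2048 bytes
    //   BLOCK_SIZE = 32 -> 2 * 32 * 32 * 4 = 8192 bytes
    static size_t tileSharedBytes(unsigned blockSize) { return 2u * blockSize * blockSize * sizeof(float); }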
@@ -28,252 +28,254 @@
 #include <cooperative_groups.h>

 namespace cg = cooperative_groups;
-#include <helper_cuda.h>
 #include <assert.h>
+#include <helper_cuda.h>

 #include "mergeSort_common.h"

-inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB,
-                                  uint &valB, uint arrowDir) {
-  uint t;
+inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB, uint &valB, uint arrowDir)
+{
+    uint t;

-  if ((keyA > keyB) == arrowDir) {
-    t = keyA;
-    keyA = keyB;
-    keyB = t;
-    t = valA;
-    valA = valB;
-    valB = t;
-  }
+    if ((keyA > keyB) == arrowDir) {
+        t    = keyA;
+        keyA = keyB;
+        keyB = t;
+        t    = valA;
+        valA = valB;
+        valB = t;
+    }
 }
-__global__ void bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
-                                        uint *d_SrcKey, uint *d_SrcVal,
-                                        uint arrayLength, uint sortDir) {
-  // Handle to thread block group
-  cg::thread_block cta = cg::this_thread_block();
-  // Shared memory storage for one or more short vectors
-  __shared__ uint s_key[SHARED_SIZE_LIMIT];
-  __shared__ uint s_val[SHARED_SIZE_LIMIT];
+__global__ void
+bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength, uint sortDir)
+{
+    // Handle to thread block group
+    cg::thread_block cta = cg::this_thread_block();
+    // Shared memory storage for one or more short vectors
+    __shared__ uint s_key[SHARED_SIZE_LIMIT];
+    __shared__ uint s_val[SHARED_SIZE_LIMIT];

-  // Offset to the beginning of subbatch and load data
-  d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
-  d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
-  d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
-  d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
-  s_key[threadIdx.x + 0] = d_SrcKey[0];
-  s_val[threadIdx.x + 0] = d_SrcVal[0];
-  s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
-      d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
-  s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
-      d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
+    // Offset to the beginning of subbatch and load data
+    d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
+    d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
+    d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
+    d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
+    s_key[threadIdx.x + 0]                       = d_SrcKey[0];
+    s_val[threadIdx.x + 0]                       = d_SrcVal[0];
+    s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
+    s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];

-  for (uint size = 2; size < arrayLength; size <<= 1) {
-    // Bitonic merge
-    uint dir = (threadIdx.x & (size / 2)) != 0;
+    for (uint size = 2; size < arrayLength; size <<= 1) {
+        // Bitonic merge
+        uint dir = (threadIdx.x & (size / 2)) != 0;

-    for (uint stride = size / 2; stride > 0; stride >>= 1) {
-      cg::sync(cta);
-      uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
-      Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride],
-                 s_val[pos + stride], dir);
+        for (uint stride = size / 2; stride > 0; stride >>= 1) {
+            cg::sync(cta);
+            uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
+            Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], dir);
+        }
+    }
-  }

-  // ddd == sortDir for the last bitonic merge step
-  {
-    for (uint stride = arrayLength / 2; stride > 0; stride >>= 1) {
-      cg::sync(cta);
-      uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
-      Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride],
-                 s_val[pos + stride], sortDir);
+    // ddd == sortDir for the last bitonic merge step
+    {
+        for (uint stride = arrayLength / 2; stride > 0; stride >>= 1) {
+            cg::sync(cta);
+            uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
+            Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], sortDir);
+        }
+    }
-  }

-  cg::sync(cta);
-  d_DstKey[0] = s_key[threadIdx.x + 0];
-  d_DstVal[0] = s_val[threadIdx.x + 0];
-  d_DstKey[(SHARED_SIZE_LIMIT / 2)] =
-      s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
-  d_DstVal[(SHARED_SIZE_LIMIT / 2)] =
-      s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
+    cg::sync(cta);
+    d_DstKey[0]                       = s_key[threadIdx.x + 0];
+    d_DstVal[0]                       = s_val[threadIdx.x + 0];
+    d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
+    d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
 }
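The index expression used in both merge loops, pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)), assigns each of the SHARED_SIZE_LIMIT / 2 threads the lower element of a disjoint compare-exchange pair (pos, pos + stride). A small host-side sanity check of the mapping, not from the sample:

    // For stride = 4 and tid = 0..7, pos yields 0,1,2,3,8,9,10,11:
    // every thread owns the pair (pos, pos + 4) and no pairs collide.
    for (unsigned tid = 0; tid < 8; tid++) {
        unsigned stride = 4;
        unsigned pos    = 2 * tid - (tid & (stride - 1));
        printf("tid=%u -> (%u, %u)\n", tid, pos, pos + stride);
    }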
 // Helper function (also used by odd-even merge sort)
-extern "C" uint factorRadix2(uint *log2L, uint L) {
-  if (!L) {
-    *log2L = 0;
-    return 0;
-  } else {
-    for (*log2L = 0; (L & 1) == 0; L >>= 1, *log2L++)
-      ;
+extern "C" uint factorRadix2(uint *log2L, uint L)
+{
+    if (!L) {
+        *log2L = 0;
+        return 0;
+    }
+    else {
+        for (*log2L = 0; (L & 1) == 0; L >>= 1, *log2L++)
+            ;

-    return L;
-  }
+        return L;
+    }
 }

-extern "C" void bitonicSortShared(uint *d_DstKey, uint *d_DstVal,
-                                  uint *d_SrcKey, uint *d_SrcVal,
-                                  uint batchSize, uint arrayLength,
-                                  uint sortDir) {
-  // Nothing to sort
-  if (arrayLength < 2) {
-    return;
-  }
+extern "C" void bitonicSortShared(uint *d_DstKey,
+                                  uint *d_DstVal,
+                                  uint *d_SrcKey,
+                                  uint *d_SrcVal,
+                                  uint  batchSize,
+                                  uint  arrayLength,
+                                  uint  sortDir)
+{
+    // Nothing to sort
+    if (arrayLength < 2) {
+        return;
+    }

-  // Only power-of-two array lengths are supported by this implementation
-  uint log2L;
-  uint factorizationRemainder = factorRadix2(&log2L, arrayLength);
-  assert(factorizationRemainder == 1);
+    // Only power-of-two array lengths are supported by this implementation
+    uint log2L;
+    uint factorizationRemainder = factorRadix2(&log2L, arrayLength);
+    assert(factorizationRemainder == 1);

-  uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT;
-  uint threadCount = SHARED_SIZE_LIMIT / 2;
+    uint blockCount  = batchSize * arrayLength / SHARED_SIZE_LIMIT;
+    uint threadCount = SHARED_SIZE_LIMIT / 2;

-  assert(arrayLength <= SHARED_SIZE_LIMIT);
-  assert((batchSize * arrayLength) % SHARED_SIZE_LIMIT == 0);
+    assert(arrayLength <= SHARED_SIZE_LIMIT);
+    assert((batchSize * arrayLength) % SHARED_SIZE_LIMIT == 0);

-  bitonicSortSharedKernel<<<blockCount, threadCount>>>(
-      d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, sortDir);
-  getLastCudaError("bitonicSortSharedKernel<<<>>> failed!\n");
+    bitonicSortSharedKernel<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, sortDir);
+    getLastCudaError("bitonicSortSharedKernel<<<>>> failed!\n");
 }

 ////////////////////////////////////////////////////////////////////////////////
 // Merge step 3: merge elementary intervals
 ////////////////////////////////////////////////////////////////////////////////
-static inline __host__ __device__ uint iDivUp(uint a, uint b) {
-  return ((a % b) == 0) ? (a / b) : (a / b + 1);
-}
+static inline __host__ __device__ uint iDivUp(uint a, uint b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); }

-static inline __host__ __device__ uint getSampleCount(uint dividend) {
-  return iDivUp(dividend, SAMPLE_STRIDE);
-}
+static inline __host__ __device__ uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); }
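iDivUp is ceiling division: iDivUp(10, 4) is 3 where plain integer division gives 2, so partially filled intervals still receive a block or a sample. The branchy form also sidesteps the wraparound that the common (a + b - 1) / b one-liner risks when a is near UINT_MAX. Typical use, mirroring getSampleCount above:

    // One sample per SAMPLE_STRIDE-element interval, rounded up.
    uint sampleCount = iDivUp(elementCount, SAMPLE_STRIDE);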
+template <uint sortDir>
+static inline __device__ void
+ComparatorExtended(uint &keyA, uint &valA, uint &flagA, uint &keyB, uint &valB, uint &flagB, uint arrowDir)
+{
+    uint t;

+    if ((!(flagA || flagB) && ((keyA > keyB) == arrowDir)) || ((arrowDir == sortDir) && (flagA == 1))
+        || ((arrowDir != sortDir) && (flagB == 1))) {
+        t     = keyA;
+        keyA  = keyB;
+        keyB  = t;
+        t     = valA;
+        valA  = valB;
+        valB  = t;
+        t     = flagA;
+        flagA = flagB;
+        flagB = t;
+    }
+}

-template <uint sortDir>
-static inline __device__ void ComparatorExtended(uint &keyA, uint &valA,
-                                                 uint &flagA, uint &keyB,
-                                                 uint &valB, uint &flagB,
-                                                 uint arrowDir) {
-  uint t;
+template <uint sortDir>
+__global__ void bitonicMergeElementaryIntervalsKernel(uint *d_DstKey,
+                                                      uint *d_DstVal,
+                                                      uint *d_SrcKey,
+                                                      uint *d_SrcVal,
+                                                      uint *d_LimitsA,
+                                                      uint *d_LimitsB,
+                                                      uint  stride,
+                                                      uint  N)
+{
+    // Handle to thread block group
+    cg::thread_block cta = cg::this_thread_block();
+    __shared__ uint  s_key[2 * SAMPLE_STRIDE];
+    __shared__ uint  s_val[2 * SAMPLE_STRIDE];
+    __shared__ uint  s_inf[2 * SAMPLE_STRIDE];

-  if ((!(flagA || flagB) && ((keyA > keyB) == arrowDir)) ||
-      ((arrowDir == sortDir) && (flagA == 1)) ||
-      ((arrowDir != sortDir) && (flagB == 1))) {
-    t = keyA;
-    keyA = keyB;
-    keyB = t;
-    t = valA;
-    valA = valB;
-    valB = t;
-    t = flagA;
-    flagA = flagB;
-    flagB = t;
-  }
-}
+    const uint intervalI   = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1);
+    const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE;
+    d_SrcKey += segmentBase;
+    d_SrcVal += segmentBase;
+    d_DstKey += segmentBase;
+    d_DstVal += segmentBase;

-template <uint sortDir>
-__global__ void bitonicMergeElementaryIntervalsKernel(
-    uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal,
-    uint *d_LimitsA, uint *d_LimitsB, uint stride, uint N) {
-  // Handle to thread block group
-  cg::thread_block cta = cg::this_thread_block();
-  __shared__ uint s_key[2 * SAMPLE_STRIDE];
-  __shared__ uint s_val[2 * SAMPLE_STRIDE];
-  __shared__ uint s_inf[2 * SAMPLE_STRIDE];
+    // Set up threadblock-wide parameters
+    __shared__ uint startSrcA, lenSrcA, startSrcB, lenSrcB, startDst;

-  const uint intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1);
-  const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE;
-  d_SrcKey += segmentBase;
-  d_SrcVal += segmentBase;
-  d_DstKey += segmentBase;
-  d_DstVal += segmentBase;
+    if (threadIdx.x == 0) {
+        uint segmentElementsA = stride;
+        uint segmentElementsB = umin(stride, N - segmentBase - stride);
+        uint segmentSamplesA  = stride / SAMPLE_STRIDE;
+        uint segmentSamplesB  = getSampleCount(segmentElementsB);
+        uint segmentSamples   = segmentSamplesA + segmentSamplesB;

-  // Set up threadblock-wide parameters
-  __shared__ uint startSrcA, lenSrcA, startSrcB, lenSrcB, startDst;
+        startSrcA = d_LimitsA[blockIdx.x];
+        startSrcB = d_LimitsB[blockIdx.x];
+        startDst  = startSrcA + startSrcB;

-  if (threadIdx.x == 0) {
-    uint segmentElementsA = stride;
-    uint segmentElementsB = umin(stride, N - segmentBase - stride);
-    uint segmentSamplesA = stride / SAMPLE_STRIDE;
-    uint segmentSamplesB = getSampleCount(segmentElementsB);
-    uint segmentSamples = segmentSamplesA + segmentSamplesB;
+        uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA;
+        uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB;
+        lenSrcA      = endSrcA - startSrcA;
+        lenSrcB      = endSrcB - startSrcB;
+    }

-    startSrcA = d_LimitsA[blockIdx.x];
-    startSrcB = d_LimitsB[blockIdx.x];
-    startDst = startSrcA + startSrcB;
+    s_inf[threadIdx.x + 0]             = 1;
+    s_inf[threadIdx.x + SAMPLE_STRIDE] = 1;

-    uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1]
-                                                    : segmentElementsA;
-    uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1]
-                                                    : segmentElementsB;
-    lenSrcA = endSrcA - startSrcA;
-    lenSrcB = endSrcB - startSrcB;
-  }

-  s_inf[threadIdx.x + 0] = 1;
-  s_inf[threadIdx.x + SAMPLE_STRIDE] = 1;

-  // Load input data
-  cg::sync(cta);

-  if (threadIdx.x < lenSrcA) {
-    s_key[threadIdx.x] = d_SrcKey[0 + startSrcA + threadIdx.x];
-    s_val[threadIdx.x] = d_SrcVal[0 + startSrcA + threadIdx.x];
-    s_inf[threadIdx.x] = 0;
-  }

-  // Prepare for bitonic merge by inversing the ordering
-  if (threadIdx.x < lenSrcB) {
-    s_key[2 * SAMPLE_STRIDE - 1 - threadIdx.x] =
-        d_SrcKey[stride + startSrcB + threadIdx.x];
-    s_val[2 * SAMPLE_STRIDE - 1 - threadIdx.x] =
-        d_SrcVal[stride + startSrcB + threadIdx.x];
-    s_inf[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = 0;
-  }

-  //"Extended" bitonic merge
-  for (uint stride = SAMPLE_STRIDE; stride > 0; stride >>= 1) {
+    // Load input data
+    cg::sync(cta);
-    uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
-    ComparatorExtended<sortDir>(s_key[pos + 0], s_val[pos + 0], s_inf[pos + 0],
-                                s_key[pos + stride], s_val[pos + stride],
-                                s_inf[pos + stride], sortDir);
-  }

-  // Store sorted data
-  cg::sync(cta);
-  d_DstKey += startDst;
-  d_DstVal += startDst;
+    if (threadIdx.x < lenSrcA) {
+        s_key[threadIdx.x] = d_SrcKey[0 + startSrcA + threadIdx.x];
+        s_val[threadIdx.x] = d_SrcVal[0 + startSrcA + threadIdx.x];
+        s_inf[threadIdx.x] = 0;
+    }

-  if (threadIdx.x < lenSrcA) {
-    d_DstKey[threadIdx.x] = s_key[threadIdx.x];
-    d_DstVal[threadIdx.x] = s_val[threadIdx.x];
-  }
+    // Prepare for bitonic merge by inversing the ordering
+    if (threadIdx.x < lenSrcB) {
+        s_key[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcKey[stride + startSrcB + threadIdx.x];
+        s_val[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcVal[stride + startSrcB + threadIdx.x];
+        s_inf[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = 0;
+    }

-  if (threadIdx.x < lenSrcB) {
-    d_DstKey[lenSrcA + threadIdx.x] = s_key[lenSrcA + threadIdx.x];
-    d_DstVal[lenSrcA + threadIdx.x] = s_val[lenSrcA + threadIdx.x];
-  }
+    //"Extended" bitonic merge
+    for (uint stride = SAMPLE_STRIDE; stride > 0; stride >>= 1) {
+        cg::sync(cta);
+        uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
+        ComparatorExtended<sortDir>(s_key[pos + 0],
+                                    s_val[pos + 0],
+                                    s_inf[pos + 0],
+                                    s_key[pos + stride],
+                                    s_val[pos + stride],
+                                    s_inf[pos + stride],
+                                    sortDir);
+    }

+    // Store sorted data
+    cg::sync(cta);
+    d_DstKey += startDst;
+    d_DstVal += startDst;

+    if (threadIdx.x < lenSrcA) {
+        d_DstKey[threadIdx.x] = s_key[threadIdx.x];
+        d_DstVal[threadIdx.x] = s_val[threadIdx.x];
+    }

+    if (threadIdx.x < lenSrcB) {
+        d_DstKey[lenSrcA + threadIdx.x] = s_key[lenSrcA + threadIdx.x];
+        d_DstVal[lenSrcA + threadIdx.x] = s_val[lenSrcA + threadIdx.x];
+    }
 }

-extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey, uint *d_DstVal,
-                                                uint *d_SrcKey, uint *d_SrcVal,
+extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey,
+                                                uint *d_DstVal,
+                                                uint *d_SrcKey,
+                                                uint *d_SrcVal,
                                                 uint *d_LimitsA,
-                                                uint *d_LimitsB, uint stride,
-                                                uint N, uint sortDir) {
-  uint lastSegmentElements = N % (2 * stride);
+                                                uint *d_LimitsB,
+                                                uint  stride,
+                                                uint  N,
+                                                uint  sortDir)
+{
+    uint lastSegmentElements = N % (2 * stride);

-  uint mergePairs = (lastSegmentElements > stride)
			
		||||
                        ? getSampleCount(N)
 | 
			
		||||
                        : (N - lastSegmentElements) / SAMPLE_STRIDE;
 | 
			
		||||
    uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
 | 
			
		||||
 | 
			
		||||
  if (sortDir) {
 | 
			
		||||
    bitonicMergeElementaryIntervalsKernel<1U><<<mergePairs, SAMPLE_STRIDE>>>(
 | 
			
		||||
        d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride,
 | 
			
		||||
        N);
 | 
			
		||||
    getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
 | 
			
		||||
  } else {
 | 
			
		||||
    bitonicMergeElementaryIntervalsKernel<0U><<<mergePairs, SAMPLE_STRIDE>>>(
 | 
			
		||||
        d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride,
 | 
			
		||||
        N);
 | 
			
		||||
    getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
 | 
			
		||||
  }
 | 
			
		||||
    if (sortDir) {
 | 
			
		||||
        bitonicMergeElementaryIntervalsKernel<1U>
 | 
			
		||||
            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
 | 
			
		||||
        getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
 | 
			
		||||
    }
 | 
			
		||||
    else {
 | 
			
		||||
        bitonicMergeElementaryIntervalsKernel<0U>
 | 
			
		||||
            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
 | 
			
		||||
        getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
 | 
			
		||||
    }
 | 
			
		||||
}
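
// Launch-geometry note (illustrative, not from the original source): the
// wrapper above starts mergePairs blocks of SAMPLE_STRIDE threads, one block
// per elementary interval, so each block bitonically merges at most
// 2 * SAMPLE_STRIDE elements bounded by d_LimitsA/d_LimitsB.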

@@ -26,96 +26,94 @@
 */

#include <assert.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_functions.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <helper_functions.h>
#include <helper_cuda.h>

#include "mergeSort_common.h"

////////////////////////////////////////////////////////////////////////////////
// Test driver
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
  uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
  StopWatchInterface *hTimer = NULL;
int main(int argc, char **argv)
{
    uint               *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
    uint               *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
    StopWatchInterface *hTimer = NULL;

  const uint N = 4 * 1048576;
  const uint DIR = 1;
  const uint numValues = 65536;
    const uint N         = 4 * 1048576;
    const uint DIR       = 1;
    const uint numValues = 65536;

  printf("%s Starting...\n\n", argv[0]);
    printf("%s Starting...\n\n", argv[0]);

  int dev = findCudaDevice(argc, (const char **)argv);
    int dev = findCudaDevice(argc, (const char **)argv);

  if (dev == -1) {
    return EXIT_FAILURE;
  }
    if (dev == -1) {
        return EXIT_FAILURE;
    }

  printf("Allocating and initializing host arrays...\n\n");
  sdkCreateTimer(&hTimer);
  h_SrcKey = (uint *)malloc(N * sizeof(uint));
  h_SrcVal = (uint *)malloc(N * sizeof(uint));
  h_DstKey = (uint *)malloc(N * sizeof(uint));
  h_DstVal = (uint *)malloc(N * sizeof(uint));
    printf("Allocating and initializing host arrays...\n\n");
    sdkCreateTimer(&hTimer);
    h_SrcKey = (uint *)malloc(N * sizeof(uint));
    h_SrcVal = (uint *)malloc(N * sizeof(uint));
    h_DstKey = (uint *)malloc(N * sizeof(uint));
    h_DstVal = (uint *)malloc(N * sizeof(uint));

  srand(2009);
    srand(2009);

  for (uint i = 0; i < N; i++) {
    h_SrcKey[i] = rand() % numValues;
  }
    for (uint i = 0; i < N; i++) {
        h_SrcKey[i] = rand() % numValues;
    }

  fillValues(h_SrcVal, N);
    fillValues(h_SrcVal, N);

  printf("Allocating and initializing CUDA arrays...\n\n");
  checkCudaErrors(cudaMalloc((void **)&d_DstKey, N * sizeof(uint)));
  checkCudaErrors(cudaMalloc((void **)&d_DstVal, N * sizeof(uint)));
  checkCudaErrors(cudaMalloc((void **)&d_BufKey, N * sizeof(uint)));
  checkCudaErrors(cudaMalloc((void **)&d_BufVal, N * sizeof(uint)));
  checkCudaErrors(cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)));
  checkCudaErrors(cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)));
  checkCudaErrors(
      cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice));
  checkCudaErrors(
      cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice));
    printf("Allocating and initializing CUDA arrays...\n\n");
    checkCudaErrors(cudaMalloc((void **)&d_DstKey, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_DstVal, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_BufKey, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_BufVal, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)));
    checkCudaErrors(cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice));

  printf("Initializing GPU merge sort...\n");
  initMergeSort();
    printf("Initializing GPU merge sort...\n");
    initMergeSort();

  printf("Running GPU merge sort...\n");
  checkCudaErrors(cudaDeviceSynchronize());
  sdkResetTimer(&hTimer);
  sdkStartTimer(&hTimer);
  mergeSort(d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, DIR);
  checkCudaErrors(cudaDeviceSynchronize());
  sdkStopTimer(&hTimer);
  printf("Time: %f ms\n", sdkGetTimerValue(&hTimer));
    printf("Running GPU merge sort...\n");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    mergeSort(d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, DIR);
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    printf("Time: %f ms\n", sdkGetTimerValue(&hTimer));

  printf("Reading back GPU merge sort results...\n");
  checkCudaErrors(
      cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost));
  checkCudaErrors(
      cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost));
    printf("Reading back GPU merge sort results...\n");
    checkCudaErrors(cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost));

  printf("Inspecting the results...\n");
  uint keysFlag = validateSortedKeys(h_DstKey, h_SrcKey, 1, N, numValues, DIR);
    printf("Inspecting the results...\n");
    uint keysFlag = validateSortedKeys(h_DstKey, h_SrcKey, 1, N, numValues, DIR);

  uint valuesFlag = validateSortedValues(h_DstKey, h_DstVal, h_SrcKey, 1, N);
    uint valuesFlag = validateSortedValues(h_DstKey, h_DstVal, h_SrcKey, 1, N);

  printf("Shutting down...\n");
  closeMergeSort();
  sdkDeleteTimer(&hTimer);
  checkCudaErrors(cudaFree(d_SrcVal));
  checkCudaErrors(cudaFree(d_SrcKey));
  checkCudaErrors(cudaFree(d_BufVal));
  checkCudaErrors(cudaFree(d_BufKey));
  checkCudaErrors(cudaFree(d_DstVal));
  checkCudaErrors(cudaFree(d_DstKey));
  free(h_DstVal);
  free(h_DstKey);
  free(h_SrcVal);
  free(h_SrcKey);
    printf("Shutting down...\n");
    closeMergeSort();
    sdkDeleteTimer(&hTimer);
    checkCudaErrors(cudaFree(d_SrcVal));
    checkCudaErrors(cudaFree(d_SrcKey));
    checkCudaErrors(cudaFree(d_BufVal));
    checkCudaErrors(cudaFree(d_BufKey));
    checkCudaErrors(cudaFree(d_DstVal));
    checkCudaErrors(cudaFree(d_DstKey));
    free(h_DstVal);
    free(h_DstKey);
    free(h_SrcVal);
    free(h_SrcKey);

  exit((keysFlag && valuesFlag) ? EXIT_SUCCESS : EXIT_FAILURE);
    exit((keysFlag && valuesFlag) ? EXIT_SUCCESS : EXIT_FAILURE);
}
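
/*
 * Reference-check sketch (illustrative, not part of the sample): with DIR == 1
 * the device output must match the host keys sorted ascending. The helper name
 * referenceCheck is hypothetical.
 */
#include <algorithm>
#include <vector>

static bool referenceCheck(const unsigned int *h_SrcKey, const unsigned int *h_DstKey, unsigned int n)
{
    std::vector<unsigned int> ref(h_SrcKey, h_SrcKey + n); // copy the unsorted keys
    std::sort(ref.begin(), ref.end());                     // ascending, matching DIR == 1
    return std::equal(ref.begin(), ref.end(), h_DstKey);   // compare with GPU result
}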

@@ -39,491 +39,499 @@
namespace cg = cooperative_groups;

#include <helper_cuda.h>

#include "mergeSort_common.h"

////////////////////////////////////////////////////////////////////////////////
// Helper functions
////////////////////////////////////////////////////////////////////////////////
static inline __host__ __device__ uint iDivUp(uint a, uint b) {
  return ((a % b) == 0) ? (a / b) : (a / b + 1);
}
static inline __host__ __device__ uint iDivUp(uint a, uint b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); }

static inline __host__ __device__ uint getSampleCount(uint dividend) {
  return iDivUp(dividend, SAMPLE_STRIDE);
}
static inline __host__ __device__ uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); }
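
// Worked values (illustrative): iDivUp(256, 256) == 1 and iDivUp(257, 256) == 2,
// so getSampleCount rounds a partially filled last SAMPLE_STRIDE-sized slot up
// to a whole sample, e.g. getSampleCount(129) == 2 when SAMPLE_STRIDE == 128
// (the value assumed here from mergeSort_common.h).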

#define W (sizeof(uint) * 8)
static inline __device__ uint nextPowerOfTwo(uint x) {
  /*
      --x;
      x |= x >> 1;
      x |= x >> 2;
      x |= x >> 4;
      x |= x >> 8;
      x |= x >> 16;
      return ++x;
  */
  return 1U << (W - __clz(x - 1));
static inline __device__ uint nextPowerOfTwo(uint x)
{
    /*
        --x;
        x |= x >> 1;
        x |= x >> 2;
        x |= x >> 4;
        x |= x >> 8;
        x |= x >> 16;
        return ++x;
    */
    return 1U << (W - __clz(x - 1));
}
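
// Worked example (illustrative): for x == 5, x - 1 == 4 has 29 leading zeros in
// a 32-bit word, so W - __clz(x - 1) == 3 and the result is 1U << 3 == 8, the
// smallest power of two >= 5; for x == 1, __clz(0) == 32 yields 1U << 0 == 1.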

template <uint sortDir>
static inline __device__ uint binarySearchInclusive(uint val, uint *data,
                                                    uint L, uint stride) {
  if (L == 0) {
    return 0;
  }

  uint pos = 0;

  for (; stride > 0; stride >>= 1) {
    uint newPos = umin(pos + stride, L);

    if ((sortDir && (data[newPos - 1] <= val)) ||
        (!sortDir && (data[newPos - 1] >= val))) {
      pos = newPos;
template <uint sortDir> static inline __device__ uint binarySearchInclusive(uint val, uint *data, uint L, uint stride)
{
    if (L == 0) {
        return 0;
    }
  }

  return pos;
    uint pos = 0;

    for (; stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))) {
            pos = newPos;
        }
    }

    return pos;
}

template <uint sortDir>
static inline __device__ uint binarySearchExclusive(uint val, uint *data,
                                                    uint L, uint stride) {
  if (L == 0) {
    return 0;
  }

  uint pos = 0;

  for (; stride > 0; stride >>= 1) {
    uint newPos = umin(pos + stride, L);

    if ((sortDir && (data[newPos - 1] < val)) ||
        (!sortDir && (data[newPos - 1] > val))) {
      pos = newPos;
template <uint sortDir> static inline __device__ uint binarySearchExclusive(uint val, uint *data, uint L, uint stride)
{
    if (L == 0) {
        return 0;
    }
  }

  return pos;
    uint pos = 0;

    for (; stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))) {
            pos = newPos;
        }
    }

    return pos;
}
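
// Behaviour sketch (illustrative): with ascending data {1, 3, 5, 7}, L == 4 and
// an initial stride of 4,
//   binarySearchExclusive<1U>(5, data, 4, 4) == 2   // elements <  5
//   binarySearchInclusive<1U>(5, data, 4, 4) == 3   // elements <= 5
// The exclusive/inclusive pair is what keeps equal keys stable when the two
// halves are merged below.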

////////////////////////////////////////////////////////////////////////////////
// Bottom-level merge sort (binary search-based)
////////////////////////////////////////////////////////////////////////////////
template <uint sortDir>
__global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
                                      uint *d_SrcKey, uint *d_SrcVal,
                                      uint arrayLength) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  __shared__ uint s_key[SHARED_SIZE_LIMIT];
  __shared__ uint s_val[SHARED_SIZE_LIMIT];
__global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength)
{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();
    __shared__ uint  s_key[SHARED_SIZE_LIMIT];
    __shared__ uint  s_val[SHARED_SIZE_LIMIT];

  d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
  d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
  d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
  d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
  s_key[threadIdx.x + 0] = d_SrcKey[0];
  s_val[threadIdx.x + 0] = d_SrcVal[0];
  s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
      d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
  s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
      d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
    d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    s_key[threadIdx.x + 0]                       = d_SrcKey[0];
    s_val[threadIdx.x + 0]                       = d_SrcVal[0];
    s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
    s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];

  for (uint stride = 1; stride < arrayLength; stride <<= 1) {
    uint lPos = threadIdx.x & (stride - 1);
    uint *baseKey = s_key + 2 * (threadIdx.x - lPos);
    uint *baseVal = s_val + 2 * (threadIdx.x - lPos);
    for (uint stride = 1; stride < arrayLength; stride <<= 1) {
        uint  lPos    = threadIdx.x & (stride - 1);
        uint *baseKey = s_key + 2 * (threadIdx.x - lPos);
        uint *baseVal = s_val + 2 * (threadIdx.x - lPos);

        cg::sync(cta);
        uint keyA = baseKey[lPos + 0];
        uint valA = baseVal[lPos + 0];
        uint keyB = baseKey[lPos + stride];
        uint valB = baseVal[lPos + stride];
        uint posA = binarySearchExclusive<sortDir>(keyA, baseKey + stride, stride, stride) + lPos;
        uint posB = binarySearchInclusive<sortDir>(keyB, baseKey + 0, stride, stride) + lPos;

        cg::sync(cta);
        baseKey[posA] = keyA;
        baseVal[posA] = valA;
        baseKey[posB] = keyB;
        baseVal[posB] = valB;
    }

    cg::sync(cta);
    uint keyA = baseKey[lPos + 0];
    uint valA = baseVal[lPos + 0];
    uint keyB = baseKey[lPos + stride];
    uint valB = baseVal[lPos + stride];
    uint posA =
        binarySearchExclusive<sortDir>(keyA, baseKey + stride, stride, stride) +
        lPos;
    uint posB =
        binarySearchInclusive<sortDir>(keyB, baseKey + 0, stride, stride) +
        lPos;

    cg::sync(cta);
    baseKey[posA] = keyA;
    baseVal[posA] = valA;
    baseKey[posB] = keyB;
    baseVal[posB] = valB;
  }

  cg::sync(cta);
  d_DstKey[0] = s_key[threadIdx.x + 0];
  d_DstVal[0] = s_val[threadIdx.x + 0];
  d_DstKey[(SHARED_SIZE_LIMIT / 2)] =
      s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
  d_DstVal[(SHARED_SIZE_LIMIT / 2)] =
      s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
    d_DstKey[0]                       = s_key[threadIdx.x + 0];
    d_DstVal[0]                       = s_val[threadIdx.x + 0];
    d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
    d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
}
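
// Merge identity used above (illustrative): an element of half A at local index
// lPos lands at lPos + (number of B elements that must precede it), which is
// binarySearchExclusive over B; elements of B use the inclusive variant, so
// equal keys from A are written first and the merge stays stable.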

static void mergeSortShared(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey,
                            uint *d_SrcVal, uint batchSize, uint arrayLength,
                            uint sortDir) {
  if (arrayLength < 2) {
    return;
  }
static void mergeSortShared(uint *d_DstKey,
                            uint *d_DstVal,
                            uint *d_SrcKey,
                            uint *d_SrcVal,
                            uint  batchSize,
                            uint  arrayLength,
                            uint  sortDir)
{
    if (arrayLength < 2) {
        return;
    }

  assert(SHARED_SIZE_LIMIT % arrayLength == 0);
  assert(((batchSize * arrayLength) % SHARED_SIZE_LIMIT) == 0);
  uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT;
  uint threadCount = SHARED_SIZE_LIMIT / 2;
    assert(SHARED_SIZE_LIMIT % arrayLength == 0);
    assert(((batchSize * arrayLength) % SHARED_SIZE_LIMIT) == 0);
    uint blockCount  = batchSize * arrayLength / SHARED_SIZE_LIMIT;
    uint threadCount = SHARED_SIZE_LIMIT / 2;

  if (sortDir) {
    mergeSortSharedKernel<1U><<<blockCount, threadCount>>>(
        d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
    getLastCudaError("mergeSortShared<1><<<>>> failed\n");
  } else {
    mergeSortSharedKernel<0U><<<blockCount, threadCount>>>(
        d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
    getLastCudaError("mergeSortShared<0><<<>>> failed\n");
  }
    if (sortDir) {
        mergeSortSharedKernel<1U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
        getLastCudaError("mergeSortShared<1><<<>>> failed\n");
    }
    else {
        mergeSortSharedKernel<0U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
        getLastCudaError("mergeSortShared<0><<<>>> failed\n");
    }
}
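
// Launch math (illustrative): every thread owns two of the SHARED_SIZE_LIMIT
// elements held by a block, hence blockCount = batchSize * arrayLength /
// SHARED_SIZE_LIMIT and threadCount = SHARED_SIZE_LIMIT / 2 above.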

////////////////////////////////////////////////////////////////////////////////
// Merge step 1: generate sample ranks
////////////////////////////////////////////////////////////////////////////////
template <uint sortDir>
__global__ void generateSampleRanksKernel(uint *d_RanksA, uint *d_RanksB,
                                          uint *d_SrcKey, uint stride, uint N,
                                          uint threadCount) {
  uint pos = blockIdx.x * blockDim.x + threadIdx.x;
__global__ void
generateSampleRanksKernel(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, uint stride, uint N, uint threadCount)
{
    uint pos = blockIdx.x * blockDim.x + threadIdx.x;

  if (pos >= threadCount) {
    return;
  }
    if (pos >= threadCount) {
        return;
    }

  const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
  const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
  d_SrcKey += segmentBase;
  d_RanksA += segmentBase / SAMPLE_STRIDE;
  d_RanksB += segmentBase / SAMPLE_STRIDE;
    const uint i           = pos & ((stride / SAMPLE_STRIDE) - 1);
    const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
    d_SrcKey += segmentBase;
    d_RanksA += segmentBase / SAMPLE_STRIDE;
    d_RanksB += segmentBase / SAMPLE_STRIDE;

  const uint segmentElementsA = stride;
  const uint segmentElementsB = umin(stride, N - segmentBase - stride);
  const uint segmentSamplesA = getSampleCount(segmentElementsA);
  const uint segmentSamplesB = getSampleCount(segmentElementsB);
    const uint segmentElementsA = stride;
    const uint segmentElementsB = umin(stride, N - segmentBase - stride);
    const uint segmentSamplesA  = getSampleCount(segmentElementsA);
    const uint segmentSamplesB  = getSampleCount(segmentElementsB);

  if (i < segmentSamplesA) {
    d_RanksA[i] = i * SAMPLE_STRIDE;
    d_RanksB[i] = binarySearchExclusive<sortDir>(
        d_SrcKey[i * SAMPLE_STRIDE], d_SrcKey + stride, segmentElementsB,
        nextPowerOfTwo(segmentElementsB));
  }
    if (i < segmentSamplesA) {
        d_RanksA[i] = i * SAMPLE_STRIDE;
        d_RanksB[i] = binarySearchExclusive<sortDir>(
            d_SrcKey[i * SAMPLE_STRIDE], d_SrcKey + stride, segmentElementsB, nextPowerOfTwo(segmentElementsB));
    }

  if (i < segmentSamplesB) {
    d_RanksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
    d_RanksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive<sortDir>(
        d_SrcKey[stride + i * SAMPLE_STRIDE], d_SrcKey + 0, segmentElementsA,
        nextPowerOfTwo(segmentElementsA));
  }
    if (i < segmentSamplesB) {
        d_RanksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
        d_RanksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive<sortDir>(
            d_SrcKey[stride + i * SAMPLE_STRIDE], d_SrcKey + 0, segmentElementsA, nextPowerOfTwo(segmentElementsA));
    }
}

static void generateSampleRanks(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey,
                                uint stride, uint N, uint sortDir) {
  uint lastSegmentElements = N % (2 * stride);
  uint threadCount =
      (lastSegmentElements > stride)
          ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
          : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
static void generateSampleRanks(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, uint stride, uint N, uint sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

  if (sortDir) {
    generateSampleRanksKernel<1U><<<iDivUp(threadCount, 256), 256>>>(
        d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
    getLastCudaError("generateSampleRanksKernel<1U><<<>>> failed\n");
  } else {
    generateSampleRanksKernel<0U><<<iDivUp(threadCount, 256), 256>>>(
        d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
    getLastCudaError("generateSampleRanksKernel<0U><<<>>> failed\n");
  }
    if (sortDir) {
        generateSampleRanksKernel<1U>
            <<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
        getLastCudaError("generateSampleRanksKernel<1U><<<>>> failed\n");
    }
    else {
        generateSampleRanksKernel<0U>
            <<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
        getLastCudaError("generateSampleRanksKernel<0U><<<>>> failed\n");
    }
}
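
// Sizing sketch (illustrative): each pair of stride-sized segments needs
// stride / SAMPLE_STRIDE sampling threads, e.g. 8 threads per pair for
// stride == 1024 with SAMPLE_STRIDE == 128; the wrapper above rounds the total
// into 256-thread blocks with iDivUp(threadCount, 256).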

////////////////////////////////////////////////////////////////////////////////
// Merge step 2: generate sample ranks and indices
////////////////////////////////////////////////////////////////////////////////
__global__ void mergeRanksAndIndicesKernel(uint *d_Limits, uint *d_Ranks,
                                           uint stride, uint N,
                                           uint threadCount) {
  uint pos = blockIdx.x * blockDim.x + threadIdx.x;
__global__ void mergeRanksAndIndicesKernel(uint *d_Limits, uint *d_Ranks, uint stride, uint N, uint threadCount)
{
    uint pos = blockIdx.x * blockDim.x + threadIdx.x;

  if (pos >= threadCount) {
    return;
  }
    if (pos >= threadCount) {
        return;
    }

  const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
  const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
  d_Ranks += (pos - i) * 2;
  d_Limits += (pos - i) * 2;
    const uint i           = pos & ((stride / SAMPLE_STRIDE) - 1);
    const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
    d_Ranks += (pos - i) * 2;
    d_Limits += (pos - i) * 2;

  const uint segmentElementsA = stride;
  const uint segmentElementsB = umin(stride, N - segmentBase - stride);
  const uint segmentSamplesA = getSampleCount(segmentElementsA);
  const uint segmentSamplesB = getSampleCount(segmentElementsB);
    const uint segmentElementsA = stride;
    const uint segmentElementsB = umin(stride, N - segmentBase - stride);
    const uint segmentSamplesA  = getSampleCount(segmentElementsA);
    const uint segmentSamplesB  = getSampleCount(segmentElementsB);

  if (i < segmentSamplesA) {
    uint dstPos = binarySearchExclusive<1U>(
                      d_Ranks[i], d_Ranks + segmentSamplesA, segmentSamplesB,
                      nextPowerOfTwo(segmentSamplesB)) +
                  i;
    d_Limits[dstPos] = d_Ranks[i];
  }
    if (i < segmentSamplesA) {
        uint dstPos = binarySearchExclusive<1U>(
                          d_Ranks[i], d_Ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB))
                    + i;
        d_Limits[dstPos] = d_Ranks[i];
    }

  if (i < segmentSamplesB) {
    uint dstPos = binarySearchInclusive<1U>(d_Ranks[segmentSamplesA + i],
                                            d_Ranks, segmentSamplesA,
                                            nextPowerOfTwo(segmentSamplesA)) +
                  i;
    d_Limits[dstPos] = d_Ranks[segmentSamplesA + i];
  }
    if (i < segmentSamplesB) {
        uint dstPos = binarySearchInclusive<1U>(
                          d_Ranks[segmentSamplesA + i], d_Ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA))
                    + i;
        d_Limits[dstPos] = d_Ranks[segmentSamplesA + i];
    }
}

static void mergeRanksAndIndices(uint *d_LimitsA, uint *d_LimitsB,
                                 uint *d_RanksA, uint *d_RanksB, uint stride,
                                 uint N) {
  uint lastSegmentElements = N % (2 * stride);
  uint threadCount =
      (lastSegmentElements > stride)
          ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
          : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
static void mergeRanksAndIndices(uint *d_LimitsA, uint *d_LimitsB, uint *d_RanksA, uint *d_RanksB, uint stride, uint N)
{
    uint lastSegmentElements = N % (2 * stride);
    uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

  mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(
      d_LimitsA, d_RanksA, stride, N, threadCount);
  getLastCudaError("mergeRanksAndIndicesKernel(A)<<<>>> failed\n");
    mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(d_LimitsA, d_RanksA, stride, N, threadCount);
    getLastCudaError("mergeRanksAndIndicesKernel(A)<<<>>> failed\n");

  mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(
      d_LimitsB, d_RanksB, stride, N, threadCount);
  getLastCudaError("mergeRanksAndIndicesKernel(B)<<<>>> failed\n");
    mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(d_LimitsB, d_RanksB, stride, N, threadCount);
    getLastCudaError("mergeRanksAndIndicesKernel(B)<<<>>> failed\n");
}

////////////////////////////////////////////////////////////////////////////////
// Merge step 3: merge elementary intervals
////////////////////////////////////////////////////////////////////////////////
template <uint sortDir>
inline __device__ void merge(uint *dstKey, uint *dstVal, uint *srcAKey,
                             uint *srcAVal, uint *srcBKey, uint *srcBVal,
                             uint lenA, uint nPowTwoLenA, uint lenB,
                             uint nPowTwoLenB, cg::thread_block cta) {
  uint keyA, valA, keyB, valB, dstPosA, dstPosB;
inline __device__ void merge(uint            *dstKey,
                             uint            *dstVal,
                             uint            *srcAKey,
                             uint            *srcAVal,
                             uint            *srcBKey,
                             uint            *srcBVal,
                             uint             lenA,
                             uint             nPowTwoLenA,
                             uint             lenB,
                             uint             nPowTwoLenB,
                             cg::thread_block cta)
{
    uint keyA, valA, keyB, valB, dstPosA, dstPosB;

  if (threadIdx.x < lenA) {
    keyA = srcAKey[threadIdx.x];
    valA = srcAVal[threadIdx.x];
    dstPosA = binarySearchExclusive<sortDir>(keyA, srcBKey, lenB, nPowTwoLenB) +
              threadIdx.x;
  }
    if (threadIdx.x < lenA) {
        keyA    = srcAKey[threadIdx.x];
        valA    = srcAVal[threadIdx.x];
        dstPosA = binarySearchExclusive<sortDir>(keyA, srcBKey, lenB, nPowTwoLenB) + threadIdx.x;
    }

  if (threadIdx.x < lenB) {
    keyB = srcBKey[threadIdx.x];
    valB = srcBVal[threadIdx.x];
    dstPosB = binarySearchInclusive<sortDir>(keyB, srcAKey, lenA, nPowTwoLenA) +
              threadIdx.x;
  }
    if (threadIdx.x < lenB) {
        keyB    = srcBKey[threadIdx.x];
        valB    = srcBVal[threadIdx.x];
        dstPosB = binarySearchInclusive<sortDir>(keyB, srcAKey, lenA, nPowTwoLenA) + threadIdx.x;
    }

  cg::sync(cta);
    cg::sync(cta);

  if (threadIdx.x < lenA) {
    dstKey[dstPosA] = keyA;
    dstVal[dstPosA] = valA;
  }
    if (threadIdx.x < lenA) {
        dstKey[dstPosA] = keyA;
        dstVal[dstPosA] = valA;
    }

  if (threadIdx.x < lenB) {
    dstKey[dstPosB] = keyB;
    dstVal[dstPosB] = valB;
  }
    if (threadIdx.x < lenB) {
        dstKey[dstPosB] = keyB;
        dstVal[dstPosB] = valB;
    }
}
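
// Worked example (illustrative): merging A = {1, 3} with B = {2, 4} and
// sortDir == 1, thread 0 scatters A[0] = 1 to 0 + 0 and B[0] = 2 to 1 + 0,
// thread 1 scatters A[1] = 3 to 1 + 1 and B[1] = 4 to 2 + 1, producing
// {1, 2, 3, 4} with no collisions between the A-side and B-side positions.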

template <uint sortDir>
__global__ void mergeElementaryIntervalsKernel(uint *d_DstKey, uint *d_DstVal,
                                               uint *d_SrcKey, uint *d_SrcVal,
                                               uint *d_LimitsA, uint *d_LimitsB,
                                               uint stride, uint N) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  __shared__ uint s_key[2 * SAMPLE_STRIDE];
  __shared__ uint s_val[2 * SAMPLE_STRIDE];
__global__ void mergeElementaryIntervalsKernel(uint *d_DstKey,
                                               uint *d_DstVal,
                                               uint *d_SrcKey,
                                               uint *d_SrcVal,
                                               uint *d_LimitsA,
                                               uint *d_LimitsB,
                                               uint  stride,
                                               uint  N)
{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();
    __shared__ uint  s_key[2 * SAMPLE_STRIDE];
    __shared__ uint  s_val[2 * SAMPLE_STRIDE];

  const uint intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1);
  const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE;
  d_SrcKey += segmentBase;
  d_SrcVal += segmentBase;
  d_DstKey += segmentBase;
  d_DstVal += segmentBase;
    const uint intervalI   = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1);
    const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE;
    d_SrcKey += segmentBase;
    d_SrcVal += segmentBase;
    d_DstKey += segmentBase;
    d_DstVal += segmentBase;

  // Set up threadblock-wide parameters
  __shared__ uint startSrcA, startSrcB, lenSrcA, lenSrcB, startDstA, startDstB;
    // Set up threadblock-wide parameters
    __shared__ uint startSrcA, startSrcB, lenSrcA, lenSrcB, startDstA, startDstB;

  if (threadIdx.x == 0) {
    uint segmentElementsA = stride;
    uint segmentElementsB = umin(stride, N - segmentBase - stride);
    uint segmentSamplesA = getSampleCount(segmentElementsA);
    uint segmentSamplesB = getSampleCount(segmentElementsB);
    uint segmentSamples = segmentSamplesA + segmentSamplesB;
    if (threadIdx.x == 0) {
        uint segmentElementsA = stride;
        uint segmentElementsB = umin(stride, N - segmentBase - stride);
        uint segmentSamplesA  = getSampleCount(segmentElementsA);
        uint segmentSamplesB  = getSampleCount(segmentElementsB);
        uint segmentSamples   = segmentSamplesA + segmentSamplesB;

    startSrcA = d_LimitsA[blockIdx.x];
    startSrcB = d_LimitsB[blockIdx.x];
    uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1]
                                                    : segmentElementsA;
    uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1]
                                                    : segmentElementsB;
    lenSrcA = endSrcA - startSrcA;
    lenSrcB = endSrcB - startSrcB;
    startDstA = startSrcA + startSrcB;
    startDstB = startDstA + lenSrcA;
  }

  // Load main input data
  cg::sync(cta);

  if (threadIdx.x < lenSrcA) {
    s_key[threadIdx.x + 0] = d_SrcKey[0 + startSrcA + threadIdx.x];
    s_val[threadIdx.x + 0] = d_SrcVal[0 + startSrcA + threadIdx.x];
  }

  if (threadIdx.x < lenSrcB) {
    s_key[threadIdx.x + SAMPLE_STRIDE] =
        d_SrcKey[stride + startSrcB + threadIdx.x];
    s_val[threadIdx.x + SAMPLE_STRIDE] =
        d_SrcVal[stride + startSrcB + threadIdx.x];
  }

  // Merge data in shared memory
  cg::sync(cta);
  merge<sortDir>(s_key, s_val, s_key + 0, s_val + 0, s_key + SAMPLE_STRIDE,
                 s_val + SAMPLE_STRIDE, lenSrcA, SAMPLE_STRIDE, lenSrcB,
                 SAMPLE_STRIDE, cta);

  // Store merged data
  cg::sync(cta);

  if (threadIdx.x < lenSrcA) {
    d_DstKey[startDstA + threadIdx.x] = s_key[threadIdx.x];
    d_DstVal[startDstA + threadIdx.x] = s_val[threadIdx.x];
  }

  if (threadIdx.x < lenSrcB) {
    d_DstKey[startDstB + threadIdx.x] = s_key[lenSrcA + threadIdx.x];
    d_DstVal[startDstB + threadIdx.x] = s_val[lenSrcA + threadIdx.x];
  }
}

static void mergeElementaryIntervals(uint *d_DstKey, uint *d_DstVal,
                                     uint *d_SrcKey, uint *d_SrcVal,
                                     uint *d_LimitsA, uint *d_LimitsB,
                                     uint stride, uint N, uint sortDir) {
  uint lastSegmentElements = N % (2 * stride);
  uint mergePairs = (lastSegmentElements > stride)
                        ? getSampleCount(N)
                        : (N - lastSegmentElements) / SAMPLE_STRIDE;

  if (sortDir) {
    mergeElementaryIntervalsKernel<1U><<<mergePairs, SAMPLE_STRIDE>>>(
        d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride,
        N);
    getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
  } else {
    mergeElementaryIntervalsKernel<0U><<<mergePairs, SAMPLE_STRIDE>>>(
        d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride,
        N);
    getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
  }
}

extern "C" void bitonicSortShared(uint *d_DstKey, uint *d_DstVal,
                                  uint *d_SrcKey, uint *d_SrcVal,
                                  uint batchSize, uint arrayLength,
                                  uint sortDir);

extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey, uint *d_DstVal,
                                                uint *d_SrcKey, uint *d_SrcVal,
                                                uint *d_LimitsA,
                                                uint *d_LimitsB, uint stride,
                                                uint N, uint sortDir);

static uint *d_RanksA, *d_RanksB, *d_LimitsA, *d_LimitsB;
static const uint MAX_SAMPLE_COUNT = 32768;

extern "C" void initMergeSort(void) {
  checkCudaErrors(
      cudaMalloc((void **)&d_RanksA, MAX_SAMPLE_COUNT * sizeof(uint)));
  checkCudaErrors(
      cudaMalloc((void **)&d_RanksB, MAX_SAMPLE_COUNT * sizeof(uint)));
  checkCudaErrors(
      cudaMalloc((void **)&d_LimitsA, MAX_SAMPLE_COUNT * sizeof(uint)));
  checkCudaErrors(
      cudaMalloc((void **)&d_LimitsB, MAX_SAMPLE_COUNT * sizeof(uint)));
}

extern "C" void closeMergeSort(void) {
  checkCudaErrors(cudaFree(d_RanksA));
  checkCudaErrors(cudaFree(d_RanksB));
  checkCudaErrors(cudaFree(d_LimitsB));
  checkCudaErrors(cudaFree(d_LimitsA));
}

extern "C" void mergeSort(uint *d_DstKey, uint *d_DstVal, uint *d_BufKey,
                          uint *d_BufVal, uint *d_SrcKey, uint *d_SrcVal,
                          uint N, uint sortDir) {
  uint stageCount = 0;

  for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
    ;

  uint *ikey, *ival, *okey, *oval;

  if (stageCount & 1) {
    ikey = d_BufKey;
    ival = d_BufVal;
    okey = d_DstKey;
    oval = d_DstVal;
  } else {
    ikey = d_DstKey;
    ival = d_DstVal;
    okey = d_BufKey;
    oval = d_BufVal;
  }
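
  // Ping-pong note (illustrative): stageCount is the number of doubling merge
  // passes to come; its parity chooses the initial input/output buffers so the
  // final pass always writes into d_DstKey.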
 | 
			
		||||
 | 
			
		||||
  assert(N <= (SAMPLE_STRIDE * MAX_SAMPLE_COUNT));
 | 
			
		||||
  assert(N % SHARED_SIZE_LIMIT == 0);
 | 
			
		||||
  mergeSortShared(ikey, ival, d_SrcKey, d_SrcVal, N / SHARED_SIZE_LIMIT,
 | 
			
		||||
                  SHARED_SIZE_LIMIT, sortDir);
 | 
			
		||||
 | 
			
		||||
  for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
 | 
			
		||||
    uint lastSegmentElements = N % (2 * stride);
 | 
			
		||||
 | 
			
		||||
    // Find sample ranks and prepare for limiters merge
 | 
			
		||||
    generateSampleRanks(d_RanksA, d_RanksB, ikey, stride, N, sortDir);
 | 
			
		||||
 | 
			
		||||
    // Merge ranks and indices
 | 
			
		||||
    mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, N);
 | 
			
		||||
 | 
			
		||||
    // Merge elementary intervals
 | 
			
		||||
    mergeElementaryIntervals(okey, oval, ikey, ival, d_LimitsA, d_LimitsB,
 | 
			
		||||
                             stride, N, sortDir);
 | 
			
		||||
 | 
			
		||||
    if (lastSegmentElements <= stride) {
 | 
			
		||||
      // Last merge segment consists of a single array which just needs to be
 | 
			
		||||
      // passed through
 | 
			
		||||
      checkCudaErrors(cudaMemcpy(
 | 
			
		||||
          okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements),
 | 
			
		||||
          lastSegmentElements * sizeof(uint), cudaMemcpyDeviceToDevice));
 | 
			
		||||
      checkCudaErrors(cudaMemcpy(
 | 
			
		||||
          oval + (N - lastSegmentElements), ival + (N - lastSegmentElements),
 | 
			
		||||
          lastSegmentElements * sizeof(uint), cudaMemcpyDeviceToDevice));
 | 
			
		||||
        startSrcA    = d_LimitsA[blockIdx.x];
 | 
			
		||||
        startSrcB    = d_LimitsB[blockIdx.x];
 | 
			
		||||
        uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA;
 | 
			
		||||
        uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB;
 | 
			
		||||
        lenSrcA      = endSrcA - startSrcA;
 | 
			
		||||
        lenSrcB      = endSrcB - startSrcB;
 | 
			
		||||
        startDstA    = startSrcA + startSrcB;
 | 
			
		||||
        startDstB    = startDstA + lenSrcA;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    uint *t;
 | 
			
		||||
    t = ikey;
 | 
			
		||||
    ikey = okey;
 | 
			
		||||
    okey = t;
 | 
			
		||||
    t = ival;
 | 
			
		||||
    ival = oval;
 | 
			
		||||
    oval = t;
 | 
			
		||||
  }
 | 
			
		||||
    // Load main input data
 | 
			
		||||
    cg::sync(cta);
 | 
			
		||||
 | 
			
		||||
    if (threadIdx.x < lenSrcA) {
 | 
			
		||||
        s_key[threadIdx.x + 0] = d_SrcKey[0 + startSrcA + threadIdx.x];
 | 
			
		||||
        s_val[threadIdx.x + 0] = d_SrcVal[0 + startSrcA + threadIdx.x];
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (threadIdx.x < lenSrcB) {
 | 
			
		||||
        s_key[threadIdx.x + SAMPLE_STRIDE] = d_SrcKey[stride + startSrcB + threadIdx.x];
 | 
			
		||||
        s_val[threadIdx.x + SAMPLE_STRIDE] = d_SrcVal[stride + startSrcB + threadIdx.x];
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Merge data in shared memory
 | 
			
		||||
    cg::sync(cta);
 | 
			
		||||
    merge<sortDir>(s_key,
 | 
			
		||||
                   s_val,
 | 
			
		||||
                   s_key + 0,
 | 
			
		||||
                   s_val + 0,
 | 
			
		||||
                   s_key + SAMPLE_STRIDE,
 | 
			
		||||
                   s_val + SAMPLE_STRIDE,
 | 
			
		||||
                   lenSrcA,
 | 
			
		||||
                   SAMPLE_STRIDE,
 | 
			
		||||
                   lenSrcB,
 | 
			
		||||
                   SAMPLE_STRIDE,
 | 
			
		||||
                   cta);
 | 
			
		||||
 | 
			
		||||
    // Store merged data
 | 
			
		||||
    cg::sync(cta);
 | 
			
		||||
 | 
			
		||||
    if (threadIdx.x < lenSrcA) {
 | 
			
		||||
        d_DstKey[startDstA + threadIdx.x] = s_key[threadIdx.x];
 | 
			
		||||
        d_DstVal[startDstA + threadIdx.x] = s_val[threadIdx.x];
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (threadIdx.x < lenSrcB) {
 | 
			
		||||
        d_DstKey[startDstB + threadIdx.x] = s_key[lenSrcA + threadIdx.x];
 | 
			
		||||
        d_DstVal[startDstB + threadIdx.x] = s_val[lenSrcA + threadIdx.x];
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void mergeElementaryIntervals(uint *d_DstKey,
 | 
			
		||||
                                     uint *d_DstVal,
 | 
			
		||||
                                     uint *d_SrcKey,
 | 
			
		||||
                                     uint *d_SrcVal,
 | 
			
		||||
                                     uint *d_LimitsA,
 | 
			
		||||
                                     uint *d_LimitsB,
 | 
			
		||||
                                     uint  stride,
 | 
			
		||||
                                     uint  N,
 | 
			
		||||
                                     uint  sortDir)
 | 
			
		||||
{
 | 
			
		||||
    uint lastSegmentElements = N % (2 * stride);
 | 
			
		||||
    uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
 | 
			
		||||
 | 
			
		||||
    if (sortDir) {
        mergeElementaryIntervalsKernel<1U>
            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
        getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
    }
    else {
        mergeElementaryIntervalsKernel<0U>
            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
        getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
    }
}

extern "C" void bitonicSortShared(uint *d_DstKey,
                                  uint *d_DstVal,
                                  uint *d_SrcKey,
                                  uint *d_SrcVal,
                                  uint  batchSize,
                                  uint  arrayLength,
                                  uint  sortDir);

extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey,
                                                uint *d_DstVal,
                                                uint *d_SrcKey,
                                                uint *d_SrcVal,
                                                uint *d_LimitsA,
                                                uint *d_LimitsB,
                                                uint  stride,
                                                uint  N,
                                                uint  sortDir);

static uint      *d_RanksA, *d_RanksB, *d_LimitsA, *d_LimitsB;
static const uint MAX_SAMPLE_COUNT = 32768;

extern "C" void initMergeSort(void)
{
    checkCudaErrors(cudaMalloc((void **)&d_RanksA, MAX_SAMPLE_COUNT * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_RanksB, MAX_SAMPLE_COUNT * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_LimitsA, MAX_SAMPLE_COUNT * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_LimitsB, MAX_SAMPLE_COUNT * sizeof(uint)));
}

extern "C" void closeMergeSort(void)
{
    checkCudaErrors(cudaFree(d_RanksA));
    checkCudaErrors(cudaFree(d_RanksB));
    checkCudaErrors(cudaFree(d_LimitsB));
    checkCudaErrors(cudaFree(d_LimitsA));
}

extern "C" void mergeSort(uint *d_DstKey,
                          uint *d_DstVal,
                          uint *d_BufKey,
                          uint *d_BufVal,
                          uint *d_SrcKey,
                          uint *d_SrcVal,
                          uint  N,
                          uint  sortDir)
{
    uint stageCount = 0;

    for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
        ;

    uint *ikey, *ival, *okey, *oval;

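    // Choose the ping-pong buffers by the parity of stageCount so that the
    // final swap leaves the sorted result in d_DstKey / d_DstVal.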
    if (stageCount & 1) {
        ikey = d_BufKey;
        ival = d_BufVal;
        okey = d_DstKey;
        oval = d_DstVal;
    }
    else {
        ikey = d_DstKey;
        ival = d_DstVal;
        okey = d_BufKey;
        oval = d_BufVal;
    }

    assert(N <= (SAMPLE_STRIDE * MAX_SAMPLE_COUNT));
    assert(N % SHARED_SIZE_LIMIT == 0);
    mergeSortShared(ikey, ival, d_SrcKey, d_SrcVal, N / SHARED_SIZE_LIMIT, SHARED_SIZE_LIMIT, sortDir);

    for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
        uint lastSegmentElements = N % (2 * stride);

        // Find sample ranks and prepare for limiters merge
        generateSampleRanks(d_RanksA, d_RanksB, ikey, stride, N, sortDir);

        // Merge ranks and indices
        mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, N);

        // Merge elementary intervals
        mergeElementaryIntervals(okey, oval, ikey, ival, d_LimitsA, d_LimitsB, stride, N, sortDir);

        if (lastSegmentElements <= stride) {
            // Last merge segment consists of a single array which just needs to be
            // passed through
            checkCudaErrors(cudaMemcpy(okey + (N - lastSegmentElements),
                                       ikey + (N - lastSegmentElements),
                                       lastSegmentElements * sizeof(uint),
                                       cudaMemcpyDeviceToDevice));
            checkCudaErrors(cudaMemcpy(oval + (N - lastSegmentElements),
                                       ival + (N - lastSegmentElements),
                                       lastSegmentElements * sizeof(uint),
                                       cudaMemcpyDeviceToDevice));
        }

        uint *t;
        t    = ikey;
        ikey = okey;
        okey = t;
        t    = ival;
        ival = oval;
        oval = t;
    }
}

@@ -31,19 +31,17 @@
typedef unsigned int uint;

#define SHARED_SIZE_LIMIT 1024U
#define SAMPLE_STRIDE 128
#define SAMPLE_STRIDE     128

////////////////////////////////////////////////////////////////////////////////
// Extensive sort validation routine
////////////////////////////////////////////////////////////////////////////////
extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize,
                                   uint arrayLength, uint numValues,
                                   uint sortDir);
extern "C" uint
validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint sortDir);

extern "C" void fillValues(uint *val, uint N);

extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey,
                                    uint batchSize, uint arrayLength);
extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength);

////////////////////////////////////////////////////////////////////////////////
// CUDA merge sort
@@ -52,13 +50,11 @@ extern "C" void initMergeSort(void);

extern "C" void closeMergeSort(void);

extern "C" void mergeSort(uint *dstKey, uint *dstVal, uint *bufKey,
                          uint *bufVal, uint *srcKey, uint *srcVal, uint N,
                          uint sortDir);
extern "C" void
mergeSort(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir);

////////////////////////////////////////////////////////////////////////////////
// CPU "emulation"
////////////////////////////////////////////////////////////////////////////////
extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey,
                              uint *bufVal, uint *srcKey, uint *srcVal, uint N,
                              uint sortDir);
extern "C" void
mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir);

@@ -29,329 +29,335 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "mergeSort_common.h"

////////////////////////////////////////////////////////////////////////////////
// Helper functions
////////////////////////////////////////////////////////////////////////////////
static void checkOrder(uint *data, uint N, uint sortDir) {
  if (N <= 1) {
    return;
  }

  for (uint i = 0; i < N - 1; i++)
    if ((sortDir && (data[i] > data[i + 1])) ||
        (!sortDir && (data[i] < data[i + 1]))) {
      fprintf(stderr, "checkOrder() failed!!!\n");
      exit(EXIT_FAILURE);
static void checkOrder(uint *data, uint N, uint sortDir)
{
    if (N <= 1) {
        return;
    }

    for (uint i = 0; i < N - 1; i++)
        if ((sortDir && (data[i] > data[i + 1])) || (!sortDir && (data[i] < data[i + 1]))) {
            fprintf(stderr, "checkOrder() failed!!!\n");
            exit(EXIT_FAILURE);
        }
}

static uint umin(uint a, uint b) { return (a <= b) ? a : b; }

static uint getSampleCount(uint dividend) {
  return ((dividend % SAMPLE_STRIDE) != 0) ? (dividend / SAMPLE_STRIDE + 1)
                                           : (dividend / SAMPLE_STRIDE);
static uint getSampleCount(uint dividend)
{
    return ((dividend % SAMPLE_STRIDE) != 0) ? (dividend / SAMPLE_STRIDE + 1) : (dividend / SAMPLE_STRIDE);
}

static uint nextPowerOfTwo(uint x) {
  --x;
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  return ++x;
static uint nextPowerOfTwo(uint x)
{
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
}
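// Note: nextPowerOfTwo() smears the highest set bit into every lower bit and
// then increments, e.g. 300 -> 511 -> 512; the initial decrement keeps inputs
// that are already powers of two unchanged.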

static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir) {
  if (L == 0) {
    return 0;
  }

  uint pos = 0;

  for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
    uint newPos = umin(pos + stride, L);

    if ((sortDir && (data[newPos - 1] <= val)) ||
        (!sortDir && (data[newPos - 1] >= val))) {
      pos = newPos;
static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir)
{
    if (L == 0) {
        return 0;
    }
  }

  return pos;
    uint pos = 0;

    for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))) {
            pos = newPos;
        }
    }

    return pos;
}

static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir) {
  if (L == 0) {
    return 0;
  }

  uint pos = 0;

  for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
    uint newPos = umin(pos + stride, L);

    if ((sortDir && (data[newPos - 1] < val)) ||
        (!sortDir && (data[newPos - 1] > val))) {
      pos = newPos;
static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir)
{
    if (L == 0) {
        return 0;
    }
  }

  return pos;
    uint pos = 0;

    for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))) {
            pos = newPos;
        }
    }

    return pos;
}
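// The inclusive search counts keys ordered before-or-equal to val, the
// exclusive one counts keys strictly ordered before it; ranking A-elements
// exclusively in B and B-elements inclusively in A keeps the merge stable
// when keys compare equal.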

////////////////////////////////////////////////////////////////////////////////
// Merge step 1: find sample ranks in each segment
////////////////////////////////////////////////////////////////////////////////
static void generateSampleRanks(uint *ranksA, uint *ranksB, uint *srcKey,
                                uint stride, uint N, uint sortDir) {
  uint lastSegmentElements = N % (2 * stride);
  uint sampleCount =
      (lastSegmentElements > stride)
          ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
          : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
static void generateSampleRanks(uint *ranksA, uint *ranksB, uint *srcKey, uint stride, uint N, uint sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint sampleCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

  for (uint pos = 0; pos < sampleCount; pos++) {
    const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
    const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
    for (uint pos = 0; pos < sampleCount; pos++) {
        const uint i           = pos & ((stride / SAMPLE_STRIDE) - 1);
        const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);

    const uint lenA = stride;
    const uint lenB = umin(stride, N - segmentBase - stride);
    const uint nA = stride / SAMPLE_STRIDE;
    const uint nB = getSampleCount(lenB);
        const uint lenA = stride;
        const uint lenB = umin(stride, N - segmentBase - stride);
        const uint nA   = stride / SAMPLE_STRIDE;
        const uint nB   = getSampleCount(lenB);

    if (i < nA) {
      ranksA[(segmentBase + 0) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
      ranksB[(segmentBase + 0) / SAMPLE_STRIDE + i] =
          binarySearchExclusive(srcKey[segmentBase + i * SAMPLE_STRIDE],
                                srcKey + segmentBase + stride, lenB, sortDir);
        if (i < nA) {
            ranksA[(segmentBase + 0) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
            ranksB[(segmentBase + 0) / SAMPLE_STRIDE + i] = binarySearchExclusive(
                srcKey[segmentBase + i * SAMPLE_STRIDE], srcKey + segmentBase + stride, lenB, sortDir);
        }

        if (i < nB) {
            ranksB[(segmentBase + stride) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
            ranksA[(segmentBase + stride) / SAMPLE_STRIDE + i] = binarySearchInclusive(
                srcKey[segmentBase + stride + i * SAMPLE_STRIDE], srcKey + segmentBase, lenA, sortDir);
        }
    }

    if (i < nB) {
      ranksB[(segmentBase + stride) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
      ranksA[(segmentBase + stride) / SAMPLE_STRIDE + i] =
          binarySearchInclusive(
              srcKey[segmentBase + stride + i * SAMPLE_STRIDE],
              srcKey + segmentBase, lenA, sortDir);
    }
  }
}
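// Every SAMPLE_STRIDE-th key of each half is ranked against the other half,
// so step 2 can cut both halves into elementary intervals holding at most
// SAMPLE_STRIDE keys apiece.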

////////////////////////////////////////////////////////////////////////////////
// Merge step 2: merge ranks and indices to derive elementary intervals
////////////////////////////////////////////////////////////////////////////////
static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride,
                                 uint N) {
  uint lastSegmentElements = N % (2 * stride);
  uint sampleCount =
      (lastSegmentElements > stride)
          ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
          : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride, uint N)
{
    uint lastSegmentElements = N % (2 * stride);
    uint sampleCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

  for (uint pos = 0; pos < sampleCount; pos++) {
    const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
    const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
    for (uint pos = 0; pos < sampleCount; pos++) {
        const uint i           = pos & ((stride / SAMPLE_STRIDE) - 1);
        const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);

    const uint lenA = stride;
    const uint lenB = umin(stride, N - segmentBase - stride);
    const uint nA = stride / SAMPLE_STRIDE;
    const uint nB = getSampleCount(lenB);
        const uint lenA = stride;
        const uint lenB = umin(stride, N - segmentBase - stride);
        const uint nA   = stride / SAMPLE_STRIDE;
        const uint nB   = getSampleCount(lenB);

    if (i < nA) {
      uint dstPosA =
          binarySearchExclusive(ranks[(segmentBase + 0) / SAMPLE_STRIDE + i],
                                ranks + (segmentBase + stride) / SAMPLE_STRIDE,
                                nB, 1) +
          i;
      assert(dstPosA < nA + nB);
      limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] =
          ranks[(segmentBase + 0) / SAMPLE_STRIDE + i];
        if (i < nA) {
            uint dstPosA =
                binarySearchExclusive(
                    ranks[(segmentBase + 0) / SAMPLE_STRIDE + i], ranks + (segmentBase + stride) / SAMPLE_STRIDE, nB, 1)
                + i;
            assert(dstPosA < nA + nB);
            limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = ranks[(segmentBase + 0) / SAMPLE_STRIDE + i];
        }

        if (i < nB) {
            uint dstPosA =
                binarySearchInclusive(
                    ranks[(segmentBase + stride) / SAMPLE_STRIDE + i], ranks + (segmentBase + 0) / SAMPLE_STRIDE, nA, 1)
                + i;
            assert(dstPosA < nA + nB);
            limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = ranks[(segmentBase + stride) / SAMPLE_STRIDE + i];
        }
    }

    if (i < nB) {
      uint dstPosA = binarySearchInclusive(
                         ranks[(segmentBase + stride) / SAMPLE_STRIDE + i],
                         ranks + (segmentBase + 0) / SAMPLE_STRIDE, nA, 1) +
                     i;
      assert(dstPosA < nA + nB);
      limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] =
          ranks[(segmentBase + stride) / SAMPLE_STRIDE + i];
    }
  }
}

////////////////////////////////////////////////////////////////////////////////
// Merge step 3: merge elementary intervals (each interval is <= SAMPLE_STRIDE)
////////////////////////////////////////////////////////////////////////////////
static void merge(uint *dstKey, uint *dstVal, uint *srcAKey, uint *srcAVal,
                  uint *srcBKey, uint *srcBVal, uint lenA, uint lenB,
                  uint sortDir) {
  checkOrder(srcAKey, lenA, sortDir);
  checkOrder(srcBKey, lenB, sortDir);
static void merge(uint *dstKey,
                  uint *dstVal,
                  uint *srcAKey,
                  uint *srcAVal,
                  uint *srcBKey,
                  uint *srcBVal,
                  uint  lenA,
                  uint  lenB,
                  uint  sortDir)
{
    checkOrder(srcAKey, lenA, sortDir);
    checkOrder(srcBKey, lenB, sortDir);

  for (uint i = 0; i < lenA; i++) {
    uint dstPos = binarySearchExclusive(srcAKey[i], srcBKey, lenB, sortDir) + i;
    assert(dstPos < lenA + lenB);
    dstKey[dstPos] = srcAKey[i];
    dstVal[dstPos] = srcAVal[i];
  }
    for (uint i = 0; i < lenA; i++) {
        uint dstPos = binarySearchExclusive(srcAKey[i], srcBKey, lenB, sortDir) + i;
        assert(dstPos < lenA + lenB);
        dstKey[dstPos] = srcAKey[i];
        dstVal[dstPos] = srcAVal[i];
    }

  for (uint i = 0; i < lenB; i++) {
    uint dstPos = binarySearchInclusive(srcBKey[i], srcAKey, lenA, sortDir) + i;
    assert(dstPos < lenA + lenB);
    dstKey[dstPos] = srcBKey[i];
    dstVal[dstPos] = srcBVal[i];
  }
    for (uint i = 0; i < lenB; i++) {
        uint dstPos = binarySearchInclusive(srcBKey[i], srcAKey, lenA, sortDir) + i;
        assert(dstPos < lenA + lenB);
        dstKey[dstPos] = srcBKey[i];
        dstVal[dstPos] = srcBVal[i];
    }
}

static void mergeElementaryIntervals(uint *dstKey, uint *dstVal, uint *srcKey,
                                     uint *srcVal, uint *limitsA, uint *limitsB,
                                     uint stride, uint N, uint sortDir) {
  uint lastSegmentElements = N % (2 * stride);
  uint mergePairs = (lastSegmentElements > stride)
                        ? getSampleCount(N)
                        : (N - lastSegmentElements) / SAMPLE_STRIDE;
static void mergeElementaryIntervals(uint *dstKey,
                                     uint *dstVal,
                                     uint *srcKey,
                                     uint *srcVal,
                                     uint *limitsA,
                                     uint *limitsB,
                                     uint  stride,
                                     uint  N,
                                     uint  sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;

  for (uint pos = 0; pos < mergePairs; pos++) {
    uint i = pos & ((2 * stride) / SAMPLE_STRIDE - 1);
    uint segmentBase = (pos - i) * SAMPLE_STRIDE;
    for (uint pos = 0; pos < mergePairs; pos++) {
        uint i           = pos & ((2 * stride) / SAMPLE_STRIDE - 1);
        uint segmentBase = (pos - i) * SAMPLE_STRIDE;

    const uint lenA = stride;
    const uint lenB = umin(stride, N - segmentBase - stride);
    const uint nA = stride / SAMPLE_STRIDE;
    const uint nB = getSampleCount(lenB);
    const uint n = nA + nB;
        const uint lenA = stride;
        const uint lenB = umin(stride, N - segmentBase - stride);
        const uint nA   = stride / SAMPLE_STRIDE;
        const uint nB   = getSampleCount(lenB);
        const uint n    = nA + nB;

    const uint startPosA = limitsA[pos];
    const uint endPosA = (i + 1 < n) ? limitsA[pos + 1] : lenA;
    const uint startPosB = limitsB[pos];
    const uint endPosB = (i + 1 < n) ? limitsB[pos + 1] : lenB;
    const uint startPosDst = startPosA + startPosB;
        const uint startPosA   = limitsA[pos];
        const uint endPosA     = (i + 1 < n) ? limitsA[pos + 1] : lenA;
        const uint startPosB   = limitsB[pos];
        const uint endPosB     = (i + 1 < n) ? limitsB[pos + 1] : lenB;
        const uint startPosDst = startPosA + startPosB;

    assert(startPosA <= endPosA && endPosA <= lenA);
    assert(startPosB <= endPosB && endPosB <= lenB);
    assert((endPosA - startPosA) <= SAMPLE_STRIDE);
    assert((endPosB - startPosB) <= SAMPLE_STRIDE);
        assert(startPosA <= endPosA && endPosA <= lenA);
        assert(startPosB <= endPosB && endPosB <= lenB);
        assert((endPosA - startPosA) <= SAMPLE_STRIDE);
        assert((endPosB - startPosB) <= SAMPLE_STRIDE);

    merge(dstKey + segmentBase + startPosDst,
          dstVal + segmentBase + startPosDst,
          (srcKey + segmentBase + 0) + startPosA,
          (srcVal + segmentBase + 0) + startPosA,
          (srcKey + segmentBase + stride) + startPosB,
          (srcVal + segmentBase + stride) + startPosB, endPosA - startPosA,
          endPosB - startPosB, sortDir);
  }
        merge(dstKey + segmentBase + startPosDst,
              dstVal + segmentBase + startPosDst,
              (srcKey + segmentBase + 0) + startPosA,
              (srcVal + segmentBase + 0) + startPosA,
              (srcKey + segmentBase + stride) + startPosB,
              (srcVal + segmentBase + stride) + startPosB,
              endPosA - startPosA,
              endPosB - startPosB,
              sortDir);
    }
}

////////////////////////////////////////////////////////////////////////////////
// Naive bubble sort
////////////////////////////////////////////////////////////////////////////////
static void bubbleSort(uint *key, uint *val, uint N, uint sortDir) {
  if (N <= 1) {
    return;
  }

  for (uint bottom = 0; bottom < N - 1; bottom++) {
    uint savePos = bottom;
    uint saveKey = key[bottom];

    for (uint i = bottom + 1; i < N; i++)
      if ((sortDir && (key[i] < saveKey)) || (!sortDir && (key[i] > saveKey))) {
        savePos = i;
        saveKey = key[i];
      }

    if (savePos != bottom) {
      uint t;
      t = key[savePos];
      key[savePos] = key[bottom];
      key[bottom] = t;
      t = val[savePos];
      val[savePos] = val[bottom];
      val[bottom] = t;
static void bubbleSort(uint *key, uint *val, uint N, uint sortDir)
{
    if (N <= 1) {
        return;
    }

    for (uint bottom = 0; bottom < N - 1; bottom++) {
        uint savePos = bottom;
        uint saveKey = key[bottom];

        for (uint i = bottom + 1; i < N; i++)
            if ((sortDir && (key[i] < saveKey)) || (!sortDir && (key[i] > saveKey))) {
                savePos = i;
                saveKey = key[i];
            }

        if (savePos != bottom) {
            uint t;
            t            = key[savePos];
            key[savePos] = key[bottom];
            key[bottom]  = t;
            t            = val[savePos];
            val[savePos] = val[bottom];
            val[bottom]  = t;
        }
    }
  }
}
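// Note: despite its name, bubbleSort() is a selection sort: each outer pass
// picks the extremal remaining key and performs a single swap. It is O(N^2),
// but it only ever runs on segments of at most SHARED_SIZE_LIMIT elements.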

////////////////////////////////////////////////////////////////////////////////
// Interface function
////////////////////////////////////////////////////////////////////////////////
extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey,
                              uint *bufVal, uint *srcKey, uint *srcVal, uint N,
                              uint sortDir) {
  uint *ikey, *ival, *okey, *oval;
  uint stageCount = 0;
extern "C" void
mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir)
{
    uint *ikey, *ival, *okey, *oval;
    uint  stageCount = 0;

  for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
    ;
    for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
        ;

  if (stageCount & 1) {
    ikey = bufKey;
    ival = bufVal;
    okey = dstKey;
    oval = dstVal;
  } else {
    ikey = dstKey;
    ival = dstVal;
    okey = bufKey;
    oval = bufVal;
  }

  printf("Bottom-level sort...\n");
  memcpy(ikey, srcKey, N * sizeof(uint));
  memcpy(ival, srcVal, N * sizeof(uint));

  for (uint pos = 0; pos < N; pos += SHARED_SIZE_LIMIT) {
    bubbleSort(ikey + pos, ival + pos, umin(SHARED_SIZE_LIMIT, N - pos),
               sortDir);
  }

  printf("Merge...\n");
  uint *ranksA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
  uint *ranksB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
  uint *limitsA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
  uint *limitsB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
  memset(ranksA, 0xFF, getSampleCount(N) * sizeof(uint));
  memset(ranksB, 0xFF, getSampleCount(N) * sizeof(uint));
  memset(limitsA, 0xFF, getSampleCount(N) * sizeof(uint));
  memset(limitsB, 0xFF, getSampleCount(N) * sizeof(uint));

  for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
    uint lastSegmentElements = N % (2 * stride);

    // Find sample ranks and prepare for limiters merge
    generateSampleRanks(ranksA, ranksB, ikey, stride, N, sortDir);

    // Merge ranks and indices
    mergeRanksAndIndices(limitsA, ranksA, stride, N);
    mergeRanksAndIndices(limitsB, ranksB, stride, N);

    // Merge elementary intervals
    mergeElementaryIntervals(okey, oval, ikey, ival, limitsA, limitsB, stride,
                             N, sortDir);

    if (lastSegmentElements <= stride) {
      // Last merge segment consists of a single array which just needs to be
      // passed through
      memcpy(okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements),
             lastSegmentElements * sizeof(uint));
      memcpy(oval + (N - lastSegmentElements), ival + (N - lastSegmentElements),
             lastSegmentElements * sizeof(uint));
    if (stageCount & 1) {
        ikey = bufKey;
        ival = bufVal;
        okey = dstKey;
        oval = dstVal;
    }
    else {
        ikey = dstKey;
        ival = dstVal;
        okey = bufKey;
        oval = bufVal;
    }

    uint *t;
    t = ikey;
    ikey = okey;
    okey = t;
    t = ival;
    ival = oval;
    oval = t;
  }
    printf("Bottom-level sort...\n");
    memcpy(ikey, srcKey, N * sizeof(uint));
    memcpy(ival, srcVal, N * sizeof(uint));

  free(limitsB);
  free(limitsA);
  free(ranksB);
  free(ranksA);
    for (uint pos = 0; pos < N; pos += SHARED_SIZE_LIMIT) {
        bubbleSort(ikey + pos, ival + pos, umin(SHARED_SIZE_LIMIT, N - pos), sortDir);
    }

    printf("Merge...\n");
    uint *ranksA  = (uint *)malloc(getSampleCount(N) * sizeof(uint));
    uint *ranksB  = (uint *)malloc(getSampleCount(N) * sizeof(uint));
    uint *limitsA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
    uint *limitsB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
    memset(ranksA, 0xFF, getSampleCount(N) * sizeof(uint));
    memset(ranksB, 0xFF, getSampleCount(N) * sizeof(uint));
    memset(limitsA, 0xFF, getSampleCount(N) * sizeof(uint));
    memset(limitsB, 0xFF, getSampleCount(N) * sizeof(uint));

    for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
        uint lastSegmentElements = N % (2 * stride);

        // Find sample ranks and prepare for limiters merge
        generateSampleRanks(ranksA, ranksB, ikey, stride, N, sortDir);

        // Merge ranks and indices
        mergeRanksAndIndices(limitsA, ranksA, stride, N);
        mergeRanksAndIndices(limitsB, ranksB, stride, N);

        // Merge elementary intervals
        mergeElementaryIntervals(okey, oval, ikey, ival, limitsA, limitsB, stride, N, sortDir);

        if (lastSegmentElements <= stride) {
            // Last merge segment consists of a single array which just needs to be
            // passed through
            memcpy(
                okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements), lastSegmentElements * sizeof(uint));
            memcpy(
                oval + (N - lastSegmentElements), ival + (N - lastSegmentElements), lastSegmentElements * sizeof(uint));
        }

        uint *t;
        t    = ikey;
        ikey = okey;
        okey = t;
        t    = ival;
        ival = oval;
        oval = t;
    }

    free(limitsB);
    free(limitsA);
    free(ranksB);
    free(ranksA);
}

@@ -29,104 +29,100 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "mergeSort_common.h"

////////////////////////////////////////////////////////////////////////////////
// Validate sorted keys array (check for integrity and proper order)
////////////////////////////////////////////////////////////////////////////////
extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize,
                                   uint arrayLength, uint numValues,
                                   uint sortDir) {
  uint *srcHist;
  uint *resHist;
extern "C" uint
validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint sortDir)
{
    uint *srcHist;
    uint *resHist;

  if (arrayLength < 2) {
    printf("validateSortedKeys(): arrays too short, exiting...\n");
    return 1;
  }

  printf("...inspecting keys array: ");
  srcHist = (uint *)malloc(numValues * sizeof(uint));
  resHist = (uint *)malloc(numValues * sizeof(uint));

  int flag = 1;

  for (uint j = 0; j < batchSize;
       j++, srcKey += arrayLength, resKey += arrayLength) {
    // Build histograms for keys arrays
    memset(srcHist, 0, numValues * sizeof(uint));
    memset(resHist, 0, numValues * sizeof(uint));

    for (uint i = 0; i < arrayLength; i++) {
      if ((srcKey[i] < numValues) && (resKey[i] < numValues)) {
        srcHist[srcKey[i]]++;
        resHist[resKey[i]]++;
      } else {
        fprintf(
            stderr,
            "***Set %u source/result key arrays are not limited properly***\n",
            j);
        flag = 0;
        goto brk;
      }
    if (arrayLength < 2) {
        printf("validateSortedKeys(): arrays too short, exiting...\n");
        return 1;
    }

    // Compare the histograms
    for (uint i = 0; i < numValues; i++)
      if (srcHist[i] != resHist[i]) {
        fprintf(stderr,
                "***Set %u source/result keys histograms do not match***\n", j);
        flag = 0;
        goto brk;
      }
    printf("...inspecting keys array: ");
    srcHist = (uint *)malloc(numValues * sizeof(uint));
    resHist = (uint *)malloc(numValues * sizeof(uint));

    // Finally check the ordering
    for (uint i = 0; i < arrayLength - 1; i++)
      if ((sortDir && (resKey[i] > resKey[i + 1])) ||
          (!sortDir && (resKey[i] < resKey[i + 1]))) {
        fprintf(stderr,
                "***Set %u result key array is not ordered properly***\n", j);
        flag = 0;
        goto brk;
      }
  }
    int flag = 1;

    for (uint j = 0; j < batchSize; j++, srcKey += arrayLength, resKey += arrayLength) {
        // Build histograms for keys arrays
        memset(srcHist, 0, numValues * sizeof(uint));
        memset(resHist, 0, numValues * sizeof(uint));

        for (uint i = 0; i < arrayLength; i++) {
            if ((srcKey[i] < numValues) && (resKey[i] < numValues)) {
                srcHist[srcKey[i]]++;
                resHist[resKey[i]]++;
            }
            else {
                fprintf(stderr, "***Set %u source/result key arrays are not limited properly***\n", j);
                flag = 0;
                goto brk;
            }
        }

        // Compare the histograms
        for (uint i = 0; i < numValues; i++)
            if (srcHist[i] != resHist[i]) {
                fprintf(stderr, "***Set %u source/result keys histograms do not match***\n", j);
                flag = 0;
                goto brk;
            }

        // Finally check the ordering
        for (uint i = 0; i < arrayLength - 1; i++)
            if ((sortDir && (resKey[i] > resKey[i + 1])) || (!sortDir && (resKey[i] < resKey[i + 1]))) {
                fprintf(stderr, "***Set %u result key array is not ordered properly***\n", j);
                flag = 0;
                goto brk;
            }
    }

brk:
  free(resHist);
  free(srcHist);
    free(resHist);
    free(srcHist);

  if (flag) printf("OK\n");
    if (flag)
        printf("OK\n");

  return flag;
    return flag;
}
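// The histogram comparison proves the result is a permutation of the source
// keys; the ordering pass alone could not detect dropped or duplicated keys.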

////////////////////////////////////////////////////////////////////////////////
// Value validation / stability check routines
////////////////////////////////////////////////////////////////////////////////
extern "C" void fillValues(uint *val, uint N) {
  for (uint i = 0; i < N; i++) val[i] = i;
extern "C" void fillValues(uint *val, uint N)
{
    for (uint i = 0; i < N; i++)
        val[i] = i;
}

extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey,
                                    uint batchSize, uint arrayLength) {
  int correctFlag = 1, stableFlag = 1;
extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength)
{
    int correctFlag = 1, stableFlag = 1;

  printf("...inspecting keys and values array: ");
    printf("...inspecting keys and values array: ");

  for (uint i = 0; i < batchSize;
       i++, resKey += arrayLength, resVal += arrayLength) {
    for (uint j = 0; j < arrayLength; j++) {
      if (resKey[j] != srcKey[resVal[j]]) correctFlag = 0;
    for (uint i = 0; i < batchSize; i++, resKey += arrayLength, resVal += arrayLength) {
        for (uint j = 0; j < arrayLength; j++) {
            if (resKey[j] != srcKey[resVal[j]])
                correctFlag = 0;

      if ((j < arrayLength - 1) && (resKey[j] == resKey[j + 1]) &&
          (resVal[j] > resVal[j + 1]))
        stableFlag = 0;
            if ((j < arrayLength - 1) && (resKey[j] == resKey[j + 1]) && (resVal[j] > resVal[j + 1]))
                stableFlag = 0;
        }
    }
  }

  printf(correctFlag ? "OK\n" : "***corrupted!!!***\n");
  printf(stableFlag ? "...stability property: stable!\n"
                    : "...stability property: NOT stable\n");
    printf(correctFlag ? "OK\n" : "***corrupted!!!***\n");
    printf(stableFlag ? "...stability property: stable!\n" : "...stability property: NOT stable\n");

  return correctFlag;
    return correctFlag;
}

@@ -29,106 +29,105 @@
#include <stdio.h>

// Includes CUDA
#include <cuda_runtime.h>
#include <cuda/barrier>
#include <cooperative_groups.h>
#include <cuda/barrier>
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h>  // includes cuda.h and cuda_runtime_api.h
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h>  // helper functions for CUDA error check
#include <helper_cuda.h> // helper functions for CUDA error check

namespace cg = cooperative_groups;

#if __CUDA_ARCH__ >= 700
template <bool writeSquareRoot>
__device__ void reduceBlockData(
    cuda::barrier<cuda::thread_scope_block> &barrier,
    cg::thread_block_tile<32> &tile32, double &threadSum, double *result) {
  extern __shared__ double tmp[];

#pragma unroll
  for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
    threadSum += tile32.shfl_down(threadSum, offset);
  }
  if (tile32.thread_rank() == 0) {
    tmp[tile32.meta_group_rank()] = threadSum;
  }

  auto token = barrier.arrive();

  barrier.wait(std::move(token));

  // The warp 0 will perform last round of reduction
  if (tile32.meta_group_rank() == 0) {
    double beta = tile32.thread_rank() < tile32.meta_group_size()
                      ? tmp[tile32.thread_rank()]
                      : 0.0;
__device__ void reduceBlockData(cuda::barrier<cuda::thread_scope_block> &barrier,
                                cg::thread_block_tile<32>               &tile32,
                                double                                  &threadSum,
                                double                                  *result)
{
    extern __shared__ double tmp[];

#pragma unroll
    for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
      beta += tile32.shfl_down(beta, offset);
        threadSum += tile32.shfl_down(threadSum, offset);
    }
    if (tile32.thread_rank() == 0) {
        tmp[tile32.meta_group_rank()] = threadSum;
    }

    if (tile32.thread_rank() == 0) {
      if (writeSquareRoot)
        *result = sqrt(beta);
      else
        *result = beta;
    auto token = barrier.arrive();

    barrier.wait(std::move(token));

    // Warp 0 performs the last round of the reduction
    if (tile32.meta_group_rank() == 0) {
        double beta = tile32.thread_rank() < tile32.meta_group_size() ? tmp[tile32.thread_rank()] : 0.0;

#pragma unroll
        for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
            beta += tile32.shfl_down(beta, offset);
        }

        if (tile32.thread_rank() == 0) {
            if (writeSquareRoot)
                *result = sqrt(beta);
            else
                *result = beta;
        }
    }
  }
}
#endif
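// reduceBlockData() reduces within each 32-thread tile via shfl_down, parks
// one partial sum per warp in shared memory, and uses the arrive/wait barrier
// so that warp 0 can safely combine the per-warp partials.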

__global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB,
                                             double *partialResults, int size) {
__global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB, double *partialResults, int size)
{
#if __CUDA_ARCH__ >= 700
#pragma diag_suppress static_var_with_dynamic_init
  cg::thread_block cta = cg::this_thread_block();
  cg::grid_group grid = cg::this_grid();
  ;
  cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
    cg::thread_block cta  = cg::this_thread_block();
    cg::grid_group   grid = cg::this_grid();
    cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);

  __shared__ cuda::barrier<cuda::thread_scope_block> barrier;
    __shared__ cuda::barrier<cuda::thread_scope_block> barrier;

  if (threadIdx.x == 0) {
    init(&barrier, blockDim.x);
  }

  cg::sync(cta);

  double threadSum = 0.0;
  for (int i = grid.thread_rank(); i < size; i += grid.size()) {
    threadSum += (double)(vecA[i] * vecB[i]);
  }

  // Each thread block performs reduction of partial dotProducts and writes to
  // global mem.
  reduceBlockData<false>(barrier, tile32, threadSum,
                         &partialResults[blockIdx.x]);

  cg::sync(grid);

  // One block performs the final summation of partial dot products
  // of all the thread blocks and writes the sqrt of final dot product.
  if (blockIdx.x == 0) {
    threadSum = 0.0;
    for (int i = cta.thread_rank(); i < gridDim.x; i += cta.size()) {
      threadSum += partialResults[i];
    if (threadIdx.x == 0) {
        init(&barrier, blockDim.x);
    }
    reduceBlockData<true>(barrier, tile32, threadSum, &partialResults[0]);
  }

  cg::sync(grid);
    cg::sync(cta);

  const double finalValue = partialResults[0];
    double threadSum = 0.0;
    for (int i = grid.thread_rank(); i < size; i += grid.size()) {
        threadSum += (double)(vecA[i] * vecB[i]);
    }

  // Perform normalization of vecA & vecB.
  for (int i = grid.thread_rank(); i < size; i += grid.size()) {
    vecA[i] = (float)vecA[i] / finalValue;
    vecB[i] = (float)vecB[i] / finalValue;
  }
    // Each thread block performs reduction of partial dotProducts and writes to
    // global mem.
    reduceBlockData<false>(barrier, tile32, threadSum, &partialResults[blockIdx.x]);

    cg::sync(grid);

    // One block performs the final summation of partial dot products
    // of all the thread blocks and writes the sqrt of final dot product.
    if (blockIdx.x == 0) {
        threadSum = 0.0;
        for (int i = cta.thread_rank(); i < gridDim.x; i += cta.size()) {
            threadSum += partialResults[i];
        }
        reduceBlockData<true>(barrier, tile32, threadSum, &partialResults[0]);
    }

    cg::sync(grid);

    const double finalValue = partialResults[0];

    // Perform normalization of vecA & vecB.
    for (int i = grid.thread_rank(); i < size; i += grid.size()) {
        vecA[i] = (float)vecA[i] / finalValue;
        vecB[i] = (float)vecB[i] / finalValue;
    }
#endif
}
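// The kernel synchronizes across the whole grid with cg::sync(grid), which is
// only valid under a cooperative launch; main() checks
// cudaDevAttrCooperativeLaunch for exactly this reason.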

@@ -137,119 +136,113 @@ int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  printf("%s starting...\n", argv[0]);
int main(int argc, char **argv)
{
    printf("%s starting...\n", argv[0]);

  // This will pick the best possible CUDA capable device
  int dev = findCudaDevice(argc, (const char **)argv);
    // This will pick the best possible CUDA capable device
    int dev = findCudaDevice(argc, (const char **)argv);

  int major = 0;
  checkCudaErrors(
      cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
    int major = 0;
    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));

  // Arrive-Wait Barrier require a GPU of Volta (SM7X) architecture or higher.
 | 
			
		||||
  if (major < 7) {
 | 
			
		||||
    printf("simpleAWBarrier requires SM 7.0 or higher.  Exiting...\n");
 | 
			
		||||
    exit(EXIT_WAIVED);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  int supportsCooperativeLaunch = 0;
 | 
			
		||||
  checkCudaErrors(cudaDeviceGetAttribute(&supportsCooperativeLaunch,
 | 
			
		||||
                                         cudaDevAttrCooperativeLaunch, dev));
 | 
			
		||||
 | 
			
		||||
  if (!supportsCooperativeLaunch) {
 | 
			
		||||
    printf(
 | 
			
		||||
        "\nSelected GPU (%d) does not support Cooperative Kernel Launch, "
 | 
			
		||||
        "Waiving the run\n",
 | 
			
		||||
        dev);
 | 
			
		||||
    exit(EXIT_WAIVED);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  int testResult = runNormVecByDotProductAWBarrier(argc, argv, dev);
 | 
			
		||||
 | 
			
		||||
  printf("%s completed, returned %s\n", argv[0], testResult ? "OK" : "ERROR!");
 | 
			
		||||
  exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId) {
 | 
			
		||||
  float *vecA, *d_vecA;
 | 
			
		||||
  float *vecB, *d_vecB;
 | 
			
		||||
  double *d_partialResults;
 | 
			
		||||
  int size = 10000000;
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaMallocHost(&vecA, sizeof(float) * size));
 | 
			
		||||
  checkCudaErrors(cudaMallocHost(&vecB, sizeof(float) * size));
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaMalloc(&d_vecA, sizeof(float) * size));
 | 
			
		||||
  checkCudaErrors(cudaMalloc(&d_vecB, sizeof(float) * size));
 | 
			
		||||
 | 
			
		||||
  float baseVal = 2.0;
 | 
			
		||||
  for (int i = 0; i < size; i++) {
 | 
			
		||||
    vecA[i] = vecB[i] = baseVal;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  cudaStream_t stream;
 | 
			
		||||
  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaMemcpyAsync(d_vecA, vecA, sizeof(float) * size,
 | 
			
		||||
                                  cudaMemcpyHostToDevice, stream));
 | 
			
		||||
  checkCudaErrors(cudaMemcpyAsync(d_vecB, vecB, sizeof(float) * size,
 | 
			
		||||
                                  cudaMemcpyHostToDevice, stream));
 | 
			
		||||
 | 
			
		||||
  // Kernel configuration, where a one-dimensional
 | 
			
		||||
  // grid and one-dimensional blocks are configured.
 | 
			
		||||
  int minGridSize = 0, blockSize = 0;
 | 
			
		||||
  checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(
 | 
			
		||||
      &minGridSize, &blockSize, (void *)normVecByDotProductAWBarrier, 0, size));
 | 
			
		||||
 | 
			
		||||
  int smemSize = ((blockSize / 32) + 1) * sizeof(double);
 | 
			
		||||
 | 
			
		||||
  int numBlocksPerSm = 0;
 | 
			
		||||
  checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
 | 
			
		||||
      &numBlocksPerSm, normVecByDotProductAWBarrier, blockSize, smemSize));
 | 
			
		||||
 | 
			
		||||
  int multiProcessorCount = 0;
 | 
			
		||||
  checkCudaErrors(cudaDeviceGetAttribute(
 | 
			
		||||
      &multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId));
 | 
			
		||||
 | 
			
		||||
  minGridSize = multiProcessorCount * numBlocksPerSm;
 | 
			
		||||
  checkCudaErrors(cudaMalloc(&d_partialResults, minGridSize * sizeof(double)));
 | 
			
		||||
 | 
			
		||||
  printf(
 | 
			
		||||
      "Launching normVecByDotProductAWBarrier kernel with numBlocks = %d "
 | 
			
		||||
      "blockSize = %d\n",
 | 
			
		||||
      minGridSize, blockSize);
 | 
			
		||||
 | 
			
		||||
  dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1);
 | 
			
		||||
 | 
			
		||||
  void *kernelArgs[] = {(void *)&d_vecA, (void *)&d_vecB,
 | 
			
		||||
                        (void *)&d_partialResults, (void *)&size};
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(
 | 
			
		||||
      cudaLaunchCooperativeKernel((void *)normVecByDotProductAWBarrier, dimGrid,
 | 
			
		||||
                                  dimBlock, kernelArgs, smemSize, stream));
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaMemcpyAsync(vecA, d_vecA, sizeof(float) * size,
 | 
			
		||||
                                  cudaMemcpyDeviceToHost, stream));
 | 
			
		||||
  checkCudaErrors(cudaStreamSynchronize(stream));
 | 
			
		||||
 | 
			
		||||
  float expectedResult = (baseVal / sqrt(size * baseVal * baseVal));
 | 
			
		||||
  unsigned int matches = 0;
 | 
			
		||||
  for (int i = 0; i < size; i++) {
 | 
			
		||||
    if ((vecA[i] - expectedResult) > 0.00001) {
 | 
			
		||||
      printf("mismatch at i = %d\n", i);
 | 
			
		||||
      break;
 | 
			
		||||
    } else {
 | 
			
		||||
      matches++;
 | 
			
		||||
    // Arrive-Wait Barrier require a GPU of Volta (SM7X) architecture or higher.
 | 
			
		||||
    if (major < 7) {
 | 
			
		||||
        printf("simpleAWBarrier requires SM 7.0 or higher.  Exiting...\n");
 | 
			
		||||
        exit(EXIT_WAIVED);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  printf("Result = %s\n", matches == size ? "PASSED" : "FAILED");
 | 
			
		||||
  checkCudaErrors(cudaFree(d_vecA));
 | 
			
		||||
  checkCudaErrors(cudaFree(d_vecB));
 | 
			
		||||
  checkCudaErrors(cudaFree(d_partialResults));
 | 
			
		||||
    int supportsCooperativeLaunch = 0;
 | 
			
		||||
    checkCudaErrors(cudaDeviceGetAttribute(&supportsCooperativeLaunch, cudaDevAttrCooperativeLaunch, dev));
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaFreeHost(vecA));
 | 
			
		||||
  checkCudaErrors(cudaFreeHost(vecB));
 | 
			
		||||
  return matches == size;
 | 
			
		||||
    if (!supportsCooperativeLaunch) {
 | 
			
		||||
        printf("\nSelected GPU (%d) does not support Cooperative Kernel Launch, "
 | 
			
		||||
               "Waiving the run\n",
 | 
			
		||||
               dev);
 | 
			
		||||
        exit(EXIT_WAIVED);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    int testResult = runNormVecByDotProductAWBarrier(argc, argv, dev);
 | 
			
		||||
 | 
			
		||||
    printf("%s completed, returned %s\n", argv[0], testResult ? "OK" : "ERROR!");
 | 
			
		||||
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId)
 | 
			
		||||
{
 | 
			
		||||
    float  *vecA, *d_vecA;
 | 
			
		||||
    float  *vecB, *d_vecB;
 | 
			
		||||
    double *d_partialResults;
 | 
			
		||||
    int     size = 10000000;
 | 
			
		||||
 | 
			
		||||
    checkCudaErrors(cudaMallocHost(&vecA, sizeof(float) * size));
 | 
			
		||||
    checkCudaErrors(cudaMallocHost(&vecB, sizeof(float) * size));
 | 
			
		||||
 | 
			
		||||
    checkCudaErrors(cudaMalloc(&d_vecA, sizeof(float) * size));
 | 
			
		||||
    checkCudaErrors(cudaMalloc(&d_vecB, sizeof(float) * size));
 | 
			
		||||
 | 
			
		||||
    float baseVal = 2.0;
 | 
			
		||||
    for (int i = 0; i < size; i++) {
 | 
			
		||||
        vecA[i] = vecB[i] = baseVal;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    cudaStream_t stream;
 | 
			
		||||
    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 | 
			
		||||
 | 
			
		||||
    checkCudaErrors(cudaMemcpyAsync(d_vecA, vecA, sizeof(float) * size, cudaMemcpyHostToDevice, stream));
 | 
			
		||||
    checkCudaErrors(cudaMemcpyAsync(d_vecB, vecB, sizeof(float) * size, cudaMemcpyHostToDevice, stream));
 | 
			
		||||
 | 
			
		||||
    // Kernel configuration, where a one-dimensional
 | 
			
		||||
    // grid and one-dimensional blocks are configured.
 | 
			
		||||
    int minGridSize = 0, blockSize = 0;
 | 
			
		||||
    checkCudaErrors(
 | 
			
		||||
        cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)normVecByDotProductAWBarrier, 0, size));
 | 
			
		||||
 | 
			
		||||
    int smemSize = ((blockSize / 32) + 1) * sizeof(double);
 | 
			
		||||
 | 
			
		||||
    int numBlocksPerSm = 0;
 | 
			
		||||
    checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
 | 
			
		||||
        &numBlocksPerSm, normVecByDotProductAWBarrier, blockSize, smemSize));
 | 
			
		||||
 | 
			
		||||
    int multiProcessorCount = 0;
 | 
			
		||||
    checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId));
 | 
			
		||||
 | 
			
		||||
    minGridSize = multiProcessorCount * numBlocksPerSm;
 | 
			
		||||
    checkCudaErrors(cudaMalloc(&d_partialResults, minGridSize * sizeof(double)));
 | 
			
		||||
 | 
			
		||||
    printf("Launching normVecByDotProductAWBarrier kernel with numBlocks = %d "
 | 
			
		||||
           "blockSize = %d\n",
 | 
			
		||||
           minGridSize,
 | 
			
		||||
           blockSize);
 | 
			
		||||
 | 
			
		||||
    dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1);
 | 
			
		||||
 | 
			
		||||
    void *kernelArgs[] = {(void *)&d_vecA, (void *)&d_vecB, (void *)&d_partialResults, (void *)&size};
 | 
			
		||||
 | 
			
		||||
    checkCudaErrors(cudaLaunchCooperativeKernel(
 | 
			
		||||
        (void *)normVecByDotProductAWBarrier, dimGrid, dimBlock, kernelArgs, smemSize, stream));
 | 
			
		||||
 | 
			
		||||
    checkCudaErrors(cudaMemcpyAsync(vecA, d_vecA, sizeof(float) * size, cudaMemcpyDeviceToHost, stream));
 | 
			
		||||
    checkCudaErrors(cudaStreamSynchronize(stream));
 | 
			
		||||
 | 
			
		||||
    float        expectedResult = (baseVal / sqrt(size * baseVal * baseVal));
 | 
			
		||||
    unsigned int matches        = 0;
 | 
			
		||||
    for (int i = 0; i < size; i++) {
 | 
			
		||||
        if ((vecA[i] - expectedResult) > 0.00001) {
 | 
			
		||||
            printf("mismatch at i = %d\n", i);
 | 
			
		||||
            break;
 | 
			
		||||
        }
 | 
			
		||||
        else {
 | 
			
		||||
            matches++;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    printf("Result = %s\n", matches == size ? "PASSED" : "FAILED");
 | 
			
		||||
    checkCudaErrors(cudaFree(d_vecA));
 | 
			
		||||
    checkCudaErrors(cudaFree(d_vecB));
 | 
			
		||||
    checkCudaErrors(cudaFree(d_partialResults));
 | 
			
		||||
 | 
			
		||||
    checkCudaErrors(cudaFreeHost(vecA));
 | 
			
		||||
    checkCudaErrors(cudaFreeHost(vecB));
 | 
			
		||||
    return matches == size;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
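// Editor's note: the pass criterion in the validation loop above is plain
// algebra. Every element of both vectors starts at baseVal, so the dot product
// is size * baseVal * baseVal, its square root is baseVal * sqrt(size), and each
// normalized element becomes baseVal / (baseVal * sqrt(size)) = 1 / sqrt(size).
// A host-side restatement (function name illustrative):
#include <math.h>

static float expectedNormalizedValue(float baseVal, int size)
{
    // identical to the sample's expectedResult expression
    return baseVal / sqrtf((float)size * baseVal * baseVal);
}
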
@@ -34,17 +34,17 @@
#endif

// Includes, system
#include <cassert>
#include <stdio.h>

// Includes CUDA
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check

const char *sampleName = "simpleAssert";

@@ -58,9 +58,10 @@ bool testResult = true;
//! Tests assert function.
//! Thread whose id > N will print assertion failed error message.
////////////////////////////////////////////////////////////////////////////////
__global__ void testKernel(int N)
{
    int gtid = blockIdx.x * blockDim.x + threadIdx.x;
    assert(gtid < N);
}

////////////////////////////////////////////////////////////////////////////////
@@ -70,59 +71,60 @@ void runTest(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    runTest(argc, argv);

    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

void runTest(int argc, char **argv)
{
    int         Nblocks  = 2;
    int         Nthreads = 32;
    cudaError_t error;

#ifndef _WIN32
    utsname OS_System_Type;
    uname(&OS_System_Type);

    printf("OS_System_Type.release = %s\n", OS_System_Type.release);

    if (!strcasecmp(OS_System_Type.sysname, "Darwin")) {
        printf("simpleAssert is not currently supported on Mac OSX\n\n");
        exit(EXIT_SUCCESS);
    }
    else {
        printf("OS Info: <%s>\n\n", OS_System_Type.version);
    }

#endif

    // This will pick the best possible CUDA capable device
    findCudaDevice(argc, (const char **)argv);

    // Kernel configuration, where a one-dimensional
    // grid and one-dimensional blocks are configured.
    dim3 dimGrid(Nblocks);
    dim3 dimBlock(Nthreads);

    printf("Launch kernel to generate assertion failures\n");
    testKernel<<<dimGrid, dimBlock>>>(60);

    // Synchronize (flushes assert output).
    printf("\n-- Begin assert output\n\n");
    error = cudaDeviceSynchronize();
    printf("\n-- End assert output\n\n");

    // Check for errors and failed asserts in asynchronous kernel launch.
    if (error == cudaErrorAssert) {
        printf("Device assert failed as expected, "
               "CUDA error message is: %s\n\n",
               cudaGetErrorString(error));
    }

    testResult = error == cudaErrorAssert;
}

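// Editor's note: like host assert(), device-side assert() compiles to a no-op
// when NDEBUG is defined, so the cudaErrorAssert expected above only appears in
// builds that keep assertions enabled. A hedged sketch (kernel name illustrative):
#include <cassert>

__global__ void checkedKernel(int N)
{
    // Compiled with `nvcc -DNDEBUG`, this assert vanishes and the kernel
    // completes normally even for out-of-range threads.
    assert(blockIdx.x * blockDim.x + threadIdx.x < N);
}
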
@@ -34,15 +34,16 @@
#endif

// Includes, system
#include <cassert>
#include <stdio.h>

// Includes CUDA
#include <cuda_runtime.h>

#include "nvrtc_helper.h"

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

const char *sampleName = "simpleAssert_nvrtc";

@@ -58,56 +59,63 @@ void runTest(int argc, char **argv);
// Program main
////////////////////////////////////////////////////////////////////////////////

int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    runTest(argc, argv);

    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

void runTest(int argc, char **argv)
{
    int Nblocks  = 2;
    int Nthreads = 32;

    // Kernel configuration, where a one-dimensional
    // grid and one-dimensional blocks are configured.

    dim3 dimGrid(Nblocks);
    dim3 dimBlock(Nthreads);

    printf("Launch kernel to generate assertion failures\n");
    char  *cubin, *kernel_file;
    size_t cubinSize;

    kernel_file = sdkFindFilePath("simpleAssert_kernel.cu", argv[0]);
    compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);

    CUmodule   module = loadCUBIN(cubin, argc, argv);
    CUfunction kernel_addr;

    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "testKernel"));

    int   count  = 60;
    void *args[] = {(void *)&count};

    checkCudaErrors(cuLaunchKernel(kernel_addr,
                                   dimGrid.x,
                                   dimGrid.y,
                                   dimGrid.z, /* grid dim */
                                   dimBlock.x,
                                   dimBlock.y,
                                   dimBlock.z, /* block dim */
                                   0,
                                   0,        /* shared mem, stream */
                                   &args[0], /* arguments */
                                   0));

    // Synchronize (flushes assert output).
    printf("\n-- Begin assert output\n\n");
    CUresult res = cuCtxSynchronize();

    printf("\n-- End assert output\n\n");

    // Check for errors and failed asserts in asynchronous kernel launch.
    if (res == CUDA_ERROR_ASSERT) {
        printf("Device assert failed as expected\n");
    }

    testResult = res == CUDA_ERROR_ASSERT;
}

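// Editor's note: with the driver API, the kernelParams argument of cuLaunchKernel
// is an array of pointers to each parameter's storage, in declaration order, not
// the parameter values themselves. For a hypothetical kernel k(int n, float *p):
//
//     int    n = 60;
//     float *p = nullptr;
//     void  *params[] = {&n, &p}; // address of n, address of the pointer p
//     // cuLaunchKernel(k_addr, gx, gy, gz, bx, by, bz, 0, 0, params, 0);
//
// which is exactly the shape of the args[] array used above.
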
@@ -32,7 +32,8 @@
//! Thread whose id > N will print assertion failed error message.
////////////////////////////////////////////////////////////////////////////////

extern "C" __global__ void testKernel(int N)
{
    int gtid = blockIdx.x * blockDim.x + threadIdx.x;
    assert(gtid < N);
}

@@ -30,10 +30,10 @@
 */

// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN
@@ -45,10 +45,10 @@
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check

// Includes, kernels
#include "simpleAtomicIntrinsics_kernel.cuh"
@@ -68,67 +68,67 @@ extern "C" bool computeGold(int *gpuData, const int len);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    runTest(argc, argv);

    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    cudaStream_t stream;
    // This will pick the best possible CUDA capable device
    findCudaDevice(argc, (const char **)argv);

    StopWatchInterface *timer;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    unsigned int numThreads = 256;
    unsigned int numBlocks  = 64;
    unsigned int numData    = 11;
    unsigned int memSize    = sizeof(int) * numData;

    // allocate mem for the result on host side
    int *hOData;
    checkCudaErrors(cudaMallocHost(&hOData, memSize));

    // initialize the memory
    for (unsigned int i = 0; i < numData; i++)
        hOData[i] = 0;

    // To make the AND and XOR tests generate something other than 0...
    hOData[8] = hOData[10] = 0xff;

    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    // allocate device memory for result
    int *dOData;
    checkCudaErrors(cudaMalloc((void **)&dOData, memSize));
    // copy host memory to device to initialize to zero
    checkCudaErrors(cudaMemcpyAsync(dOData, hOData, memSize, cudaMemcpyHostToDevice, stream));

    // execute the kernel
    testKernel<<<numBlocks, numThreads, 0, stream>>>(dOData);

    // Copy result from device to host
    checkCudaErrors(cudaMemcpyAsync(hOData, dOData, memSize, cudaMemcpyDeviceToHost, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));

    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    // Compute reference solution
    testResult = computeGold(hOData, numThreads * numBlocks);

    // Cleanup memory
    checkCudaErrors(cudaFreeHost(hOData));
    checkCudaErrors(cudaFree(dOData));
}

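// Editor's note: hOData is allocated with cudaMallocHost (pinned memory) so the
// cudaMemcpyAsync calls above can genuinely overlap with the stream; with
// pageable malloc memory the runtime stages the copy and it behaves largely
// synchronously. A minimal sketch of the pattern (names illustrative):
//
//     int *h = nullptr;
//     checkCudaErrors(cudaMallocHost(&h, bytes));                    // pinned
//     checkCudaErrors(cudaMemcpyAsync(d, h, bytes, cudaMemcpyHostToDevice, s));
//     checkCudaErrors(cudaStreamSynchronize(s)); // wait before touching h again
//     checkCudaErrors(cudaFreeHost(h));
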
@@ -42,141 +42,142 @@ extern "C" int computeGold(int *gpuData, const int len);
//! @param idata      input data as provided to device
//! @param len        number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////
int computeGold(int *gpuData, const int len)
{
    int val = 0;

    for (int i = 0; i < len; ++i) {
        val += 10;
    }

    if (val != gpuData[0]) {
        printf("atomicAdd failed\n");
        return false;
    }

    val = 0;

    for (int i = 0; i < len; ++i) {
        val -= 10;
    }

    if (val != gpuData[1]) {
        printf("atomicSub failed\n");
        return false;
    }

    bool found = false;

    for (int i = 0; i < len; ++i) {
        // third element should be a member of [0, len)
        if (i == gpuData[2]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicExch failed\n");
        return false;
    }

    val = -(1 << 8);

    for (int i = 0; i < len; ++i) {
        // fourth element should be len-1
        val = max(val, i);
    }

    if (val != gpuData[3]) {
        printf("atomicMax failed\n");
        return false;
    }

    val = 1 << 8;

    for (int i = 0; i < len; ++i) {
        val = min(val, i);
    }

    if (val != gpuData[4]) {
        printf("atomicMin failed\n");
        return false;
    }

    int limit = 17;
    val       = 0;

    for (int i = 0; i < len; ++i) {
        val = (val >= limit) ? 0 : val + 1;
    }

    if (val != gpuData[5]) {
        printf("atomicInc failed\n");
        return false;
    }

    limit = 137;
    val   = 0;

    for (int i = 0; i < len; ++i) {
        val = ((val == 0) || (val > limit)) ? limit : val - 1;
    }

    if (val != gpuData[6]) {
        printf("atomicDec failed\n");
        return false;
    }

    found = false;

    for (int i = 0; i < len; ++i) {
        // eighth element should be a member of [0, len)
        if (i == gpuData[7]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicCAS failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 9th element should be 1
        val &= (2 * i + 7);
    }

    if (val != gpuData[8]) {
        printf("atomicAnd failed\n");
        return false;
    }

    val = 0;

    for (int i = 0; i < len; ++i) {
        // 10th element should be 0xff
        val |= (1 << i);
    }

    if (val != gpuData[9]) {
        printf("atomicOr failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 11th element should be 0xff
        val ^= i;
    }

    if (val != gpuData[10]) {
        printf("atomicXor failed\n");
        return false;
    }

    return true;
}

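// Editor's note on the atomicCAS membership check above: the device test performs
// atomicCAS(&g_odata[7], tid - 1, tid), which only succeeds for a thread whose
// tid - 1 matches the current value. Threads therefore form a data-dependent,
// order-sensitive chain, and the final value is some tid in [0, len); that is all
// the host reference can (and does) verify with its linear membership scan.
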
@@ -35,48 +35,49 @@
//! @param g_idata  input data in global memory
//! @param g_odata  output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void testKernel(int *g_odata)
{
    // access thread id
    const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // Test various atomic instructions

    // Arithmetic atomic instructions

    // Atomic addition
    atomicAdd(&g_odata[0], 10);

    // Atomic subtraction (final should be 0)
    atomicSub(&g_odata[1], 10);

    // Atomic exchange
    atomicExch(&g_odata[2], tid);

    // Atomic maximum
    atomicMax(&g_odata[3], tid);

    // Atomic minimum
    atomicMin(&g_odata[4], tid);

    // Atomic increment (modulo 17+1)
    atomicInc((unsigned int *)&g_odata[5], 17);

    // Atomic decrement
    atomicDec((unsigned int *)&g_odata[6], 137);

    // Atomic compare-and-swap
    atomicCAS(&g_odata[7], tid - 1, tid);

    // Bitwise atomic instructions

    // Atomic AND
    atomicAnd(&g_odata[8], 2 * tid + 7);

    // Atomic OR
    atomicOr(&g_odata[9], 1 << tid);

    // Atomic XOR
    atomicXor(&g_odata[10], tid);
}

#endif // #ifndef _SIMPLEATOMICS_KERNEL_H_

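// Editor's note: atomicInc and atomicDec are not plain +-1; they wrap at a
// caller-supplied limit, which is what the host reference re-implements. A hedged
// sketch of the semantics (see the CUDA programming guide for the authoritative
// wording; the function name here is illustrative):
__device__ void wrapSemantics(unsigned int *p, unsigned int limit)
{
    // atomicInc: old = *p; *p = (old >= limit) ? 0 : old + 1;
    atomicInc(p, limit);
    // atomicDec: old = *p; *p = (old == 0 || old > limit) ? limit : old - 1;
    atomicDec(p, limit);
}
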
@@ -30,10 +30,10 @@
 */

// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN
@@ -46,7 +46,7 @@
#include <nvrtc_helper.h>

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

const char *sampleName = "simpleAtomicIntrinsics_nvrtc";

@@ -64,84 +64,90 @@ extern "C" bool computeGold(int *gpuData, const int len);
// Program main
////////////////////////////////////////////////////////////////////////////////

int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    runTest(argc, argv);

    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");

    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////

void runTest(int argc, char **argv)
{
    int dev = 0;

    char  *cubin, *kernel_file;
    size_t cubinSize;

    kernel_file = sdkFindFilePath("simpleAtomicIntrinsics_kernel.cuh", argv[0]);
    compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);

    CUmodule   module = loadCUBIN(cubin, argc, argv);
    CUfunction kernel_addr;

    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "testKernel"));

    StopWatchInterface *timer;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    unsigned int numThreads = 256;
    unsigned int numBlocks  = 64;
    unsigned int numData    = 11;
    unsigned int memSize    = sizeof(int) * numData;

    // allocate mem for the result on host side
    int *hOData = (int *)malloc(memSize);

    // initialize the memory
    for (unsigned int i = 0; i < numData; i++)
        hOData[i] = 0;

    // To make the AND and XOR tests generate something other than 0...
    hOData[8] = hOData[10] = 0xff;

    // allocate device memory for result
    CUdeviceptr dOData;
    checkCudaErrors(cuMemAlloc(&dOData, memSize));
    checkCudaErrors(cuMemcpyHtoD(dOData, hOData, memSize));

    // execute the kernel
    dim3 cudaBlockSize(numThreads, 1, 1);
    dim3 cudaGridSize(numBlocks, 1, 1);

    void *arr[] = {(void *)&dOData};
    checkCudaErrors(cuLaunchKernel(kernel_addr,
                                   cudaGridSize.x,
                                   cudaGridSize.y,
                                   cudaGridSize.z, /* grid dim */
                                   cudaBlockSize.x,
                                   cudaBlockSize.y,
                                   cudaBlockSize.z, /* block dim */
                                   0,
                                   0,       /* shared mem, stream */
                                   &arr[0], /* arguments */
                                   0));

    checkCudaErrors(cuCtxSynchronize());

    checkCudaErrors(cuMemcpyDtoH(hOData, dOData, memSize));

    // Copy result from device to host
    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    // Compute reference solution
    testResult = computeGold(hOData, numThreads * numBlocks);

    // Cleanup memory
    free(hOData);
    checkCudaErrors(cuMemFree(dOData));
}

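// Editor's note: unlike the runtime variant of this sample, which pairs
// cudaMemcpyAsync with a stream synchronize, the driver-API copies used above
// (cuMemcpyHtoD / cuMemcpyDtoH) are synchronous with respect to the host, so the
// only explicit wait needed is the cuCtxSynchronize after the kernel launch.
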
@ -43,139 +43,140 @@ extern "C" int computeGold(int *gpuData, const int len);
 | 
			
		||||
//! @param len        number of elements in reference / idata
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
int computeGold(int *gpuData, const int len) {
 | 
			
		||||
  int val = 0;
 | 
			
		||||
int computeGold(int *gpuData, const int len)
 | 
			
		||||
{
 | 
			
		||||
    int val = 0;
 | 
			
		||||
 | 
			
		||||
  for (int i = 0; i < len; ++i) {
 | 
			
		||||
    val += 10;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if (val != gpuData[0]) {
 | 
			
		||||
    printf("atomicAdd failed\n");
 | 
			
		||||
    return false;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  val = 0;
 | 
			
		||||
 | 
			
		||||
  for (int i = 0; i < len; ++i) {
 | 
			
		||||
    val -= 10;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if (val != gpuData[1]) {
 | 
			
		||||
    printf("atomicSub failed\n");
 | 
			
		||||
    return false;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  bool found = false;
 | 
			
		||||
 | 
			
		||||
  for (int i = 0; i < len; ++i) {
 | 
			
		||||
    // third element should be a member of [0, len)
 | 
			
		||||
    if (i == gpuData[2]) {
 | 
			
		||||
      found = true;
 | 
			
		||||
      break;
 | 
			
		||||
    for (int i = 0; i < len; ++i) {
 | 
			
		||||
        val += 10;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if (!found) {
 | 
			
		||||
    printf("atomicExch failed\n");
 | 
			
		||||
    return false;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  val = -(1 << 8);
 | 
			
		||||
 | 
			
		||||
  for (int i = 0; i < len; ++i) {
 | 
			
		||||
    // fourth element should be len-1
 | 
			
		||||
    val = max(val, i);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if (val != gpuData[3]) {
 | 
			
		||||
    printf("atomicMax failed\n");
 | 
			
		||||
    return false;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  val = 1 << 8;
 | 
			
		||||
 | 
			
		||||
  for (int i = 0; i < len; ++i) {
 | 
			
		||||
    val = min(val, i);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if (val != gpuData[4]) {
 | 
			
		||||
    printf("atomicMin failed\n");
 | 
			
		||||
    return false;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
    if (val != gpuData[0]) {
        printf("atomicAdd failed\n");
        return false;
    }

    val = 0;

    for (int i = 0; i < len; ++i) {
        val -= 10;
    }

    if (val != gpuData[1]) {
        printf("atomicSub failed\n");
        return false;
    }

    bool found = false;

    for (int i = 0; i < len; ++i) {
        // third element should be a member of [0, len)
        if (i == gpuData[2]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicExch failed\n");
        return false;
    }

    val = -(1 << 8);

    for (int i = 0; i < len; ++i) {
        // fourth element should be len-1
        val = max(val, i);
    }

    if (val != gpuData[3]) {
        printf("atomicMax failed\n");
        return false;
    }

    val = 1 << 8;

    for (int i = 0; i < len; ++i) {
        val = min(val, i);
    }

    if (val != gpuData[4]) {
        printf("atomicMin failed\n");
        return false;
    }

    int limit = 17;
    val       = 0;

    for (int i = 0; i < len; ++i) {
        val = (val >= limit) ? 0 : val + 1;
    }

    if (val != gpuData[5]) {
        printf("atomicInc failed\n");
        return false;
    }

    limit = 137;
    val   = 0;

    for (int i = 0; i < len; ++i) {
        val = ((val == 0) || (val > limit)) ? limit : val - 1;
    }

    if (val != gpuData[6]) {
        printf("atomicDec failed\n");
        return false;
    }

    found = false;

    for (int i = 0; i < len; ++i) {
        // eighth element should be a member of [0, len)
        if (i == gpuData[7]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicCAS failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 9th element should be 1
        val &= (2 * i + 7);
    }

    if (val != gpuData[8]) {
        printf("atomicAnd failed\n");
        return false;
    }

    val = 0;

    for (int i = 0; i < len; ++i) {
        // 10th element should be 0xff
        val |= (1 << i);
    }

    if (val != gpuData[9]) {
        printf("atomicOr failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 11th element should be 0xff
        val ^= i;
    }

    if (val != gpuData[10]) {
        printf("atomicXor failed\n");
        return false;
    }

    return true;
}
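The two ternary recurrences above mirror the documented update rules of atomicInc and atomicDec. As plain host-side helpers (a reference sketch, not part of this sample):

// Host-side mirrors of the device atomicInc/atomicDec update rules:
//   atomicInc: old >= val ? 0 : old + 1
//   atomicDec: old == 0 || old > val ? val : old - 1
static unsigned int incMod(unsigned int old, unsigned int val) { return (old >= val) ? 0 : old + 1; }

static unsigned int decMod(unsigned int old, unsigned int val) { return (old == 0 || old > val) ? val : old - 1; }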
@@ -36,45 +36,46 @@
//! @param g_odata  output data in global memory
////////////////////////////////////////////////////////////////////////////////

extern "C" __global__ void testKernel(int *g_odata)
{
    // access thread id
    const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // Test various atomic instructions
    // Arithmetic atomic instructions
    // Atomic addition
    atomicAdd(&g_odata[0], 10);

    // Atomic subtraction (final should be 0)
    atomicSub(&g_odata[1], 10);

    // Atomic exchange
    atomicExch(&g_odata[2], tid);

    // Atomic maximum
    atomicMax(&g_odata[3], tid);

    // Atomic minimum
    atomicMin(&g_odata[4], tid);

    // Atomic increment (modulo 17+1)
    atomicInc((unsigned int *)&g_odata[5], 17);

    // Atomic decrement
    atomicDec((unsigned int *)&g_odata[6], 137);

    // Atomic compare-and-swap
    atomicCAS(&g_odata[7], tid - 1, tid);

    // Bitwise atomic instructions
    // Atomic AND
    atomicAnd(&g_odata[8], 2 * tid + 7);

    // Atomic OR
    atomicOr(&g_odata[9], 1 << tid);

    // Atomic XOR
    atomicXor(&g_odata[10], tid);
}

#endif // #ifndef _SIMPLEATOMICS_KERNEL_H_
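For context, a minimal host-side driver sketch for a kernel like this, seeded to match the host reference (computeGold) above; the function name and launch shape are illustrative assumptions, not taken from the diff:

#include <cuda_runtime.h>
#include <helper_cuda.h>

extern "C" __global__ void testKernel(int *g_odata);

// Hypothetical driver: seed the 11 result slots, run the atomics, read back.
void runAtomicsOnce(int len)
{
    const int numValues = 11; // one slot per atomic operation above
    int       h_odata[11];

    for (int i = 0; i < numValues; i++)
        h_odata[i] = 0;
    h_odata[3]  = -(1 << 8); // atomicMax start, as in the reference
    h_odata[4]  = 1 << 8;    // atomicMin start
    h_odata[8]  = 0xff;      // atomicAnd start
    h_odata[10] = 0xff;      // atomicXor start

    int *d_odata = nullptr;
    checkCudaErrors(cudaMalloc(&d_odata, numValues * sizeof(int)));
    checkCudaErrors(cudaMemcpy(d_odata, h_odata, numValues * sizeof(int), cudaMemcpyHostToDevice));
    testKernel<<<1, len>>>(d_odata); // illustrative launch shape
    checkCudaErrors(cudaMemcpy(h_odata, d_odata, numValues * sizeof(int), cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaFree(d_odata));
}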
@@ -26,30 +26,31 @@
 */

// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes CUDA
#include <cuda_runtime.h>

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper functions for SDK examples

////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char **argv);

cudaAccessPolicyWindow initAccessPolicyWindow(void)
{
    cudaAccessPolicyWindow accessPolicyWindow = {0};
    accessPolicyWindow.base_ptr               = (void *)0;
    accessPolicyWindow.num_bytes              = 0;
    accessPolicyWindow.hitRatio               = 0.f;
    accessPolicyWindow.hitProp                = cudaAccessPropertyNormal;
    accessPolicyWindow.missProp               = cudaAccessPropertyStreaming;
    return accessPolicyWindow;
}

////////////////////////////////////////////////////////////////////////////////
@@ -60,35 +61,35 @@ cudaAccessPolicyWindow initAccessPolicyWindow(void) {
//! @param bigDataSize  input bigData size
//! @param hitCount  how many data accesses are done within a block
////////////////////////////////////////////////////////////////////////////////
static __global__ void kernCacheSegmentTest(int *data, int dataSize, int *trash, int bigDataSize, int hitCount)
{
    __shared__ unsigned int hit;
    int                     row    = blockIdx.y * blockDim.y + threadIdx.y;
    int                     col    = blockIdx.x * blockDim.x + threadIdx.x;
    int                     tID    = row * blockDim.y + col;
    uint32_t                psRand = tID;

    atomicExch(&hit, 0);
    __syncthreads();
    while (hit < hitCount) {
        psRand ^= psRand << 13;
        psRand ^= psRand >> 17;
        psRand ^= psRand << 5;

        int idx = tID - psRand;
        if (idx < 0) {
            idx = -idx;
        }

        if ((tID % 2) == 0) {
            data[psRand % dataSize] = data[psRand % dataSize] + data[idx % dataSize];
        }
        else {
            trash[psRand % bigDataSize] = trash[psRand % bigDataSize] + trash[idx % bigDataSize];
        }

        atomicAdd(&hit, 1);
    }
}
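The pseudo-random index above is the classic xorshift32 recurrence; a standalone host-side sketch of the same generator, for reference only:

#include <stdint.h>

// xorshift32: the same three-shift recurrence used in kernCacheSegmentTest.
// Any nonzero seed yields a sequence of period 2^32 - 1.
static uint32_t xorshift32(uint32_t x)
{
    x ^= x << 13;
    x ^= x >> 17;
    x ^= x << 5;
    return x;
}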
////////////////////////////////////////////////////////////////////////////////
// Program main
@@ -98,117 +99,110 @@ int main(int argc, char **argv) { runTest(argc, argv); }
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    bool                   bTestResult = true;
    cudaAccessPolicyWindow accessPolicyWindow;
    cudaDeviceProp         deviceProp;
    cudaStreamAttrValue    streamAttrValue;
    cudaStream_t           stream;
    cudaStreamAttrID       streamAttrID;
    dim3                   threads(32, 32);
    int                   *dataDevicePointer;
    int                   *dataHostPointer;
    int                    dataSize;
    int                   *bigDataDevicePointer;
    int                   *bigDataHostPointer;
    int                    bigDataSize;
    StopWatchInterface    *timer = 0;

    printf("%s Starting...\n\n", argv[0]);

    // use command-line specified CUDA device, otherwise use device with highest
    // Gflops/s
    int devID = findCudaDevice(argc, (const char **)argv);
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    // Get device properties
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
    dim3 blocks(deviceProp.maxGridSize[1], 1);

    // Make sure the device supports the persisting L2 cache optimization
    if (deviceProp.persistingL2CacheMaxSize == 0) {
        printf("Waiving execution as device %d does not support persisting L2 "
               "Caching\n",
               devID);
        exit(EXIT_WAIVED);
    }

    // Create stream to associate with window
    checkCudaErrors(cudaStreamCreate(&stream));

    // Set the amount of L2 cache that will be persisting to the maximum the
    // device can support
    checkCudaErrors(cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, deviceProp.persistingL2CacheMaxSize));

    // Stream attribute to set
    streamAttrID = cudaStreamAttributeAccessPolicyWindow;

    // Default window
    streamAttrValue.accessPolicyWindow = initAccessPolicyWindow();
    accessPolicyWindow                 = initAccessPolicyWindow();

    // Allocate size of both buffers
    bigDataSize = (deviceProp.l2CacheSize * 4) / sizeof(int);
    dataSize    = (deviceProp.l2CacheSize / 4) / sizeof(int);

    // Allocate data
    checkCudaErrors(cudaMallocHost(&dataHostPointer, dataSize * sizeof(int)));
    checkCudaErrors(cudaMallocHost(&bigDataHostPointer, bigDataSize * sizeof(int)));

    for (int i = 0; i < bigDataSize; ++i) {
        if (i < dataSize) {
            dataHostPointer[i] = i;
        }

        bigDataHostPointer[bigDataSize - i - 1] = i;
    }

    checkCudaErrors(cudaMalloc((void **)&dataDevicePointer, dataSize * sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&bigDataDevicePointer, bigDataSize * sizeof(int)));
    checkCudaErrors(
        cudaMemcpyAsync(dataDevicePointer, dataHostPointer, dataSize * sizeof(int), cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(
        bigDataDevicePointer, bigDataHostPointer, bigDataSize * sizeof(int), cudaMemcpyHostToDevice, stream));

    // Make a window for the buffer of interest
    accessPolicyWindow.base_ptr        = (void *)dataDevicePointer;
    accessPolicyWindow.num_bytes       = dataSize * sizeof(int);
    accessPolicyWindow.hitRatio        = 1.f;
    accessPolicyWindow.hitProp         = cudaAccessPropertyPersisting;
    accessPolicyWindow.missProp        = cudaAccessPropertyNormal;
    streamAttrValue.accessPolicyWindow = accessPolicyWindow;

    // Assign window to stream
    checkCudaErrors(cudaStreamSetAttribute(stream, streamAttrID, &streamAttrValue));

    // Demote any previous persisting lines
    checkCudaErrors(cudaCtxResetPersistingL2Cache());

    checkCudaErrors(cudaStreamSynchronize(stream));
    kernCacheSegmentTest<<<blocks, threads, 0, stream>>>(
        dataDevicePointer, dataSize, bigDataDevicePointer, bigDataSize, 0xAFFFF);

    checkCudaErrors(cudaStreamSynchronize(stream));
    // check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

    // Free memory
    checkCudaErrors(cudaFreeHost(dataHostPointer));
    checkCudaErrors(cudaFreeHost(bigDataHostPointer));
    checkCudaErrors(cudaFree(dataDevicePointer));
    checkCudaErrors(cudaFree(bigDataDevicePointer));

    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
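When a window is larger than the L2 set-aside, the usual mitigation is a fractional hitRatio so only part of the accesses within the window are treated as persisting; a minimal sketch (the 0.6 value and function name are illustrative assumptions, not from this sample):

#include <cuda_runtime.h>

// Sketch: build a window whose accesses are only partially persisting, to
// avoid thrashing when num_bytes exceeds the reserved L2 region.
static cudaAccessPolicyWindow makeFractionalWindow(void *ptr, size_t bytes)
{
    cudaAccessPolicyWindow window = {0};
    window.base_ptr  = ptr;
    window.num_bytes = bytes;
    window.hitRatio  = 0.6f; // ~60% of accesses get hitProp; illustrative value
    window.hitProp   = cudaAccessPropertyPersisting;
    window.missProp  = cudaAccessPropertyStreaming;
    return window;
}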
@@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in the [Dependencies]() section above are installed.

## References (for more details)
@@ -35,28 +35,30 @@ __device__ float clamp(float x, float a, float b) { return max(a, min(b, x)); }
__device__ int clamp(int x, int a, int b) { return max(a, min(b, x)); }

// convert floating point rgb color to 8-bit integer
__device__ int rgbToInt(float r, float g, float b)
{
    r = clamp(r, 0.0f, 255.0f);
    g = clamp(g, 0.0f, 255.0f);
    b = clamp(b, 0.0f, 255.0f);
    return (int(b) << 16) | (int(g) << 8) | int(r);
}

__global__ void cudaProcess(unsigned int *g_odata, int imgw)
{
    extern __shared__ uchar4 sdata[];

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int bw = blockDim.x;
    int bh = blockDim.y;
    int x  = blockIdx.x * bw + tx;
    int y  = blockIdx.y * bh + ty;

    uchar4 c4             = make_uchar4((x & 0x20) ? 100 : 0, 0, (y & 0x20) ? 100 : 0, 0);
    g_odata[y * imgw + x] = rgbToInt(c4.z, c4.y, c4.x);
}

extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes, unsigned int *g_odata, int imgw)
{
    cudaProcess<<<grid, block, sbytes>>>(g_odata, imgw);
}
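For reference, the inverse of the 24-bit BGR packing produced by rgbToInt above (a sketch, not part of the sample):

// Unpack the (b << 16) | (g << 8) | r layout written by rgbToInt.
__device__ void intToRgb(int c, float &r, float &g, float &b)
{
    r = float(c & 0xff);
    g = float((c >> 8) & 0xff);
    b = float((c >> 16) & 0xff);
}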
@@ -29,115 +29,124 @@

#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
// Create thread
CUTThread cutStartThread(CUT_THREADROUTINE func, void *data)
{
    return CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, data, 0, NULL);
}

// Wait for thread to finish
void cutEndThread(CUTThread thread)
{
    WaitForSingleObject(thread, INFINITE);
    CloseHandle(thread);
}

// Wait for multiple threads
void cutWaitForThreads(const CUTThread *threads, int num)
{
    WaitForMultipleObjects(num, threads, true, INFINITE);

    for (int i = 0; i < num; i++) {
        CloseHandle(threads[i]);
    }
}

// Create barrier.
CUTBarrier cutCreateBarrier(int releaseCount)
{
    CUTBarrier barrier;

    InitializeCriticalSection(&barrier.criticalSection);
    barrier.barrierEvent = CreateEvent(NULL, TRUE, FALSE, TEXT("BarrierEvent"));
    barrier.count        = 0;
    barrier.releaseCount = releaseCount;

    return barrier;
}

// Increment barrier. (execution continues)
void cutIncrementBarrier(CUTBarrier *barrier)
{
    int myBarrierCount;
    EnterCriticalSection(&barrier->criticalSection);
    myBarrierCount = ++barrier->count;
    LeaveCriticalSection(&barrier->criticalSection);

    if (myBarrierCount >= barrier->releaseCount) {
        SetEvent(barrier->barrierEvent);
    }
}

// Wait for barrier release.
void cutWaitForBarrier(CUTBarrier *barrier) { WaitForSingleObject(barrier->barrierEvent, INFINITE); }

// Destroy barrier
void cutDestroyBarrier(CUTBarrier *barrier) {}

#else
// Create thread
CUTThread cutStartThread(CUT_THREADROUTINE func, void *data)
{
    pthread_t thread;
    pthread_create(&thread, NULL, func, data);
    return thread;
}

// Wait for thread to finish
void cutEndThread(CUTThread thread) { pthread_join(thread, NULL); }

// Wait for multiple threads
void cutWaitForThreads(const CUTThread *threads, int num)
{
    for (int i = 0; i < num; i++) {
        cutEndThread(threads[i]);
    }
}

// Create barrier.
CUTBarrier cutCreateBarrier(int releaseCount)
{
    CUTBarrier barrier;

    barrier.count        = 0;
    barrier.releaseCount = releaseCount;

    pthread_mutex_init(&barrier.mutex, 0);
    pthread_cond_init(&barrier.conditionVariable, 0);

    return barrier;
}

// Increment barrier. (execution continues)
void cutIncrementBarrier(CUTBarrier *barrier)
{
    int myBarrierCount;
    pthread_mutex_lock(&barrier->mutex);
    myBarrierCount = ++barrier->count;
    pthread_mutex_unlock(&barrier->mutex);

    if (myBarrierCount >= barrier->releaseCount) {
        pthread_cond_signal(&barrier->conditionVariable);
    }
}

// Wait for barrier release.
void cutWaitForBarrier(CUTBarrier *barrier)
{
    pthread_mutex_lock(&barrier->mutex);

    while (barrier->count < barrier->releaseCount) {
        pthread_cond_wait(&barrier->conditionVariable, &barrier->mutex);
    }

    pthread_mutex_unlock(&barrier->mutex);
}

// Destroy barrier
void cutDestroyBarrier(CUTBarrier *barrier)
{
    pthread_mutex_destroy(&barrier->mutex);
    pthread_cond_destroy(&barrier->conditionVariable);
}

#endif
@@ -37,15 +37,16 @@
typedef HANDLE CUTThread;
typedef unsigned(WINAPI *CUT_THREADROUTINE)(void *);

struct CUTBarrier
{
    CRITICAL_SECTION criticalSection;
    HANDLE           barrierEvent;
    int              releaseCount;
    int              count;
};

#define CUT_THREADPROC unsigned WINAPI
#define CUT_THREADEND  return 0

#else
// POSIX threads.
@@ -55,44 +56,46 @@ typedef pthread_t CUTThread;
typedef void *(*CUT_THREADROUTINE)(void *);

#define CUT_THREADPROC void *
#define CUT_THREADEND  return 0

struct CUTBarrier
{
    pthread_mutex_t mutex;
    pthread_cond_t  conditionVariable;
    int             releaseCount;
    int             count;
};

#endif

#ifdef __cplusplus
extern "C"
{
#endif

    // Create thread.
    CUTThread cutStartThread(CUT_THREADROUTINE, void *data);

    // Wait for thread to finish.
    void cutEndThread(CUTThread thread);

    // Wait for multiple threads.
    void cutWaitForThreads(const CUTThread *threads, int num);

    // Create barrier.
    CUTBarrier cutCreateBarrier(int releaseCount);

    // Increment barrier. (execution continues)
    void cutIncrementBarrier(CUTBarrier *barrier);

    // Wait for barrier release.
    void cutWaitForBarrier(CUTBarrier *barrier);

    // Destroy barrier
    void cutDestroyBarrier(CUTBarrier *barrier);

#ifdef __cplusplus
} // extern "C"
#endif

#endif // MULTITHREADING_H
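A minimal usage sketch of this barrier API (the worker count and printf payload are illustrative):

#include <stdio.h>
#include "multithreading.h"

static CUTBarrier barrier;

// Each worker signals the barrier once and exits.
static CUT_THREADPROC worker(void *arg)
{
    printf("worker %d done\n", *(int *)arg);
    cutIncrementBarrier(&barrier); // non-blocking signal
    CUT_THREADEND;
}

int main()
{
    const int N = 4;
    int       ids[4];

    barrier = cutCreateBarrier(N);
    for (int i = 0; i < N; ++i) {
        ids[i] = i;
        cutStartThread(worker, &ids[i]);
    }
    cutWaitForBarrier(&barrier); // blocks until all N increments arrive
    cutDestroyBarrier(&barrier);
    return 0;
}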
@@ -43,172 +43,173 @@
#include <stdio.h>

// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>

#include "multithreading.h"

const int N_workloads             = 8;
const int N_elements_per_workload = 100000;

CUTBarrier thread_barrier;

void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status, void *data);

struct heterogeneous_workload
{
    int id;
    int cudaDeviceID;

    int         *h_data;
    int         *d_data;
    cudaStream_t stream;

    bool success;
};

__global__ void incKernel(int *data, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < N)
        data[i]++;
}

CUT_THREADPROC launch(void *void_arg)
{
    heterogeneous_workload *workload = (heterogeneous_workload *)void_arg;

    // Select GPU for this CPU thread
    checkCudaErrors(cudaSetDevice(workload->cudaDeviceID));

    // Allocate Resources
    checkCudaErrors(cudaStreamCreate(&workload->stream));
    checkCudaErrors(cudaMalloc(&workload->d_data, N_elements_per_workload * sizeof(int)));
    checkCudaErrors(cudaHostAlloc(&workload->h_data, N_elements_per_workload * sizeof(int), cudaHostAllocPortable));

    // CPU thread generates data
    for (int i = 0; i < N_elements_per_workload; ++i) {
        workload->h_data[i] = workload->id + i;
    }

    // Schedule work for GPU in CUDA stream without blocking the CPU thread
    // Note: Dedicated streams enable concurrent execution of workloads on the GPU
    dim3 block(512);
    dim3 grid((N_elements_per_workload + block.x - 1) / block.x);

    checkCudaErrors(cudaMemcpyAsync(workload->d_data,
                                    workload->h_data,
                                    N_elements_per_workload * sizeof(int),
                                    cudaMemcpyHostToDevice,
                                    workload->stream));
    incKernel<<<grid, block, 0, workload->stream>>>(workload->d_data, N_elements_per_workload);
    checkCudaErrors(cudaMemcpyAsync(workload->h_data,
                                    workload->d_data,
                                    N_elements_per_workload * sizeof(int),
                                    cudaMemcpyDeviceToHost,
                                    workload->stream));

    // New in CUDA 5.0: Add a CPU callback which is called once all currently
    // pending operations in the CUDA stream have finished
    checkCudaErrors(cudaStreamAddCallback(workload->stream, myStreamCallback, workload, 0));

    CUT_THREADEND;
    // CPU thread end of life, GPU continues to process data...
}

CUT_THREADPROC postprocess(void *void_arg)
{
    heterogeneous_workload *workload = (heterogeneous_workload *)void_arg;
    // ... GPU is done with processing, continue on new CPU thread...

    // Select GPU for this CPU thread
    checkCudaErrors(cudaSetDevice(workload->cudaDeviceID));

    // CPU thread consumes results from GPU, verifying every element
    workload->success = true;

    for (int i = 0; i < N_elements_per_workload; ++i) {
        workload->success &= workload->h_data[i] == i + workload->id + 1;
    }

    // Free Resources
    checkCudaErrors(cudaFree(workload->d_data));
    checkCudaErrors(cudaFreeHost(workload->h_data));
    checkCudaErrors(cudaStreamDestroy(workload->stream));

    // Signal the end of the heterogeneous workload to main thread
    cutIncrementBarrier(&thread_barrier);

    CUT_THREADEND;
}

void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status, void *data)
{
    // Check status of GPU after stream operations are done
    checkCudaErrors(status);

    // Spawn new CPU worker thread and continue processing on the CPU
    cutStartThread(postprocess, data);
}

int main(int argc, char **argv)
{
    int N_gpus, max_gpus = 0;
    int gpuInfo[32]; // assume a maximum of 32 GPUs in a system configuration

    printf("Starting simpleCallback\n");

    checkCudaErrors(cudaGetDeviceCount(&N_gpus));
    printf("Found %d CUDA capable GPUs\n", N_gpus);

    if (N_gpus > 32) {
        printf("simpleCallback only supports a maximum of 32 GPUs\n");
    }

    for (int devid = 0; devid < N_gpus; devid++) {
        int            SMversion;
        cudaDeviceProp deviceProp;
        cudaSetDevice(devid);
        cudaGetDeviceProperties(&deviceProp, devid);
        SMversion = (deviceProp.major << 4) + deviceProp.minor; // e.g. 0x11 == SM 1.1
        printf("GPU[%d] %s supports SM %d.%d", devid, deviceProp.name, deviceProp.major, deviceProp.minor);
        printf(", %s GPU Callback Functions\n", (SMversion >= 0x11) ? "capable" : "NOT capable");

        if (SMversion >= 0x11) {
            gpuInfo[max_gpus++] = devid;
        }
    }

    printf("%d GPUs available to run Callback Functions\n", max_gpus);

    heterogeneous_workload *workloads;
    workloads      = (heterogeneous_workload *)malloc(N_workloads * sizeof(heterogeneous_workload));
    thread_barrier = cutCreateBarrier(N_workloads);

    // Main thread spawns a CPU worker thread for each heterogeneous workload
    printf("Starting %d heterogeneous computing workloads\n", N_workloads);

    for (int i = 0; i < N_workloads; ++i) {
        workloads[i].id           = i;
        workloads[i].cudaDeviceID = gpuInfo[i % max_gpus]; // i % N_gpus;

        cutStartThread(launch, &workloads[i]);
    }

    // Sleep until all workloads have finished
    cutWaitForBarrier(&thread_barrier);
    printf("Total of %d workloads finished:\n", N_workloads);

    bool success = true;

    for (int i = 0; i < N_workloads; ++i) {
        success &= workloads[i].success;
    }

    printf("%s\n", success ? "Success" : "Failure");

    free(workloads);

    exit(success ? EXIT_SUCCESS : EXIT_FAILURE);
}
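Aside: on current toolkits cudaStreamAddCallback is deprecated in favor of cudaLaunchHostFunc; a minimal sketch of the equivalent enqueue (the host-function variant takes no status argument, so errors must be checked separately):

// Sketch: host-function variant of the callback enqueue used in launch().
// myHostFunc is a hypothetical name; postprocess/workload come from the sample.
static void CUDART_CB myHostFunc(void *data)
{
    cutStartThread(postprocess, data); // same hand-off as myStreamCallback
}

// ... inside launch(), after the async copies:
// checkCudaErrors(cudaLaunchHostFunc(workload->stream, myHostFunc, workload));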
@@ -38,8 +38,8 @@
 *
 */

#include <cooperative_groups.h>
#include <stdio.h>

using namespace cooperative_groups;
@@ -49,35 +49,36 @@ using namespace cooperative_groups;
 * calculates the sum of val across the group g. The workspace array, x,
 * must be large enough to contain g.size() integers.
 */
__device__ int sumReduction(thread_group g, int *x, int val)
{
    // rank of this thread in the group
    int lane = g.thread_rank();

    // for each iteration of this loop, the number of threads active in the
    // reduction, i, is halved, and each active thread (with index [lane])
    // performs a single summation of its own value with that
    // of a "partner" (with index [lane+i]).
    for (int i = g.size() / 2; i > 0; i /= 2) {
        // store value for this thread in temporary array
        x[lane] = val;

        // synchronize all threads in group
        g.sync();

        if (lane < i)
            // active threads perform summation of their value with
            // their partner's value
            val += x[lane + i];

        // synchronize all threads in group
        g.sync();
    }

    // master thread in group returns result, and others return -1.
    if (g.thread_rank() == 0)
        return val;
    else
        return -1;
}
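For tiles of warp size or smaller, the shared-memory workspace can be avoided entirely with shuffle intrinsics; a minimal sketch under that assumption (not part of this sample):

// Sketch: tile-level sum via register shuffles instead of shared memory.
// Valid for thread_block_tile<N> with N <= 32; every lane returns the sum.
template <unsigned int N>
__device__ int sumReductionShfl(cooperative_groups::thread_block_tile<N> tile, int val)
{
    for (int i = tile.size() / 2; i > 0; i /= 2) {
        val += tile.shfl_down(val, i); // pull the partner's value from lane+i
    }
    return tile.shfl(val, 0); // broadcast lane 0's total to all lanes
}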
/**
@@ -85,93 +86,92 @@ __device__ int sumReduction(thread_group g, int *x, int val) {
 *
 * Creates cooperative groups and performs reductions
 */
__global__ void cgkernel()
{
    // threadBlockGroup includes all threads in the block
    thread_block threadBlockGroup     = this_thread_block();
    int          threadBlockGroupSize = threadBlockGroup.size();

    // workspace array in shared memory required for reduction
    extern __shared__ int workspace[];

    int input, output, expectedOutput;

    // input to reduction, for each thread, is its rank in the group
    input = threadBlockGroup.thread_rank();

    // expected output from analytical formula (n-1)(n)/2
    // (noting that indexing starts at 0 rather than 1)
    expectedOutput = (threadBlockGroupSize - 1) * threadBlockGroupSize / 2;

    // perform reduction
    output = sumReduction(threadBlockGroup, workspace, input);

    // master thread in group prints out result
    if (threadBlockGroup.thread_rank() == 0) {
        printf(" Sum of all ranks 0..%d in threadBlockGroup is %d (expected %d)\n\n",
               (int)threadBlockGroup.size() - 1,
               output,
               expectedOutput);

        printf(" Now creating %d groups, each of size 16 threads:\n\n", (int)threadBlockGroup.size() / 16);
    }

    threadBlockGroup.sync();

    // each tiledPartition16 group includes 16 threads
    thread_block_tile<16> tiledPartition16 = tiled_partition<16>(threadBlockGroup);

    // This offset allows each group to have its own unique area in the workspace
    // array
    int workspaceOffset = threadBlockGroup.thread_rank() - tiledPartition16.thread_rank();
 | 
			
		||||
    // This offset allows each group to have its own unique area in the workspace
 | 
			
		||||
    // array
 | 
			
		||||
    int workspaceOffset = threadBlockGroup.thread_rank() - tiledPartition16.thread_rank();
 | 
			
		||||
 | 
			
		||||
  // input to reduction, for each thread, is its' rank in the group
 | 
			
		||||
  input = tiledPartition16.thread_rank();
 | 
			
		||||
    // input to reduction, for each thread, is its' rank in the group
 | 
			
		||||
    input = tiledPartition16.thread_rank();
 | 
			
		||||
 | 
			
		||||
  // expected output from analytical formula (n-1)(n)/2
 | 
			
		||||
  // (noting that indexing starts at 0 rather than 1)
 | 
			
		||||
  expectedOutput = 15 * 16 / 2;
 | 
			
		||||
    // expected output from analytical formula (n-1)(n)/2
 | 
			
		||||
    // (noting that indexing starts at 0 rather than 1)
 | 
			
		||||
    expectedOutput = 15 * 16 / 2;
 | 
			
		||||
 | 
			
		||||
  // Perform reduction
 | 
			
		||||
  output = sumReduction(tiledPartition16, workspace + workspaceOffset, input);
 | 
			
		||||
    // Perform reduction
 | 
			
		||||
    output = sumReduction(tiledPartition16, workspace + workspaceOffset, input);
 | 
			
		||||
 | 
			
		||||
  // each master thread prints out result
 | 
			
		||||
  if (tiledPartition16.thread_rank() == 0)
 | 
			
		||||
    printf(
 | 
			
		||||
        "   Sum of all ranks 0..15 in this tiledPartition16 group is %d "
 | 
			
		||||
        "(expected %d)\n",
 | 
			
		||||
        output, expectedOutput);
 | 
			
		||||
    // each master thread prints out result
 | 
			
		||||
    if (tiledPartition16.thread_rank() == 0)
 | 
			
		||||
        printf("   Sum of all ranks 0..15 in this tiledPartition16 group is %d "
 | 
			
		||||
               "(expected %d)\n",
 | 
			
		||||
               output,
 | 
			
		||||
               expectedOutput);
 | 
			
		||||
 | 
			
		||||
  return;
 | 
			
		||||
    return;
 | 
			
		||||
}
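
// Worked numbers for the (n-1)(n)/2 formula used above, assuming the
// 64-thread launch in main() below:
//   whole block (n = 64): expectedOutput = 63 * 64 / 2 = 2016 (ranks 0..63)
//   one 16-wide tile:     expectedOutput = 15 * 16 / 2 = 120  (ranks 0..15)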

/**
 * Host main routine
 */
int main() {
  // Error code to check return values for CUDA calls
  cudaError_t err;
int main()
{
    // Error code to check return values for CUDA calls
    cudaError_t err;

  // Launch the kernel
    // Launch the kernel

  int blocksPerGrid = 1;
  int threadsPerBlock = 64;
    int blocksPerGrid   = 1;
    int threadsPerBlock = 64;

  printf("\nLaunching a single block with %d threads...\n\n", threadsPerBlock);
    printf("\nLaunching a single block with %d threads...\n\n", threadsPerBlock);

  // we use the optional third argument to specify the size
  // of shared memory required in the kernel
  cgkernel<<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(int)>>>();
  err = cudaDeviceSynchronize();
    // we use the optional third argument to specify the size
    // of shared memory required in the kernel
    cgkernel<<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(int)>>>();
    err = cudaDeviceSynchronize();

  if (err != cudaSuccess) {
    fprintf(stderr, "Failed to launch kernel (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to launch kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

  printf("\n...Done.\n\n");
    printf("\n...Done.\n\n");

  return 0;
    return 0;
}
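
// The launch above pairs the optional third <<<>>> argument with the kernel's
// "extern __shared__ int workspace[]". A minimal sketch of the same pairing
// (illustrative names and values only):
//
//   __global__ void k() { extern __shared__ int buf[]; /* blockDim.x ints */ }
//   int n = 64;
//   k<<<1, n, n * sizeof(int)>>>();  // 64 threads, 64 * 4 = 256 bytes shared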

@@ -26,27 +26,27 @@
 */

/*
* This sample demonstrates how to use texture fetches from layered 2D textures
* in CUDA C
*
* This sample first generates a 3D input data array for the layered texture
* and the expected output. Then it starts CUDA C kernels, one for each layer,
* which fetch their layer's texture data (using normalized texture coordinates),
* transform it to the expected output, and write it to a 3D output data array.
*/
 * This sample demonstrates how to use texture fetches from layered 2D textures
 * in CUDA C
 *
 * This sample first generates a 3D input data array for the layered texture
 * and the expected output. Then it starts CUDA C kernels, one for each layer,
 * which fetch their layer's texture data (using normalized texture coordinates),
 * transform it to the expected output, and write it to a 3D output data array.
 */

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes CUDA
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>
#include <helper_functions.h>

static const char *sSDKname = "simpleCubemapTexture";

@@ -56,213 +56,207 @@ static const char *sSDKname = "simpleCubemapTexture";
//! Transform a cubemap face of a linear buffer using cubemap texture lookups
//! @param g_odata  output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void transformKernel(float *g_odata, int width,
                                cudaTextureObject_t tex) {
  // calculate this thread's data point
  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
__global__ void transformKernel(float *g_odata, int width, cudaTextureObject_t tex)
{
    // calculate this thread's data point
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

  // 0.5f offset and division are necessary to access the original data points
  // in the texture (such that bilinear interpolation will not be activated).
  // For details, see also CUDA Programming Guide, Appendix D
    // 0.5f offset and division are necessary to access the original data points
    // in the texture (such that bilinear interpolation will not be activated).
    // For details, see also CUDA Programming Guide, Appendix D

  float u = ((x + 0.5f) / (float)width) * 2.f - 1.f;
  float v = ((y + 0.5f) / (float)width) * 2.f - 1.f;
    float u = ((x + 0.5f) / (float)width) * 2.f - 1.f;
    float v = ((y + 0.5f) / (float)width) * 2.f - 1.f;

  float cx, cy, cz;
    float cx, cy, cz;

  for (unsigned int face = 0; face < 6; face++) {
    // Layer 0 is positive X face
    if (face == 0) {
      cx = 1;
      cy = -v;
      cz = -u;
    }
    // Layer 1 is negative X face
    else if (face == 1) {
      cx = -1;
      cy = -v;
      cz = u;
    }
    // Layer 2 is positive Y face
    else if (face == 2) {
      cx = u;
      cy = 1;
      cz = v;
    }
    // Layer 3 is negative Y face
    else if (face == 3) {
      cx = u;
      cy = -1;
      cz = -v;
    }
    // Layer 4 is positive Z face
    else if (face == 4) {
      cx = u;
      cy = -v;
      cz = 1;
    }
    // Layer 5 is negative Z face
    else if (face == 5) {
      cx = -u;
      cy = -v;
      cz = -1;
    }
    for (unsigned int face = 0; face < 6; face++) {
        // Layer 0 is positive X face
        if (face == 0) {
            cx = 1;
            cy = -v;
            cz = -u;
        }
        // Layer 1 is negative X face
        else if (face == 1) {
            cx = -1;
            cy = -v;
            cz = u;
        }
        // Layer 2 is positive Y face
        else if (face == 2) {
            cx = u;
            cy = 1;
            cz = v;
        }
        // Layer 3 is negative Y face
        else if (face == 3) {
            cx = u;
            cy = -1;
            cz = -v;
        }
        // Layer 4 is positive Z face
        else if (face == 4) {
            cx = u;
            cy = -v;
            cz = 1;
        }
        // Layer 5 is negative Z face
        else if (face == 5) {
            cx = -u;
            cy = -v;
            cz = -1;
        }

    // read from texture, do expected transformation and write to global memory
    g_odata[face * width * width + y * width + x] =
        -texCubemap<float>(tex, cx, cy, cz);
  }
        // read from texture, do expected transformation and write to global memory
        g_odata[face * width * width + y * width + x] = -texCubemap<float>(tex, cx, cy, cz);
    }
}
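
// Face table implied by the branches above (cx, cy, cz per face index):
//   face 0: (+1, -v, -u)   face 1: (-1, -v, +u)
//   face 2: (+u, +1, +v)   face 3: (+u, -1, -v)
//   face 4: (+u, -v, +1)   face 5: (-u, -v, -1)
// texCubemap() selects the face from whichever coordinate has the largest
// magnitude, so pinning one component to +/-1 picks the face while (u, v)
// sweep across that face's texels.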

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  // use command-line specified CUDA device, otherwise use device with highest
  // Gflops/s
  int devID = findCudaDevice(argc, (const char **)argv);
int main(int argc, char **argv)
{
    // use command-line specified CUDA device, otherwise use device with highest
    // Gflops/s
    int devID = findCudaDevice(argc, (const char **)argv);

  bool bResult = true;
    bool bResult = true;

  // get number of SMs on this GPU
  cudaDeviceProp deviceProps;
    // get number of SMs on this GPU
    cudaDeviceProp deviceProps;

  checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
  printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name,
         deviceProps.multiProcessorCount);
  printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount);
    printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);

  if (deviceProps.major < 2) {
    printf(
        "%s requires SM 2.0 or higher for support of Texture Arrays.  Test "
        "will exit... \n",
        sSDKname);
    if (deviceProps.major < 2) {
        printf("%s requires SM 2.0 or higher for support of Texture Arrays.  Test "
               "will exit... \n",
               sSDKname);

    exit(EXIT_WAIVED);
  }

  // generate input data for layered texture
  unsigned int width = 64, num_faces = 6, num_layers = 1;
  unsigned int cubemap_size = width * width * num_faces;
  unsigned int size = cubemap_size * num_layers * sizeof(float);
  float *h_data = (float *)malloc(size);

  for (int i = 0; i < (int)(cubemap_size * num_layers); i++) {
    h_data[i] = (float)i;
  }

  // this is the expected transformation of the input data (the expected output)
  float *h_data_ref = (float *)malloc(size);

  for (unsigned int layer = 0; layer < num_layers; layer++) {
    for (int i = 0; i < (int)(cubemap_size); i++) {
      h_data_ref[layer * cubemap_size + i] =
          -h_data[layer * cubemap_size + i] + layer;
        exit(EXIT_WAIVED);
    }
  }

  // allocate device memory for result
  float *d_data = NULL;
  checkCudaErrors(cudaMalloc((void **)&d_data, size));
    // generate input data for layered texture
    unsigned int width = 64, num_faces = 6, num_layers = 1;
    unsigned int cubemap_size = width * width * num_faces;
    unsigned int size         = cubemap_size * num_layers * sizeof(float);
    float       *h_data       = (float *)malloc(size);

  // allocate array and copy image data
  cudaChannelFormatDesc channelDesc =
      cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
  cudaArray *cu_3darray;
  //    checkCudaErrors(cudaMalloc3DArray( &cu_3darray, &channelDesc,
  //    make_cudaExtent(width, height, num_layers), cudaArrayLayered ));
  checkCudaErrors(cudaMalloc3DArray(&cu_3darray, &channelDesc,
                                    make_cudaExtent(width, width, num_faces),
                                    cudaArrayCubemap));
  cudaMemcpy3DParms myparms = {0};
  myparms.srcPos = make_cudaPos(0, 0, 0);
  myparms.dstPos = make_cudaPos(0, 0, 0);
  myparms.srcPtr =
      make_cudaPitchedPtr(h_data, width * sizeof(float), width, width);
  myparms.dstArray = cu_3darray;
  myparms.extent = make_cudaExtent(width, width, num_faces);
  myparms.kind = cudaMemcpyHostToDevice;
  checkCudaErrors(cudaMemcpy3D(&myparms));
    for (int i = 0; i < (int)(cubemap_size * num_layers); i++) {
        h_data[i] = (float)i;
    }

  cudaTextureObject_t tex;
  cudaResourceDesc texRes;
  memset(&texRes, 0, sizeof(cudaResourceDesc));
    // this is the expected transformation of the input data (the expected output)
    float *h_data_ref = (float *)malloc(size);

  texRes.resType = cudaResourceTypeArray;
  texRes.res.array.array = cu_3darray;
    for (unsigned int layer = 0; layer < num_layers; layer++) {
        for (int i = 0; i < (int)(cubemap_size); i++) {
            h_data_ref[layer * cubemap_size + i] = -h_data[layer * cubemap_size + i] + layer;
        }
    }

  cudaTextureDesc texDescr;
  memset(&texDescr, 0, sizeof(cudaTextureDesc));
    // allocate device memory for result
    float *d_data = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_data, size));

  texDescr.normalizedCoords = true;
  texDescr.filterMode = cudaFilterModeLinear;
  texDescr.addressMode[0] = cudaAddressModeWrap;
  texDescr.addressMode[1] = cudaAddressModeWrap;
  texDescr.addressMode[2] = cudaAddressModeWrap;
  texDescr.readMode = cudaReadModeElementType;
    // allocate array and copy image data
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray            *cu_3darray;
    //    checkCudaErrors(cudaMalloc3DArray( &cu_3darray, &channelDesc,
    //    make_cudaExtent(width, height, num_layers), cudaArrayLayered ));
    checkCudaErrors(
        cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, width, num_faces), cudaArrayCubemap));
    cudaMemcpy3DParms myparms = {0};
    myparms.srcPos            = make_cudaPos(0, 0, 0);
    myparms.dstPos            = make_cudaPos(0, 0, 0);
    myparms.srcPtr            = make_cudaPitchedPtr(h_data, width * sizeof(float), width, width);
    myparms.dstArray          = cu_3darray;
    myparms.extent            = make_cudaExtent(width, width, num_faces);
    myparms.kind              = cudaMemcpyHostToDevice;
    checkCudaErrors(cudaMemcpy3D(&myparms));

  checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
    cudaTextureObject_t tex;
    cudaResourceDesc    texRes;
    memset(&texRes, 0, sizeof(cudaResourceDesc));

  dim3 dimBlock(8, 8, 1);
  dim3 dimGrid(width / dimBlock.x, width / dimBlock.y, 1);
    texRes.resType         = cudaResourceTypeArray;
    texRes.res.array.array = cu_3darray;

  printf(
      "Covering Cubemap data array of %d~3 x %d: Grid size is %d x %d, each "
      "block has 8 x 8 threads\n",
      width, num_layers, dimGrid.x, dimGrid.y);
    cudaTextureDesc texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

  transformKernel<<<dimGrid, dimBlock>>>(d_data, width,
                                         tex);  // warmup (for better timing)
    texDescr.normalizedCoords = true;
    texDescr.filterMode       = cudaFilterModeLinear;
    texDescr.addressMode[0]   = cudaAddressModeWrap;
    texDescr.addressMode[1]   = cudaAddressModeWrap;
    texDescr.addressMode[2]   = cudaAddressModeWrap;
    texDescr.readMode         = cudaReadModeElementType;

  // check if kernel execution generated an error
  getLastCudaError("warmup Kernel execution failed");
    checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));

  checkCudaErrors(cudaDeviceSynchronize());
    dim3 dimBlock(8, 8, 1);
    dim3 dimGrid(width / dimBlock.x, width / dimBlock.y, 1);

  StopWatchInterface *timer = NULL;
  sdkCreateTimer(&timer);
  sdkStartTimer(&timer);
    printf("Covering Cubemap data array of %d~3 x %d: Grid size is %d x %d, each "
           "block has 8 x 8 threads\n",
           width,
           num_layers,
           dimGrid.x,
           dimGrid.y);

  // execute the kernel
  transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, tex);
    transformKernel<<<dimGrid, dimBlock>>>(d_data, width,
                                           tex); // warmup (for better timing)

  // check if kernel execution generated an error
  getLastCudaError("Kernel execution failed");
    // check if kernel execution generated an error
    getLastCudaError("warmup Kernel execution failed");

  checkCudaErrors(cudaDeviceSynchronize());
  sdkStopTimer(&timer);
  printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
  printf("%.2f Mtexlookups/sec\n",
         (cubemap_size / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
  sdkDeleteTimer(&timer);
    checkCudaErrors(cudaDeviceSynchronize());

  // allocate mem for the result on host side
  float *h_odata = (float *)malloc(size);
  // copy result from device to host
  checkCudaErrors(cudaMemcpy(h_odata, d_data, size, cudaMemcpyDeviceToHost));
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

  // write regression file if necessary
  if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
    // write file for regression test
    sdkWriteFile<float>("./data/regression.dat", h_odata, width * width, 0.0f,
                        false);
  } else {
    printf("Comparing kernel output to expected data\n");
    // execute the kernel
    transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, tex);

    // check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timer);
    printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
    printf("%.2f Mtexlookups/sec\n", (cubemap_size / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
    sdkDeleteTimer(&timer);

    // allocate mem for the result on host side
    float *h_odata = (float *)malloc(size);
    // copy result from device to host
    checkCudaErrors(cudaMemcpy(h_odata, d_data, size, cudaMemcpyDeviceToHost));

    // write regression file if necessary
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
        // write file for regression test
        sdkWriteFile<float>("./data/regression.dat", h_odata, width * width, 0.0f, false);
    }
    else {
        printf("Comparing kernel output to expected data\n");

#define MIN_EPSILON_ERROR 5e-3f
    bResult =
        compareData(h_odata, h_data_ref, cubemap_size, MIN_EPSILON_ERROR, 0.0f);
  }
        bResult = compareData(h_odata, h_data_ref, cubemap_size, MIN_EPSILON_ERROR, 0.0f);
    }

  // cleanup memory
  free(h_data);
  free(h_data_ref);
  free(h_odata);
    // cleanup memory
    free(h_data);
    free(h_data_ref);
    free(h_odata);

  checkCudaErrors(cudaDestroyTextureObject(tex));
  checkCudaErrors(cudaFree(d_data));
  checkCudaErrors(cudaFreeArray(cu_3darray));
    checkCudaErrors(cudaDestroyTextureObject(tex));
    checkCudaErrors(cudaFree(d_data));
    checkCudaErrors(cudaFreeArray(cu_3darray));

  exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
    exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
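
// Worked numbers for the 0.5f texel-center offset used by transformKernel,
// assuming width = 64:
//   x = 0:  u = ((0 + 0.5) / 64) * 2 - 1  = -0.984375 (center of texel 0)
//   x = 63: u = ((63 + 0.5) / 64) * 2 - 1 = +0.984375 (center of texel 63)
// Sampling exactly at texel centers keeps cudaFilterModeLinear from blending
// neighboring texels, so the kernel reads back the unfiltered values.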

@@ -33,12 +33,12 @@
 */

// Includes
#include <cstring>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <stdio.h>
#include <string.h>
#include <cstring>
#include <iostream>

// includes, project
#include <helper_cuda.h>
@@ -62,165 +62,165 @@ float *d_B;
float *d_C;

// Functions
int CleanupNoFailure(CUcontext &cuContext);
int  CleanupNoFailure(CUcontext &cuContext);
void RandomInit(float *, int);
bool findModulePath(const char *, string &, char **, ostringstream &);

static void check(CUresult result, char const *const func,
                  const char *const file, int const line) {
  if (result) {
    fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line,
            static_cast<unsigned int>(result), func);
    exit(EXIT_FAILURE);
  }
static void check(CUresult result, char const *const func, const char *const file, int const line)
{
    if (result) {
        fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line, static_cast<unsigned int>(result), func);
        exit(EXIT_FAILURE);
    }
}

#define checkCudaDrvErrors(val) check((val), #val, __FILE__, __LINE__)

// Host code
int main(int argc, char **argv) {
  printf("simpleDrvRuntime..\n");
  int N = 50000, devID = 0;
  size_t size = N * sizeof(float);
  CUdevice cuDevice;
  CUfunction vecAdd_kernel;
  CUmodule cuModule = 0;
  CUcontext cuContext;
int main(int argc, char **argv)
{
    printf("simpleDrvRuntime..\n");
    int        N = 50000, devID = 0;
    size_t     size = N * sizeof(float);
    CUdevice   cuDevice;
    CUfunction vecAdd_kernel;
    CUmodule   cuModule = 0;
    CUcontext  cuContext;

  // Initialize
  checkCudaDrvErrors(cuInit(0));
    // Initialize
    checkCudaDrvErrors(cuInit(0));

  cuDevice = findCudaDevice(argc, (const char **)argv);
  // Create context
  checkCudaDrvErrors(cuCtxCreate(&cuContext, 0, cuDevice));
    cuDevice = findCudaDevice(argc, (const char **)argv);
    // Create context
    checkCudaDrvErrors(cuCtxCreate(&cuContext, 0, cuDevice));

  // first search for the module path before we load the results
  string module_path;
  ostringstream fatbin;
    // first search for the module path before we load the results
    string        module_path;
    ostringstream fatbin;

  if (!findModulePath(FATBIN_FILE, module_path, argv, fatbin)) {
    exit(EXIT_FAILURE);
  } else {
    printf("> initCUDA loading module: <%s>\n", module_path.c_str());
  }

  if (!fatbin.str().size()) {
    printf("fatbin file empty. exiting..\n");
    exit(EXIT_FAILURE);
  }

  // Create module from binary file (FATBIN)
  checkCudaDrvErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

  // Get function handle from module
  checkCudaDrvErrors(
      cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));

  // Allocate input vectors h_A and h_B in host memory
  checkCudaErrors(cudaMallocHost(&h_A, size));
  checkCudaErrors(cudaMallocHost(&h_B, size));
  checkCudaErrors(cudaMallocHost(&h_C, size));

  // Initialize input vectors
  RandomInit(h_A, N);
  RandomInit(h_B, N);

  // Allocate vectors in device memory
  checkCudaErrors(cudaMalloc((void **)(&d_A), size));
  checkCudaErrors(cudaMalloc((void **)(&d_B), size));
  checkCudaErrors(cudaMalloc((void **)(&d_C), size));

  cudaStream_t stream;
  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  // Copy vectors from host memory to device memory
  checkCudaErrors(
      cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, stream));
  checkCudaErrors(
      cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice, stream));

  int threadsPerBlock = 256;
  int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

  void *args[] = {&d_A, &d_B, &d_C, &N};

  // Launch the CUDA kernel
  checkCudaDrvErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1,
                                    threadsPerBlock, 1, 1, 0, stream, args,
                                    NULL));

  // Copy result from device memory to host memory
  // h_C contains the result in host memory
  checkCudaErrors(
      cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));
  // Verify result
  int i;

  for (i = 0; i < N; ++i) {
    float sum = h_A[i] + h_B[i];

    if (fabs(h_C[i] - sum) > 1e-7f) {
      break;
    if (!findModulePath(FATBIN_FILE, module_path, argv, fatbin)) {
        exit(EXIT_FAILURE);
    }
    else {
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }
  }

  checkCudaDrvErrors(cuModuleUnload(cuModule));
  CleanupNoFailure(cuContext);
  printf("%s\n", (i == N) ? "Result = PASS" : "Result = FAIL");
    if (!fatbin.str().size()) {
        printf("fatbin file empty. exiting..\n");
        exit(EXIT_FAILURE);
    }

  exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE);
    // Create module from binary file (FATBIN)
    checkCudaDrvErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

    // Get function handle from module
    checkCudaDrvErrors(cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));

    // Allocate input vectors h_A and h_B in host memory
    checkCudaErrors(cudaMallocHost(&h_A, size));
    checkCudaErrors(cudaMallocHost(&h_B, size));
    checkCudaErrors(cudaMallocHost(&h_C, size));

    // Initialize input vectors
    RandomInit(h_A, N);
    RandomInit(h_B, N);

    // Allocate vectors in device memory
    checkCudaErrors(cudaMalloc((void **)(&d_A), size));
    checkCudaErrors(cudaMalloc((void **)(&d_B), size));
    checkCudaErrors(cudaMalloc((void **)(&d_C), size));

    cudaStream_t stream;
    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    // Copy vectors from host memory to device memory
    checkCudaErrors(cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice, stream));

    int threadsPerBlock = 256;
    int blocksPerGrid   = (N + threadsPerBlock - 1) / threadsPerBlock;

    void *args[] = {&d_A, &d_B, &d_C, &N};

    // Launch the CUDA kernel
    checkCudaDrvErrors(
        cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, stream, args, NULL));

    // Copy result from device memory to host memory
    // h_C contains the result in host memory
    checkCudaErrors(cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));
    // Verify result
    int i;

    for (i = 0; i < N; ++i) {
        float sum = h_A[i] + h_B[i];

        if (fabs(h_C[i] - sum) > 1e-7f) {
            break;
        }
    }

    checkCudaDrvErrors(cuModuleUnload(cuModule));
    CleanupNoFailure(cuContext);
    printf("%s\n", (i == N) ? "Result = PASS" : "Result = FAIL");

    exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE);
}
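
// Two details of the launch above, spelled out:
//
// 1) Grid sizing uses ceiling division; with the sample's values,
//    blocksPerGrid = (50000 + 256 - 1) / 256 = 196 blocks (196 * 256 = 50176
//    threads, so the kernel's i < N guard masks off the final 176 threads).
//
// 2) cuLaunchKernel's kernelParams argument is an array of *addresses* of the
//    kernel arguments, not of their values; the driver copies each argument
//    through the given pointer. A minimal sketch of the same convention
//    (illustrative names):
//
//      float *devA, *devB, *devC;  // device pointers
//      int    n = 50000;
//      void *params[] = {&devA, &devB, &devC, &n};
//      cuLaunchKernel(vecAdd_kernel, 196, 1, 1, 256, 1, 1,
//                     0 /* shared mem */, stream, params, NULL);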

int CleanupNoFailure(CUcontext &cuContext) {
  // Free device memory
  checkCudaErrors(cudaFree(d_A));
  checkCudaErrors(cudaFree(d_B));
  checkCudaErrors(cudaFree(d_C));
int CleanupNoFailure(CUcontext &cuContext)
{
    // Free device memory
    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_B));
    checkCudaErrors(cudaFree(d_C));

  // Free host memory
  if (h_A) {
    checkCudaErrors(cudaFreeHost(h_A));
  }
    // Free host memory
    if (h_A) {
        checkCudaErrors(cudaFreeHost(h_A));
    }

  if (h_B) {
    checkCudaErrors(cudaFreeHost(h_B));
  }
    if (h_B) {
        checkCudaErrors(cudaFreeHost(h_B));
    }

  if (h_C) {
    checkCudaErrors(cudaFreeHost(h_C));
  }
    if (h_C) {
        checkCudaErrors(cudaFreeHost(h_C));
    }

  checkCudaDrvErrors(cuCtxDestroy(cuContext));
    checkCudaDrvErrors(cuCtxDestroy(cuContext));

  return EXIT_SUCCESS;
    return EXIT_SUCCESS;
}
// Allocates an array with random float entries.
void RandomInit(float *data, int n) {
  for (int i = 0; i < n; ++i) {
    data[i] = rand() / (float)RAND_MAX;
  }
}

bool inline findModulePath(const char *module_file, string &module_path,
                           char **argv, ostringstream &ostrm) {
  char *actual_path = sdkFindFilePath(module_file, argv[0]);

  if (actual_path) {
    module_path = actual_path;
  } else {
    printf("> findModulePath file not found: <%s> \n", module_file);
    return false;
  }

  if (module_path.empty()) {
    printf("> findModulePath could not find file: <%s> \n", module_file);
    return false;
  } else {
    printf("> findModulePath found file at <%s>\n", module_path.c_str());
    if (module_path.rfind("fatbin") != string::npos) {
      ifstream fileIn(module_path.c_str(), ios::binary);
      ostrm << fileIn.rdbuf();
void RandomInit(float *data, int n)
{
    for (int i = 0; i < n; ++i) {
        data[i] = rand() / (float)RAND_MAX;
    }
}

bool inline findModulePath(const char *module_file, string &module_path, char **argv, ostringstream &ostrm)
{
    char *actual_path = sdkFindFilePath(module_file, argv[0]);

    if (actual_path) {
        module_path = actual_path;
    }
    else {
        printf("> findModulePath file not found: <%s> \n", module_file);
        return false;
    }

    if (module_path.empty()) {
        printf("> findModulePath could not find file: <%s> \n", module_file);
        return false;
    }
    else {
        printf("> findModulePath found file at <%s>\n", module_path.c_str());
        if (module_path.rfind("fatbin") != string::npos) {
            ifstream fileIn(module_path.c_str(), ios::binary);
            ostrm << fileIn.rdbuf();
        }
        return true;
    }
    return true;
  }
}

@@ -34,9 +34,10 @@
 */

// Device code
extern "C" __global__ void VecAdd_kernel(const float *A, const float *B,
                                         float *C, int N) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, float *C, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

  if (i < N) C[i] = A[i] + B[i];
    if (i < N)
        C[i] = A[i] + B[i];
}
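
// The extern "C" above matters for the driver-API lookup shown earlier:
// cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel") searches the
// module for the literal symbol name, and C linkage keeps the compiler from
// C++-mangling it. Without extern "C" the lookup would need the mangled name,
// e.g. something like "_Z13VecAdd_kernelPKfS0_Pfi" (illustrative mangling).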

@@ -44,188 +44,188 @@ const char *sSDKsample = "hyperQ";

// This subroutine does no real work but runs for at least the specified number
// of clock ticks.
__device__ void clock_block(clock_t *d_o, clock_t clock_count) {
  unsigned int start_clock = (unsigned int)clock();
__device__ void clock_block(clock_t *d_o, clock_t clock_count)
{
    unsigned int start_clock = (unsigned int)clock();

  clock_t clock_offset = 0;
    clock_t clock_offset = 0;

  while (clock_offset < clock_count) {
    unsigned int end_clock = (unsigned int)clock();
    while (clock_offset < clock_count) {
        unsigned int end_clock = (unsigned int)clock();

    // The code below should work like
    // this (thanks to modular arithmetic):
    //
    // clock_offset = (clock_t) (end_clock > start_clock ?
    //                           end_clock - start_clock :
    //                           end_clock + (0xffffffffu - start_clock));
    //
    // Indeed, let m = 2^32 then
    // end - start = end + m - start (mod m).
        // The code below should work like
        // this (thanks to modular arithmetic):
        //
        // clock_offset = (clock_t) (end_clock > start_clock ?
        //                           end_clock - start_clock :
        //                           end_clock + (0xffffffffu - start_clock));
        //
        // Indeed, let m = 2^32 then
        // end - start = end + m - start (mod m).

    clock_offset = (clock_t)(end_clock - start_clock);
  }
        clock_offset = (clock_t)(end_clock - start_clock);
    }

  d_o[0] = clock_offset;
    d_o[0] = clock_offset;
}
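
// A worked wraparound case for the unsigned subtraction above (assuming the
// 32-bit clock register wraps once between the two reads):
//   start_clock = 0xFFFFFFF0, end_clock = 0x00000010
//   end_clock - start_clock = 0x10 - 0xFFFFFFF0 (mod 2^32) = 0x20 = 32 ticks
// which is exactly the elapsed count, so no explicit wrap test is needed.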

// We create two identical kernels calling clock_block(), we create two so that
// we can identify dependencies in the profile timeline ("kernel_B" is always
// dependent on "kernel_A" in the same stream).
__global__ void kernel_A(clock_t *d_o, clock_t clock_count) {
  clock_block(d_o, clock_count);
}
__global__ void kernel_B(clock_t *d_o, clock_t clock_count) {
  clock_block(d_o, clock_count);
}
__global__ void kernel_A(clock_t *d_o, clock_t clock_count) { clock_block(d_o, clock_count); }
__global__ void kernel_B(clock_t *d_o, clock_t clock_count) { clock_block(d_o, clock_count); }

// Single-warp reduction kernel (note: kept simple rather than optimized)
__global__ void sum(clock_t *d_clocks, int N) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  __shared__ clock_t s_clocks[32];
__global__ void sum(clock_t *d_clocks, int N)
{
    // Handle to thread block group
    cg::thread_block   cta = cg::this_thread_block();
    __shared__ clock_t s_clocks[32];

  clock_t my_sum = 0;
    clock_t my_sum = 0;

  for (int i = threadIdx.x; i < N; i += blockDim.x) {
    my_sum += d_clocks[i];
  }

  s_clocks[threadIdx.x] = my_sum;
  cg::sync(cta);

  for (int i = warpSize / 2; i > 0; i /= 2) {
    if (threadIdx.x < i) {
      s_clocks[threadIdx.x] += s_clocks[threadIdx.x + i];
    for (int i = threadIdx.x; i < N; i += blockDim.x) {
        my_sum += d_clocks[i];
    }

    s_clocks[threadIdx.x] = my_sum;
    cg::sync(cta);
  }

  if (threadIdx.x == 0) {
    d_clocks[0] = s_clocks[0];
  }
}
    for (int i = warpSize / 2; i > 0; i /= 2) {
        if (threadIdx.x < i) {
            s_clocks[threadIdx.x] += s_clocks[threadIdx.x + i];
        }

int main(int argc, char **argv) {
  int nstreams = 32;       // One stream for each pair of kernels
  float kernel_time = 10;  // Time each kernel should run in ms
  float elapsed_time;
  int cuda_device = 0;

  printf("starting %s...\n", sSDKsample);

  // Get number of streams (if overridden on the command line)
  if (checkCmdLineFlag(argc, (const char **)argv, "nstreams")) {
    nstreams = getCmdLineArgumentInt(argc, (const char **)argv, "nstreams");
  }

  // Use command-line specified CUDA device, otherwise use device with
  // highest Gflops/s
  cuda_device = findCudaDevice(argc, (const char **)argv);

  // Get device properties
  cudaDeviceProp deviceProp;
  checkCudaErrors(cudaGetDevice(&cuda_device));
  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));

  // HyperQ is available in devices of Compute Capability 3.5 and higher
  if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5)) {
    if (deviceProp.concurrentKernels == 0) {
      printf(
          "> GPU does not support concurrent kernel execution (SM 3.5 or "
          "higher required)\n");
      printf("  CUDA kernel runs will be serialized\n");
    } else {
      printf("> GPU does not support HyperQ\n");
      printf("  CUDA kernel runs will have limited concurrency\n");
        cg::sync(cta);
    }
  }

  printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
         deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);

  // Allocate host memory for the output (reduced to a single value)
  clock_t *a = 0;
  checkCudaErrors(cudaMallocHost((void **)&a, sizeof(clock_t)));

  // Allocate device memory for the output (one value for each kernel)
  clock_t *d_a = 0;
  checkCudaErrors(cudaMalloc((void **)&d_a, 2 * nstreams * sizeof(clock_t)));

  // Allocate and initialize an array of stream handles
  cudaStream_t *streams =
      (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));

  for (int i = 0; i < nstreams; i++) {
    checkCudaErrors(cudaStreamCreate(&(streams[i])));
  }

  // Create CUDA event handles
  cudaEvent_t start_event, stop_event;
  checkCudaErrors(cudaEventCreate(&start_event));
  checkCudaErrors(cudaEventCreate(&stop_event));

  // Target time per kernel is kernel_time ms, clockRate is in KHz
  // Target number of clocks = target time * clock frequency
#if defined(__arm__) || defined(__aarch64__)
  // the kernel takes more time than the channel reset time on arm archs, so to
  // prevent hangs reduce time_clocks.
  clock_t time_clocks = (clock_t)(kernel_time * (deviceProp.clockRate / 100));
#else
  clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate);
#endif
  clock_t total_clocks = 0;

  // Start the clock
  checkCudaErrors(cudaEventRecord(start_event, 0));

  // Queue pairs of {kernel_A, kernel_B} in separate streams
  for (int i = 0; i < nstreams; ++i) {
    kernel_A<<<1, 1, 0, streams[i]>>>(&d_a[2 * i], time_clocks);
    total_clocks += time_clocks;
    kernel_B<<<1, 1, 0, streams[i]>>>(&d_a[2 * i + 1], time_clocks);
    total_clocks += time_clocks;
  }

  // Stop the clock in stream 0 (i.e. all previous kernels will be complete)
  checkCudaErrors(cudaEventRecord(stop_event, 0));

  // At this point the CPU has dispatched all work for the GPU and can
  // continue processing other tasks in parallel. In this sample we just want
  // to wait until all work is done so we use a blocking cudaMemcpy below.

  // Run the sum kernel and copy the result back to host
  sum<<<1, 32>>>(d_a, 2 * nstreams);
  checkCudaErrors(cudaMemcpy(a, d_a, sizeof(clock_t), cudaMemcpyDeviceToHost));

  // stop_event will have been recorded but including the synchronize here to
  // prevent copy/paste errors!
  checkCudaErrors(cudaEventSynchronize(stop_event));
  checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));

  printf(
      "Expected time for serial execution of %d sets of kernels is between "
      "approx. %.3fs and %.3fs\n",
      nstreams, (nstreams + 1) * kernel_time / 1000.0f,
      2 * nstreams * kernel_time / 1000.0f);
  printf(
      "Expected time for fully concurrent execution of %d sets of kernels is "
      "approx. %.3fs\n",
      nstreams, 2 * kernel_time / 1000.0f);
  printf("Measured time for sample = %.3fs\n", elapsed_time / 1000.0f);

  bool bTestResult = (a[0] >= total_clocks);

  // Release resources
  for (int i = 0; i < nstreams; i++) {
    cudaStreamDestroy(streams[i]);
  }

  free(streams);
  cudaEventDestroy(start_event);
  cudaEventDestroy(stop_event);
  cudaFreeHost(a);
  cudaFree(d_a);

  exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
    if (threadIdx.x == 0) {
        d_clocks[0] = s_clocks[0];
    }
}
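
// Shape of the reduction above, assuming the sum<<<1, 32>>> launch used by
// this sample (one warp, warpSize == 32):
//   pass 1: i = 16 -> s_clocks[0..15] += s_clocks[16..31]
//   pass 2: i = 8  -> s_clocks[0..7]  += s_clocks[8..15]
//   ...
//   pass 5: i = 1  -> s_clocks[0]     += s_clocks[1]
// After five passes s_clocks[0] holds the total, which thread 0 writes out.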
 | 
			
		||||
 | 
			
		||||
int main(int argc, char **argv)
 | 
			
		||||
{
 | 
			
		||||
    int   nstreams    = 32; // One stream for each pair of kernels
 | 
			
		||||
    float kernel_time = 10; // Time each kernel should run in ms
 | 
			
		||||
    float elapsed_time;
 | 
			
		||||
    int   cuda_device = 0;
 | 
			
		||||
 | 
			
		||||
    printf("starting %s...\n", sSDKsample);
 | 
			
		||||
 | 
			
		||||
    // Get number of streams (if overridden on the command line)
 | 
			
		||||
    if (checkCmdLineFlag(argc, (const char **)argv, "nstreams")) {
 | 
			
		||||
        nstreams = getCmdLineArgumentInt(argc, (const char **)argv, "nstreams");
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Use command-line specified CUDA device, otherwise use device with
 | 
			
		||||
    // highest Gflops/s
 | 
			
		||||
    cuda_device = findCudaDevice(argc, (const char **)argv);
 | 
			
		||||
 | 
			
		||||
    // Get device properties
 | 
			
		||||
    cudaDeviceProp deviceProp;
 | 
			
		||||
    checkCudaErrors(cudaGetDevice(&cuda_device));
 | 
			
		||||
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
 | 
			
		||||
 | 
			
		||||
    // HyperQ is available in devices of Compute Capability 3.5 and higher
 | 
			
		||||
    if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5)) {
 | 
			
		||||
        if (deviceProp.concurrentKernels == 0) {
 | 
			
		||||
            printf("> GPU does not support concurrent kernel execution (SM 3.5 or "
 | 
			
		||||
                   "higher required)\n");
 | 
			
		||||
            printf("  CUDA kernel runs will be serialized\n");
 | 
			
		||||
        }
 | 
			
		||||
        else {
 | 
			
		||||
            printf("> GPU does not support HyperQ\n");
 | 
			
		||||
            printf("  CUDA kernel runs will have limited concurrency\n");
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
 | 
			
		||||
           deviceProp.major,
 | 
			
		||||
           deviceProp.minor,
 | 
			
		||||
           deviceProp.multiProcessorCount);
 | 
			
		||||
 | 
			
		||||
    // Allocate host memory for the output (reduced to a single value)
 | 
			
		||||
    clock_t *a = 0;
 | 
			
		||||
    checkCudaErrors(cudaMallocHost((void **)&a, sizeof(clock_t)));
 | 
			
		||||
 | 
			
		||||
    // Allocate device memory for the output (one value for each kernel)
 | 
			
		||||
    clock_t *d_a = 0;
 | 
			
		||||
    checkCudaErrors(cudaMalloc((void **)&d_a, 2 * nstreams * sizeof(clock_t)));
 | 
			
		||||
 | 
			
		||||
    // Allocate and initialize an array of stream handles
 | 
			
		||||
    cudaStream_t *streams = (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));
 | 
			
		||||
 | 
			
		||||
    for (int i = 0; i < nstreams; i++) {
 | 
			
		||||
        checkCudaErrors(cudaStreamCreate(&(streams[i])));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Create CUDA event handles
 | 
			
		||||
    cudaEvent_t start_event, stop_event;
 | 
			
		||||
    checkCudaErrors(cudaEventCreate(&start_event));
 | 
			
		||||
    checkCudaErrors(cudaEventCreate(&stop_event));
 | 
			
		||||
 | 
			
		||||
    // Target time per kernel is kernel_time ms, clockRate is in KHz
 | 
			
		||||
    // Target number of clocks = target time * clock frequency
 | 
			
		||||
#if defined(__arm__) || defined(__aarch64__)
 | 
			
		||||
    // the kernel takes more time than the channel reset time on arm archs, so to
 | 
			
		||||
    // prevent hangs reduce time_clocks.
 | 
			
		||||
    clock_t time_clocks = (clock_t)(kernel_time * (deviceProp.clockRate / 100));
 | 
			
		||||
#else
 | 
			
		||||
    clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate);
 | 
			
		||||
#endif
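    // Worked example (not from the sample itself): clockRate is reported in kHz,
    // so ms * kHz already yields clock ticks. With kernel_time = 10 ms = 1e-2 s
    // on a 1 GHz GPU (clockRate = 1e6 kHz = 1e9 Hz), the target is
    // 1e-2 * 1e9 = 1e7 ticks, which is exactly kernel_time * clockRate = 10 * 1e6.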
    clock_t total_clocks = 0;

    // Start the clock
    checkCudaErrors(cudaEventRecord(start_event, 0));

    // Queue pairs of {kernel_A, kernel_B} in separate streams
    for (int i = 0; i < nstreams; ++i) {
        kernel_A<<<1, 1, 0, streams[i]>>>(&d_a[2 * i], time_clocks);
        total_clocks += time_clocks;
        kernel_B<<<1, 1, 0, streams[i]>>>(&d_a[2 * i + 1], time_clocks);
        total_clocks += time_clocks;
    }

    // Stop the clock in stream 0 (i.e. all previous kernels will be complete)
    checkCudaErrors(cudaEventRecord(stop_event, 0));

    // At this point the CPU has dispatched all work for the GPU and can
    // continue processing other tasks in parallel. In this sample we just want
    // to wait until all work is done, so we use a blocking cudaMemcpy below.

    // Run the sum kernel and copy the result back to the host
    sum<<<1, 32>>>(d_a, 2 * nstreams);
    checkCudaErrors(cudaMemcpy(a, d_a, sizeof(clock_t), cudaMemcpyDeviceToHost));

    // stop_event will already have been recorded, but we include the synchronize
    // here to prevent copy/paste errors!
    checkCudaErrors(cudaEventSynchronize(stop_event));
    checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));

    printf("Expected time for serial execution of %d sets of kernels is between "
           "approx. %.3fs and %.3fs\n",
           nstreams,
           (nstreams + 1) * kernel_time / 1000.0f,
           2 * nstreams * kernel_time / 1000.0f);
    printf("Expected time for fully concurrent execution of %d sets of kernels is "
           "approx. %.3fs\n",
           nstreams,
           2 * kernel_time / 1000.0f);
    printf("Measured time for sample = %.3fs\n", elapsed_time / 1000.0f);

    bool bTestResult = (a[0] >= total_clocks);

    // Release resources
    for (int i = 0; i < nstreams; i++) {
        cudaStreamDestroy(streams[i]);
    }

    free(streams);
    cudaEventDestroy(start_event);
    cudaEventDestroy(stop_event);
    cudaFreeHost(a);
    cudaFree(d_a);

    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
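// A minimal sketch (hypothetical, not the sample's actual definition) of the
// spin-wait kernels that main() launches above: each one busy-waits for roughly
// clock_count ticks and writes the ticks it actually spent into its output
// slot, which the sum kernel later reduces so a[0] can be checked against
// total_clocks.
__global__ void kernel_sketch(clock_t *d_o, clock_t clock_count)
{
    clock_t start_clock  = clock();
    clock_t clock_offset = 0;
    while (clock_offset < clock_count) {
        clock_offset = clock() - start_clock; // spin until the budget is used up
    }
    d_o[0] = clock_offset;
}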
@@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d

Make sure the dependencies mentioned in the [Dependencies]() section above are installed.

## References (for more details)

@@ -32,6 +32,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <vector>

#include "helper_cuda.h"
#include "helper_multiprocess.h"
static const char shmName[] = "simpleIPCshm";
@@ -39,7 +40,7 @@ static const char shmName[] = "simpleIPCshm";
// For NVSWITCH connected peers like DGX-2, simultaneous peers are not limited
// in the same way.
#define MAX_DEVICES (32)
#define DATA_SIZE   (64ULL << 20ULL) // 64MB

#if defined(__linux__)
#define cpu_atomic_add32(a, x) __sync_add_and_fetch(a, x)
@@ -49,281 +50,280 @@ static const char shmName[] = "simpleIPCshm";
#error Unsupported system
#endif

typedef struct shmStruct_st
{
    size_t               nprocesses;
    int                  barrier;
    int                  sense;
    int                  devices[MAX_DEVICES];
    cudaIpcMemHandle_t   memHandle[MAX_DEVICES];
    cudaIpcEventHandle_t eventHandle[MAX_DEVICES];
} shmStruct;

__global__ void simpleKernel(char *ptr, int sz, char val)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (; idx < sz; idx += (gridDim.x * blockDim.x)) {
        ptr[idx] = val;
    }
}

static void barrierWait(volatile int *barrier, volatile int *sense, unsigned int n)
{
    int count;

    // Check-in
    count = cpu_atomic_add32(barrier, 1);
    if (count == n) // Last one in
        *sense = 1;
    while (!*sense)
        ;

    // Check-out
    count = cpu_atomic_add32(barrier, -1);
    if (count == 0) // Last one out
        *sense = 0;
    while (*sense)
        ;
}
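// Note on barrierWait above: this is a two-phase sense-reversing barrier. The
// check-in phase spins until the last arriving process flips *sense to 1; the
// check-out phase spins until the last departing process flips it back to 0,
// so the barrier can be reused immediately without a fast process from the next
// round racing past a stale sense value.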

static void childProcess(int id)
{
    volatile shmStruct      *shm = NULL;
    cudaStream_t             stream;
    sharedMemoryInfo         info;
    size_t                   procCount, i;
    int                      blocks  = 0;
    int                      threads = 128;
    cudaDeviceProp           prop;
    std::vector<void *>      ptrs;
    std::vector<cudaEvent_t> events;
    std::vector<char>        verification_buffer(DATA_SIZE);

    if (sharedMemoryOpen(shmName, sizeof(shmStruct), &info) != 0) {
        printf("Failed to create shared memory slab\n");
        exit(EXIT_FAILURE);
    }
    shm       = (volatile shmStruct *)info.addr;
    procCount = shm->nprocesses;

    printf("Process %d: Starting on device %d...\n", id, shm->devices[id]);

    checkCudaErrors(cudaSetDevice(shm->devices[id]));
    checkCudaErrors(cudaGetDeviceProperties(&prop, shm->devices[id]));
    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks, simpleKernel, threads, 0));
    blocks *= prop.multiProcessorCount;

    // Open and track all the allocations and events created in the master
    // process for use later
    for (i = 0; i < procCount; i++) {
        void       *ptr = NULL;
        cudaEvent_t event;

        // Notice, we don't need to explicitly enable peer access for
        // allocations on other devices.
        checkCudaErrors(
            cudaIpcOpenMemHandle(&ptr, *(cudaIpcMemHandle_t *)&shm->memHandle[i], cudaIpcMemLazyEnablePeerAccess));
        checkCudaErrors(cudaIpcOpenEventHandle(&event, *(cudaIpcEventHandle_t *)&shm->eventHandle[i]));

        ptrs.push_back(ptr);
        events.push_back(event);
    }

    // At each iteration of the loop, each sibling process will push work on
    // their respective devices accessing the next peer mapped buffer allocated
    // by the master process (these can come from other sibling processes as
    // well). To coordinate each process' access, we force the stream to wait for
    // the work already accessing this buffer asynchronously through IPC events,
    // allowing the CPU processes to continue to queue more work.
    for (i = 0; i < procCount; i++) {
        size_t bufferId = (i + id) % procCount;
        // Wait for the buffer to be accessed to be ready
        checkCudaErrors(cudaStreamWaitEvent(stream, events[bufferId], 0));
        // Push a simple kernel on it
        simpleKernel<<<blocks, threads, 0, stream>>>((char *)ptrs[bufferId], DATA_SIZE, id);
        checkCudaErrors(cudaGetLastError());
        // Signal that this buffer is ready for the next consumer
        checkCudaErrors(cudaEventRecord(events[bufferId], stream));
        // Wait for all my sibling processes to push this stage of their work
        // before proceeding to the next. This prevents siblings from racing
        // ahead and clobbering the recorded event or waiting on the wrong
        // recorded event.
        barrierWait(&shm->barrier, &shm->sense, (unsigned int)procCount);
        if (id == 0) {
            printf("Step %lld done\n", (unsigned long long)i);
        }
    }

    // Now wait for my buffer to be ready so I can copy it locally and verify it
    checkCudaErrors(cudaStreamWaitEvent(stream, events[id], 0));
    checkCudaErrors(cudaMemcpyAsync(&verification_buffer[0], ptrs[id], DATA_SIZE, cudaMemcpyDeviceToHost, stream));
    // And wait for all the queued up work to complete
    checkCudaErrors(cudaStreamSynchronize(stream));

    printf("Process %d: verifying...\n", id);

    // The contents should have the id of the sibling just after me
    char compareId = (char)((id + 1) % procCount);
    for (unsigned long long j = 0; j < DATA_SIZE; j++) {
        if (verification_buffer[j] != compareId) {
            printf("Process %d: Verification mismatch at %lld: %d != %d\n",
                   id,
                   j,
                   (int)verification_buffer[j],
                   (int)compareId);
        }
    }

    // Clean up!
    for (i = 0; i < procCount; i++) {
        checkCudaErrors(cudaIpcCloseMemHandle(ptrs[i]));
        checkCudaErrors(cudaEventDestroy(events[i]));
    }

    checkCudaErrors(cudaStreamDestroy(stream));

    printf("Process %d complete!\n", id);
}
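// Worked example of the (i + id) % procCount ring above: with procCount = 3,
// process 0 writes buffers 0, 1, 2 across the three steps, process 1 writes
// 1, 2, 0, and process 2 writes 2, 0, 1. On the last step, buffer b is written
// by process (b + 1) % procCount, which is exactly the compareId each child
// checks for during verification.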

static void parentProcess(char *app)
{
    sharedMemoryInfo         info;
    int                      devCount, i;
    volatile shmStruct      *shm = NULL;
    std::vector<void *>      ptrs;
    std::vector<cudaEvent_t> events;
    std::vector<Process>     processes;

    checkCudaErrors(cudaGetDeviceCount(&devCount));

    if (sharedMemoryCreate(shmName, sizeof(*shm), &info) != 0) {
        printf("Failed to create shared memory slab\n");
        exit(EXIT_FAILURE);
    }
    shm = (volatile shmStruct *)info.addr;
    memset((void *)shm, 0, sizeof(*shm));

    // Pick all the devices that can access each other's memory for this test
    // Keep in mind that CUDA has minimal support for fork() without a
    // corresponding exec() in the child process, but in this case our
    // spawnProcess will always exec, so no need to worry.
    for (i = 0; i < devCount; i++) {
        bool           allPeers = true;
        cudaDeviceProp prop;
        checkCudaErrors(cudaGetDeviceProperties(&prop, i));

        // CUDA IPC is only supported on devices with unified addressing
        if (!prop.unifiedAddressing) {
            printf("Device %d does not support unified addressing, skipping...\n", i);
            continue;
        }
        // This sample requires two processes accessing each device, so we need
        // to ensure exclusive or prohibited mode is not set
        if (prop.computeMode != cudaComputeModeDefault) {
            printf("Device %d is in an unsupported compute mode for this sample\n", i);
            continue;
        }

        for (int j = 0; j < shm->nprocesses; j++) {
            int canAccessPeerIJ, canAccessPeerJI;
            checkCudaErrors(cudaDeviceCanAccessPeer(&canAccessPeerJI, shm->devices[j], i));
            checkCudaErrors(cudaDeviceCanAccessPeer(&canAccessPeerIJ, i, shm->devices[j]));
            if (!canAccessPeerIJ || !canAccessPeerJI) {
                allPeers = false;
                break;
            }
        }
        if (allPeers) {
            // Enable peers here.  This isn't necessary for IPC, but it will
            // setup the peers for the device.  For systems that only allow 8
            // peers per GPU at a time, this acts to remove devices from CanAccessPeer
            for (int j = 0; j < shm->nprocesses; j++) {
                checkCudaErrors(cudaSetDevice(i));
                checkCudaErrors(cudaDeviceEnablePeerAccess(shm->devices[j], 0));
                checkCudaErrors(cudaSetDevice(shm->devices[j]));
                checkCudaErrors(cudaDeviceEnablePeerAccess(i, 0));
            }
            shm->devices[shm->nprocesses++] = i;
            if (shm->nprocesses >= MAX_DEVICES)
                break;
        }
        else {
            printf("Device %d is not peer capable with some other selected peers, "
                   "skipping\n",
                   i);
        }
    }

    if (shm->nprocesses == 0) {
        printf("No CUDA devices support IPC\n");
        exit(EXIT_WAIVED);
    }

    // Now allocate memory and an event for each process and fill the shared
    // memory buffer with the IPC handles to communicate
    for (i = 0; i < shm->nprocesses; i++) {
        void       *ptr = NULL;
        cudaEvent_t event;

        checkCudaErrors(cudaSetDevice(shm->devices[i]));
        checkCudaErrors(cudaMalloc(&ptr, DATA_SIZE));
        checkCudaErrors(cudaIpcGetMemHandle((cudaIpcMemHandle_t *)&shm->memHandle[i], ptr));
        checkCudaErrors(cudaEventCreate(&event, cudaEventDisableTiming | cudaEventInterprocess));
        checkCudaErrors(cudaIpcGetEventHandle((cudaIpcEventHandle_t *)&shm->eventHandle[i], event));

        ptrs.push_back(ptr);
        events.push_back(event);
    }

    // Launch the child processes!
    for (i = 0; i < shm->nprocesses; i++) {
        char        devIdx[12]; // Increased size to ensure enough space for formatted integer
        char *const args[] = {app, devIdx, NULL};
        Process     process;

        snprintf(devIdx, sizeof(devIdx), "%d", i);

        if (spawnProcess(&process, app, args)) {
            printf("Failed to create process\n");
            exit(EXIT_FAILURE);
        }

        processes.push_back(process);
    }

    // And wait for them to finish
    for (i = 0; i < processes.size(); i++) {
        if (waitProcess(&processes[i]) != EXIT_SUCCESS) {
            printf("Process %d failed!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    // Clean up!
    for (i = 0; i < shm->nprocesses; i++) {
        checkCudaErrors(cudaSetDevice(shm->devices[i]));
        checkCudaErrors(cudaEventSynchronize(events[i]));
        checkCudaErrors(cudaEventDestroy(events[i]));
        checkCudaErrors(cudaFree(ptrs[i]));
    }

    sharedMemoryClose(&info);
}

int main(int argc, char **argv)
{
#if defined(__arm__) || defined(__aarch64__)
    printf("Not supported on ARM\n");
    return EXIT_WAIVED;
#else
    if (argc == 1) {
        parentProcess(argv[0]);
    }
    else {
        childProcess(atoi(argv[1]));
    }
    return EXIT_SUCCESS;
#endif
}
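// Invocation pattern implied by main() above: running the binary with no
// arguments starts parentProcess(), which re-executes the same binary once per
// selected device with the child's index as argv[1] (args = {app, devIdx, NULL}),
// e.g. "./simpleIPC" spawns "./simpleIPC 0", "./simpleIPC 1", and so on.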

@@ -26,27 +26,27 @@
 */

/*
 * This sample demonstrates how to use texture fetches from layered 2D textures
 * in CUDA C
 *
 * This sample first generates a 3D input data array for the layered texture
 * and the expected output. Then it starts CUDA C kernels, one for each layer,
 * which fetch their layer's texture data (using normalized texture coordinates),
 * transform it to the expected output, and write it to a 3D output data array.
 */

// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes, kernels
#include <cuda_runtime.h>

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper for shared functions common to CUDA Samples

static const char *sSDKname = "simpleLayeredTexture";

@@ -54,163 +54,156 @@ static const char *sSDKname = "simpleLayeredTexture";
//! Transform a layer of a layered 2D texture using texture lookups
//! @param g_odata  output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void transformKernel(float *g_odata, int width, int height, int layer, cudaTextureObject_t tex)
{
    // calculate this thread's data point
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // The 0.5f offset and division are necessary to access the original data
    // points in the texture (such that bilinear interpolation will not be
    // activated). For details, see also CUDA Programming Guide, Appendix D.
    float u = (x + 0.5f) / (float)width;
    float v = (y + 0.5f) / (float)height;

    // read from texture, do the expected transformation and write to global memory
    g_odata[layer * width * height + y * width + x] = -tex2DLayered<float>(tex, u, v, layer) + layer;
}
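// Worked example of the 0.5f offset above: with width = 512 and x = 0,
// u = 0.5f / 512. In normalized coordinates a 512-texel row spans [0, 1), so
// texel k is centered at (k + 0.5) / 512; sampling exactly at texel centers
// makes cudaFilterModeLinear return the stored value instead of a blend of
// neighboring texels.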

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("[%s] - Starting...\n", sSDKname);

    // use command-line specified CUDA device, otherwise use device with highest
    // Gflops/s
    int devID = findCudaDevice(argc, (const char **)argv);

    bool bResult = true;

    // get number of SMs on this GPU
    cudaDeviceProp deviceProps;

    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount);
    printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);

    // generate input data for layered texture
    unsigned int width = 512, height = 512, num_layers = 5;
    unsigned int size   = width * height * num_layers * sizeof(float);
    float       *h_data = (float *)malloc(size);

    for (unsigned int layer = 0; layer < num_layers; layer++)
        for (int i = 0; i < (int)(width * height); i++) {
            h_data[layer * width * height + i] = (float)i;
        }

    // this is the expected transformation of the input data (the expected output)
    float *h_data_ref = (float *)malloc(size);

    for (unsigned int layer = 0; layer < num_layers; layer++)
        for (int i = 0; i < (int)(width * height); i++) {
            h_data_ref[layer * width * height + i] = -h_data[layer * width * height + i] + layer;
        }

    // allocate device memory for result
    float *d_data = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_data, size));

    // allocate array and copy image data
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray            *cu_3darray;
    checkCudaErrors(
        cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, height, num_layers), cudaArrayLayered));
    cudaMemcpy3DParms myparms = {0};
    myparms.srcPos            = make_cudaPos(0, 0, 0);
    myparms.dstPos            = make_cudaPos(0, 0, 0);
    myparms.srcPtr            = make_cudaPitchedPtr(h_data, width * sizeof(float), width, height);
    myparms.dstArray          = cu_3darray;
    myparms.extent            = make_cudaExtent(width, height, num_layers);
    myparms.kind              = cudaMemcpyHostToDevice;
    checkCudaErrors(cudaMemcpy3D(&myparms));

    cudaTextureObject_t tex;
    cudaResourceDesc    texRes;
    memset(&texRes, 0, sizeof(cudaResourceDesc));

    texRes.resType         = cudaResourceTypeArray;
    texRes.res.array.array = cu_3darray;

    cudaTextureDesc texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

    texDescr.normalizedCoords = true;
    texDescr.filterMode       = cudaFilterModeLinear;
    texDescr.addressMode[0]   = cudaAddressModeWrap;
    texDescr.addressMode[1]   = cudaAddressModeWrap;
    texDescr.readMode         = cudaReadModeElementType;

    checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));

    dim3 dimBlock(8, 8, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

    printf("Covering 2D data array of %d x %d: Grid size is %d x %d, each block has "
           "8 x 8 threads\n",
           width,
           height,
           dimGrid.x,
           dimGrid.y);

    transformKernel<<<dimGrid, dimBlock>>>(d_data, width, height, 0,
                                           tex); // warmup (for better timing)

    // check if kernel execution generated an error
    getLastCudaError("warmup Kernel execution failed");

    checkCudaErrors(cudaDeviceSynchronize());

    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    // execute the kernel
    for (unsigned int layer = 0; layer < num_layers; layer++)
        transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, height, layer, tex);

    // check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timer);
    printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
    printf("%.2f Mtexlookups/sec\n", (width * height * num_layers / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
    sdkDeleteTimer(&timer);
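    // Worked example of the rate above: 512 * 512 * 5 = 1,310,720 lookups, so a
    // 1.0 msec run reports 1310720 / 0.001 / 1e6, i.e. about 1310.72 Mtexlookups/sec.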

    // allocate mem for the result on host side
    float *h_odata = (float *)malloc(size);
    // copy result from device to host
    checkCudaErrors(cudaMemcpy(h_odata, d_data, size, cudaMemcpyDeviceToHost));

    // write regression file if necessary
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
        // write file for regression test
        sdkWriteFile<float>("./data/regression.dat", h_odata, width * height, 0.0f, false);
    }
    else {
        printf("Comparing kernel output to expected data\n");

#define MIN_EPSILON_ERROR 5e-3f
        bResult = compareData(h_odata, h_data_ref, width * height * num_layers, MIN_EPSILON_ERROR, 0.0f);
    }

    // cleanup memory
    free(h_data);
    free(h_data_ref);
    free(h_odata);

    checkCudaErrors(cudaDestroyTextureObject(tex));
    checkCudaErrors(cudaFree(d_data));
    checkCudaErrors(cudaFreeArray(cu_3darray));

    exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

@@ -26,15 +26,15 @@
 */

/* Simple example demonstrating how to use MPI with CUDA
 *
 *  Generate some random numbers on one node.
 *  Dispatch them to all nodes.
 *  Compute their square root on each node's GPU.
 *  Compute the average of the results using MPI.
 *
 *  simpleMPI.cpp: main program, compiled with mpicxx on Linux/Mac platforms;
 *                 on Windows, please download the Microsoft HPC Pack SDK 2008
 */

// MPI include
#include <mpi.h>
@@ -42,87 +42,88 @@
// System includes
#include <iostream>

using std::cerr;
using std::cout;
using std::endl;

// User include
#include "simpleMPI.h"

// Error handling macros
#define MPI_CHECK(call)                              \
    if ((call) != MPI_SUCCESS) {                     \
        cerr << "MPI error calling \"" #call "\"\n"; \
        my_abort(-1);                                \
    }

// Host code
// No CUDA here, only MPI
int main(int argc, char *argv[])
{
    // Dimensions of the dataset
    int blockSize       = 256;
    int gridSize        = 10000;
    int dataSizePerNode = gridSize * blockSize;
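    // Worked example: dataSizePerNode = 10000 * 256 = 2,560,000 floats, i.e.
    // about 10.24 MB per node; across commSize nodes the root generates
    // commSize * 2,560,000 values before scattering them below.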

    // Initialize MPI state
    MPI_CHECK(MPI_Init(&argc, &argv));

    // Get our MPI node number and node count
    int commSize, commRank;
    MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &commSize));
    MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &commRank));

    // Generate some random numbers on the root node (node 0)
    int    dataSizeTotal = dataSizePerNode * commSize;
    float *dataRoot      = NULL;

    // Are we the root node?
    if (commRank == 0) {
        cout << "Running on " << commSize << " nodes" << endl;
        dataRoot = new float[dataSizeTotal];
        initData(dataRoot, dataSizeTotal);
    }

    // Allocate a buffer on each node
    float *dataNode = new float[dataSizePerNode];

    // Dispatch a portion of the input data to each node
    MPI_CHECK(
        MPI_Scatter(dataRoot, dataSizePerNode, MPI_FLOAT, dataNode, dataSizePerNode, MPI_FLOAT, 0, MPI_COMM_WORLD));

    if (commRank == 0) {
        // No need for root data any more
        delete[] dataRoot;
    }

    // On each node, run computation on GPU
    computeGPU(dataNode, blockSize, gridSize);

    // Reduction to the root node, computing the sum of output elements
    float sumNode = sum(dataNode, dataSizePerNode);
    float sumRoot;

    MPI_CHECK(MPI_Reduce(&sumNode, &sumRoot, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD));

    if (commRank == 0) {
        float average = sumRoot / dataSizeTotal;
        cout << "Average of square roots is: " << average << endl;
    }

    // Cleanup
    delete[] dataNode;
    MPI_CHECK(MPI_Finalize());

    if (commRank == 0) {
        cout << "PASSED\n";
    }

    return 0;
}

// Shut down MPI cleanly if something goes wrong
void my_abort(int err)
{
    cout << "Test FAILED\n";
    MPI_Abort(MPI_COMM_WORLD, err);
}
			
		||||
 | 
			
		||||
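The host program above is the whole sample in miniature: scatter from rank 0, compute per node, reduce back to rank 0. A stripped-down, host-only sketch of that same shape, with a plain loop standing in for computeGPU() and sum() (hypothetical perNode size, buildable with an MPI C++ compiler such as mpicxx):

    #include <mpi.h>
    #include <iostream>

    int main(int argc, char *argv[])
    {
        MPI_Init(&argc, &argv);
        int size, rank;
        MPI_Comm_size(MPI_COMM_WORLD, &size);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        const int perNode = 4; // hypothetical per-rank element count
        float    *root    = NULL;
        if (rank == 0) {
            root = new float[size * perNode];
            for (int i = 0; i < size * perNode; i++)
                root[i] = 1.0f; // stand-in for initData()
        }

        float local[perNode];
        MPI_Scatter(root, perNode, MPI_FLOAT, local, perNode, MPI_FLOAT, 0, MPI_COMM_WORLD);

        float localSum = 0.f; // stand-in for computeGPU() + sum()
        for (int i = 0; i < perNode; i++)
            localSum += local[i];

        float totalSum = 0.f;
        MPI_Reduce(&localSum, &totalSum, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
        if (rank == 0) {
            std::cout << "average = " << totalSum / (size * perNode) << std::endl;
            delete[] root;
        }
        MPI_Finalize();
        return 0;
    }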
@@ -26,14 +26,14 @@
 */

/* Simple example demonstrating how to use MPI with CUDA
-*
-*  Generate some random numbers on one node.
-*  Dispatch them to all nodes.
-*  Compute their square root on each node's GPU.
-*  Compute the average of the results using MPI.
-*
-*  simpleMPI.cu: GPU part, compiled with nvcc
-*/
+ *
+ *  Generate some random numbers on one node.
+ *  Dispatch them to all nodes.
+ *  Compute their square root on each node's GPU.
+ *  Compute the average of the results using MPI.
+ *
+ *  simpleMPI.cu: GPU part, compiled with nvcc
+ */

#include <iostream>
using std::cerr;
@@ -42,61 +42,63 @@ using std::endl;
#include "simpleMPI.h"

// Error handling macro
-#define CUDA_CHECK(call)                                                 \
-  if ((call) != cudaSuccess) {                                           \
-    cudaError_t err = cudaGetLastError();                                \
-    cerr << "CUDA error calling \"" #call "\", code is " << err << endl; \
-    my_abort(err);                                                       \
-  }
+#define CUDA_CHECK(call)                                                     \
+    if ((call) != cudaSuccess) {                                             \
+        cudaError_t err = cudaGetLastError();                                \
+        cerr << "CUDA error calling \"" #call "\", code is " << err << endl; \
+        my_abort(err);                                                       \
+    }

// Device code
// Very simple GPU Kernel that computes square roots of input numbers
-__global__ void simpleMPIKernel(float *input, float *output) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  output[tid] = sqrt(input[tid]);
+__global__ void simpleMPIKernel(float *input, float *output)
+{
+    int tid     = blockIdx.x * blockDim.x + threadIdx.x;
+    output[tid] = sqrt(input[tid]);
}
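Note that simpleMPIKernel carries no bounds check: computeGPU() below launches exactly gridSize blocks of blockSize threads over dataSize = gridSize * blockSize elements, so every thread maps to a valid element. When the element count does not divide evenly, the usual guarded form looks like this (a sketch, not this sample's kernel):

    __global__ void simpleMPIKernelGuarded(float *input, float *output, int n)
    {
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (tid < n) { // the last block may contain threads past the end of the data
            output[tid] = sqrtf(input[tid]);
        }
    }

    // Round the block count up so all n elements are covered:
    //   int blocks = (n + blockSize - 1) / blockSize;
    //   simpleMPIKernelGuarded<<<blocks, blockSize>>>(d_in, d_out, n);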

// Initialize an array with random data (between 0 and 1)
-void initData(float *data, int dataSize) {
-  for (int i = 0; i < dataSize; i++) {
-    data[i] = (float)rand() / RAND_MAX;
-  }
+void initData(float *data, int dataSize)
+{
+    for (int i = 0; i < dataSize; i++) {
+        data[i] = (float)rand() / RAND_MAX;
+    }
}

// CUDA computation on each node
// No MPI here, only CUDA
-void computeGPU(float *hostData, int blockSize, int gridSize) {
-  int dataSize = blockSize * gridSize;
+void computeGPU(float *hostData, int blockSize, int gridSize)
+{
+    int dataSize = blockSize * gridSize;

-  // Allocate data on GPU memory
-  float *deviceInputData = NULL;
-  CUDA_CHECK(cudaMalloc((void **)&deviceInputData, dataSize * sizeof(float)));
+    // Allocate data on GPU memory
+    float *deviceInputData = NULL;
+    CUDA_CHECK(cudaMalloc((void **)&deviceInputData, dataSize * sizeof(float)));

-  float *deviceOutputData = NULL;
-  CUDA_CHECK(cudaMalloc((void **)&deviceOutputData, dataSize * sizeof(float)));
+    float *deviceOutputData = NULL;
+    CUDA_CHECK(cudaMalloc((void **)&deviceOutputData, dataSize * sizeof(float)));

-  // Copy to GPU memory
-  CUDA_CHECK(cudaMemcpy(deviceInputData, hostData, dataSize * sizeof(float),
-                        cudaMemcpyHostToDevice));
+    // Copy to GPU memory
+    CUDA_CHECK(cudaMemcpy(deviceInputData, hostData, dataSize * sizeof(float), cudaMemcpyHostToDevice));

-  // Run kernel
-  simpleMPIKernel<<<gridSize, blockSize>>>(deviceInputData, deviceOutputData);
+    // Run kernel
+    simpleMPIKernel<<<gridSize, blockSize>>>(deviceInputData, deviceOutputData);

-  // Copy data back to CPU memory
-  CUDA_CHECK(cudaMemcpy(hostData, deviceOutputData, dataSize * sizeof(float),
-                        cudaMemcpyDeviceToHost));
+    // Copy data back to CPU memory
+    CUDA_CHECK(cudaMemcpy(hostData, deviceOutputData, dataSize * sizeof(float), cudaMemcpyDeviceToHost));

-  // Free GPU memory
-  CUDA_CHECK(cudaFree(deviceInputData));
-  CUDA_CHECK(cudaFree(deviceOutputData));
+    // Free GPU memory
+    CUDA_CHECK(cudaFree(deviceInputData));
+    CUDA_CHECK(cudaFree(deviceOutputData));
}

-float sum(float *data, int size) {
-  float accum = 0.f;
+float sum(float *data, int size)
+{
+    float accum = 0.f;

-  for (int i = 0; i < size; i++) {
-    accum += data[i];
-  }
+    for (int i = 0; i < size; i++) {
+        accum += data[i];
+    }

-  return accum;
+    return accum;
}

@@ -26,19 +26,20 @@
 */

/* Simple example demonstrating how to use MPI with CUDA
-*
-*  Generate some random numbers on one node.
-*  Dispatch them to all nodes.
-*  Compute their square root on each node's GPU.
-*  Compute the average of the results using MPI.
-*
-*  simpleMPI.h: common header file
-*/
+ *
+ *  Generate some random numbers on one node.
+ *  Dispatch them to all nodes.
+ *  Compute their square root on each node's GPU.
+ *  Compute the average of the results using MPI.
+ *
+ *  simpleMPI.h: common header file
+ */

// Forward declarations
-extern "C" {
-void initData(float *data, int dataSize);
-void computeGPU(float *hostData, int blockSize, int gridSize);
-float sum(float *data, int size);
-void my_abort(int err);
+extern "C"
+{
+    void  initData(float *data, int dataSize);
+    void  computeGPU(float *hostData, int blockSize, int gridSize);
+    float sum(float *data, int size);
+    void  my_abort(int err);
}

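The extern "C" block matters because simpleMPI.h is included both by simpleMPI.cpp (built with the MPI C++ compiler) and by simpleMPI.cu (built with nvcc); C linkage pins each function to a single unmangled symbol so the two translation units link regardless of compiler. A sketch of the difference:

    // With C linkage, both compilers emit the plain symbol:
    //   computeGPU
    extern "C" void computeGPU(float *hostData, int blockSize, int gridSize);

    // With default C++ linkage, each compiler would emit a mangled name
    // (e.g. _Z10computeGPUPfii under the Itanium ABI), and any mismatch
    // surfaces only at link time as an undefined reference.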
@@ -38,7 +38,7 @@
 *
 * Elapsed times are averaged over nreps repetitions (10 by default).
 *
-*/
+ */

const char *sSDKname = "simpleMultiCopy";

@@ -50,25 +50,26 @@ const char *sSDKname = "simpleMultiCopy";

// includes, project
#include <helper_cuda.h>
-#include <helper_functions.h>  // helper for shared that are common to CUDA Samples
+#include <helper_functions.h> // helper for shared that are common to CUDA Samples

// includes, kernels
// Declare the CUDA kernels here and main() code that is needed to launch
// Compute workload on the system
-__global__ void incKernel(int *g_out, int *g_in, int N, int inner_reps) {
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+__global__ void incKernel(int *g_out, int *g_in, int N, int inner_reps)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;

-  if (idx < N) {
-    for (int i = 0; i < inner_reps; ++i) {
-      g_out[idx] = g_in[idx] + 1;
+    if (idx < N) {
+        for (int i = 0; i < inner_reps; ++i) {
+            g_out[idx] = g_in[idx] + 1;
+        }
    }
-  }
}

#define STREAM_COUNT 4

// Uncomment to simulate data source/sink IO times
-//#define SIMULATE_IO
+// #define SIMULATE_IO

int *h_data_source;
int *h_data_sink;
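One detail worth flagging before the allocations further down: the h_data_in/h_data_out buffers come from cudaHostAlloc, not malloc. cudaMemcpyAsync can only overlap with kernel execution when the host buffer is page-locked (pinned); with pageable memory the copy is staged through an internal buffer and the overlap this sample measures would largely disappear. A minimal sketch of the pairing, assuming a single stream:

    int         *h_buf = NULL, *d_buf = NULL;
    cudaStream_t s;
    checkCudaErrors(cudaStreamCreate(&s));
    checkCudaErrors(cudaHostAlloc(&h_buf, memsize, cudaHostAllocDefault)); // pinned host memory
    checkCudaErrors(cudaMalloc(&d_buf, memsize));

    // Truly asynchronous only because h_buf is pinned; with malloc'd
    // memory this copy could not overlap with work in other streams.
    checkCudaErrors(cudaMemcpyAsync(d_buf, h_buf, memsize, cudaMemcpyHostToDevice, s));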
@@ -79,13 +80,13 @@ int *d_data_in[STREAM_COUNT];
int *h_data_out[STREAM_COUNT];
int *d_data_out[STREAM_COUNT];

-cudaEvent_t cycleDone[STREAM_COUNT];
+cudaEvent_t  cycleDone[STREAM_COUNT];
cudaStream_t stream[STREAM_COUNT];

cudaEvent_t start, stop;

-int N = 1 << 22;
-int nreps = 10;  // number of times each experiment is repeated
+int N          = 1 << 22;
+int nreps      = 10; // number of times each experiment is repeated
int inner_reps = 5;

int memsize;
@@ -96,278 +97,268 @@ dim3 grid;
int thread_blocks;

float processWithStreams(int streams_used);
-void init();
-bool test();
+void  init();
+bool  test();

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char *argv[]) {
-  int cuda_device = 0;
-  float scale_factor;
-  cudaDeviceProp deviceProp;
+int main(int argc, char *argv[])
+{
+    int            cuda_device = 0;
+    float          scale_factor;
+    cudaDeviceProp deviceProp;

-  printf("[%s] - Starting...\n", sSDKname);
+    printf("[%s] - Starting...\n", sSDKname);

-  if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
-    cuda_device = getCmdLineArgumentInt(argc, (const char **)argv, "device=");
+    if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
+        cuda_device = getCmdLineArgumentInt(argc, (const char **)argv, "device=");

-    if (cuda_device < 0) {
-      printf("Invalid command line parameters\n");
-      exit(EXIT_FAILURE);
-    } else {
-      printf("cuda_device = %d\n", cuda_device);
-      cuda_device = gpuDeviceInit(cuda_device);
+        if (cuda_device < 0) {
+            printf("Invalid command line parameters\n");
+            exit(EXIT_FAILURE);
+        }
+        else {
+            printf("cuda_device = %d\n", cuda_device);
+            cuda_device = gpuDeviceInit(cuda_device);

-      if (cuda_device < 0) {
-        printf("No CUDA Capable devices found, exiting...\n");
-        exit(EXIT_SUCCESS);
-      }
+            if (cuda_device < 0) {
+                printf("No CUDA Capable devices found, exiting...\n");
+                exit(EXIT_SUCCESS);
+            }
+        }
    }
-  } else {
-    // Otherwise pick the device with the highest Gflops/s
-    cuda_device = gpuGetMaxGflopsDeviceId();
-    checkCudaErrors(cudaSetDevice(cuda_device));
-    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
-    printf("> Using CUDA device [%d]: %s\n", cuda_device, deviceProp.name);
-  }
+    else {
+        // Otherwise pick the device with the highest Gflops/s
+        cuda_device = gpuGetMaxGflopsDeviceId();
+        checkCudaErrors(cudaSetDevice(cuda_device));
+        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
+        printf("> Using CUDA device [%d]: %s\n", cuda_device, deviceProp.name);
+    }

-  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
-  printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n", deviceProp.name,
-         deviceProp.multiProcessorCount,
-         _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
-         _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
-             deviceProp.multiProcessorCount);
+    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
+    printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n",
+           deviceProp.name,
+           deviceProp.multiProcessorCount,
+           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
+           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);

-  // Anything that is less than 32 Cores will have scaled down workload
-  scale_factor =
-      max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
-                    (float)deviceProp.multiProcessorCount)),
-          1.0f);
-  N = (int)((float)N / scale_factor);
+    // Anything that is less than 32 Cores will have scaled down workload
+    scale_factor =
+        max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)),
+            1.0f);
+    N = (int)((float)N / scale_factor);

-  printf("> Device name: %s\n", deviceProp.name);
-  printf("> CUDA Capability %d.%d hardware with %d multi-processors\n",
-         deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
-  printf("> scale_factor = %.2f\n", 1.0f / scale_factor);
-  printf("> array_size   = %d\n\n", N);
+    printf("> Device name: %s\n", deviceProp.name);
+    printf("> CUDA Capability %d.%d hardware with %d multi-processors\n",
+           deviceProp.major,
+           deviceProp.minor,
+           deviceProp.multiProcessorCount);
+    printf("> scale_factor = %.2f\n", 1.0f / scale_factor);
+    printf("> array_size   = %d\n\n", N);

-  memsize = N * sizeof(int);
+    memsize = N * sizeof(int);

-  thread_blocks = N / block.x;
+    thread_blocks = N / block.x;

-  grid.x = thread_blocks % 65535;
-  grid.y = (thread_blocks / 65535 + 1);
+    grid.x = thread_blocks % 65535;
+    grid.y = (thread_blocks / 65535 + 1);

-  // Allocate resources
+    // Allocate resources

-  h_data_source = (int *)malloc(memsize);
-  h_data_sink = (int *)malloc(memsize);
+    h_data_source = (int *)malloc(memsize);
+    h_data_sink   = (int *)malloc(memsize);

-  for (int i = 0; i < STREAM_COUNT; ++i) {
-    checkCudaErrors(
-        cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault));
-    checkCudaErrors(cudaMalloc(&d_data_in[i], memsize));
-    checkCudaErrors(cudaMemset(d_data_in[i], 0, memsize));
-
-    checkCudaErrors(
-        cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault));
-    checkCudaErrors(cudaMalloc(&d_data_out[i], memsize));
-
-    checkCudaErrors(cudaStreamCreate(&stream[i]));
-    checkCudaErrors(cudaEventCreate(&cycleDone[i]));
-
-    cudaEventRecord(cycleDone[i], stream[i]);
-  }
+    for (int i = 0; i < STREAM_COUNT; ++i) {
+        checkCudaErrors(cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault));
+        checkCudaErrors(cudaMalloc(&d_data_in[i], memsize));
+        checkCudaErrors(cudaMemset(d_data_in[i], 0, memsize));
+
+        checkCudaErrors(cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault));
+        checkCudaErrors(cudaMalloc(&d_data_out[i], memsize));
+
+        checkCudaErrors(cudaStreamCreate(&stream[i]));
+        checkCudaErrors(cudaEventCreate(&cycleDone[i]));
+
+        cudaEventRecord(cycleDone[i], stream[i]);
+    }

-  cudaEventCreate(&start);
-  cudaEventCreate(&stop);
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);

-  init();
+    init();

-  // Kernel warmup
-  incKernel<<<grid, block>>>(d_data_out[0], d_data_in[0], N, inner_reps);
+    // Kernel warmup
+    incKernel<<<grid, block>>>(d_data_out[0], d_data_in[0], N, inner_reps);

-  // Time copies and kernel
-  cudaEventRecord(start, 0);
-  checkCudaErrors(cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize,
-                                  cudaMemcpyHostToDevice, 0));
-  cudaEventRecord(stop, 0);
-  cudaEventSynchronize(stop);
+    // Time copies and kernel
+    cudaEventRecord(start, 0);
+    checkCudaErrors(cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize, cudaMemcpyHostToDevice, 0));
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);

-  float memcpy_h2d_time;
-  cudaEventElapsedTime(&memcpy_h2d_time, start, stop);
+    float memcpy_h2d_time;
+    cudaEventElapsedTime(&memcpy_h2d_time, start, stop);

-  cudaEventRecord(start, 0);
-  checkCudaErrors(cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize,
-                                  cudaMemcpyDeviceToHost, 0));
-  cudaEventRecord(stop, 0);
-  cudaEventSynchronize(stop);
+    cudaEventRecord(start, 0);
+    checkCudaErrors(cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize, cudaMemcpyDeviceToHost, 0));
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);

-  float memcpy_d2h_time;
-  cudaEventElapsedTime(&memcpy_d2h_time, start, stop);
+    float memcpy_d2h_time;
+    cudaEventElapsedTime(&memcpy_d2h_time, start, stop);

-  cudaEventRecord(start, 0);
-  incKernel<<<grid, block, 0, 0>>>(d_data_out[0], d_data_in[0], N, inner_reps);
-  cudaEventRecord(stop, 0);
-  cudaEventSynchronize(stop);
+    cudaEventRecord(start, 0);
+    incKernel<<<grid, block, 0, 0>>>(d_data_out[0], d_data_in[0], N, inner_reps);
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);

-  float kernel_time;
-  cudaEventElapsedTime(&kernel_time, start, stop);
+    float kernel_time;
+    cudaEventElapsedTime(&kernel_time, start, stop);
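All three measurements above follow the standard CUDA event-timing idiom: record an event before and after the work in a stream, block on the stop event, then read back the elapsed milliseconds. Condensed:

    cudaEvent_t t0, t1;
    cudaEventCreate(&t0);
    cudaEventCreate(&t1);

    cudaEventRecord(t0, 0);   // enqueue start marker in stream 0
    /* ... enqueue the copies or kernel to be timed ... */
    cudaEventRecord(t1, 0);   // enqueue stop marker
    cudaEventSynchronize(t1); // wait until the stop marker has executed

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, t0, t1); // GPU-side elapsed time, in milliseconds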

-  printf("\n");
-  printf("Relevant properties of this CUDA device\n");
-  printf(
-      "(%s) Can overlap one CPU<>GPU data transfer with GPU kernel execution "
-      "(device property \"deviceOverlap\")\n",
-      deviceProp.deviceOverlap ? "X" : " ");
-  // printf("(%s) Can execute several GPU kernels simultaneously (compute
-  // capability >= 2.0)\n", deviceProp.major >= 2 ? "X": " ");
-  printf(
-      "(%s) Can overlap two CPU<>GPU data transfers with GPU kernel execution\n"
-      "    (Compute Capability >= 2.0 AND (Tesla product OR Quadro "
-      "4000/5000/6000/K5000)\n",
-      (deviceProp.major >= 2 && deviceProp.asyncEngineCount > 1) ? "X" : " ");
+    printf("\n");
+    printf("Relevant properties of this CUDA device\n");
+    printf("(%s) Can overlap one CPU<>GPU data transfer with GPU kernel execution "
+           "(device property \"deviceOverlap\")\n",
+           deviceProp.deviceOverlap ? "X" : " ");
+    // printf("(%s) Can execute several GPU kernels simultaneously (compute
+    // capability >= 2.0)\n", deviceProp.major >= 2 ? "X": " ");
+    printf("(%s) Can overlap two CPU<>GPU data transfers with GPU kernel execution\n"
+           "    (Compute Capability >= 2.0 AND (Tesla product OR Quadro "
+           "4000/5000/6000/K5000)\n",
+           (deviceProp.major >= 2 && deviceProp.asyncEngineCount > 1) ? "X" : " ");

-  printf("\n");
-  printf("Measured timings (throughput):\n");
-  printf(" Memcpy host to device\t: %f ms (%f GB/s)\n", memcpy_h2d_time,
-         (memsize * 1e-6) / memcpy_h2d_time);
-  printf(" Memcpy device to host\t: %f ms (%f GB/s)\n", memcpy_d2h_time,
-         (memsize * 1e-6) / memcpy_d2h_time);
-  printf(" Kernel\t\t\t: %f ms (%f GB/s)\n", kernel_time,
-         (inner_reps * memsize * 2e-6) / kernel_time);
+    printf("\n");
+    printf("Measured timings (throughput):\n");
+    printf(" Memcpy host to device\t: %f ms (%f GB/s)\n", memcpy_h2d_time, (memsize * 1e-6) / memcpy_h2d_time);
+    printf(" Memcpy device to host\t: %f ms (%f GB/s)\n", memcpy_d2h_time, (memsize * 1e-6) / memcpy_d2h_time);
+    printf(" Kernel\t\t\t: %f ms (%f GB/s)\n", kernel_time, (inner_reps * memsize * 2e-6) / kernel_time);

-  printf("\n");
-  printf(
-      "Theoretical limits for speedup gained from overlapped data "
-      "transfers:\n");
-  printf("No overlap at all (transfer-kernel-transfer): %f ms \n",
-         memcpy_h2d_time + memcpy_d2h_time + kernel_time);
-  printf("Compute can overlap with one transfer: %f ms\n",
-         max((memcpy_h2d_time + memcpy_d2h_time), kernel_time));
-  printf("Compute can overlap with both data transfers: %f ms\n",
-         max(max(memcpy_h2d_time, memcpy_d2h_time), kernel_time));
+    printf("\n");
+    printf("Theoretical limits for speedup gained from overlapped data "
+           "transfers:\n");
+    printf("No overlap at all (transfer-kernel-transfer): %f ms \n", memcpy_h2d_time + memcpy_d2h_time + kernel_time);
+    printf("Compute can overlap with one transfer: %f ms\n", max((memcpy_h2d_time + memcpy_d2h_time), kernel_time));
+    printf("Compute can overlap with both data transfers: %f ms\n",
+           max(max(memcpy_h2d_time, memcpy_d2h_time), kernel_time));

-  // Process pipelined work
-  float serial_time = processWithStreams(1);
-  float overlap_time = processWithStreams(STREAM_COUNT);
+    // Process pipelined work
+    float serial_time  = processWithStreams(1);
+    float overlap_time = processWithStreams(STREAM_COUNT);

-  printf("\nAverage measured timings over %d repetitions:\n", nreps);
-  printf(" Avg. time when execution fully serialized\t: %f ms\n",
-         serial_time / nreps);
-  printf(" Avg. time when overlapped using %d streams\t: %f ms\n", STREAM_COUNT,
-         overlap_time / nreps);
-  printf(" Avg. speedup gained (serialized - overlapped)\t: %f ms\n",
-         (serial_time - overlap_time) / nreps);
+    printf("\nAverage measured timings over %d repetitions:\n", nreps);
+    printf(" Avg. time when execution fully serialized\t: %f ms\n", serial_time / nreps);
+    printf(" Avg. time when overlapped using %d streams\t: %f ms\n", STREAM_COUNT, overlap_time / nreps);
+    printf(" Avg. speedup gained (serialized - overlapped)\t: %f ms\n", (serial_time - overlap_time) / nreps);

-  printf("\nMeasured throughput:\n");
-  printf(" Fully serialized execution\t\t: %f GB/s\n",
-         (nreps * (memsize * 2e-6)) / serial_time);
-  printf(" Overlapped using %d streams\t\t: %f GB/s\n", STREAM_COUNT,
-         (nreps * (memsize * 2e-6)) / overlap_time);
+    printf("\nMeasured throughput:\n");
+    printf(" Fully serialized execution\t\t: %f GB/s\n", (nreps * (memsize * 2e-6)) / serial_time);
+    printf(" Overlapped using %d streams\t\t: %f GB/s\n", STREAM_COUNT, (nreps * (memsize * 2e-6)) / overlap_time);

-  // Verify the results, we will use the results for final output
-  bool bResults = test();
+    // Verify the results, we will use the results for final output
+    bool bResults = test();

-  // Free resources
+    // Free resources

-  free(h_data_source);
-  free(h_data_sink);
+    free(h_data_source);
+    free(h_data_sink);

-  for (int i = 0; i < STREAM_COUNT; ++i) {
-    cudaFreeHost(h_data_in[i]);
-    cudaFree(d_data_in[i]);
-
-    cudaFreeHost(h_data_out[i]);
-    cudaFree(d_data_out[i]);
-
-    cudaStreamDestroy(stream[i]);
-    cudaEventDestroy(cycleDone[i]);
-  }
+    for (int i = 0; i < STREAM_COUNT; ++i) {
+        cudaFreeHost(h_data_in[i]);
+        cudaFree(d_data_in[i]);
+
+        cudaFreeHost(h_data_out[i]);
+        cudaFree(d_data_out[i]);
+
+        cudaStreamDestroy(stream[i]);
+        cudaEventDestroy(cycleDone[i]);
+    }

-  cudaEventDestroy(start);
-  cudaEventDestroy(stop);
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);

-  // Test result
-  exit(bResults ? EXIT_SUCCESS : EXIT_FAILURE);
+    // Test result
+    exit(bResults ? EXIT_SUCCESS : EXIT_FAILURE);
}

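The three "theoretical limits" printed above are simple sum/max bounds. For example, with memcpy_h2d_time = memcpy_d2h_time = 5 ms and kernel_time = 8 ms: no overlap costs 5 + 8 + 5 = 18 ms per cycle, overlapping compute with one copy direction is bounded by max(5 + 5, 8) = 10 ms, and overlapping compute with both copies is bounded by max(5, 5, 8) = 8 ms, the kernel time itself.
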
-float processWithStreams(int streams_used) {
-  int current_stream = 0;
+float processWithStreams(int streams_used)
+{
+    int current_stream = 0;

-  float time;
+    float time;

-  // Do processing in a loop
-  //
-  // Note: All memory commands are processed in the order  they are issued,
-  // independent of the stream they are enqueued in. Hence the pattern by
-  // which the copy and kernel commands are enqueued in the stream
-  // has an influence on the achieved overlap.
+    // Do processing in a loop
+    //
+    // Note: All memory commands are processed in the order  they are issued,
+    // independent of the stream they are enqueued in. Hence the pattern by
+    // which the copy and kernel commands are enqueued in the stream
+    // has an influence on the achieved overlap.

-  cudaEventRecord(start, 0);
+    cudaEventRecord(start, 0);

-  for (int i = 0; i < nreps; ++i) {
-    int next_stream = (current_stream + 1) % streams_used;
+    for (int i = 0; i < nreps; ++i) {
+        int next_stream = (current_stream + 1) % streams_used;

#ifdef SIMULATE_IO
-    // Store the result
-    memcpy(h_data_sink, h_data_out[current_stream], memsize);
+        // Store the result
+        memcpy(h_data_sink, h_data_out[current_stream], memsize);

-    // Read new input
-    memcpy(h_data_in[next_stream], h_data_source, memsize);
+        // Read new input
+        memcpy(h_data_in[next_stream], h_data_source, memsize);
#endif

-    // Ensure that processing and copying of the last cycle has finished
-    cudaEventSynchronize(cycleDone[next_stream]);
+        // Ensure that processing and copying of the last cycle has finished
+        cudaEventSynchronize(cycleDone[next_stream]);

-    // Process current frame
-    incKernel<<<grid, block, 0, stream[current_stream]>>>(
-        d_data_out[current_stream], d_data_in[current_stream], N, inner_reps);
+        // Process current frame
+        incKernel<<<grid, block, 0, stream[current_stream]>>>(
+            d_data_out[current_stream], d_data_in[current_stream], N, inner_reps);

-    // Upload next frame
-    checkCudaErrors(
-        cudaMemcpyAsync(d_data_in[next_stream], h_data_in[next_stream], memsize,
-                        cudaMemcpyHostToDevice, stream[next_stream]));
+        // Upload next frame
+        checkCudaErrors(cudaMemcpyAsync(
+            d_data_in[next_stream], h_data_in[next_stream], memsize, cudaMemcpyHostToDevice, stream[next_stream]));

-    // Download current frame
-    checkCudaErrors(cudaMemcpyAsync(
-        h_data_out[current_stream], d_data_out[current_stream], memsize,
-        cudaMemcpyDeviceToHost, stream[current_stream]));
+        // Download current frame
+        checkCudaErrors(cudaMemcpyAsync(h_data_out[current_stream],
+                                        d_data_out[current_stream],
+                                        memsize,
+                                        cudaMemcpyDeviceToHost,
+                                        stream[current_stream]));

-    checkCudaErrors(
-        cudaEventRecord(cycleDone[current_stream], stream[current_stream]));
+        checkCudaErrors(cudaEventRecord(cycleDone[current_stream], stream[current_stream]));

-    current_stream = next_stream;
-  }
+        current_stream = next_stream;
+    }

-  cudaEventRecord(stop, 0);
+    cudaEventRecord(stop, 0);

-  cudaDeviceSynchronize();
+    cudaDeviceSynchronize();

-  cudaEventElapsedTime(&time, start, stop);
+    cudaEventElapsedTime(&time, start, stop);

-  return time;
+    return time;
}

-void init() {
-  for (int i = 0; i < N; ++i) {
-    h_data_source[i] = 0;
-  }
+void init()
+{
+    for (int i = 0; i < N; ++i) {
+        h_data_source[i] = 0;
+    }

-  for (int i = 0; i < STREAM_COUNT; ++i) {
-    memcpy(h_data_in[i], h_data_source, memsize);
-  }
+    for (int i = 0; i < STREAM_COUNT; ++i) {
+        memcpy(h_data_in[i], h_data_source, memsize);
+    }
}

-bool test() {
-  bool passed = true;
+bool test()
+{
+    bool passed = true;

-  for (int j = 0; j < STREAM_COUNT; ++j) {
-    for (int i = 0; i < N; ++i) {
-      passed &= (h_data_out[j][i] == 1);
+    for (int j = 0; j < STREAM_COUNT; ++j) {
+        for (int i = 0; i < N; ++i) {
+            passed &= (h_data_out[j][i] == 1);
+        }
    }
-  }

-  return passed;
+    return passed;
}

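The loop in processWithStreams above is a rotating buffer pipeline: the cycleDone event recorded at the end of each cycle is what makes it safe to reuse a stream's buffers on a later iteration. The essential shape, reduced to two streams and shortened names (a sketch of the pattern, not a drop-in replacement):

    int cur = 0;
    for (int i = 0; i < nreps; ++i) {
        int next = (cur + 1) % 2;

        cudaEventSynchronize(cycleDone[next]); // buffers of `next` are free again
        incKernel<<<grid, block, 0, stream[cur]>>>(d_out[cur], d_in[cur], N, inner_reps);
        cudaMemcpyAsync(d_in[next], h_in[next], memsize, cudaMemcpyHostToDevice, stream[next]); // upload frame i+1
        cudaMemcpyAsync(h_out[cur], d_out[cur], memsize, cudaMemcpyDeviceToHost, stream[cur]);  // download frame i
        cudaEventRecord(cycleDone[cur], stream[cur]); // marks when cur's buffers may be reused

        cur = next;
    }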
@@ -37,15 +37,15 @@
 */

// System includes
-#include <stdio.h>
#include <assert.h>
+#include <stdio.h>

// CUDA runtime
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
-#include <helper_functions.h>
#include <helper_cuda.h>
+#include <helper_functions.h>

#ifndef MAX
#define MAX(a, b) (a > b ? a : b)
@@ -57,180 +57,176 @@
// Data configuration
////////////////////////////////////////////////////////////////////////////////
const int MAX_GPU_COUNT = 32;
-const int DATA_N = 1048576 * 32;
+const int DATA_N        = 1048576 * 32;

////////////////////////////////////////////////////////////////////////////////
// Simple reduction kernel.
// Refer to the 'reduction' CUDA Sample describing
// reduction optimization strategies
////////////////////////////////////////////////////////////////////////////////
-__global__ static void reduceKernel(float *d_Result, float *d_Input, int N) {
-  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  const int threadN = gridDim.x * blockDim.x;
-  float sum = 0;
+__global__ static void reduceKernel(float *d_Result, float *d_Input, int N)
+{
+    const int tid     = blockIdx.x * blockDim.x + threadIdx.x;
+    const int threadN = gridDim.x * blockDim.x;
+    float     sum     = 0;

-  for (int pos = tid; pos < N; pos += threadN) sum += d_Input[pos];
+    for (int pos = tid; pos < N; pos += threadN)
+        sum += d_Input[pos];

-  d_Result[tid] = sum;
+    d_Result[tid] = sum;
}

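reduceKernel above is a grid-stride loop: the launch is fixed at BLOCK_N x THREAD_N = 32 x 256 = 8192 threads, and each thread sums elements tid, tid + 8192, tid + 16384, and so on, so any N is covered without retuning the launch. Each thread then writes one partial sum, and the host finishes the reduction by adding the ACCUM_N = 8192 partials (the loop over h_Sum_from_device in main() below). Rough arithmetic for scale:

    // With DATA_N = 1048576 * 32 split over, say, 2 GPUs:
    //   dataN per GPU       = 16777216 elements
    //   threads per launch  = 32 * 256 = 8192
    //   elements per thread = 16777216 / 8192 = 2048 loop iterations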
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
-  // Solver config
-  TGPUplan plan[MAX_GPU_COUNT];
+int main(int argc, char **argv)
+{
+    // Solver config
+    TGPUplan plan[MAX_GPU_COUNT];

-  // GPU reduction results
-  float h_SumGPU[MAX_GPU_COUNT];
+    // GPU reduction results
+    float h_SumGPU[MAX_GPU_COUNT];

-  float sumGPU;
-  double sumCPU, diff;
+    float  sumGPU;
+    double sumCPU, diff;

-  int i, j, gpuBase, GPU_N;
+    int i, j, gpuBase, GPU_N;

-  const int BLOCK_N = 32;
-  const int THREAD_N = 256;
-  const int ACCUM_N = BLOCK_N * THREAD_N;
+    const int BLOCK_N  = 32;
+    const int THREAD_N = 256;
+    const int ACCUM_N  = BLOCK_N * THREAD_N;

-  printf("Starting simpleMultiGPU\n");
-  checkCudaErrors(cudaGetDeviceCount(&GPU_N));
+    printf("Starting simpleMultiGPU\n");
+    checkCudaErrors(cudaGetDeviceCount(&GPU_N));

-  if (GPU_N > MAX_GPU_COUNT) {
-    GPU_N = MAX_GPU_COUNT;
-  }
+    if (GPU_N > MAX_GPU_COUNT) {
+        GPU_N = MAX_GPU_COUNT;
+    }

-  printf("CUDA-capable device count: %i\n", GPU_N);
+    printf("CUDA-capable device count: %i\n", GPU_N);

-  printf("Generating input data...\n\n");
+    printf("Generating input data...\n\n");

-  // Subdividing input data across GPUs
-  // Get data sizes for each GPU
-  for (i = 0; i < GPU_N; i++) {
-    plan[i].dataN = DATA_N / GPU_N;
-  }
+    // Subdividing input data across GPUs
+    // Get data sizes for each GPU
+    for (i = 0; i < GPU_N; i++) {
+        plan[i].dataN = DATA_N / GPU_N;
+    }

-  // Take into account "odd" data sizes
-  for (i = 0; i < DATA_N % GPU_N; i++) {
-    plan[i].dataN++;
-  }
+    // Take into account "odd" data sizes
+    for (i = 0; i < DATA_N % GPU_N; i++) {
+        plan[i].dataN++;
+    }

-  // Assign data ranges to GPUs
-  gpuBase = 0;
+    // Assign data ranges to GPUs
+    gpuBase = 0;

-  for (i = 0; i < GPU_N; i++) {
-    plan[i].h_Sum = h_SumGPU + i;
-    gpuBase += plan[i].dataN;
-  }
+    for (i = 0; i < GPU_N; i++) {
+        plan[i].h_Sum = h_SumGPU + i;
+        gpuBase += plan[i].dataN;
+    }

-  // Create streams for issuing GPU command asynchronously and allocate memory
-  // (GPU and System page-locked)
-  for (i = 0; i < GPU_N; i++) {
-    checkCudaErrors(cudaSetDevice(i));
-    checkCudaErrors(cudaStreamCreate(&plan[i].stream));
-    // Allocate memory
-    checkCudaErrors(
-        cudaMalloc((void **)&plan[i].d_Data, plan[i].dataN * sizeof(float)));
-    checkCudaErrors(
-        cudaMalloc((void **)&plan[i].d_Sum, ACCUM_N * sizeof(float)));
-    checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Sum_from_device,
-                                   ACCUM_N * sizeof(float)));
-    checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Data,
-                                   plan[i].dataN * sizeof(float)));
-
-    for (j = 0; j < plan[i].dataN; j++) {
-      plan[i].h_Data[j] = (float)rand() / (float)RAND_MAX;
-    }
-  }
+    // Create streams for issuing GPU command asynchronously and allocate memory
+    // (GPU and System page-locked)
+    for (i = 0; i < GPU_N; i++) {
+        checkCudaErrors(cudaSetDevice(i));
+        checkCudaErrors(cudaStreamCreate(&plan[i].stream));
+        // Allocate memory
+        checkCudaErrors(cudaMalloc((void **)&plan[i].d_Data, plan[i].dataN * sizeof(float)));
+        checkCudaErrors(cudaMalloc((void **)&plan[i].d_Sum, ACCUM_N * sizeof(float)));
+        checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Sum_from_device, ACCUM_N * sizeof(float)));
+        checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Data, plan[i].dataN * sizeof(float)));
+
+        for (j = 0; j < plan[i].dataN; j++) {
+            plan[i].h_Data[j] = (float)rand() / (float)RAND_MAX;
+        }
+    }

-  // Start timing and compute on GPU(s)
-  printf("Computing with %d GPUs...\n", GPU_N);
-  // create and start timer
-  StopWatchInterface *timer = NULL;
-  sdkCreateTimer(&timer);
+    // Start timing and compute on GPU(s)
+    printf("Computing with %d GPUs...\n", GPU_N);
+    // create and start timer
+    StopWatchInterface *timer = NULL;
+    sdkCreateTimer(&timer);

-  // start the timer
-  sdkStartTimer(&timer);
+    // start the timer
+    sdkStartTimer(&timer);

-  // Copy data to GPU, launch the kernel and copy data back. All asynchronously
-  for (i = 0; i < GPU_N; i++) {
-    // Set device
-    checkCudaErrors(cudaSetDevice(i));
-
-    // Copy input data from CPU
-    checkCudaErrors(cudaMemcpyAsync(plan[i].d_Data, plan[i].h_Data,
-                                    plan[i].dataN * sizeof(float),
-                                    cudaMemcpyHostToDevice, plan[i].stream));
-
-    // Perform GPU computations
-    reduceKernel<<<BLOCK_N, THREAD_N, 0, plan[i].stream>>>(
-        plan[i].d_Sum, plan[i].d_Data, plan[i].dataN);
-    getLastCudaError("reduceKernel() execution failed.\n");
-
-    // Read back GPU results
-    checkCudaErrors(cudaMemcpyAsync(plan[i].h_Sum_from_device, plan[i].d_Sum,
-                                    ACCUM_N * sizeof(float),
-                                    cudaMemcpyDeviceToHost, plan[i].stream));
-  }
+    // Copy data to GPU, launch the kernel and copy data back. All asynchronously
+    for (i = 0; i < GPU_N; i++) {
+        // Set device
+        checkCudaErrors(cudaSetDevice(i));
+
+        // Copy input data from CPU
+        checkCudaErrors(cudaMemcpyAsync(
+            plan[i].d_Data, plan[i].h_Data, plan[i].dataN * sizeof(float), cudaMemcpyHostToDevice, plan[i].stream));
+
+        // Perform GPU computations
+        reduceKernel<<<BLOCK_N, THREAD_N, 0, plan[i].stream>>>(plan[i].d_Sum, plan[i].d_Data, plan[i].dataN);
+        getLastCudaError("reduceKernel() execution failed.\n");
+
+        // Read back GPU results
+        checkCudaErrors(cudaMemcpyAsync(
+            plan[i].h_Sum_from_device, plan[i].d_Sum, ACCUM_N * sizeof(float), cudaMemcpyDeviceToHost, plan[i].stream));
+    }

-  // Process GPU results
-  for (i = 0; i < GPU_N; i++) {
-    float sum;
-
-    // Set device
-    checkCudaErrors(cudaSetDevice(i));
-
-    // Wait for all operations to finish
-    cudaStreamSynchronize(plan[i].stream);
-
-    // Finalize GPU reduction for current subvector
-    sum = 0;
-
-    for (j = 0; j < ACCUM_N; j++) {
-      sum += plan[i].h_Sum_from_device[j];
-    }
-
-    *(plan[i].h_Sum) = (float)sum;
-
-    // Shut down this GPU
-    checkCudaErrors(cudaFreeHost(plan[i].h_Sum_from_device));
-    checkCudaErrors(cudaFree(plan[i].d_Sum));
-    checkCudaErrors(cudaFree(plan[i].d_Data));
-    checkCudaErrors(cudaStreamDestroy(plan[i].stream));
-  }
+    // Process GPU results
+    for (i = 0; i < GPU_N; i++) {
+        float sum;
+
+        // Set device
+        checkCudaErrors(cudaSetDevice(i));
+
+        // Wait for all operations to finish
+        cudaStreamSynchronize(plan[i].stream);
+
+        // Finalize GPU reduction for current subvector
+        sum = 0;
+
+        for (j = 0; j < ACCUM_N; j++) {
+            sum += plan[i].h_Sum_from_device[j];
+        }
+
+        *(plan[i].h_Sum) = (float)sum;
+
+        // Shut down this GPU
+        checkCudaErrors(cudaFreeHost(plan[i].h_Sum_from_device));
+        checkCudaErrors(cudaFree(plan[i].d_Sum));
+        checkCudaErrors(cudaFree(plan[i].d_Data));
+        checkCudaErrors(cudaStreamDestroy(plan[i].stream));
+    }

-  sumGPU = 0;
+    sumGPU = 0;

-  for (i = 0; i < GPU_N; i++) {
-    sumGPU += h_SumGPU[i];
-  }
+    for (i = 0; i < GPU_N; i++) {
+        sumGPU += h_SumGPU[i];
+    }

-  sdkStopTimer(&timer);
-  printf("  GPU Processing time: %f (ms)\n\n", sdkGetTimerValue(&timer));
-  sdkDeleteTimer(&timer);
+    sdkStopTimer(&timer);
+    printf("  GPU Processing time: %f (ms)\n\n", sdkGetTimerValue(&timer));
+    sdkDeleteTimer(&timer);

-  // Compute on Host CPU
-  printf("Computing with Host CPU...\n\n");
+    // Compute on Host CPU
+    printf("Computing with Host CPU...\n\n");

-  sumCPU = 0;
+    sumCPU = 0;

-  for (i = 0; i < GPU_N; i++) {
-    for (j = 0; j < plan[i].dataN; j++) {
-      sumCPU += plan[i].h_Data[j];
-    }
-  }
+    for (i = 0; i < GPU_N; i++) {
+        for (j = 0; j < plan[i].dataN; j++) {
+            sumCPU += plan[i].h_Data[j];
+        }
+    }

-  // Compare GPU and CPU results
-  printf("Comparing GPU and Host CPU results...\n");
-  diff = fabs(sumCPU - sumGPU) / fabs(sumCPU);
-  printf("  GPU sum: %f\n  CPU sum: %f\n", sumGPU, sumCPU);
-  printf("  Relative difference: %E \n\n", diff);
+    // Compare GPU and CPU results
+    printf("Comparing GPU and Host CPU results...\n");
+    diff = fabs(sumCPU - sumGPU) / fabs(sumCPU);
+    printf("  GPU sum: %f\n  CPU sum: %f\n", sumGPU, sumCPU);
+    printf("  Relative difference: %E \n\n", diff);

-  // Cleanup and shutdown
-  for (i = 0; i < GPU_N; i++) {
-    checkCudaErrors(cudaSetDevice(i));
-    checkCudaErrors(cudaFreeHost(plan[i].h_Data));
-  }
+    // Cleanup and shutdown
+    for (i = 0; i < GPU_N; i++) {
+        checkCudaErrors(cudaSetDevice(i));
+        checkCudaErrors(cudaFreeHost(plan[i].h_Data));
+    }

-  exit((diff < 1e-5) ? EXIT_SUCCESS : EXIT_FAILURE);
+    exit((diff < 1e-5) ? EXIT_SUCCESS : EXIT_FAILURE);
}

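A design point in main() above: the copy-launch-copy loop issues work on every GPU before anything is synchronized, and a separate loop performs the waits. This breadth-first issue order is what lets the GPUs run concurrently; synchronizing inside the first loop would serialize the devices. The skeleton:

    // Issue phase: enqueue async work on every device, wait on none.
    for (i = 0; i < GPU_N; i++) {
        checkCudaErrors(cudaSetDevice(i));
        // ... cudaMemcpyAsync H2D, kernel launch, cudaMemcpyAsync D2H,
        //     all enqueued on plan[i].stream ...
    }

    // Drain phase: only now block on each device's stream.
    for (i = 0; i < GPU_N; i++) {
        checkCudaErrors(cudaSetDevice(i));
        cudaStreamSynchronize(plan[i].stream);
    }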
@@ -37,26 +37,26 @@
#ifndef SIMPLEMULTIGPU_H
#define SIMPLEMULTIGPU_H

-typedef struct {
-  // Host-side input data
-  int dataN;
-  float *h_Data;
+typedef struct
+{
+    // Host-side input data
+    int    dataN;
+    float *h_Data;

-  // Partial sum for this GPU
-  float *h_Sum;
+    // Partial sum for this GPU
+    float *h_Sum;

-  // Device buffers
-  float *d_Data, *d_Sum;
+    // Device buffers
+    float *d_Data, *d_Sum;

-  // Reduction copied back from GPU
-  float *h_Sum_from_device;
+    // Reduction copied back from GPU
+    float *h_Sum_from_device;

-  // Stream for asynchronous command execution
-  cudaStream_t stream;
+    // Stream for asynchronous command execution
+    cudaStream_t stream;

} TGPUplan;

-extern "C" void launch_reduceKernel(float *d_Result, float *d_Input, int N,
-                                    int BLOCK_N, int THREAD_N, cudaStream_t &s);
+extern "C" void launch_reduceKernel(float *d_Result, float *d_Input, int N, int BLOCK_N, int THREAD_N, cudaStream_t &s);

#endif

@@ -25,8 +25,8 @@
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

+#include <helper_cuda.h> // helper functions for CUDA error check
#include <iostream>
-#include <helper_cuda.h>  // helper functions for CUDA error check

const int manualBlockSize = 32;

@@ -38,13 +38,14 @@ const int manualBlockSize = 32;
// execution configuration, including anything the launch configurator
// API suggests.
////////////////////////////////////////////////////////////////////////////////
-__global__ void square(int *array, int arrayCount) {
-  extern __shared__ int dynamicSmem[];
-  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+__global__ void square(int *array, int arrayCount)
+{
+    extern __shared__ int dynamicSmem[];
+    int                   idx = threadIdx.x + blockIdx.x * blockDim.x;

-  if (idx < arrayCount) {
-    array[idx] *= array[idx];
-  }
+    if (idx < arrayCount) {
+        array[idx] *= array[idx];
+    }
}

////////////////////////////////////////////////////////////////////////////////
@ -58,29 +59,28 @@ __global__ void square(int *array, int arrayCount) {
 | 
			
		||||
// This wrapper routine computes the occupancy of kernel, and reports
 | 
			
		||||
// it in terms of active warps / maximum warps per SM.
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
static double reportPotentialOccupancy(void *kernel, int blockSize,
 | 
			
		||||
                                       size_t dynamicSMem) {
 | 
			
		||||
  int device;
 | 
			
		||||
  cudaDeviceProp prop;
 | 
			
		||||
static double reportPotentialOccupancy(void *kernel, int blockSize, size_t dynamicSMem)
 | 
			
		||||
{
 | 
			
		||||
    int            device;
 | 
			
		||||
    cudaDeviceProp prop;
 | 
			
		||||
 | 
			
		||||
  int numBlocks;
 | 
			
		||||
  int activeWarps;
 | 
			
		||||
  int maxWarps;
 | 
			
		||||
    int numBlocks;
 | 
			
		||||
    int activeWarps;
 | 
			
		||||
    int maxWarps;
 | 
			
		||||
 | 
			
		||||
  double occupancy;
 | 
			
		||||
    double occupancy;
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaGetDevice(&device));
 | 
			
		||||
  checkCudaErrors(cudaGetDeviceProperties(&prop, device));
 | 
			
		||||
    checkCudaErrors(cudaGetDevice(&device));
 | 
			
		||||
    checkCudaErrors(cudaGetDeviceProperties(&prop, device));
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
 | 
			
		||||
      &numBlocks, kernel, blockSize, dynamicSMem));
 | 
			
		||||
    checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, kernel, blockSize, dynamicSMem));
 | 
			
		||||
 | 
			
		||||
  activeWarps = numBlocks * blockSize / prop.warpSize;
 | 
			
		||||
  maxWarps = prop.maxThreadsPerMultiProcessor / prop.warpSize;
 | 
			
		||||
    activeWarps = numBlocks * blockSize / prop.warpSize;
 | 
			
		||||
    maxWarps    = prop.maxThreadsPerMultiProcessor / prop.warpSize;
 | 
			
		||||
 | 
			
		||||
  occupancy = (double)activeWarps / maxWarps;
 | 
			
		||||
    occupancy = (double)activeWarps / maxWarps;
 | 
			
		||||
 | 
			
		||||
  return occupancy;
 | 
			
		||||
    return occupancy;
 | 
			
		||||
}
 | 
			
		||||
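A worked instance of the ratio computed above, using illustrative device figures:

// Assume warpSize = 32 and maxThreadsPerMultiProcessor = 2048, so
// maxWarps = 2048 / 32 = 64. If the calculator reports numBlocks = 16
// for blockSize = 128, then
//   activeWarps = 16 * 128 / 32 = 64
//   occupancy   = 64 / 64      = 1.0, i.e. 100% of the SM's warp slots.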

////////////////////////////////////////////////////////////////////////////////
@@ -99,65 +99,63 @@ static double reportPotentialOccupancy(void *kernel, int blockSize,
// This function configures the launch based on the "automatic"
// argument, records the runtime, and reports occupancy and runtime.
////////////////////////////////////////////////////////////////////////////////
static int launchConfig(int *array, int arrayCount, bool automatic) {
  int blockSize;
  int minGridSize;
  int gridSize;
  size_t dynamicSMemUsage = 0;
static int launchConfig(int *array, int arrayCount, bool automatic)
{
    int    blockSize;
    int    minGridSize;
    int    gridSize;
    size_t dynamicSMemUsage = 0;

  cudaEvent_t start;
  cudaEvent_t end;
    cudaEvent_t start;
    cudaEvent_t end;

  float elapsedTime;
    float elapsedTime;

  double potentialOccupancy;
    double potentialOccupancy;

  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&end));
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&end));

  if (automatic) {
    checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(
        &minGridSize, &blockSize, (void *)square, dynamicSMemUsage,
        arrayCount));
    if (automatic) {
        checkCudaErrors(
            cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)square, dynamicSMemUsage, arrayCount));

    std::cout << "Suggested block size: " << blockSize << std::endl
              << "Minimum grid size for maximum occupancy: " << minGridSize
              << std::endl;
  } else {
    // This block size is too small. Given limited number of
    // active blocks per multiprocessor, the number of active
    // threads will be limited, and thus unable to achieve maximum
    // occupancy.
        std::cout << "Suggested block size: " << blockSize << std::endl
                  << "Minimum grid size for maximum occupancy: " << minGridSize << std::endl;
    }
    else {
        // This block size is too small. Given limited number of
        // active blocks per multiprocessor, the number of active
        // threads will be limited, and thus unable to achieve maximum
        // occupancy.
        //
        blockSize = manualBlockSize;
    }

    // Round up
    //
    blockSize = manualBlockSize;
  }
    gridSize = (arrayCount + blockSize - 1) / blockSize;

  // Round up
  //
  gridSize = (arrayCount + blockSize - 1) / blockSize;
    // Launch and profile
    //
    checkCudaErrors(cudaEventRecord(start));
    square<<<gridSize, blockSize, dynamicSMemUsage>>>(array, arrayCount);
    checkCudaErrors(cudaEventRecord(end));

  // Launch and profile
  //
  checkCudaErrors(cudaEventRecord(start));
  square<<<gridSize, blockSize, dynamicSMemUsage>>>(array, arrayCount);
  checkCudaErrors(cudaEventRecord(end));
    checkCudaErrors(cudaDeviceSynchronize());

  checkCudaErrors(cudaDeviceSynchronize());
    // Calculate occupancy
    //
    potentialOccupancy = reportPotentialOccupancy((void *)square, blockSize, dynamicSMemUsage);

  // Calculate occupancy
  //
  potentialOccupancy =
      reportPotentialOccupancy((void *)square, blockSize, dynamicSMemUsage);
    std::cout << "Potential occupancy: " << potentialOccupancy * 100 << "%" << std::endl;

  std::cout << "Potential occupancy: " << potentialOccupancy * 100 << "%"
            << std::endl;
    // Report elapsed time
    //
    checkCudaErrors(cudaEventElapsedTime(&elapsedTime, start, end));
    std::cout << "Elapsed time: " << elapsedTime << "ms" << std::endl;

  // Report elapsed time
  //
  checkCudaErrors(cudaEventElapsedTime(&elapsedTime, start, end));
  std::cout << "Elapsed time: " << elapsedTime << "ms" << std::endl;

  return 0;
    return 0;
}
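Condensed to its essentials, the automatic path above follows a pattern that can be sketched standalone; array, arrayCount and checkCudaErrors are assumed from the surrounding sample:

// Sketch: let the runtime suggest a block size, then round the grid up so
// that gridSize * blockSize >= arrayCount covers every element.
int minGridSize = 0, blockSize = 0;
checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)square, 0, arrayCount));
int gridSize = (arrayCount + blockSize - 1) / blockSize; // ceiling division
square<<<gridSize, blockSize>>>(array, arrayCount);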

////////////////////////////////////////////////////////////////////////////////
@@ -166,41 +164,41 @@ static int launchConfig(int *array, int arrayCount, bool automatic) {
// The test generates an array and squares it with a CUDA kernel, then
// verifies the result.
////////////////////////////////////////////////////////////////////////////////
static int test(bool automaticLaunchConfig, const int count = 1000000) {
  int *array;
  int *dArray;
  int size = count * sizeof(int);
static int test(bool automaticLaunchConfig, const int count = 1000000)
{
    int *array;
    int *dArray;
    int  size = count * sizeof(int);

  array = new int[count];
    array = new int[count];

  for (int i = 0; i < count; i += 1) {
    array[i] = i;
  }

  checkCudaErrors(cudaMalloc(&dArray, size));
  checkCudaErrors(cudaMemcpy(dArray, array, size, cudaMemcpyHostToDevice));

  for (int i = 0; i < count; i += 1) {
    array[i] = 0;
  }

  launchConfig(dArray, count, automaticLaunchConfig);

  checkCudaErrors(cudaMemcpy(array, dArray, size, cudaMemcpyDeviceToHost));
  checkCudaErrors(cudaFree(dArray));

  // Verify the return data
  //
  for (int i = 0; i < count; i += 1) {
    if (array[i] != i * i) {
      std::cout << "element " << i << " expected " << i * i << " actual "
                << array[i] << std::endl;
      return 1;
    for (int i = 0; i < count; i += 1) {
        array[i] = i;
    }
  }
  delete[] array;

  return 0;
    checkCudaErrors(cudaMalloc(&dArray, size));
    checkCudaErrors(cudaMemcpy(dArray, array, size, cudaMemcpyHostToDevice));

    for (int i = 0; i < count; i += 1) {
        array[i] = 0;
    }

    launchConfig(dArray, count, automaticLaunchConfig);

    checkCudaErrors(cudaMemcpy(array, dArray, size, cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaFree(dArray));

    // Verify the return data
    //
    for (int i = 0; i < count; i += 1) {
        if (array[i] != i * i) {
            std::cout << "element " << i << " expected " << i * i << " actual " << array[i] << std::endl;
            return 1;
        }
    }
    delete[] array;

    return 0;
}

////////////////////////////////////////////////////////////////////////////////
@@ -210,31 +208,31 @@ static int test(bool automaticLaunchConfig, const int count = 1000000) {
// automatically configured launch, and reports the occupancy and
// performance.
////////////////////////////////////////////////////////////////////////////////
int main() {
  int status;
int main()
{
    int status;

  std::cout << "starting Simple Occupancy" << std::endl << std::endl;
    std::cout << "starting Simple Occupancy" << std::endl << std::endl;

  std::cout << "[ Manual configuration with " << manualBlockSize
            << " threads per block ]" << std::endl;
    std::cout << "[ Manual configuration with " << manualBlockSize << " threads per block ]" << std::endl;

  status = test(false);
  if (status) {
    std::cerr << "Test failed\n" << std::endl;
    return -1;
  }
    status = test(false);
    if (status) {
        std::cerr << "Test failed\n" << std::endl;
        return -1;
    }

  std::cout << std::endl;
    std::cout << std::endl;

  std::cout << "[ Automatic, occupancy-based configuration ]" << std::endl;
  status = test(true);
  if (status) {
    std::cerr << "Test failed\n" << std::endl;
    return -1;
  }
    std::cout << "[ Automatic, occupancy-based configuration ]" << std::endl;
    status = test(true);
    if (status) {
        std::cerr << "Test failed\n" << std::endl;
        return -1;
    }

  std::cout << std::endl;
  std::cout << "Test PASSED\n" << std::endl;
    std::cout << std::endl;
    std::cout << "Test PASSED\n" << std::endl;

  return 0;
    return 0;
}

@@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in the [Dependencies]() section above are installed.

## References (for more details)

@@ -31,230 +31,233 @@
 */

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <stdlib.h>

// CUDA includes
#include <cuda_runtime.h>

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h>  // helper for shared functions common to CUDA Samples
#include <helper_functions.h> // helper for shared functions common to CUDA Samples

__global__ void SimpleKernel(float *src, float *dst) {
  // Just a dummy kernel, doing enough for us to verify that everything
  // worked
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  dst[idx] = src[idx] * 2.0f;
__global__ void SimpleKernel(float *src, float *dst)
{
    // Just a dummy kernel, doing enough for us to verify that everything
    // worked
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    dst[idx]      = src[idx] * 2.0f;
}

inline bool IsAppBuiltAs64() { return sizeof(void *) == 8; }

int main(int argc, char **argv) {
  printf("[%s] - Starting...\n", argv[0]);
int main(int argc, char **argv)
{
    printf("[%s] - Starting...\n", argv[0]);

  if (!IsAppBuiltAs64()) {
    printf(
        "%s is only supported on 64-bit OSs and the application must be "
        "built as a 64-bit target.  Test is being waived.\n",
        argv[0]);
    exit(EXIT_WAIVED);
  }

  // Number of GPUs
  printf("Checking for multiple GPUs...\n");
  int gpu_n;
  checkCudaErrors(cudaGetDeviceCount(&gpu_n));
  printf("CUDA-capable device count: %i\n", gpu_n);

  if (gpu_n < 2) {
    printf(
        "Two or more GPUs with Peer-to-Peer access capability are required for "
        "%s.\n",
        argv[0]);
    printf("Waiving test.\n");
    exit(EXIT_WAIVED);
  }

  // Query device properties
  cudaDeviceProp prop[64];
  int gpuid[2];  // we want to find the first two GPUs that can support P2P

  for (int i = 0; i < gpu_n; i++) {
    checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
  }
  // Check possibility for peer access
  printf("\nChecking GPU(s) for support of peer to peer memory access...\n");

  int can_access_peer;
  int p2pCapableGPUs[2];  // We take only 1 pair of P2P capable GPUs
  p2pCapableGPUs[0] = p2pCapableGPUs[1] = -1;

  // Show all the combinations of supported P2P GPUs
  for (int i = 0; i < gpu_n; i++) {
    for (int j = 0; j < gpu_n; j++) {
      if (i == j) {
        continue;
      }
      checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, i, j));
      printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[i].name,
             i, prop[j].name, j, can_access_peer ? "Yes" : "No");
      if (can_access_peer && p2pCapableGPUs[0] == -1) {
        p2pCapableGPUs[0] = i;
        p2pCapableGPUs[1] = j;
      }
    if (!IsAppBuiltAs64()) {
        printf("%s is only supported on 64-bit OSs and the application must be "
               "built as a 64-bit target.  Test is being waived.\n",
               argv[0]);
        exit(EXIT_WAIVED);
    }
  }

  if (p2pCapableGPUs[0] == -1 || p2pCapableGPUs[1] == -1) {
    printf(
        "Two or more GPUs with Peer-to-Peer access capability are required for "
        "%s.\n",
        argv[0]);
    printf(
        "Peer to Peer access is not available amongst GPUs in the system, "
        "waiving test.\n");
    // Number of GPUs
    printf("Checking for multiple GPUs...\n");
    int gpu_n;
    checkCudaErrors(cudaGetDeviceCount(&gpu_n));
    printf("CUDA-capable device count: %i\n", gpu_n);

    exit(EXIT_WAIVED);
  }

  // Use first pair of p2p capable GPUs detected.
  gpuid[0] = p2pCapableGPUs[0];
  gpuid[1] = p2pCapableGPUs[1];

  // Enable peer access
  printf("Enabling peer access between GPU%d and GPU%d...\n", gpuid[0],
         gpuid[1]);
  checkCudaErrors(cudaSetDevice(gpuid[0]));
  checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[1], 0));
  checkCudaErrors(cudaSetDevice(gpuid[1]));
  checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[0], 0));

  // Allocate buffers
  const size_t buf_size = 1024 * 1024 * 16 * sizeof(float);
  printf("Allocating buffers (%iMB on GPU%d, GPU%d and CPU Host)...\n",
         int(buf_size / 1024 / 1024), gpuid[0], gpuid[1]);
  checkCudaErrors(cudaSetDevice(gpuid[0]));
  float *g0;
  checkCudaErrors(cudaMalloc(&g0, buf_size));
  checkCudaErrors(cudaSetDevice(gpuid[1]));
  float *g1;
  checkCudaErrors(cudaMalloc(&g1, buf_size));
  float *h0;
  checkCudaErrors(
      cudaMallocHost(&h0, buf_size));  // Automatically portable with UVA

  // Create CUDA event handles
  printf("Creating event handles...\n");
  cudaEvent_t start_event, stop_event;
  float time_memcpy;
  int eventflags = cudaEventBlockingSync;
  checkCudaErrors(cudaEventCreateWithFlags(&start_event, eventflags));
  checkCudaErrors(cudaEventCreateWithFlags(&stop_event, eventflags));

  // P2P memcopy() benchmark
  checkCudaErrors(cudaEventRecord(start_event, 0));

  for (int i = 0; i < 100; i++) {
    // With UVA we don't need to specify source and target devices, the
    // runtime figures this out by itself from the pointers
    // Ping-pong copy between GPUs
    if (i % 2 == 0) {
      checkCudaErrors(cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault));
    } else {
      checkCudaErrors(cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault));
    if (gpu_n < 2) {
        printf("Two or more GPUs with Peer-to-Peer access capability are required for "
               "%s.\n",
               argv[0]);
        printf("Waiving test.\n");
        exit(EXIT_WAIVED);
    }
  }

  checkCudaErrors(cudaEventRecord(stop_event, 0));
  checkCudaErrors(cudaEventSynchronize(stop_event));
  checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
  printf("cudaMemcpyPeer / cudaMemcpy between GPU%d and GPU%d: %.2fGB/s\n",
         gpuid[0], gpuid[1],
         (1.0f / (time_memcpy / 1000.0f)) * ((100.0f * buf_size)) / 1024.0f /
             1024.0f / 1024.0f);
    // Query device properties
    cudaDeviceProp prop[64];
    int            gpuid[2]; // we want to find the first two GPUs that can support P2P

  // Prepare host buffer and copy to GPU 0
  printf("Preparing host buffer and memcpy to GPU%d...\n", gpuid[0]);

  for (int i = 0; i < buf_size / sizeof(float); i++) {
    h0[i] = float(i % 4096);
  }

  checkCudaErrors(cudaSetDevice(gpuid[0]));
  checkCudaErrors(cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault));

  // Kernel launch configuration
  const dim3 threads(512, 1);
  const dim3 blocks((buf_size / sizeof(float)) / threads.x, 1);

  // Run kernel on GPU 1, reading input from the GPU 0 buffer, writing
  // output to the GPU 1 buffer
  printf(
      "Run kernel on GPU%d, taking source data from GPU%d and writing to "
      "GPU%d...\n",
      gpuid[1], gpuid[0], gpuid[1]);
  checkCudaErrors(cudaSetDevice(gpuid[1]));
  SimpleKernel<<<blocks, threads>>>(g0, g1);

  checkCudaErrors(cudaDeviceSynchronize());

  // Run kernel on GPU 0, reading input from the GPU 1 buffer, writing
  // output to the GPU 0 buffer
  printf(
      "Run kernel on GPU%d, taking source data from GPU%d and writing to "
      "GPU%d...\n",
      gpuid[0], gpuid[1], gpuid[0]);
  checkCudaErrors(cudaSetDevice(gpuid[0]));
  SimpleKernel<<<blocks, threads>>>(g1, g0);

  checkCudaErrors(cudaDeviceSynchronize());

  // Copy data back to host and verify
  printf("Copy data back to host from GPU%d and verify results...\n", gpuid[0]);
  checkCudaErrors(cudaMemcpy(h0, g0, buf_size, cudaMemcpyDefault));

  int error_count = 0;

  for (int i = 0; i < buf_size / sizeof(float); i++) {
    // Re-generate input data and apply 2x '* 2.0f' computation of both
    // kernel runs
    if (h0[i] != float(i % 4096) * 2.0f * 2.0f) {
      printf("Verification error @ element %i: val = %f, ref = %f\n", i, h0[i],
             (float(i % 4096) * 2.0f * 2.0f));

      if (error_count++ > 10) {
        break;
      }
    for (int i = 0; i < gpu_n; i++) {
        checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
    }
  }
    // Check possibility for peer access
    printf("\nChecking GPU(s) for support of peer to peer memory access...\n");

  // Disable peer access (also unregisters memory for non-UVA cases)
  printf("Disabling peer access...\n");
  checkCudaErrors(cudaSetDevice(gpuid[0]));
  checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[1]));
  checkCudaErrors(cudaSetDevice(gpuid[1]));
  checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[0]));
    int can_access_peer;
    int p2pCapableGPUs[2]; // We take only 1 pair of P2P capable GPUs
    p2pCapableGPUs[0] = p2pCapableGPUs[1] = -1;

  // Cleanup and shutdown
  printf("Shutting down...\n");
  checkCudaErrors(cudaEventDestroy(start_event));
  checkCudaErrors(cudaEventDestroy(stop_event));
  checkCudaErrors(cudaSetDevice(gpuid[0]));
  checkCudaErrors(cudaFree(g0));
  checkCudaErrors(cudaSetDevice(gpuid[1]));
  checkCudaErrors(cudaFree(g1));
  checkCudaErrors(cudaFreeHost(h0));
    // Show all the combinations of supported P2P GPUs
    for (int i = 0; i < gpu_n; i++) {
        for (int j = 0; j < gpu_n; j++) {
            if (i == j) {
                continue;
            }
            checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, i, j));
            printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
                   prop[i].name,
                   i,
                   prop[j].name,
                   j,
                   can_access_peer ? "Yes" : "No");
            if (can_access_peer && p2pCapableGPUs[0] == -1) {
                p2pCapableGPUs[0] = i;
                p2pCapableGPUs[1] = j;
            }
        }
    }

  for (int i = 0; i < gpu_n; i++) {
    checkCudaErrors(cudaSetDevice(i));
  }
    if (p2pCapableGPUs[0] == -1 || p2pCapableGPUs[1] == -1) {
        printf("Two or more GPUs with Peer-to-Peer access capability are required for "
               "%s.\n",
               argv[0]);
        printf("Peer to Peer access is not available amongst GPUs in the system, "
               "waiving test.\n");

  if (error_count != 0) {
    printf("Test failed!\n");
    exit(EXIT_FAILURE);
  } else {
    printf("Test passed\n");
    exit(EXIT_SUCCESS);
  }
        exit(EXIT_WAIVED);
    }

    // Use first pair of p2p capable GPUs detected.
    gpuid[0] = p2pCapableGPUs[0];
    gpuid[1] = p2pCapableGPUs[1];

    // Enable peer access
    printf("Enabling peer access between GPU%d and GPU%d...\n", gpuid[0], gpuid[1]);
    checkCudaErrors(cudaSetDevice(gpuid[0]));
    checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[1], 0));
    checkCudaErrors(cudaSetDevice(gpuid[1]));
    checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[0], 0));

    // Allocate buffers
    const size_t buf_size = 1024 * 1024 * 16 * sizeof(float);
    printf(
        "Allocating buffers (%iMB on GPU%d, GPU%d and CPU Host)...\n", int(buf_size / 1024 / 1024), gpuid[0], gpuid[1]);
    checkCudaErrors(cudaSetDevice(gpuid[0]));
    float *g0;
    checkCudaErrors(cudaMalloc(&g0, buf_size));
    checkCudaErrors(cudaSetDevice(gpuid[1]));
    float *g1;
    checkCudaErrors(cudaMalloc(&g1, buf_size));
    float *h0;
    checkCudaErrors(cudaMallocHost(&h0, buf_size)); // Automatically portable with UVA

    // Create CUDA event handles
    printf("Creating event handles...\n");
    cudaEvent_t start_event, stop_event;
    float       time_memcpy;
    int         eventflags = cudaEventBlockingSync;
    checkCudaErrors(cudaEventCreateWithFlags(&start_event, eventflags));
    checkCudaErrors(cudaEventCreateWithFlags(&stop_event, eventflags));

    // P2P memcopy() benchmark
    checkCudaErrors(cudaEventRecord(start_event, 0));

    for (int i = 0; i < 100; i++) {
        // With UVA we don't need to specify source and target devices, the
        // runtime figures this out by itself from the pointers
        // Ping-pong copy between GPUs
        if (i % 2 == 0) {
            checkCudaErrors(cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault));
        }
        else {
            checkCudaErrors(cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault));
        }
    }

    checkCudaErrors(cudaEventRecord(stop_event, 0));
    checkCudaErrors(cudaEventSynchronize(stop_event));
    checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
    printf("cudaMemcpyPeer / cudaMemcpy between GPU%d and GPU%d: %.2fGB/s\n",
           gpuid[0],
           gpuid[1],
           (1.0f / (time_memcpy / 1000.0f)) * ((100.0f * buf_size)) / 1024.0f / 1024.0f / 1024.0f);
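To make the printed figure concrete, a worked reading of the expression above with an assumed timing:

// The loop performs 100 copies of buf_size bytes, so the expression is
//   GB/s = (100 * buf_size) / (time_memcpy in seconds) / 1024^3.
// buf_size = 1024 * 1024 * 16 * sizeof(float) = 64 MiB, so the benchmark
// moves 100 * 64 MiB = 6.25 GiB; an assumed time_memcpy of 500 ms would
// print (1 / 0.5) * 6.25 = 12.50 GB/s.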

    // Prepare host buffer and copy to GPU 0
    printf("Preparing host buffer and memcpy to GPU%d...\n", gpuid[0]);

    for (int i = 0; i < buf_size / sizeof(float); i++) {
        h0[i] = float(i % 4096);
    }

    checkCudaErrors(cudaSetDevice(gpuid[0]));
    checkCudaErrors(cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault));

    // Kernel launch configuration
    const dim3 threads(512, 1);
    const dim3 blocks((buf_size / sizeof(float)) / threads.x, 1);

    // Run kernel on GPU 1, reading input from the GPU 0 buffer, writing
    // output to the GPU 1 buffer
    printf("Run kernel on GPU%d, taking source data from GPU%d and writing to "
           "GPU%d...\n",
           gpuid[1],
           gpuid[0],
           gpuid[1]);
    checkCudaErrors(cudaSetDevice(gpuid[1]));
    SimpleKernel<<<blocks, threads>>>(g0, g1);

    checkCudaErrors(cudaDeviceSynchronize());

    // Run kernel on GPU 0, reading input from the GPU 1 buffer, writing
    // output to the GPU 0 buffer
    printf("Run kernel on GPU%d, taking source data from GPU%d and writing to "
           "GPU%d...\n",
           gpuid[0],
           gpuid[1],
           gpuid[0]);
    checkCudaErrors(cudaSetDevice(gpuid[0]));
    SimpleKernel<<<blocks, threads>>>(g1, g0);

    checkCudaErrors(cudaDeviceSynchronize());

    // Copy data back to host and verify
    printf("Copy data back to host from GPU%d and verify results...\n", gpuid[0]);
    checkCudaErrors(cudaMemcpy(h0, g0, buf_size, cudaMemcpyDefault));

    int error_count = 0;

    for (int i = 0; i < buf_size / sizeof(float); i++) {
        // Re-generate input data and apply 2x '* 2.0f' computation of both
        // kernel runs
        if (h0[i] != float(i % 4096) * 2.0f * 2.0f) {
            printf("Verification error @ element %i: val = %f, ref = %f\n", i, h0[i], (float(i % 4096) * 2.0f * 2.0f));

            if (error_count++ > 10) {
                break;
            }
        }
    }

    // Disable peer access (also unregisters memory for non-UVA cases)
    printf("Disabling peer access...\n");
    checkCudaErrors(cudaSetDevice(gpuid[0]));
    checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[1]));
    checkCudaErrors(cudaSetDevice(gpuid[1]));
    checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[0]));

    // Cleanup and shutdown
    printf("Shutting down...\n");
    checkCudaErrors(cudaEventDestroy(start_event));
    checkCudaErrors(cudaEventDestroy(stop_event));
    checkCudaErrors(cudaSetDevice(gpuid[0]));
    checkCudaErrors(cudaFree(g0));
    checkCudaErrors(cudaSetDevice(gpuid[1]));
    checkCudaErrors(cudaFree(g1));
    checkCudaErrors(cudaFreeHost(h0));

    for (int i = 0; i < gpu_n; i++) {
        checkCudaErrors(cudaSetDevice(i));
    }

    if (error_count != 0) {
        printf("Test failed!\n");
        exit(EXIT_FAILURE);
    }
    else {
        printf("Test passed\n");
        exit(EXIT_SUCCESS);
    }
}
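For completeness, a standalone sketch, not part of the commit, of the capability probe this sample's GPU-pair search performs; device indices 0 and 1 are assumed:

// Peer access must be reported in both directions before either device
// calls cudaDeviceEnablePeerAccess on the other.
int canAccess01 = 0, canAccess10 = 0;
checkCudaErrors(cudaDeviceCanAccessPeer(&canAccess01, 0, 1));
checkCudaErrors(cudaDeviceCanAccessPeer(&canAccess10, 1, 0));
printf("P2P 0->1: %s, 1->0: %s\n", canAccess01 ? "Yes" : "No", canAccess10 ? "Yes" : "No");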

@@ -26,16 +26,16 @@
 */

/* pitchLinearTexture
*
* This example demonstrates how to use textures bound to pitch linear memory.
* It performs a shift of matrix elements using wrap addressing mode (aka
* periodic boundary conditions) on two arrays, a pitch linear and a CUDA array,
* in order to highlight the differences in using each.
*
* Textures binding to pitch linear memory is a new feature in CUDA 2.2,
* and allows use of texture features such as wrap addressing mode and
* filtering which are not possible with textures bound to regular linear memory
*/
 *
 * This example demonstrates how to use textures bound to pitch linear memory.
 * It performs a shift of matrix elements using wrap addressing mode (aka
 * periodic boundary conditions) on two arrays, a pitch linear and a CUDA array,
 * in order to highlight the differences in using each.
 *
 * Textures binding to pitch linear memory is a new feature in CUDA 2.2,
 * and allows use of texture features such as wrap addressing mode and
 * filtering which are not possible with textures bound to regular linear memory
 */

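A worked instance of the wrap (periodic boundary) addressing described above, using this sample's dimensions:

// With normalizedCoords = true and cudaAddressModeWrap, coordinates wrap
// modulo 1.0. For nx = 2048, x_shift = 5 and xid = 2046:
//   u = (2046 + 5) / 2048 = 1.00146...  which wraps to 0.00146...
// so the fetch lands on texel 0.00146... * 2048 = 3, i.e. (2046 + 5) mod 2048.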
// includes, system
 | 
			
		||||
#include <stdio.h>
 | 
			
		||||
@ -50,13 +50,13 @@
 | 
			
		||||
#include <cuda_runtime.h>
 | 
			
		||||
 | 
			
		||||
// Utilities and timing functions
 | 
			
		||||
#include <helper_functions.h>  // includes cuda.h and cuda_runtime_api.h
 | 
			
		||||
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h
 | 
			
		||||
 | 
			
		||||
// CUDA helper functions
 | 
			
		||||
#include <helper_cuda.h>  // helper functions for CUDA error check
 | 
			
		||||
#include <helper_cuda.h> // helper functions for CUDA error check
 | 
			
		||||
 | 
			
		||||
#define NUM_REPS 100  // number of repetitions performed
 | 
			
		||||
#define TILE_DIM 16   // tile/block size
 | 
			
		||||
#define NUM_REPS 100 // number of repetitions performed
 | 
			
		||||
#define TILE_DIM 16  // tile/block size
 | 
			
		||||
 | 
			
		||||
const char *sSDKsample = "simplePitchLinearTexture";
 | 
			
		||||
 | 
			
		||||
@ -70,29 +70,26 @@ bool bTestResult = true;
 | 
			
		||||
//! Shifts matrix elements using pitch linear array
 | 
			
		||||
//! @param odata  output data in global memory
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
__global__ void shiftPitchLinear(float *odata, int pitch, int width, int height,
 | 
			
		||||
                                 int shiftX, int shiftY,
 | 
			
		||||
                                 cudaTextureObject_t texRefPL) {
 | 
			
		||||
  int xid = blockIdx.x * blockDim.x + threadIdx.x;
 | 
			
		||||
  int yid = blockIdx.y * blockDim.y + threadIdx.y;
 | 
			
		||||
__global__ void
 | 
			
		||||
shiftPitchLinear(float *odata, int pitch, int width, int height, int shiftX, int shiftY, cudaTextureObject_t texRefPL)
 | 
			
		||||
{
 | 
			
		||||
    int xid = blockIdx.x * blockDim.x + threadIdx.x;
 | 
			
		||||
    int yid = blockIdx.y * blockDim.y + threadIdx.y;
 | 
			
		||||
 | 
			
		||||
  odata[yid * pitch + xid] = tex2D<float>(
 | 
			
		||||
      texRefPL, (xid + shiftX) / (float)width, (yid + shiftY) / (float)height);
 | 
			
		||||
    odata[yid * pitch + xid] = tex2D<float>(texRefPL, (xid + shiftX) / (float)width, (yid + shiftY) / (float)height);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
//! Shifts matrix elements using regular array
 | 
			
		||||
//! @param odata  output data in global memory
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
__global__ void shiftArray(float *odata, int pitch, int width, int height,
 | 
			
		||||
                           int shiftX, int shiftY,
 | 
			
		||||
                           cudaTextureObject_t texRefArray) {
 | 
			
		||||
  int xid = blockIdx.x * blockDim.x + threadIdx.x;
 | 
			
		||||
  int yid = blockIdx.y * blockDim.y + threadIdx.y;
 | 
			
		||||
__global__ void
 | 
			
		||||
shiftArray(float *odata, int pitch, int width, int height, int shiftX, int shiftY, cudaTextureObject_t texRefArray)
 | 
			
		||||
{
 | 
			
		||||
    int xid = blockIdx.x * blockDim.x + threadIdx.x;
 | 
			
		||||
    int yid = blockIdx.y * blockDim.y + threadIdx.y;
 | 
			
		||||
 | 
			
		||||
  odata[yid * pitch + xid] =
 | 
			
		||||
      tex2D<float>(texRefArray, (xid + shiftX) / (float)width,
 | 
			
		||||
                   (yid + shiftY) / (float)height);
 | 
			
		||||
    odata[yid * pitch + xid] = tex2D<float>(texRefArray, (xid + shiftX) / (float)width, (yid + shiftY) / (float)height);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
@ -102,210 +99,199 @@ void runTest(int argc, char **argv);
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Program main
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
int main(int argc, char **argv) {
 | 
			
		||||
  printf("%s starting...\n\n", sSDKsample);
 | 
			
		||||
int main(int argc, char **argv)
 | 
			
		||||
{
 | 
			
		||||
    printf("%s starting...\n\n", sSDKsample);
 | 
			
		||||
 | 
			
		||||
  runTest(argc, argv);
 | 
			
		||||
    runTest(argc, argv);
 | 
			
		||||
 | 
			
		||||
  printf("%s completed, returned %s\n", sSDKsample,
 | 
			
		||||
         bTestResult ? "OK" : "ERROR!");
 | 
			
		||||
  exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
 | 
			
		||||
    printf("%s completed, returned %s\n", sSDKsample, bTestResult ? "OK" : "ERROR!");
 | 
			
		||||
    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
//! Run a simple test for CUDA
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
void runTest(int argc, char **argv) {
 | 
			
		||||
  // Set array size
 | 
			
		||||
  const int nx = 2048;
 | 
			
		||||
  const int ny = 2048;
 | 
			
		||||
void runTest(int argc, char **argv)
 | 
			
		||||
{
 | 
			
		||||
    // Set array size
 | 
			
		||||
    const int nx = 2048;
 | 
			
		||||
    const int ny = 2048;
 | 
			
		||||
 | 
			
		||||
  // Setup shifts applied to x and y data
 | 
			
		||||
  const int x_shift = 5;
 | 
			
		||||
  const int y_shift = 7;
 | 
			
		||||
    // Setup shifts applied to x and y data
 | 
			
		||||
    const int x_shift = 5;
 | 
			
		||||
    const int y_shift = 7;
 | 
			
		||||
 | 
			
		||||
  if ((nx % TILE_DIM != 0) || (ny % TILE_DIM != 0)) {
 | 
			
		||||
    printf("nx and ny must be multiples of TILE_DIM\n");
 | 
			
		||||
    exit(EXIT_FAILURE);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Setup execution configuration parameters
 | 
			
		||||
  dim3 dimGrid(nx / TILE_DIM, ny / TILE_DIM), dimBlock(TILE_DIM, TILE_DIM);
 | 
			
		||||
 | 
			
		||||
  // This will pick the best possible CUDA capable device
 | 
			
		||||
  int devID = findCudaDevice(argc, (const char **)argv);
 | 
			
		||||
 | 
			
		||||
  // CUDA events for timing
 | 
			
		||||
  cudaEvent_t start, stop;
 | 
			
		||||
  cudaEventCreate(&start);
 | 
			
		||||
  cudaEventCreate(&stop);
 | 
			
		||||
 | 
			
		||||
  // Host allocation and initialization
 | 
			
		||||
  float *h_idata = (float *)malloc(sizeof(float) * nx * ny);
 | 
			
		||||
  float *h_odata = (float *)malloc(sizeof(float) * nx * ny);
 | 
			
		||||
  float *gold = (float *)malloc(sizeof(float) * nx * ny);
 | 
			
		||||
 | 
			
		||||
  for (int i = 0; i < nx * ny; ++i) {
 | 
			
		||||
    h_idata[i] = (float)i;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Device memory allocation
 | 
			
		||||
  // Pitch linear input data
 | 
			
		||||
  float *d_idataPL;
 | 
			
		||||
  size_t d_pitchBytes;
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaMallocPitch((void **)&d_idataPL, &d_pitchBytes,
 | 
			
		||||
                                  nx * sizeof(float), ny));
 | 
			
		||||
 | 
			
		||||
  // Array input data
 | 
			
		||||
  cudaArray *d_idataArray;
 | 
			
		||||
  cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaMallocArray(&d_idataArray, &channelDesc, nx, ny));
 | 
			
		||||
 | 
			
		||||
  // Pitch linear output data
 | 
			
		||||
  float *d_odata;
 | 
			
		||||
  checkCudaErrors(cudaMallocPitch((void **)&d_odata, &d_pitchBytes,
 | 
			
		||||
                                  nx * sizeof(float), ny));
 | 
			
		||||
 | 
			
		||||
  // Copy host data to device
 | 
			
		||||
  // Pitch linear
 | 
			
		||||
  size_t h_pitchBytes = nx * sizeof(float);
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaMemcpy2D(d_idataPL, d_pitchBytes, h_idata, h_pitchBytes,
 | 
			
		||||
                               nx * sizeof(float), ny, cudaMemcpyHostToDevice));
 | 
			
		||||
 | 
			
		||||
  // Array
 | 
			
		||||
  checkCudaErrors(cudaMemcpyToArray(d_idataArray, 0, 0, h_idata,
 | 
			
		||||
                                    nx * ny * sizeof(float),
 | 
			
		||||
                                    cudaMemcpyHostToDevice));
 | 
			
		||||
 | 
			
		||||
  cudaTextureObject_t texRefPL;
 | 
			
		||||
  cudaTextureObject_t texRefArray;
 | 
			
		||||
  cudaResourceDesc texRes;
 | 
			
		||||
  memset(&texRes, 0, sizeof(cudaResourceDesc));
 | 
			
		||||
 | 
			
		||||
  texRes.resType = cudaResourceTypePitch2D;
 | 
			
		||||
  texRes.res.pitch2D.devPtr = d_idataPL;
 | 
			
		||||
  texRes.res.pitch2D.desc = channelDesc;
 | 
			
		||||
  texRes.res.pitch2D.width = nx;
 | 
			
		||||
  texRes.res.pitch2D.height = ny;
 | 
			
		||||
  texRes.res.pitch2D.pitchInBytes = h_pitchBytes;
 | 
			
		||||
  cudaTextureDesc texDescr;
 | 
			
		||||
  memset(&texDescr, 0, sizeof(cudaTextureDesc));
 | 
			
		||||
 | 
			
		||||
  texDescr.normalizedCoords = true;
 | 
			
		||||
  texDescr.filterMode = cudaFilterModePoint;
 | 
			
		||||
  texDescr.addressMode[0] = cudaAddressModeWrap;
 | 
			
		||||
  texDescr.addressMode[1] = cudaAddressModeWrap;
 | 
			
		||||
  texDescr.readMode = cudaReadModeElementType;
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaCreateTextureObject(&texRefPL, &texRes, &texDescr, NULL));
 | 
			
		||||
  memset(&texRes, 0, sizeof(cudaResourceDesc));
 | 
			
		||||
  memset(&texDescr, 0, sizeof(cudaTextureDesc));
 | 
			
		||||
  texRes.resType = cudaResourceTypeArray;
 | 
			
		||||
  texRes.res.array.array = d_idataArray;
 | 
			
		||||
  texDescr.normalizedCoords = true;
 | 
			
		||||
  texDescr.filterMode = cudaFilterModePoint;
 | 
			
		||||
  texDescr.addressMode[0] = cudaAddressModeWrap;
 | 
			
		||||
  texDescr.addressMode[1] = cudaAddressModeWrap;
 | 
			
		||||
  texDescr.readMode = cudaReadModeElementType;
 | 
			
		||||
  checkCudaErrors(
 | 
			
		||||
      cudaCreateTextureObject(&texRefArray, &texRes, &texDescr, NULL));
 | 
			
		||||
 | 
			
		||||
  // Reference calculation
 | 
			
		||||
  for (int j = 0; j < ny; ++j) {
 | 
			
		||||
    int jshift = (j + y_shift) % ny;
 | 
			
		||||
 | 
			
		||||
    for (int i = 0; i < nx; ++i) {
 | 
			
		||||
      int ishift = (i + x_shift) % nx;
 | 
			
		||||
      gold[j * nx + i] = h_idata[jshift * nx + ishift];
 | 
			
		||||
    if ((nx % TILE_DIM != 0) || (ny % TILE_DIM != 0)) {
 | 
			
		||||
        printf("nx and ny must be multiples of TILE_DIM\n");
 | 
			
		||||
        exit(EXIT_FAILURE);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Run ShiftPitchLinear kernel
 | 
			
		||||
  checkCudaErrors(
 | 
			
		||||
      cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny));
 | 
			
		||||
    // Setup execution configuration parameters
 | 
			
		||||
    dim3 dimGrid(nx / TILE_DIM, ny / TILE_DIM), dimBlock(TILE_DIM, TILE_DIM);
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaEventRecord(start, 0));
 | 
			
		||||
    // This will pick the best possible CUDA capable device
 | 
			
		||||
    int devID = findCudaDevice(argc, (const char **)argv);
 | 
			
		||||
 | 
			
		||||
  for (int i = 0; i < NUM_REPS; ++i) {
 | 
			
		||||
    shiftPitchLinear<<<dimGrid, dimBlock>>>(d_odata,
 | 
			
		||||
                                            (int)(d_pitchBytes / sizeof(float)),
 | 
			
		||||
                                            nx, ny, x_shift, y_shift, texRefPL);
 | 
			
		||||
  }
 | 
			
		||||
    // CUDA events for timing
 | 
			
		||||
    cudaEvent_t start, stop;
 | 
			
		||||
    cudaEventCreate(&start);
 | 
			
		||||
    cudaEventCreate(&stop);
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaEventRecord(stop, 0));
 | 
			
		||||
  checkCudaErrors(cudaEventSynchronize(stop));
 | 
			
		||||
  float timePL;
 | 
			
		||||
  checkCudaErrors(cudaEventElapsedTime(&timePL, start, stop));
 | 
			
		||||
    // Host allocation and initialization
 | 
			
		||||
    float *h_idata = (float *)malloc(sizeof(float) * nx * ny);
 | 
			
		||||
    float *h_odata = (float *)malloc(sizeof(float) * nx * ny);
 | 
			
		||||
    float *gold    = (float *)malloc(sizeof(float) * nx * ny);
 | 
			
		||||
 | 
			
		||||
  // Check results
 | 
			
		||||
  checkCudaErrors(cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes,
 | 
			
		||||
                               nx * sizeof(float), ny, cudaMemcpyDeviceToHost));
 | 
			
		||||
    for (int i = 0; i < nx * ny; ++i) {
 | 
			
		||||
        h_idata[i] = (float)i;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  bool res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f);
 | 
			
		||||
    // Device memory allocation
 | 
			
		||||
    // Pitch linear input data
 | 
			
		||||
    float *d_idataPL;
 | 
			
		||||
    size_t d_pitchBytes;
 | 
			
		||||
 | 
			
		||||
  bTestResult = true;
 | 
			
		||||
    checkCudaErrors(cudaMallocPitch((void **)&d_idataPL, &d_pitchBytes, nx * sizeof(float), ny));
 | 
			
		||||
 | 
			
		||||
  if (res == false) {
 | 
			
		||||
    printf("*** shiftPitchLinear failed ***\n");
 | 
			
		||||
    bTestResult = false;
 | 
			
		||||
  }
 | 
			
		||||
    // Array input data
 | 
			
		||||
    cudaArray            *d_idataArray;
 | 
			
		||||
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
 | 
			
		||||
 | 
			
		||||
  // Run ShiftArray kernel
 | 
			
		||||
  checkCudaErrors(
 | 
			
		||||
      cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny));
 | 
			
		||||
  checkCudaErrors(cudaEventRecord(start, 0));
 | 
			
		||||
    checkCudaErrors(cudaMallocArray(&d_idataArray, &channelDesc, nx, ny));
 | 
			
		||||
 | 
			
		||||
  for (int i = 0; i < NUM_REPS; ++i) {
 | 
			
		||||
    shiftArray<<<dimGrid, dimBlock>>>(d_odata,
 | 
			
		||||
                                      (int)(d_pitchBytes / sizeof(float)), nx,
 | 
			
		||||
                                      ny, x_shift, y_shift, texRefArray);
 | 
			
		||||
  }
 | 
			
		||||
    // Pitch linear output data
 | 
			
		||||
    float *d_odata;
 | 
			
		||||
    checkCudaErrors(cudaMallocPitch((void **)&d_odata, &d_pitchBytes, nx * sizeof(float), ny));
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaEventRecord(stop, 0));
 | 
			
		||||
  checkCudaErrors(cudaEventSynchronize(stop));
 | 
			
		||||
  float timeArray;
 | 
			
		||||
  checkCudaErrors(cudaEventElapsedTime(&timeArray, start, stop));
 | 
			
		||||
    // Copy host data to device
 | 
			
		||||
    // Pitch linear
 | 
			
		||||
    size_t h_pitchBytes = nx * sizeof(float);
 | 
			
		||||
 | 
			
		||||
  // Check results
 | 
			
		||||
  checkCudaErrors(cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes,
 | 
			
		||||
                               nx * sizeof(float), ny, cudaMemcpyDeviceToHost));
 | 
			
		||||
  res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f);
 | 
			
		||||
    checkCudaErrors(
 | 
			
		||||
        cudaMemcpy2D(d_idataPL, d_pitchBytes, h_idata, h_pitchBytes, nx * sizeof(float), ny, cudaMemcpyHostToDevice));
 | 
			
		||||
 | 
			
		||||
  if (res == false) {
 | 
			
		||||
    printf("*** shiftArray failed ***\n");
 | 
			
		||||
    bTestResult = false;
 | 
			
		||||
  }
 | 
			
		||||
    // Array
 | 
			
		||||
    checkCudaErrors(cudaMemcpyToArray(d_idataArray, 0, 0, h_idata, nx * ny * sizeof(float), cudaMemcpyHostToDevice));
 | 
			
		||||
 | 
			
		||||
  float bandwidthPL =
 | 
			
		||||
      2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) / (timePL / NUM_REPS);
 | 
			
		||||
  float bandwidthArray = 2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) /
 | 
			
		||||
                         (timeArray / NUM_REPS);
 | 
			
		||||
    cudaTextureObject_t texRefPL;
 | 
			
		||||
    cudaTextureObject_t texRefArray;
 | 
			
		||||
    cudaResourceDesc    texRes;
 | 
			
		||||
    memset(&texRes, 0, sizeof(cudaResourceDesc));
 | 
			
		||||
 | 
			
		||||
  printf("\nBandwidth (GB/s) for pitch linear: %.2e; for array: %.2e\n",
 | 
			
		||||
         bandwidthPL, bandwidthArray);
 | 
			
		||||
    texRes.resType                  = cudaResourceTypePitch2D;
 | 
			
		||||
    texRes.res.pitch2D.devPtr       = d_idataPL;
 | 
			
		||||
    texRes.res.pitch2D.desc         = channelDesc;
 | 
			
		||||
    texRes.res.pitch2D.width        = nx;
 | 
			
		||||
    texRes.res.pitch2D.height       = ny;
 | 
			
		||||
    texRes.res.pitch2D.pitchInBytes = h_pitchBytes;
 | 
			
		||||
    cudaTextureDesc texDescr;
 | 
			
		||||
    memset(&texDescr, 0, sizeof(cudaTextureDesc));
 | 
			
		||||
 | 
			
		||||
    texDescr.normalizedCoords = true;
    texDescr.filterMode       = cudaFilterModePoint;
    texDescr.addressMode[0]   = cudaAddressModeWrap;
    texDescr.addressMode[1]   = cudaAddressModeWrap;
    texDescr.readMode         = cudaReadModeElementType;

    checkCudaErrors(cudaCreateTextureObject(&texRefPL, &texRes, &texDescr, NULL));

    memset(&texRes, 0, sizeof(cudaResourceDesc));
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

    texRes.resType            = cudaResourceTypeArray;
    texRes.res.array.array    = d_idataArray;
    texDescr.normalizedCoords = true;
    texDescr.filterMode       = cudaFilterModePoint;
    texDescr.addressMode[0]   = cudaAddressModeWrap;
    texDescr.addressMode[1]   = cudaAddressModeWrap;
    texDescr.readMode         = cudaReadModeElementType;

    checkCudaErrors(cudaCreateTextureObject(&texRefArray, &texRes, &texDescr, NULL));

    // Reference calculation
    for (int j = 0; j < ny; ++j) {
        int jshift = (j + y_shift) % ny;

        for (int i = 0; i < nx; ++i) {
            int ishift       = (i + x_shift) % nx;
            gold[j * nx + i] = h_idata[jshift * nx + ishift];
        }
    }

    // Run ShiftPitchLinear kernel
    checkCudaErrors(cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny));

    checkCudaErrors(cudaEventRecord(start, 0));

    for (int i = 0; i < NUM_REPS; ++i) {
        shiftPitchLinear<<<dimGrid, dimBlock>>>(
            d_odata, (int)(d_pitchBytes / sizeof(float)), nx, ny, x_shift, y_shift, texRefPL);
    }

    checkCudaErrors(cudaEventRecord(stop, 0));
    checkCudaErrors(cudaEventSynchronize(stop));
    float timePL;
    checkCudaErrors(cudaEventElapsedTime(&timePL, start, stop));

    // Check results
    checkCudaErrors(
        cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes, nx * sizeof(float), ny, cudaMemcpyDeviceToHost));

    bool res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f);

    bTestResult = true;

    if (res == false) {
        printf("*** shiftPitchLinear failed ***\n");
        bTestResult = false;
    }

    // Run ShiftArray kernel
    checkCudaErrors(cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny));
    checkCudaErrors(cudaEventRecord(start, 0));

    for (int i = 0; i < NUM_REPS; ++i) {
        shiftArray<<<dimGrid, dimBlock>>>(
            d_odata, (int)(d_pitchBytes / sizeof(float)), nx, ny, x_shift, y_shift, texRefArray);
    }

    checkCudaErrors(cudaEventRecord(stop, 0));
    checkCudaErrors(cudaEventSynchronize(stop));
    float timeArray;
    checkCudaErrors(cudaEventElapsedTime(&timeArray, start, stop));

    // Check results
    checkCudaErrors(
        cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes, nx * sizeof(float), ny, cudaMemcpyDeviceToHost));
    res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f);

    if (res == false) {
        printf("*** shiftArray failed ***\n");
        bTestResult = false;
    }
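
    // Each shift reads and writes every float once (hence the factor of 2);
    // 1000.f converts the millisecond event timings to seconds and 1.e+9f
    // converts bytes to gigabytes.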
    float bandwidthPL    = 2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) / (timePL / NUM_REPS);
    float bandwidthArray = 2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) / (timeArray / NUM_REPS);

    printf("\nBandwidth (GB/s) for pitch linear: %.2e; for array: %.2e\n", bandwidthPL, bandwidthArray);

    float fetchRatePL    = nx * ny / 1.e+6f / (timePL / (1000.0f * NUM_REPS));
    float fetchRateArray = nx * ny / 1.e+6f / (timeArray / (1000.0f * NUM_REPS));

    printf("\nTexture fetch rate (Mpix/s) for pitch linear: "
           "%.2e; for array: %.2e\n\n",
           fetchRatePL,
           fetchRateArray);

    // Cleanup
    free(h_idata);
    free(h_odata);
    free(gold);

    checkCudaErrors(cudaDestroyTextureObject(texRefPL));
    checkCudaErrors(cudaDestroyTextureObject(texRefArray));
    checkCudaErrors(cudaFree(d_idataPL));
    checkCudaErrors(cudaFreeArray(d_idataArray));
    checkCudaErrors(cudaFree(d_odata));

    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
}
@@ -26,48 +26,49 @@
 */

// System includes
#include <assert.h>
#include <stdio.h>

// CUDA runtime
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>

#ifndef MAX
#define MAX(a, b) (a > b ? a : b)
#endif

__global__ void testKernel(int val)
{
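    // The first two arguments flatten the 2-D block index and the 3-D thread
    // index into single scalar IDs for printing.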
    printf("[%d, %d]:\t\tValue is:%d\n",
 | 
			
		||||
           blockIdx.y * gridDim.x + blockIdx.x,
 | 
			
		||||
           threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x,
 | 
			
		||||
           val);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
int main(int argc, char **argv) {
 | 
			
		||||
  int devID;
 | 
			
		||||
  cudaDeviceProp props;
 | 
			
		||||
int main(int argc, char **argv)
 | 
			
		||||
{
 | 
			
		||||
    int            devID;
 | 
			
		||||
    cudaDeviceProp props;
 | 
			
		||||
 | 
			
		||||
  // This will pick the best possible CUDA capable device
 | 
			
		||||
  devID = findCudaDevice(argc, (const char **)argv);
 | 
			
		||||
    // This will pick the best possible CUDA capable device
 | 
			
		||||
    devID = findCudaDevice(argc, (const char **)argv);
 | 
			
		||||
 | 
			
		||||
  // Get GPU information
 | 
			
		||||
  checkCudaErrors(cudaGetDevice(&devID));
 | 
			
		||||
  checkCudaErrors(cudaGetDeviceProperties(&props, devID));
 | 
			
		||||
  printf("Device %d: \"%s\" with Compute capability %d.%d\n", devID, props.name,
 | 
			
		||||
         props.major, props.minor);
 | 
			
		||||
    // Get GPU information
 | 
			
		||||
    checkCudaErrors(cudaGetDevice(&devID));
 | 
			
		||||
    checkCudaErrors(cudaGetDeviceProperties(&props, devID));
 | 
			
		||||
    printf("Device %d: \"%s\" with Compute capability %d.%d\n", devID, props.name, props.major, props.minor);
 | 
			
		||||
 | 
			
		||||
  printf("printf() is called. Output:\n\n");
 | 
			
		||||
    printf("printf() is called. Output:\n\n");
 | 
			
		||||
 | 
			
		||||
  // Kernel configuration, where a two-dimensional grid and
 | 
			
		||||
  // three-dimensional blocks are configured.
 | 
			
		||||
  dim3 dimGrid(2, 2);
 | 
			
		||||
  dim3 dimBlock(2, 2, 2);
 | 
			
		||||
  testKernel<<<dimGrid, dimBlock>>>(10);
 | 
			
		||||
  cudaDeviceSynchronize();
 | 
			
		||||
    // Kernel configuration, where a two-dimensional grid and
 | 
			
		||||
    // three-dimensional blocks are configured.
 | 
			
		||||
    dim3 dimGrid(2, 2);
 | 
			
		||||
    dim3 dimBlock(2, 2, 2);
 | 
			
		||||
    testKernel<<<dimGrid, dimBlock>>>(10);
 | 
			
		||||
    cudaDeviceSynchronize();
 | 
			
		||||
 | 
			
		||||
  return EXIT_SUCCESS;
 | 
			
		||||
    return EXIT_SUCCESS;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -44,141 +44,137 @@
 *
 * Elapsed times are averaged over nreps repetitions (10 by default).
 *
 */

const char *sSDKsample = "simpleStreams";

const char *sEventSyncMethod[] = {"cudaEventDefault", "cudaEventBlockingSync", "cudaEventDisableTiming", NULL};

const char *sDeviceSyncMethod[] = {"cudaDeviceScheduleAuto",
                                   "cudaDeviceScheduleSpin",
                                   "cudaDeviceScheduleYield",
                                   "INVALID",
                                   "cudaDeviceScheduleBlockingSync",
                                   NULL};

// System includes
#include <assert.h>
#include <stdio.h>

// CUDA runtime
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>

#ifndef WIN32
#include <sys/mman.h> // for mmap() / munmap()
#endif

// Macro to align up to the memory size in question
#define MEMORY_ALIGNMENT  4096
#define ALIGN_UP(x, size) (((size_t)x + (size - 1)) & (~(size - 1)))
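
// ALIGN_UP rounds the byte address x up to the next multiple of size; this
// relies on size being a power of two, as the 4096-byte page alignment is.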
__global__ void init_array(int *g_data, int *factor, int num_iterations)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    for (int i = 0; i < num_iterations; i++) {
        g_data[idx] += *factor; // non-coalesced on purpose, to burn time
    }
}

bool correct_data(int *a, const int n, const int c)
{
    for (int i = 0; i < n; i++) {
        if (a[i] != c) {
            printf("%d: %d %d\n", i, a[i], c);
            return false;
        }
    }

    return true;
}

inline void AllocateHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nbytes)
{
#if CUDART_VERSION >= 4000
#if !defined(__arm__) && !defined(__aarch64__)
    if (bPinGenericMemory) {
// allocate a generic page-aligned chunk of system memory
#ifdef WIN32
        printf("> VirtualAlloc() allocating %4.2f Mbytes of (generic page-aligned "
               "system memory)\n",
               (float)nbytes / 1048576.0f);
        *pp_a = (int *)VirtualAlloc(NULL, (nbytes + MEMORY_ALIGNMENT), MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
#else
        printf("> mmap() allocating %4.2f Mbytes (generic page-aligned system "
               "memory)\n",
               (float)nbytes / 1048576.0f);
        *pp_a = (int *)mmap(NULL, (nbytes + MEMORY_ALIGNMENT), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
#endif

        *ppAligned_a = (int *)ALIGN_UP(*pp_a, MEMORY_ALIGNMENT);

        printf("> cudaHostRegister() registering %4.2f Mbytes of generic allocated "
               "system memory\n",
               (float)nbytes / 1048576.0f);
        // pin the allocated memory
        checkCudaErrors(cudaHostRegister(*ppAligned_a, nbytes, cudaHostRegisterMapped));
    }
    else
#endif
#endif
    {
        printf("> cudaMallocHost() allocating %4.2f Mbytes of system memory\n", (float)nbytes / 1048576.0f);
        // allocate host memory (pinned memory is required to achieve asynchronicity)
        checkCudaErrors(cudaMallocHost((void **)pp_a, nbytes));
        *ppAligned_a = *pp_a;
    }
}

inline void FreeHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nbytes)
{
#if CUDART_VERSION >= 4000
#if !defined(__arm__) && !defined(__aarch64__)
    // CUDA 4.0 supports pinning of generic host memory
    if (bPinGenericMemory) {
        // unpin and delete host memory
        checkCudaErrors(cudaHostUnregister(*ppAligned_a));
#ifdef WIN32
        VirtualFree(*pp_a, 0, MEM_RELEASE);
#else
        munmap(*pp_a, nbytes);
#endif
    }
    else
#endif
#endif
    {
        cudaFreeHost(*pp_a);
    }
}

static const char *sSyncMethod[] = {"0 (Automatic Blocking)",
                                    "1 (Spin Blocking)",
                                    "2 (Yield Blocking)",
                                    "3 (Undefined Blocking Method)",
                                    "4 (Blocking Sync Event) = low CPU utilization",
                                    NULL};

void printHelp()
{
    printf("Usage: %s [options below]\n", sSDKsample);
    printf("\t--sync_method=n for CPU/GPU synchronization\n");
    printf("\t             n=%s\n", sSyncMethod[0]);
    printf("\t             n=%s\n", sSyncMethod[1]);
    printf("\t             n=%s\n", sSyncMethod[2]);
    printf("\t   <Default> n=%s\n", sSyncMethod[4]);
    printf("\t--use_generic_memory (default) use generic page-aligned system "
           "memory\n");
    printf("\t--use_cuda_malloc_host (optional) use cudaMallocHost to allocate "
           "system memory\n");
}

#if defined(__APPLE__) || defined(MACOSX)
@@ -187,259 +183,240 @@ void printHelp() {
#define DEFAULT_PINNED_GENERIC_MEMORY true
#endif

int main(int argc, char **argv)
{
    int   cuda_device = 0;
    int   nstreams    = 4;                        // number of streams for CUDA calls
    int   nreps       = 10;                       // number of times each experiment is repeated
    int   n           = 16 * 1024 * 1024;         // number of ints in the data set
    int   nbytes      = n * sizeof(int);          // number of data bytes
    dim3  threads, blocks;                        // kernel launch configuration
    float elapsed_time, time_memcpy, time_kernel; // timing variables
    float scale_factor = 1.0f;

    // allocate generic memory and pin it later instead of using cudaHostAlloc()
    bool bPinGenericMemory  = DEFAULT_PINNED_GENERIC_MEMORY; // we want this to be the default behavior
    int  device_sync_method = cudaDeviceBlockingSync;        // by default we use BlockingSync

    int niterations; // number of iterations for the loop inside the kernel

    printf("[ %s ]\n\n", sSDKsample);

    if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
        printHelp();
        return EXIT_SUCCESS;
    }

    if ((device_sync_method = getCmdLineArgumentInt(argc, (const char **)argv, "sync_method")) >= 0) {
        if (device_sync_method == 0 || device_sync_method == 1 || device_sync_method == 2 || device_sync_method == 4) {
            printf("Device synchronization method set to = %s\n", sSyncMethod[device_sync_method]);
            printf("Setting reps to 100 to demonstrate steady state\n");
            nreps = 100;
        }
        else {
            printf("Invalid command line option sync_method=\"%d\"\n", device_sync_method);
            return EXIT_FAILURE;
        }
    }
    else {
        printHelp();
        return EXIT_SUCCESS;
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "use_generic_memory")) {
#if defined(__APPLE__) || defined(MACOSX)
        bPinGenericMemory = false; // Generic Pinning of System Paged memory not
                                   // currently supported on Mac OSX
#else
        bPinGenericMemory = true;
#endif
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "use_cuda_malloc_host")) {
        bPinGenericMemory = false;
    }

    printf("\n> ");
    cuda_device = findCudaDevice(argc, (const char **)argv);

    // check the compute capability of the device
    int num_devices = 0;
    checkCudaErrors(cudaGetDeviceCount(&num_devices));

    if (0 == num_devices) {
        printf("your system does not have a CUDA capable device, waiving test...\n");
        return EXIT_WAIVED;
    }

    // check if the command-line chosen device ID is within range, exit if not
    if (cuda_device >= num_devices) {
        printf("cuda_device=%d is invalid, must choose device ID between 0 and %d\n", cuda_device, num_devices - 1);
        return EXIT_FAILURE;
    }

    checkCudaErrors(cudaSetDevice(cuda_device));

    // Checking for compute capabilities
    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));

    niterations = 5;

    // Check if GPU can map host memory (Generic Method); if not, we override
    // bPinGenericMemory to be false
    if (bPinGenericMemory) {
        printf("Device: <%s> canMapHostMemory: %s\n", deviceProp.name, deviceProp.canMapHostMemory ? "Yes" : "No");

        if (deviceProp.canMapHostMemory == 0) {
            printf("Using cudaMallocHost, CUDA device does not support mapping of "
                   "generic host memory\n");
            bPinGenericMemory = false;
        }
    }

    // Anything with less than 32 cores will have a scaled-down workload
    scale_factor =
        max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)),
            1.0f);
    n = (int)rint((float)n / scale_factor);

    printf("> CUDA Capable: SM %d.%d hardware\n", deviceProp.major, deviceProp.minor);
    printf("> %d Multiprocessor(s) x %d (Cores/Multiprocessor) = %d (Cores)\n",
           deviceProp.multiProcessorCount,
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);

    printf("> scale_factor = %1.4f\n", 1.0f / scale_factor);
    printf("> array_size   = %d\n\n", n);

    // enable use of blocking sync, to reduce CPU usage
    printf("> Using CPU/GPU Device Synchronization method (%s)\n", sDeviceSyncMethod[device_sync_method]);
    checkCudaErrors(cudaSetDeviceFlags(device_sync_method | (bPinGenericMemory ? cudaDeviceMapHost : 0)));

    // allocate host memory
    int  c          = 5; // value to which the array will be initialized
    int *h_a        = 0; // pointer to the array data in host memory
    int *hAligned_a = 0; // pointer to the array data in host memory (aligned to
                         // MEMORY_ALIGNMENT)

    // Allocate host memory (could use cudaMallocHost or VirtualAlloc/mmap if
    // using the new CUDA 4.0 features)
    AllocateHostMemory(bPinGenericMemory, &h_a, &hAligned_a, nbytes);

    // allocate device memory
    int *d_a = 0,
        *d_c = 0; // pointers to data and init value in the device memory
    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
    checkCudaErrors(cudaMemset(d_a, 0x0, nbytes));
    checkCudaErrors(cudaMalloc((void **)&d_c, sizeof(int)));
    checkCudaErrors(cudaMemcpy(d_c, &c, sizeof(int), cudaMemcpyHostToDevice));

    printf("\nStarting Test\n");

    // allocate and initialize an array of stream handles
    cudaStream_t *streams = (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));

    for (int i = 0; i < nstreams; i++) {
        checkCudaErrors(cudaStreamCreate(&(streams[i])));
    }

    // create CUDA event handles
    // use blocking sync
    cudaEvent_t start_event, stop_event;
    int eventflags = ((device_sync_method == cudaDeviceBlockingSync) ? cudaEventBlockingSync : cudaEventDefault);
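
    // With cudaEventBlockingSync, cudaEventSynchronize() sleeps the calling
    // host thread instead of spin-waiting, which is what gives the low CPU
    // utilization this sync method advertises.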
    checkCudaErrors(cudaEventCreateWithFlags(&start_event, eventflags));
    checkCudaErrors(cudaEventCreateWithFlags(&stop_event, eventflags));

    // time memcopy from device
    checkCudaErrors(cudaEventRecord(start_event, 0)); // record in stream-0, to
                                                      // ensure that all previous
                                                      // CUDA calls have
                                                      // completed
    checkCudaErrors(cudaMemcpyAsync(hAligned_a, d_a, nbytes, cudaMemcpyDeviceToHost, streams[0]));
    checkCudaErrors(cudaEventRecord(stop_event, 0));
    checkCudaErrors(cudaEventSynchronize(stop_event)); // block until the event is actually recorded
    checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
    printf("memcopy:\t%.2f\n", time_memcpy);

    // time kernel
    threads = dim3(512, 1);
    blocks  = dim3(n / threads.x, 1);
    checkCudaErrors(cudaEventRecord(start_event, 0));
    init_array<<<blocks, threads, 0, streams[0]>>>(d_a, d_c, niterations);
    checkCudaErrors(cudaEventRecord(stop_event, 0));
    checkCudaErrors(cudaEventSynchronize(stop_event));
    checkCudaErrors(cudaEventElapsedTime(&time_kernel, start_event, stop_event));
    printf("kernel:\t\t%.2f\n", time_kernel);

    //////////////////////////////////////////////////////////////////////
    // time non-streamed execution for reference
    threads = dim3(512, 1);
    blocks  = dim3(n / threads.x, 1);
    checkCudaErrors(cudaEventRecord(start_event, 0));

    for (int k = 0; k < nreps; k++) {
        init_array<<<blocks, threads>>>(d_a, d_c, niterations);
        checkCudaErrors(cudaMemcpy(hAligned_a, d_a, nbytes, cudaMemcpyDeviceToHost));
    }

    checkCudaErrors(cudaEventRecord(stop_event, 0));
    checkCudaErrors(cudaEventSynchronize(stop_event));
    checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
    printf("non-streamed:\t%.2f\n", elapsed_time / nreps);

    //////////////////////////////////////////////////////////////////////
    // time execution with nstreams streams
    threads = dim3(512, 1);
    blocks  = dim3(n / (nstreams * threads.x), 1);
    memset(hAligned_a, 255,
           nbytes);                              // set host memory bits to all 1s, for testing correctness
    checkCudaErrors(cudaMemset(d_a, 0, nbytes)); // set device memory to all 0s, for testing correctness
    checkCudaErrors(cudaEventRecord(start_event, 0));
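
    // Work is issued breadth-first: all nstreams kernels first, then all
    // nstreams copies, so on hardware with copy/compute overlap the copy in
    // one stream can run concurrently with kernels still executing in others.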
    for (int k = 0; k < nreps; k++) {
        // asynchronously launch nstreams kernels, each operating on its own portion
        // of data
        for (int i = 0; i < nstreams; i++) {
            init_array<<<blocks, threads, 0, streams[i]>>>(d_a + i * n / nstreams, d_c, niterations);
        }

        // asynchronously launch nstreams memcopies.  Note that the memcopy in
        // stream x will only commence executing when all previous CUDA calls
        // in stream x have completed
        for (int i = 0; i < nstreams; i++) {
            checkCudaErrors(cudaMemcpyAsync(hAligned_a + i * n / nstreams,
                                            d_a + i * n / nstreams,
                                            nbytes / nstreams,
                                            cudaMemcpyDeviceToHost,
                                            streams[i]));
        }
    }

    checkCudaErrors(cudaEventRecord(stop_event, 0));
    checkCudaErrors(cudaEventSynchronize(stop_event));
    checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
    printf("%d streams:\t%.2f\n", nstreams, elapsed_time / nreps);
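
    // After the streamed section, d_a starts from zero and each of the nreps
    // passes adds *d_c (= c) to every element niterations times, so the
    // expected value per element is c * nreps * niterations.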
    // check whether the output is correct
    printf("-------------------------------\n");
    bool bResults = correct_data(hAligned_a, n, c * nreps * niterations);

    // release resources
    for (int i = 0; i < nstreams; i++) {
        checkCudaErrors(cudaStreamDestroy(streams[i]));
    }

    checkCudaErrors(cudaEventDestroy(start_event));
    checkCudaErrors(cudaEventDestroy(stop_event));

    // Free cudaMallocHost or Generic Host allocated memory (from CUDA 4.0)
    FreeHostMemory(bPinGenericMemory, &h_a, &hAligned_a, nbytes);

    checkCudaErrors(cudaFree(d_a));
    checkCudaErrors(cudaFree(d_c));

    return bResults ? EXIT_SUCCESS : EXIT_FAILURE;
}
@@ -34,10 +34,10 @@
 */

// Includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN
@@ -49,18 +49,18 @@
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check

#define MIN_EPSILON_ERROR 5e-3f

////////////////////////////////////////////////////////////////////////////////
// Define the files that are to be saved and the reference images for validation
const char *imageFilename = "teapot512.pgm";
const char *refFilename   = "ref_rotated.pgm";
float       angle         = 0.5f; // angle to rotate image by (in radians)

// Auto-Verification Code
bool testResult = true;
@@ -73,223 +73,218 @@ static const char *sampleName = "simpleSurfaceWrite";
//! Write to a cuArray (texture data source) using surface writes
//! @param gIData input data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void surfaceWriteKernel(float *gIData, int width, int height, cudaSurfaceObject_t outputSurface)
{
    // calculate surface coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
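
    // surf2Dwrite addresses the surface in bytes along x, hence the x * 4
    // (sizeof(float)) scaling below.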
    // read from global memory and write to cuarray (via surface reference)
    surf2Dwrite(gIData[y * width + x], outputSurface, x * 4, y, cudaBoundaryModeTrap);
}

////////////////////////////////////////////////////////////////////////////////
//! Transform an image using texture lookups
//! @param gOData  output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void transformKernel(float *gOData, int width, int height, float theta, cudaTextureObject_t tex)
{
    // calculate normalized texture coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    float u = x / (float)width;
    float v = y / (float)height;

    // transform coordinates
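    // (the 0.5f offsets make the rotation pivot about the image center
    // rather than the top-left corner)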
    u -= 0.5f;
 | 
			
		||||
    v -= 0.5f;
 | 
			
		||||
    float tu = u * cosf(theta) - v * sinf(theta) + 0.5f;
 | 
			
		||||
    float tv = v * cosf(theta) + u * sinf(theta) + 0.5f;
 | 
			
		||||
 | 
			
		||||
  // read from texture and write to global memory
 | 
			
		||||
  gOData[y * width + x] = tex2D<float>(tex, tu, tv);
 | 
			
		||||
    // read from texture and write to global memory
 | 
			
		||||
    gOData[y * width + x] = tex2D<float>(tex, tu, tv);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Declaration, forward
 | 
			
		||||
void runTest(int argc, char **argv);
 | 
			
		||||
 | 
			
		||||
extern "C" void computeGold(float *reference, float *idata,
 | 
			
		||||
                            const unsigned int len);
 | 
			
		||||
extern "C" void computeGold(float *reference, float *idata, const unsigned int len);
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Program main
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
int main(int argc, char **argv) {
 | 
			
		||||
  printf("%s starting...\n", sampleName);
 | 
			
		||||
int main(int argc, char **argv)
 | 
			
		||||
{
 | 
			
		||||
    printf("%s starting...\n", sampleName);
 | 
			
		||||
 | 
			
		||||
  // Process command-line arguments
 | 
			
		||||
  if (argc > 1) {
 | 
			
		||||
    if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
 | 
			
		||||
      getCmdLineArgumentString(argc, (const char **)argv, "input",
 | 
			
		||||
                               (char **)&imageFilename);
 | 
			
		||||
    // Process command-line arguments
 | 
			
		||||
    if (argc > 1) {
 | 
			
		||||
        if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
 | 
			
		||||
            getCmdLineArgumentString(argc, (const char **)argv, "input", (char **)&imageFilename);
 | 
			
		||||
 | 
			
		||||
      if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
 | 
			
		||||
        getCmdLineArgumentString(argc, (const char **)argv, "reference",
 | 
			
		||||
                                 (char **)&refFilename);
 | 
			
		||||
      } else {
 | 
			
		||||
        printf("-input flag should be used with -reference flag");
 | 
			
		||||
        exit(EXIT_FAILURE);
 | 
			
		||||
      }
 | 
			
		||||
    } else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
 | 
			
		||||
      printf("-reference flag should be used with -input flag");
 | 
			
		||||
      exit(EXIT_FAILURE);
 | 
			
		||||
            if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
 | 
			
		||||
                getCmdLineArgumentString(argc, (const char **)argv, "reference", (char **)&refFilename);
 | 
			
		||||
            }
 | 
			
		||||
            else {
 | 
			
		||||
                printf("-input flag should be used with -reference flag");
 | 
			
		||||
                exit(EXIT_FAILURE);
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
        else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
 | 
			
		||||
            printf("-reference flag should be used with -input flag");
 | 
			
		||||
            exit(EXIT_FAILURE);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  runTest(argc, argv);
 | 
			
		||||
    runTest(argc, argv);
 | 
			
		||||
 | 
			
		||||
  printf("%s completed, returned %s\n", sampleName,
 | 
			
		||||
         testResult ? "OK" : "ERROR!");
 | 
			
		||||
  exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
 | 
			
		||||
    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
 | 
			
		||||
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) {
  // Use command-line specified CUDA device,
  // otherwise use device with highest Gflops/s
  int devID = findCudaDevice(argc, (const char **)argv);
void runTest(int argc, char **argv)
{
    // Use command-line specified CUDA device,
    // otherwise use device with highest Gflops/s
    int devID = findCudaDevice(argc, (const char **)argv);

  // Get number of SMs on this GPU
  cudaDeviceProp deviceProps;
    // Get number of SMs on this GPU
    cudaDeviceProp deviceProps;

  checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
  printf("CUDA device [%s] has %d Multi-Processors, SM %d.%d\n",
         deviceProps.name, deviceProps.multiProcessorCount, deviceProps.major,
         deviceProps.minor);
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors, SM %d.%d\n",
           deviceProps.name,
           deviceProps.multiProcessorCount,
           deviceProps.major,
           deviceProps.minor);

  // Load image from disk
  float *hData = NULL;
  unsigned int width, height;
  char *imagePath = sdkFindFilePath(imageFilename, argv[0]);
    // Load image from disk
    float       *hData = NULL;
    unsigned int width, height;
    char        *imagePath = sdkFindFilePath(imageFilename, argv[0]);

  if (imagePath == NULL) {
    printf("Unable to source image input file: %s\n", imageFilename);
    exit(EXIT_FAILURE);
  }
    if (imagePath == NULL) {
        printf("Unable to source image input file: %s\n", imageFilename);
        exit(EXIT_FAILURE);
    }

  sdkLoadPGM(imagePath, &hData, &width, &height);
    sdkLoadPGM(imagePath, &hData, &width, &height);

  unsigned int size = width * height * sizeof(float);
  printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height);
    unsigned int size = width * height * sizeof(float);
    printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height);

  // Load reference image from image (output)
  float *hDataRef = (float *)malloc(size);
  char *refPath = sdkFindFilePath(refFilename, argv[0]);
    // Load reference image from image (output)
    float *hDataRef = (float *)malloc(size);
    char  *refPath  = sdkFindFilePath(refFilename, argv[0]);

  if (refPath == NULL) {
    printf("Unable to find reference image file: %s\n", refFilename);
    exit(EXIT_FAILURE);
  }
    if (refPath == NULL) {
        printf("Unable to find reference image file: %s\n", refFilename);
        exit(EXIT_FAILURE);
    }

  sdkLoadPGM(refPath, &hDataRef, &width, &height);
    sdkLoadPGM(refPath, &hDataRef, &width, &height);

  // Allocate device memory for result
  float *dData = NULL;
  checkCudaErrors(cudaMalloc((void **)&dData, size));
    // Allocate device memory for result
    float *dData = NULL;
    checkCudaErrors(cudaMalloc((void **)&dData, size));

  // Allocate array and copy image data
  cudaChannelFormatDesc channelDesc =
      cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
  cudaArray *cuArray;
  checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height,
                                  cudaArraySurfaceLoadStore));
    // Allocate array and copy image data
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray            *cuArray;
    checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height, cudaArraySurfaceLoadStore));

  dim3 dimBlock(8, 8, 1);
  dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
    dim3 dimBlock(8, 8, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

  cudaSurfaceObject_t outputSurface;
  cudaResourceDesc surfRes;
  memset(&surfRes, 0, sizeof(cudaResourceDesc));
  surfRes.resType = cudaResourceTypeArray;
  surfRes.res.array.array = cuArray;
    cudaSurfaceObject_t outputSurface;
    cudaResourceDesc    surfRes;
    memset(&surfRes, 0, sizeof(cudaResourceDesc));
    surfRes.resType         = cudaResourceTypeArray;
    surfRes.res.array.array = cuArray;

  checkCudaErrors(cudaCreateSurfaceObject(&outputSurface, &surfRes));
    checkCudaErrors(cudaCreateSurfaceObject(&outputSurface, &surfRes));
#if 1
  checkCudaErrors(cudaMemcpy(dData, hData, size, cudaMemcpyHostToDevice));
  surfaceWriteKernel<<<dimGrid, dimBlock>>>(dData, width, height,
                                            outputSurface);
#else  // This is what differs from the example simpleTexture
  checkCudaErrors(
      cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(dData, hData, size, cudaMemcpyHostToDevice));
    surfaceWriteKernel<<<dimGrid, dimBlock>>>(dData, width, height, outputSurface);
#else // This is what differs from the example simpleTexture
    checkCudaErrors(cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice));
#endif

  cudaTextureObject_t tex;
  cudaResourceDesc texRes;
  memset(&texRes, 0, sizeof(cudaResourceDesc));
    cudaTextureObject_t tex;
    cudaResourceDesc    texRes;
    memset(&texRes, 0, sizeof(cudaResourceDesc));

  texRes.resType = cudaResourceTypeArray;
  texRes.res.array.array = cuArray;
    texRes.resType         = cudaResourceTypeArray;
    texRes.res.array.array = cuArray;

  cudaTextureDesc texDescr;
  memset(&texDescr, 0, sizeof(cudaTextureDesc));
    cudaTextureDesc texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

  texDescr.normalizedCoords = true;
  texDescr.filterMode = cudaFilterModeLinear;
  texDescr.addressMode[0] = cudaAddressModeWrap;
  texDescr.addressMode[1] = cudaAddressModeWrap;
  texDescr.readMode = cudaReadModeElementType;
    texDescr.normalizedCoords = true;
    texDescr.filterMode       = cudaFilterModeLinear;
    texDescr.addressMode[0]   = cudaAddressModeWrap;
    texDescr.addressMode[1]   = cudaAddressModeWrap;
    texDescr.readMode         = cudaReadModeElementType;

  checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
    checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));

  // Warmup
  transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex);
    // Warmup
    transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex);

  checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaDeviceSynchronize());

  StopWatchInterface *timer = NULL;
  sdkCreateTimer(&timer);
  sdkStartTimer(&timer);
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

  // Execute the kernel
  transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex);
    // Execute the kernel
    transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex);

  // Check if kernel execution generated an error
  getLastCudaError("Kernel execution failed");
    // Check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

  cudaDeviceSynchronize();
  sdkStopTimer(&timer);
  printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
  printf("%.2f Mpixels/sec\n",
         (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
  sdkDeleteTimer(&timer);
    cudaDeviceSynchronize();
    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    printf("%.2f Mpixels/sec\n", (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
    sdkDeleteTimer(&timer);

  // Allocate mem for the result on host side
  float *hOData = (float *)malloc(size);
  // copy result from device to host
  checkCudaErrors(cudaMemcpy(hOData, dData, size, cudaMemcpyDeviceToHost));
    // Allocate mem for the result on host side
    float *hOData = (float *)malloc(size);
    // copy result from device to host
    checkCudaErrors(cudaMemcpy(hOData, dData, size, cudaMemcpyDeviceToHost));

  // Write result to file
  char outputFilename[1024];
  strcpy(outputFilename, "output.pgm");
  sdkSavePGM("output.pgm", hOData, width, height);
  printf("Wrote '%s'\n", outputFilename);
    // Write result to file
    char outputFilename[1024];
    strcpy(outputFilename, "output.pgm");
    sdkSavePGM("output.pgm", hOData, width, height);
    printf("Wrote '%s'\n", outputFilename);

  // Write regression file if necessary
  if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
    // Write file for regression test
    sdkWriteFile<float>("./data/regression.dat", hOData, width * height, 0.0f,
                        false);
  } else {
    // We need to reload the data from disk,
    // because it is inverted upon output
    sdkLoadPGM(outputFilename, &hOData, &width, &height);
    // Write regression file if necessary
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
        // Write file for regression test
        sdkWriteFile<float>("./data/regression.dat", hOData, width * height, 0.0f, false);
    }
    else {
        // We need to reload the data from disk,
        // because it is inverted upon output
        sdkLoadPGM(outputFilename, &hOData, &width, &height);

    printf("Comparing files\n");
    printf("\toutput:    <%s>\n", outputFilename);
    printf("\treference: <%s>\n", refPath);
    testResult =
        compareData(hOData, hDataRef, width * height, MIN_EPSILON_ERROR, 0.0f);
  }
        printf("Comparing files\n");
        printf("\toutput:    <%s>\n", outputFilename);
        printf("\treference: <%s>\n", refPath);
        testResult = compareData(hOData, hDataRef, width * height, MIN_EPSILON_ERROR, 0.0f);
    }

  checkCudaErrors(cudaDestroySurfaceObject(outputSurface));
  checkCudaErrors(cudaDestroyTextureObject(tex));
  checkCudaErrors(cudaFree(dData));
  checkCudaErrors(cudaFreeArray(cuArray));
  free(imagePath);
  free(refPath);
    checkCudaErrors(cudaDestroySurfaceObject(outputSurface));
    checkCudaErrors(cudaDestroyTextureObject(tex));
    checkCudaErrors(cudaFree(dData));
    checkCudaErrors(cudaFreeArray(cuArray));
    free(imagePath);
    free(refPath);
}
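
// For reference, a minimal sketch of surfaceWriteKernel, which runTest above
// launches but which is defined elsewhere in this sample (the body below is an
// assumption based on the usual simpleSurfaceWrite layout, not part of this
// diff): it copies each pixel of the linear device buffer into the surface,
// noting that surf2Dwrite takes its x offset in bytes, not elements.
__global__ void surfaceWriteKernel(float *gIData, int width, int height, cudaSurfaceObject_t outputSurface)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // byte offset in x; row index in y
    surf2Dwrite(gIData[y * width + x], outputSurface, x * sizeof(float), y);
}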

@@ -68,106 +68,118 @@
// this
// struct by putting an undefined symbol in the function body so it won't
// compile.
template <typename T>
struct SharedMemory {
  // Ensure that we won't compile any un-specialized types
  __device__ T *getPointer() {
    extern __device__ void error(void);
    error();
    return NULL;
  }
template <typename T> struct SharedMemory
{
    // Ensure that we won't compile any un-specialized types
    __device__ T *getPointer()
    {
        extern __device__ void error(void);
        error();
        return NULL;
    }
};

// Following are the specializations for the following types.
// int, uint, char, uchar, short, ushort, long, ulong, bool, float, and double
// One could also specialize it for user-defined types.

template <>
struct SharedMemory<int> {
  __device__ int *getPointer() {
    extern __shared__ int s_int[];
    return s_int;
  }
template <> struct SharedMemory<int>
{
    __device__ int *getPointer()
    {
        extern __shared__ int s_int[];
        return s_int;
    }
};

template <>
struct SharedMemory<unsigned int> {
  __device__ unsigned int *getPointer() {
    extern __shared__ unsigned int s_uint[];
    return s_uint;
  }
template <> struct SharedMemory<unsigned int>
{
    __device__ unsigned int *getPointer()
    {
        extern __shared__ unsigned int s_uint[];
        return s_uint;
    }
};

template <>
struct SharedMemory<char> {
  __device__ char *getPointer() {
    extern __shared__ char s_char[];
    return s_char;
  }
template <> struct SharedMemory<char>
{
    __device__ char *getPointer()
    {
        extern __shared__ char s_char[];
        return s_char;
    }
};

template <>
struct SharedMemory<unsigned char> {
  __device__ unsigned char *getPointer() {
    extern __shared__ unsigned char s_uchar[];
    return s_uchar;
  }
template <> struct SharedMemory<unsigned char>
{
    __device__ unsigned char *getPointer()
    {
        extern __shared__ unsigned char s_uchar[];
        return s_uchar;
    }
};

template <>
struct SharedMemory<short> {
  __device__ short *getPointer() {
    extern __shared__ short s_short[];
    return s_short;
  }
template <> struct SharedMemory<short>
{
    __device__ short *getPointer()
    {
        extern __shared__ short s_short[];
        return s_short;
    }
};

template <>
struct SharedMemory<unsigned short> {
  __device__ unsigned short *getPointer() {
    extern __shared__ unsigned short s_ushort[];
    return s_ushort;
  }
template <> struct SharedMemory<unsigned short>
{
    __device__ unsigned short *getPointer()
    {
        extern __shared__ unsigned short s_ushort[];
        return s_ushort;
    }
};

template <>
struct SharedMemory<long> {
  __device__ long *getPointer() {
    extern __shared__ long s_long[];
    return s_long;
  }
template <> struct SharedMemory<long>
{
    __device__ long *getPointer()
    {
        extern __shared__ long s_long[];
        return s_long;
    }
};

template <>
struct SharedMemory<unsigned long> {
  __device__ unsigned long *getPointer() {
    extern __shared__ unsigned long s_ulong[];
    return s_ulong;
  }
template <> struct SharedMemory<unsigned long>
{
    __device__ unsigned long *getPointer()
    {
        extern __shared__ unsigned long s_ulong[];
        return s_ulong;
    }
};

template <>
struct SharedMemory<bool> {
  __device__ bool *getPointer() {
    extern __shared__ bool s_bool[];
    return s_bool;
  }
template <> struct SharedMemory<bool>
{
    __device__ bool *getPointer()
    {
        extern __shared__ bool s_bool[];
        return s_bool;
    }
};

template <>
struct SharedMemory<float> {
  __device__ float *getPointer() {
    extern __shared__ float s_float[];
    return s_float;
  }
template <> struct SharedMemory<float>
{
    __device__ float *getPointer()
    {
        extern __shared__ float s_float[];
        return s_float;
    }
};

template <>
struct SharedMemory<double> {
  __device__ double *getPointer() {
    extern __shared__ double s_double[];
    return s_double;
  }
template <> struct SharedMemory<double>
{
    __device__ double *getPointer()
    {
        extern __shared__ double s_double[];
        return s_double;
    }
};

#endif  //_SHAREDMEM_H_
#endif //_SHAREDMEM_H_
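
// For context, a minimal sketch of why the wrapper above exists (the kernel
// below is hypothetical, not from this header): writing
// `extern __shared__ T sdata[];` directly inside a template would emit the
// same symbol with conflicting types for different T, which nvcc rejects.
// Each specialization above instead names a distinct extern array
// (s_int, s_float, ...), and SharedMemory<T> selects the right one:
template <typename T>
__global__ void scaleByTwo(T *data)
{
    SharedMemory<T> shared;
    T *tile = shared.getPointer(); // resolves to the per-type extern array

    // stage through dynamically sized shared memory, then write back doubled
    tile[threadIdx.x] = data[threadIdx.x];
    __syncthreads();
    data[threadIdx.x] = tile[threadIdx.x] * (T)2;
}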

@@ -26,23 +26,23 @@
 */

/* This sample is a templatized version of the template project.
* It also shows how to correctly templatize dynamically allocated shared
* memory arrays.
* Host code.
*/
 * It also shows how to correctly templatize dynamically allocated shared
 * memory arrays.
 * Host code.
 */

// System includes
#include <stdio.h>
#include <assert.h>
#include <string.h>
#include <math.h>
#include <stdio.h>
#include <string.h>

// CUDA runtime
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>
#include <helper_functions.h>

#ifndef MAX
#define MAX(a, b) (a > b ? a : b)
@@ -58,55 +58,55 @@ int g_TotalFailures = 0;
//! @param g_idata  input data in global memory
//! @param g_odata  output data in global memory
////////////////////////////////////////////////////////////////////////////////
template <class T>
__global__ void testKernel(T *g_idata, T *g_odata) {
  // Shared mem size is determined by the host app at run time
  SharedMemory<T> smem;
  T *sdata = smem.getPointer();
template <class T> __global__ void testKernel(T *g_idata, T *g_odata)
{
    // Shared mem size is determined by the host app at run time
    SharedMemory<T> smem;
    T              *sdata = smem.getPointer();

  // access thread id
  const unsigned int tid = threadIdx.x;
  // access number of threads in this block
  const unsigned int num_threads = blockDim.x;
    // access thread id
    const unsigned int tid = threadIdx.x;
    // access number of threads in this block
    const unsigned int num_threads = blockDim.x;

  // read in input data from global memory
  sdata[tid] = g_idata[tid];
  __syncthreads();
    // read in input data from global memory
    sdata[tid] = g_idata[tid];
    __syncthreads();

  // perform some computations
  sdata[tid] = (T)num_threads * sdata[tid];
  __syncthreads();
    // perform some computations
    sdata[tid] = (T)num_threads * sdata[tid];
    __syncthreads();

  // write data to global memory
  g_odata[tid] = sdata[tid];
    // write data to global memory
    g_odata[tid] = sdata[tid];
}

////////////////////////////////////////////////////////////////////////////////
// declaration, forward
template <class T>
void runTest(int argc, char **argv, int len);
template <class T> void runTest(int argc, char **argv, int len);

template <class T>
void computeGold(T *reference, T *idata, const unsigned int len) {
  const T T_len = static_cast<T>(len);
template <class T> void computeGold(T *reference, T *idata, const unsigned int len)
{
    const T T_len = static_cast<T>(len);

  for (unsigned int i = 0; i < len; ++i) {
    reference[i] = idata[i] * T_len;
  }
    for (unsigned int i = 0; i < len; ++i) {
        reference[i] = idata[i] * T_len;
    }
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  printf("> runTest<float,32>\n");
  runTest<float>(argc, argv, 32);
  printf("> runTest<int,64>\n");
  runTest<int>(argc, argv, 64);
int main(int argc, char **argv)
{
    printf("> runTest<float,32>\n");
    runTest<float>(argc, argv, 32);
    printf("> runTest<int,64>\n");
    runTest<int>(argc, argv, 64);

  printf("\n[simpleTemplates] -> Test Results: %d Failures\n", g_TotalFailures);
    printf("\n[simpleTemplates] -> Test Results: %d Failures\n", g_TotalFailures);

  exit(g_TotalFailures == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
    exit(g_TotalFailures == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
}

// To completely templatize runTest (below) with cutil, we need to use
@@ -114,151 +114,152 @@ int main(int argc, char **argv) {
// functions for different types.

// Here's the generic wrapper for cutCompare*
template <class T>
class ArrayComparator {
 public:
  bool compare(const T *reference, T *data, unsigned int len) {
    fprintf(stderr,
            "Error: no comparison function implemented for this type\n");
    return false;
  }
template <class T> class ArrayComparator
{
public:
    bool compare(const T *reference, T *data, unsigned int len)
    {
        fprintf(stderr, "Error: no comparison function implemented for this type\n");
        return false;
    }
};

// Here's the specialization for ints:
template <>
class ArrayComparator<int> {
 public:
  bool compare(const int *reference, int *data, unsigned int len) {
    return compareData(reference, data, len, 0.15f, 0.0f);
  }
template <> class ArrayComparator<int>
{
public:
    bool compare(const int *reference, int *data, unsigned int len)
    {
        return compareData(reference, data, len, 0.15f, 0.0f);
    }
};

// Here's the specialization for floats:
template <>
class ArrayComparator<float> {
 public:
  bool compare(const float *reference, float *data, unsigned int len) {
    return compareData(reference, data, len, 0.15f, 0.15f);
  }
template <> class ArrayComparator<float>
{
public:
    bool compare(const float *reference, float *data, unsigned int len)
    {
        return compareData(reference, data, len, 0.15f, 0.15f);
    }
};

// Here's the generic wrapper for cutWriteFile*
template <class T>
class ArrayFileWriter {
 public:
  bool write(const char *filename, T *data, unsigned int len, float epsilon) {
    fprintf(stderr,
            "Error: no file write function implemented for this type\n");
    return false;
  }
template <class T> class ArrayFileWriter
{
public:
    bool write(const char *filename, T *data, unsigned int len, float epsilon)
    {
        fprintf(stderr, "Error: no file write function implemented for this type\n");
        return false;
    }
};

// Here's the specialization for ints:
template <>
class ArrayFileWriter<int> {
 public:
  bool write(const char *filename, int *data, unsigned int len, float epsilon) {
    return sdkWriteFile(filename, data, len, epsilon, false);
  }
template <> class ArrayFileWriter<int>
{
public:
    bool write(const char *filename, int *data, unsigned int len, float epsilon)
    {
        return sdkWriteFile(filename, data, len, epsilon, false);
    }
};

// Here's the specialization for floats:
template <>
class ArrayFileWriter<float> {
 public:
  bool write(const char *filename, float *data, unsigned int len,
             float epsilon) {
    return sdkWriteFile(filename, data, len, epsilon, false);
  }
template <> class ArrayFileWriter<float>
{
public:
    bool write(const char *filename, float *data, unsigned int len, float epsilon)
    {
        return sdkWriteFile(filename, data, len, epsilon, false);
    }
};

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
template <class T>
void runTest(int argc, char **argv, int len) {
  int devID;
  cudaDeviceProp deviceProps;
template <class T> void runTest(int argc, char **argv, int len)
{
    int            devID;
    cudaDeviceProp deviceProps;

  devID = findCudaDevice(argc, (const char **)argv);
    devID = findCudaDevice(argc, (const char **)argv);

  // get number of SMs on this GPU
  checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
  printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name,
         deviceProps.multiProcessorCount);
    // get number of SMs on this GPU
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount);

  // create and start timer
  StopWatchInterface *timer = NULL;
  sdkCreateTimer(&timer);
    // create and start timer
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);

  // start the timer
  sdkStartTimer(&timer);
    // start the timer
    sdkStartTimer(&timer);

  unsigned int num_threads = len;
  unsigned int mem_size = sizeof(float) * num_threads;
    unsigned int num_threads = len;
    unsigned int mem_size    = sizeof(float) * num_threads;

  // allocate host memory
  T *h_idata = (T *)malloc(mem_size);
    // allocate host memory
    T *h_idata = (T *)malloc(mem_size);

  // initialize the memory
  for (unsigned int i = 0; i < num_threads; ++i) {
    h_idata[i] = (T)i;
  }
    // initialize the memory
    for (unsigned int i = 0; i < num_threads; ++i) {
        h_idata[i] = (T)i;
    }

  // allocate device memory
  T *d_idata;
  checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size));
  // copy host memory to device
  checkCudaErrors(
      cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));
    // allocate device memory
    T *d_idata;
    checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size));
    // copy host memory to device
    checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

  // allocate device memory for result
  T *d_odata;
  checkCudaErrors(cudaMalloc((void **)&d_odata, mem_size));
    // allocate device memory for result
    T *d_odata;
    checkCudaErrors(cudaMalloc((void **)&d_odata, mem_size));

  // setup execution parameters
  dim3 grid(1, 1, 1);
  dim3 threads(num_threads, 1, 1);
    // setup execution parameters
    dim3 grid(1, 1, 1);
    dim3 threads(num_threads, 1, 1);

  // execute the kernel
  testKernel<T><<<grid, threads, mem_size>>>(d_idata, d_odata);
    // execute the kernel
    testKernel<T><<<grid, threads, mem_size>>>(d_idata, d_odata);

  // check if kernel execution generated an error
  getLastCudaError("Kernel execution failed");
    // check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

  // allocate mem for the result on host side
  T *h_odata = (T *)malloc(mem_size);
  // copy result from device to host
  checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(T) * num_threads,
                             cudaMemcpyDeviceToHost));
    // allocate mem for the result on host side
    T *h_odata = (T *)malloc(mem_size);
    // copy result from device to host
    checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(T) * num_threads, cudaMemcpyDeviceToHost));

  sdkStopTimer(&timer);
  printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
  sdkDeleteTimer(&timer);
    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

  // compute reference solution
  T *reference = (T *)malloc(mem_size);
  computeGold<T>(reference, h_idata, num_threads);
    // compute reference solution
    T *reference = (T *)malloc(mem_size);
    computeGold<T>(reference, h_idata, num_threads);

  ArrayComparator<T> comparator;
  ArrayFileWriter<T> writer;
    ArrayComparator<T> comparator;
    ArrayFileWriter<T> writer;

  // check result
  if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
    // write file for regression test
    writer.write("./data/regression.dat", h_odata, num_threads, 0.0f);
  } else {
    // custom output handling when no regression test running
    // in this case check if the result is equivalent to the expected solution
    bool res = comparator.compare(reference, h_odata, num_threads);
    printf("Compare %s\n\n", (1 == res) ? "OK" : "MISMATCH");
    g_TotalFailures += (1 != res);
  }
    // check result
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
        // write file for regression test
        writer.write("./data/regression.dat", h_odata, num_threads, 0.0f);
    }
    else {
        // custom output handling when no regression test running
        // in this case check if the result is equivalent to the expected solution
        bool res = comparator.compare(reference, h_odata, num_threads);
        printf("Compare %s\n\n", (1 == res) ? "OK" : "MISMATCH");
        g_TotalFailures += (1 != res);
    }

  // cleanup memory
  free(h_idata);
  free(h_odata);
  free(reference);
  checkCudaErrors(cudaFree(d_idata));
  checkCudaErrors(cudaFree(d_odata));
    // cleanup memory
    free(h_idata);
    free(h_odata);
    free(reference);
    checkCudaErrors(cudaFree(d_idata));
    checkCudaErrors(cudaFree(d_odata));
}
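
// A note on the launch above: the third execution-configuration argument in
// testKernel<T><<<grid, threads, mem_size>>> is the dynamic shared-memory
// size in bytes, and it is what backs the extern __shared__ array that
// SharedMemory<T>::getPointer() returns. It therefore has to cover
// num_threads elements of T; mem_size here is computed with sizeof(float),
// which the sample relies on being the same size as int for its two
// instantiations.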

@@ -34,10 +34,10 @@
 */

// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN
@@ -49,22 +49,22 @@
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h>  // includes cuda.h and cuda_runtime_api.h
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h>  // helper functions for CUDA error check
#include <helper_cuda.h> // helper functions for CUDA error check

#define MAX_EPSILON_ERROR 5e-3f

// Define the files that are to be saved and the reference images for validation
const char *imageFilename = "teapot512.pgm";
const char *refFilename = "ref_rotated.pgm";
const char *refFilename   = "ref_rotated.pgm";

const char *sampleName = "simpleTexture";

////////////////////////////////////////////////////////////////////////////////
// Constants
const float angle = 0.5f;  // angle to rotate image by (in radians)
const float angle = 0.5f; // angle to rotate image by (in radians)

// Auto-Verification Code
bool testResult = true;
@@ -73,22 +73,22 @@ bool testResult = true;
//! Transform an image using texture lookups
//! @param outputData  output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void transformKernel(float *outputData, int width, int height,
                                float theta, cudaTextureObject_t tex) {
  // calculate normalized texture coordinates
  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
__global__ void transformKernel(float *outputData, int width, int height, float theta, cudaTextureObject_t tex)
{
    // calculate normalized texture coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

  float u = (float)x - (float)width / 2;
  float v = (float)y - (float)height / 2;
  float tu = u * cosf(theta) - v * sinf(theta);
  float tv = v * cosf(theta) + u * sinf(theta);
    float u  = (float)x - (float)width / 2;
    float v  = (float)y - (float)height / 2;
    float tu = u * cosf(theta) - v * sinf(theta);
    float tv = v * cosf(theta) + u * sinf(theta);

  tu /= (float)width;
  tv /= (float)height;
    tu /= (float)width;
    tv /= (float)height;

  // read from texture and write to global memory
  outputData[y * width + x] = tex2D<float>(tex, tu + 0.5f, tv + 0.5f);
    // read from texture and write to global memory
    outputData[y * width + x] = tex2D<float>(tex, tu + 0.5f, tv + 0.5f);
}
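
// How the kernel above maps pixels: it is a gather (inverse) mapping. Each
// output pixel centers its coordinates (u, v) on the image midpoint, applies
// the 2D rotation
//     tu = u*cos(theta) - v*sin(theta),  tv = v*cos(theta) + u*sin(theta),
// and samples the source texture at that rotated position, which avoids the
// holes a forward (scatter) mapping would leave. Dividing by width/height and
// adding 0.5f converts back to the [0, 1] range expected when
// normalizedCoords is set; cudaFilterModeLinear supplies bilinear
// interpolation at the non-integer sample points, and cudaAddressModeWrap
// handles coordinates that fall outside the image.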

////////////////////////////////////////////////////////////////////////////////
@@ -98,154 +98,151 @@ void runTest(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  printf("%s starting...\n", sampleName);
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

  // Process command-line arguments
  if (argc > 1) {
    if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
      getCmdLineArgumentString(argc, (const char **)argv, "input",
                               (char **)&imageFilename);
    // Process command-line arguments
    if (argc > 1) {
        if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
            getCmdLineArgumentString(argc, (const char **)argv, "input", (char **)&imageFilename);

      if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
        getCmdLineArgumentString(argc, (const char **)argv, "reference",
                                 (char **)&refFilename);
      } else {
        printf("-input flag should be used with -reference flag");
        exit(EXIT_FAILURE);
      }
    } else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
      printf("-reference flag should be used with -input flag");
      exit(EXIT_FAILURE);
            if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
                getCmdLineArgumentString(argc, (const char **)argv, "reference", (char **)&refFilename);
            }
            else {
                printf("-input flag should be used with -reference flag");
                exit(EXIT_FAILURE);
            }
        }
        else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
            printf("-reference flag should be used with -input flag");
            exit(EXIT_FAILURE);
        }
    }
  }

  runTest(argc, argv);
    runTest(argc, argv);

  printf("%s completed, returned %s\n", sampleName,
         testResult ? "OK" : "ERROR!");
  exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) {
  int devID = findCudaDevice(argc, (const char **)argv);
void runTest(int argc, char **argv)
{
    int devID = findCudaDevice(argc, (const char **)argv);

  // load image from disk
  float *hData = NULL;
  unsigned int width, height;
  char *imagePath = sdkFindFilePath(imageFilename, argv[0]);
    // load image from disk
    float       *hData = NULL;
    unsigned int width, height;
    char        *imagePath = sdkFindFilePath(imageFilename, argv[0]);

  if (imagePath == NULL) {
    printf("Unable to source image file: %s\n", imageFilename);
    exit(EXIT_FAILURE);
  }
    if (imagePath == NULL) {
        printf("Unable to source image file: %s\n", imageFilename);
        exit(EXIT_FAILURE);
    }

  sdkLoadPGM(imagePath, &hData, &width, &height);
    sdkLoadPGM(imagePath, &hData, &width, &height);

  unsigned int size = width * height * sizeof(float);
  printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height);
    unsigned int size = width * height * sizeof(float);
    printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height);

  // Load reference image from image (output)
  float *hDataRef = (float *)malloc(size);
  char *refPath = sdkFindFilePath(refFilename, argv[0]);
    // Load reference image from image (output)
    float *hDataRef = (float *)malloc(size);
    char  *refPath  = sdkFindFilePath(refFilename, argv[0]);

  if (refPath == NULL) {
    printf("Unable to find reference image file: %s\n", refFilename);
    exit(EXIT_FAILURE);
  }
    if (refPath == NULL) {
        printf("Unable to find reference image file: %s\n", refFilename);
        exit(EXIT_FAILURE);
    }

  sdkLoadPGM(refPath, &hDataRef, &width, &height);
    sdkLoadPGM(refPath, &hDataRef, &width, &height);

  // Allocate device memory for result
  float *dData = NULL;
  checkCudaErrors(cudaMalloc((void **)&dData, size));
    // Allocate device memory for result
    float *dData = NULL;
    checkCudaErrors(cudaMalloc((void **)&dData, size));

  // Allocate array and copy image data
  cudaChannelFormatDesc channelDesc =
      cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
  cudaArray *cuArray;
  checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height));
  checkCudaErrors(
      cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice));
    // Allocate array and copy image data
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray            *cuArray;
    checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height));
    checkCudaErrors(cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice));

  cudaTextureObject_t tex;
  cudaResourceDesc texRes;
  memset(&texRes, 0, sizeof(cudaResourceDesc));
    cudaTextureObject_t tex;
    cudaResourceDesc    texRes;
    memset(&texRes, 0, sizeof(cudaResourceDesc));

  texRes.resType = cudaResourceTypeArray;
  texRes.res.array.array = cuArray;
    texRes.resType         = cudaResourceTypeArray;
    texRes.res.array.array = cuArray;

  cudaTextureDesc texDescr;
  memset(&texDescr, 0, sizeof(cudaTextureDesc));
    cudaTextureDesc texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

  texDescr.normalizedCoords = true;
  texDescr.filterMode = cudaFilterModeLinear;
  texDescr.addressMode[0] = cudaAddressModeWrap;
  texDescr.addressMode[1] = cudaAddressModeWrap;
  texDescr.readMode = cudaReadModeElementType;
    texDescr.normalizedCoords = true;
    texDescr.filterMode       = cudaFilterModeLinear;
    texDescr.addressMode[0]   = cudaAddressModeWrap;
    texDescr.addressMode[1]   = cudaAddressModeWrap;
    texDescr.readMode         = cudaReadModeElementType;

  checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
    checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));

  dim3 dimBlock(8, 8, 1);
  dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
    dim3 dimBlock(8, 8, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

  // Warmup
  transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex);
    // Warmup
    transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex);

  checkCudaErrors(cudaDeviceSynchronize());
  StopWatchInterface *timer = NULL;
  sdkCreateTimer(&timer);
  sdkStartTimer(&timer);
    checkCudaErrors(cudaDeviceSynchronize());
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

  // Execute the kernel
  transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex);
    // Execute the kernel
    transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex);

  // Check if kernel execution generated an error
  getLastCudaError("Kernel execution failed");
    // Check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

  checkCudaErrors(cudaDeviceSynchronize());
  sdkStopTimer(&timer);
  printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
  printf("%.2f Mpixels/sec\n",
         (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
  sdkDeleteTimer(&timer);
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    printf("%.2f Mpixels/sec\n", (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
    sdkDeleteTimer(&timer);

  // Allocate mem for the result on host side
  float *hOutputData = (float *)malloc(size);
  // copy result from device to host
  checkCudaErrors(cudaMemcpy(hOutputData, dData, size, cudaMemcpyDeviceToHost));
    // Allocate mem for the result on host side
    float *hOutputData = (float *)malloc(size);
    // copy result from device to host
    checkCudaErrors(cudaMemcpy(hOutputData, dData, size, cudaMemcpyDeviceToHost));

  // Write result to file
  char outputFilename[1024];
  strcpy(outputFilename, imagePath);
  strcpy(outputFilename + strlen(imagePath) - 4, "_out.pgm");
  sdkSavePGM(outputFilename, hOutputData, width, height);
  printf("Wrote '%s'\n", outputFilename);
    // Write result to file
    char outputFilename[1024];
    strcpy(outputFilename, imagePath);
    strcpy(outputFilename + strlen(imagePath) - 4, "_out.pgm");
    sdkSavePGM(outputFilename, hOutputData, width, height);
    printf("Wrote '%s'\n", outputFilename);

  // Write regression file if necessary
  if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
    // Write file for regression test
    sdkWriteFile<float>("./data/regression.dat", hOutputData, width * height,
                        0.0f, false);
  } else {
    // We need to reload the data from disk,
    // because it is inverted upon output
    sdkLoadPGM(outputFilename, &hOutputData, &width, &height);
    // Write regression file if necessary
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
        // Write file for regression test
        sdkWriteFile<float>("./data/regression.dat", hOutputData, width * height, 0.0f, false);
    }
    else {
        // We need to reload the data from disk,
        // because it is inverted upon output
        sdkLoadPGM(outputFilename, &hOutputData, &width, &height);

    printf("Comparing files\n");
    printf("\toutput:    <%s>\n", outputFilename);
    printf("\treference: <%s>\n", refPath);
        printf("Comparing files\n");
        printf("\toutput:    <%s>\n", outputFilename);
        printf("\treference: <%s>\n", refPath);

    testResult = compareData(hOutputData, hDataRef, width * height,
                             MAX_EPSILON_ERROR, 0.15f);
  }
        testResult = compareData(hOutputData, hDataRef, width * height, MAX_EPSILON_ERROR, 0.15f);
    }

  checkCudaErrors(cudaDestroyTextureObject(tex));
  checkCudaErrors(cudaFree(dData));
  checkCudaErrors(cudaFreeArray(cuArray));
  free(imagePath);
  free(refPath);
    checkCudaErrors(cudaDestroyTextureObject(tex));
    checkCudaErrors(cudaFree(dData));
    checkCudaErrors(cudaFreeArray(cuArray));
    free(imagePath);
    free(refPath);
}
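
// A porting note on the copy into cuArray above: cudaMemcpyToArray has been
// deprecated since CUDA 10.1. A minimal equivalent with the current API,
// assuming a tightly packed host buffer, would be:
//
//     checkCudaErrors(cudaMemcpy2DToArray(cuArray, 0, 0, hData,
//                                         width * sizeof(float), // source pitch
//                                         width * sizeof(float), // bytes per row
//                                         height, cudaMemcpyHostToDevice));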
 | 
			
		||||
 | 
			
		||||
@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
 | 
			
		||||
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
 | 
			
		||||
 | 
			
		||||
## References (for more details)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -32,11 +32,11 @@
 | 
			
		||||
  using 3D texture lookups.
 | 
			
		||||
*/
 | 
			
		||||
 | 
			
		||||
#include <stdlib.h>
 | 
			
		||||
#include <stdio.h>
 | 
			
		||||
#include <string.h>
 | 
			
		||||
#include <math.h>
 | 
			
		||||
#include <helper_gl.h>
 | 
			
		||||
#include <math.h>
 | 
			
		||||
#include <stdio.h>
 | 
			
		||||
#include <stdlib.h>
 | 
			
		||||
#include <string.h>
 | 
			
		||||
 | 
			
		||||
#if defined(__APPLE__) || defined(MACOSX)
 | 
			
		||||
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
 | 
			
		||||
@ -49,53 +49,52 @@
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
// includes, cuda
 | 
			
		||||
#include <vector_types.h>
 | 
			
		||||
#include <cuda_runtime.h>
 | 
			
		||||
#include <cuda_gl_interop.h>
 | 
			
		||||
#include <cuda_runtime.h>
 | 
			
		||||
#include <vector_types.h>
 | 
			
		||||
 | 
			
		||||
// CUDA utilities and system includes
 | 
			
		||||
#include <helper_cuda.h>
 | 
			
		||||
#include <helper_functions.h>
 | 
			
		||||
#include <vector_types.h>
 | 
			
		||||
 | 
			
		||||
typedef unsigned int uint;
 | 
			
		||||
typedef unsigned int  uint;
 | 
			
		||||
typedef unsigned char uchar;
 | 
			
		||||
 | 
			
		||||
#define MAX_EPSILON_ERROR 5.0f
 | 
			
		||||
#define THRESHOLD 0.15f
 | 
			
		||||
#define THRESHOLD         0.15f
 | 
			
		||||
 | 
			
		||||
const char *sSDKsample = "simpleTexture3D";
 | 
			
		||||
 | 
			
		||||
const char *volumeFilename = "Bucky.raw";
 | 
			
		||||
const cudaExtent volumeSize = make_cudaExtent(32, 32, 32);
 | 
			
		||||
const char      *volumeFilename = "Bucky.raw";
 | 
			
		||||
const cudaExtent volumeSize     = make_cudaExtent(32, 32, 32);
 | 
			
		||||
 | 
			
		||||
const uint width = 512, height = 512;
 | 
			
		||||
const dim3 blockSize(16, 16, 1);
 | 
			
		||||
const dim3 gridSize(width / blockSize.x, height / blockSize.y);
 | 
			
		||||

float w = 0.5;  // texture coordinate in z
float w = 0.5; // texture coordinate in z

GLuint pbo;  // OpenGL pixel buffer object
struct cudaGraphicsResource
    *cuda_pbo_resource;  // CUDA Graphics Resource (to transfer PBO)
GLuint                       pbo;               // OpenGL pixel buffer object
struct cudaGraphicsResource *cuda_pbo_resource; // CUDA Graphics Resource (to transfer PBO)

bool linearFiltering = true;
bool animate = true;
bool animate         = true;

StopWatchInterface *timer = NULL;

uint *d_output = NULL;

// Auto-Verification Code
const int frameCheckNumber = 4;
int fpsCount = 0;  // FPS count for averaging
int fpsLimit = 1;  // FPS limit for sampling
int g_Index = 0;
unsigned int frameCount = 0;
unsigned int g_TotalErrors = 0;
const int    frameCheckNumber  = 4;
int          fpsCount          = 0; // FPS count for averaging
int          fpsLimit          = 1; // FPS limit for sampling
int          g_Index           = 0;
unsigned int frameCount        = 0;
unsigned int g_TotalErrors     = 0;
volatile int g_GraphicsMapFlag = 0;

int *pArgc = NULL;
int   *pArgc = NULL;
char **pArgv = NULL;

#ifndef MAX
@@ -105,288 +104,294 @@ char **pArgv = NULL;
extern "C" void cleanup();
 | 
			
		||||
extern "C" void setTextureFilterMode(bool bLinearFilter);
 | 
			
		||||
extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize);
 | 
			
		||||
extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output,
 | 
			
		||||
                              uint imageW, uint imageH, float w);
 | 
			
		||||
extern void cleanupCuda();
 | 
			
		||||
extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, float w);
 | 
			
		||||
extern void     cleanupCuda();
 | 
			
		||||
 | 
			
		||||
void loadVolumeData(char *exec_path);
 | 
			
		||||
 | 
			
		||||
void computeFPS() {
 | 
			
		||||
  frameCount++;
 | 
			
		||||
  fpsCount++;
 | 
			
		||||
void computeFPS()
 | 
			
		||||
{
 | 
			
		||||
    frameCount++;
 | 
			
		||||
    fpsCount++;
 | 
			
		||||
 | 
			
		||||
  if (fpsCount == fpsLimit) {
 | 
			
		||||
    char fps[256];
 | 
			
		||||
    float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f);
 | 
			
		||||
    sprintf(fps, "%s: %3.1f fps", sSDKsample, ifps);
 | 
			
		||||
    if (fpsCount == fpsLimit) {
 | 
			
		||||
        char  fps[256];
 | 
			
		||||
        float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f);
 | 
			
		||||
        sprintf(fps, "%s: %3.1f fps", sSDKsample, ifps);
 | 
			
		||||
 | 
			
		||||
    glutSetWindowTitle(fps);
 | 
			
		||||
    fpsCount = 0;
 | 
			
		||||
        glutSetWindowTitle(fps);
 | 
			
		||||
        fpsCount = 0;
 | 
			
		||||
 | 
			
		||||
    fpsLimit = ftoi(MAX(1.0f, ifps));
 | 
			
		||||
    sdkResetTimer(&timer);
 | 
			
		||||
  }
 | 
			
		||||
        fpsLimit = ftoi(MAX(1.0f, ifps));
 | 
			
		||||
        sdkResetTimer(&timer);
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
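// Note: computeFPS() resets fpsLimit to the measured frame rate after each
// report, so sdkGetAverageTimerValue() is averaged over roughly one second of
// frames between window-title updates.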

// render image using CUDA
void render() {
  // map PBO to get CUDA device pointer
  g_GraphicsMapFlag++;
  checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
  size_t num_bytes;
  checkCudaErrors(cudaGraphicsResourceGetMappedPointer(
      (void **)&d_output, &num_bytes, cuda_pbo_resource));
  // printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes);
void render()
{
    // map PBO to get CUDA device pointer
    g_GraphicsMapFlag++;
    checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
    size_t num_bytes;
    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, cuda_pbo_resource));
    // printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes);

  // call CUDA kernel, writing results to PBO
  render_kernel(gridSize, blockSize, d_output, width, height, w);
    // call CUDA kernel, writing results to PBO
    render_kernel(gridSize, blockSize, d_output, width, height, w);

  getLastCudaError("render_kernel failed");
    getLastCudaError("render_kernel failed");

  if (g_GraphicsMapFlag) {
    checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));
    g_GraphicsMapFlag--;
  }
    if (g_GraphicsMapFlag) {
        checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));
        g_GraphicsMapFlag--;
    }
}
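// Note: each cudaGraphicsMapResources() call must be balanced by a
// cudaGraphicsUnmapResources() call before OpenGL touches the PBO again; the
// mapped device pointer is only valid between the two. g_GraphicsMapFlag counts
// the outstanding mapping so cleanup() can unmap safely on early exit.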

// display results using OpenGL (called by GLUT)
void display() {
  sdkStartTimer(&timer);
void display()
{
    sdkStartTimer(&timer);

  render();
    render();

  // display results
  glClear(GL_COLOR_BUFFER_BIT);
    // display results
    glClear(GL_COLOR_BUFFER_BIT);

  // draw image from PBO
  glDisable(GL_DEPTH_TEST);
  glRasterPos2i(0, 0);
  glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
  glDrawPixels(width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0);
  glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
    // draw image from PBO
    glDisable(GL_DEPTH_TEST);
    glRasterPos2i(0, 0);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
    glDrawPixels(width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

  glutSwapBuffers();
  glutReportErrors();
    glutSwapBuffers();
    glutReportErrors();

  sdkStopTimer(&timer);
  computeFPS();
    sdkStopTimer(&timer);
    computeFPS();
}

void idle() {
  if (animate) {
    w += 0.01f;
    glutPostRedisplay();
  }
void idle()
{
    if (animate) {
        w += 0.01f;
        glutPostRedisplay();
    }
}

void keyboard(unsigned char key, int x, int y) {
  switch (key) {
void keyboard(unsigned char key, int x, int y)
{
    switch (key) {
    case 27:
#if defined(__APPLE__) || defined(MACOSX)
      exit(EXIT_SUCCESS);
      glutDestroyWindow(glutGetWindow());
      return;
        exit(EXIT_SUCCESS);
        glutDestroyWindow(glutGetWindow());
        return;
#else
      glutDestroyWindow(glutGetWindow());
      return;
        glutDestroyWindow(glutGetWindow());
        return;
#endif

    case '=':
    case '+':
      w += 0.01f;
      break;
        w += 0.01f;
        break;

    case '-':
      w -= 0.01f;
      break;
        w -= 0.01f;
        break;

    case 'f':
      linearFiltering = !linearFiltering;
      setTextureFilterMode(linearFiltering);
      break;
        linearFiltering = !linearFiltering;
        setTextureFilterMode(linearFiltering);
        break;

    case ' ':
      animate = !animate;
      break;
        animate = !animate;
        break;

    default:
      break;
  }
        break;
    }

  glutPostRedisplay();
    glutPostRedisplay();
}

void reshape(int x, int y) {
  glViewport(0, 0, x, y);
void reshape(int x, int y)
{
    glViewport(0, 0, x, y);

  glMatrixMode(GL_MODELVIEW);
  glLoadIdentity();
    glMatrixMode(GL_MODELVIEW);
    glLoadIdentity();

  glMatrixMode(GL_PROJECTION);
  glLoadIdentity();
  glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
    glMatrixMode(GL_PROJECTION);
    glLoadIdentity();
    glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
}

void cleanup() {
  sdkDeleteTimer(&timer);
void cleanup()
{
    sdkDeleteTimer(&timer);

  // add extra check to unmap the resource before unregistering it
  if (g_GraphicsMapFlag) {
    checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));
    g_GraphicsMapFlag--;
  }
    // add extra check to unmap the resource before unregistering it
    if (g_GraphicsMapFlag) {
        checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));
        g_GraphicsMapFlag--;
    }

  // unregister this buffer object from CUDA C
  checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource));
  glDeleteBuffers(1, &pbo);
  cleanupCuda();
    // unregister this buffer object from CUDA C
    checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource));
    glDeleteBuffers(1, &pbo);
    cleanupCuda();
}

void initGLBuffers() {
  // create pixel buffer object
  glGenBuffers(1, &pbo);
  glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
  glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(GLubyte) * 4,
               0, GL_STREAM_DRAW_ARB);
  glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
void initGLBuffers()
{
    // create pixel buffer object
    glGenBuffers(1, &pbo);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
    glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(GLubyte) * 4, 0, GL_STREAM_DRAW_ARB);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

  // register this buffer object with CUDA
  checkCudaErrors(cudaGraphicsGLRegisterBuffer(
      &cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard));
    // register this buffer object with CUDA
    checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard));
}

// Load raw data from disk
uchar *loadRawFile(const char *filename, size_t size) {
  FILE *fp = fopen(filename, "rb");
uchar *loadRawFile(const char *filename, size_t size)
{
    FILE *fp = fopen(filename, "rb");

  if (!fp) {
    fprintf(stderr, "Error opening file '%s'\n", filename);
    return 0;
  }
    if (!fp) {
        fprintf(stderr, "Error opening file '%s'\n", filename);
        return 0;
    }

  uchar *data = (uchar *)malloc(size);
  size_t read = fread(data, 1, size, fp);
  fclose(fp);
    uchar *data = (uchar *)malloc(size);
    size_t read = fread(data, 1, size, fp);
    fclose(fp);

  printf("Read '%s', %zu bytes\n", filename, read);
    printf("Read '%s', %zu bytes\n", filename, read);

  return data;
    return data;
}

void initGL(int *argc, char **argv) {
  // initialize GLUT callback functions
  glutInit(argc, argv);
  glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE);
  glutInitWindowSize(width, height);
  glutCreateWindow("CUDA 3D texture");
  glutDisplayFunc(display);
  glutKeyboardFunc(keyboard);
  glutReshapeFunc(reshape);
  glutIdleFunc(idle);
void initGL(int *argc, char **argv)
{
    // initialize GLUT callback functions
    glutInit(argc, argv);
    glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE);
    glutInitWindowSize(width, height);
    glutCreateWindow("CUDA 3D texture");
    glutDisplayFunc(display);
    glutKeyboardFunc(keyboard);
    glutReshapeFunc(reshape);
    glutIdleFunc(idle);

  if (!isGLVersionSupported(2, 0) ||
      !areGLExtensionsSupported("GL_ARB_pixel_buffer_object")) {
    fprintf(stderr, "Required OpenGL extensions are missing.");
    exit(EXIT_FAILURE);
  }
    if (!isGLVersionSupported(2, 0) || !areGLExtensionsSupported("GL_ARB_pixel_buffer_object")) {
        fprintf(stderr, "Required OpenGL extensions are missing.");
        exit(EXIT_FAILURE);
    }
}

void runAutoTest(const char *ref_file, char *exec_path) {
  checkCudaErrors(
      cudaMalloc((void **)&d_output, width * height * sizeof(GLubyte) * 4));
void runAutoTest(const char *ref_file, char *exec_path)
{
    checkCudaErrors(cudaMalloc((void **)&d_output, width * height * sizeof(GLubyte) * 4));

  // render the volumeData
  render_kernel(gridSize, blockSize, d_output, width, height, w);
    // render the volumeData
    render_kernel(gridSize, blockSize, d_output, width, height, w);

  checkCudaErrors(cudaDeviceSynchronize());
  getLastCudaError("render_kernel failed");
    checkCudaErrors(cudaDeviceSynchronize());
    getLastCudaError("render_kernel failed");

  void *h_output = malloc(width * height * sizeof(GLubyte) * 4);
  checkCudaErrors(cudaMemcpy(h_output, d_output,
                             width * height * sizeof(GLubyte) * 4,
                             cudaMemcpyDeviceToHost));
  sdkDumpBin(h_output, width * height * sizeof(GLubyte) * 4,
             "simpleTexture3D.bin");
    void *h_output = malloc(width * height * sizeof(GLubyte) * 4);
    checkCudaErrors(cudaMemcpy(h_output, d_output, width * height * sizeof(GLubyte) * 4, cudaMemcpyDeviceToHost));
    sdkDumpBin(h_output, width * height * sizeof(GLubyte) * 4, "simpleTexture3D.bin");

  bool bTestResult = sdkCompareBin2BinFloat(
      "simpleTexture3D.bin", sdkFindFilePath(ref_file, exec_path),
      width * height, MAX_EPSILON_ERROR, THRESHOLD, exec_path);
    bool bTestResult = sdkCompareBin2BinFloat("simpleTexture3D.bin",
                                              sdkFindFilePath(ref_file, exec_path),
                                              width * height,
                                              MAX_EPSILON_ERROR,
                                              THRESHOLD,
                                              exec_path);

  checkCudaErrors(cudaFree(d_output));
  free(h_output);
    checkCudaErrors(cudaFree(d_output));
    free(h_output);

  sdkStopTimer(&timer);
  sdkDeleteTimer(&timer);
    sdkStopTimer(&timer);
    sdkDeleteTimer(&timer);

  exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

void loadVolumeData(char *exec_path) {
  // load volume data
  const char *path = sdkFindFilePath(volumeFilename, exec_path);
void loadVolumeData(char *exec_path)
{
    // load volume data
    const char *path = sdkFindFilePath(volumeFilename, exec_path);

  if (path == NULL) {
    fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n",
            volumeFilename);
    exit(EXIT_FAILURE);
  }
    if (path == NULL) {
        fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n", volumeFilename);
        exit(EXIT_FAILURE);
    }

  size_t size = volumeSize.width * volumeSize.height * volumeSize.depth;
  uchar *h_volume = loadRawFile(path, size);
    size_t size     = volumeSize.width * volumeSize.height * volumeSize.depth;
    uchar *h_volume = loadRawFile(path, size);

  initCuda(h_volume, volumeSize);
  sdkCreateTimer(&timer);
    initCuda(h_volume, volumeSize);
    sdkCreateTimer(&timer);

  free(h_volume);
    free(h_volume);
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  pArgc = &argc;
  pArgv = argv;
int main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

  char *ref_file = NULL;
    char *ref_file = NULL;

#if defined(__linux__)
  setenv("DISPLAY", ":0", 0);
    setenv("DISPLAY", ":0", 0);
#endif

  printf("%s Starting...\n\n", sSDKsample);
    printf("%s Starting...\n\n", sSDKsample);

  if (checkCmdLineFlag(argc, (const char **)argv, "file")) {
    fpsLimit = frameCheckNumber;
    getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
  }
    if (checkCmdLineFlag(argc, (const char **)argv, "file")) {
        fpsLimit = frameCheckNumber;
        getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
    }

  // use command-line specified CUDA device, otherwise use device with highest
  // Gflops/s
  findCudaDevice(argc, (const char **)argv);
    // use command-line specified CUDA device, otherwise use device with highest
    // Gflops/s
    findCudaDevice(argc, (const char **)argv);

  if (ref_file) {
    loadVolumeData(argv[0]);
    runAutoTest(ref_file, argv[0]);
  } else {
    initGL(&argc, argv);
    if (ref_file) {
        loadVolumeData(argv[0]);
        runAutoTest(ref_file, argv[0]);
    }
    else {
        initGL(&argc, argv);

    // OpenGL buffers
    initGLBuffers();
        // OpenGL buffers
        initGLBuffers();

    loadVolumeData(argv[0]);
  }
        loadVolumeData(argv[0]);
    }

  printf(
      "Press space to toggle animation\n"
      "Press '+' and '-' to change displayed slice\n");
    printf("Press space to toggle animation\n"
           "Press '+' and '-' to change displayed slice\n");

#if defined(__APPLE__) || defined(MACOSX)
  atexit(cleanup);
    atexit(cleanup);
#else
  glutCloseFunc(cleanup);
    glutCloseFunc(cleanup);
#endif

  glutMainLoop();
    glutMainLoop();

  exit(EXIT_SUCCESS);
    exit(EXIT_SUCCESS);
}

@@ -28,111 +28,111 @@
#ifndef _SIMPLETEXTURE3D_KERNEL_CU_
#define _SIMPLETEXTURE3D_KERNEL_CU_

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#include <helper_cuda.h>
#include <helper_math.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef unsigned int uint;
typedef unsigned int  uint;
typedef unsigned char uchar;

cudaArray *d_volumeArray = 0;
cudaTextureObject_t tex;  // 3D texture
cudaArray          *d_volumeArray = 0;
cudaTextureObject_t tex; // 3D texture

__global__ void d_render(uint *d_output, uint imageW, uint imageH, float w,
                         cudaTextureObject_t texObj) {
  uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
  uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
__global__ void d_render(uint *d_output, uint imageW, uint imageH, float w, cudaTextureObject_t texObj)
{
    uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
    uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;

  float u = x / (float)imageW;
  float v = y / (float)imageH;
  // read from 3D texture
  float voxel = tex3D<float>(texObj, u, v, w);
    float u = x / (float)imageW;
    float v = y / (float)imageH;
    // read from 3D texture
    float voxel = tex3D<float>(texObj, u, v, w);

  if ((x < imageW) && (y < imageH)) {
    // write output color
    uint i = __umul24(y, imageW) + x;
    d_output[i] = voxel * 255;
  }
    if ((x < imageW) && (y < imageH)) {
        // write output color
        uint i      = __umul24(y, imageW) + x;
        d_output[i] = voxel * 255;
    }
}
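// Note: the texture is created with cudaReadModeNormalizedFloat, so tex3D
// returns the uchar voxel converted to a float in [0, 1] (interpolated when
// linear filtering is enabled); multiplying by 255 maps it back to a byte-range
// intensity for the output buffer. The bounds check keeps out-of-range threads
// from writing, even though they still perform the texture fetch.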

extern "C" void setTextureFilterMode(bool bLinearFilter) {
  if (tex) {
    checkCudaErrors(cudaDestroyTextureObject(tex));
  }
  cudaResourceDesc texRes;
  memset(&texRes, 0, sizeof(cudaResourceDesc));
extern "C" void setTextureFilterMode(bool bLinearFilter)
{
    if (tex) {
        checkCudaErrors(cudaDestroyTextureObject(tex));
    }
    cudaResourceDesc texRes;
    memset(&texRes, 0, sizeof(cudaResourceDesc));

  texRes.resType = cudaResourceTypeArray;
  texRes.res.array.array = d_volumeArray;
    texRes.resType         = cudaResourceTypeArray;
    texRes.res.array.array = d_volumeArray;

  cudaTextureDesc texDescr;
  memset(&texDescr, 0, sizeof(cudaTextureDesc));
    cudaTextureDesc texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

  texDescr.normalizedCoords = true;
  texDescr.filterMode =
      bLinearFilter ? cudaFilterModeLinear : cudaFilterModePoint;
  texDescr.addressMode[0] = cudaAddressModeWrap;
  texDescr.addressMode[1] = cudaAddressModeWrap;
  texDescr.addressMode[2] = cudaAddressModeWrap;
  texDescr.readMode = cudaReadModeNormalizedFloat;
    texDescr.normalizedCoords = true;
    texDescr.filterMode       = bLinearFilter ? cudaFilterModeLinear : cudaFilterModePoint;
    texDescr.addressMode[0] = cudaAddressModeWrap;
    texDescr.addressMode[1] = cudaAddressModeWrap;
    texDescr.addressMode[2] = cudaAddressModeWrap;
    texDescr.readMode       = cudaReadModeNormalizedFloat;

  checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
    checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
}
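// Note: a texture object is immutable once created, so toggling the filter mode
// means destroying the old cudaTextureObject_t and creating a new one over the
// same underlying cudaArray, which is what setTextureFilterMode() does above.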

extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize) {
  // create 3D array
  cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();
  checkCudaErrors(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));
extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize)
{
    // create 3D array
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();
    checkCudaErrors(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));

  // copy data to 3D array
  cudaMemcpy3DParms copyParams = {0};
  copyParams.srcPtr =
      make_cudaPitchedPtr((void *)h_volume, volumeSize.width * sizeof(uchar),
                          volumeSize.width, volumeSize.height);
  copyParams.dstArray = d_volumeArray;
  copyParams.extent = volumeSize;
  copyParams.kind = cudaMemcpyHostToDevice;
  checkCudaErrors(cudaMemcpy3D(&copyParams));
    // copy data to 3D array
    cudaMemcpy3DParms copyParams = {0};
    copyParams.srcPtr =
        make_cudaPitchedPtr((void *)h_volume, volumeSize.width * sizeof(uchar), volumeSize.width, volumeSize.height);
    copyParams.dstArray = d_volumeArray;
    copyParams.extent   = volumeSize;
    copyParams.kind     = cudaMemcpyHostToDevice;
    checkCudaErrors(cudaMemcpy3D(&copyParams));

  cudaResourceDesc texRes;
  memset(&texRes, 0, sizeof(cudaResourceDesc));
    cudaResourceDesc texRes;
    memset(&texRes, 0, sizeof(cudaResourceDesc));

  texRes.resType = cudaResourceTypeArray;
  texRes.res.array.array = d_volumeArray;
    texRes.resType         = cudaResourceTypeArray;
    texRes.res.array.array = d_volumeArray;

  cudaTextureDesc texDescr;
  memset(&texDescr, 0, sizeof(cudaTextureDesc));
    cudaTextureDesc texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

  // access with normalized texture coordinates
  texDescr.normalizedCoords = true;
  // linear interpolation
  texDescr.filterMode = cudaFilterModeLinear;
  // wrap texture coordinates
  texDescr.addressMode[0] = cudaAddressModeWrap;
  texDescr.addressMode[1] = cudaAddressModeWrap;
  texDescr.addressMode[2] = cudaAddressModeWrap;
  texDescr.readMode = cudaReadModeNormalizedFloat;
    // access with normalized texture coordinates
    texDescr.normalizedCoords = true;
    // linear interpolation
    texDescr.filterMode = cudaFilterModeLinear;
    // wrap texture coordinates
    texDescr.addressMode[0] = cudaAddressModeWrap;
    texDescr.addressMode[1] = cudaAddressModeWrap;
    texDescr.addressMode[2] = cudaAddressModeWrap;
    texDescr.readMode       = cudaReadModeNormalizedFloat;

  checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
    checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
}
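// Note: make_cudaPitchedPtr(ptr, pitch, xsize, ysize) describes the source of
// the cudaMemcpy3D: pitch is the row stride in bytes (tightly packed here at
// volumeSize.width * sizeof(uchar)), xsize the row width in elements, and ysize
// the number of rows per slice.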

extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output,
                              uint imageW, uint imageH, float w) {
  d_render<<<gridSize, blockSize>>>(d_output, imageW, imageH, w, tex);
extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, float w)
{
    d_render<<<gridSize, blockSize>>>(d_output, imageW, imageH, w, tex);
}

void cleanupCuda() {
  if (tex) {
    checkCudaErrors(cudaDestroyTextureObject(tex));
  }
  if (d_volumeArray) {
    checkCudaErrors(cudaFreeArray(d_volumeArray));
  }
void cleanupCuda()
{
    if (tex) {
        checkCudaErrors(cudaDestroyTextureObject(tex));
    }
    if (d_volumeArray) {
        checkCudaErrors(cudaFreeArray(d_volumeArray));
    }
}

#endif  // #ifndef _SIMPLETEXTURE3D_KERNEL_CU_
#endif // #ifndef _SIMPLETEXTURE3D_KERNEL_CU_

@@ -26,29 +26,29 @@
 */

/*
* This sample demonstrates how to use texture fetches in CUDA
*
* This sample takes an input PGM image (image_filename) and generates
* an output PGM image (image_filename_out).  This CUDA kernel performs
* a simple 2D transform (rotation) on the texture coordinates (u,v).
* The results between simpleTexture and simpleTextureDrv are identical.
* The main difference is the implementation.  simpleTextureDrv makes calls
* to the CUDA driver API and demonstrates how to use cuModuleLoad to load
* the CUDA ptx (*.ptx) kernel just prior to kernel launch.
*
*/
 * This sample demonstrates how to use texture fetches in CUDA
 *
 * This sample takes an input PGM image (image_filename) and generates
 * an output PGM image (image_filename_out).  This CUDA kernel performs
 * a simple 2D transform (rotation) on the texture coordinates (u,v).
 * The results between simpleTexture and simpleTextureDrv are identical.
 * The main difference is the implementation.  simpleTextureDrv makes calls
 * to the CUDA driver API and demonstrates how to use cuModuleLoad to load
 * the CUDA ptx (*.ptx) kernel just prior to kernel launch.
 *
 */
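// In outline, the Driver API flow this file implements is (a sketch only;
// fatbinImage, gx/gy, bx/by and args stand in for values computed later in
// this file):
//
//     cuInit(0);
//     CUdevice dev;
//     cuDeviceGet(&dev, 0);
//     CUcontext ctx;
//     cuCtxCreate(&ctx, 0, dev);
//     CUmodule mod;
//     cuModuleLoadData(&mod, fatbinImage);
//     CUfunction fn;
//     cuModuleGetFunction(&fn, mod, "transformKernel");
//     cuLaunchKernel(fn, gx, gy, 1, bx, by, 1, 0, NULL, args, NULL);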

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <iostream>
#include <cstring>
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes, CUDA
#include <cuda.h>
#include <builtin_types.h>
#include <cuda.h>
// includes, project
#include <helper_cuda_drvapi.h>
#include <helper_functions.h>
@@ -56,8 +56,8 @@
using namespace std;

const char *image_filename = "teapot512.pgm";
const char *ref_filename = "ref_rotated.pgm";
float angle = 0.5f;  // angle to rotate image by (in radians)
const char *ref_filename   = "ref_rotated.pgm";
float       angle          = 0.5f; // angle to rotate image by (in radians)

#define MIN_EPSILON_ERROR 5e-3f

@@ -65,8 +65,7 @@ float angle = 0.5f;  // angle to rotate image by (in radians)
// declaration, forward
void runTest(int argc, char **argv);

extern "C" void computeGold(float *reference, float *idata,
                            const unsigned int len);
extern "C" void computeGold(float *reference, float *idata, const unsigned int len);

static CUresult initCUDA(int argc, char **argv, CUfunction *);

@@ -80,212 +79,227 @@ const char *sSDKsample = "simpleTextureDrv (Driver API)";
////////////////////////////////////////////////////////////////////////////////
// Globals
////////////////////////////////////////////////////////////////////////////////
CUdevice cuDevice;
CUdevice  cuDevice;
CUcontext cuContext;
CUmodule cuModule;
CUmodule  cuModule;

void showHelp() {
  printf("\n> [%s] Command line options\n", sSDKsample);
  printf("\t-device=n          (where n=0,1,2.... for the GPU device)\n\n");
void showHelp()
{
    printf("\n> [%s] Command line options\n", sSDKsample);
    printf("\t-device=n          (where n=0,1,2.... for the GPU device)\n\n");
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
    showHelp();
    return 0;
  }
int main(int argc, char **argv)
{
    if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
        showHelp();
        return 0;
    }

  runTest(argc, argv);
    runTest(argc, argv);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) {
  bool bTestResults = true;
void runTest(int argc, char **argv)
{
    bool bTestResults = true;

  // initialize CUDA
  CUfunction transform = NULL;
    // initialize CUDA
    CUfunction transform = NULL;

  if (initCUDA(argc, argv, &transform) != CUDA_SUCCESS) {
    exit(EXIT_FAILURE);
  }
    if (initCUDA(argc, argv, &transform) != CUDA_SUCCESS) {
        exit(EXIT_FAILURE);
    }

  // load image from disk
  float *h_data = NULL;
  unsigned int width, height;
  char *image_path = sdkFindFilePath(image_filename, argv[0]);
    // load image from disk
    float       *h_data = NULL;
    unsigned int width, height;
    char        *image_path = sdkFindFilePath(image_filename, argv[0]);

  if (image_path == NULL) {
    printf("Unable to find image file: '%s'\n", image_filename);
    exit(EXIT_FAILURE);
  }
    if (image_path == NULL) {
        printf("Unable to find image file: '%s'\n", image_filename);
        exit(EXIT_FAILURE);
    }

  sdkLoadPGM(image_path, &h_data, &width, &height);
    sdkLoadPGM(image_path, &h_data, &width, &height);

  size_t size = width * height * sizeof(float);
  printf("Loaded '%s', %d x %d pixels\n", image_filename, width, height);
    size_t size = width * height * sizeof(float);
    printf("Loaded '%s', %d x %d pixels\n", image_filename, width, height);

  // load reference image from image (output)
  float *h_data_ref = (float *)malloc(size);
  char *ref_path = sdkFindFilePath(ref_filename, argv[0]);
    // load reference image from image (output)
    float *h_data_ref = (float *)malloc(size);
    char  *ref_path   = sdkFindFilePath(ref_filename, argv[0]);

  if (ref_path == NULL) {
    printf("Unable to find reference file %s\n", ref_filename);
    exit(EXIT_FAILURE);
  }
    if (ref_path == NULL) {
        printf("Unable to find reference file %s\n", ref_filename);
        exit(EXIT_FAILURE);
    }

  sdkLoadPGM(ref_path, &h_data_ref, &width, &height);
    sdkLoadPGM(ref_path, &h_data_ref, &width, &height);

  // allocate device memory for result
  CUdeviceptr d_data = (CUdeviceptr)NULL;
  checkCudaErrors(cuMemAlloc(&d_data, size));
    // allocate device memory for result
    CUdeviceptr d_data = (CUdeviceptr)NULL;
    checkCudaErrors(cuMemAlloc(&d_data, size));

  // allocate array and copy image data
  CUarray cu_array;
  CUDA_ARRAY_DESCRIPTOR desc;
  desc.Format = CU_AD_FORMAT_FLOAT;
  desc.NumChannels = 1;
  desc.Width = width;
  desc.Height = height;
  checkCudaErrors(cuArrayCreate(&cu_array, &desc));
  CUDA_MEMCPY2D copyParam;
  memset(&copyParam, 0, sizeof(copyParam));
  copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
  copyParam.dstArray = cu_array;
  copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
  copyParam.srcHost = h_data;
  copyParam.srcPitch = width * sizeof(float);
  copyParam.WidthInBytes = copyParam.srcPitch;
  copyParam.Height = height;
  checkCudaErrors(cuMemcpy2D(&copyParam));
    // allocate array and copy image data
    CUarray               cu_array;
    CUDA_ARRAY_DESCRIPTOR desc;
    desc.Format      = CU_AD_FORMAT_FLOAT;
    desc.NumChannels = 1;
    desc.Width       = width;
    desc.Height      = height;
    checkCudaErrors(cuArrayCreate(&cu_array, &desc));
    CUDA_MEMCPY2D copyParam;
    memset(&copyParam, 0, sizeof(copyParam));
    copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
    copyParam.dstArray      = cu_array;
    copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
    copyParam.srcHost       = h_data;
    copyParam.srcPitch      = width * sizeof(float);
    copyParam.WidthInBytes  = copyParam.srcPitch;
    copyParam.Height        = height;
    checkCudaErrors(cuMemcpy2D(&copyParam));

  // set texture parameters
  CUtexObject TexObject;
  CUDA_RESOURCE_DESC ResDesc;
  memset(&ResDesc, 0, sizeof(CUDA_RESOURCE_DESC));
  ResDesc.resType = CU_RESOURCE_TYPE_ARRAY;
  ResDesc.res.array.hArray = cu_array;
    // set texture parameters
    CUtexObject        TexObject;
    CUDA_RESOURCE_DESC ResDesc;
    memset(&ResDesc, 0, sizeof(CUDA_RESOURCE_DESC));
    ResDesc.resType          = CU_RESOURCE_TYPE_ARRAY;
    ResDesc.res.array.hArray = cu_array;

  CUDA_TEXTURE_DESC TexDesc;
  memset(&TexDesc, 0, sizeof(CUDA_TEXTURE_DESC));
  TexDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP;
  TexDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP;
  TexDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP;
  TexDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
  TexDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
    CUDA_TEXTURE_DESC TexDesc;
    memset(&TexDesc, 0, sizeof(CUDA_TEXTURE_DESC));
    TexDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP;
    TexDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP;
    TexDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP;
    TexDesc.filterMode     = CU_TR_FILTER_MODE_LINEAR;
    TexDesc.flags          = CU_TRSF_NORMALIZED_COORDINATES;

  checkCudaErrors(cuTexObjectCreate(&TexObject, &ResDesc, &TexDesc, NULL));
    checkCudaErrors(cuTexObjectCreate(&TexObject, &ResDesc, &TexDesc, NULL));

  // There are two ways to launch CUDA kernels via the Driver API.
  // In this CUDA Sample, we illustrate both ways to pass parameters
  // and specify parameters.  By default we use the simpler method.
  int block_size = 8;
  StopWatchInterface *timer = NULL;
    // There are two ways to launch CUDA kernels via the Driver API.
    // In this CUDA Sample, we illustrate both ways to pass parameters
    // and specify parameters.  By default we use the simpler method.
    int                 block_size = 8;
    StopWatchInterface *timer      = NULL;

  if (1) {
    // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
    // Launching (simpler method)
    void *args[5] = {&d_data, &width, &height, &angle, &TexObject};
    if (1) {
        // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
        // Launching (simpler method)
        void *args[5] = {&d_data, &width, &height, &angle, &TexObject};

        checkCudaErrors(cuLaunchKernel(
            transform, (width / block_size), (height / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
        checkCudaErrors(cuCtxSynchronize());
        sdkCreateTimer(&timer);
        sdkStartTimer(&timer);

        // launch kernel again for performance measurement
        checkCudaErrors(cuLaunchKernel(
            transform, (width / block_size), (height / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
    }
    else {
        // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
        // Launching (advanced method)
        int  offset = 0;
        char argBuffer[256];

        // pass in launch parameters (not actually de-referencing CUdeviceptr).
        // CUdeviceptr is
        // storing the value of the parameters
        *((CUdeviceptr *)&argBuffer[offset]) = d_data;
        offset += sizeof(d_data);
        *((unsigned int *)&argBuffer[offset]) = width;
        offset += sizeof(width);
        *((unsigned int *)&argBuffer[offset]) = height;
        offset += sizeof(height);
        *((float *)&argBuffer[offset]) = angle;
        offset += sizeof(angle);
        *((CUtexObject *)&argBuffer[offset]) = TexObject;
        offset += sizeof(TexObject);

        void *kernel_launch_config[5] = {
            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END};

        // new CUDA 4.0 Driver API Kernel launch call (warmup)
        checkCudaErrors(cuLaunchKernel(transform,
                                       (width / block_size),
                                       (height / block_size),
                                       1,
                                       block_size,
                                       block_size,
                                       1,
                                       0,
                                       NULL,
                                       NULL,
                                       (void **)&kernel_launch_config));
        checkCudaErrors(cuCtxSynchronize());
        sdkCreateTimer(&timer);
        sdkStartTimer(&timer);

        // launch kernel again for performance measurement
        checkCudaErrors(cuLaunchKernel(transform,
                                       (width / block_size),
                                       (height / block_size),
                                       1,
                                       block_size,
                                       block_size,
                                       1,
                                       0,
                                       0,
                                       NULL,
                                       (void **)&kernel_launch_config));
    }

    checkCudaErrors(cuLaunchKernel(transform, (width / block_size),
                                   (height / block_size), 1, block_size,
                                   block_size, 1, 0, NULL, args, NULL));
    checkCudaErrors(cuCtxSynchronize());
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    printf("%.2f Mpixels/sec\n", (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
    sdkDeleteTimer(&timer);

    // launch kernel again for performance measurement
    checkCudaErrors(cuLaunchKernel(transform, (width / block_size),
                                   (height / block_size), 1, block_size,
                                   block_size, 1, 0, NULL, args, NULL));
  } else {
    // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
    // Launching (advanced method)
    int offset = 0;
    char argBuffer[256];
    // allocate mem for the result on host side
    float *h_odata = (float *)malloc(size);
    // copy result from device to host
    checkCudaErrors(cuMemcpyDtoH(h_odata, d_data, size));

    // pass in launch parameters (not actually de-referencing CUdeviceptr).
    // CUdeviceptr is
    // storing the value of the parameters
    *((CUdeviceptr *)&argBuffer[offset]) = d_data;
    offset += sizeof(d_data);
    *((unsigned int *)&argBuffer[offset]) = width;
    offset += sizeof(width);
    *((unsigned int *)&argBuffer[offset]) = height;
    offset += sizeof(height);
    *((float *)&argBuffer[offset]) = angle;
    offset += sizeof(angle);
    *((CUtexObject *)&argBuffer[offset]) = TexObject;
    offset += sizeof(TexObject);
    // write result to file
    char output_filename[1024];
    strcpy(output_filename, image_path);
    strcpy(output_filename + strlen(image_path) - 4, "_out.pgm");
    sdkSavePGM(output_filename, h_odata, width, height);
    printf("Wrote '%s'\n", output_filename);

    void *kernel_launch_config[5] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
                                     CU_LAUNCH_PARAM_BUFFER_SIZE, &offset,
                                     CU_LAUNCH_PARAM_END};
    // write regression file if necessary
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
        // write file for regression test
        sdkWriteFile<float>("./data/regression.dat", h_odata, width * height, 0.0f, false);
    }
    else {
        // We need to reload the data from disk, because it is inverted upon output
        sdkLoadPGM(output_filename, &h_odata, &width, &height);

    // new CUDA 4.0 Driver API Kernel launch call (warmup)
    checkCudaErrors(cuLaunchKernel(
        transform, (width / block_size), (height / block_size), 1, block_size,
        block_size, 1, 0, NULL, NULL, (void **)&kernel_launch_config));
    checkCudaErrors(cuCtxSynchronize());
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
        printf("Comparing files\n");
        printf("\toutput:    <%s>\n", output_filename);
        printf("\treference: <%s>\n", ref_path);
        bTestResults = compareData(h_odata, h_data_ref, width * height, MIN_EPSILON_ERROR, 0.15f);
    }

    // launch kernel again for performance measurement
    checkCudaErrors(cuLaunchKernel(
        transform, (width / block_size), (height / block_size), 1, block_size,
        block_size, 1, 0, 0, NULL, (void **)&kernel_launch_config));
  }
    // cleanup memory
    checkCudaErrors(cuTexObjectDestroy(TexObject));
    checkCudaErrors(cuMemFree(d_data));
    checkCudaErrors(cuArrayDestroy(cu_array));

  checkCudaErrors(cuCtxSynchronize());
  sdkStopTimer(&timer);
  printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
  printf("%.2f Mpixels/sec\n",
         (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
  sdkDeleteTimer(&timer);
    free(image_path);
    free(ref_path);

  // allocate mem for the result on host side
  float *h_odata = (float *)malloc(size);
  // copy result from device to host
  checkCudaErrors(cuMemcpyDtoH(h_odata, d_data, size));
    checkCudaErrors(cuCtxDestroy(cuContext));

  // write result to file
  char output_filename[1024];
  strcpy(output_filename, image_path);
  strcpy(output_filename + strlen(image_path) - 4, "_out.pgm");
  sdkSavePGM(output_filename, h_odata, width, height);
  printf("Wrote '%s'\n", output_filename);

  // write regression file if necessary
  if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
    // write file for regression test
    sdkWriteFile<float>("./data/regression.dat", h_odata, width * height, 0.0f,
                        false);
  } else {
    // We need to reload the data from disk, because it is inverted upon output
    sdkLoadPGM(output_filename, &h_odata, &width, &height);

    printf("Comparing files\n");
    printf("\toutput:    <%s>\n", output_filename);
    printf("\treference: <%s>\n", ref_path);
    bTestResults = compareData(h_odata, h_data_ref, width * height,
                               MIN_EPSILON_ERROR, 0.15f);
  }

  // cleanup memory
  checkCudaErrors(cuTexObjectDestroy(TexObject));
  checkCudaErrors(cuMemFree(d_data));
  checkCudaErrors(cuArrayDestroy(cu_array));

  free(image_path);
  free(ref_path);

  checkCudaErrors(cuCtxDestroy(cuContext));

  exit(bTestResults ? EXIT_SUCCESS : EXIT_FAILURE);
    exit(bTestResults ? EXIT_SUCCESS : EXIT_FAILURE);
}
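// Note on the two launch paths above: the simpler method passes each kernel
// argument's address through cuLaunchKernel's kernelParams array and leaves the
// trailing "extra" argument NULL; the advanced method packs the argument values
// into argBuffer by hand and hands it over via CU_LAUNCH_PARAM_BUFFER_POINTER /
// CU_LAUNCH_PARAM_BUFFER_SIZE in "extra", in which case kernelParams must be
// NULL.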

////////////////////////////////////////////////////////////////////////////////
@@ -293,45 +307,44 @@ void runTest(int argc, char **argv) {
//! kernel function.  After the module is loaded, cuModuleGetFunction
//! retrieves the CUDA function pointer "cuFunction"
////////////////////////////////////////////////////////////////////////////////
static CUresult initCUDA(int argc, char **argv, CUfunction *transform) {
  CUfunction cuFunction = 0;
  int major = 0, minor = 0, devID = 0;
  char deviceName[100];
  string module_path;
static CUresult initCUDA(int argc, char **argv, CUfunction *transform)
{
    CUfunction cuFunction = 0;
    int        major = 0, minor = 0, devID = 0;
    char       deviceName[100];
    string     module_path;

  cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
    cuDevice = findCudaDeviceDRV(argc, (const char **)argv);

  // get compute capabilities and the devicename
  checkCudaErrors(cuDeviceGetAttribute(
      &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
  checkCudaErrors(cuDeviceGetAttribute(
      &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
  checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
  printf("> GPU Device has SM %d.%d compute capability\n", major, minor);
    // get compute capabilities and the devicename
    checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
    checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
    checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
    printf("> GPU Device has SM %d.%d compute capability\n", major, minor);

  checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));
    checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));

  // first search for the module_path before we try to load the results
  std::ostringstream fatbin;
    // first search for the module_path before we try to load the results
    std::ostringstream fatbin;

  if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
    exit(EXIT_FAILURE);
  } else {
    printf("> initCUDA loading module: <%s>\n", module_path.c_str());
  }
    if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
        exit(EXIT_FAILURE);
    }
    else {
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }

  if (!fatbin.str().size()) {
    printf("fatbin file empty. exiting..\n");
    exit(EXIT_FAILURE);
  }
    if (!fatbin.str().size()) {
        printf("fatbin file empty. exiting..\n");
        exit(EXIT_FAILURE);
    }

  // Create module from binary file (FATBIN)
  checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));
    // Create module from binary file (FATBIN)
    checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

  checkCudaErrors(
      cuModuleGetFunction(&cuFunction, cuModule, "transformKernel"));
    checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, "transformKernel"));

  *transform = cuFunction;
    *transform = cuFunction;

  return CUDA_SUCCESS;
    return CUDA_SUCCESS;
}
 | 
			
		||||
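For context: once cuModuleGetFunction has returned the CUfunction handle, the host launches it through cuLaunchKernel, passing the kernel arguments as an array of pointers. A minimal sketch under that assumption (the grid/block sizes and variable names here are illustrative, not taken from the sample):

    void *args[] = {&d_data, &width, &height, &angle, &TexObject};
    checkCudaErrors(cuLaunchKernel(cuFunction,
                                   width / 8, height / 8, 1, // grid dim
                                   8, 8, 1,                  // block dim
                                   0, NULL,                  // shared mem bytes, stream
                                   args, NULL));             // kernel params, extra
    checkCudaErrors(cuCtxSynchronize());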
@ -33,23 +33,22 @@
//! Transform an image using texture lookups
//! @param g_odata  output data in global memory
////////////////////////////////////////////////////////////////////////////////
extern "C" __global__ void transformKernel(float *g_odata, int width,
                                           int height, float theta,
                                           CUtexObject tex) {
  // calculate normalized texture coordinates
  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
extern "C" __global__ void transformKernel(float *g_odata, int width, int height, float theta, CUtexObject tex)
{
    // calculate normalized texture coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

  float u = (float)x - (float)width / 2;
  float v = (float)y - (float)height / 2;
  float tu = u * cosf(theta) - v * sinf(theta);
  float tv = v * cosf(theta) + u * sinf(theta);
    float u  = (float)x - (float)width / 2;
    float v  = (float)y - (float)height / 2;
    float tu = u * cosf(theta) - v * sinf(theta);
    float tv = v * cosf(theta) + u * sinf(theta);

  tu /= (float)width;
  tv /= (float)height;
    tu /= (float)width;
    tv /= (float)height;

  // read from texture and write to global memory
  g_odata[y * width + x] = tex2D<float>(tex, tu + 0.5f, tv + 0.5f);
    // read from texture and write to global memory
    g_odata[y * width + x] = tex2D<float>(tex, tu + 0.5f, tv + 0.5f);
}

#endif  // #ifndef _SIMPLETEXTURE_KERNEL_H_
#endif // #ifndef _SIMPLETEXTURE_KERNEL_H_
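The kernel above samples through a CUtexObject with normalized coordinates. A sketch of how such an object can be built over a CUarray with the driver API (the descriptor settings below are plausible defaults, not necessarily the exact ones this sample uses):

    CUDA_RESOURCE_DESC resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType          = CU_RESOURCE_TYPE_ARRAY;
    resDesc.res.array.hArray = cu_array; // CUarray holding the image data

    CUDA_TEXTURE_DESC texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP;
    texDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP;
    texDesc.filterMode     = CU_TR_FILTER_MODE_LINEAR;
    texDesc.flags          = CU_TRSF_NORMALIZED_COORDINATES;

    CUtexObject TexObject;
    checkCudaErrors(cuTexObjectCreate(&TexObject, &resDesc, &texDesc, NULL));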
@ -53,257 +53,237 @@ static const char *sSDKsample = "[simpleVoteIntrinsics]\0";
#include "simpleVote_kernel.cuh"

// Generate the test pattern for Tests 1 and 2
void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size) {
  // For testing VOTE.Any (all of these threads will return 0)
  for (int i = 0; i < size / 4; i++) {
    VOTE_PATTERN[i] = 0x00000000;
  }

  // For testing VOTE.Any (1/2 these threads will return 1)
  for (int i = 2 * size / 8; i < 4 * size / 8; i++) {
    VOTE_PATTERN[i] = (i & 0x01) ? i : 0;
  }

  // For testing VOTE.all (1/2 of these threads will return 0)
  for (int i = 2 * size / 4; i < 3 * size / 4; i++) {
    VOTE_PATTERN[i] = (i & 0x01) ? 0 : i;
  }

  // For testing VOTE.all (all of these threads will return 1)
  for (int i = 3 * size / 4; i < 4 * size / 4; i++) {
    VOTE_PATTERN[i] = 0xffffffff;
  }
}

int checkErrors1(unsigned int *h_result, int start, int end, int warp_size,
                 const char *voteType) {
  int i, sum = 0;

  for (sum = 0, i = start; i < end; i++) {
    sum += h_result[i];
  }

  if (sum > 0) {
    printf("\t<%s>[%d - %d] = ", voteType, start, end - 1);

    for (i = start; i < end; i++) {
      printf("%d", h_result[i]);
void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)
{
    // For testing VOTE.Any (all of these threads will return 0)
    for (int i = 0; i < size / 4; i++) {
        VOTE_PATTERN[i] = 0x00000000;
    }

    printf("%d values FAILED\n", sum);
  }

  return (sum > 0);
}

int checkErrors2(unsigned int *h_result, int start, int end, int warp_size,
                 const char *voteType) {
  int i, sum = 0;

  for (sum = 0, i = start; i < end; i++) {
    sum += h_result[i];
  }

  if (sum != warp_size) {
    printf("\t<%s>[%d - %d] = ", voteType, start, end - 1);

    for (i = start; i < end; i++) {
      printf("%d", h_result[i]);
    // For testing VOTE.Any (1/2 these threads will return 1)
    for (int i = 2 * size / 8; i < 4 * size / 8; i++) {
        VOTE_PATTERN[i] = (i & 0x01) ? i : 0;
    }

    printf(" - FAILED\n");
  }
    // For testing VOTE.all (1/2 of these threads will return 0)
    for (int i = 2 * size / 4; i < 3 * size / 4; i++) {
        VOTE_PATTERN[i] = (i & 0x01) ? 0 : i;
    }

  return (sum != warp_size);
    // For testing VOTE.all (all of these threads will return 1)
    for (int i = 3 * size / 4; i < 4 * size / 4; i++) {
        VOTE_PATTERN[i] = 0xffffffff;
    }
}

int checkErrors1(unsigned int *h_result, int start, int end, int warp_size, const char *voteType)
{
    int i, sum = 0;

    for (sum = 0, i = start; i < end; i++) {
        sum += h_result[i];
    }

    if (sum > 0) {
        printf("\t<%s>[%d - %d] = ", voteType, start, end - 1);

        for (i = start; i < end; i++) {
            printf("%d", h_result[i]);
        }

        printf("%d values FAILED\n", sum);
    }

    return (sum > 0);
}

int checkErrors2(unsigned int *h_result, int start, int end, int warp_size, const char *voteType)
{
    int i, sum = 0;

    for (sum = 0, i = start; i < end; i++) {
        sum += h_result[i];
    }

    if (sum != warp_size) {
        printf("\t<%s>[%d - %d] = ", voteType, start, end - 1);

        for (i = start; i < end; i++) {
            printf("%d", h_result[i]);
        }

        printf(" - FAILED\n");
    }

    return (sum != warp_size);
}

// Verification code for Kernel #1
int checkResultsVoteAnyKernel1(unsigned int *h_result, int size,
                               int warp_size) {
  int error_count = 0;
int checkResultsVoteAnyKernel1(unsigned int *h_result, int size, int warp_size)
{
    int error_count = 0;

  error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4,
                              warp_size, "Vote.Any");
  error_count +=
      checkErrors2(h_result, VOTE_DATA_GROUP * warp_size / 4,
                   2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
  error_count +=
      checkErrors2(h_result, 2 * VOTE_DATA_GROUP * warp_size / 4,
                   3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
  error_count +=
      checkErrors2(h_result, 3 * VOTE_DATA_GROUP * warp_size / 4,
                   4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
    error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
    error_count += checkErrors2(
        h_result, VOTE_DATA_GROUP * warp_size / 4, 2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
    error_count += checkErrors2(
        h_result, 2 * VOTE_DATA_GROUP * warp_size / 4, 3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
    error_count += checkErrors2(
        h_result, 3 * VOTE_DATA_GROUP * warp_size / 4, 4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");

  printf((error_count == 0) ? "\tOK\n" : "\tERROR\n");
  return error_count;
    printf((error_count == 0) ? "\tOK\n" : "\tERROR\n");
    return error_count;
}

// Verification code for Kernel #2
int checkResultsVoteAllKernel2(unsigned int *h_result, int size,
                               int warp_size) {
  int error_count = 0;
int checkResultsVoteAllKernel2(unsigned int *h_result, int size, int warp_size)
{
    int error_count = 0;

  error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4,
                              warp_size, "Vote.All");
  error_count +=
      checkErrors1(h_result, VOTE_DATA_GROUP * warp_size / 4,
                   2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
  error_count +=
      checkErrors1(h_result, 2 * VOTE_DATA_GROUP * warp_size / 4,
                   3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
  error_count +=
      checkErrors2(h_result, 3 * VOTE_DATA_GROUP * warp_size / 4,
                   4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
    error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
    error_count += checkErrors1(
        h_result, VOTE_DATA_GROUP * warp_size / 4, 2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
    error_count += checkErrors1(
        h_result, 2 * VOTE_DATA_GROUP * warp_size / 4, 3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
    error_count += checkErrors2(
        h_result, 3 * VOTE_DATA_GROUP * warp_size / 4, 4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");

  printf((error_count == 0) ? "\tOK\n" : "\tERROR\n");
  return error_count;
    printf((error_count == 0) ? "\tOK\n" : "\tERROR\n");
    return error_count;
}

// Verification code for Kernel #3
int checkResultsVoteAnyKernel3(bool *hinfo, int size) {
  int i, error_count = 0;
int checkResultsVoteAnyKernel3(bool *hinfo, int size)
{
    int i, error_count = 0;

  for (i = 0; i < size * 3; i++) {
    switch (i % 3) {
      case 0:
    for (i = 0; i < size * 3; i++) {
        switch (i % 3) {
        case 0:

        // First warp should be all zeros.
        if (hinfo[i] != (i >= size * 1)) {
          error_count++;
            // First warp should be all zeros.
            if (hinfo[i] != (i >= size * 1)) {
                error_count++;
            }

            break;

        case 1:

            // First warp and half of second should be all zeros.
            if (hinfo[i] != (i >= size * 3 / 2)) {
                error_count++;
            }

            break;

        case 2:

            // First two warps should be all zeros.
            if (hinfo[i] != (i >= size * 2)) {
                error_count++;
            }

            break;
        }

        break;

      case 1:

        // First warp and half of second should be all zeros.
        if (hinfo[i] != (i >= size * 3 / 2)) {
          error_count++;
        }

        break;

      case 2:

        // First two warps should be all zeros.
        if (hinfo[i] != (i >= size * 2)) {
          error_count++;
        }

        break;
    }
  }

  printf((error_count == 0) ? "\tOK\n" : "\tERROR\n");
  return error_count;
    printf((error_count == 0) ? "\tOK\n" : "\tERROR\n");
    return error_count;
}

int main(int argc, char **argv) {
  unsigned int *h_input, *h_result;
  unsigned int *d_input, *d_result;
int main(int argc, char **argv)
{
    unsigned int *h_input, *h_result;
    unsigned int *d_input, *d_result;

  bool *dinfo = NULL, *hinfo = NULL;
  int error_count[3] = {0, 0, 0};
    bool *dinfo = NULL, *hinfo = NULL;
    int   error_count[3] = {0, 0, 0};

  cudaDeviceProp deviceProp;
  int devID, warp_size = 32;
    cudaDeviceProp deviceProp;
    int            devID, warp_size = 32;

  printf("%s\n", sSDKsample);
    printf("%s\n", sSDKsample);

  // This will pick the best possible CUDA capable device
  devID = findCudaDevice(argc, (const char **)argv);
    // This will pick the best possible CUDA capable device
    devID = findCudaDevice(argc, (const char **)argv);

  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

  // Statistics about the GPU device
  printf(
      "> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
      deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);
    // Statistics about the GPU device
    printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
           deviceProp.multiProcessorCount,
           deviceProp.major,
           deviceProp.minor);

  h_input = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size *
                                   sizeof(unsigned int));
  h_result = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size *
                                    sizeof(unsigned int));
  checkCudaErrors(
      cudaMalloc(reinterpret_cast<void **>(&d_input),
                 VOTE_DATA_GROUP * warp_size * sizeof(unsigned int)));
  checkCudaErrors(
      cudaMalloc(reinterpret_cast<void **>(&d_result),
                 VOTE_DATA_GROUP * warp_size * sizeof(unsigned int)));
  genVoteTestPattern(h_input, VOTE_DATA_GROUP * warp_size);
  checkCudaErrors(cudaMemcpy(d_input, h_input,
                             VOTE_DATA_GROUP * warp_size * sizeof(unsigned int),
                             cudaMemcpyHostToDevice));
    h_input  = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size * sizeof(unsigned int));
    h_result = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size * sizeof(unsigned int));
    checkCudaErrors(
        cudaMalloc(reinterpret_cast<void **>(&d_input), VOTE_DATA_GROUP * warp_size * sizeof(unsigned int)));
    checkCudaErrors(
        cudaMalloc(reinterpret_cast<void **>(&d_result), VOTE_DATA_GROUP * warp_size * sizeof(unsigned int)));
    genVoteTestPattern(h_input, VOTE_DATA_GROUP * warp_size);
    checkCudaErrors(
        cudaMemcpy(d_input, h_input, VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), cudaMemcpyHostToDevice));

  // Start of Vote Any Test Kernel #1
  printf("[VOTE Kernel Test 1/3]\n");
  printf("\tRunning <<Vote.Any>> kernel1 ...\n");
  {
    checkCudaErrors(cudaDeviceSynchronize());
    dim3 gridBlock(1, 1);
    dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1);
    VoteAnyKernel1<<<gridBlock, threadBlock>>>(d_input, d_result,
                                               VOTE_DATA_GROUP * warp_size);
    getLastCudaError("VoteAnyKernel() execution failed\n");
    checkCudaErrors(cudaDeviceSynchronize());
  }
  checkCudaErrors(cudaMemcpy(h_result, d_result,
                             VOTE_DATA_GROUP * warp_size * sizeof(unsigned int),
                             cudaMemcpyDeviceToHost));
  error_count[0] += checkResultsVoteAnyKernel1(
      h_result, VOTE_DATA_GROUP * warp_size, warp_size);
    // Start of Vote Any Test Kernel #1
    printf("[VOTE Kernel Test 1/3]\n");
    printf("\tRunning <<Vote.Any>> kernel1 ...\n");
    {
        checkCudaErrors(cudaDeviceSynchronize());
        dim3 gridBlock(1, 1);
        dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1);
        VoteAnyKernel1<<<gridBlock, threadBlock>>>(d_input, d_result, VOTE_DATA_GROUP * warp_size);
        getLastCudaError("VoteAnyKernel() execution failed\n");
        checkCudaErrors(cudaDeviceSynchronize());
    }
    checkCudaErrors(
        cudaMemcpy(h_result, d_result, VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), cudaMemcpyDeviceToHost));
    error_count[0] += checkResultsVoteAnyKernel1(h_result, VOTE_DATA_GROUP * warp_size, warp_size);

  // Start of Vote All Test Kernel #2
  printf("\n[VOTE Kernel Test 2/3]\n");
  printf("\tRunning <<Vote.All>> kernel2 ...\n");
  {
    checkCudaErrors(cudaDeviceSynchronize());
    dim3 gridBlock(1, 1);
    dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1);
    VoteAllKernel2<<<gridBlock, threadBlock>>>(d_input, d_result,
                                               VOTE_DATA_GROUP * warp_size);
    getLastCudaError("VoteAllKernel() execution failed\n");
    checkCudaErrors(cudaDeviceSynchronize());
  }
  checkCudaErrors(cudaMemcpy(h_result, d_result,
                             VOTE_DATA_GROUP * warp_size * sizeof(unsigned int),
                             cudaMemcpyDeviceToHost));
  error_count[1] += checkResultsVoteAllKernel2(
      h_result, VOTE_DATA_GROUP * warp_size, warp_size);
    // Start of Vote All Test Kernel #2
    printf("\n[VOTE Kernel Test 2/3]\n");
    printf("\tRunning <<Vote.All>> kernel2 ...\n");
    {
        checkCudaErrors(cudaDeviceSynchronize());
        dim3 gridBlock(1, 1);
        dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1);
        VoteAllKernel2<<<gridBlock, threadBlock>>>(d_input, d_result, VOTE_DATA_GROUP * warp_size);
        getLastCudaError("VoteAllKernel() execution failed\n");
        checkCudaErrors(cudaDeviceSynchronize());
    }
    checkCudaErrors(
        cudaMemcpy(h_result, d_result, VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), cudaMemcpyDeviceToHost));
    error_count[1] += checkResultsVoteAllKernel2(h_result, VOTE_DATA_GROUP * warp_size, warp_size);

  // Second Vote Kernel Test #3 (both Any/All)
  hinfo = reinterpret_cast<bool *>(calloc(warp_size * 3 * 3, sizeof(bool)));
  cudaMalloc(reinterpret_cast<void **>(&dinfo),
             warp_size * 3 * 3 * sizeof(bool));
  cudaMemcpy(dinfo, hinfo, warp_size * 3 * 3 * sizeof(bool),
             cudaMemcpyHostToDevice);
    // Second Vote Kernel Test #3 (both Any/All)
    hinfo = reinterpret_cast<bool *>(calloc(warp_size * 3 * 3, sizeof(bool)));
    cudaMalloc(reinterpret_cast<void **>(&dinfo), warp_size * 3 * 3 * sizeof(bool));
    cudaMemcpy(dinfo, hinfo, warp_size * 3 * 3 * sizeof(bool), cudaMemcpyHostToDevice);

  printf("\n[VOTE Kernel Test 3/3]\n");
  printf("\tRunning <<Vote.Any>> kernel3 ...\n");
  {
    checkCudaErrors(cudaDeviceSynchronize());
    VoteAnyKernel3<<<1, warp_size * 3>>>(dinfo, warp_size);
    checkCudaErrors(cudaDeviceSynchronize());
  }
    printf("\n[VOTE Kernel Test 3/3]\n");
    printf("\tRunning <<Vote.Any>> kernel3 ...\n");
    {
        checkCudaErrors(cudaDeviceSynchronize());
        VoteAnyKernel3<<<1, warp_size * 3>>>(dinfo, warp_size);
        checkCudaErrors(cudaDeviceSynchronize());
    }

  cudaMemcpy(hinfo, dinfo, warp_size * 3 * 3 * sizeof(bool),
             cudaMemcpyDeviceToHost);
    cudaMemcpy(hinfo, dinfo, warp_size * 3 * 3 * sizeof(bool), cudaMemcpyDeviceToHost);

  error_count[2] = checkResultsVoteAnyKernel3(hinfo, warp_size * 3);
    error_count[2] = checkResultsVoteAnyKernel3(hinfo, warp_size * 3);

  // Now free these resources for Test #1,2
  checkCudaErrors(cudaFree(d_input));
  checkCudaErrors(cudaFree(d_result));
  free(h_input);
  free(h_result);
    // Now free these resources for Test #1,2
    checkCudaErrors(cudaFree(d_input));
    checkCudaErrors(cudaFree(d_result));
    free(h_input);
    free(h_result);

  // Free resources from Test #3
  free(hinfo);
  cudaFree(dinfo);
    // Free resources from Test #3
    free(hinfo);
    cudaFree(dinfo);

  printf("\tShutting down...\n");
    printf("\tShutting down...\n");

  return (error_count[0] == 0 && error_count[1] == 0 && error_count[2] == 0)
             ? EXIT_SUCCESS
             : EXIT_FAILURE;
    return (error_count[0] == 0 && error_count[1] == 0 && error_count[2] == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
}
@ -38,43 +38,44 @@
// If ANY one of the threads (within the warp) of the predicated condition
// returns a non-zero value, then all threads within this warp will return a
// non-zero value
__global__ void VoteAnyKernel1(unsigned int *input, unsigned int *result,
                               int size) {
  int tx = threadIdx.x;
__global__ void VoteAnyKernel1(unsigned int *input, unsigned int *result, int size)
{
    int tx = threadIdx.x;

  int mask = 0xffffffff;
  result[tx] = __any_sync(mask, input[tx]);
    int mask   = 0xffffffff;
    result[tx] = __any_sync(mask, input[tx]);
}

// Kernel #2 tests the across-the-warp vote(all) intrinsic.
// If ALL of the threads (within the warp) of the predicated condition returns
// a non-zero value, then all threads within this warp will return a non-zero
// value
__global__ void VoteAllKernel2(unsigned int *input, unsigned int *result,
                               int size) {
  int tx = threadIdx.x;
__global__ void VoteAllKernel2(unsigned int *input, unsigned int *result, int size)
{
    int tx = threadIdx.x;

  int mask = 0xffffffff;
  result[tx] = __all_sync(mask, input[tx]);
    int mask   = 0xffffffff;
    result[tx] = __all_sync(mask, input[tx]);
}

// Kernel #3 is a directed test for the across-the-warp vote(all) intrinsic.
// This kernel will test for conditions across warps, and within half warps
__global__ void VoteAnyKernel3(bool *info, int warp_size) {
  int tx = threadIdx.x;
  unsigned int mask = 0xffffffff;
  bool *offs = info + (tx * 3);
__global__ void VoteAnyKernel3(bool *info, int warp_size)
{
    int          tx   = threadIdx.x;
    unsigned int mask = 0xffffffff;
    bool        *offs = info + (tx * 3);

  // The following should hold true for the second and third warp
  *offs = __any_sync(mask, (tx >= (warp_size * 3) / 2));
  // The following should hold true for the "upper half" of the second warp,
  // and all of the third warp
  *(offs + 1) = (tx >= (warp_size * 3) / 2 ? true : false);
    // The following should hold true for the second and third warp
    *offs = __any_sync(mask, (tx >= (warp_size * 3) / 2));
    // The following should hold true for the "upper half" of the second warp,
    // and all of the third warp
    *(offs + 1) = (tx >= (warp_size * 3) / 2 ? true : false);

  // The following should hold true for the third warp only
  if (__all_sync(mask, (tx >= (warp_size * 3) / 2))) {
    *(offs + 2) = true;
  }
    // The following should hold true for the third warp only
    if (__all_sync(mask, (tx >= (warp_size * 3) / 2))) {
        *(offs + 2) = true;
    }
}

#endif
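Taken together, these kernels exercise the warp-vote rule: with a full mask, __any_sync is non-zero when at least one participating lane's predicate is non-zero, while __all_sync is non-zero only when every lane's is. A self-contained sketch (assuming a single 32-thread warp; not part of the sample):

    __global__ void voteDemo(int *out)
    {
        unsigned int mask = 0xffffffff;
        int anyOdd = __any_sync(mask, threadIdx.x & 1); // 1: the odd lanes vote yes
        int allOdd = __all_sync(mask, threadIdx.x & 1); // 0: the even lanes vote no
        if (threadIdx.x == 0) {
            out[0] = anyOdd;
            out[1] = allOdd;
        }
    }

Launched as voteDemo<<<1, 32>>>(d_out), this writes {1, 0}.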
@ -41,12 +41,13 @@
#endif

/* Add two vectors on the GPU */
__global__ void vectorAddGPU(float *a, float *b, float *c, int N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
__global__ void vectorAddGPU(float *a, float *b, float *c, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

  if (idx < N) {
    c[idx] = a[idx] + b[idx];
  }
    if (idx < N) {
        c[idx] = a[idx] + b[idx];
    }
}

// Allocate generic memory with malloc() and pin it later instead of using
@ -54,194 +55,196 @@ __global__ void vectorAddGPU(float *a, float *b, float *c, int N) {
bool bPinGenericMemory = false;

// Macro to align up to the memory size in question
#define MEMORY_ALIGNMENT 4096
#define MEMORY_ALIGNMENT  4096
#define ALIGN_UP(x, size) (((size_t)x + (size - 1)) & (~(size - 1)))
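ALIGN_UP rounds a pointer up to the next multiple of size (a power of two), which is why the generic-memory path below over-allocates each buffer by MEMORY_ALIGNMENT bytes. A worked example of the arithmetic:

    // (0x1001 + 0xFFF) & ~0xFFF == 0x2000  -> unaligned pointers round up
    // (0x2000 + 0xFFF) & ~0xFFF == 0x2000  -> already-aligned pointers are unchanged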

int main(int argc, char **argv) {
  int n, nelem, deviceCount;
  int idev = 0;  // use default device 0
  char *device = NULL;
  unsigned int flags;
  size_t bytes;
  float *a, *b, *c;           // Pinned memory allocated on the CPU
  float *a_UA, *b_UA, *c_UA;  // Non-4K Aligned Pinned memory on the CPU
  float *d_a, *d_b, *d_c;     // Device pointers for mapped memory
  float errorNorm, refNorm, ref, diff;
  cudaDeviceProp deviceProp;
int main(int argc, char **argv)
{
    int            n, nelem, deviceCount;
    int            idev   = 0; // use default device 0
    char          *device = NULL;
    unsigned int   flags;
    size_t         bytes;
    float         *a, *b, *c;          // Pinned memory allocated on the CPU
    float         *a_UA, *b_UA, *c_UA; // Non-4K Aligned Pinned memory on the CPU
    float         *d_a, *d_b, *d_c;    // Device pointers for mapped memory
    float          errorNorm, refNorm, ref, diff;
    cudaDeviceProp deviceProp;

  if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
    printf("Usage:  simpleZeroCopy [OPTION]\n\n");
    printf("Options:\n");
    printf("  --device=[device #]  Specify the device to be used\n");
    printf(
        "  --use_generic_memory (optional) use generic page-aligned for system "
        "memory\n");
    return EXIT_SUCCESS;
  }

  /* Get the device selected by the user or default to 0, and then set it. */
  if (getCmdLineArgumentString(argc, (const char **)argv, "device", &device)) {
    cudaGetDeviceCount(&deviceCount);
    idev = atoi(device);

    if (idev >= deviceCount || idev < 0) {
      fprintf(stderr,
              "Device number %d is invalid, will use default CUDA device 0.\n",
              idev);
      idev = 0;
    if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
        printf("Usage:  simpleZeroCopy [OPTION]\n\n");
        printf("Options:\n");
        printf("  --device=[device #]  Specify the device to be used\n");
        printf("  --use_generic_memory (optional) use generic page-aligned for system "
               "memory\n");
        return EXIT_SUCCESS;
    }
  }

  // if GPU found supports SM 1.2, then continue, otherwise we exit
  if (!checkCudaCapabilities(1, 2)) {
    exit(EXIT_SUCCESS);
  }
    /* Get the device selected by the user or default to 0, and then set it. */
    if (getCmdLineArgumentString(argc, (const char **)argv, "device", &device)) {
        cudaGetDeviceCount(&deviceCount);
        idev = atoi(device);

  if (checkCmdLineFlag(argc, (const char **)argv, "use_generic_memory")) {
        if (idev >= deviceCount || idev < 0) {
            fprintf(stderr, "Device number %d is invalid, will use default CUDA device 0.\n", idev);
            idev = 0;
        }
    }

    // if GPU found supports SM 1.2, then continue, otherwise we exit
    if (!checkCudaCapabilities(1, 2)) {
        exit(EXIT_SUCCESS);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "use_generic_memory")) {
#if defined(__APPLE__) || defined(MACOSX)
    bPinGenericMemory = false;  // Generic Pinning of System Paged memory is not
                                // currently supported on Mac OSX
        bPinGenericMemory = false; // Generic Pinning of System Paged memory is not
                                   // currently supported on Mac OSX
#else
    bPinGenericMemory = true;
        bPinGenericMemory = true;
#endif
  }
    }

  if (bPinGenericMemory) {
    printf("> Using Generic System Paged Memory (malloc)\n");
  } else {
    printf("> Using CUDA Host Allocated (cudaHostAlloc)\n");
  }
    if (bPinGenericMemory) {
        printf("> Using Generic System Paged Memory (malloc)\n");
    }
    else {
        printf("> Using CUDA Host Allocated (cudaHostAlloc)\n");
    }

  checkCudaErrors(cudaSetDevice(idev));
    checkCudaErrors(cudaSetDevice(idev));

  /* Verify the selected device supports mapped memory and set the device
     flags for mapping host memory. */
    /* Verify the selected device supports mapped memory and set the device
       flags for mapping host memory. */

  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, idev));
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, idev));

#if CUDART_VERSION >= 2020

  if (!deviceProp.canMapHostMemory) {
    fprintf(stderr, "Device %d does not support mapping CPU host memory!\n",
            idev);
    if (!deviceProp.canMapHostMemory) {
        fprintf(stderr, "Device %d does not support mapping CPU host memory!\n", idev);

        exit(EXIT_SUCCESS);
    }

    checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost));
#else
    fprintf(stderr,
            "CUDART version %d.%d does not support "
            "<cudaDeviceProp.canMapHostMemory> field\n",
            CUDART_VERSION / 1000,
            (CUDART_VERSION % 100) / 10);

    exit(EXIT_SUCCESS);
  }

  checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost));
#else
  fprintf(stderr,
          "CUDART version %d.%d does not support "
          "<cudaDeviceProp.canMapHostMemory> field\n",
          CUDART_VERSION / 1000, (CUDART_VERSION % 100) / 10);
  exit(EXIT_SUCCESS);
#endif

#if CUDART_VERSION < 4000

  if (bPinGenericMemory) {
    fprintf(
        stderr,
        "CUDART version %d.%d does not support <cudaHostRegister> function\n",
        CUDART_VERSION / 1000, (CUDART_VERSION % 100) / 10);
    if (bPinGenericMemory) {
        fprintf(stderr,
                "CUDART version %d.%d does not support <cudaHostRegister> function\n",
                CUDART_VERSION / 1000,
                (CUDART_VERSION % 100) / 10);

    exit(EXIT_SUCCESS);
  }
        exit(EXIT_SUCCESS);
    }

#endif

  /* Allocate mapped CPU memory. */
    /* Allocate mapped CPU memory. */

  nelem = 1048576;
  bytes = nelem * sizeof(float);
    nelem = 1048576;
    bytes = nelem * sizeof(float);

  if (bPinGenericMemory) {
    if (bPinGenericMemory) {
#if CUDART_VERSION >= 4000
    a_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT);
    b_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT);
    c_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT);
        a_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT);
        b_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT);
        c_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT);
    // We need to ensure memory is aligned to 4K (so we will need to pad memory
    // accordingly)
    a = (float *)ALIGN_UP(a_UA, MEMORY_ALIGNMENT);
    b = (float *)ALIGN_UP(b_UA, MEMORY_ALIGNMENT);
    c = (float *)ALIGN_UP(c_UA, MEMORY_ALIGNMENT);
        // We need to ensure memory is aligned to 4K (so we will need to pad memory
        // accordingly)
        a = (float *)ALIGN_UP(a_UA, MEMORY_ALIGNMENT);
        b = (float *)ALIGN_UP(b_UA, MEMORY_ALIGNMENT);
        c = (float *)ALIGN_UP(c_UA, MEMORY_ALIGNMENT);

    checkCudaErrors(cudaHostRegister(a, bytes, cudaHostRegisterMapped));
    checkCudaErrors(cudaHostRegister(b, bytes, cudaHostRegisterMapped));
    checkCudaErrors(cudaHostRegister(c, bytes, cudaHostRegisterMapped));
        checkCudaErrors(cudaHostRegister(a, bytes, cudaHostRegisterMapped));
        checkCudaErrors(cudaHostRegister(b, bytes, cudaHostRegisterMapped));
        checkCudaErrors(cudaHostRegister(c, bytes, cudaHostRegisterMapped));
#endif
  } else {
    }
    else {
#if CUDART_VERSION >= 2020
    flags = cudaHostAllocMapped;
    checkCudaErrors(cudaHostAlloc((void **)&a, bytes, flags));
    checkCudaErrors(cudaHostAlloc((void **)&b, bytes, flags));
    checkCudaErrors(cudaHostAlloc((void **)&c, bytes, flags));
        flags = cudaHostAllocMapped;
        checkCudaErrors(cudaHostAlloc((void **)&a, bytes, flags));
        checkCudaErrors(cudaHostAlloc((void **)&b, bytes, flags));
        checkCudaErrors(cudaHostAlloc((void **)&c, bytes, flags));
#endif
  }
    }

  /* Initialize the vectors. */
    /* Initialize the vectors. */

  for (n = 0; n < nelem; n++) {
    a[n] = rand() / (float)RAND_MAX;
    b[n] = rand() / (float)RAND_MAX;
  }
    for (n = 0; n < nelem; n++) {
        a[n] = rand() / (float)RAND_MAX;
        b[n] = rand() / (float)RAND_MAX;
    }

    /* Get the device pointers for the pinned CPU memory mapped into the GPU
       memory space. */

#if CUDART_VERSION >= 2020
  checkCudaErrors(cudaHostGetDevicePointer((void **)&d_a, (void *)a, 0));
  checkCudaErrors(cudaHostGetDevicePointer((void **)&d_b, (void *)b, 0));
  checkCudaErrors(cudaHostGetDevicePointer((void **)&d_c, (void *)c, 0));
    checkCudaErrors(cudaHostGetDevicePointer((void **)&d_a, (void *)a, 0));
    checkCudaErrors(cudaHostGetDevicePointer((void **)&d_b, (void *)b, 0));
    checkCudaErrors(cudaHostGetDevicePointer((void **)&d_c, (void *)c, 0));
#endif

  /* Call the GPU kernel using the CPU pointers residing in CPU mapped memory.
   */
  printf("> vectorAddGPU kernel will add vectors using mapped CPU memory...\n");
  dim3 block(256);
  dim3 grid((unsigned int)ceil(nelem / (float)block.x));
  vectorAddGPU<<<grid, block>>>(d_a, d_b, d_c, nelem);
  checkCudaErrors(cudaDeviceSynchronize());
  getLastCudaError("vectorAddGPU() execution failed");
    /* Call the GPU kernel using the CPU pointers residing in CPU mapped memory.
     */
    printf("> vectorAddGPU kernel will add vectors using mapped CPU memory...\n");
    dim3 block(256);
    dim3 grid((unsigned int)ceil(nelem / (float)block.x));
    vectorAddGPU<<<grid, block>>>(d_a, d_b, d_c, nelem);
    checkCudaErrors(cudaDeviceSynchronize());
    getLastCudaError("vectorAddGPU() execution failed");

  /* Compare the results */
    /* Compare the results */

  printf("> Checking the results from vectorAddGPU() ...\n");
  errorNorm = 0.f;
  refNorm = 0.f;
    printf("> Checking the results from vectorAddGPU() ...\n");
    errorNorm = 0.f;
    refNorm   = 0.f;

  for (n = 0; n < nelem; n++) {
    ref = a[n] + b[n];
    diff = c[n] - ref;
    errorNorm += diff * diff;
    refNorm += ref * ref;
  }
    for (n = 0; n < nelem; n++) {
        ref  = a[n] + b[n];
        diff = c[n] - ref;
        errorNorm += diff * diff;
        refNorm += ref * ref;
    }

  errorNorm = (float)sqrt((double)errorNorm);
  refNorm = (float)sqrt((double)refNorm);
    errorNorm = (float)sqrt((double)errorNorm);
    refNorm   = (float)sqrt((double)refNorm);

  /* Memory clean up */
    /* Memory clean up */

  printf("> Releasing CPU memory...\n");
    printf("> Releasing CPU memory...\n");

  if (bPinGenericMemory) {
    if (bPinGenericMemory) {
#if CUDART_VERSION >= 4000
    checkCudaErrors(cudaHostUnregister(a));
    checkCudaErrors(cudaHostUnregister(b));
    checkCudaErrors(cudaHostUnregister(c));
    free(a_UA);
    free(b_UA);
    free(c_UA);
        checkCudaErrors(cudaHostUnregister(a));
        checkCudaErrors(cudaHostUnregister(b));
        checkCudaErrors(cudaHostUnregister(c));
        free(a_UA);
        free(b_UA);
        free(c_UA);
#endif
  } else {
    }
    else {
#if CUDART_VERSION >= 2020
    checkCudaErrors(cudaFreeHost(a));
    checkCudaErrors(cudaFreeHost(b));
    checkCudaErrors(cudaFreeHost(c));
        checkCudaErrors(cudaFreeHost(a));
        checkCudaErrors(cudaFreeHost(b));
        checkCudaErrors(cudaFreeHost(c));
#endif
  }
    }

  exit(errorNorm / refNorm < 1.e-6f ? EXIT_SUCCESS : EXIT_FAILURE);
    exit(errorNorm / refNorm < 1.e-6f ? EXIT_SUCCESS : EXIT_FAILURE);
}
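Stripped of the version guards, the mapped (zero-copy) flow this sample follows reduces to a few calls; a minimal sketch of the cudaHostAlloc branch (buffer names illustrative):

    checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost)); // before the context is created
    float *h_buf, *d_buf;
    checkCudaErrors(cudaHostAlloc((void **)&h_buf, bytes, cudaHostAllocMapped));
    checkCudaErrors(cudaHostGetDevicePointer((void **)&d_buf, (void *)h_buf, 0));
    // Kernels may now read/write d_buf; the data lives in h_buf, with no explicit cudaMemcpy.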
@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in [Dependencies]() section above are installed.

## References (for more details)

@ -29,113 +29,111 @@
 * memory.
 */

#include <cstdio>
#include <ctime>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <math.h>
#include <stdint.h>
#include <cstdio>
#include <ctime>

#define min(a, b) (a) < (b) ? (a) : (b)
#define max(a, b) (a) > (b) ? (a) : (b)

#define LOOP_NUM 50
__global__ void atomicKernel(int *atom_arr) {
  unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
__global__ void atomicKernel(int *atom_arr)
{
    unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

  for (int i = 0; i < LOOP_NUM; i++) {
    // Atomic addition
    atomicAdd_system(&atom_arr[0], 10);
    for (int i = 0; i < LOOP_NUM; i++) {
        // Atomic addition
        atomicAdd_system(&atom_arr[0], 10);

    // Atomic exchange
    atomicExch_system(&atom_arr[1], tid);
        // Atomic exchange
        atomicExch_system(&atom_arr[1], tid);

    // Atomic maximum
    atomicMax_system(&atom_arr[2], tid);
        // Atomic maximum
        atomicMax_system(&atom_arr[2], tid);

    // Atomic minimum
    atomicMin_system(&atom_arr[3], tid);
        // Atomic minimum
        atomicMin_system(&atom_arr[3], tid);

    // Atomic increment (modulo 17+1)
    atomicInc_system((unsigned int *)&atom_arr[4], 17);
        // Atomic increment (modulo 17+1)
        atomicInc_system((unsigned int *)&atom_arr[4], 17);

    // Atomic decrement
    atomicDec_system((unsigned int *)&atom_arr[5], 137);
        // Atomic decrement
        atomicDec_system((unsigned int *)&atom_arr[5], 137);

    // Atomic compare-and-swap
    atomicCAS_system(&atom_arr[6], tid - 1, tid);
        // Atomic compare-and-swap
        atomicCAS_system(&atom_arr[6], tid - 1, tid);

    // Bitwise atomic instructions
        // Bitwise atomic instructions

    // Atomic AND
    atomicAnd_system(&atom_arr[7], 2 * tid + 7);
        // Atomic AND
        atomicAnd_system(&atom_arr[7], 2 * tid + 7);

    // Atomic OR
    atomicOr_system(&atom_arr[8], 1 << tid);
        // Atomic OR
        atomicOr_system(&atom_arr[8], 1 << tid);

    // Atomic XOR
    atomicXor_system(&atom_arr[9], tid);
  }
        // Atomic XOR
        atomicXor_system(&atom_arr[9], tid);
    }
}
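The _system suffix widens each atomic's scope from the device to the whole system, so the CPU threads in atomicKernel_CPU below contend on the same array. The wrap-around semantics of the increment/decrement pair, which that CPU code reproduces with compare-and-swap loops, can be written out as (per the CUDA documentation of atomicInc/atomicDec):

    // atomicInc(p, limit): old = *p; *p = (old >= limit) ? 0 : old + 1; return old;
    // atomicDec(p, limit): old = *p; *p = (old == 0 || old > limit) ? limit : old - 1; return old;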
void atomicKernel_CPU(int *atom_arr, int no_of_threads) {
  for (int i = no_of_threads; i < 2 * no_of_threads; i++) {
    for (int j = 0; j < LOOP_NUM; j++) {
      // Atomic addition
      __sync_fetch_and_add(&atom_arr[0], 10);
void atomicKernel_CPU(int *atom_arr, int no_of_threads)
{
    for (int i = no_of_threads; i < 2 * no_of_threads; i++) {
        for (int j = 0; j < LOOP_NUM; j++) {
            // Atomic addition
            __sync_fetch_and_add(&atom_arr[0], 10);

      // Atomic exchange
      __sync_lock_test_and_set(&atom_arr[1], i);
            // Atomic exchange
            __sync_lock_test_and_set(&atom_arr[1], i);

      // Atomic maximum
      int old, expected;
      do {
        expected = atom_arr[2];
        old = __sync_val_compare_and_swap(&atom_arr[2], expected,
                                          max(expected, i));
      } while (old != expected);
            // Atomic maximum
            int old, expected;
            do {
                expected = atom_arr[2];
                old      = __sync_val_compare_and_swap(&atom_arr[2], expected, max(expected, i));
            } while (old != expected);

      // Atomic minimum
      do {
        expected = atom_arr[3];
        old = __sync_val_compare_and_swap(&atom_arr[3], expected,
                                          min(expected, i));
      } while (old != expected);
            // Atomic minimum
            do {
                expected = atom_arr[3];
                old      = __sync_val_compare_and_swap(&atom_arr[3], expected, min(expected, i));
            } while (old != expected);

      // Atomic increment (modulo 17+1)
      int limit = 17;
      do {
        expected = atom_arr[4];
        old = __sync_val_compare_and_swap(
            &atom_arr[4], expected, (expected >= limit) ? 0 : expected + 1);
      } while (old != expected);
            // Atomic increment (modulo 17+1)
            int limit = 17;
            do {
                expected = atom_arr[4];
                old      = __sync_val_compare_and_swap(&atom_arr[4], expected, (expected >= limit) ? 0 : expected + 1);
            } while (old != expected);

      // Atomic decrement
      limit = 137;
      do {
        expected = atom_arr[5];
        old = __sync_val_compare_and_swap(
            &atom_arr[5], expected,
            ((expected == 0) || (expected > limit)) ? limit : expected - 1);
      } while (old != expected);
            // Atomic decrement
            limit = 137;
            do {
                expected = atom_arr[5];
                old      = __sync_val_compare_and_swap(
                    &atom_arr[5], expected, ((expected == 0) || (expected > limit)) ? limit : expected - 1);
            } while (old != expected);

      // Atomic compare-and-swap
      __sync_val_compare_and_swap(&atom_arr[6], i - 1, i);
            // Atomic compare-and-swap
            __sync_val_compare_and_swap(&atom_arr[6], i - 1, i);

      // Bitwise atomic instructions
            // Bitwise atomic instructions

      // Atomic AND
      __sync_fetch_and_and(&atom_arr[7], 2 * i + 7);
            // Atomic AND
            __sync_fetch_and_and(&atom_arr[7], 2 * i + 7);

      // Atomic OR
      __sync_fetch_and_or(&atom_arr[8], 1 << i);
            // Atomic OR
            __sync_fetch_and_or(&atom_arr[8], 1 << i);
      // Atomic XOR
 | 
			
		||||
      // 11th element should be 0xff
 | 
			
		||||
      __sync_fetch_and_xor(&atom_arr[9], i);
 | 
			
		||||
            // Atomic XOR
 | 
			
		||||
            // 11th element should be 0xff
 | 
			
		||||
            __sync_fetch_and_xor(&atom_arr[9], i);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
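// The CAS retry loops above emulate fetch-and-max/min on top of
// __sync_val_compare_and_swap. A minimal C++11 sketch of the same idiom with
// std::atomic (hypothetical helper, not part of this sample):
#include <algorithm>
#include <atomic>

static void atomicFetchMax(std::atomic<int> &target, int value)
{
    int expected = target.load();
    // compare_exchange_weak reloads `expected` on failure, so the loop retries
    // until no other thread updated `target` in between.
    while (!target.compare_exchange_weak(expected, std::max(expected, value))) {
    }
}
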
////////////////////////////////////////////////////////////////////////////////
@@ -145,198 +143,201 @@ void atomicKernel_CPU(int *atom_arr, int no_of_threads) {
//! @param idata      input data as provided to device
//! @param len        number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////
int verify(int *testData, const int len)
{
    int val = 0;

    for (int i = 0; i < len * LOOP_NUM; ++i) {
        val += 10;
    }

    if (val != testData[0]) {
        printf("atomicAdd failed val = %d testData = %d\n", val, testData[0]);
        return false;
    }

    val = 0;

    bool found = false;

    for (int i = 0; i < len; ++i) {
        // second element should be a member of [0, len)
        if (i == testData[1]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicExch failed\n");
        return false;
    }

    val = -(1 << 8);

    for (int i = 0; i < len; ++i) {
        // third element should be len-1
        val = max(val, i);
    }

    if (val != testData[2]) {
        printf("atomicMax failed\n");
        return false;
    }

    val = 1 << 8;

    for (int i = 0; i < len; ++i) {
        val = min(val, i);
    }

    if (val != testData[3]) {
        printf("atomicMin failed\n");
        return false;
    }

    int limit = 17;
    val       = 0;

    for (int i = 0; i < len * LOOP_NUM; ++i) {
        val = (val >= limit) ? 0 : val + 1;
    }

    if (val != testData[4]) {
        printf("atomicInc failed\n");
        return false;
    }

    limit = 137;
    val   = 0;

    for (int i = 0; i < len * LOOP_NUM; ++i) {
        val = ((val == 0) || (val > limit)) ? limit : val - 1;
    }

    if (val != testData[5]) {
        printf("atomicDec failed\n");
        return false;
    }

    found = false;

    for (int i = 0; i < len; ++i) {
        // seventh element should be a member of [0, len)
        if (i == testData[6]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicCAS failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 8th element should be 1
        val &= (2 * i + 7);
    }

    if (val != testData[7]) {
        printf("atomicAnd failed\n");
        return false;
    }

    val = 0;

    for (int i = 0; i < len; ++i) {
        // 9th element should be 0xff
        val |= (1 << i);
    }

    if (val != testData[8]) {
        printf("atomicOr failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 11th element should be 0xff
        val ^= i;
    }

    if (val != testData[9]) {
        printf("atomicXor failed\n");
        return false;
    }

    return true;
}

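// The GPU half of this test (in an earlier hunk of this file) performs the
// same updates with the _system-scoped atomics available from compute
// capability 6.0, which is why main() below waives devices older than that. A
// minimal sketch of the idea (illustrative kernel name and body, not the
// sample's exact code):
__global__ void atomicAddKernel_system(int *atom_arr)
{
    for (int j = 0; j < LOOP_NUM; j++) {
        // atomicAdd_system makes the update coherent with concurrent CPU
        // threads, not just with other threads on the same GPU.
        atomicAdd_system(&atom_arr[0], 10);
    }
}
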
int main(int argc, char **argv)
{
    // set device
    cudaDeviceProp device_prop;
    int            dev_id = findCudaDevice(argc, (const char **)argv);
    checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));

    if (!device_prop.managedMemory) {
        // This sample requires being run on a device that supports Unified Memory
        fprintf(stderr, "Unified Memory not supported on this device\n");
        exit(EXIT_WAIVED);
    }

    if (device_prop.computeMode == cudaComputeModeProhibited) {
        // This sample requires being run with a default or process exclusive mode
        fprintf(stderr,
                "This sample requires a device in either default or process "
                "exclusive mode\n");
        exit(EXIT_WAIVED);
    }

    if (device_prop.major < 6) {
        printf("%s: requires a minimum CUDA compute 6.0 capability, waiving "
               "testing.\n",
               argv[0]);
        exit(EXIT_WAIVED);
    }

    unsigned int numThreads = 256;
    unsigned int numBlocks  = 64;
    unsigned int numData    = 10;

    int *atom_arr;

    if (device_prop.pageableMemoryAccess) {
        printf("CAN access pageable memory\n");
        atom_arr = (int *)malloc(sizeof(int) * numData);
    }
    else {
        printf("CANNOT access pageable memory\n");
        checkCudaErrors(cudaMallocManaged(&atom_arr, sizeof(int) * numData));
    }

    for (unsigned int i = 0; i < numData; i++)
        atom_arr[i] = 0;

    // To make the AND and XOR tests generate something other than 0...
    atom_arr[7] = atom_arr[9] = 0xff;

    atomicKernel<<<numBlocks, numThreads>>>(atom_arr);
    atomicKernel_CPU(atom_arr, numBlocks * numThreads);

    checkCudaErrors(cudaDeviceSynchronize());

    // Compute & verify reference solution
    int testResult = verify(atom_arr, 2 * numThreads * numBlocks);

    if (device_prop.pageableMemoryAccess) {
        free(atom_arr);
    }
    else {
        cudaFree(atom_arr);
    }

    printf("systemWideAtomics completed, returned %s \n", testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
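// The pageableMemoryAccess branch in main() above decides between plain malloc
// and cudaMallocManaged. The same capability can also be queried without
// filling a whole cudaDeviceProp; a small sketch (hypothetical helper):
static bool canAccessPageableMemory(int dev_id)
{
    int pageableAccess = 0;
    // Nonzero means the GPU can coherently access ordinary malloc'd host
    // memory, so no managed allocation is required.
    checkCudaErrors(cudaDeviceGetAttribute(&pageableAccess, cudaDevAttrPageableMemoryAccess, dev_id));
    return pageableAccess != 0;
}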

@@ -31,10 +31,10 @@
 */

// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes CUDA
#include <cuda_runtime.h>
@@ -47,34 +47,34 @@
// declaration, forward
void runTest(int argc, char **argv);

extern "C" void computeGold(float *reference, float *idata, const unsigned int len);

////////////////////////////////////////////////////////////////////////////////
//! Simple test kernel for device functionality
//! @param g_idata  input data in global memory
//! @param g_odata  output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void testKernel(float *g_idata, float *g_odata)
{
    // shared memory
    // the size is determined by the host application
    extern __shared__ float sdata[];

    // access thread id
    const unsigned int tid = threadIdx.x;
    // access number of threads in this block
    const unsigned int num_threads = blockDim.x;

    // read in input data from global memory
    sdata[tid] = g_idata[tid];
    __syncthreads();

    // perform some computations
    sdata[tid] = (float)num_threads * sdata[tid];
    __syncthreads();

    // write data to global memory
    g_odata[tid] = sdata[tid];
}

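// testKernel declares sdata with `extern __shared__`, so its size is fixed at
// launch time by the third launch-configuration argument rather than at
// compile time. The matching launch appears in runTest() below:
//
//     testKernel<<<grid, threads, mem_size>>>(d_idata, d_odata);
//
// where mem_size = sizeof(float) * num_threads reserves one float per thread.
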
////////////////////////////////////////////////////////////////////////////////
@@ -85,81 +85,81 @@ int main(int argc, char **argv) { runTest(argc, argv); }
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    bool bTestResult = true;

    printf("%s Starting...\n\n", argv[0]);

    // use command-line specified CUDA device, otherwise use device with highest
    // Gflops/s
    int devID = findCudaDevice(argc, (const char **)argv);

    StopWatchInterface *timer = 0;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    unsigned int num_threads = 32;
    unsigned int mem_size    = sizeof(float) * num_threads;

    // allocate host memory
    float *h_idata = (float *)malloc(mem_size);

    // initialize the memory
    for (unsigned int i = 0; i < num_threads; ++i) {
        h_idata[i] = (float)i;
    }

    // allocate device memory
    float *d_idata;
    checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size));
    // copy host memory to device
    checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

    // allocate device memory for result
    float *d_odata;
    checkCudaErrors(cudaMalloc((void **)&d_odata, mem_size));

    // setup execution parameters
    dim3 grid(1, 1, 1);
    dim3 threads(num_threads, 1, 1);

    // execute the kernel
    testKernel<<<grid, threads, mem_size>>>(d_idata, d_odata);

    // check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

    // allocate mem for the result on host side
    float *h_odata = (float *)malloc(mem_size);
    // copy result from device to host
    checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(float) * num_threads, cudaMemcpyDeviceToHost));

    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    // compute reference solution
    float *reference = (float *)malloc(mem_size);
    computeGold(reference, h_idata, num_threads);

    // check result
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
        // write file for regression test
        sdkWriteFile("./data/regression.dat", h_odata, num_threads, 0.0f, false);
    }
    else {
        // custom output handling when no regression test running
        // in this case check if the result is equivalent to the expected solution
        bTestResult = compareData(reference, h_odata, num_threads, 0.0f, 0.0f);
    }

    // cleanup memory
    free(h_idata);
    free(h_odata);
    free(reference);
    checkCudaErrors(cudaFree(d_idata));
    checkCudaErrors(cudaFree(d_odata));

    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

@@ -26,8 +26,7 @@
 */

// export C interface
extern "C" void computeGold(float *reference, float *idata, const unsigned int len);

////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set
@@ -36,10 +35,11 @@ extern "C" void computeGold(float *reference, float *idata,
//! @param idata      input data as provided to device
//! @param len        number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////
void computeGold(float *reference, float *idata, const unsigned int len)
{
    const float f_len = static_cast<float>(len);

    for (unsigned int i = 0; i < len; ++i) {
        reference[i] = idata[i] * f_len;
    }
}

@@ -37,7 +37,6 @@

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

#include <helper_cuda.h>
/**
 * CUDA Kernel Device code
@@ -45,166 +44,153 @@
 * Computes the vector addition of A and B into C. The 3 vectors have the same
 * number of elements numElements.
 */
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements) {
        C[i] = A[i] + B[i] + 0.0f;
    }
}

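// vectorAdd assumes the grid covers every element, with the `i < numElements`
// guard absorbing the final partial block. A common, equivalent alternative is
// a grid-stride loop, which stays correct for any grid size (a sketch, not the
// sample's kernel):
__global__ void vectorAddStride(const float *A, const float *B, float *C, int numElements)
{
    // Each thread handles elements i, i + gridSize, i + 2 * gridSize, ...
    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < numElements; i += blockDim.x * gridDim.x) {
        C[i] = A[i] + B[i];
    }
}
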
/**
 * Host main routine
 */
int main(void)
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

    // Print the vector length to be used, and compute its size
    int    numElements = 50000;
    size_t size        = numElements * sizeof(float);
    printf("[Vector addition of %d elements]\n", numElements);

    // Allocate the host input vector A
    float *h_A = (float *)malloc(size);

    // Allocate the host input vector B
    float *h_B = (float *)malloc(size);

    // Allocate the host output vector C
    float *h_C = (float *)malloc(size);

    // Verify that allocations succeeded
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the host input vectors
    for (int i = 0; i < numElements; ++i) {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }

    // Allocate the device input vector A
    float *d_A = NULL;
    err        = cudaMalloc((void **)&d_A, size);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Allocate the device input vector B
    float *d_B = NULL;
    err        = cudaMalloc((void **)&d_B, size);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Allocate the device output vector C
    float *d_C = NULL;
    err        = cudaMalloc((void **)&d_C, size);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the host input vectors A and B in host memory to the device input
    // vectors in device memory
    printf("Copy input data from the host memory to the CUDA device\n");
    err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Launch the Vector Add CUDA Kernel
    int threadsPerBlock = 256;
    int blocksPerGrid   = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    err = cudaGetLastError();

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the device result vector in device memory to the host result vector
    // in host memory.
    printf("Copy output data from the CUDA device to the host memory\n");
    err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Verify that the result vector is correct
    for (int i = 0; i < numElements; ++i) {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    printf("Test PASSED\n");

    // Free device global memory
    err = cudaFree(d_A);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaFree(d_B);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaFree(d_C);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);

    printf("Done\n");
    return 0;
}

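// The error handling above repeats the same four-line pattern after every CUDA
// call. A small macro in the same spirit (a sketch; this sample deliberately
// spells the checks out, and helper_cuda.h's checkCudaErrors offers the same
// service):
#define CHECK_CUDA(call)                                                                                               \
    do {                                                                                                               \
        cudaError_t e = (call);                                                                                        \
        if (e != cudaSuccess) {                                                                                        \
            fprintf(stderr, "%s failed (error code %s)!\n", #call, cudaGetErrorString(e));                             \
            exit(EXIT_FAILURE);                                                                                        \
        }                                                                                                              \
    } while (0)

// Usage: CHECK_CUDA(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
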
@@ -34,11 +34,11 @@
 */

// Includes
#include <cstring>
#include <cuda.h>
#include <iostream>
#include <stdio.h>
#include <string.h>

// includes, project
#include <helper_cuda_drvapi.h>
@@ -50,19 +50,19 @@
using namespace std;

// Variables
CUdevice    cuDevice;
CUcontext   cuContext;
CUmodule    cuModule;
CUfunction  vecAdd_kernel;
float      *h_A;
float      *h_B;
float      *h_C;
CUdeviceptr d_A;
CUdeviceptr d_B;
CUdeviceptr d_C;

// Functions
int  CleanupNoFailure();
void RandomInit(float *, int);
bool findModulePath(const char *, string &, char **, string &);

@@ -72,150 +72,152 @@ bool findModulePath(const char *, string &, char **, string &);
#endif

// Host code
int main(int argc, char **argv)
{
    printf("Vector Addition (Driver API)\n");
    int    N = 50000, devID = 0;
    size_t size = N * sizeof(float);

    // Initialize
    checkCudaErrors(cuInit(0));

    cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
    // Create context
    checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));

    // first search for the module path before we load the results
    string module_path;

    std::ostringstream fatbin;

    if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
        exit(EXIT_FAILURE);
    }
    else {
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }

    if (!fatbin.str().size()) {
        printf("fatbin file empty. exiting..\n");
        exit(EXIT_FAILURE);
    }

    // Create module from binary file (FATBIN)
    checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

    // Get function handle from module
    checkCudaErrors(cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));

    // Allocate input vectors h_A and h_B in host memory
    h_A = (float *)malloc(size);
    h_B = (float *)malloc(size);
    h_C = (float *)malloc(size);

    // Initialize input vectors
    RandomInit(h_A, N);
    RandomInit(h_B, N);

    // Allocate vectors in device memory
    checkCudaErrors(cuMemAlloc(&d_A, size));

    checkCudaErrors(cuMemAlloc(&d_B, size));

    checkCudaErrors(cuMemAlloc(&d_C, size));

    // Copy vectors from host memory to device memory
    checkCudaErrors(cuMemcpyHtoD(d_A, h_A, size));

    checkCudaErrors(cuMemcpyHtoD(d_B, h_B, size));

    if (1) {
        // This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel
        // Launch (simpler method)

        // Grid/Block configuration
        int threadsPerBlock = 256;
        int blocksPerGrid   = (N + threadsPerBlock - 1) / threadsPerBlock;

        void *args[] = {&d_A, &d_B, &d_C, &N};

        // Launch the CUDA kernel
        checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, args, NULL));
    }
    else {
        // This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel
        // Launch (advanced method)
        int   offset = 0;
        void *argBuffer[16];
        *((CUdeviceptr *)&argBuffer[offset]) = d_A;
        offset += sizeof(d_A);
        *((CUdeviceptr *)&argBuffer[offset]) = d_B;
        offset += sizeof(d_B);
        *((CUdeviceptr *)&argBuffer[offset]) = d_C;
        offset += sizeof(d_C);
        *((int *)&argBuffer[offset]) = N;
        offset += sizeof(N);

        // Grid/Block configuration
        int threadsPerBlock = 256;
        int blocksPerGrid   = (N + threadsPerBlock - 1) / threadsPerBlock;

        // Launch the CUDA kernel
        checkCudaErrors(
            cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, NULL, argBuffer));
    }

#ifdef _DEBUG
    checkCudaErrors(cuCtxSynchronize());
#endif

    // Copy result from device memory to host memory
    // h_C contains the result in host memory
    checkCudaErrors(cuMemcpyDtoH(h_C, d_C, size));

    // Verify result
    int i;

    for (i = 0; i < N; ++i) {
        float sum = h_A[i] + h_B[i];

        if (fabs(h_C[i] - sum) > 1e-7f) {
            break;
        }
    }

    CleanupNoFailure();
    printf("%s\n", (i == N) ? "Result = PASS" : "Result = FAIL");

    exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE);
}
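// The dead `else` branch above packs arguments into a byte buffer and hands it
// to cuLaunchKernel through the trailing `extra` parameter. The form documented
// for the driver API wraps that buffer in a CU_LAUNCH_PARAM config array; a
// sketch of that packaging (assuming the same argBuffer and offset as above):
//
//     size_t argBufferSize = offset;
//     void  *extra[]       = {CU_LAUNCH_PARAM_BUFFER_POINTER,
//                             argBuffer,
//                             CU_LAUNCH_PARAM_BUFFER_SIZE,
//                             &argBufferSize,
//                             CU_LAUNCH_PARAM_END};
//     checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, NULL, extra));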

int CleanupNoFailure()
{
    // Free device memory
    checkCudaErrors(cuMemFree(d_A));
    checkCudaErrors(cuMemFree(d_B));
    checkCudaErrors(cuMemFree(d_C));

    // Free host memory
    if (h_A) {
        free(h_A);
    }

    if (h_B) {
        free(h_B);
    }

    if (h_C) {
        free(h_C);
    }

    checkCudaErrors(cuCtxDestroy(cuContext));

    return EXIT_SUCCESS;
}
// Allocates an array with random float entries.
void RandomInit(float *data, int n)
{
    for (int i = 0; i < n; ++i) {
        data[i] = rand() / (float)RAND_MAX;
    }
}

@@ -33,9 +33,10 @@
 */

// Device code
extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, float *C, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < N)
        C[i] = A[i] + B[i];
}

@@ -30,4 +30,3 @@ cuMemcpyDtoH, cuDeviceCanAccessPeer, cuModuleGetFunction, cuMemSetAccess, cuMemR
Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

## References (for more details)

@@ -29,172 +29,172 @@

static size_t round_up(size_t x, size_t y) { return ((x + y - 1) / y) * y; }

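// round_up pads x to the next multiple of y, e.g. round_up(5, 4) == 8 and
// round_up(8, 4) == 8. Below it serves to make allocation sizes respect the
// granularity reported by cuMemGetAllocationGranularity; with a typical 2 MiB
// minimum granularity, a 3 MiB request would be padded to 4 MiB.
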
CUresult simpleMallocMultiDeviceMmap(CUdeviceptr                 *dptr,
                                     size_t                      *allocationSize,
                                     size_t                       size,
                                     const std::vector<CUdevice> &residentDevices,
                                     const std::vector<CUdevice> &mappingDevices,
                                     size_t                       align)
{
    CUresult status          = CUDA_SUCCESS;
    size_t   min_granularity = 0;
    size_t   stripeSize;

    // Setup the properties common for all the chunks
    // The allocations will be device pinned memory.
    // This property structure describes the physical location where the memory
    // will be allocated via cuMemCreate along with additional properties. In this
    // case, the allocation will be pinned device memory local to a given device.
    CUmemAllocationProp prop = {};
    prop.type                = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type       = CU_MEM_LOCATION_TYPE_DEVICE;

    // Get the minimum granularity needed for the resident devices
    // (the max of the minimum granularity of each participating device)
    for (int idx = 0; idx < residentDevices.size(); idx++) {
        size_t granularity = 0;

        // get the minimum granularity for residentDevices[idx]
        prop.location.id = residentDevices[idx];
        status           = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
        if (status != CUDA_SUCCESS) {
            goto done;
        }
        if (min_granularity < granularity) {
            min_granularity = granularity;
        }
    }

    // Get the minimum granularity needed for the accessing devices
    // (the max of the minimum granularity of each participating device)
    for (size_t idx = 0; idx < mappingDevices.size(); idx++) {
        size_t granularity = 0;

        // get the minimum granularity for mappingDevices[idx]
        prop.location.id = mappingDevices[idx];
        status           = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
        if (status != CUDA_SUCCESS) {
            goto done;
        }
        if (min_granularity < granularity) {
            min_granularity = granularity;
        }
    }

    // Round up the size such that we can evenly split it into a stripe size that
    // meets the granularity requirements Essentially size = N *
			
		||||
  // residentDevices.size() * min_granularity is the requirement, since each
 | 
			
		||||
  // piece of the allocation will be stripeSize = N * min_granularity and the
 | 
			
		||||
  // min_granularity requirement applies to each stripeSize piece of the
 | 
			
		||||
  // allocation.
 | 
			
		||||
  size = round_up(size, residentDevices.size() * min_granularity);
 | 
			
		||||
  stripeSize = size / residentDevices.size();
 | 
			
		||||
 | 
			
		||||
  // Return the rounded up size to the caller for use in the free
 | 
			
		||||
  if (allocationSize) {
 | 
			
		||||
    *allocationSize = size;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Reserve the required contiguous VA space for the allocations
 | 
			
		||||
  status = cuMemAddressReserve(dptr, size, align, 0, 0);
 | 
			
		||||
  if (status != CUDA_SUCCESS) {
 | 
			
		||||
    goto done;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Create and map the backings on each gpu
 | 
			
		||||
  // note: reusing CUmemAllocationProp prop from earlier with prop.type &
 | 
			
		||||
  // prop.location.type already specified.
 | 
			
		||||
  for (size_t idx = 0; idx < residentDevices.size(); idx++) {
 | 
			
		||||
    CUresult status2 = CUDA_SUCCESS;
 | 
			
		||||
 | 
			
		||||
    // Set the location for this chunk to this device
 | 
			
		||||
    prop.location.id = residentDevices[idx];
 | 
			
		||||
 | 
			
		||||
    // Create the allocation as a pinned allocation on this device
 | 
			
		||||
    CUmemGenericAllocationHandle allocationHandle;
 | 
			
		||||
    status = cuMemCreate(&allocationHandle, stripeSize, &prop, 0);
 | 
			
		||||
    if (status != CUDA_SUCCESS) {
 | 
			
		||||
      goto done;
 | 
			
		||||
        // get the minnimum granularity for residentDevices[idx]
 | 
			
		||||
        prop.location.id = residentDevices[idx];
 | 
			
		||||
        status           = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
 | 
			
		||||
        if (status != CUDA_SUCCESS) {
 | 
			
		||||
            goto done;
 | 
			
		||||
        }
 | 
			
		||||
        if (min_granularity < granularity) {
 | 
			
		||||
            min_granularity = granularity;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Assign the chunk to the appropriate VA range and release the handle.
 | 
			
		||||
    // After mapping the memory, it can be referenced by virtual address.
 | 
			
		||||
    // Since we do not need to make any other mappings of this memory or export
 | 
			
		||||
    // it, we no longer need and can release the allocationHandle. The
 | 
			
		||||
    // allocation will be kept live until it is unmapped.
 | 
			
		||||
    status = cuMemMap(*dptr + (stripeSize * idx), stripeSize, 0,
 | 
			
		||||
                      allocationHandle, 0);
 | 
			
		||||
 | 
			
		||||
    // the handle needs to be released even if the mapping failed.
 | 
			
		||||
    status2 = cuMemRelease(allocationHandle);
 | 
			
		||||
    if (status == CUDA_SUCCESS) {
 | 
			
		||||
      // cuMemRelease should not have failed here
 | 
			
		||||
      // as the handle was just allocated successfully
 | 
			
		||||
      // however return an error if it does.
 | 
			
		||||
      status = status2;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Cleanup in case of any mapping failures.
 | 
			
		||||
    if (status != CUDA_SUCCESS) {
 | 
			
		||||
      goto done;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  {
 | 
			
		||||
    // Each accessDescriptor will describe the mapping requirement for a single
 | 
			
		||||
    // device
 | 
			
		||||
    std::vector<CUmemAccessDesc> accessDescriptors;
 | 
			
		||||
    accessDescriptors.resize(mappingDevices.size());
 | 
			
		||||
 | 
			
		||||
    // Prepare the access descriptor array indicating where and how the backings
 | 
			
		||||
    // should be visible.
 | 
			
		||||
    // Get the minimum granularity needed for the accessing devices
 | 
			
		||||
    // (the max of the minimum granularity of each participating device)
 | 
			
		||||
    for (size_t idx = 0; idx < mappingDevices.size(); idx++) {
 | 
			
		||||
      // Specify which device we are adding mappings for.
 | 
			
		||||
      accessDescriptors[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
 | 
			
		||||
      accessDescriptors[idx].location.id = mappingDevices[idx];
 | 
			
		||||
        size_t granularity = 0;
 | 
			
		||||
 | 
			
		||||
      // Specify both read and write access.
 | 
			
		||||
      accessDescriptors[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
 | 
			
		||||
        // get the minnimum granularity for mappingDevices[idx]
 | 
			
		||||
        prop.location.id = mappingDevices[idx];
 | 
			
		||||
        status           = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
 | 
			
		||||
        if (status != CUDA_SUCCESS) {
 | 
			
		||||
            goto done;
 | 
			
		||||
        }
 | 
			
		||||
        if (min_granularity < granularity) {
 | 
			
		||||
            min_granularity = granularity;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Apply the access descriptors to the whole VA range.
 | 
			
		||||
    status = cuMemSetAccess(*dptr, size, &accessDescriptors[0],
 | 
			
		||||
                            accessDescriptors.size());
 | 
			
		||||
    // Round up the size such that we can evenly split it into a stripe size tha
 | 
			
		||||
    // meets the granularity requirements Essentially size = N *
 | 
			
		||||
    // residentDevices.size() * min_granularity is the requirement, since each
 | 
			
		||||
    // piece of the allocation will be stripeSize = N * min_granularity and the
 | 
			
		||||
    // min_granularity requirement applies to each stripeSize piece of the
 | 
			
		||||
    // allocation.
 | 
			
		||||
    size       = round_up(size, residentDevices.size() * min_granularity);
 | 
			
		||||
    stripeSize = size / residentDevices.size();
 | 
			
		||||
 | 
			
		||||
    // Return the rounded up size to the caller for use in the free
 | 
			
		||||
    if (allocationSize) {
 | 
			
		||||
        *allocationSize = size;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Reserve the required contiguous VA space for the allocations
 | 
			
		||||
    status = cuMemAddressReserve(dptr, size, align, 0, 0);
 | 
			
		||||
    if (status != CUDA_SUCCESS) {
 | 
			
		||||
      goto done;
 | 
			
		||||
        goto done;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Create and map the backings on each gpu
 | 
			
		||||
    // note: reusing CUmemAllocationProp prop from earlier with prop.type &
 | 
			
		||||
    // prop.location.type already specified.
 | 
			
		||||
    for (size_t idx = 0; idx < residentDevices.size(); idx++) {
 | 
			
		||||
        CUresult status2 = CUDA_SUCCESS;
 | 
			
		||||
 | 
			
		||||
        // Set the location for this chunk to this device
 | 
			
		||||
        prop.location.id = residentDevices[idx];
 | 
			
		||||
 | 
			
		||||
        // Create the allocation as a pinned allocation on this device
 | 
			
		||||
        CUmemGenericAllocationHandle allocationHandle;
 | 
			
		||||
        status = cuMemCreate(&allocationHandle, stripeSize, &prop, 0);
 | 
			
		||||
        if (status != CUDA_SUCCESS) {
 | 
			
		||||
            goto done;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // Assign the chunk to the appropriate VA range and release the handle.
 | 
			
		||||
        // After mapping the memory, it can be referenced by virtual address.
 | 
			
		||||
        // Since we do not need to make any other mappings of this memory or export
 | 
			
		||||
        // it, we no longer need and can release the allocationHandle. The
 | 
			
		||||
        // allocation will be kept live until it is unmapped.
 | 
			
		||||
        status = cuMemMap(*dptr + (stripeSize * idx), stripeSize, 0, allocationHandle, 0);
 | 
			
		||||
 | 
			
		||||
        // the handle needs to be released even if the mapping failed.
 | 
			
		||||
        status2 = cuMemRelease(allocationHandle);
 | 
			
		||||
        if (status == CUDA_SUCCESS) {
 | 
			
		||||
            // cuMemRelease should not have failed here
 | 
			
		||||
            // as the handle was just allocated successfully
 | 
			
		||||
            // however return an error if it does.
 | 
			
		||||
            status = status2;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // Cleanup in case of any mapping failures.
 | 
			
		||||
        if (status != CUDA_SUCCESS) {
 | 
			
		||||
            goto done;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    {
 | 
			
		||||
        // Each accessDescriptor will describe the mapping requirement for a single
 | 
			
		||||
        // device
 | 
			
		||||
        std::vector<CUmemAccessDesc> accessDescriptors;
 | 
			
		||||
        accessDescriptors.resize(mappingDevices.size());
 | 
			
		||||
 | 
			
		||||
        // Prepare the access descriptor array indicating where and how the backings
 | 
			
		||||
        // should be visible.
 | 
			
		||||
        for (size_t idx = 0; idx < mappingDevices.size(); idx++) {
 | 
			
		||||
            // Specify which device we are adding mappings for.
 | 
			
		||||
            accessDescriptors[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
 | 
			
		||||
            accessDescriptors[idx].location.id   = mappingDevices[idx];
 | 
			
		||||
 | 
			
		||||
            // Specify both read and write access.
 | 
			
		||||
            accessDescriptors[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // Apply the access descriptors to the whole VA range.
 | 
			
		||||
        status = cuMemSetAccess(*dptr, size, &accessDescriptors[0], accessDescriptors.size());
 | 
			
		||||
        if (status != CUDA_SUCCESS) {
 | 
			
		||||
            goto done;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
done:
 | 
			
		||||
  if (status != CUDA_SUCCESS) {
 | 
			
		||||
    if (*dptr) {
 | 
			
		||||
      simpleFreeMultiDeviceMmap(*dptr, size);
 | 
			
		||||
    if (status != CUDA_SUCCESS) {
 | 
			
		||||
        if (*dptr) {
 | 
			
		||||
            simpleFreeMultiDeviceMmap(*dptr, size);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  return status;
 | 
			
		||||
    return status;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
CUresult simpleFreeMultiDeviceMmap(CUdeviceptr dptr, size_t size) {
 | 
			
		||||
  CUresult status = CUDA_SUCCESS;
 | 
			
		||||
CUresult simpleFreeMultiDeviceMmap(CUdeviceptr dptr, size_t size)
 | 
			
		||||
{
 | 
			
		||||
    CUresult status = CUDA_SUCCESS;
 | 
			
		||||
 | 
			
		||||
  // Unmap the mapped virtual memory region
 | 
			
		||||
  // Since the handles to the mapped backing stores have already been released
 | 
			
		||||
  // by cuMemRelease, and these are the only/last mappings referencing them,
 | 
			
		||||
  // The backing stores will be freed.
 | 
			
		||||
  // Since the memory has been unmapped after this call, accessing the specified
 | 
			
		||||
  // va range will result in a fault (unitll it is remapped).
 | 
			
		||||
  status = cuMemUnmap(dptr, size);
 | 
			
		||||
  if (status != CUDA_SUCCESS) {
 | 
			
		||||
    return status;
 | 
			
		||||
  }
 | 
			
		||||
  // Free the virtual address region.  This allows the virtual address region
 | 
			
		||||
  // to be reused by future cuMemAddressReserve calls.  This also allows the
 | 
			
		||||
  // virtual address region to be used by other allocation made through
 | 
			
		||||
  // opperating system calls like malloc & mmap.
 | 
			
		||||
  status = cuMemAddressFree(dptr, size);
 | 
			
		||||
  if (status != CUDA_SUCCESS) {
 | 
			
		||||
    return status;
 | 
			
		||||
  }
 | 
			
		||||
    // Unmap the mapped virtual memory region
 | 
			
		||||
    // Since the handles to the mapped backing stores have already been released
 | 
			
		||||
    // by cuMemRelease, and these are the only/last mappings referencing them,
 | 
			
		||||
    // The backing stores will be freed.
 | 
			
		||||
    // Since the memory has been unmapped after this call, accessing the specified
 | 
			
		||||
    // va range will result in a fault (unitll it is remapped).
 | 
			
		||||
    status = cuMemUnmap(dptr, size);
 | 
			
		||||
    if (status != CUDA_SUCCESS) {
 | 
			
		||||
        return status;
 | 
			
		||||
    }
 | 
			
		||||
    // Free the virtual address region.  This allows the virtual address region
 | 
			
		||||
    // to be reused by future cuMemAddressReserve calls.  This also allows the
 | 
			
		||||
    // virtual address region to be used by other allocation made through
 | 
			
		||||
    // opperating system calls like malloc & mmap.
 | 
			
		||||
    status = cuMemAddressFree(dptr, size);
 | 
			
		||||
    if (status != CUDA_SUCCESS) {
 | 
			
		||||
        return status;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  return status;
 | 
			
		||||
    return status;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
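For orientation, here is a minimal usage sketch of the two helpers above, not part of the change itself. It assumes cuInit and an active context have already been set up; the single-device vector and the 1 MiB request are illustrative values only, and error handling is trimmed down to one check.

    // Sketch: allocate via the VMM helpers, use the pointer, then free with the
    // rounded-up size the allocator reported back.
    std::vector<CUdevice> devices = {0}; // resident and mapping devices (illustrative)
    CUdeviceptr           ptr        = 0;
    size_t                actualSize = 0;

    // Request 1 MiB. With, say, a 2 MiB minimum granularity and one resident
    // device, round_up() bumps the request to 2 MiB and actualSize reports it.
    if (simpleMallocMultiDeviceMmap(&ptr, &actualSize, 1 << 20, devices, devices, 0) == CUDA_SUCCESS) {
        // ... use ptr like any device pointer (cuMemcpyHtoD, kernel argument, ...) ...
        simpleFreeMultiDeviceMmap(ptr, actualSize); // must pass the rounded-up size back
    }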
@@ -63,10 +63,12 @@
//! handle
//!   is not needed after its mappings are set up.
////////////////////////////////////////////////////////////////////////////
CUresult simpleMallocMultiDeviceMmap(CUdeviceptr                 *dptr,
                                     size_t                      *allocationSize,
                                     size_t                       size,
                                     const std::vector<CUdevice> &residentDevices,
                                     const std::vector<CUdevice> &mappingDevices,
                                     size_t                       align = 0);

////////////////////////////////////////////////////////////////////////////
//! Frees resources allocated by simpleMallocMultiDeviceMmap
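A note on the align parameter defaulting to 0: in the implementation it is forwarded straight to cuMemAddressReserve, where an alignment of 0 asks the driver to choose a suitable default for the reserved VA range. A caller that needs a specific placement could pass an explicit power of two instead; reusing the hypothetical names from the sketch earlier:

    // Hypothetical: reserve the VA range with 4 MiB alignment instead of the default.
    simpleMallocMultiDeviceMmap(&ptr, &actualSize, 1 << 20, devices, devices, 4 << 20);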
@@ -36,11 +36,11 @@
 */

// Includes
#include <cstring>
#include <cuda.h>
#include <iostream>
#include <stdio.h>
#include <string.h>

// includes, project
#include <helper_cuda_drvapi.h>
@@ -54,115 +54,111 @@
using namespace std;

// Variables
CUdevice    cuDevice;
CUcontext   cuContext;
CUmodule    cuModule;
CUfunction  vecAdd_kernel;
float      *h_A;
float      *h_B;
float      *h_C;
CUdeviceptr d_A;
CUdeviceptr d_B;
CUdeviceptr d_C;
size_t      allocationSize = 0;

// Functions
int  CleanupNoFailure();
void RandomInit(float *, int);

// define input fatbin file
#ifndef FATBIN_FILE
#define FATBIN_FILE "vectorAdd_kernel64.fatbin"
#endif

// collect all of the devices whose memory can be mapped from cuDevice.
vector<CUdevice> getBackingDevices(CUdevice cuDevice)
{
    int num_devices;

    checkCudaErrors(cuDeviceGetCount(&num_devices));

    vector<CUdevice> backingDevices;
    backingDevices.push_back(cuDevice);
    for (int dev = 0; dev < num_devices; dev++) {
        int capable      = 0;
        int attributeVal = 0;

        // The mapping device is already in the backingDevices vector
        if (dev == cuDevice) {
            continue;
        }

        // Only peer capable devices can map each other's memory
        checkCudaErrors(cuDeviceCanAccessPeer(&capable, cuDevice, dev));
        if (!capable) {
            continue;
        }

        // The device needs to support virtual address management for the required
        // APIs to work
        checkCudaErrors(
            cuDeviceGetAttribute(&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, cuDevice));
        if (attributeVal == 0) {
            continue;
        }

        backingDevices.push_back(dev);
    }
    return backingDevices;
}
// Host code
int main(int argc, char **argv)
{
    printf("Vector Addition (Driver API)\n");
    int    N            = 50000;
    size_t size         = N * sizeof(float);
    int    attributeVal = 0;

    // Initialize
    checkCudaErrors(cuInit(0));

    cuDevice = findCudaDeviceDRV(argc, (const char **)argv);

    // Check that the selected device supports virtual address management
    checkCudaErrors(
        cuDeviceGetAttribute(&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, cuDevice));
    printf("Device %d VIRTUAL ADDRESS MANAGEMENT SUPPORTED = %d.\n", cuDevice, attributeVal);
    if (attributeVal == 0) {
        printf("Device %d doesn't support VIRTUAL ADDRESS MANAGEMENT.\n", cuDevice);
        exit(EXIT_WAIVED);
    }

    // The vector addition happens on cuDevice, so the allocations need to be
    // mapped there.
    vector<CUdevice> mappingDevices;
    mappingDevices.push_back(cuDevice);

    // Collect devices accessible by the mapping device (cuDevice) into the
    // backingDevices vector.
    vector<CUdevice> backingDevices = getBackingDevices(cuDevice);

    // Create context
    checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));

    // first search for the module path before we load the results
    string module_path;

    std::ostringstream fatbin;

    if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
        exit(EXIT_FAILURE);
    }
    else {
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }

    if (!fatbin.str().size()) {
        printf("fatbin file empty. exiting..\n");
        exit(EXIT_FAILURE);
    }
@@ -204,13 +200,10 @@ int main(int argc, char **argv)
    int threadsPerBlock = 256;
    int blocksPerGrid   = (N + threadsPerBlock - 1) / threadsPerBlock;

    void *args[] = {&d_A, &d_B, &d_C, &N};

    // Launch the CUDA kernel
    checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, args, NULL));

    // Copy result from device memory to host memory
    // h_C contains the result in host memory
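A quick worked example of the launch math above, using the sample's own values:

    // blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock  (ceiling of N/256)
    //   N = 50000, threadsPerBlock = 256
    //   blocksPerGrid = (50000 + 255) / 256 = 196
    //   196 * 256 = 50176 threads launched; the final 176 fail the kernel's
    //   i < N guard and simply return, so the overshoot is harmless.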
@@ -219,20 +212,18 @@ int main(int argc, char **argv)
    // Verify result
    int i;

    for (i = 0; i < N; ++i) {
        float sum = h_A[i] + h_B[i];

        if (fabs(h_C[i] - sum) > 1e-7f) {
            break;
        }
    }

    CleanupNoFailure();
    printf("%s\n", (i == N) ? "Result = PASS" : "Result = FAIL");

    exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE);
}

int CleanupNoFailure()
@@ -243,18 +234,15 @@ int CleanupNoFailure()
    checkCudaErrors(simpleFreeMultiDeviceMmap(d_C, allocationSize));

    // Free host memory
    if (h_A) {
        free(h_A);
    }

    if (h_B) {
        free(h_B);
    }

    if (h_C) {
        free(h_C);
    }
@@ -265,8 +253,7 @@ int CleanupNoFailure()
// Allocates an array with random float entries.
void RandomInit(float *data, int n)
{
    for (int i = 0; i < n; ++i) {
        data[i] = rand() / (float)RAND_MAX;
    }
}
@@ -34,9 +34,10 @@
 */

// Device code
extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, float *C, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < N)
        C[i] = A[i] + B[i];
}
@@ -33,8 +33,8 @@
 * of the programming guide with some additions like error checking.
 */

#include <cmath>
#include <stdio.h>

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda.h>
@@ -42,112 +42,116 @@

// helper functions and utilities to work with CUDA
#include <helper_functions.h>

#include <nvrtc_helper.h>

/**
 * Host main routine
 */
int main(int argc, char **argv)
{
    char  *cubin, *kernel_file;
    size_t cubinSize;
    kernel_file = sdkFindFilePath("vectorAdd_kernel.cu", argv[0]);
    compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);
    CUmodule module = loadCUBIN(cubin, argc, argv);

    CUfunction kernel_addr;
    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "vectorAdd"));

    // Print the vector length to be used, and compute its size
    int    numElements = 50000;
    size_t size        = numElements * sizeof(float);
    printf("[Vector addition of %d elements]\n", numElements);

    // Allocate the host input vector A
    float *h_A = reinterpret_cast<float *>(malloc(size));

    // Allocate the host input vector B
    float *h_B = reinterpret_cast<float *>(malloc(size));

    // Allocate the host output vector C
    float *h_C = reinterpret_cast<float *>(malloc(size));

    // Verify that allocations succeeded
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the host input vectors
    for (int i = 0; i < numElements; ++i) {
        h_A[i] = rand() / static_cast<float>(RAND_MAX);
        h_B[i] = rand() / static_cast<float>(RAND_MAX);
    }

    // Allocate the device input vector A
    CUdeviceptr d_A;
    checkCudaErrors(cuMemAlloc(&d_A, size));

    // Allocate the device input vector B
    CUdeviceptr d_B;
    checkCudaErrors(cuMemAlloc(&d_B, size));

    // Allocate the device output vector C
    CUdeviceptr d_C;
    checkCudaErrors(cuMemAlloc(&d_C, size));

    // Copy the host input vectors A and B in host memory to the device input
    // vectors in device memory
    printf("Copy input data from the host memory to the CUDA device\n");
    checkCudaErrors(cuMemcpyHtoD(d_A, h_A, size));
    checkCudaErrors(cuMemcpyHtoD(d_B, h_B, size));

    // Launch the Vector Add CUDA Kernel
    int threadsPerBlock = 256;
    int blocksPerGrid   = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    dim3 cudaBlockSize(threadsPerBlock, 1, 1);
    dim3 cudaGridSize(blocksPerGrid, 1, 1);

    void *arr[] = {reinterpret_cast<void *>(&d_A),
                   reinterpret_cast<void *>(&d_B),
                   reinterpret_cast<void *>(&d_C),
                   reinterpret_cast<void *>(&numElements)};
    checkCudaErrors(cuLaunchKernel(kernel_addr,
                                   cudaGridSize.x,
                                   cudaGridSize.y,
                                   cudaGridSize.z, /* grid dim */
                                   cudaBlockSize.x,
                                   cudaBlockSize.y,
                                   cudaBlockSize.z, /* block dim */
                                   0,
                                   0,       /* shared mem, stream */
                                   &arr[0], /* arguments */
                                   0));
    checkCudaErrors(cuCtxSynchronize());

    // Copy the device result vector in device memory to the host result vector
    // in host memory.
    printf("Copy output data from the CUDA device to the host memory\n");
    checkCudaErrors(cuMemcpyDtoH(h_C, d_C, size));

    // Verify that the result vector is correct
    for (int i = 0; i < numElements; ++i) {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    printf("Test PASSED\n");

    // Free device global memory
    checkCudaErrors(cuMemFree(d_A));
    checkCudaErrors(cuMemFree(d_B));
    checkCudaErrors(cuMemFree(d_C));

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);

    printf("Done\n");

    return 0;
}
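One detail of the launch above that is easy to miss: cuLaunchKernel receives kernel parameters as an array of pointers to each argument's storage, not the argument values themselves, which is why arr collects addresses. A condensed, hypothetical sketch of the same call shape, assuming kernel_addr takes the same four parameters and the device buffers are already allocated:

    // Each entry points at an argument; the driver reads the right number of
    // bytes per parameter based on the kernel's signature.
    CUdeviceptr a, b, c; // assumed already allocated with cuMemAlloc
    int         n = 50000;
    void       *params[] = {&a, &b, &c, &n};
    // 196 blocks of 256 threads, no shared memory, default stream, no extras.
    checkCudaErrors(cuLaunchKernel(kernel_addr, 196, 1, 1, 256, 1, 1, 0, NULL, params, NULL));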
@@ -32,11 +32,11 @@
 * number of elements numElements.
 */

extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements) {
        C[i] = A[i] + B[i];
    }
}

(File diff suppressed because it is too large)

@@ -32,12 +32,11 @@
#include <cuda_runtime.h>
#include <helper_cuda.h>

#include <iostream>
#include <memory>
#include <string>

int   *pArgc = NULL;
char **pArgv = NULL;

#if CUDART_VERSION < 5000
@@ -46,19 +45,16 @@ char **pArgv = NULL;
#include <cuda.h>

// This function wraps the CUDA Driver API into a template function
template <class T> inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
    CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);

    if (CUDA_SUCCESS != error) {
        fprintf(
            stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n", error, __FILE__, __LINE__);

        exit(EXIT_FAILURE);
    }
}

#endif /* CUDART_VERSION < 5000 */
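For reference, the template above deduces T from the output pointer; a minimal call sketch (device 0 and the memory-clock attribute are illustrative, mirroring how the fallback path below uses it):

    int memoryClockKHz = 0;
    getCudaAttribute<int>(&memoryClockKHz, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, 0);
    printf("Memory clock: %.0f MHz\n", memoryClockKHz * 1e-3f);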
@@ -66,278 +62,259 @@ inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute,
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  pArgc = &argc;
  pArgv = argv;

  printf("%s Starting...\n\n", argv[0]);
  printf(
      " CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");

  int deviceCount = 0;
  cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

  if (error_id != cudaSuccess) {
    printf("cudaGetDeviceCount returned %d\n-> %s\n",
           static_cast<int>(error_id), cudaGetErrorString(error_id));
    printf("Result = FAIL\n");
    exit(EXIT_FAILURE);
  }

  // This function call returns 0 if there are no CUDA capable devices.
  if (deviceCount == 0) {
    printf("There are no available device(s) that support CUDA\n");
  } else {
    printf("Detected %d CUDA Capable device(s)\n", deviceCount);
  }

  int dev, driverVersion = 0, runtimeVersion = 0;

  for (dev = 0; dev < deviceCount; ++dev) {
    cudaSetDevice(dev);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);

    printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

    // Console log
    cudaDriverGetVersion(&driverVersion);
    cudaRuntimeGetVersion(&runtimeVersion);
    printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n",
           driverVersion / 1000, (driverVersion % 100) / 10,
           runtimeVersion / 1000, (runtimeVersion % 100) / 10);
    printf("  CUDA Capability Major/Minor version number:    %d.%d\n",
           deviceProp.major, deviceProp.minor);

    char msg[256];
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(msg, sizeof(msg),
              "  Total amount of global memory:                 %.0f MBytes "
              "(%llu bytes)\n",
              static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f),
              (unsigned long long)deviceProp.totalGlobalMem);
#else
    snprintf(msg, sizeof(msg),
             "  Total amount of global memory:                 %.0f MBytes "
             "(%llu bytes)\n",
             static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f),
             (unsigned long long)deviceProp.totalGlobalMem);
#endif
    printf("%s", msg);

    printf("  (%03d) Multiprocessors, (%03d) CUDA Cores/MP:    %d CUDA Cores\n",
           deviceProp.multiProcessorCount,
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
               deviceProp.multiProcessorCount);
    printf(
        "  GPU Max Clock rate:                            %.0f MHz (%0.2f "
        "GHz)\n",
        deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);

#if CUDART_VERSION >= 5000
    // This is supported in CUDA 5.0 (runtime API device properties)
    printf("  Memory Clock rate:                             %.0f Mhz\n",
           deviceProp.memoryClockRate * 1e-3f);
    printf("  Memory Bus Width:                              %d-bit\n",
           deviceProp.memoryBusWidth);

    if (deviceProp.l2CacheSize) {
      printf("  L2 Cache Size:                                 %d bytes\n",
             deviceProp.l2CacheSize);
    }

#else
    // This is only available in CUDA 4.0-4.2 (but these were only exposed in the
    // CUDA Driver API)
    int memoryClock;
    getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
                          dev);
    printf("  Memory Clock rate:                             %.0f Mhz\n",
           memoryClock * 1e-3f);
    int memBusWidth;
    getCudaAttribute<int>(&memBusWidth,
                          CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
    printf("  Memory Bus Width:                              %d-bit\n",
           memBusWidth);
    int L2CacheSize;
    getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);

    if (L2CacheSize) {
      printf("  L2 Cache Size:                                 %d bytes\n",
             L2CacheSize);
    }

#endif

    printf(
        "  Maximum Texture Dimension Size (x,y,z)         1D=(%d), 2D=(%d, "
        "%d), 3D=(%d, %d, %d)\n",
        deviceProp.maxTexture1D, deviceProp.maxTexture2D[0],
        deviceProp.maxTexture2D[1], deviceProp.maxTexture3D[0],
        deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
    printf(
        "  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",
        deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
    printf(
        "  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d "
        "layers\n",
        deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1],
        deviceProp.maxTexture2DLayered[2]);

    printf("  Total amount of constant memory:               %zu bytes\n",
           deviceProp.totalConstMem);
    printf("  Total amount of shared memory per block:       %zu bytes\n",
           deviceProp.sharedMemPerBlock);
    printf("  Total shared memory per multiprocessor:        %zu bytes\n",
           deviceProp.sharedMemPerMultiprocessor);
    printf("  Total number of registers available per block: %d\n",
           deviceProp.regsPerBlock);
    printf("  Warp size:                                     %d\n",
           deviceProp.warpSize);
    printf("  Maximum number of threads per multiprocessor:  %d\n",
           deviceProp.maxThreadsPerMultiProcessor);
    printf("  Maximum number of threads per block:           %d\n",
           deviceProp.maxThreadsPerBlock);
    printf("  Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
           deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1],
           deviceProp.maxThreadsDim[2]);
    printf("  Max dimension size of a grid size    (x,y,z): (%d, %d, %d)\n",
           deviceProp.maxGridSize[0], deviceProp.maxGridSize[1],
           deviceProp.maxGridSize[2]);
    printf("  Maximum memory pitch:                          %zu bytes\n",
           deviceProp.memPitch);
    printf("  Texture alignment:                             %zu bytes\n",
           deviceProp.textureAlignment);
    printf(
        "  Concurrent copy and kernel execution:          %s with %d copy "
        "engine(s)\n",
        (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
    printf("  Run time limit on kernels:                     %s\n",
           deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
    printf("  Integrated GPU sharing Host Memory:            %s\n",
           deviceProp.integrated ? "Yes" : "No");
    printf("  Support host page-locked memory mapping:       %s\n",
           deviceProp.canMapHostMemory ? "Yes" : "No");
    printf("  Alignment requirement for Surfaces:            %s\n",
           deviceProp.surfaceAlignment ? "Yes" : "No");
    printf("  Device has ECC support:                        %s\n",
           deviceProp.ECCEnabled ? "Enabled" : "Disabled");
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    printf("  CUDA Device Driver Mode (TCC or WDDM):         %s\n",
           deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)"
                                : "WDDM (Windows Display Driver Model)");
#endif
    printf("  Device supports Unified Addressing (UVA):      %s\n",
           deviceProp.unifiedAddressing ? "Yes" : "No");
    printf("  Device supports Managed Memory:                %s\n",
           deviceProp.managedMemory ? "Yes" : "No");
    printf("  Device supports Compute Preemption:            %s\n",
           deviceProp.computePreemptionSupported ? "Yes" : "No");
    printf("  Supports Cooperative Kernel Launch:            %s\n",
           deviceProp.cooperativeLaunch ? "Yes" : "No");
    printf("  Supports MultiDevice Co-op Kernel Launch:      %s\n",
           deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No");
    printf("  Device PCI Domain ID / Bus ID / location ID:   %d / %d / %d\n",
           deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);

    const char *sComputeMode[] = {
        "Default (multiple host threads can use ::cudaSetDevice() with device "
        "simultaneously)",
        "Exclusive (only one host thread in one process is able to use "
        "::cudaSetDevice() with this device)",
        "Prohibited (no host thread can use ::cudaSetDevice() with this "
        "device)",
        "Exclusive Process (many threads in one process is able to use "
        "::cudaSetDevice() with this device)",
        "Unknown", NULL};
    printf("  Compute Mode:\n");
    printf("     < %s >\n", sComputeMode[deviceProp.computeMode]);
  }

  // If there are 2 or more GPUs, query to determine whether RDMA is supported
  if (deviceCount >= 2) {
    cudaDeviceProp prop[64];
    int gpuid[64];  // we want to find the first two GPUs that can support P2P
    int gpu_p2p_count = 0;

    for (int i = 0; i < deviceCount; i++) {
      checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
 | 
			
		||||
 | 
			
		||||
      // Only boards based on Fermi or later can support P2P
 | 
			
		||||
      if ((prop[i].major >= 2)
 | 
			
		||||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 | 
			
		||||
          // on Windows (64-bit), the Tesla Compute Cluster driver for windows
 | 
			
		||||
          // must be enabled to support this
 | 
			
		||||
          && prop[i].tccDriver
 | 
			
		||||
#endif
 | 
			
		||||
          ) {
 | 
			
		||||
        // This is an array of P2P capable GPUs
 | 
			
		||||
        gpuid[gpu_p2p_count++] = i;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Show all the combinations of support P2P GPUs
 | 
			
		||||
    int can_access_peer;
 | 
			
		||||
 | 
			
		||||
    if (gpu_p2p_count >= 2) {
 | 
			
		||||
      for (int i = 0; i < gpu_p2p_count; i++) {
 | 
			
		||||
        for (int j = 0; j < gpu_p2p_count; j++) {
 | 
			
		||||
          if (gpuid[i] == gpuid[j]) {
 | 
			
		||||
            continue;
 | 
			
		||||
          }
 | 
			
		||||
          checkCudaErrors(
 | 
			
		||||
              cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
 | 
			
		||||
          printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
 | 
			
		||||
                 prop[gpuid[i]].name, gpuid[i], prop[gpuid[j]].name, gpuid[j],
 | 
			
		||||
                 can_access_peer ? "Yes" : "No");
 | 
			
		||||
        if (deviceProp.l2CacheSize) {
 | 
			
		||||
            printf("  L2 Cache Size:                                 %d bytes\n", deviceProp.l2CacheSize);
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
#else
 | 
			
		||||
        // This only available in CUDA 4.0-4.2 (but these were only exposed in the
 | 
			
		||||
        // CUDA Driver API)
 | 
			
		||||
        int memoryClock;
 | 
			
		||||
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
 | 
			
		||||
        printf("  Memory Clock rate:                             %.0f Mhz\n", memoryClock * 1e-3f);
 | 
			
		||||
        int memBusWidth;
 | 
			
		||||
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
 | 
			
		||||
        printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);
 | 
			
		||||
        int L2CacheSize;
 | 
			
		||||
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
 | 
			
		||||
 | 
			
		||||
        if (L2CacheSize) {
 | 
			
		||||
            printf("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
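
        // For context: getCudaAttribute<T> (defined in this sample / the samples'
        // helper headers) is essentially a checked, typed wrapper around the
        // driver API. A minimal sketch of the idea (illustrative, not necessarily
        // the exact implementation used here):
        //
        //   template <class T>
        //   inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
        //   {
        //       checkCudaErrors(cuDeviceGetAttribute(attribute, device_attribute, device));
        //   }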
        printf("  Maximum Texture Dimension Size (x,y,z)         1D=(%d), 2D=(%d, "
 | 
			
		||||
               "%d), 3D=(%d, %d, %d)\n",
 | 
			
		||||
               deviceProp.maxTexture1D,
 | 
			
		||||
               deviceProp.maxTexture2D[0],
 | 
			
		||||
               deviceProp.maxTexture2D[1],
 | 
			
		||||
               deviceProp.maxTexture3D[0],
 | 
			
		||||
               deviceProp.maxTexture3D[1],
 | 
			
		||||
               deviceProp.maxTexture3D[2]);
 | 
			
		||||
        printf("  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",
 | 
			
		||||
               deviceProp.maxTexture1DLayered[0],
 | 
			
		||||
               deviceProp.maxTexture1DLayered[1]);
 | 
			
		||||
        printf("  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d "
 | 
			
		||||
               "layers\n",
 | 
			
		||||
               deviceProp.maxTexture2DLayered[0],
 | 
			
		||||
               deviceProp.maxTexture2DLayered[1],
 | 
			
		||||
               deviceProp.maxTexture2DLayered[2]);
 | 
			
		||||
 | 
			
		||||
        printf("  Total amount of constant memory:               %zu bytes\n", deviceProp.totalConstMem);
 | 
			
		||||
        printf("  Total amount of shared memory per block:       %zu bytes\n", deviceProp.sharedMemPerBlock);
 | 
			
		||||
        printf("  Total shared memory per multiprocessor:        %zu bytes\n", deviceProp.sharedMemPerMultiprocessor);
 | 
			
		||||
        printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
 | 
			
		||||
        printf("  Warp size:                                     %d\n", deviceProp.warpSize);
 | 
			
		||||
        printf("  Maximum number of threads per multiprocessor:  %d\n", deviceProp.maxThreadsPerMultiProcessor);
 | 
			
		||||
        printf("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
 | 
			
		||||
        printf("  Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
 | 
			
		||||
               deviceProp.maxThreadsDim[0],
 | 
			
		||||
               deviceProp.maxThreadsDim[1],
 | 
			
		||||
               deviceProp.maxThreadsDim[2]);
 | 
			
		||||
        printf("  Max dimension size of a grid size    (x,y,z): (%d, %d, %d)\n",
 | 
			
		||||
               deviceProp.maxGridSize[0],
 | 
			
		||||
               deviceProp.maxGridSize[1],
 | 
			
		||||
               deviceProp.maxGridSize[2]);
 | 
			
		||||
        printf("  Maximum memory pitch:                          %zu bytes\n", deviceProp.memPitch);
 | 
			
		||||
        printf("  Texture alignment:                             %zu bytes\n", deviceProp.textureAlignment);
 | 
			
		||||
        printf("  Concurrent copy and kernel execution:          %s with %d copy "
 | 
			
		||||
               "engine(s)\n",
 | 
			
		||||
               (deviceProp.deviceOverlap ? "Yes" : "No"),
 | 
			
		||||
               deviceProp.asyncEngineCount);
 | 
			
		||||
        printf("  Run time limit on kernels:                     %s\n",
 | 
			
		||||
               deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
 | 
			
		||||
        printf("  Integrated GPU sharing Host Memory:            %s\n", deviceProp.integrated ? "Yes" : "No");
 | 
			
		||||
        printf("  Support host page-locked memory mapping:       %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
 | 
			
		||||
        printf("  Alignment requirement for Surfaces:            %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
 | 
			
		||||
        printf("  Device has ECC support:                        %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
 | 
			
		||||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 | 
			
		||||
        printf("  CUDA Device Driver Mode (TCC or WDDM):         %s\n",
 | 
			
		||||
               deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
 | 
			
		||||
#endif
 | 
			
		||||
        printf("  Device supports Unified Addressing (UVA):      %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
 | 
			
		||||
        printf("  Device supports Managed Memory:                %s\n", deviceProp.managedMemory ? "Yes" : "No");
 | 
			
		||||
        printf("  Device supports Compute Preemption:            %s\n",
 | 
			
		||||
               deviceProp.computePreemptionSupported ? "Yes" : "No");
 | 
			
		||||
        printf("  Supports Cooperative Kernel Launch:            %s\n", deviceProp.cooperativeLaunch ? "Yes" : "No");
 | 
			
		||||
        printf("  Supports MultiDevice Co-op Kernel Launch:      %s\n",
 | 
			
		||||
               deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No");
 | 
			
		||||
        printf("  Device PCI Domain ID / Bus ID / location ID:   %d / %d / %d\n",
 | 
			
		||||
               deviceProp.pciDomainID,
 | 
			
		||||
               deviceProp.pciBusID,
 | 
			
		||||
               deviceProp.pciDeviceID);
 | 
			
		||||
 | 
			
		||||
        const char *sComputeMode[] = {"Default (multiple host threads can use ::cudaSetDevice() with device "
 | 
			
		||||
                                      "simultaneously)",
 | 
			
		||||
                                      "Exclusive (only one host thread in one process is able to use "
 | 
			
		||||
                                      "::cudaSetDevice() with this device)",
 | 
			
		||||
                                      "Prohibited (no host thread can use ::cudaSetDevice() with this "
 | 
			
		||||
                                      "device)",
 | 
			
		||||
                                      "Exclusive Process (many threads in one process is able to use "
 | 
			
		||||
                                      "::cudaSetDevice() with this device)",
 | 
			
		||||
                                      "Unknown",
 | 
			
		||||
                                      NULL};
 | 
			
		||||
        printf("  Compute Mode:\n");
 | 
			
		||||
        printf("     < %s >\n", sComputeMode[deviceProp.computeMode]);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // csv masterlog info
 | 
			
		||||
  // *****************************
 | 
			
		||||
  // exe and CUDA driver name
 | 
			
		||||
  printf("\n");
 | 
			
		||||
  std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
 | 
			
		||||
  char cTemp[16];
 | 
			
		||||
    // If there are 2 or more GPUs, query to determine whether RDMA is supported
 | 
			
		||||
    if (deviceCount >= 2) {
 | 
			
		||||
        cudaDeviceProp prop[64];
 | 
			
		||||
        int            gpuid[64]; // we want to find the first two GPUs that can support P2P
 | 
			
		||||
        int            gpu_p2p_count = 0;
 | 
			
		||||
 | 
			
		||||
  // driver version
 | 
			
		||||
  sProfileString += ", CUDA Driver Version = ";
 | 
			
		||||
        for (int i = 0; i < deviceCount; i++) {
 | 
			
		||||
            checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
 | 
			
		||||
 | 
			
		||||
            // Only boards based on Fermi or later can support P2P
 | 
			
		||||
            if ((prop[i].major >= 2)
 | 
			
		||||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 | 
			
		||||
  sprintf_s(cTemp, 10, "%d.%d", driverVersion / 1000,
 | 
			
		||||
            (driverVersion % 100) / 10);
 | 
			
		||||
#else
 | 
			
		||||
  snprintf(cTemp, sizeof(cTemp), "%d.%d", driverVersion / 1000,
 | 
			
		||||
           (driverVersion % 100) / 10);
 | 
			
		||||
                // on Windows (64-bit), the Tesla Compute Cluster driver for windows
 | 
			
		||||
                // must be enabled to support this
 | 
			
		||||
                && prop[i].tccDriver
 | 
			
		||||
#endif
 | 
			
		||||
  sProfileString += cTemp;
 | 
			
		||||
            ) {
 | 
			
		||||
                // This is an array of P2P capable GPUs
 | 
			
		||||
                gpuid[gpu_p2p_count++] = i;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
  // Runtime version
 | 
			
		||||
  sProfileString += ", CUDA Runtime Version = ";
 | 
			
		||||
        // Show all the combinations of support P2P GPUs
 | 
			
		||||
        int can_access_peer;
 | 
			
		||||
 | 
			
		||||
        if (gpu_p2p_count >= 2) {
 | 
			
		||||
            for (int i = 0; i < gpu_p2p_count; i++) {
 | 
			
		||||
                for (int j = 0; j < gpu_p2p_count; j++) {
 | 
			
		||||
                    if (gpuid[i] == gpuid[j]) {
 | 
			
		||||
                        continue;
 | 
			
		||||
                    }
 | 
			
		||||
                    checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
 | 
			
		||||
                    printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
 | 
			
		||||
                           prop[gpuid[i]].name,
 | 
			
		||||
                           gpuid[i],
 | 
			
		||||
                           prop[gpuid[j]].name,
 | 
			
		||||
                           gpuid[j],
 | 
			
		||||
                           can_access_peer ? "Yes" : "No");
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
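
    // Hedged usage sketch (not part of the original sample): a "Yes" from
    // cudaDeviceCanAccessPeer() only means mapping is possible; peer access still
    // has to be enabled per direction before one GPU can dereference the other's
    // memory. Assuming gpuid[0] and gpuid[1] passed the check above:
    //
    //   checkCudaErrors(cudaSetDevice(gpuid[0]));
    //   checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[1], 0)); // flags must be 0
    //   checkCudaErrors(cudaSetDevice(gpuid[1]));
    //   checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[0], 0));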

    // csv masterlog info
    // *****************************
    // exe and CUDA driver name
    printf("\n");
    std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
    char        cTemp[16];

    // driver version
    sProfileString += ", CUDA Driver Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d.%d", driverVersion / 1000, (driverVersion % 100) / 10);
#else
    snprintf(cTemp, sizeof(cTemp), "%d.%d", driverVersion / 1000, (driverVersion % 100) / 10);
#endif
    sProfileString += cTemp;
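
    // Worked example of the decoding above: CUDA encodes these versions as
    // major * 1000 + minor * 10, so driverVersion == 12050 yields
    // 12050 / 1000 = 12 and (12050 % 100) / 10 = 5, i.e. "12.5".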

    // Runtime version
    sProfileString += ", CUDA Runtime Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d.%d", runtimeVersion / 1000, (runtimeVersion % 100) / 10);
#else
    snprintf(cTemp, sizeof(cTemp), "%d.%d", runtimeVersion / 1000, (runtimeVersion % 100) / 10);
#endif
    sProfileString += cTemp;

    // Device count
    sProfileString += ", NumDevs = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d", deviceCount);
#else
    snprintf(cTemp, sizeof(cTemp), "%d", deviceCount);
#endif
    sProfileString += cTemp;
    sProfileString += "\n";
    printf("%s", sProfileString.c_str());

    printf("Result = PASS\n");

    // finish
    exit(EXIT_SUCCESS);
}

@@ -30,358 +30,295 @@
 */

// includes, system
#include <cuda.h>
#include <helper_cuda_drvapi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

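// Build note (an illustrative sketch, not part of the sample or its build files):
// because this file calls the CUDA Driver API directly, it must link against the
// driver library (cuda.lib on Windows, libcuda on Linux), e.g. something like:
//
//   nvcc -I<path-to-samples-Common-headers> -o deviceQueryDrv deviceQueryDrv.cpp -lcuda
//
// where the include path for the helper headers is an assumption about your
// checkout layout.
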
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    CUdevice dev;
    int      major = 0, minor = 0;
    int      deviceCount = 0;
    char     deviceName[256];

  printf("%s Starting...\n\n", argv[0]);
 | 
			
		||||
    printf("%s Starting...\n\n", argv[0]);
 | 
			
		||||
 | 
			
		||||
  // note your project will need to link with cuda.lib files on windows
 | 
			
		||||
  printf("CUDA Device Query (Driver API) statically linked version \n");
 | 
			
		||||
    // note your project will need to link with cuda.lib files on windows
 | 
			
		||||
    printf("CUDA Device Query (Driver API) statically linked version \n");
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cuInit(0));
 | 
			
		||||
    checkCudaErrors(cuInit(0));
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cuDeviceGetCount(&deviceCount));
 | 
			
		||||
    checkCudaErrors(cuDeviceGetCount(&deviceCount));
 | 
			
		||||
 | 
			
		||||
  // This function call returns 0 if there are no CUDA capable devices.
 | 
			
		||||
  if (deviceCount == 0) {
 | 
			
		||||
    printf("There are no available device(s) that support CUDA\n");
 | 
			
		||||
  } else {
 | 
			
		||||
    printf("Detected %d CUDA Capable device(s)\n", deviceCount);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  for (dev = 0; dev < deviceCount; ++dev) {
 | 
			
		||||
    checkCudaErrors(cuDeviceGetAttribute(
 | 
			
		||||
        &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev));
 | 
			
		||||
    checkCudaErrors(cuDeviceGetAttribute(
 | 
			
		||||
        &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev));
 | 
			
		||||
 | 
			
		||||
    checkCudaErrors(cuDeviceGetName(deviceName, 256, dev));
 | 
			
		||||
 | 
			
		||||
    printf("\nDevice %d: \"%s\"\n", dev, deviceName);
 | 
			
		||||
 | 
			
		||||
    int driverVersion = 0;
 | 
			
		||||
    checkCudaErrors(cuDriverGetVersion(&driverVersion));
 | 
			
		||||
    printf("  CUDA Driver Version:                           %d.%d\n",
 | 
			
		||||
           driverVersion / 1000, (driverVersion % 100) / 10);
 | 
			
		||||
    printf("  CUDA Capability Major/Minor version number:    %d.%d\n", major,
 | 
			
		||||
           minor);
 | 
			
		||||
 | 
			
		||||
    size_t totalGlobalMem;
 | 
			
		||||
    checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, dev));
 | 
			
		||||
 | 
			
		||||
    char msg[256];
 | 
			
		||||
    SPRINTF(msg,
 | 
			
		||||
            "  Total amount of global memory:                 %.0f MBytes "
 | 
			
		||||
            "(%llu bytes)\n",
 | 
			
		||||
            (float)totalGlobalMem / 1048576.0f,
 | 
			
		||||
            (unsigned long long)totalGlobalMem);
 | 
			
		||||
    printf("%s", msg);
 | 
			
		||||
 | 
			
		||||
    int multiProcessorCount;
 | 
			
		||||
    getCudaAttribute<int>(&multiProcessorCount,
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
 | 
			
		||||
 | 
			
		||||
    printf("  (%2d) Multiprocessors, (%3d) CUDA Cores/MP:     %d CUDA Cores\n",
 | 
			
		||||
           multiProcessorCount, _ConvertSMVer2CoresDRV(major, minor),
 | 
			
		||||
           _ConvertSMVer2CoresDRV(major, minor) * multiProcessorCount);
 | 
			
		||||
 | 
			
		||||
    int clockRate;
 | 
			
		||||
    getCudaAttribute<int>(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
 | 
			
		||||
    printf(
 | 
			
		||||
        "  GPU Max Clock rate:                            %.0f MHz (%0.2f "
 | 
			
		||||
        "GHz)\n",
 | 
			
		||||
        clockRate * 1e-3f, clockRate * 1e-6f);
 | 
			
		||||
    int memoryClock;
 | 
			
		||||
    getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
 | 
			
		||||
                          dev);
 | 
			
		||||
    printf("  Memory Clock rate:                             %.0f Mhz\n",
 | 
			
		||||
           memoryClock * 1e-3f);
 | 
			
		||||
    int memBusWidth;
 | 
			
		||||
    getCudaAttribute<int>(&memBusWidth,
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
 | 
			
		||||
    printf("  Memory Bus Width:                              %d-bit\n",
 | 
			
		||||
           memBusWidth);
 | 
			
		||||
    int L2CacheSize;
 | 
			
		||||
    getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
 | 
			
		||||
 | 
			
		||||
    if (L2CacheSize) {
 | 
			
		||||
      printf("  L2 Cache Size:                                 %d bytes\n",
 | 
			
		||||
             L2CacheSize);
 | 
			
		||||
    // This function call returns 0 if there are no CUDA capable devices.
 | 
			
		||||
    if (deviceCount == 0) {
 | 
			
		||||
        printf("There are no available device(s) that support CUDA\n");
 | 
			
		||||
    }
 | 
			
		||||
    else {
 | 
			
		||||
        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    int maxTex1D, maxTex2D[2], maxTex3D[3];
 | 
			
		||||
    getCudaAttribute<int>(&maxTex1D,
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, dev);
 | 
			
		||||
    getCudaAttribute<int>(&maxTex2D[0],
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, dev);
 | 
			
		||||
    getCudaAttribute<int>(&maxTex2D[1],
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, dev);
 | 
			
		||||
    getCudaAttribute<int>(&maxTex3D[0],
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, dev);
 | 
			
		||||
    getCudaAttribute<int>(&maxTex3D[1],
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, dev);
 | 
			
		||||
    getCudaAttribute<int>(&maxTex3D[2],
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, dev);
 | 
			
		||||
    printf(
 | 
			
		||||
        "  Max Texture Dimension Sizes                    1D=(%d) 2D=(%d, %d) "
 | 
			
		||||
        "3D=(%d, %d, %d)\n",
 | 
			
		||||
        maxTex1D, maxTex2D[0], maxTex2D[1], maxTex3D[0], maxTex3D[1],
 | 
			
		||||
        maxTex3D[2]);
 | 
			
		||||
    for (dev = 0; dev < deviceCount; ++dev) {
 | 
			
		||||
        checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev));
 | 
			
		||||
        checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev));
 | 
			
		||||
 | 
			
		||||
    int maxTex1DLayered[2];
 | 
			
		||||
    getCudaAttribute<int>(&maxTex1DLayered[0],
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH,
 | 
			
		||||
                          dev);
 | 
			
		||||
    getCudaAttribute<int>(&maxTex1DLayered[1],
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS,
 | 
			
		||||
                          dev);
 | 
			
		||||
    printf(
 | 
			
		||||
        "  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",
 | 
			
		||||
        maxTex1DLayered[0], maxTex1DLayered[1]);
 | 
			
		||||
        checkCudaErrors(cuDeviceGetName(deviceName, 256, dev));
 | 
			
		||||
 | 
			
		||||
    int maxTex2DLayered[3];
 | 
			
		||||
    getCudaAttribute<int>(&maxTex2DLayered[0],
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH,
 | 
			
		||||
                          dev);
 | 
			
		||||
    getCudaAttribute<int>(&maxTex2DLayered[1],
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT,
 | 
			
		||||
                          dev);
 | 
			
		||||
    getCudaAttribute<int>(&maxTex2DLayered[2],
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS,
 | 
			
		||||
                          dev);
 | 
			
		||||
    printf(
 | 
			
		||||
        "  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d "
 | 
			
		||||
        "layers\n",
 | 
			
		||||
        maxTex2DLayered[0], maxTex2DLayered[1], maxTex2DLayered[2]);
 | 
			
		||||
        printf("\nDevice %d: \"%s\"\n", dev, deviceName);
 | 
			
		||||
 | 
			
		||||
    int totalConstantMemory;
 | 
			
		||||
    getCudaAttribute<int>(&totalConstantMemory,
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev);
 | 
			
		||||
    printf("  Total amount of constant memory:               %u bytes\n",
 | 
			
		||||
           totalConstantMemory);
 | 
			
		||||
    int sharedMemPerBlock;
 | 
			
		||||
    getCudaAttribute<int>(&sharedMemPerBlock,
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev);
 | 
			
		||||
    printf("  Total amount of shared memory per block:       %u bytes\n",
 | 
			
		||||
           sharedMemPerBlock);
 | 
			
		||||
    int regsPerBlock;
 | 
			
		||||
    getCudaAttribute<int>(®sPerBlock,
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
 | 
			
		||||
    printf("  Total number of registers available per block: %d\n",
 | 
			
		||||
           regsPerBlock);
 | 
			
		||||
    int warpSize;
 | 
			
		||||
    getCudaAttribute<int>(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
 | 
			
		||||
    printf("  Warp size:                                     %d\n", warpSize);
 | 
			
		||||
    int maxThreadsPerMultiProcessor;
 | 
			
		||||
    getCudaAttribute<int>(&maxThreadsPerMultiProcessor,
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
 | 
			
		||||
                          dev);
 | 
			
		||||
    printf("  Maximum number of threads per multiprocessor:  %d\n",
 | 
			
		||||
           maxThreadsPerMultiProcessor);
 | 
			
		||||
    int maxThreadsPerBlock;
 | 
			
		||||
    getCudaAttribute<int>(&maxThreadsPerBlock,
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
 | 
			
		||||
    printf("  Maximum number of threads per block:           %d\n",
 | 
			
		||||
           maxThreadsPerBlock);
 | 
			
		||||
        int driverVersion = 0;
 | 
			
		||||
        checkCudaErrors(cuDriverGetVersion(&driverVersion));
 | 
			
		||||
        printf("  CUDA Driver Version:                           %d.%d\n",
 | 
			
		||||
               driverVersion / 1000,
 | 
			
		||||
               (driverVersion % 100) / 10);
 | 
			
		||||
        printf("  CUDA Capability Major/Minor version number:    %d.%d\n", major, minor);
 | 
			
		||||
 | 
			
		||||
    int blockDim[3];
 | 
			
		||||
    getCudaAttribute<int>(&blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
 | 
			
		||||
                          dev);
 | 
			
		||||
    getCudaAttribute<int>(&blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y,
 | 
			
		||||
                          dev);
 | 
			
		||||
    getCudaAttribute<int>(&blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,
 | 
			
		||||
                          dev);
 | 
			
		||||
    printf("  Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
 | 
			
		||||
           blockDim[0], blockDim[1], blockDim[2]);
 | 
			
		||||
    int gridDim[3];
 | 
			
		||||
    getCudaAttribute<int>(&gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev);
 | 
			
		||||
    getCudaAttribute<int>(&gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev);
 | 
			
		||||
    getCudaAttribute<int>(&gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev);
 | 
			
		||||
    printf("  Max dimension size of a grid size (x,y,z):    (%d, %d, %d)\n",
 | 
			
		||||
           gridDim[0], gridDim[1], gridDim[2]);
 | 
			
		||||
        size_t totalGlobalMem;
 | 
			
		||||
        checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, dev));
 | 
			
		||||
 | 
			
		||||
    int textureAlign;
 | 
			
		||||
    getCudaAttribute<int>(&textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT,
 | 
			
		||||
                          dev);
 | 
			
		||||
    printf("  Texture alignment:                             %u bytes\n",
 | 
			
		||||
           textureAlign);
 | 
			
		||||
        char msg[256];
 | 
			
		||||
        SPRINTF(msg,
 | 
			
		||||
                "  Total amount of global memory:                 %.0f MBytes "
 | 
			
		||||
                "(%llu bytes)\n",
 | 
			
		||||
                (float)totalGlobalMem / 1048576.0f,
 | 
			
		||||
                (unsigned long long)totalGlobalMem);
 | 
			
		||||
        printf("%s", msg);
 | 
			
		||||
 | 
			
		||||
    int memPitch;
 | 
			
		||||
    getCudaAttribute<int>(&memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev);
 | 
			
		||||
    printf("  Maximum memory pitch:                          %u bytes\n",
 | 
			
		||||
           memPitch);
 | 
			
		||||
        int multiProcessorCount;
 | 
			
		||||
        getCudaAttribute<int>(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
 | 
			
		||||
 | 
			
		||||
    int gpuOverlap;
 | 
			
		||||
    getCudaAttribute<int>(&gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
 | 
			
		||||
        printf("  (%2d) Multiprocessors, (%3d) CUDA Cores/MP:     %d CUDA Cores\n",
 | 
			
		||||
               multiProcessorCount,
 | 
			
		||||
               _ConvertSMVer2CoresDRV(major, minor),
 | 
			
		||||
               _ConvertSMVer2CoresDRV(major, minor) * multiProcessorCount);
 | 
			
		||||
 | 
			
		||||
    int asyncEngineCount;
 | 
			
		||||
    getCudaAttribute<int>(&asyncEngineCount,
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
 | 
			
		||||
    printf(
 | 
			
		||||
        "  Concurrent copy and kernel execution:          %s with %d copy "
 | 
			
		||||
        "engine(s)\n",
 | 
			
		||||
        (gpuOverlap ? "Yes" : "No"), asyncEngineCount);
 | 
			
		||||
        int clockRate;
 | 
			
		||||
        getCudaAttribute<int>(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
 | 
			
		||||
        printf("  GPU Max Clock rate:                            %.0f MHz (%0.2f "
 | 
			
		||||
               "GHz)\n",
 | 
			
		||||
               clockRate * 1e-3f,
 | 
			
		||||
               clockRate * 1e-6f);
 | 
			
		||||
        int memoryClock;
 | 
			
		||||
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
 | 
			
		||||
        printf("  Memory Clock rate:                             %.0f Mhz\n", memoryClock * 1e-3f);
 | 
			
		||||
        int memBusWidth;
 | 
			
		||||
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
 | 
			
		||||
        printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);
 | 
			
		||||
        int L2CacheSize;
 | 
			
		||||
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
 | 
			
		||||
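
        // Unit note: CU_DEVICE_ATTRIBUTE_CLOCK_RATE and
        // CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE report kilohertz, which is why
        // the prints above scale by 1e-3f (kHz -> MHz) and 1e-6f (kHz -> GHz);
        // the L2 cache attribute, by contrast, is already in bytes.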

        if (L2CacheSize) {
            printf("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);
        }

        int maxTex1D, maxTex2D[2], maxTex3D[3];
        getCudaAttribute<int>(&maxTex1D, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, dev);
        getCudaAttribute<int>(&maxTex2D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, dev);
        getCudaAttribute<int>(&maxTex2D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, dev);
        getCudaAttribute<int>(&maxTex3D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, dev);
        getCudaAttribute<int>(&maxTex3D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, dev);
        getCudaAttribute<int>(&maxTex3D[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, dev);
        printf("  Max Texture Dimension Sizes                    1D=(%d) 2D=(%d, %d) "
               "3D=(%d, %d, %d)\n",
               maxTex1D,
               maxTex2D[0],
               maxTex2D[1],
               maxTex3D[0],
               maxTex3D[1],
               maxTex3D[2]);

        int maxTex1DLayered[2];
        getCudaAttribute<int>(&maxTex1DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH, dev);
        getCudaAttribute<int>(&maxTex1DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS, dev);
        printf("  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",
               maxTex1DLayered[0],
               maxTex1DLayered[1]);

        int maxTex2DLayered[3];
        getCudaAttribute<int>(&maxTex2DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH, dev);
        getCudaAttribute<int>(&maxTex2DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT, dev);
        getCudaAttribute<int>(&maxTex2DLayered[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS, dev);
        printf("  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d "
               "layers\n",
               maxTex2DLayered[0],
               maxTex2DLayered[1],
               maxTex2DLayered[2]);

        int totalConstantMemory;
        getCudaAttribute<int>(&totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev);
        printf("  Total amount of constant memory:               %u bytes\n", totalConstantMemory);
        int sharedMemPerBlock;
        getCudaAttribute<int>(&sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev);
        printf("  Total amount of shared memory per block:       %u bytes\n", sharedMemPerBlock);
        int regsPerBlock;
        getCudaAttribute<int>(&regsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
        printf("  Total number of registers available per block: %d\n", regsPerBlock);
        int warpSize;
        getCudaAttribute<int>(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
        printf("  Warp size:                                     %d\n", warpSize);
        int maxThreadsPerMultiProcessor;
        getCudaAttribute<int>(&maxThreadsPerMultiProcessor, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
        printf("  Maximum number of threads per multiprocessor:  %d\n", maxThreadsPerMultiProcessor);
        int maxThreadsPerBlock;
        getCudaAttribute<int>(&maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
        printf("  Maximum number of threads per block:           %d\n", maxThreadsPerBlock);

        int blockDim[3];
        getCudaAttribute<int>(&blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev);
        getCudaAttribute<int>(&blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev);
        getCudaAttribute<int>(&blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev);
        printf("  Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n", blockDim[0], blockDim[1], blockDim[2]);
        int gridDim[3];
        getCudaAttribute<int>(&gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev);
        getCudaAttribute<int>(&gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev);
        getCudaAttribute<int>(&gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev);
        printf("  Max dimension size of a grid size (x,y,z):    (%d, %d, %d)\n", gridDim[0], gridDim[1], gridDim[2]);

        int textureAlign;
        getCudaAttribute<int>(&textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev);
        printf("  Texture alignment:                             %u bytes\n", textureAlign);

        int memPitch;
        getCudaAttribute<int>(&memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev);
        printf("  Maximum memory pitch:                          %u bytes\n", memPitch);

        int gpuOverlap;
        getCudaAttribute<int>(&gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);

        int asyncEngineCount;
        getCudaAttribute<int>(&asyncEngineCount, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
        printf("  Concurrent copy and kernel execution:          %s with %d copy "
               "engine(s)\n",
               (gpuOverlap ? "Yes" : "No"),
               asyncEngineCount);

        int kernelExecTimeoutEnabled;
        getCudaAttribute<int>(&kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev);
        printf("  Run time limit on kernels:                     %s\n", kernelExecTimeoutEnabled ? "Yes" : "No");
        int integrated;
        getCudaAttribute<int>(&integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
        printf("  Integrated GPU sharing Host Memory:            %s\n", integrated ? "Yes" : "No");
        int canMapHostMemory;
        getCudaAttribute<int>(&canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
        printf("  Support host page-locked memory mapping:       %s\n", canMapHostMemory ? "Yes" : "No");

        int concurrentKernels;
        getCudaAttribute<int>(&concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
        printf("  Concurrent kernel execution:                   %s\n", concurrentKernels ? "Yes" : "No");

        int surfaceAlignment;
        getCudaAttribute<int>(&surfaceAlignment, CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, dev);
        printf("  Alignment requirement for Surfaces:            %s\n", surfaceAlignment ? "Yes" : "No");

        int eccEnabled;
        getCudaAttribute<int>(&eccEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev);
        printf("  Device has ECC support:                        %s\n", eccEnabled ? "Enabled" : "Disabled");

#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        int tccDriver;
        getCudaAttribute<int>(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev);
        printf("  CUDA Device Driver Mode (TCC or WDDM):         %s\n",
               tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
#endif

        int unifiedAddressing;
        getCudaAttribute<int>(&unifiedAddressing, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
        printf("  Device supports Unified Addressing (UVA):      %s\n", unifiedAddressing ? "Yes" : "No");

        int managedMemory;
        getCudaAttribute<int>(&managedMemory, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, dev);
        printf("  Device supports Managed Memory:                %s\n", managedMemory ? "Yes" : "No");

        int computePreemption;
        getCudaAttribute<int>(&computePreemption, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, dev);
        printf("  Device supports Compute Preemption:            %s\n", computePreemption ? "Yes" : "No");

        int cooperativeLaunch;
        getCudaAttribute<int>(&cooperativeLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, dev);
        printf("  Supports Cooperative Kernel Launch:            %s\n", cooperativeLaunch ? "Yes" : "No");

        int cooperativeMultiDevLaunch;
        getCudaAttribute<int>(&cooperativeMultiDevLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH, dev);
        printf("  Supports MultiDevice Co-op Kernel Launch:      %s\n", cooperativeMultiDevLaunch ? "Yes" : "No");

        int pciDomainID, pciBusID, pciDeviceID;
        getCudaAttribute<int>(&pciDomainID, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev);
        getCudaAttribute<int>(&pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev);
        getCudaAttribute<int>(&pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev);
        printf("  Device PCI Domain ID / Bus ID / location ID:   %d / %d / %d\n", pciDomainID, pciBusID, pciDeviceID);

        const char *sComputeMode[] = {"Default (multiple host threads can use ::cudaSetDevice() with this device "
                                      "simultaneously)",
                                      "Exclusive (only one host thread in one process is able to use "
                                      "::cudaSetDevice() with this device)",
                                      "Prohibited (no host thread can use ::cudaSetDevice() with this "
                                      "device)",
                                      "Exclusive Process (many threads in one process are able to use "
                                      "::cudaSetDevice() with this device)",
                                      "Unknown",
                                      NULL};

        int computeMode;
        getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
        printf("  Compute Mode:\n");
        printf("     < %s >\n", sComputeMode[computeMode]);
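
        // The compute mode reported here is an administrator setting rather than
        // a fixed device property; on Linux it can typically be changed with
        // nvidia-smi, e.g. `nvidia-smi -c EXCLUSIVE_PROCESS` (requires sufficient
        // privileges).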
    }

  printf("Result = PASS\n");
 | 
			
		||||
    // If there are 2 or more GPUs, query to determine whether RDMA is supported
 | 
			
		||||
    if (deviceCount >= 2) {
 | 
			
		||||
        int gpuid[64]; // we want to find the first two GPUs that can support P2P
 | 
			
		||||
        int gpu_p2p_count = 0;
 | 
			
		||||
        int tccDriver     = 0;
 | 
			
		||||
 | 
			
		||||
  exit(EXIT_SUCCESS);
 | 
			
		||||
        for (int i = 0; i < deviceCount; i++) {
 | 
			
		||||
            checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, i));
 | 
			
		||||
            checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, i));
 | 
			
		||||
            getCudaAttribute<int>(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, i);
 | 
			
		||||
 | 
			
		||||
            // Only boards based on Fermi or later can support P2P
 | 
			
		||||
            if ((major >= 2)
 | 
			
		||||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 | 
			
		||||
                // on Windows (64-bit), the Tesla Compute Cluster driver for windows
 | 
			
		||||
                // must be enabled to support this
 | 
			
		||||
                && tccDriver
 | 
			
		||||
#endif
 | 
			
		||||
            ) {
 | 
			
		||||
                // This is an array of P2P capable GPUs
 | 
			
		||||
                gpuid[gpu_p2p_count++] = i;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // Show all the combinations of support P2P GPUs
 | 
			
		||||
        int  can_access_peer;
 | 
			
		||||
        char deviceName0[256], deviceName1[256];
 | 
			
		||||
 | 
			
		||||
        if (gpu_p2p_count >= 2) {
 | 
			
		||||
            for (int i = 0; i < gpu_p2p_count; i++) {
 | 
			
		||||
                for (int j = 0; j < gpu_p2p_count; j++) {
 | 
			
		||||
                    if (gpuid[i] == gpuid[j]) {
 | 
			
		||||
                        continue;
 | 
			
		||||
                    }
 | 
			
		||||
                    checkCudaErrors(cuDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
 | 
			
		||||
                    checkCudaErrors(cuDeviceGetName(deviceName0, 256, gpuid[i]));
 | 
			
		||||
                    checkCudaErrors(cuDeviceGetName(deviceName1, 256, gpuid[j]));
 | 
			
		||||
                    printf("> Peer-to-Peer (P2P) access from %s (GPU%d) -> %s (GPU%d) : "
 | 
			
		||||
                           "%s\n",
 | 
			
		||||
                           deviceName0,
 | 
			
		||||
                           gpuid[i],
 | 
			
		||||
                           deviceName1,
 | 
			
		||||
                           gpuid[j],
 | 
			
		||||
                           can_access_peer ? "Yes" : "No");
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    printf("Result = PASS\n");
 | 
			
		||||
 | 
			
		||||
    exit(EXIT_SUCCESS);
 | 
			
		||||
}
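The fragment above depends on two helpers pulled in from the samples' common headers via `helper_cuda.h`: the `checkCudaErrors` macro and the templated `getCudaAttribute`. As a minimal sketch of the latter — assuming it matches the deviceQueryDrv sample and that `cuInit(0)` has already run — it is just a thin wrapper over the driver-API attribute query:

```cpp
// Sketch, not normative: getCudaAttribute as used above. Only T = int is
// ever instantiated in the sample, which matches the int* out-parameter
// that cuDeviceGetAttribute expects.
template <class T>
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
    checkCudaErrors(cuDeviceGetAttribute(attribute, device_attribute, device));
}
```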
@@ -30,4 +30,3 @@ cudaGetDeviceCount, cudaDeviceGetAttribute

Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

## References (for more details)

@@ -35,48 +35,44 @@

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper for shared functions common to CUDA Samples

int main(int argc, char **argv)
{
    int deviceCount = 0;
    checkCudaErrors(cudaGetDeviceCount(&deviceCount));

    // Enumerates Device <-> Device links
    for (int device1 = 0; device1 < deviceCount; device1++) {
        for (int device2 = 0; device2 < deviceCount; device2++) {
            if (device1 == device2)
                continue;

            int perfRank        = 0;
            int atomicSupported = 0;
            int accessSupported = 0;

            checkCudaErrors(
                cudaDeviceGetP2PAttribute(&accessSupported, cudaDevP2PAttrAccessSupported, device1, device2));
            checkCudaErrors(cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2));
            checkCudaErrors(
                cudaDeviceGetP2PAttribute(&atomicSupported, cudaDevP2PAttrNativeAtomicSupported, device1, device2));

            if (accessSupported) {
                std::cout << "GPU" << device1 << " <-> GPU" << device2 << ":" << std::endl;
                std::cout << "  * Atomic Supported: " << (atomicSupported ? "yes" : "no") << std::endl;
                std::cout << "  * Perf Rank: " << perfRank << std::endl;
            }
        }
    }

    // Enumerates Device <-> Host links
    for (int device = 0; device < deviceCount; device++) {
        int atomicSupported = 0;
        checkCudaErrors(cudaDeviceGetAttribute(&atomicSupported, cudaDevAttrHostNativeAtomicSupported, device));
        std::cout << "GPU" << device << " <-> CPU:" << std::endl;
        std::cout << "  * Atomic Supported: " << (atomicSupported ? "yes" : "no") << std::endl;
    }

    return 0;
}
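Both query paths above only report capability. To actually use a link that `cudaDevP2PAttrAccessSupported` (or `cuDeviceCanAccessPeer`) reports as available, peer access still has to be enabled explicitly, once per direction. A minimal runtime-API sketch follows; the helper name `enablePeerPair` is hypothetical and not part of the samples:

```cpp
#include <cuda_runtime.h>

// Hypothetical helper: enable peer access in both directions between
// dev0 and dev1, after confirming support each way.
static bool enablePeerPair(int dev0, int dev1)
{
    int ok01 = 0, ok10 = 0;
    cudaDeviceGetP2PAttribute(&ok01, cudaDevP2PAttrAccessSupported, dev0, dev1);
    cudaDeviceGetP2PAttribute(&ok10, cudaDevP2PAttrAccessSupported, dev1, dev0);
    if (!ok01 || !ok10)
        return false;

    // Peer access is per-direction and tied to the current device; the
    // flags argument of cudaDeviceEnablePeerAccess must be 0.
    cudaSetDevice(dev0);
    cudaError_t err = cudaDeviceEnablePeerAccess(dev1, 0);
    if (err != cudaSuccess && err != cudaErrorPeerAccessAlreadyEnabled)
        return false;

    cudaSetDevice(dev1);
    err = cudaDeviceEnablePeerAccess(dev0, 0);
    return err == cudaSuccess || err == cudaErrorPeerAccessAlreadyEnabled;
}
```

Once both directions are enabled, a kernel running on either device can dereference pointers allocated with `cudaMalloc` on the other, and peer copies can take the direct PCIe/NVLink path instead of staging through host memory.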