mirror of
				https://github.com/NVIDIA/cuda-samples.git
				synced 2025-10-31 03:07:49 +08:00 
			
		
		
		
	
		
			
				
	
	
		
			698 lines
		
	
	
		
			26 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			698 lines
		
	
	
		
			26 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 | |
| 
 | |
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>

#include <helper_cuda.h>
#include <helper_timer.h>
#include "commonDefs.hpp"
#include "commonKernels.hpp"
 | |
| 
 | |
// When non-zero, verifyMatrixMultiplyCorrectness() is compiled in and the
// warm-up GPU results in runMatrixMultiplyKernel() are checked against a CPU
// reference multiply.
#define VERIFY_GPU_CORRECTNESS 0

// Upper bound, in MiB-scaled units of ONE_MB, used by
// matrixMultiplyPerfRunner() to derive the largest matrix size to test.
size_t maxSampleSizeInMb = 64;
// Number of timed iterations per (matrix size, allocation type) pair; can be
// overridden on the command line with -kernel-iterations=<num>.
int numKernelRuns = 20;
// Set to 1 by -verbose; makes verifyMatrixData() print every mismatching
// element instead of only the mismatch count.
int verboseResults = 0;
 | |
| 
 | |
// Long, human-readable names of the memory allocation strategies.
// NOTE(review): presumably indexed by the allocation-type enum declared in
// commonDefs.hpp, so the order here must match that enum -- confirm.
const char *memAllocTypeStr[MEMALLOC_TYPE_COUNT] = {
    "Managed_Memory_With_Hints",
    "Managed_Memory_With_Hints_FullyAsync",
    "Managed_Memory_NoHints",
    "Zero_Copy",
    "Memcpy_HostMalloc_DeviceCudaMalloc",
    "MemcpyAsync_HostMalloc_DeviceCudaMalloc",
    "Memcpy_HostCudaHostAlloc_DeviceCudaMalloc",
    "MemcpyAsync_HostCudaHostAlloc_DeviceCudaMalloc"};

// Abbreviated names for the same strategies, in the same order as
// memAllocTypeStr (each entry's comment names the strategy it abbreviates).
const char *memAllocTypeShortStr[MEMALLOC_TYPE_COUNT] = {
    "UMhint",   // Managed memory with hints
    "UMhntAs",  // Managed memory with hints, asynchronous
    "UMeasy",   // Managed memory with no hints
    "0Copy",    // Zero copy
    "MemCopy",  // Host pageable memory and device memory
    "CpAsync",  // Host pageable memory and device memory, asynchronous
    "CpHpglk",  // Host page-locked memory and device memory
    "CpPglAs"   // Host page-locked memory and device memory, asynchronous
};
 | |
| 
 | |
// Returns a pseudo-random float obtained by linearly interpolating between
// `low` and `high` with a weight drawn from rand() / RAND_MAX.
static float RandFloat(float low, float high) {
  const float weight =
      static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
  const float lowPart = (1.0f - weight) * low;
  const float highPart = weight * high;
  return lowPart + highPart;
}
 | |
| 
 | |
// Fills a square, row-major matrixDim x matrixDim host matrix with
// pseudo-random values in [0, 10], visiting elements row by row.
void fillMatrixWithRandomValues(float *matrix, unsigned int matrixDim) {
  for (unsigned int row = 0; row < matrixDim; ++row) {
    // Offset of the first element of this row.
    const unsigned int rowBase = row * matrixDim;
    for (unsigned int col = 0; col < matrixDim; ++col) {
      matrix[col + rowBase] = RandFloat(0.0f, 10.0f);
    }
  }
}
 | |
| 
 | |
| #if VERIFY_GPU_CORRECTNESS
 | |
// Checks a GPU-produced product C against a CPU reference computation of
// A * B (all matrices are square, row-major, matrixDim x matrixDim).
//
// An element counts as a mismatch when it differs from the reference by more
// than 0.001 * matrixDim; the tolerance grows with the dimension because each
// output element accumulates matrixDim products. Every mismatch is printed.
// If any mismatch is found the process exits with EXIT_FAILURE.
void verifyMatrixMultiplyCorrectness(float *C, float *A, float *B,
                                     unsigned int matrixDim) {
  unsigned int i, j, k, numErrors = 0;
  for (i = 0; i < matrixDim; ++i) {
    for (j = 0; j < matrixDim; ++j) {
      // Reference dot product of row i of A with column j of B.
      float result = 0.0f;
      for (k = 0; k < matrixDim; ++k) {
        result += A[k + i * matrixDim] * B[j + k * matrixDim];
      }
      if (fabs(C[j + i * matrixDim] - result) > 0.001 * matrixDim) {
        printf("At [%u, %u]: Expected %f, Found %f\n", i, j, result,
               C[j + i * matrixDim]);
        ++numErrors;
      }
    }
  }
  if (numErrors != 0) {
    // numErrors is unsigned, so it is printed with %u.
    printf("%u value mismatches occured\n", numErrors);
    fflush(stdout);
    exit(EXIT_FAILURE);  // exit since value mismatches occurred
  }
}
 | |
| #endif
 | |
| 
 | |
// Copies a square matrixDim x matrixDim float matrix from srcMatrix to
// dstMatrix with memcpy. The two buffers must not overlap.
void copyMatrix(float *dstMatrix, float *srcMatrix, unsigned int matrixDim) {
  // Widen before multiplying so the byte count cannot wrap in 32-bit
  // unsigned arithmetic for large dimensions.
  size_t size = (size_t)matrixDim * matrixDim * sizeof(float);
  memcpy(dstMatrix, srcMatrix, size);
}
 | |
| 
 | |
// Compares two square, row-major matrixDim x matrixDim matrices element by
// element using exact float equality.
//
// When the global verboseResults flag is set, each mismatching element is
// printed. If any mismatch is found, the mismatch count is printed and the
// process exits with EXIT_FAILURE.
void verifyMatrixData(float *expectedData, float *observedData,
                      unsigned int matrixDim) {
  unsigned int i, j, numErrors = 0;
  for (i = 0; i < matrixDim; ++i) {
    for (j = 0; j < matrixDim; ++j) {
      if (expectedData[j + i * matrixDim] != observedData[j + i * matrixDim]) {
        ++numErrors;
        if (verboseResults) {
          printf("At [%u, %u]: Expected %f, Found %f\n", i, j,
                 expectedData[j + i * matrixDim],
                 observedData[j + i * matrixDim]);
        }
      }
    }
  }
  if (numErrors != 0) {
    // numErrors is unsigned, so it is printed with %u.
    printf("%u value mismatches occured\n", numErrors);
    fflush(stdout);
    exit(EXIT_FAILURE);  // exit since value mismatches occurred
  }
}
 | |
| 
 | |
#define BLOCK_SIZE 32
// Tiled matrix multiply: C = A * B for square, row-major matrixDim x matrixDim
// matrices.
//
// Launch requirements (the kernel performs no bounds checks):
//   * block = (BLOCK_SIZE, BLOCK_SIZE) threads
//   * grid  = (matrixDim / BLOCK_SIZE, matrixDim / BLOCK_SIZE) blocks
//   * matrixDim must be a multiple of BLOCK_SIZE
// Uses 2 * BLOCK_SIZE * BLOCK_SIZE floats of static shared memory per block.
// Each thread produces exactly one element of C.
__global__ void matrixMultiplyKernel(float *C, float *A, float *B,
                                     unsigned int matrixDim) {
  // Shared-memory staging tiles for the current sub-matrices of A and B.
  __shared__ float tileA[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ float tileB[BLOCK_SIZE][BLOCK_SIZE];

  // Position of this block in the grid and of this thread in the block.
  const int blockCol = blockIdx.x;
  const int blockRow = blockIdx.y;
  const int localCol = threadIdx.x;
  const int localRow = threadIdx.y;

  // Both operands are square, so their row widths are the same.
  const unsigned int widthA = matrixDim;
  const unsigned int widthB = matrixDim;

  // A is walked left-to-right along this block's row band: first and last
  // element offsets of that band's first row, and the per-tile step.
  const int firstA = matrixDim * BLOCK_SIZE * blockRow;
  const int lastA = firstA + widthA - 1;
  const int strideA = BLOCK_SIZE;

  // B is walked top-to-bottom along this block's column band.
  const int firstB = BLOCK_SIZE * blockCol;
  const int strideB = BLOCK_SIZE * widthB;

  // Running dot product for the single output element owned by this thread.
  float acc = 0;

  int offsetA = firstA;
  int offsetB = firstB;
  while (offsetA <= lastA) {
    // Each thread stages one element of each tile.
    tileA[localRow][localCol] = A[offsetA + widthA * localRow + localCol];
    tileB[localRow][localCol] = B[offsetB + widthB * localRow + localCol];

    // Both tiles must be fully populated before any thread reads them.
    __syncthreads();

#pragma unroll
    for (int k = 0; k < BLOCK_SIZE; ++k) {
      acc += tileA[localRow][k] * tileB[k][localCol];
    }

    // All reads of the current tiles must finish before the next iteration
    // overwrites them.
    __syncthreads();

    offsetA += strideA;
    offsetB += strideB;
  }

  // Store this thread's element of the block's output tile.
  const int outBase = widthB * BLOCK_SIZE * blockRow + BLOCK_SIZE * blockCol;
  C[outBase + widthB * localRow + localCol] = acc;
}
 | |
| 
 | |
// Runs the matrix-multiply benchmark for one matrix size and one memory
// allocation strategy, recording per-iteration timings.
//
// Parameters:
//   matrixDim  - dimension of the square matrices; the launch configuration
//                assumes it is a multiple of BLOCK_SIZE.
//   allocType  - one of the USE_* allocation strategies handled by the switch
//                statements below; any other value terminates the process.
//   numLoops   - number of timed iterations; every output array below must
//                hold at least numLoops entries.
//   gpuLaunchCallsTimes            - [out] kernel launch call time (includes
//                                    the stream sync for non-async types).
//   gpuTransferToCallsTimes        - [out] host-to-device copy/prefetch/attach
//                                    call time.
//   gpuTransferFromCallsTimes      - [out] device-to-host copy/prefetch/attach
//                                    call time.
//   gpuLaunchAndTransferCallsTimes - [out] sum of the three values above.
//   gpuLaunchTransferSyncTimes     - [out] the sum above plus, for async
//                                    types, the final stream sync time.
//   cpuAccessTimes                 - [out] time spent on the host filling the
//                                    inputs and verifying the output.
//   overallTimes                   - [out] cpuAccessTimes +
//                                    gpuLaunchTransferSyncTimes.
//   device_id  - CUDA device whose properties are queried and which is the
//                prefetch target for managed memory.
//
// Reference results for X*Y and Y*X are first computed on the GPU with plain
// cudaMalloc buffers; each timed iteration then alternates between the two
// operand orders and compares its output bit-for-bit with the matching
// reference. Allocation failures, CUDA errors, and result mismatches
// terminate the process.
void runMatrixMultiplyKernel(unsigned int matrixDim, int allocType,
                             unsigned int numLoops, double *gpuLaunchCallsTimes,
                             double *gpuTransferToCallsTimes,
                             double *gpuTransferFromCallsTimes,
                             double *gpuLaunchAndTransferCallsTimes,
                             double *gpuLaunchTransferSyncTimes,
                             double *cpuAccessTimes, double *overallTimes,
                             int device_id) {
  float *dptrA = NULL, *hptrA = NULL;
  float *dptrB = NULL, *hptrB = NULL;
  float *dptrC = NULL, *hptrC = NULL;
  float *randValuesX = NULL, *randValuesY = NULL;
  float *randValuesVerifyXmulY = NULL, *randValuesVerifyYmulX = NULL;
  bool copyRequired = false, hintsRequired = false;
  bool someTransferOpRequired;
  bool isAsync = false;
  cudaStream_t streamToRunOn;
  unsigned int *latch;
  // Widen before multiplying so the byte count cannot wrap in 32-bit
  // unsigned arithmetic.
  size_t size = (size_t)matrixDim * matrixDim * sizeof(float);
  // The kernel requires BLOCK_SIZE x BLOCK_SIZE thread blocks.
  dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
  dim3 grid(matrixDim / threads.x, matrixDim / threads.y);
  StopWatchInterface *gpuLaunchCallsTimer = 0, *gpuTransferCallsTimer = 0;
  StopWatchInterface *gpuSyncTimer = 0, *cpuAccessTimer = 0;
  sdkCreateTimer(&gpuLaunchCallsTimer);
  sdkCreateTimer(&gpuTransferCallsTimer);
  sdkCreateTimer(&gpuSyncTimer);
  sdkCreateTimer(&cpuAccessTimer);
  unsigned int i;

  cudaDeviceProp deviceProp;
  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device_id));
  checkCudaErrors(cudaStreamCreate(&streamToRunOn));

  randValuesX = (float *)malloc(size);
  if (!randValuesX) {
    exit(EXIT_FAILURE);  // exit since memory allocation error
  }
  randValuesY = (float *)malloc(size);
  if (!randValuesY) {
    exit(EXIT_FAILURE);  // exit since memory allocation error
  }
  randValuesVerifyXmulY = (float *)malloc(size);
  if (!randValuesVerifyXmulY) {
    exit(EXIT_FAILURE);  // exit since memory allocation error
  }
  randValuesVerifyYmulX = (float *)malloc(size);
  if (!randValuesVerifyYmulX) {
    exit(EXIT_FAILURE);  // exit since memory allocation error
  }
  checkCudaErrors(cudaMalloc(&dptrA, size));
  checkCudaErrors(cudaMalloc(&dptrB, size));
  checkCudaErrors(cudaMalloc(&dptrC, size));

  fillMatrixWithRandomValues(randValuesX, matrixDim);
  fillMatrixWithRandomValues(randValuesY, matrixDim);

  // Untimed warm-up on the default stream: compute the reference products
  // X*Y and Y*X that the timed iterations are verified against.
  checkCudaErrors(
      cudaMemcpyAsync(dptrA, randValuesX, size, cudaMemcpyHostToDevice));
  checkCudaErrors(
      cudaMemcpyAsync(dptrB, randValuesY, size, cudaMemcpyHostToDevice));
  matrixMultiplyKernel<<<grid, threads>>>(dptrC, dptrA, dptrB, matrixDim);
  checkCudaErrors(cudaGetLastError());  // catch launch-configuration errors
  checkCudaErrors(cudaMemcpyAsync(randValuesVerifyXmulY, dptrC, size,
                                  cudaMemcpyDeviceToHost));
  checkCudaErrors(cudaStreamSynchronize(NULL));
  matrixMultiplyKernel<<<grid, threads>>>(dptrC, dptrB, dptrA, matrixDim);
  checkCudaErrors(cudaGetLastError());  // catch launch-configuration errors
  checkCudaErrors(cudaMemcpyAsync(randValuesVerifyYmulX, dptrC, size,
                                  cudaMemcpyDeviceToHost));
  checkCudaErrors(cudaStreamSynchronize(NULL));
#if VERIFY_GPU_CORRECTNESS
  verifyMatrixMultiplyCorrectness(randValuesVerifyXmulY, randValuesX,
                                  randValuesY, matrixDim);
  verifyMatrixMultiplyCorrectness(randValuesVerifyYmulX, randValuesY,
                                  randValuesX, matrixDim);
#endif
  checkCudaErrors(cudaFree(dptrA));
  checkCudaErrors(cudaFree(dptrB));
  checkCudaErrors(cudaFree(dptrC));

  // Pinned host flag that the spinWhileLessThanOne kernel is handed; the
  // host sets it to 1 to release the stream once all work is enqueued.
  checkCudaErrors(cudaMallocHost(&latch, sizeof(unsigned int)));

  // Allocate the host/device buffers according to the strategy under test.
  switch (allocType) {
    case USE_HOST_PAGEABLE_AND_DEVICE_MEMORY:
    case USE_HOST_PAGEABLE_AND_DEVICE_MEMORY_ASYNC:
      hptrA = (float *)malloc(size);
      if (!hptrA) {
        exit(EXIT_FAILURE);  // exit since memory allocation error
      }
      hptrB = (float *)malloc(size);
      if (!hptrB) {
        exit(EXIT_FAILURE);  // exit since memory allocation error
      }
      hptrC = (float *)malloc(size);
      if (!hptrC) {
        exit(EXIT_FAILURE);  // exit since memory allocation error
      }
      checkCudaErrors(cudaMalloc(&dptrA, size));
      checkCudaErrors(cudaMalloc(&dptrB, size));
      checkCudaErrors(cudaMalloc(&dptrC, size));
      copyRequired = true;
      break;

    case USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY:
    case USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY_ASYNC:
      checkCudaErrors(cudaMallocHost(&hptrA, size));
      checkCudaErrors(cudaMallocHost(&hptrB, size));
      checkCudaErrors(cudaMallocHost(&hptrC, size));
      checkCudaErrors(cudaMalloc(&dptrA, size));
      checkCudaErrors(cudaMalloc(&dptrB, size));
      checkCudaErrors(cudaMalloc(&dptrC, size));
      copyRequired = true;
      break;

    case USE_ZERO_COPY:
      // The device accesses the pinned host buffers directly, so no copies
      // are needed.
      checkCudaErrors(cudaMallocHost(&hptrA, size));
      checkCudaErrors(cudaMallocHost(&hptrB, size));
      checkCudaErrors(cudaMallocHost(&hptrC, size));
      checkCudaErrors(cudaHostGetDevicePointer(&dptrA, hptrA, 0));
      checkCudaErrors(cudaHostGetDevicePointer(&dptrB, hptrB, 0));
      checkCudaErrors(cudaHostGetDevicePointer(&dptrC, hptrC, 0));
      break;

    case USE_MANAGED_MEMORY:
      checkCudaErrors(cudaMallocManaged(&dptrA, size));
      checkCudaErrors(cudaMallocManaged(&dptrB, size));
      checkCudaErrors(cudaMallocManaged(&dptrC, size));
      // Managed memory: host and device share the same pointers.
      hptrA = dptrA;
      hptrB = dptrB;
      hptrC = dptrC;
      break;

    case USE_MANAGED_MEMORY_WITH_HINTS:
    case USE_MANAGED_MEMORY_WITH_HINTS_ASYNC:
      if (deviceProp.concurrentManagedAccess) {
        // Start with the pages resident on the host.
        checkCudaErrors(cudaMallocManaged(&dptrA, size));
        checkCudaErrors(cudaMallocManaged(&dptrB, size));
        checkCudaErrors(cudaMallocManaged(&dptrC, size));
        checkCudaErrors(cudaMemPrefetchAsync(dptrA, size, cudaCpuDeviceId));
        checkCudaErrors(cudaMemPrefetchAsync(dptrB, size, cudaCpuDeviceId));
        checkCudaErrors(cudaMemPrefetchAsync(dptrC, size, cudaCpuDeviceId));
      } else {
        // Without concurrent managed access, stream attachment is used in
        // place of prefetching.
        checkCudaErrors(cudaMallocManaged(&dptrA, size, cudaMemAttachHost));
        checkCudaErrors(cudaMallocManaged(&dptrB, size, cudaMemAttachHost));
        checkCudaErrors(cudaMallocManaged(&dptrC, size, cudaMemAttachHost));
      }
      hptrA = dptrA;
      hptrB = dptrB;
      hptrC = dptrC;
      hintsRequired = true;
      break;

    default:
      exit(EXIT_FAILURE);  // exit with error
  }

  if (allocType == USE_HOST_PAGEABLE_AND_DEVICE_MEMORY_ASYNC ||
      allocType == USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY_ASYNC ||
      allocType == USE_MANAGED_MEMORY_WITH_HINTS_ASYNC) {
    isAsync = true;
  }

  someTransferOpRequired = copyRequired || hintsRequired;

  // fill buffers with 0 to avoid any first access page-fault overheads.
  memset(hptrA, 0, size);
  memset(hptrB, 0, size);
  memset(hptrC, 0, size);

  for (i = 0; i < numLoops; i++) {
    // Even iterations compute X*Y, odd iterations compute Y*X. The
    // parentheses matter: `i & 0x1 == 0` parses as `i & (0x1 == 0)`, which
    // is always 0.
    const bool evenIteration = ((i & 0x1) == 0);

    cpuAccessTimes[i] = 0.0;
    gpuLaunchCallsTimes[i] = 0.0;
    gpuTransferToCallsTimes[i] = 0.0;
    gpuTransferFromCallsTimes[i] = 0.0;

    // Host-side input preparation (counted as CPU access time).
    sdkStartTimer(&cpuAccessTimer);
    {
      copyMatrix(hptrA, evenIteration ? randValuesX : randValuesY, matrixDim);
      copyMatrix(hptrB, evenIteration ? randValuesY : randValuesX, matrixDim);
    }
    sdkStopTimer(&cpuAccessTimer);
    cpuAccessTimes[i] += sdkGetAverageTimerValue(&cpuAccessTimer);
    sdkResetTimer(&cpuAccessTimer);

    if (isAsync && hintsRequired) {
      *latch = 0;
      // Prevent any work on stream from starting until all work is pushed
      spinWhileLessThanOne<<<1, 1, 0, streamToRunOn>>>(latch);
      checkCudaErrors(cudaGetLastError());  // catch launch errors
    }

    // Host -> device transfer calls (copies, prefetches, or attachments).
    if (someTransferOpRequired) {
      sdkStartTimer(&gpuTransferCallsTimer);
      if (copyRequired) {
        if (isAsync) {
          checkCudaErrors(cudaMemcpyAsync(
              dptrA, hptrA, size, cudaMemcpyHostToDevice, streamToRunOn));
          checkCudaErrors(cudaMemcpyAsync(
              dptrB, hptrB, size, cudaMemcpyHostToDevice, streamToRunOn));
        } else {
          checkCudaErrors(
              cudaMemcpy(dptrA, hptrA, size, cudaMemcpyHostToDevice));
          checkCudaErrors(
              cudaMemcpy(dptrB, hptrB, size, cudaMemcpyHostToDevice));
        }
      }
      if (hintsRequired) {
        if (deviceProp.concurrentManagedAccess) {
          checkCudaErrors(
              cudaMemPrefetchAsync(dptrA, size, device_id, streamToRunOn));
          checkCudaErrors(
              cudaMemPrefetchAsync(dptrB, size, device_id, streamToRunOn));
          checkCudaErrors(
              cudaMemPrefetchAsync(dptrC, size, device_id, streamToRunOn));
        } else {
          checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrA, 0,
                                                   cudaMemAttachGlobal));
          checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrB, 0,
                                                   cudaMemAttachGlobal));
          checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrC, 0,
                                                   cudaMemAttachGlobal));
        }
        if (!isAsync) {
          checkCudaErrors(cudaStreamSynchronize(streamToRunOn));
        }
      }

      sdkStopTimer(&gpuTransferCallsTimer);
      gpuTransferToCallsTimes[i] +=
          sdkGetAverageTimerValue(&gpuTransferCallsTimer);
      sdkResetTimer(&gpuTransferCallsTimer);
    }

    // Kernel launch; for non-async types the wait for completion is part of
    // the measured launch time.
    sdkStartTimer(&gpuLaunchCallsTimer);
    {
      matrixMultiplyKernel<<<grid, threads, 0, streamToRunOn>>>(
          dptrC, dptrA, dptrB, matrixDim);
      if (!isAsync) {
        checkCudaErrors(cudaStreamSynchronize(streamToRunOn));
      }
    }
    sdkStopTimer(&gpuLaunchCallsTimer);
    // Checked after the timer stops so the check does not perturb the
    // measurement.
    checkCudaErrors(cudaGetLastError());

    gpuLaunchCallsTimes[i] += sdkGetAverageTimerValue(&gpuLaunchCallsTimer);
    sdkResetTimer(&gpuLaunchCallsTimer);

    // Device -> host transfer calls.
    if (someTransferOpRequired) {
      sdkStartTimer(&gpuTransferCallsTimer);
      if (hintsRequired) {
        if (deviceProp.concurrentManagedAccess) {
          // NOTE(review): these prefetches are issued on the default stream,
          // not streamToRunOn, so the stream syncs below do not wait for
          // them -- confirm this is intended.
          checkCudaErrors(cudaMemPrefetchAsync(dptrA, size, cudaCpuDeviceId));
          checkCudaErrors(cudaMemPrefetchAsync(dptrB, size, cudaCpuDeviceId));
          checkCudaErrors(cudaMemPrefetchAsync(dptrC, size, cudaCpuDeviceId));
        } else {
          checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrA, 0,
                                                   cudaMemAttachHost));
          checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrB, 0,
                                                   cudaMemAttachHost));
          checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrC, 0,
                                                   cudaMemAttachHost));
        }
        if (!isAsync) {
          checkCudaErrors(cudaStreamSynchronize(streamToRunOn));
        }
      }
      if (copyRequired) {
        if (isAsync) {
          checkCudaErrors(cudaMemcpyAsync(
              hptrC, dptrC, size, cudaMemcpyDeviceToHost, streamToRunOn));
        } else {
          checkCudaErrors(
              cudaMemcpy(hptrC, dptrC, size, cudaMemcpyDeviceToHost));
        }
      }
      sdkStopTimer(&gpuTransferCallsTimer);
      gpuTransferFromCallsTimes[i] +=
          sdkGetAverageTimerValue(&gpuTransferCallsTimer);
      sdkResetTimer(&gpuTransferCallsTimer);
    }
    gpuLaunchAndTransferCallsTimes[i] = gpuLaunchCallsTimes[i] +
                                        gpuTransferToCallsTimes[i] +
                                        gpuTransferFromCallsTimes[i];
    gpuLaunchTransferSyncTimes[i] = gpuLaunchAndTransferCallsTimes[i];
    if (isAsync) {
      // Async types: release the latch (if one was armed) and time the wait
      // for everything enqueued on the stream.
      sdkStartTimer(&gpuSyncTimer);
      {
        if (hintsRequired) {
          *latch = 1;
        }
        checkCudaErrors(cudaStreamSynchronize(streamToRunOn));
      }
      sdkStopTimer(&gpuSyncTimer);
      gpuLaunchTransferSyncTimes[i] += sdkGetAverageTimerValue(&gpuSyncTimer);
      sdkResetTimer(&gpuSyncTimer);
    }

    // Host-side verification against the reference for this iteration's
    // operand order (counted as CPU access time).
    sdkStartTimer(&cpuAccessTimer);
    {
      verifyMatrixData(
          evenIteration ? randValuesVerifyXmulY : randValuesVerifyYmulX, hptrC,
          matrixDim);
    }
    sdkStopTimer(&cpuAccessTimer);
    cpuAccessTimes[i] += sdkGetAverageTimerValue(&cpuAccessTimer);
    sdkResetTimer(&cpuAccessTimer);
    overallTimes[i] = cpuAccessTimes[i] + gpuLaunchTransferSyncTimes[i];
  }

  // Release the buffers with the API matching how they were allocated.
  switch (allocType) {
    case USE_HOST_PAGEABLE_AND_DEVICE_MEMORY:
    case USE_HOST_PAGEABLE_AND_DEVICE_MEMORY_ASYNC:
      free(hptrA);
      free(hptrB);
      free(hptrC);
      checkCudaErrors(cudaFree(dptrA));
      checkCudaErrors(cudaFree(dptrB));
      checkCudaErrors(cudaFree(dptrC));
      break;

    case USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY:
    case USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY_ASYNC:
      checkCudaErrors(cudaFreeHost(hptrA));
      checkCudaErrors(cudaFreeHost(hptrB));
      checkCudaErrors(cudaFreeHost(hptrC));
      checkCudaErrors(cudaFree(dptrA));
      checkCudaErrors(cudaFree(dptrB));
      checkCudaErrors(cudaFree(dptrC));
      break;

    case USE_ZERO_COPY:
      // Device pointers alias the pinned host buffers; free the host side
      // only.
      checkCudaErrors(cudaFreeHost(hptrA));
      checkCudaErrors(cudaFreeHost(hptrB));
      checkCudaErrors(cudaFreeHost(hptrC));
      break;

    case USE_MANAGED_MEMORY:
    case USE_MANAGED_MEMORY_WITH_HINTS:
    case USE_MANAGED_MEMORY_WITH_HINTS_ASYNC:
      // Host pointers alias the managed allocations; free each one once.
      checkCudaErrors(cudaFree(dptrA));
      checkCudaErrors(cudaFree(dptrB));
      checkCudaErrors(cudaFree(dptrC));
      break;

    default:
      exit(EXIT_FAILURE);  // exit due to error
  }

  checkCudaErrors(cudaStreamDestroy(streamToRunOn));
  checkCudaErrors(cudaFreeHost(latch));
  free(randValuesX);
  free(randValuesY);
  free(randValuesVerifyXmulY);
  free(randValuesVerifyYmulX);
  sdkDeleteTimer(&gpuLaunchCallsTimer);
  sdkDeleteTimer(&gpuTransferCallsTimer);
  sdkDeleteTimer(&gpuSyncTimer);
  sdkDeleteTimer(&cpuAccessTimer);
}
 | |
| 
 | |
// Drives the whole benchmark: for every matrix dimension (starting at 32 and
// doubling while the matrix fits in a quarter of maxSampleSizeInMb) and every
// allocation type from MEMALLOC_TYPE_START to MEMALLOC_TYPE_END, runs
// runMatrixMultiplyKernel() and stores its timings, then prints and frees the
// collected results.
//
// reportAsBandwidth is forwarded to each result-data series;
// print_launch_transfer_results and print_std_deviation are forwarded to
// printResults(); device_id is forwarded to runMatrixMultiplyKernel().
void matrixMultiplyPerfRunner(bool reportAsBandwidth,
                              bool print_launch_transfer_results,
                              bool print_std_deviation, int device_id) {
  unsigned int smallestDim = 32;
  unsigned int dimGrowth = 2;
  unsigned int smallestBytes = smallestDim * smallestDim * sizeof(float);
  // 3 buffers are used, but dividing by 4 (power of 2)
  unsigned int largestBytes = (maxSampleSizeInMb * ONE_MB) / 4;
  // Doubling the dimension quadruples the byte size.
  unsigned int bytesGrowth = dimGrowth * dimGrowth;

  struct testResults *results;
  struct resultsData *launchSeries;
  struct resultsData *transferToSeries;
  struct resultsData *transferFromSeries;
  struct resultsData *launchAndTransferSeries;
  struct resultsData *launchTransferSyncSeries;
  struct resultsData *cpuAccessSeries;
  struct resultsData *overallSeries;

  unsigned int numSizesToTest =
      findNumSizesToTest(smallestBytes, largestBytes, bytesGrowth);

  createAndInitTestResults(&results, "matrixMultiplyPerf", numKernelRuns,
                           numSizesToTest);

  unsigned long *sizesToTest = getPtrSizesToTest(results);

  // The series are registered in the same order as before.
  createResultDataAndAddToTestResults(&launchSeries, results,
                                      "GPU Kernel Launch Call Time", false,
                                      reportAsBandwidth);
  createResultDataAndAddToTestResults(&transferToSeries, results,
                                      "CPU to GPU Transfer Calls Time", false,
                                      reportAsBandwidth);
  createResultDataAndAddToTestResults(&transferFromSeries, results,
                                      "GPU to CPU Transfer Calls Time", false,
                                      reportAsBandwidth);
  createResultDataAndAddToTestResults(&launchAndTransferSeries, results,
                                      "GPU Launch and Transfer Calls Time",
                                      false, reportAsBandwidth);
  createResultDataAndAddToTestResults(&launchTransferSyncSeries, results,
                                      "GPU Launch Transfer and Sync Time",
                                      false, reportAsBandwidth);
  createResultDataAndAddToTestResults(&cpuAccessSeries, results,
                                      "CPU Access Time", false,
                                      reportAsBandwidth);
  createResultDataAndAddToTestResults(&overallSeries, results, "Overall Time",
                                      false, reportAsBandwidth);

  printf("Running ");
  unsigned int sizeIdx = 0;
  unsigned int dim = smallestDim;
  while (dim * dim <= largestBytes / sizeof(float)) {
    sizesToTest[sizeIdx] = dim * dim * sizeof(float);

    int allocType;
    for (allocType = MEMALLOC_TYPE_START; allocType <= MEMALLOC_TYPE_END;
         allocType++) {
      // One progress dot per (size, allocation type) run.
      printf(".");
      fflush(stdout);
      runMatrixMultiplyKernel(
          dim, allocType, numKernelRuns,
          getPtrRunTimesInMs(launchSeries, allocType, sizeIdx),
          getPtrRunTimesInMs(transferToSeries, allocType, sizeIdx),
          getPtrRunTimesInMs(transferFromSeries, allocType, sizeIdx),
          getPtrRunTimesInMs(launchAndTransferSeries, allocType, sizeIdx),
          getPtrRunTimesInMs(launchTransferSyncSeries, allocType, sizeIdx),
          getPtrRunTimesInMs(cpuAccessSeries, allocType, sizeIdx),
          getPtrRunTimesInMs(overallSeries, allocType, sizeIdx), device_id);
    }

    dim *= dimGrowth;
    ++sizeIdx;
  }
  printf("\n");
  printResults(results, print_launch_transfer_results, print_std_deviation);
  freeTestResultsAndAllResultsData(results);
}
 | |
| 
 | |
// Prints the command-line synopsis and a description of every option to
// stdout. The default iteration count is taken from numKernelRuns so the help
// text cannot drift from the actual default.
static void usage() {
  printf(
      "./cudaMemoryTypesPerf [-device=<device_id>] [-reportAsBandwidth] "
      "[-print-launch-transfer-results] [-print-std-deviation] "
      "[-kernel-iterations=<num>] [-verbose]\n");
  printf("Options:\n");
  printf(
      "-reportAsBandwidth:             By default time taken is printed, this "
      "option allows to instead print bandwidth.\n");
  printf(
      "-print-launch-transfer-results: By default overall results are printed, "
      "this option allows to print data transfers and kernel time as well.\n");
  printf(
      "-print-std-deviation:           Prints std deviation of the results.\n");
  printf(
      "-kernel-iterations=<num>:       Number of times the kernel tests should "
      "be run[default is %d iterations].\n",
      numKernelRuns);
  printf(
      "-device=<device_id>:            Allows to pass GPU Device ID on which "
      "the tests will be run.\n");
  printf("-verbose:                       Prints highly verbose output.\n");
}
 | |
| 
 | |
// Program entry point: parses the command-line flags, selects the CUDA
// device, runs the matrix-multiply benchmark, and exits.
//
// Exit codes: EXIT_WAIVED when help is requested, EXIT_FAILURE when
// -kernel-iterations is not a positive integer, EXIT_SUCCESS otherwise
// (failures inside the benchmark terminate the process on their own).
int main(int argc, char **argv) {
  bool reportAsBandwidth = false;
  bool print_launch_transfer_results = false;
  bool print_std_deviation = false;

  if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
      checkCmdLineFlag(argc, (const char **)argv, "h")) {
    usage();
    printf("&&&& %s WAIVED\n", argv[0]);
    exit(EXIT_WAIVED);
  }

  if (checkCmdLineFlag(argc, (const char **)argv, "reportAsBandwidth")) {
    reportAsBandwidth = true;
  }

  if (checkCmdLineFlag(argc, (const char **)argv,
                       "print-launch-transfer-results")) {
    print_launch_transfer_results = true;
  }

  if (checkCmdLineFlag(argc, (const char **)argv, "print-std-deviation")) {
    print_std_deviation = true;
  }

  if (checkCmdLineFlag(argc, (const char **)argv, "kernel-iterations")) {
    numKernelRuns =
        getCmdLineArgumentInt(argc, (const char **)argv, "kernel-iterations");
    // The value is later used as an unsigned loop count and array length, so
    // zero or negative input (including a value that could not be parsed,
    // presumably reported as 0 by the helper) must be rejected here.
    if (numKernelRuns <= 0) {
      printf(
          "Invalid -kernel-iterations value %d; it must be a positive "
          "integer.\n",
          numKernelRuns);
      exit(EXIT_FAILURE);
    }
  }

  if (checkCmdLineFlag(argc, (const char **)argv, "verbose")) {
    verboseResults = 1;
  }

  int device_id = findCudaDevice(argc, (const char **)argv);

  matrixMultiplyPerfRunner(reportAsBandwidth, print_launch_transfer_results,
                           print_std_deviation, device_id);

  printf(
      "\nNOTE: The CUDA Samples are not meant for performance measurements. "
      "Results may vary when GPU Boost is enabled.\n");
  exit(EXIT_SUCCESS);
}
 |