/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/**
 * Matrix multiplication: C = A * B.
 *
 * This sample implements matrix multiplication that uses shared memory to
 * ensure data reuse; the multiplication is performed with a tiling approach.
 * On devices of compute capability 8.0 or higher, the CUDA kernels copy data
 * asynchronously from global to shared memory, a.k.a. async-copy.
 * This sample has been written for clarity of exposition to illustrate various
 * CUDA programming principles, not with the goal of providing the most
 * performant generic kernel for matrix multiplication.
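 *
 * The sample compares several variants of the same tiled multiplication:
 * multi-stage and single-stage cuda::pipeline async copies, an arrive-wait
 * barrier version, a producer/consumer pipeline with shared state, and
 * naive synchronous baselines (with and without float4 chunked loads).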
 */

// System includes
#include <stdio.h>
#include <assert.h>

// CUDA runtime
#include <cuda_runtime.h>
#include <cuda/pipeline>

#if __CUDA_ARCH__ >= 700
#include <cuda/barrier>
#endif

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

// Helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>

enum kernels {
  AsyncCopyMultiStageLargeChunk  = 0,
  AsyncCopyLargeChunk            = 1,
  AsyncCopyLargeChunkAWBarrier   = 2,
  AsyncCopyMultiStageSharedState = 3,
  AsyncCopyMultiStage            = 4,
  AsyncCopySingleStage           = 5,
  Naive                          = 6,
  NaiveLargeChunk                = 7
};

const char *kernelNames[] = {"AsyncCopyMultiStageLargeChunk",
                             "AsyncCopyLargeChunk",
                             "AsyncCopyLargeChunkAWBarrier",
                             "AsyncCopyMultiStageSharedState",
                             "AsyncCopyMultiStage",
                             "AsyncCopySingleStage",
                             "Naive",
                             "NaiveLargeChunk"};

constexpr int blockSize = 16;

// Multi Stage memcpy_async pipeline with large chunk copy
template <int BLOCK_SIZE>
__global__ void MatrixMulAsyncCopyMultiStageLargeChunk(
    float *__restrict__ C, const float *__restrict__ A,
    const float *__restrict__ B, int wA, int wB) {
  // Requires BLOCK_SIZE % 4 == 0

  // Multi-stage pipeline version
  constexpr size_t maxPipelineStages = 4;

  // Declaration of the shared memory array As used to
  // store the sub-matrix of A for each stage
  __shared__ alignas(
      alignof(float4)) float As[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE];

  // Declaration of the shared memory array Bs used to
  // store the sub-matrix of B for each stage
  __shared__ alignas(
      alignof(float4)) float Bs[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE];

  float Csub = 0.0;

  // Index of the first sub-matrix of A processed by the block
  const int aBegin = wA * BLOCK_SIZE * blockIdx.y;

  // Index of the last sub-matrix of A processed by the block
  const int aEnd = aBegin + wA - 1;

  // Step size used to iterate through the sub-matrices of A
  int aStep = BLOCK_SIZE;

  // Index of the first sub-matrix of B processed by the block
  const int bBegin = BLOCK_SIZE * blockIdx.x;

  // Step size used to iterate through the sub-matrices of B
  int bStep = BLOCK_SIZE * wB;

  const int t4x = threadIdx.x * 4;
  const auto shape4 = cuda::aligned_size_t<alignof(float4)>(sizeof(float4));
  cuda::pipeline<cuda::thread_scope_thread> pipe = cuda::make_pipeline();

  // Loop over all the sub-matrices of A and B
  // required to compute the block sub-matrix
  for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin,
           iStage = 0;
       a <= aEnd; a += aStep, b += bStep, ++i) {
    // Load the matrices from device memory to shared memory; each thread loads
    // one element of each matrix
    for (; aStage <= a + aStep * maxPipelineStages;
         aStage += aStep, bStage += bStep, ++iStage) {
      pipe.producer_acquire();
      if (aStage <= aEnd && t4x < BLOCK_SIZE) {
        // Rotating buffer
        const int j = iStage % maxPipelineStages;
        cuda::memcpy_async(&As[j][threadIdx.y][t4x],
                           &A[aStage + wA * threadIdx.y + t4x], shape4, pipe);
        cuda::memcpy_async(&Bs[j][threadIdx.y][t4x],
                           &B[bStage + wB * threadIdx.y + t4x], shape4, pipe);
      }
      pipe.producer_commit();
    }

    pipe.consumer_wait();

    // Synchronize to make sure the matrices are loaded
    __syncthreads();

    // Rotating buffer
    const int j = i % maxPipelineStages;

    // Multiply the two matrices together;
    // each thread computes one element
    // of the block sub-matrix
#pragma unroll
    for (int k = 0; k < BLOCK_SIZE; ++k) {
      Csub += As[j][threadIdx.y][k] * Bs[j][k][threadIdx.x];
    }
    pipe.consumer_release();

    // Don't have to synchronize because maxPipelineStages is greater than one,
    // so the next iteration loads into a different buffer.
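    // For example, with maxPipelineStages == 4 the rotating buffer index j
    // cycles 0, 1, 2, 3, 0, ...; while this iteration computes on stage j,
    // the copy loop above has already committed async copies into the other
    // stages, letting those loads overlap with the computation.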
  }

  // Write the block sub-matrix to device memory;
  // each thread writes one element
  int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x;
  C[c + wB * threadIdx.y + threadIdx.x] = Csub;
}

// Single Stage memcpy_async pipeline with Large copy chunk (float4)
template <int BLOCK_SIZE>
__global__ void MatrixMulAsyncCopyLargeChunk(float *__restrict__ C,
                                             const float *__restrict__ A,
                                             const float *__restrict__ B,
                                             int wA, int wB) {
  // Requires BLOCK_SIZE % 4 == 0

  // Declaration of the shared memory array As used to
  // store the sub-matrix of A
  __shared__ alignas(alignof(float4)) float As[BLOCK_SIZE][BLOCK_SIZE];

  // Declaration of the shared memory array Bs used to
  // store the sub-matrix of B
  __shared__ alignas(alignof(float4)) float Bs[BLOCK_SIZE][BLOCK_SIZE];

  // Index of the first sub-matrix of A processed by the block
  int aBegin = wA * BLOCK_SIZE * blockIdx.y;

  // Index of the last sub-matrix of A processed by the block
  int aEnd = aBegin + wA - 1;

  // Step size used to iterate through the sub-matrices of A
  int aStep = BLOCK_SIZE;

  // Index of the first sub-matrix of B processed by the block
  int bBegin = BLOCK_SIZE * blockIdx.x;

  // Step size used to iterate through the sub-matrices of B
  int bStep = BLOCK_SIZE * wB;

  // Single-stage pipeline version
  float Csub = 0.0;

  const int t4x = threadIdx.x * 4;
  const auto shape4 = cuda::aligned_size_t<alignof(float4)>(sizeof(float4));
  cuda::pipeline<cuda::thread_scope_thread> pipe = cuda::make_pipeline();

  // Loop over all the sub-matrices of A and B
  // required to compute the block sub-matrix
  for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
    // Load the matrices from device memory to shared memory;
    // a subset of threads loads a contiguous chunk of elements.

    // Previously, per-thread:
    // As[ty][tx] = A[a + wA * ty + tx];
    // Bs[ty][tx] = B[b + wB * ty + tx];

    // Now, one fourth of the threads load four elements of each matrix
    if (t4x < BLOCK_SIZE) {
      pipe.producer_acquire();

      cuda::memcpy_async(&As[threadIdx.y][t4x],
                         &A[a + wA * threadIdx.y + t4x], shape4, pipe);
      cuda::memcpy_async(&Bs[threadIdx.y][t4x],
                         &B[b + wB * threadIdx.y + t4x], shape4, pipe);

      pipe.producer_commit();
      pipe.consumer_wait();
    }

    // Synchronize to make sure the matrices are loaded
    __syncthreads();

    // Multiply the two matrices together;
    // each thread computes one element
    // of the block sub-matrix
#pragma unroll
    for (int k = 0; k < BLOCK_SIZE; ++k) {
      Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x];
    }

    pipe.consumer_release();

    // Synchronize to make sure that the preceding
    // computation is done before overwriting the
    // shared memory sub-matrix buffers As and Bs in the next iteration.
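    // Unlike the multi-stage variant above, this kernel has only one shared
    // buffer per matrix, so a block-wide barrier is needed in every iteration
    // before the buffers can be refilled.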
    __syncthreads();
  }

  // Write the block sub-matrix to device memory;
  // each thread writes one element
  int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x;
  C[c + wB * threadIdx.y + threadIdx.x] = Csub;
}

// Single Stage memcpy_async pipeline with Large copy chunk (float4) using
// arrive-wait barrier
template <int BLOCK_SIZE>
__global__ void MatrixMulAsyncCopyLargeChunkAWBarrier(
    float *__restrict__ C, const float *__restrict__ A,
    const float *__restrict__ B, int wA, int wB) {
#if __CUDA_ARCH__ >= 700
#pragma diag_suppress static_var_with_dynamic_init
  // Requires BLOCK_SIZE % 4 == 0
  __shared__ cuda::barrier<cuda::thread_scope_block> bar;

  // Declaration of the shared memory array As used to
  // store the sub-matrix of A
  __shared__ alignas(alignof(float4)) float As[BLOCK_SIZE][BLOCK_SIZE];

  // Declaration of the shared memory array Bs used to
  // store the sub-matrix of B
  __shared__ alignas(alignof(float4)) float Bs[BLOCK_SIZE][BLOCK_SIZE];

  if (threadIdx.x == 0) {
    init(&bar, blockDim.x * blockDim.y);
  }
  __syncthreads();

  // Index of the first sub-matrix of A processed by the block
  int aBegin = wA * BLOCK_SIZE * blockIdx.y;

  // Index of the last sub-matrix of A processed by the block
  int aEnd = aBegin + wA - 1;

  // Step size used to iterate through the sub-matrices of A
  int aStep = BLOCK_SIZE;

  // Index of the first sub-matrix of B processed by the block
  int bBegin = BLOCK_SIZE * blockIdx.x;

  // Step size used to iterate through the sub-matrices of B
  int bStep = BLOCK_SIZE * wB;

  float Csub = 0.0;

  const int t4x = threadIdx.x * 4;

  // Loop over all the sub-matrices of A and B
  // required to compute the block sub-matrix
  for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
    // Load the matrices from device memory to shared memory;
    // a subset of threads loads a contiguous chunk of elements.

    // Now, one fourth of the threads load four elements of each matrix
    if (t4x < BLOCK_SIZE) {
      float4 *const A4s = reinterpret_cast<float4 *>(&As[threadIdx.y][t4x]);
      float4 *const B4s = reinterpret_cast<float4 *>(&Bs[threadIdx.y][t4x]);
      const float4 *const A4 =
          reinterpret_cast<const float4 *>(&A[a + wA * threadIdx.y + t4x]);
      const float4 *const B4 =
          reinterpret_cast<const float4 *>(&B[b + wB * threadIdx.y + t4x]);

      cuda::memcpy_async(A4s, A4, sizeof(float4), bar);
      cuda::memcpy_async(B4s, B4, sizeof(float4), bar);
    }

    // Synchronize to make sure the matrices are loaded
    bar.arrive_and_wait();

    // Multiply the two matrices together;
    // each thread computes one element
    // of the block sub-matrix
#pragma unroll
    for (int k = 0; k < BLOCK_SIZE; ++k) {
      Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x];
    }

    // Synchronize to make sure that the preceding
    // computation is done before overwriting the
    // shared memory sub-matrix buffers As and Bs in the next iteration.
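    // A second arrive_and_wait() is used here in place of __syncthreads():
    // every thread arrives at the barrier and then waits for the whole block,
    // which prevents any thread from starting the next iteration's copies
    // before all threads have finished computing on the current buffers.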
    bar.arrive_and_wait();
  }

  // Write the block sub-matrix to device memory;
  // each thread writes one element
  int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x;
  C[c + wB * threadIdx.y + threadIdx.x] = Csub;
#endif
}

// Single Stage memcpy_async pipeline with float copy
template <int BLOCK_SIZE>
__global__ void MatrixMulAsyncCopySingleStage(float *C, const float *A,
                                              const float *B, int wA, int wB) {
  // Declaration of the shared memory array As used to
  // store the sub-matrix of A
  __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

  // Declaration of the shared memory array Bs used to
  // store the sub-matrix of B
  __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

  // Index of the first sub-matrix of A processed by the block
  int aBegin = wA * BLOCK_SIZE * blockIdx.y;

  // Index of the last sub-matrix of A processed by the block
  int aEnd = aBegin + wA - 1;

  // Step size used to iterate through the sub-matrices of A
  int aStep = BLOCK_SIZE;

  // Index of the first sub-matrix of B processed by the block
  int bBegin = BLOCK_SIZE * blockIdx.x;

  // Step size used to iterate through the sub-matrices of B
  int bStep = BLOCK_SIZE * wB;

  // Single-stage pipeline version
  float Csub = 0.0;

  cuda::pipeline<cuda::thread_scope_thread> pipe = cuda::make_pipeline();
  const auto shape1 = cuda::aligned_size_t<alignof(float)>(sizeof(float));

  // Loop over all the sub-matrices of A and B
  // required to compute the block sub-matrix
  for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
    // Load the matrices from device memory to shared memory; each thread loads
    // one element of each matrix
    {
      pipe.producer_acquire();

      cuda::memcpy_async(&As[threadIdx.y][threadIdx.x],
                         &A[a + wA * threadIdx.y + threadIdx.x], shape1, pipe);
      cuda::memcpy_async(&Bs[threadIdx.y][threadIdx.x],
                         &B[b + wB * threadIdx.y + threadIdx.x], shape1, pipe);

      pipe.producer_commit();
    }

    pipe.consumer_wait();

    // Synchronize to make sure the matrices are loaded
    __syncthreads();

    // Multiply the two matrices together;
    // each thread computes one element
    // of the block sub-matrix
#pragma unroll
    for (int k = 0; k < BLOCK_SIZE; ++k) {
      Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x];
    }

    pipe.consumer_release();

    // Synchronize to make sure that the preceding
    // computation is done before overwriting the
    // shared memory sub-matrix buffers As and Bs in the next iteration.
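    // With a single stage there is no second buffer to rotate into, so the
    // copy for the next tile cannot start until this barrier completes; this
    // kernel mainly serves as a baseline for the multi-stage variants.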
    __syncthreads();
  }

  // Write the block sub-matrix to device memory;
  // each thread writes one element
  int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x;
  C[c + wB * threadIdx.y + threadIdx.x] = Csub;
}

// Multi Stage memcpy_async thread_scope_thread pipeline with single-element
// async-copy
template <int BLOCK_SIZE>
__global__ void MatrixMulAsyncCopyMultiStage(float *__restrict__ C,
                                             const float *__restrict__ A,
                                             const float *__restrict__ B,
                                             int wA, int wB) {
  // Multi-stage pipeline version
  constexpr size_t maxPipelineStages = 4;

  // Declaration of the shared memory array As used to
  // store the sub-matrix of A for each stage
  __shared__ float As[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE];

  // Declaration of the shared memory array Bs used to
  // store the sub-matrix of B for each stage
  __shared__ float Bs[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE];

  float Csub = 0.0;

  // Index of the first sub-matrix of A processed by the block
  const int aBegin = wA * BLOCK_SIZE * blockIdx.y;

  // Index of the last sub-matrix of A processed by the block
  const int aEnd = aBegin + wA - 1;

  // Step size used to iterate through the sub-matrices of A
  int aStep = BLOCK_SIZE;

  // Index of the first sub-matrix of B processed by the block
  const int bBegin = BLOCK_SIZE * blockIdx.x;

  // Step size used to iterate through the sub-matrices of B
  int bStep = BLOCK_SIZE * wB;

  cuda::pipeline<cuda::thread_scope_thread> pipe = cuda::make_pipeline();
  const auto shape1 = cuda::aligned_size_t<alignof(float)>(sizeof(float));

  // Loop over all the sub-matrices of A and B
  // required to compute the block sub-matrix
  for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin,
           iStage = 0;
       a <= aEnd; a += aStep, b += bStep, ++i) {
    // Load the matrices from device memory to shared memory; each thread loads
    // one element of each matrix
    for (; aStage <= a + aStep * maxPipelineStages;
         aStage += aStep, bStage += bStep, ++iStage) {
      if (aStage <= aEnd) {
        // Rotating buffer
        const int j = iStage % maxPipelineStages;

        pipe.producer_acquire();

        cuda::memcpy_async(&As[j][threadIdx.y][threadIdx.x],
                           &A[aStage + wA * threadIdx.y + threadIdx.x], shape1,
                           pipe);
        cuda::memcpy_async(&Bs[j][threadIdx.y][threadIdx.x],
                           &B[bStage + wB * threadIdx.y + threadIdx.x], shape1,
                           pipe);

        pipe.producer_commit();
      }
    }

    pipe.consumer_wait();

    // Synchronize to make sure the matrices are loaded
    __syncthreads();

    const int j = i % maxPipelineStages;

    // Multiply the two matrices together;
    // each thread computes one element
    // of the block sub-matrix
    for (int k = 0; k < BLOCK_SIZE; ++k) {
      Csub += As[j][threadIdx.y][k] * Bs[j][k][threadIdx.x];
    }

    pipe.consumer_release();

    // Don't have to synchronize because maxPipelineStages is greater than one,
    // so the next iteration loads into a different buffer.
  }

  // Write the block sub-matrix to device memory;
  // each thread writes one element
  int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x;
  C[c + wB * threadIdx.y + threadIdx.x] = Csub;
}

// Multi Stage shared state memcpy_async pipeline thread_scope_block
// with partitioned producer & consumer: one warp acts as the producer
// group, which issues memcpy_async operations, and all remaining warps form
// the consumer group, which performs the GEMM computation on the matrices
// loaded by the producer.
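// The block is launched with BLOCK_SIZE_X x (BLOCK_SIZE_X + 2) threads: the
// first BLOCK_SIZE_X rows of threads are consumers, and the extra two rows
// (one 32-thread warp when BLOCK_SIZE_X == 16) form the producer warp.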
template <int BLOCK_SIZE_X>
__global__ void MatrixMulAsyncCopyMultiStageSharedState(
    float *__restrict__ C, const float *__restrict__ A,
    const float *__restrict__ B, int wA, int wB) {
  // Multi-stage pipeline version
  constexpr size_t maxPipelineStages = 4;

  // Declaration of the shared memory array As used to
  // store the sub-matrix of A for each stage
  __shared__ float As[maxPipelineStages][BLOCK_SIZE_X][BLOCK_SIZE_X];

  // Declaration of the shared memory array Bs used to
  // store the sub-matrix of B for each stage
  __shared__ float Bs[maxPipelineStages][BLOCK_SIZE_X][BLOCK_SIZE_X];

  float Csub = 0.0;

  // Index of the first sub-matrix of A processed by the block
  const int aBegin = wA * BLOCK_SIZE_X * blockIdx.y;

  // Index of the last sub-matrix of A processed by the block
  const int aEnd = aBegin + wA - 1;

  // Step size used to iterate through the sub-matrices of A
  constexpr int aStep = BLOCK_SIZE_X;

  // Index of the first sub-matrix of B processed by the block
  const int bBegin = BLOCK_SIZE_X * blockIdx.x;

  // Step size used to iterate through the sub-matrices of B
  int bStep = BLOCK_SIZE_X * wB;

  auto cta = cg::this_thread_block();

  const auto shape1 = cuda::aligned_size_t<alignof(float)>(sizeof(float));
  __shared__ cuda::pipeline_shared_state<cuda::thread_scope_block,
                                         maxPipelineStages>
      shared_state;
  constexpr int consumer_row_count = BLOCK_SIZE_X;

  const auto thread_role = (cta.thread_index().y < consumer_row_count)
                               ? cuda::pipeline_role::consumer
                               : cuda::pipeline_role::producer;
  auto pipe = cuda::make_pipeline(cta, &shared_state, thread_role);

  // Loop over all the sub-matrices of A and B
  // required to compute the block sub-matrix
  for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin,
           iStage = 0;
       a <= aEnd; a += aStep, b += bStep, ++i) {
    if (threadIdx.y >= consumer_row_count) {
      // This is the producer warp, because threadIdx.y >= 16 where
      // 16 == consumer_row_count; it loads the matrices from device memory
      // to shared memory.
      for (; aStage <= a + aStep * maxPipelineStages;
           aStage += aStep, bStage += bStep, ++iStage) {
        if (aStage <= aEnd) {
          // Rotating buffer
          const int j = iStage % maxPipelineStages;
          const int strideRows = (blockDim.y - consumer_row_count);

          pipe.producer_acquire();

          for (int rowId = threadIdx.y - consumer_row_count;
               rowId < BLOCK_SIZE_X; rowId += strideRows) {
            cuda::memcpy_async(&As[j][rowId][threadIdx.x],
                               &A[aStage + wA * rowId + threadIdx.x], shape1,
                               pipe);
            cuda::memcpy_async(&Bs[j][rowId][threadIdx.x],
                               &B[bStage + wB * rowId + threadIdx.x], shape1,
                               pipe);
          }

          pipe.producer_commit();
        }
      }
    } else {
      // These are the consumer threads, because threadIdx.y <
      // consumer_row_count where consumer_row_count == 16; they compute the
      // GEMM on the matrices loaded into shared memory by the producer warp.
      const int j = i % maxPipelineStages;

      // Synchronize the consumer group to make sure the matrices are loaded
      // by the producer group.
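      // consumer_wait() blocks the consumer threads until the oldest
      // committed stage is ready; consumer_release() afterwards hands that
      // stage's buffers back to the producer warp for reuse.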
      pipe.consumer_wait();

      // Multiply the two matrices together;
      // each thread computes one element
      // of the block sub-matrix
#pragma unroll
      for (int k = 0; k < BLOCK_SIZE_X; ++k) {
        Csub += As[j][threadIdx.y][k] * Bs[j][k][threadIdx.x];
      }
      pipe.consumer_release();
    }
  }

  // Write the block sub-matrix to device memory;
  // each thread writes one element
  if (threadIdx.y < consumer_row_count) {
    const int c = wB * BLOCK_SIZE_X * blockIdx.y + BLOCK_SIZE_X * blockIdx.x;
    C[c + wB * threadIdx.y + threadIdx.x] = Csub;
  }
}

/**
 * Matrix multiplication (CUDA Kernel) on the device: C = A * B
 * wA is A's width and wB is B's width
 */
template <int BLOCK_SIZE>
__global__ void MatrixMulNaive(float *C, float *A, float *B, int wA, int wB) {
  // Declaration of the shared memory array As used to
  // store the sub-matrix of A
  __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

  // Declaration of the shared memory array Bs used to
  // store the sub-matrix of B
  __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

  // Index of the first sub-matrix of A processed by the block
  int aBegin = wA * BLOCK_SIZE * blockIdx.y;

  // Index of the last sub-matrix of A processed by the block
  int aEnd = aBegin + wA - 1;

  // Step size used to iterate through the sub-matrices of A
  int aStep = BLOCK_SIZE;

  // Index of the first sub-matrix of B processed by the block
  int bBegin = BLOCK_SIZE * blockIdx.x;

  // Step size used to iterate through the sub-matrices of B
  int bStep = BLOCK_SIZE * wB;

  // Csub is used to store the element of the block sub-matrix
  // that is computed by the thread
  float Csub = 0;

  // Loop over all the sub-matrices of A and B
  // required to compute the block sub-matrix
  for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
    // Load the matrices from device memory
    // to shared memory; each thread loads
    // one element of each matrix
    As[threadIdx.y][threadIdx.x] = A[a + wA * threadIdx.y + threadIdx.x];
    Bs[threadIdx.y][threadIdx.x] = B[b + wB * threadIdx.y + threadIdx.x];

    // Synchronize to make sure the matrices are loaded
    __syncthreads();

    // Multiply the two matrices together;
    // each thread computes one element
    // of the block sub-matrix
#pragma unroll
    for (int k = 0; k < BLOCK_SIZE; ++k) {
      Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x];
    }

    // Synchronize to make sure that the preceding
    // computation is done before loading two new
    // sub-matrices of A and B in the next iteration
    __syncthreads();
  }

  // Write the block sub-matrix to device memory;
  // each thread writes one element
  int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x;
  C[c + wB * threadIdx.y + threadIdx.x] = Csub;
}

template <int BLOCK_SIZE>
__global__ void MatrixMulNaiveLargeChunk(float *C, float *A, float *B, int wA,
                                         int wB) {
  // Declaration of the shared memory array As used to
  // store the sub-matrix of A
  __shared__ alignas(alignof(float4)) float As[BLOCK_SIZE][BLOCK_SIZE];

  // Declaration of the shared memory array Bs used to
  // store the sub-matrix of B
  __shared__ alignas(alignof(float4)) float Bs[BLOCK_SIZE][BLOCK_SIZE];

  int t4x = threadIdx.x * 4;

  // Index of the first sub-matrix of A processed by the block
  int aBegin = wA * BLOCK_SIZE * blockIdx.y;

  // Index of the last sub-matrix of A processed by the block
  int aEnd = aBegin + wA - 1;

  // Step size used to iterate through the sub-matrices of A
  int aStep = BLOCK_SIZE;

  // Index of the first sub-matrix of B processed by the block
  int bBegin = BLOCK_SIZE * blockIdx.x;

  // Step size used to iterate through the sub-matrices of B
  int bStep = BLOCK_SIZE * wB;

  // Csub is used to store the element of the block sub-matrix
  // that is computed by the thread
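  // (Note: this kernel uses the same float4 chunked loads as the LargeChunk
  // kernels above, but with ordinary synchronous loads instead of
  // memcpy_async, so it isolates the benefit of vectorized loads from that of
  // the asynchronous copy.)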
  float Csub = 0;

  // Loop over all the sub-matrices of A and B
  // required to compute the block sub-matrix
  for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
    // Load the matrices from device memory
    // to shared memory;

    // One fourth of the threads load four elements of each matrix
    if (t4x < BLOCK_SIZE) {
      float4 *const A4s = reinterpret_cast<float4 *>(&As[threadIdx.y][t4x]);
      float4 *const B4s = reinterpret_cast<float4 *>(&Bs[threadIdx.y][t4x]);
      const float4 *const A4 =
          reinterpret_cast<const float4 *>(&A[a + wA * threadIdx.y + t4x]);
      const float4 *const B4 =
          reinterpret_cast<const float4 *>(&B[b + wB * threadIdx.y + t4x]);
      *A4s = *A4;
      *B4s = *B4;
    }

    // Synchronize to make sure the matrices are loaded
    __syncthreads();

    // Multiply the two matrices together;
    // each thread computes one element
    // of the block sub-matrix
#pragma unroll
    for (int k = 0; k < BLOCK_SIZE; ++k) {
      Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x];
    }

    // Synchronize to make sure that the preceding
    // computation is done before loading two new
    // sub-matrices of A and B in the next iteration
    __syncthreads();
  }

  // Write the block sub-matrix to device memory;
  // each thread writes one element
  int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x;
  C[c + wB * threadIdx.y + threadIdx.x] = Csub;
}

void ConstantInit(float *data, int size, float val) {
  for (int i = 0; i < size; ++i) {
    data[i] = val;
  }
}

/**
 * Run matrix multiplication using CUDA
 */
int MatrixMultiply(int argc, char **argv, const dim3 &dimsA, const dim3 &dimsB,
                   kernels kernel_number) {
  // Allocate host memory for matrices A and B
  unsigned int size_A = dimsA.x * dimsA.y;
  unsigned int mem_size_A = sizeof(float) * size_A;
  float *h_A;
  checkCudaErrors(cudaMallocHost(&h_A, mem_size_A));
  unsigned int size_B = dimsB.x * dimsB.y;
  unsigned int mem_size_B = sizeof(float) * size_B;
  float *h_B;
  checkCudaErrors(cudaMallocHost(&h_B, mem_size_B));
  cudaStream_t stream;

  // Initialize host memory
  const float valB = 2.10f;
  ConstantInit(h_A, size_A, 1.0f);
  ConstantInit(h_B, size_B, valB);

  // Allocate device memory
  float *d_A, *d_B, *d_C;

  // Allocate host matrix C
  dim3 dimsC(dimsB.x, dimsA.y, 1);
  unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
  float *h_C;
  checkCudaErrors(cudaMallocHost(&h_C, mem_size_C));

  if (h_C == NULL) {
    fprintf(stderr, "Failed to allocate host matrix C!\n");
    exit(EXIT_FAILURE);
  }

  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A));
  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B));
  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C));

  // Allocate CUDA events that we'll use for timing
  cudaEvent_t start, stop;
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));
  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

  // Copy host memory to device
  checkCudaErrors(
      cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
  checkCudaErrors(
      cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));
  checkCudaErrors(cudaMemsetAsync(d_C, 0, mem_size_C, stream));

  // Setup execution parameters
  dim3 threads(blockSize, blockSize);
  dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);

  // Here the block size is 16x18, where the first 16 rows form the consumer
  // thread group and the last 2 rows (1 warp) form the producer thread group.
  dim3 threadsSharedStateKernel(blockSize, blockSize + 2, 1);
  dim3 gridSharedStateKernel(dimsB.x / threadsSharedStateKernel.x,
                             dimsA.y / threadsSharedStateKernel.x);

  printf("Running kernel = %d - %s\n", kernel_number,
         kernelNames[kernel_number]);
  // Create and start timer
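  // A single warmup launch of the selected kernel is issued first, so that
  // one-time startup costs do not distort the timed loop that follows.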
  printf("Computing result using CUDA Kernel...\n");

  // Performs warmup operation using matrixMul CUDA kernel
  switch (kernel_number) {
    case AsyncCopyMultiStageLargeChunk:
    default:
      MatrixMulAsyncCopyMultiStageLargeChunk<blockSize>
          <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
      break;
    case AsyncCopyLargeChunk:
      MatrixMulAsyncCopyLargeChunk<blockSize><<<grid, threads, 0, stream>>>(
          d_C, d_A, d_B, dimsA.x, dimsB.x);
      break;
    case AsyncCopyLargeChunkAWBarrier:
      MatrixMulAsyncCopyLargeChunkAWBarrier<blockSize>
          <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
      break;
    case AsyncCopyMultiStageSharedState:
      MatrixMulAsyncCopyMultiStageSharedState<blockSize>
          <<<gridSharedStateKernel, threadsSharedStateKernel, 0, stream>>>(
              d_C, d_A, d_B, dimsA.x, dimsB.x);
      break;
    case AsyncCopyMultiStage:
      MatrixMulAsyncCopyMultiStage<blockSize><<<grid, threads, 0, stream>>>(
          d_C, d_A, d_B, dimsA.x, dimsB.x);
      break;
    case AsyncCopySingleStage:
      MatrixMulAsyncCopySingleStage<blockSize><<<grid, threads, 0, stream>>>(
          d_C, d_A, d_B, dimsA.x, dimsB.x);
      break;
    case Naive:
      MatrixMulNaive<blockSize><<<grid, threads, 0, stream>>>(
          d_C, d_A, d_B, dimsA.x, dimsB.x);
      break;
    case NaiveLargeChunk:
      MatrixMulNaiveLargeChunk<blockSize><<<grid, threads, 0, stream>>>(
          d_C, d_A, d_B, dimsA.x, dimsB.x);
      break;
  }

  printf("done\n");
  checkCudaErrors(cudaStreamSynchronize(stream));

  // Execute the kernel
  int nIter = 100;

  // Record the start event
  checkCudaErrors(cudaEventRecord(start, stream));

  for (int j = 0; j < nIter; j++) {
    switch (kernel_number) {
      case AsyncCopyMultiStageLargeChunk:
      default:
        MatrixMulAsyncCopyMultiStageLargeChunk<blockSize>
            <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        break;
      case AsyncCopyLargeChunk:
        MatrixMulAsyncCopyLargeChunk<blockSize><<<grid, threads, 0, stream>>>(
            d_C, d_A, d_B, dimsA.x, dimsB.x);
        break;
      case AsyncCopyLargeChunkAWBarrier:
        MatrixMulAsyncCopyLargeChunkAWBarrier<blockSize>
            <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        break;
      case AsyncCopyMultiStageSharedState:
        MatrixMulAsyncCopyMultiStageSharedState<blockSize>
            <<<gridSharedStateKernel, threadsSharedStateKernel, 0, stream>>>(
                d_C, d_A, d_B, dimsA.x, dimsB.x);
        break;
      case AsyncCopyMultiStage:
        MatrixMulAsyncCopyMultiStage<blockSize><<<grid, threads, 0, stream>>>(
            d_C, d_A, d_B, dimsA.x, dimsB.x);
        break;
      case AsyncCopySingleStage:
        MatrixMulAsyncCopySingleStage<blockSize><<<grid, threads, 0, stream>>>(
            d_C, d_A, d_B, dimsA.x, dimsB.x);
        break;
      case Naive:
        MatrixMulNaive<blockSize><<<grid, threads, 0, stream>>>(
            d_C, d_A, d_B, dimsA.x, dimsB.x);
        break;
      case NaiveLargeChunk:
        MatrixMulNaiveLargeChunk<blockSize><<<grid, threads, 0, stream>>>(
            d_C, d_A, d_B, dimsA.x, dimsB.x);
        break;
    }
  }

  // Record the stop event
  checkCudaErrors(cudaEventRecord(stop, stream));

  // Wait for the stop event to complete
  checkCudaErrors(cudaEventSynchronize(stop));

  float msecTotal = 0.0f;
  checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));

  // Compute and print the performance
  float msecPerMatrixMul = msecTotal / nIter;
  double flopsPerMatrixMul = 2.0 * static_cast<double>(dimsA.x) *
                             static_cast<double>(dimsA.y) *
                             static_cast<double>(dimsB.x);
  double gigaFlops =
      (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
  printf(
      "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
      " WorkgroupSize= %u threads/block\n",
      gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threads.x * threads.y);

  // Copy result from device to host
  checkCudaErrors(
      cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));

  printf("Checking computed result for correctness: ");
  bool correct = true;

  // test relative error by the formula
  //     |<x, y>_cpu - <x, y>_gpu| / <|x|, |y|>  <  eps
  double eps = 1.e-6;  // machine zero

  for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++) {
    double abs_err = fabs(h_C[i] - (dimsA.x * valB));
    double dot_length = dimsA.x;
    double abs_val = fabs(h_C[i]);
    double rel_err = abs_err / abs_val / dot_length;

    if (rel_err > eps) {
      printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i,
             h_C[i], dimsA.x * valB, eps);
      correct = false;
    }
  }

  printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

  // Clean up memory
  checkCudaErrors(cudaFreeHost(h_A));
  checkCudaErrors(cudaFreeHost(h_B));
  checkCudaErrors(cudaFreeHost(h_C));
  checkCudaErrors(cudaFree(d_A));
  checkCudaErrors(cudaFree(d_B));
  checkCudaErrors(cudaFree(d_C));
  checkCudaErrors(cudaEventDestroy(start));
  checkCudaErrors(cudaEventDestroy(stop));
  printf(
      "\nNOTE: The CUDA Samples are not meant for performance "
      "measurements. Results may vary when GPU Boost is enabled.\n");

  if (correct) {
    return EXIT_SUCCESS;
  } else {
    return EXIT_FAILURE;
  }
}

int main(int argc, char **argv) {
  printf("[globalToShmemAsyncCopy] - Starting...\n");

  if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
      checkCmdLineFlag(argc, (const char **)argv, "?")) {
    printf("Usage -device=n (n >= 0 for deviceID)\n");
    printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
    printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
    printf(
        "      -kernel=kernel_number (0 - AsyncCopyMultiStageLargeChunk; 1 - "
        "AsyncCopyLargeChunk)\n");
    printf(
        "                            (2 - AsyncCopyLargeChunkAWBarrier; 3 - "
        "AsyncCopyMultiStageSharedState)\n");
    printf(
        "                            (4 - AsyncCopyMultiStage; 5 - "
        "AsyncCopySingleStage; 6 - Naive without memcpy_async)\n");
    printf(
        "                            (7 - NaiveLargeChunk without "
        "memcpy_async)\n");
    printf(
        "  Note: Outer matrix dimensions of A & B matrices must be equal.\n");

    exit(EXIT_SUCCESS);
  }

  // This will pick the best possible CUDA capable device, otherwise
  // override the device ID based on input provided at the command line
  int dev = findCudaDevice(argc, (const char **)argv);

  int matrixBlock = 32;
  dim3 dimsA(10 * 4 * matrixBlock, 10 * 4 * matrixBlock, 1);
  dim3 dimsB(10 * 4 * matrixBlock, 10 * 4 * matrixBlock, 1);

  // width of Matrix A
  if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
    dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
  }

  // height of Matrix A
  if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
    dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
  }

  // width of Matrix B
  if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
    dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
  }

  // height of Matrix B
  if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
    dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
  }

  if (dimsA.x != dimsB.y) {
    printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
           dimsA.x, dimsB.y);
    exit(EXIT_FAILURE);
  }

  kernels selected_kernel = AsyncCopyMultiStageLargeChunk;

  // kernel to run - default (AsyncCopyMultiStageLargeChunk == 0)
  if (checkCmdLineFlag(argc, (const char **)argv, "kernel")) {
    int kernel_number =
        getCmdLineArgumentInt(argc, (const char **)argv, "kernel");
    if (kernel_number < 8) {
      selected_kernel = (kernels)kernel_number;
    } else {
      printf(
          "Error: kernel number should be between 0 and 7, you have entered "
          "%d\n",
          kernel_number);
      exit(EXIT_FAILURE);
    }
  }

  int major = 0;
  checkCudaErrors(
      cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
  if (major < 7) {
    printf("globalToShmemAsyncCopy requires SM 7.0 or higher. Exiting...\n");
    exit(EXIT_WAIVED);
  }

  printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x,
         dimsB.y);

  int matrix_result = MatrixMultiply(argc, argv, dimsA, dimsB, selected_kernel);

  exit(matrix_result);
}