From 067cb6552367fba9750c1d4af3ce92115e10c69a Mon Sep 17 00:00:00 2001
From: Mahesh Doijade
Date: Wed, 3 Mar 2021 22:53:02 +0530
Subject: [PATCH] -- Add partitioned CUDA pipeline producer-consumer GEMM kernel
 -- Update the cudaCompressibleMemory sample to use the copy engine vs. SM
 writes depending on arch

---
 Samples/cudaCompressibleMemory/saxpy.cu       |  63 ++++--
 .../globalToShmemAsyncCopy.cu                 | 179 ++++++++++++++----
 2 files changed, 199 insertions(+), 43 deletions(-)

diff --git a/Samples/cudaCompressibleMemory/saxpy.cu b/Samples/cudaCompressibleMemory/saxpy.cu
index 1b8a61ec..ad148ae9 100644
--- a/Samples/cudaCompressibleMemory/saxpy.cu
+++ b/Samples/cudaCompressibleMemory/saxpy.cu
@@ -48,26 +48,47 @@ __global__ void saxpy(const float a, const float4 *x, const float4 *y, float4 *z
     }
 }
 
-__global__ void init(float4 *x, float4 *y, float4 *z, const float val, const size_t n)
+__global__ void init(float4 *x, float4 *y, const float val, const size_t n)
 {
     const float4 val4 = make_float4(val, val, val, val);
     for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) {
-        z[i] = x[i] = y[i] = val4;
+        x[i] = y[i] = val4;
     }
 }
 
-void launchSaxpy(const float a, float4 *x, float4 *y, float4 *z, const size_t n, const float init_val)
+void launchSaxpy(const float a, float4 *x, float4 *y, float4 *z, const size_t n, const float init_val, const bool compressibleZbuf)
 {
     cudaEvent_t start, stop;
     float ms;
     int blockSize;
     int minGridSize;
+    dim3 threads, blocks;
 
-    checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void*)init));
-    dim3 threads = dim3(blockSize, 1, 1);
-    dim3 blocks  = dim3(minGridSize, 1, 1);
-    init<<<blocks, threads>>>(x, y, z, init_val, n);
+    if (!compressibleZbuf)
+    {
+        // We are on a config where a compressible buffer can only be initialized through cudaMemcpy;
+        // hence, the x & y buffers are allocated as compressible and initialized via cudaMemcpy,
+        // whereas the z buffer is allocated as non-compressible.
+        float4 *h_x = (float4 *) malloc(sizeof(float4) * n);
+        float4 *h_y = (float4 *) malloc(sizeof(float4) * n);
+        for (int i = 0; i < n; i++)
+        {
+            h_x[i].x = h_x[i].y = h_x[i].z = h_x[i].w = init_val;
+            h_y[i].x = h_y[i].y = h_y[i].z = h_y[i].w = init_val;
+        }
+        checkCudaErrors(cudaMemcpy(x, h_x, sizeof(float4) * n, cudaMemcpyHostToDevice));
+        checkCudaErrors(cudaMemcpy(y, h_y, sizeof(float4) * n, cudaMemcpyHostToDevice));
+        free(h_x);
+        free(h_y);
+    }
+    else
+    {
+        checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void*)init));
+        threads = dim3(blockSize, 1, 1);
+        blocks  = dim3(minGridSize, 1, 1);
+        init<<<blocks, threads>>>(x, y, init_val, n);
+    }
 
     checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void*)saxpy));
     threads = dim3(blockSize, 1, 1);
@@ -121,19 +142,39 @@ int main(int argc, char **argv)
 
     printf("Generic memory compression support is available\n");
 
+    int major, minor;
+    checkCudaErrors(cuDeviceGetAttribute(&major,
+                    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
+                    currentDevice));
+    checkCudaErrors(cuDeviceGetAttribute(&minor,
+                    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
+                    currentDevice));
     float4 *x, *y, *z;
     const size_t size = n * sizeof(float4);
 
     // Allocating compressible memory
     checkCudaErrors(allocateCompressible((void **)&x, size, true));
     checkCudaErrors(allocateCompressible((void **)&y, size, true));
-    checkCudaErrors(allocateCompressible((void **)&z, size, true));
+    bool compressibleZbuf = false;
+    if ((major == 8 && minor == 0) || (major == 8 && minor == 6))
+    {
+        // On SM 8.0 and 8.6 GPUs a compressible buffer can only be initialized
+        // through cudaMemcpy.
+        printf("allocating non-compressible Z buffer\n");
+        checkCudaErrors(allocateCompressible((void **)&z, size, false));
+        compressibleZbuf = false;
+    }
+    else
+    {
+        checkCudaErrors(allocateCompressible((void **)&z, size, true));
+        compressibleZbuf = true;
+    }
 
     printf("Running saxpy on %zu bytes of Compressible memory\n", size);
 
     const float a = 1.0f;
     const float init_val = 1.0f;
-    launchSaxpy(a, x, y, z, n, init_val);
+    launchSaxpy(a, x, y, z, n, init_val, compressibleZbuf);
 
     checkCudaErrors(freeCompressible(x, size, true));
     checkCudaErrors(freeCompressible(y, size, true));
@@ -145,8 +186,8 @@ int main(int argc, char **argv)
     checkCudaErrors(allocateCompressible((void **)&y, size, false));
     checkCudaErrors(allocateCompressible((void **)&z, size, false));
 
-    launchSaxpy(a, x, y, z, n, init_val);
-
+    launchSaxpy(a, x, y, z, n, init_val, compressibleZbuf);
+
     checkCudaErrors(freeCompressible(x, size, false));
     checkCudaErrors(freeCompressible(y, size, false));
     checkCudaErrors(freeCompressible(z, size, false));
diff --git a/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy.cu b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy.cu
index 3dd705a7..ad0229a7 100644
--- a/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy.cu
+++ b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy.cu
@@ -47,6 +47,9 @@
 #if __CUDA_ARCH__ >= 700
 #include <cuda/barrier>
 #endif
+#include <cooperative_groups.h>
+
+namespace cg = cooperative_groups;
 
 // Helper functions and utilities to work with CUDA
 #include <helper_functions.h>
@@ -54,18 +57,19 @@
 
 enum kernels
 {
-    AsyncCopyMultiStageLargeChunk = 0,
-    AsyncCopyLargeChunk = 1,
-    AsyncCopyLargeChunkAWBarrier = 2,
-    AsyncCopyMultiStage = 3,
-    AsyncCopySingleStage = 4,
-    Naive = 5,
-    NaiveLargeChunk = 6
+    AsyncCopyMultiStageLargeChunk = 0,
+    AsyncCopyLargeChunk = 1,
+    AsyncCopyLargeChunkAWBarrier = 2,
+    AsyncCopyMultiStageSharedState = 3,
+    AsyncCopyMultiStage = 4,
+    AsyncCopySingleStage = 5,
+    Naive = 6,
+    NaiveLargeChunk = 7
 };
 
 const char* kernelNames[] = {"AsyncCopyMultiStageLargeChunk", "AsyncCopyLargeChunk",
-                             "AsyncCopyLargeChunkAWBarrier", "AsyncCopyMultiStage",
-                             "AsyncCopySingleStage", "Naive", "NaiveLargeChunk"};
+                             "AsyncCopyLargeChunkAWBarrier", "AsyncCopyMultiStageSharedState",
+                             "AsyncCopyMultiStage", "AsyncCopySingleStage", "Naive", "NaiveLargeChunk"};
 
 constexpr int blockSize = 16;
 
@@ -143,8 +147,8 @@ template <int BLOCK_SIZE_X> __global__ void MatrixMulAsyncCopyMultiStageLargeChunk
         }
 
         pipe.consumer_release();
-        // Don't have to synchronize because
-        // next iteration is loading to a different buffer
+        // Don't have to synchronize because maxPipelineStages is greater than one,
+        // so the next iteration loads into a different buffer.
     }
 
     // Write the block sub-matrix to device memory;
@@ -227,8 +231,8 @@ template <int BLOCK_SIZE_X> __global__ void MatrixMulAsyncCopyLargeChunk(float* __
         pipe.consumer_release();
 
         // Synchronize to make sure that the preceding
-        // computation is done before loading two new
-        // sub-matrices of A and B in the next iteration
+        // computation is done before overwriting the
+        // shared memory sub-matrix buffers As and Bs in the next iteration.
         __syncthreads();
     }
 
@@ -310,9 +314,9 @@ template <int BLOCK_SIZE_X> __global__ void MatrixMulAsyncCopyLargeChunkAWBarrier(
         }
 
         // Synchronize to make sure that the preceding
-        // computation is done before loading two new
-        // sub-matrices of A and B in the next iteration
-        __syncthreads();
+        // computation is done before overwriting the
+        // shared memory sub-matrix buffers As and Bs in the next iteration.
+        bar.arrive_and_wait();
     }
 
     // Write the block sub-matrix to device memory;
@@ -384,8 +388,8 @@ template <int BLOCK_SIZE_X> __global__ void MatrixMulAsyncCopySingleStage(float *C
         }
 
         // Synchronize to make sure that the preceding
-        // computation is done before loading two new
-        // sub-matrices of A and B in the next iteration
+        // computation is done before overwriting the
+        // shared memory sub-matrix buffers As and Bs in the next iteration.
         __syncthreads();
     }
 
@@ -395,7 +399,7 @@ template <int BLOCK_SIZE_X> __global__ void MatrixMulAsyncCopySingleStage(float *C
     C[c + wB * threadIdx.y + threadIdx.x] = Csub;
 }
 
-// Multi Stage memcpy_async pipeline with int copy
+// Multi Stage memcpy_async thread_scope_thread pipeline with single-element async-copy
 template <int BLOCK_SIZE_X> __global__ void MatrixMulAsyncCopyMultiStage(float* __restrict__ C,
                                                                          const float* __restrict__ A,
                                                                          const float* __restrict__ B, int wA,
@@ -467,8 +471,8 @@ template <int BLOCK_SIZE_X> __global__ void MatrixMulAsyncCopyMultiStage(float* __
         }
 
         pipe.consumer_release();
-        // Don't have to synchronize because
-        // next iteration is loading to a different buffer
+        // Don't have to synchronize because maxPipelineStages is greater than one,
+        // so the next iteration loads into a different buffer.
     }
 
     // Write the block sub-matrix to device memory;
@@ -477,6 +481,102 @@ template <int BLOCK_SIZE_X> __global__ void MatrixMulAsyncCopyMultiStage(float* __
     C[c + wB * threadIdx.y + threadIdx.x] = Csub;
 }
 
+// Multi-stage memcpy_async pipeline (shared state, thread_scope_block) with a
+// partitioned producer & consumer: one warp forms the producer group, which issues
+// the memcpy_async operations, and all remaining warps form the consumer group,
+// which performs the GEMM computation on the sub-matrices loaded by the producer.
+template <int BLOCK_SIZE_X> __global__ void MatrixMulAsyncCopyMultiStageSharedState(float* __restrict__ C,
+                                                                                    const float* __restrict__ A,
+                                                                                    const float* __restrict__ B, int wA,
+                                                                                    int wB) {
+    // Multi-stage pipeline version
+    constexpr size_t maxPipelineStages = 4;
+
+    // Declaration of the shared memory array As used to
+    // store the sub-matrix of A for each stage
+    __shared__ float As[maxPipelineStages][BLOCK_SIZE_X][BLOCK_SIZE_X];
+
+    // Declaration of the shared memory array Bs used to
+    // store the sub-matrix of B for each stage
+    __shared__ float Bs[maxPipelineStages][BLOCK_SIZE_X][BLOCK_SIZE_X];
+
+    float Csub = 0.0;
+
+    // Index of the first sub-matrix of A processed by the block
+    const int aBegin = wA * BLOCK_SIZE_X * blockIdx.y;
+
+    // Index of the last sub-matrix of A processed by the block
+    const int aEnd = aBegin + wA - 1;
+
+    // Step size used to iterate through the sub-matrices of A
+    constexpr int aStep = BLOCK_SIZE_X;
+
+    // Index of the first sub-matrix of B processed by the block
+    const int bBegin = BLOCK_SIZE_X * blockIdx.x;
+
+    // Step size used to iterate through the sub-matrices of B
+    int bStep = BLOCK_SIZE_X * wB;
+
+    auto cta = cg::this_thread_block();
+
+    const auto shape1 = cuda::aligned_size_t<alignof(float)>(sizeof(float));
+    __shared__ cuda::pipeline_shared_state<cuda::thread_scope_block, maxPipelineStages> shared_state;
+    constexpr int consumer_row_count = BLOCK_SIZE_X;
+
+    const auto thread_role = (cta.thread_index().y < consumer_row_count)
+                                 ? cuda::pipeline_role::consumer
+                                 : cuda::pipeline_role::producer;
+    auto pipe = cuda::make_pipeline(cta, &shared_state, thread_role);
+
+    // Loop over all the sub-matrices of A and B
+    // required to compute the block sub-matrix
+    for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin, iStage = 0;
+         a <= aEnd; a += aStep, b += bStep, ++i) {
+        if (threadIdx.y >= consumer_row_count) {
+            // This is the producer warp (threadIdx.y >= consumer_row_count == 16);
+            // it loads the sub-matrices from device memory to shared memory.
+            for (; aStage <= a + aStep * maxPipelineStages; aStage += aStep, bStage += bStep, ++iStage) {
+                if (aStage <= aEnd) {
+                    // Rotating buffer
+                    const int j = iStage % maxPipelineStages;
+                    const int strideRows = (blockDim.y - consumer_row_count);
+                    pipe.producer_acquire();
+                    for (int rowId = threadIdx.y - consumer_row_count; rowId < BLOCK_SIZE_X; rowId += strideRows) {
+                        cuda::memcpy_async(&As[j][rowId][threadIdx.x],
+                                           &A[aStage + wA * rowId + threadIdx.x], shape1, pipe);
+                        cuda::memcpy_async(&Bs[j][rowId][threadIdx.x],
+                                           &B[bStage + wB * rowId + threadIdx.x], shape1, pipe);
+                    }
+                    pipe.producer_commit();
+                }
+            }
+        }
+        else {
+            // These threads form the consumer group (threadIdx.y < consumer_row_count == 16); they
+            // perform the GEMM computation on the sub-matrices loaded into shared memory by the producer warp.
+            const int j = i % maxPipelineStages;
+            // Wait until the sub-matrices for this stage have been loaded by the producer group.
+            pipe.consumer_wait();
+            // Multiply the two matrices together;
+            // each thread computes one element
+            // of the block sub-matrix
+            #pragma unroll
+            for (int k = 0; k < BLOCK_SIZE_X; ++k) {
+                Csub += As[j][threadIdx.y][k] * Bs[j][k][threadIdx.x];
+            }
+            pipe.consumer_release();
+        }
+    }
+
+    // Write the block sub-matrix to device memory;
+    // each thread writes one element
+    if (threadIdx.y < consumer_row_count)
+    {
+        const int c = wB * BLOCK_SIZE_X * blockIdx.y + BLOCK_SIZE_X * blockIdx.x;
+        C[c + wB * threadIdx.y + threadIdx.x] = Csub;
+    }
+}
+
 /**
  * Matrix multiplication (CUDA Kernel) on the device: C = A * B
  * wA is A's width and wB is B's width
@@ -637,10 +737,12 @@ int MatrixMultiply(int argc, char **argv,
     // Allocate host memory for matrices A and B
     unsigned int size_A = dimsA.x * dimsA.y;
     unsigned int mem_size_A = sizeof(float) * size_A;
-    float *h_A = reinterpret_cast<float *>(malloc(mem_size_A));
+    float* h_A;
+    checkCudaErrors(cudaMallocHost(&h_A, mem_size_A));
     unsigned int size_B = dimsB.x * dimsB.y;
     unsigned int mem_size_B = sizeof(float) * size_B;
-    float *h_B = reinterpret_cast<float *>(malloc(mem_size_B));
+    float* h_B;
+    checkCudaErrors(cudaMallocHost(&h_B, mem_size_B));
     cudaStream_t stream;
 
     // Initialize host memory
@@ -654,7 +756,8 @@ int MatrixMultiply(int argc, char **argv,
     // Allocate host matrix C
     dim3 dimsC(dimsB.x, dimsA.y, 1);
     unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
-    float *h_C = reinterpret_cast<float *>(malloc(mem_size_C));
+    float* h_C;
+    checkCudaErrors(cudaMallocHost(&h_C, mem_size_C));
 
     if (h_C == NULL) {
         fprintf(stderr, "Failed to allocate host matrix C!\n");
@@ -680,6 +783,10 @@ int MatrixMultiply(int argc, char **argv,
 
     dim3 threads(blockSize, blockSize);
     dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
+    // Here the block size is 16x18, where the first 16 rows are the consumer thread group
+    // and the last 2 rows (1 warp) are the producer thread group.
+    dim3 threadsSharedStateKernel(blockSize, blockSize + 2, 1);
+    dim3 gridSharedStateKernel(dimsB.x / threadsSharedStateKernel.x, dimsA.y / threadsSharedStateKernel.x);
 
     printf("Running kernel = %d - %s\n", kernel_number, kernelNames[kernel_number]);
     // Create and start timer
@@ -698,6 +805,10 @@ int MatrixMultiply(int argc, char **argv,
         case AsyncCopyLargeChunkAWBarrier :
             MatrixMulAsyncCopyLargeChunkAWBarrier<blockSize><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
             break;
+        case AsyncCopyMultiStageSharedState :
+            MatrixMulAsyncCopyMultiStageSharedState<blockSize><<<gridSharedStateKernel, threadsSharedStateKernel, 0, stream>>>
+                                                    (d_C, d_A, d_B, dimsA.x, dimsB.x);
+            break;
         case AsyncCopyMultiStage :
             MatrixMulAsyncCopyMultiStage<blockSize><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
             break;
@@ -735,6 +846,10 @@ int MatrixMultiply(int argc, char **argv,
         case AsyncCopyLargeChunkAWBarrier :
             MatrixMulAsyncCopyLargeChunkAWBarrier<blockSize><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
             break;
+        case AsyncCopyMultiStageSharedState :
+            MatrixMulAsyncCopyMultiStageSharedState<blockSize><<<gridSharedStateKernel, threadsSharedStateKernel, 0, stream>>>
+                                                    (d_C, d_A, d_B, dimsA.x, dimsB.x);
+            break;
        case AsyncCopyMultiStage :
            MatrixMulAsyncCopyMultiStage<blockSize><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
            break;
@@ -801,15 +916,15 @@ int MatrixMultiply(int argc, char **argv,
     printf("%s\n", correct ?
"Result = PASS" : "Result = FAIL"); // Clean up memory - free(h_A); - free(h_B); - free(h_C); + checkCudaErrors(cudaFreeHost(h_A)); + checkCudaErrors(cudaFreeHost(h_B)); + checkCudaErrors(cudaFreeHost(h_C)); checkCudaErrors(cudaFree(d_A)); checkCudaErrors(cudaFree(d_B)); checkCudaErrors(cudaFree(d_C)); checkCudaErrors(cudaEventDestroy(start)); checkCudaErrors(cudaEventDestroy(stop)); - printf("\nNOTE: The CUDA Samples are not meant for performance"\ + printf("\nNOTE: The CUDA Samples are not meant for performance "\ "measurements. Results may vary when GPU Boost is enabled.\n"); if (correct) { @@ -829,9 +944,9 @@ int main(int argc, char **argv) { printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n"); printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n"); printf(" -kernel=kernel_number (0 - AsyncCopyMultiStageLargeChunk; 1 - AsyncCopyLargeChunk)\n"); - printf(" (2 - AsyncCopyLargeChunkAWBarrier; 3 - AsyncCopyMultiStage)\n"); - printf(" (4 - AsyncCopySingleStage; 5 - Naive without memcpy_async)\n"); - printf(" (6 - NaiveLargeChunk without memcpy_async)\n"); + printf(" (2 - AsyncCopyLargeChunkAWBarrier; 3 - AsyncCopyMultiStageSharedState)\n"); + printf(" (4 - AsyncCopyMultiStage; 5 - AsyncCopySingleStage; 6 - Naive without memcpy_async)\n"); + printf(" (7 - NaiveLargeChunk without memcpy_async)\n"); printf(" Note: Outer matrix dimensions of A & B matrices must be equal.\n"); exit(EXIT_SUCCESS); @@ -876,7 +991,7 @@ int main(int argc, char **argv) { // kernel to run - default (AsyncCopyMultiStageLargeChunk == 0) if (checkCmdLineFlag(argc, (const char **)argv, "kernel")) { int kernel_number = getCmdLineArgumentInt(argc, (const char **)argv, "kernel"); - if (kernel_number < 7) + if (kernel_number < 8) { selected_kernel = (kernels)kernel_number; }