cuda-samples/Samples/matrixMul_nvrtc/matrixMul_kernel.cu

/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/**
 * Matrix multiplication: C = A * B.
 * Device code.
 *
 * This sample implements matrix multiplication as described in Chapter 3
 * of the programming guide.
 * It has been written for clarity of exposition to illustrate various CUDA
 * programming principles, not with the goal of providing the most
 * performant generic kernel for matrix multiplication.
 *
 * See also:
 * V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
 * in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08),
 * Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11.
 */

/**
 * Matrix multiplication (CUDA Kernel) on the device: C = A * B
 * wA is A's width and wB is B's width
 */

#include <cooperative_groups.h>

/**
 * Computes one BLOCK_SIZE x BLOCK_SIZE tile of C = A * B per thread block,
 * staging tiles of A and B through shared memory.
 *
 * All matrices are dense and row-major: A has wA columns, B has wA rows and
 * wB columns, and C has wB columns.
 *
 * Launch requirements:
 *  - blockDim must be (BLOCK_SIZE, BLOCK_SIZE); threadIdx.x/y index the tiles
 *    directly.
 *  - blockIdx.x selects the tile column of C, blockIdx.y the tile row.
 *  - 2 * BLOCK_SIZE * BLOCK_SIZE floats of static shared memory are used.
 *  - The number of rows of A and C is not passed in, so rows cannot be
 *    bounds-checked here: gridDim.y * BLOCK_SIZE must not exceed that row
 *    count.
 *
 * wA and wB do not have to be multiples of BLOCK_SIZE: tile elements that
 * fall outside the matrices are loaded as zero and out-of-range columns of C
 * are not written.
 *
 * @param C   output matrix (device memory)
 * @param A   left operand (device memory)
 * @param B   right operand (device memory)
 * @param wA  width of A (also the height of B)
 * @param wB  width of B (also the width of C)
 */
template <int BLOCK_SIZE>
__device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB) {
  // Handle to thread block group
  cooperative_groups::thread_block cta =
      cooperative_groups::this_thread_block();

  // Thread index inside the tile
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;

  // Row and column of the element of C owned by this thread
  const int row = BLOCK_SIZE * blockIdx.y + ty;
  const int col = BLOCK_SIZE * blockIdx.x + tx;

  // Threads whose column lies past the right edge of B/C still take part in
  // every barrier below; they just never touch global memory for B or C.
  const bool colInRange = col < wB;

  // Linear offsets are computed in 64 bits: products such as row * wA
  // overflow a 32-bit int once a matrix holds more than 2^31 elements.
  const long long aRowOffset = static_cast<long long>(row) * wA;

  // Shared-memory staging tiles for the current sub-matrices of A and B
  __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

  // Csub accumulates the element of the block sub-matrix
  // that is computed by the thread
  float Csub = 0.0f;

  // Walk the shared dimension one tile at a time
  for (int kBase = 0; kBase < wA; kBase += BLOCK_SIZE) {
    const int aCol = kBase + tx;  // column of A loaded by this thread
    const int bRow = kBase + ty;  // row of B loaded by this thread

    // Each thread loads one element of each tile. Elements beyond the
    // shared dimension (or beyond B's last column) become zero so they
    // contribute nothing to the dot product.
    As[ty][tx] = (aCol < wA) ? A[aRowOffset + aCol] : 0.0f;
    Bs[ty][tx] = (bRow < wA && colInRange)
                     ? B[static_cast<long long>(bRow) * wB + col]
                     : 0.0f;

    // Synchronize to make sure the tiles are fully loaded
    cooperative_groups::sync(cta);

    // Multiply the two tiles together;
    // each thread computes one element of the block sub-matrix
#pragma unroll
    for (int k = 0; k < BLOCK_SIZE; ++k) {
      Csub += As[ty][k] * Bs[k][tx];
    }

    // Synchronize to make sure that the preceding computation is done
    // before the tiles are overwritten in the next iteration
    cooperative_groups::sync(cta);
  }

  // Write the result; each thread writes at most one element
  if (colInRange) {
    C[static_cast<long long>(row) * wB + col] = Csub;
  }
}

/**
 * Kernel entry point that runs the tiled multiply C = A * B with 16x16 tiles.
 *
 * Declared extern "C" so the symbol name is not C++-mangled (presumably so
 * the host can look the kernel up by its plain name -- confirm against the
 * loader). Simply forwards every argument to matrixMulCUDA<16>, which indexes
 * its 16x16 shared tiles with threadIdx.x / threadIdx.y.
 *
 * wA is A's width and wB is B's width.
 */
extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B,
                                                 int wA, int wB) {
  // Tile edge length selected by this entry point
  const int kTileEdge = 16;

  matrixMulCUDA<kTileEdge>(C, A, B, wA, wB);
}

/**
 * Kernel entry point that runs the tiled multiply C = A * B with 32x32 tiles.
 *
 * Declared extern "C" so the symbol name is not C++-mangled (presumably so
 * the host can look the kernel up by its plain name -- confirm against the
 * loader). Simply forwards every argument to matrixMulCUDA<32>, which indexes
 * its 32x32 shared tiles with threadIdx.x / threadIdx.y.
 *
 * wA is A's width and wB is B's width.
 */
extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B,
                                                 int wA, int wB) {
  // Tile edge length selected by this entry point
  const int kTileEdge = 32;

  matrixMulCUDA<kTileEdge>(C, A, B, wA, wB);
}
add and update samples for CUDA 11.5 2021-10-21 19:04:49 +08:00			`/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.`
			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* * Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* * Neither the name of NVIDIA CORPORATION nor the names of its`
			`* contributors may be used to endorse or promote products derived`
			`* from this software without specific prior written permission.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
			`* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*/`

			`/**`
			`* Matrix multiplication: C = A * B.`
			`* Host code.`
			`*`
			`* This sample implements matrix multiplication as described in Chapter 3`
			`* of the programming guide.`
			`* It has been written for clarity of exposition to illustrate various CUDA`
			`* programming principles, not with the goal of providing the most`
			`* performant generic kernel for matrix multiplication.`
			`*`
			`* See also:`
			`* V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"`
			`* in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08),`
			`* Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11.`
			`*/`

			`/**`
			`* Matrix multiplication (CUDA Kernel) on the device: C = A * B`
			`* wA is A's width and wB is B's width`
			`*/`

			`#include <cooperative_groups.h>`

			`template <int BLOCK_SIZE>`
			`__device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB) {`
			`// Handle to thread block group`
			`cooperative_groups::thread_block cta =`
			`cooperative_groups::this_thread_block();`
			`// Block index`
			`int bx = blockIdx.x;`
			`int by = blockIdx.y;`

			`// Thread index`
			`int tx = threadIdx.x;`
			`int ty = threadIdx.y;`

			`// Index of the first sub-matrix of A processed by the block`
			`int aBegin = wA * BLOCK_SIZE * by;`

			`// Index of the last sub-matrix of A processed by the block`
			`int aEnd = aBegin + wA - 1;`

			`// Step size used to iterate through the sub-matrices of A`
			`int aStep = BLOCK_SIZE;`

			`// Index of the first sub-matrix of B processed by the block`
			`int bBegin = BLOCK_SIZE * bx;`

			`// Step size used to iterate through the sub-matrices of B`
			`int bStep = BLOCK_SIZE * wB;`

			`// Csub is used to store the element of the block sub-matrix`
			`// that is computed by the thread`
			`float Csub = 0;`

			`// Loop over all the sub-matrices of A and B`
			`// required to compute the block sub-matrix`
			`for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {`
			`// Declaration of the shared memory array As used to`
			`// store the sub-matrix of A`
			`__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];`

			`// Declaration of the shared memory array Bs used to`
			`// store the sub-matrix of B`
			`__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];`

			`// Load the matrices from device memory`
			`// to shared memory; each thread loads`
			`// one element of each matrix`
			`As[ty][tx] = A[a + wA * ty + tx];`
			`Bs[ty][tx] = B[b + wB * ty + tx];`

			`// Synchronize to make sure the matrices are loaded`
			`cooperative_groups::sync(cta);`

			`// Multiply the two matrices together;`
			`// each thread computes one element`
			`// of the block sub-matrix`
			`#pragma unroll`
			`for (int k = 0; k < BLOCK_SIZE; ++k) {`
			`Csub += As[ty][k] * Bs[k][tx];`
			`}`

			`// Synchronize to make sure that the preceding`
			`// computation is done before loading two new`
			`// sub-matrices of A and B in the next iteration`
			`cooperative_groups::sync(cta);`
			`}`

			`// Write the block sub-matrix to device memory;`
			`// each thread writes one element`
			`int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;`
			`C[c + wB * ty + tx] = Csub;`
			`}`

			`extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B,`
			`int wA, int wB) {`
			`matrixMulCUDA<16>(C, A, B, wA, wB);`
			`}`

			`extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B,`
			`int wA, int wB) {`
			`matrixMulCUDA<32>(C, A, B, wA, wB);`
			`}`