Add TMA example

2025-10-16 18:08:58 +08:00 · 2023-06-30 17:39:53 +02:00 · 2023-06-30 17:39:53 +02:00 · 5925483b33
commit 5925483b33
parent 03309a2d42
3 changed files with 472 additions and 0 deletions
--- a/Samples/3_CUDA_Features/globalToShmemTMACopy/README.md
+++ b/Samples/3_CUDA_Features/globalToShmemTMACopy/README.md
@ -0,0 +1,49 @@
+# globalToShmemTMACopy - Global Memory to Shared Memory TMA Copy
+
+## Description
+
+This sample shows how to use the CUDA driver API and inline PTX assembly to copy
+a 2D tile of a tensor into shared memory. It also demonstrates arrive-wait
+barrier for synchronization. 
+
+## Key Concepts
+
+CUDA Runtime API, CUDA Driver API, PTX ISA, CPP11 CUDA
+
+## Supported SM Architectures
+
+This sample requires compute capability 9.0 or higher.
+
+[SM 9.0 ](https://developer.nvidia.com/cuda-gpus)
+
+## Supported OSes
+
+Linux, Windows, QNX
+
+## Supported CPU Architecture
+
+x86_64, ppc64le, armv7l, aarch64
+
+## CUDA APIs involved
+
+### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
+cudaMalloc, cudaMemcpy, cudaFree, cudaDeviceSynchronize
+
+### [CUDA Driver API](http://docs.nvidia.com/cuda/cuda-driver-api/index.html)
+cudaMalloc, cudaMemcpy, cudaFree, cudaDeviceSynchronize
+
+### [CUDA PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html)
+
+## Dependencies needed to build/run
+[CPP11](../../../README.md#cpp11)
+
+## Prerequisites
+
+Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Make sure the dependencies mentioned in [Dependencies]() section above are installed.
+
+## Build and Run
+
+
+## References (for more details)
+
--- a/Samples/3_CUDA_Features/globalToShmemTMACopy/globalToShmemTMACopy.cu
+++ b/Samples/3_CUDA_Features/globalToShmemTMACopy/globalToShmemTMACopy.cu
@ -0,0 +1,359 @@
+/* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+/**
+ * This sample demonstrates how to:
+ *
+ * - Create a TensorMap (TMA descriptor)
+ * - Load a 2D tile of data into shared memory
+ *
+ * Compile with:
+ *
+ * nvcc -arch sm_90 globalToShmemTMACopy.cu -o globalToShmemTMACopy
+ *
+ * It can be that the compiler issues the following note. This can be safely ignored.
+ *
+ *   note: the ABI for passing parameters with 64-byte alignment has changed in
+ *   GCC 4.6
+ *
+ */
+#include <cstdio>                      // fprintf
+#include <vector>                      // std::vector
+
+#include <cuda.h>                      // CUtensorMap
+#include <cuda_awbarrier_primitives.h> // __mbarrier_*
+
+#include "util.h"                      // CUDA_CHECK macro
+
+/*
+ * Constants.
+ */
+constexpr size_t W_global = 1024; // Width of tensor (in # elements)
+constexpr size_t H_global = 1024; // Height of tensor (in # elements)
+
+constexpr int SMEM_W = 32;     // Width of shared memory buffer (in # elements)
+constexpr int SMEM_H = 8;      // Height of shared memory buffer (in # elements)
+
+/*
+ * CUDA Driver API
+ */
+
+// The type of the cuTensorMapEncodeTiled function.
+using cuTensorMapEncodeTiled_t = decltype(cuTensorMapEncodeTiled);
+
+// Get function pointer to driver API cuTensorMapEncodeTiled
+cuTensorMapEncodeTiled_t * get_cuTensorMapEncodeTiled() {
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html
+  void* cuda_ptr = nullptr;
+  unsigned long long flags = cudaEnableDefault;
+
+  // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DRIVER__ENTRY__POINT.html
+  CUDA_CHECK(cudaGetDriverEntryPoint("cuTensorMapEncodeTiled", &cuda_ptr, flags));
+
+  return reinterpret_cast<cuTensorMapEncodeTiled_t*>(cuda_ptr);
+}
+
+/*
+ * PTX wrappers
+ */
+
+inline __device__ __mbarrier_token_t barrier_arrive1_tx(
+  __mbarrier_t *barrier, uint32_t expected_tx_count
+)
+{
+  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive
+  __mbarrier_token_t token;
+
+  asm volatile("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;"
+               : "=l"(token)
+               : "r"(static_cast<unsigned int>(__cvta_generic_to_shared(barrier))), "r"(expected_tx_count)
+               : "memory");
+  return token;
+}
+
+inline __device__ bool barrier_try_wait_token(__mbarrier_t *barrier, __mbarrier_token_t token)
+{
+  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait
+  //
+  // This function returns a bool, so that software can retry.
+  //
+  //  The HW only provides best-effort waiting support. The wait time is limited
+  //  by the HW capability, after which a fail occurs, in which case the SW is
+  //  responsible for retrying.
+  int __ready;
+  asm volatile("{\n\t"
+               ".reg .pred p;\n\t"
+               "mbarrier.try_wait.acquire.cta.shared::cta.b64 p, [%1], %2;\n\t"
+               "selp.b32 %0, 1, 0, p;\n\t"
+               "}"
+               : "=r"(__ready)
+               : "r"(static_cast<unsigned int>(__cvta_generic_to_shared(barrier))),
+                 "l"(token)
+               : "memory");
+  return __ready;
+}
+
+inline __device__ void cp_async_bulk_tensor_2d(
+  __mbarrier_t *barrier, void *dst, int access_coord_x, int access_coord_y, const CUtensorMap *tensor_desc)
+{
+  unsigned smem_int_ptr = static_cast<unsigned int>(__cvta_generic_to_shared(dst));
+  unsigned smem_barrier_int_ptr = static_cast<unsigned int>(__cvta_generic_to_shared(barrier));
+  uint64_t tensor_desc_ptr = reinterpret_cast<uint64_t>(tensor_desc);
+
+  asm volatile(
+    "cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes "
+    "[%0], [%1, {%2, %3}], [%4];\n"
+    :
+    : "r"(smem_int_ptr),
+      "l"(tensor_desc_ptr),
+      "r"(access_coord_x),
+      "r"(access_coord_y),
+      "r"(smem_barrier_int_ptr)
+    : "memory");
+}
+
+// Layout of shared memory. It contains:
+//
+// - a buffer to hold a subset of a tensor,
+// - a shared memory barrier.
+template <int H, int W>
+struct smem_t {
+
+  // The destination shared memory buffer of a bulk tensor operation should be
+  // 128 byte aligned.
+  struct alignas(128) tensor_buffer {
+    int data[H][W];
+
+    __device__ constexpr int width() {return W;}
+    __device__ constexpr int height() {return H;}
+  };
+
+  tensor_buffer buffer;
+
+  // Put the barrier behind the tensor buffer to prevent 100+ bytes of padding.
+  __mbarrier_t bar;
+
+  __device__ constexpr int buffer_size_in_bytes() {
+    return sizeof(tensor_buffer::data);
+  }
+};
+
+
+/*
+ * Main kernel: takes a TMA descriptor and two coordinates.
+ *
+ * Loads a tile into shared memory using TMA and prints the tile.
+ *
+ */
+__global__ void kernel(const __grid_constant__ CUtensorMap tma_desc, int x_0, int y_0) {
+  /*
+   * ***NOTE***:
+
+     A CUtensorMap can only be passed as a `const __grid_constant__`
+     parameter. Passing a CUtensorMap in any other way from the host to
+     device can result in difficult if not impossible to debug failures.
+
+  */
+
+  // Declare shared memory to hold tensor buffer and shared memory barrier.
+  __shared__ smem_t<SMEM_H, SMEM_W> smem;
+
+  // Utility variable to elect a leader thread.
+  bool leader = threadIdx.x == 0;
+
+
+  if (leader) {
+    // Initialize barrier. We will participate in the barrier with `blockDim.x`
+    // threads.
+    __mbarrier_init(&smem.bar, blockDim.x);
+  }
+  // Syncthreads so initialized barrier is visible to all threads.
+  __syncthreads();
+
+
+  // This token is created when arriving on the shared memory barrier. It is
+  // used again when waiting on the barrier.
+  __mbarrier_token_t token;
+
+  // Load first  batch
+  if (leader) {
+    // Initiate bulk tensor copy.
+    cp_async_bulk_tensor_2d(&smem.bar, &smem.buffer.data, x_0, y_0, &tma_desc);
+    // Arrive with arrival count of 1 and expected transaction count equal to
+    // the number of bytes that are copied by cp_async_bulk_tensor_2d.
+    token = barrier_arrive1_tx(&smem.bar, smem.buffer_size_in_bytes());
+  } else {
+    // Other threads arrive with arrival count of 1 and expected tx count of 0.
+    token = barrier_arrive1_tx(&smem.bar, 0);
+  }
+
+  // The barrier will flip when the following two conditions have been met:
+  //
+  // - Its arrival count reaches blockDim.x (see __mbarrier_init above).
+  //   Typically, each thread will arrive with an arrival count of one so this
+  //   indicates that all threads have arrived.
+  //
+  // - Its expected transaction count reaches smem.buffer_size_in_bytes(). The
+  //   bulk tensor operation will increment the transaction count as it copies
+  //   bytes.
+
+  // Wait for barrier to flip. Try_wait puts the thread to sleep while waiting.
+  // It is woken up when the barrier flips or when a hardware-defined number of
+  // clock cycles have passed. In the second case, we retry waiting.
+  while(! barrier_try_wait_token(&smem.bar, token)) { };
+
+  // From this point onwards, the data in smem.buffer is readable by all threads
+  // participating the in the barrier.
+
+  // Print the data:
+  if (leader) {
+    printf("\n\nPrinting tile at coordinates x0 = %d, y0 = %d\n", x_0, y_0);
+
+    // Print global x coordinates
+    printf("global->\t");
+    for (int x = 0; x < smem.buffer.width(); ++x) {
+      printf("[%4d] ", x_0 + x);
+    }
+    printf("\n");
+
+    // Print local x coordinates
+    printf("local ->\t");
+    for (int x = 0; x < smem.buffer.width(); ++x) {
+      printf("[%4d] ", x);
+    }
+    printf("\n");
+
+    for (int y = 0; y < smem.buffer.height(); ++y) {
+      // Print global and local y coordinates
+      printf("[%4d] [%2d]\t", y_0 + y, y);
+      for (int x = 0; x < smem.buffer.width(); ++x) {
+        printf(" %4d  ", smem.buffer.data[y][x]);
+      }
+      printf("\n");
+    }
+
+    // Invalidate barrier. If further computations were to take place in the
+    // kernel, this allows the memory location of the shared memory barrier to
+    // be repurposed.
+    __mbarrier_inval(&smem.bar);
+  }
+}
+
+int main(int argc, char **argv) {
+
+  // Create a 2D tensor in GPU global memory containing linear indices 0, 1, 2, ... .
+  // The data layout is row-major.
+
+  // First fill in a vector on the host.
+  std::vector<int> tensor_host(H_global * W_global);
+  for (int i = 0; i < H_global * W_global; ++i) {
+    tensor_host[i] = i;
+  }
+
+  // Move it to device
+  int * tensor = nullptr;
+  CUDA_CHECK(cudaMalloc(&tensor, H_global * W_global * sizeof(int)));
+  CUDA_CHECK(cudaMemcpy(tensor, tensor_host.data(), H_global * W_global * sizeof(int), cudaMemcpyHostToDevice));
+
+  // Set up parameters to create TMA descriptor.
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html
+
+  CUtensorMap tma_desc{};
+  CUtensorMapDataType dtype = CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_INT32;
+  auto rank = 2;
+  uint64_t size[rank] = {W_global, H_global};
+  // The stride is the number of bytes to traverse from the first element of one row to the next.
+  // It must be a multiple of 16.
+  uint64_t stride[rank - 1] = {W_global * sizeof(int)};
+  // The box_size is the size of the shared memory buffer that is used as the destination of a TMA transfer.
+  uint32_t box_size[rank] = {SMEM_W, SMEM_H};
+  // The distance between elements in units of sizeof(element). A stride of 2
+  // can be used to load only the real component of a complex-valued tensor, for instance.
+  uint32_t elem_stride[rank] = {1, 1};
+  // Interleave patterns are sometimes used to accelerate loading of values that
+  // are less than 4 bytes long.
+  CUtensorMapInterleave interleave = CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE;
+  // Swizzling can be used to avoid shared memory bank conflicts.
+  CUtensorMapSwizzle swizzle = CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_NONE;
+  CUtensorMapL2promotion l2_promotion = CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE;
+  // Any element that is outside of bounds will be set to zero by the TMA transfer.
+  CUtensorMapFloatOOBfill oob_fill = CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE;
+
+  // Get a function pointer to the cuTensorMapEncodeTiled driver API.
+  auto cuTensorMapEncodeTiled = get_cuTensorMapEncodeTiled();
+
+  // Create the tensor descriptor.
+  CUresult res = cuTensorMapEncodeTiled(
+      &tma_desc,    // CUtensorMap *tensorMap,
+      dtype,        // CUtensorMapDataType tensorDataType,
+      rank,         // cuuint32_t tensorRank,
+      tensor,       // void *globalAddress,
+      size,         // const cuuint64_t *globalDim,
+      stride,       // const cuuint64_t *globalStrides,
+      box_size,     // const cuuint32_t *boxDim,
+      elem_stride,  // const cuuint32_t *elementStrides,
+      interleave,   // CUtensorMapInterleave interleave,
+      swizzle,      // CUtensorMapSwizzle swizzle,
+      l2_promotion, // CUtensorMapL2promotion l2Promotion,
+      oob_fill      // CUtensorMapFloatOOBfill oobFill);
+    );
+  // Print the result. Should be zero.
+  printf("cuTensorMapEncodeTiled returned CUresult: %d\n\n", res);
+
+  CUDA_CHECK(cudaDeviceSynchronize());
+
+  dim3 grid(1);
+  dim3 block(128);
+
+  printf("Print the top right corner tile of the tensor:\n");
+  kernel<<<grid, block>>>(tma_desc, 0, 0);
+  CUDA_CHECK(cudaDeviceSynchronize());
+
+  printf("Negative indices work:\n");
+  kernel<<<grid, block>>>(tma_desc, -4, 0);
+  CUDA_CHECK(cudaDeviceSynchronize());
+
+  printf("When the indices are out of bounds, the shared memory buffer is filled with zeros:\n");
+  kernel<<<grid, block>>>(tma_desc, W_global, H_global);
+  CUDA_CHECK(cudaDeviceSynchronize());
+
+  printf(
+    "\nCare must be taken to ensure that the coordinates result in a memory offset\n"
+    "that is aligned to 16 bytes. With 32 bit integer elements, x coordinates\n"
+    "that are not a multiple of 4 result in a non-recoverable error:\n"
+  );
+  kernel<<<grid, block>>>(tma_desc, 1, 0);
+  CUDA_REPORT(cudaDeviceSynchronize());
+  kernel<<<grid, block>>>(tma_desc, 2, 0);
+  CUDA_REPORT(cudaDeviceSynchronize());
+  kernel<<<grid, block>>>(tma_desc, 3, 0);
+  CUDA_REPORT(cudaDeviceSynchronize());
+
+  CUDA_CHECK(cudaFree(tensor));
+  return 0;
+}
--- a/Samples/3_CUDA_Features/globalToShmemTMACopy/util.h
+++ b/Samples/3_CUDA_Features/globalToShmemTMACopy/util.h
@ -0,0 +1,64 @@
+/* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include <cuda_runtime.h>
+#include <cstdio>
+#include <cstdlib>
+
+// Macro for checking cuda errors following a cuda launch or api call
+#define CUDA_CHECK(ans)                       \
+    {                                         \
+        gpuAssert((ans), __FILE__, __LINE__); \
+    }
+
+inline void gpuAssert(cudaError_t code, const char *file, int line)
+{
+    if (code != cudaSuccess)
+    {
+        fprintf(stderr, "CUDA error: %s %s %d\n", cudaGetErrorString(code), file,
+                line);
+        exit(code);
+    }
+}
+
+
+// Macro to report cuda errors following a cuda launch or api call
+#define CUDA_REPORT(ans)                       \
+    {                                         \
+        gpuReport((ans), __FILE__, __LINE__); \
+    }
+
+inline void gpuReport(cudaError_t code, const char *file, int line)
+{
+    if (code != cudaSuccess)
+    {
+        fprintf(stderr, "CUDA error: %s %s %d\n", cudaGetErrorString(code), file,
+                line);
+    }
+}