Fixing correctness of bf16TensorCoreGemm

2026-03-27 04:35:41 +08:00 · 2024-05-26 18:30:30 +08:00 · 2024-05-26 18:30:30 +08:00 · fb9f77575a
commit fb9f77575a
parent 5f97d7d0df
1 changed file with 2 additions and 2 deletions
--- a/Samples/3_CUDA_Features/bf16TensorCoreGemm/bf16TensorCoreGemm.cu
+++ b/Samples/3_CUDA_Features/bf16TensorCoreGemm/bf16TensorCoreGemm.cu
@@ -592,8 +592,8 @@ __global__ void simple_wmma_bf16gemm(__nv_bfloat16 *a, __nv_bfloat16 *b, float *
      int aCol = i; 
      int aRow = warpM * M;

-      int bCol = i;
-      int bRow = warpN * N;
+      int bCol = warpN * N;
+      int bRow = i;

      // Bounds checking
      if (aRow < m_ld && aCol < k_ld && bRow < k_ld && bCol < n_ld) {