diff --git a/Samples/3_CUDA_Features/bf16TensorCoreGemm/bf16TensorCoreGemm.cu b/Samples/3_CUDA_Features/bf16TensorCoreGemm/bf16TensorCoreGemm.cu index a89e4096..dc42ecbb 100644 --- a/Samples/3_CUDA_Features/bf16TensorCoreGemm/bf16TensorCoreGemm.cu +++ b/Samples/3_CUDA_Features/bf16TensorCoreGemm/bf16TensorCoreGemm.cu @@ -592,8 +592,8 @@ __global__ void simple_wmma_bf16gemm(__nv_bfloat16 *a, __nv_bfloat16 *b, float * int aCol = i; int aRow = warpM * M; - int bCol = i; - int bRow = warpN * N; + int bCol = warpN * N; + int bRow = i; // Bounds checking if (aRow < m_ld && aCol < k_ld && bRow < k_ld && bCol < n_ld) {