Merge branch 'shawnz_bug_fix' into 'master'

Bug 5412815: Fix the issue in cudaTensorCoreGemm.cu. See merge request cuda-samples/cuda-samples!125
2026-01-07 09:47:50 +08:00 · 2025-07-28 10:18:27 -07:00 · 2025-07-28 10:18:27 -07:00 · a5267b83a5
commit a5267b83a5
parent 2ab16e6d15 b38ed29c95
1 changed file with 1 addition and 1 deletion
--- a/Samples/3_CUDA_Features/cudaTensorCoreGemm/cudaTensorCoreGemm.cu
+++ b/Samples/3_CUDA_Features/cudaTensorCoreGemm/cudaTensorCoreGemm.cu
@@ -224,7 +224,7 @@ __global__ void compute_gemm(const half *A, const half *B, const float *C, float
    // there's no such tile, all warps in this CTA exit.
    for (unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) {
        const unsigned int block_tile_i = ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES);
-        const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES;
+        const unsigned int block_tile_j = (block_pos * BLOCK_ROW_TILES) % N_TILES;

        // Stop when there are no more D matrix tiles to compute in this CTA.
        if (block_tile_i >= M_TILES) {