From b530f1cf426ead6bdebd2d162257a713fc773d4c Mon Sep 17 00:00:00 2001
From: Francesco Rizzi <fr.francescorizzi@gmail.com>
Date: Mon, 5 May 2025 17:43:23 +0200
Subject: [PATCH] Fix bug in 6_Performance/transpose: copy sharedmem kernel
 (#363)

Update kernel loop bounds handling, main loop data copy to avoid incorrect reuse of output results.

---------

Authored-by: Francesco Rizzi <francesco.rizzi@ng-analytics.com>
---
 Samples/6_Performance/transpose/transpose.cu | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/Samples/6_Performance/transpose/transpose.cu b/Samples/6_Performance/transpose/transpose.cu
index 068aa02e..8a1d8bec 100644
--- a/Samples/6_Performance/transpose/transpose.cu
+++ b/Samples/6_Performance/transpose/transpose.cu
@@ -103,7 +103,7 @@ __global__ void copySharedMem(float *odata, float *idata, int width, int height)
 
     for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
         if (xIndex < width && yIndex < height) {
-            tile[threadIdx.y][threadIdx.x] = idata[index];
+            tile[threadIdx.y + i][threadIdx.x] = idata[index + i * width];
         }
     }
 
@@ -111,7 +111,7 @@ __global__ void copySharedMem(float *odata, float *idata, int width, int height)
 
     for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
         if (xIndex < height && yIndex < width) {
-            odata[index] = tile[threadIdx.y][threadIdx.x];
+            odata[index + i * width] = tile[threadIdx.y + i][threadIdx.x];
         }
     }
 }
@@ -596,6 +596,18 @@ int main(int argc, char **argv)
                (size_x * size_y),
                1,
                TILE_DIM * BLOCK_ROWS);
+
+        // Reset d_odata to zero before starting the next loop iteration to avoid 
+        // carrying over results from previous kernels. Without this reset, residual 
+        // data from a prior kernel (e.g., 'copy') could make a subsequent 
+        // kernel (e.g., 'copySharedMem') appear correct even if it performs no work, 
+        // leading to false positives in compareData.
+        for (int i = 0; i < (size_x * size_y); ++i) {
+            h_odata[i] = 0;
+        }
+        // copy host data to device
+        checkCudaErrors(
+            cudaMemcpy(d_odata, h_odata, mem_size, cudaMemcpyHostToDevice));
     }
 
     // cleanup