Resolve merge between 13.0 and 13.1 branches

2026-07-16 21:06:52 +08:00 · 2025-09-05 09:34:27 -07:00 · 2025-09-05 09:34:27 -07:00 · 320c7e6392
commit 320c7e6392
parent 22fafb0a7c
1 changed files with 13 additions and 0 deletions
--- a/Samples/6_Performance/transpose/transpose.cu
+++ b/Samples/6_Performance/transpose/transpose.cu
@@ -104,6 +104,7 @@ __global__ void copySharedMem(float *odata, float *idata, int width, int height)
    for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
        if (xIndex < width && yIndex < height) {
            tile[threadIdx.y + i][threadIdx.x] = idata[index + i * width];
+            tile[threadIdx.y + i][threadIdx.x] = idata[index + i * width];
        }
    }

@@ -112,6 +113,7 @@ __global__ void copySharedMem(float *odata, float *idata, int width, int height)
    for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
        if (xIndex < height && yIndex < width) {
            odata[index + i * width] = tile[threadIdx.y + i][threadIdx.x];
+            odata[index + i * width] = tile[threadIdx.y + i][threadIdx.x];
        }
    }
 }
@@ -607,6 +609,17 @@ int main(int argc, char **argv)
        }
        // copy host data to device
        checkCudaErrors(cudaMemcpy(d_odata, h_odata, mem_size, cudaMemcpyHostToDevice));
+
+        // Reset d_odata to zero before starting the next loop iteration to avoid
+        // carrying over results from previous kernels. Without this reset, residual
+        // data from a prior kernel (e.g., 'copy') could make a subsequent
+        // kernel (e.g., 'copySharedMem') appear correct even if it performs no work,
+        // leading to false positives in compareData.
+        for (int i = 0; i < (size_x * size_y); ++i) {
+            h_odata[i] = 0;
+        }
+        // copy host data to device
+        checkCudaErrors(cudaMemcpy(d_odata, h_odata, mem_size, cudaMemcpyHostToDevice));
    }

    // cleanup