From b530f1cf426ead6bdebd2d162257a713fc773d4c Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Mon, 5 May 2025 17:43:23 +0200 Subject: [PATCH] Fix bug in 6_Performance/transpose: copy sharedmem kernel (#363) Update kernel loop bounds handling, main loop data copy to avoid incorrect reuse of output results. --------- Authored-by: Francesco Rizzi --- Samples/6_Performance/transpose/transpose.cu | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/Samples/6_Performance/transpose/transpose.cu b/Samples/6_Performance/transpose/transpose.cu index 068aa02e..8a1d8bec 100644 --- a/Samples/6_Performance/transpose/transpose.cu +++ b/Samples/6_Performance/transpose/transpose.cu @@ -103,7 +103,7 @@ __global__ void copySharedMem(float *odata, float *idata, int width, int height) for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { if (xIndex < width && yIndex < height) { - tile[threadIdx.y][threadIdx.x] = idata[index]; + tile[threadIdx.y + i][threadIdx.x] = idata[index + i * width]; } } @@ -111,7 +111,7 @@ __global__ void copySharedMem(float *odata, float *idata, int width, int height) for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { if (xIndex < height && yIndex < width) { - odata[index] = tile[threadIdx.y][threadIdx.x]; + odata[index + i * width] = tile[threadIdx.y + i][threadIdx.x]; } } } @@ -596,6 +596,18 @@ int main(int argc, char **argv) (size_x * size_y), 1, TILE_DIM * BLOCK_ROWS); + + // Reset d_odata to zero before starting the next loop iteration to avoid + // carrying over results from previous kernels. Without this reset, residual + // data from a prior kernel (e.g., 'copy') could make a subsequent + // kernel (e.g., 'copySharedMem') appear correct even if it performs no work, + // leading to false positives in compareData. + for (int i = 0; i < (size_x * size_y); ++i) { + h_odata[i] = 0; + } + // copy host data to device + checkCudaErrors( + cudaMemcpy(d_odata, h_odata, mem_size, cudaMemcpyHostToDevice)); } // cleanup