Bug 5742096: Use uint32_t instead of int to fix the signed integer overflow undefined behavior in the sample 0_Introduction/simpleOccupancy

This commit is contained in:
Shawn Zeng 2025-12-19 00:26:38 -08:00
parent 5b2dd19a21
commit 70fcdd353f

View File

@ -25,6 +25,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#include <cstdint>
#include <helper_cuda.h> // helper functions for CUDA error check #include <helper_cuda.h> // helper functions for CUDA error check
#include <iostream> #include <iostream>
@ -38,7 +39,7 @@ const int manualBlockSize = 32;
// execution configuration, including anything the launch configurator // execution configuration, including anything the launch configurator
// API suggests. // API suggests.
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
__global__ void square(int *array, int arrayCount) __global__ void square(uint32_t *array, int arrayCount)
{ {
extern __shared__ int dynamicSmem[]; extern __shared__ int dynamicSmem[];
int idx = threadIdx.x + blockIdx.x * blockDim.x; int idx = threadIdx.x + blockIdx.x * blockDim.x;
@ -99,7 +100,7 @@ static double reportPotentialOccupancy(void *kernel, int blockSize, size_t dynam
// This function configures the launch based on the "automatic" // This function configures the launch based on the "automatic"
// argument, records the runtime, and reports occupancy and runtime. // argument, records the runtime, and reports occupancy and runtime.
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
static int launchConfig(int *array, int arrayCount, bool automatic) static int launchConfig(uint32_t *array, int arrayCount, bool automatic)
{ {
int blockSize; int blockSize;
int minGridSize; int minGridSize;
@ -166,20 +167,20 @@ static int launchConfig(int *array, int arrayCount, bool automatic)
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
static int test(bool automaticLaunchConfig, const int count = 1000000) static int test(bool automaticLaunchConfig, const int count = 1000000)
{ {
int *array; uint32_t *array;
int *dArray; uint32_t *dArray;
int size = count * sizeof(int); int size = count * sizeof(uint32_t);
array = new int[count]; array = new uint32_t[count];
for (int i = 0; i < count; i += 1) { for (uint32_t i = 0; i < count; i += 1) {
array[i] = i; array[i] = i;
} }
checkCudaErrors(cudaMalloc(&dArray, size)); checkCudaErrors(cudaMalloc(&dArray, size));
checkCudaErrors(cudaMemcpy(dArray, array, size, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(dArray, array, size, cudaMemcpyHostToDevice));
for (int i = 0; i < count; i += 1) { for (uint32_t i = 0; i < count; i += 1) {
array[i] = 0; array[i] = 0;
} }
@ -189,8 +190,9 @@ static int test(bool automaticLaunchConfig, const int count = 1000000)
checkCudaErrors(cudaFree(dArray)); checkCudaErrors(cudaFree(dArray));
// Verify the return data // Verify the return data
// Both GPU and CPU use uint32_t * uint32_t, which has well-defined overflow behavior (modulo 2^32)
// //
for (int i = 0; i < count; i += 1) { for (uint32_t i = 0; i < count; i += 1) {
if (array[i] != i * i) { if (array[i] != i * i) {
std::cout << "element " << i << " expected " << i * i << " actual " << array[i] << std::endl; std::cout << "element " << i << " expected " << i * i << " actual " << array[i] << std::endl;
return 1; return 1;