cuda-samples/Samples/alignedTypes/alignedTypes.cu

315 lines
10 KiB
Plaintext
Raw Normal View History

2021-10-21 19:04:49 +08:00
/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* This is a simple test showing the large access-speed gap
* between aligned and misaligned structures
* (those declared with or without the __align__ keyword).
* It measures per-element copy throughput for
* aligned and misaligned structures on
* big chunks of data.
*/
// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// includes, project
#include <helper_cuda.h> // helper functions for CUDA error checking and initialization
#include <helper_functions.h> // helper utility functions
////////////////////////////////////////////////////////////////////////////////
// Misaligned types
// Declared without __align__, so each structure only carries the natural
// alignment of its members.
////////////////////////////////////////////////////////////////////////////////
typedef unsigned char uint8;
typedef unsigned short int uint16;

// Four 8-bit channels (4 bytes of payload).
struct RGBA8_misaligned {
  unsigned char r;
  unsigned char g;
  unsigned char b;
  unsigned char a;
};

// Two 32-bit channels (8 bytes of payload).
struct LA32_misaligned {
  unsigned int l;
  unsigned int a;
};

// Three 32-bit channels (12 bytes of payload).
struct RGB32_misaligned {
  unsigned int r;
  unsigned int g;
  unsigned int b;
};

// Four 32-bit channels (16 bytes of payload).
struct RGBA32_misaligned {
  unsigned int r;
  unsigned int g;
  unsigned int b;
  unsigned int a;
};
////////////////////////////////////////////////////////////////////////////////
// Aligned types
// Same payloads as the misaligned variants, but with an explicit __align__
// requirement on every structure.
////////////////////////////////////////////////////////////////////////////////

// Four 8-bit channels, aligned to 4 bytes.
struct __align__(4) RGBA8 {
  unsigned char r;
  unsigned char g;
  unsigned char b;
  unsigned char a;
};

// Plain 32-bit scalar, used as the 4-byte reference element.
typedef unsigned int I32;

// Two 32-bit channels, aligned to 8 bytes.
struct __align__(8) LA32 {
  unsigned int l;
  unsigned int a;
};

// Three 32-bit channels, aligned to 16 bytes (12 bytes of payload + padding).
struct __align__(16) RGB32 {
  unsigned int r;
  unsigned int g;
  unsigned int b;
};

// Four 32-bit channels, aligned to 16 bytes.
struct __align__(16) RGBA32 {
  unsigned int r;
  unsigned int g;
  unsigned int b;
  unsigned int a;
};
////////////////////////////////////////////////////////////////////////////////
// G80-class hardware natively supports global memory operations only on
// data elements of 4, 8 and 16 bytes. A structure larger than 16 bytes
// therefore cannot be read or written efficiently: more than one
// non-coalescable global load/store instruction is generated for it,
// even when __align__ is supplied.
// A "structure of arrays" layout offers the best performance in the
// general case. See section 5.1.2 of the Programming Guide.
////////////////////////////////////////////////////////////////////////////////

// Two RGBA32 values back to back, aligned to 16 bytes.
struct __align__(16) RGBA32_2 {
  RGBA32 c1;
  RGBA32 c2;
};
////////////////////////////////////////////////////////////////////////////////
// Common host and device functions
////////////////////////////////////////////////////////////////////////////////
// Integer division of a by b, bumped up by one whenever the division leaves
// a remainder. b must be non-zero.
int iDivUp(int a, int b) {
  const int quotient = a / b;
  const bool exact = (a % b == 0);
  return exact ? quotient : quotient + 1;
}
// Truncating integer division of a by b. b must be non-zero.
int iDivDown(int a, int b) {
  const int quotient = a / b;
  return quotient;
}
// Returns a unchanged when it is already a multiple of b; otherwise returns
// the next multiple of b above it. b must be non-zero.
int iAlignUp(int a, int b) {
  const int remainder = a % b;
  if (remainder == 0) {
    return a;
  }
  return a - remainder + b;
}
// Strips the remainder of a / b from a, yielding the closest multiple of b
// that does not exceed a (for non-negative a). b must be non-zero.
int iAlignDown(int a, int b) {
  const int remainder = a % b;
  return a - remainder;
}
////////////////////////////////////////////////////////////////////////////////
// Element-wise copy kernel.
// Every assignment moves one whole TData element, so for padded structures
// the copy is per-element rather than per-byte.
//
// Launch layout: one-dimensional grid and blocks (only the .x components are
// used). Each thread starts at its global index and then advances by the
// total number of launched threads, so any grid size covers all elements.
//
//   d_odata      destination array, at least numElements elements
//   d_idata      source array, at least numElements elements
//   numElements  number of TData elements to copy
////////////////////////////////////////////////////////////////////////////////
template <class TData>
__global__ void testKernel(TData *d_odata, TData *d_idata, int numElements) {
  // Total number of threads in the launch = distance between two elements
  // handled by the same thread.
  const int stride = blockDim.x * gridDim.x;

  // Global index of this thread = first element it copies.
  int pos = blockDim.x * blockIdx.x + threadIdx.x;

  while (pos < numElements) {
    d_odata[pos] = d_idata[pos];
    pos += stride;
  }
}
////////////////////////////////////////////////////////////////////////////////
// Host-side validation of the copy kernel.
// Only the leading packedElementSize bytes of each element are compared,
// i.e. the "packed" payload (number_of_fields * sizeof(simple_type)).
// Padding bytes behind the payload are merely placeholders without user data
// and their content after a copy is not defined, so they are ignored.
//
//   h_odata            array produced by the GPU
//   h_idata            reference input array
//   numElements        number of TData elements to compare
//   packedElementSize  number of leading bytes per element that hold data
//
// Returns 1 when every compared byte matches, 0 at the first mismatch.
////////////////////////////////////////////////////////////////////////////////
template <class TData>
int testCPU(TData *h_odata, TData *h_idata, int numElements,
            int packedElementSize) {
  // With nothing to compare per element, every element trivially matches.
  if (packedElementSize <= 0) {
    return 1;
  }

  const size_t payloadBytes = (size_t)packedElementSize;

  for (int pos = 0; pos < numElements; pos++) {
    const void *expected = (const void *)(h_idata + pos);
    const void *actual = (const void *)(h_odata + pos);

    if (memcmp(expected, actual, payloadBytes) != 0) {
      return 0;
    }
  }

  return 1;
}
////////////////////////////////////////////////////////////////////////////////
// Data configuration
////////////////////////////////////////////////////////////////////////////////
// Memory chunk size in bytes; the same chunk is reused by every test.
const int MEM_SIZE = 50000000;
// Number of kernel launches that are timed together and averaged.
const int NUM_ITERATIONS = 32;

// GPU input buffer.
unsigned char *d_idata;
// GPU output buffer.
unsigned char *d_odata;

// CPU copy of the input data.
unsigned char *h_idataCPU;
// CPU copy of the GPU output, read back for validation.
unsigned char *h_odataGPU;

// Timer shared by all tests.
StopWatchInterface *hTimer = NULL;
// Times and validates the element-wise copy kernel for one element type.
//
// The global device buffers are reinterpreted as arrays of TData, the copy
// kernel is launched NUM_ITERATIONS times between two device synchronizations,
// and the average time per launch plus the resulting throughput are printed
// (bytes copied divided by 1073741824 per second, labelled "GB/s").
// The output buffer is then read back and compared with the host input.
//
//   packedElementSize  number of payload bytes per element to validate
//   memory_size        size in bytes of the device and host buffers
//
// Returns 0 when validation succeeds and 1 when it fails.
template <class TData>
int runTest(int packedElementSize, int memory_size) {
  // Whole elements that fit in the buffer, and the bytes they occupy.
  const int elementCount = iDivDown(memory_size, sizeof(TData));
  const int copiedBytes = iAlignDown(memory_size, sizeof(TData));

  TData *const dst = (TData *)d_odata;
  TData *const src = (TData *)d_idata;

  // Start from a zeroed output buffer so stale data cannot pass validation.
  checkCudaErrors(cudaMemset(d_odata, 0, memory_size));

  // Make sure the device is idle before the timer starts.
  checkCudaErrors(cudaDeviceSynchronize());
  sdkResetTimer(&hTimer);
  sdkStartTimer(&hTimer);

  for (int iter = 0; iter < NUM_ITERATIONS; ++iter) {
    testKernel<TData><<<64, 256>>>(dst, src, elementCount);
    getLastCudaError("testKernel() execution failed\n");
  }

  // Wait for all launches to finish before stopping the timer.
  checkCudaErrors(cudaDeviceSynchronize());
  sdkStopTimer(&hTimer);

  // Average time of a single launch, in milliseconds.
  double avgTimeMs = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS;
  const double avgTimeSec = avgTimeMs * 0.001;
  const double throughput = (double)copiedBytes / (avgTimeSec * 1073741824.0);
  printf("Avg. time: %f ms / Copy throughput: %f GB/s.\n", avgTimeMs,
         throughput);

  // Read back GPU results and run validation.
  checkCudaErrors(
      cudaMemcpy(h_odataGPU, d_odata, memory_size, cudaMemcpyDeviceToHost));
  const bool passed = testCPU((TData *)h_odataGPU, (TData *)h_idataCPU,
                              elementCount, packedElementSize) != 0;

  if (passed) {
    printf("\tTEST OK\n");
  } else {
    printf("\tTEST FAILURE\n");
  }

  return passed ? 0 : 1;
}
// Program entry point.
//
// Selects a CUDA device, sizes the workload according to the number of cores
// on that device, allocates and fills the host/device buffers, and then runs
// the copy benchmark for every misaligned and aligned element type.
// Exits with EXIT_SUCCESS when all tests pass and EXIT_FAILURE otherwise
// (including when host memory cannot be allocated).
int main(int argc, char **argv) {
  int nTotalFailures = 0;
  cudaDeviceProp deviceProp;

  printf("[%s] - Starting...\n", argv[0]);

  // find first CUDA device
  const int devID = findCudaDevice(argc, (const char **)argv);

  // get number of SMs on this GPU
  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

  // Look up the cores-per-multiprocessor figure once and reuse it below.
  const int coresPerMP =
      _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
  printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n", deviceProp.name,
         deviceProp.multiProcessorCount, coresPerMP,
         coresPerMP * deviceProp.multiProcessorCount);

  // Anything that is less than 192 Cores will have a scaled down workload
  float scale_factor =
      max((192.0f / (coresPerMP * (float)deviceProp.multiProcessorCount)),
          1.0f);
  int MemorySize = (int)(MEM_SIZE / scale_factor) &
                   0xffffff00;  // force multiple of 256 bytes

  printf("> Compute scaling value = %4.2f\n", scale_factor);
  printf("> Memory Size = %d\n", MemorySize);

  sdkCreateTimer(&hTimer);

  printf("Allocating memory...\n");
  h_idataCPU = (unsigned char *)malloc(MemorySize);
  h_odataGPU = (unsigned char *)malloc(MemorySize);

  // Bail out before the buffers are written to if either host allocation
  // failed; free(NULL) is a no-op, so both pointers can be released blindly.
  if (h_idataCPU == NULL || h_odataGPU == NULL) {
    fprintf(stderr, "Failed to allocate %d bytes of host memory\n",
            MemorySize);
    free(h_odataGPU);
    free(h_idataCPU);
    sdkDeleteTimer(&hTimer);
    exit(EXIT_FAILURE);
  }

  checkCudaErrors(cudaMalloc((void **)&d_idata, MemorySize));
  checkCudaErrors(cudaMalloc((void **)&d_odata, MemorySize));

  printf("Generating host input data array...\n");

  // Repeating byte pattern; the value is truncated to unsigned char on store.
  for (int i = 0; i < MemorySize; i++) {
    h_idataCPU[i] = (i & 0xFF) + 1;
  }

  printf("Uploading input data to GPU memory...\n");
  checkCudaErrors(
      cudaMemcpy(d_idata, h_idataCPU, MemorySize, cudaMemcpyHostToDevice));

  // The first argument of each runTest call is the packed payload size of the
  // element type in bytes, i.e. how many bytes per element are validated.
  printf("Testing misaligned types...\n");
  printf("uint8...\n");
  nTotalFailures += runTest<uint8>(1, MemorySize);

  printf("uint16...\n");
  nTotalFailures += runTest<uint16>(2, MemorySize);

  printf("RGBA8_misaligned...\n");
  nTotalFailures += runTest<RGBA8_misaligned>(4, MemorySize);

  printf("LA32_misaligned...\n");
  nTotalFailures += runTest<LA32_misaligned>(8, MemorySize);

  printf("RGB32_misaligned...\n");
  nTotalFailures += runTest<RGB32_misaligned>(12, MemorySize);

  printf("RGBA32_misaligned...\n");
  nTotalFailures += runTest<RGBA32_misaligned>(16, MemorySize);

  printf("Testing aligned types...\n");
  printf("RGBA8...\n");
  nTotalFailures += runTest<RGBA8>(4, MemorySize);

  printf("I32...\n");
  nTotalFailures += runTest<I32>(4, MemorySize);

  printf("LA32...\n");
  nTotalFailures += runTest<LA32>(8, MemorySize);

  printf("RGB32...\n");
  nTotalFailures += runTest<RGB32>(12, MemorySize);

  printf("RGBA32...\n");
  nTotalFailures += runTest<RGBA32>(16, MemorySize);

  printf("RGBA32_2...\n");
  nTotalFailures += runTest<RGBA32_2>(32, MemorySize);

  printf("\n[alignedTypes] -> Test Results: %d Failures\n", nTotalFailures);

  printf("Shutting down...\n");
  checkCudaErrors(cudaFree(d_idata));
  checkCudaErrors(cudaFree(d_odata));
  free(h_odataGPU);
  free(h_idataCPU);

  sdkDeleteTimer(&hTimer);

  if (nTotalFailures != 0) {
    printf("Test failed!\n");
    exit(EXIT_FAILURE);
  }

  printf("Test passed\n");
  exit(EXIT_SUCCESS);
}