/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * This is a simple test showing huge access speed gap * between aligned and misaligned structures * (those having/missing __align__ keyword). * It measures per-element copy throughput for * aligned and misaligned structures on * big chunks of data. 
*/

// includes, system
// NOTE(review): the header names were missing from every #include directive
// in this copy of the file. <stdio.h> and <stdlib.h> are required by the
// printf/malloc/free/exit calls made in this file; <string.h> and <math.h>
// are presumed from the stock sample layout -- confirm against the build.
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// includes, project
// NOTE(review): project header names reconstructed from their trailing
// comments and from the helpers this file calls -- confirm.
#include <helper_cuda.h>  // helper functions for CUDA error checking and initialization
#include <helper_functions.h>  // helper utility functions

////////////////////////////////////////////////////////////////////////////////
// Misaligned types
// Declared without __align__, so each type only carries the alignment of its
// widest member.
////////////////////////////////////////////////////////////////////////////////
// 1-byte element
typedef unsigned char uint8;

// 2-byte element
typedef unsigned short int uint16;

// 4 x 8-bit channels
typedef struct {
  unsigned char r, g, b, a;
} RGBA8_misaligned;

// 2 x 32-bit channels
typedef struct {
  unsigned int l, a;
} LA32_misaligned;

// 3 x 32-bit channels
typedef struct {
  unsigned int r, g, b;
} RGB32_misaligned;

// 4 x 32-bit channels
typedef struct {
  unsigned int r, g, b, a;
} RGBA32_misaligned;

////////////////////////////////////////////////////////////////////////////////
// Aligned types
// Same field layouts as above, but with an explicit __align__ request.
////////////////////////////////////////////////////////////////////////////////
// 4 x 8-bit channels, 4-byte aligned
typedef struct __align__(4) {
  unsigned char r, g, b, a;
} RGBA8;

// Plain 32-bit element
typedef unsigned int I32;

// 2 x 32-bit channels, 8-byte aligned
typedef struct __align__(8) {
  unsigned int l, a;
} LA32;

// 3 x 32-bit channels (12 bytes of user data), 16-byte aligned
typedef struct __align__(16) {
  unsigned int r, g, b;
} RGB32;

// 4 x 32-bit channels, 16-byte aligned
typedef struct __align__(16) {
  unsigned int r, g, b, a;
} RGBA32;

////////////////////////////////////////////////////////////////////////////////
// Because G80 class hardware natively supports global memory operations
// only with data elements of 4, 8 and 16 bytes, a structure whose size
// exceeds 16 bytes can't be read or written efficiently:
// more than one non-coalescable global memory load/store instruction
// will be generated, even if the __align__ option is supplied.
// The "structure of arrays" storage strategy offers the best performance
// in the general case. See section 5.1.2 of the Programming Guide.
////////////////////////////////////////////////////////////////////////////////
// 32-byte element made of two RGBA32 values, declared with __align__(16).
typedef struct __align__(16) {
  RGBA32 c1, c2;
} RGBA32_2;

////////////////////////////////////////////////////////////////////////////////
// Common host and device functions
////////////////////////////////////////////////////////////////////////////////
// Round a / b to nearest higher integer value.
// b must be non-zero; the rounding direction is only "up" for a >= 0, b > 0,
// because integer division truncates toward zero.
int iDivUp(int a, int b) { return (a % b != 0) ? (a / b + 1) : (a / b); }

// Round a / b to nearest lower integer value (plain truncating division).
// b must be non-zero.
int iDivDown(int a, int b) { return a / b; }

// Align a to nearest higher multiple of b.
// b must be non-zero; returns a unchanged when it is already a multiple of b.
int iAlignUp(int a, int b) { return (a % b != 0) ? (a - a % b + b) : a; }

// Align a to nearest lower multiple of b.
// b must be non-zero.
int iAlignDown(int a, int b) { return a - a % b; }

////////////////////////////////////////////////////////////////////////////////
// Simple CUDA kernel.
// Copy is carried out on per-element basis,
// so it's not per-byte in case of padded structures.
//
// TData       element type being copied
// d_odata     destination array, at least numElements elements
// d_idata     source array, at least numElements elements
// numElements number of TData elements to copy
//
// Uses a 1D grid-stride loop, so any 1D launch configuration covers all
// numElements elements.
////////////////////////////////////////////////////////////////////////////////
template <class TData>
__global__ void testKernel(TData *d_odata, TData *d_idata, int numElements) {
  // Flat index of this thread within the whole grid
  const int tid = blockDim.x * blockIdx.x + threadIdx.x;
  // Total number of threads launched; used as the loop stride
  const int numThreads = blockDim.x * gridDim.x;

  for (int pos = tid; pos < numElements; pos += numThreads) {
    d_odata[pos] = d_idata[pos];
  }
}

////////////////////////////////////////////////////////////////////////////////
// Validation routine for simple copy kernel.
// We must know "packed" size of TData (number_of_fields * sizeof(simple_type))
// and compare only these "packed" parts of the structure,
// containing actual user data. The compiler behavior with padding bytes
// is undefined, since padding is merely a placeholder
// and doesn't contain any user data.
////////////////////////////////////////////////////////////////////////////////
// Compares the first packedElementSize bytes of every element of h_odata
// against the matching element of h_idata.
//
// TData             element type
// h_odata           host copy of the GPU output
// h_idata           host reference input
// numElements       number of TData elements to compare
// packedElementSize number of leading bytes per element to compare
//
// Returns 1 if all compared bytes match, 0 at the first mismatch.
template <class TData>
int testCPU(TData *h_odata, TData *h_idata, int numElements,
            int packedElementSize) {
  for (int pos = 0; pos < numElements; pos++) {
    TData src = h_idata[pos];
    TData dst = h_odata[pos];

    // Byte-wise comparison limited to the packed part of the element
    for (int i = 0; i < packedElementSize; i++)
      if (((char *)&src)[i] != ((char *)&dst)[i]) {
        return 0;
      }
  }

  return 1;
}

////////////////////////////////////////////////////////////////////////////////
// Data configuration
////////////////////////////////////////////////////////////////////////////////
// Memory chunk size in bytes. Reused for test
const int MEM_SIZE = 50000000;
// Number of kernel launches averaged per timing measurement
const int NUM_ITERATIONS = 32;

// GPU input and output data
unsigned char *d_idata, *d_odata;
// CPU input data and instance of GPU output data
unsigned char *h_idataCPU, *h_odataGPU;
// Timer shared by all runTest() instantiations; created and deleted in main()
StopWatchInterface *hTimer = NULL;

////////////////////////////////////////////////////////////////////////////////
// Times NUM_ITERATIONS launches of testKernel<TData> copying d_idata to
// d_odata, prints the average time and throughput, then validates the result
// against h_idataCPU.
//
// TData             element type to copy
// packedElementSize number of user-data bytes per element (see testCPU)
// memory_size       size in bytes of the global host/device buffers
//
// Returns 0 if validation passed, 1 if it failed.
////////////////////////////////////////////////////////////////////////////////
template <class TData>
int runTest(int packedElementSize, int memory_size) {
  // Bytes actually covered by whole TData elements; used for throughput
  const int totalMemSizeAligned = iAlignDown(memory_size, sizeof(TData));
  const int numElements = iDivDown(memory_size, sizeof(TData));

  // Clean output buffer before current test
  checkCudaErrors(cudaMemset(d_odata, 0, memory_size));
  // Run test
  checkCudaErrors(cudaDeviceSynchronize());
  sdkResetTimer(&hTimer);
  sdkStartTimer(&hTimer);

  for (int i = 0; i < NUM_ITERATIONS; i++) {
    testKernel<TData>
        <<<64, 256>>>((TData *)d_odata, (TData *)d_idata, numElements);
    getLastCudaError("testKernel() execution failed\n");
  }

  // Wait for all launches to finish before stopping the timer
  checkCudaErrors(cudaDeviceSynchronize());
  sdkStopTimer(&hTimer);
  double gpuTime = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS;
  printf("Avg. time: %f ms / Copy throughput: %f GB/s.\n", gpuTime,
         (double)totalMemSizeAligned / (gpuTime * 0.001 * 1073741824.0));

  // Read back GPU results and run validation
  checkCudaErrors(
      cudaMemcpy(h_odataGPU, d_odata, memory_size, cudaMemcpyDeviceToHost));
  int flag = testCPU((TData *)h_odataGPU, (TData *)h_idataCPU, numElements,
                     packedElementSize);

  printf(flag ? "\tTEST OK\n" : "\tTEST FAILURE\n");

  return !flag;
}

////////////////////////////////////////////////////////////////////////////////
// Program entry point: allocates and fills the buffers, runs the copy test
// for every misaligned and aligned element type, and exits with EXIT_SUCCESS
// only if every validation passed.
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  int i, nTotalFailures = 0;

  int devID;
  cudaDeviceProp deviceProp;
  printf("[%s] - Starting...\n", argv[0]);

  // find first CUDA device
  devID = findCudaDevice(argc, (const char **)argv);

  // get number of SMs on this GPU
  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
  const int coresPerSM =
      _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
  printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n", deviceProp.name,
         deviceProp.multiProcessorCount, coresPerSM,
         coresPerSM * deviceProp.multiProcessorCount);

  // Anything that is less than 192 Cores will have a scaled down workload
  float scale_factor =
      max((192.0f / (coresPerSM * (float)deviceProp.multiProcessorCount)),
          1.0f);

  int MemorySize = (int)(MEM_SIZE / scale_factor) &
                   0xffffff00;  // force multiple of 256 bytes

  printf("> Compute scaling value = %4.2f\n", scale_factor);
  printf("> Memory Size = %d\n", MemorySize);

  sdkCreateTimer(&hTimer);

  printf("Allocating memory...\n");
  h_idataCPU = (unsigned char *)malloc(MemorySize);
  h_odataGPU = (unsigned char *)malloc(MemorySize);

  // malloc reports failure by returning NULL; bail out before the buffers
  // are written to or handed to cudaMemcpy.
  if (h_idataCPU == NULL || h_odataGPU == NULL) {
    fprintf(stderr, "Failed to allocate host memory (%d bytes per buffer)\n",
            MemorySize);
    free(h_odataGPU);
    free(h_idataCPU);
    sdkDeleteTimer(&hTimer);
    exit(EXIT_FAILURE);
  }

  checkCudaErrors(cudaMalloc((void **)&d_idata, MemorySize));
  checkCudaErrors(cudaMalloc((void **)&d_odata, MemorySize));

  printf("Generating host input data array...\n");

  // Repeating byte pattern; the sum is truncated to 8 bits on store
  for (i = 0; i < MemorySize; i++) {
    h_idataCPU[i] = (i & 0xFF) + 1;
  }

  printf("Uploading input data to GPU memory...\n");
  checkCudaErrors(
      cudaMemcpy(d_idata, h_idataCPU, MemorySize, cudaMemcpyHostToDevice));

  // The first runTest argument is the packed (user-data) size of the type
  printf("Testing misaligned types...\n");
  printf("uint8...\n");
  nTotalFailures += runTest<uint8>(1, MemorySize);

  printf("uint16...\n");
  nTotalFailures += runTest<uint16>(2, MemorySize);

  printf("RGBA8_misaligned...\n");
  nTotalFailures += runTest<RGBA8_misaligned>(4, MemorySize);

  printf("LA32_misaligned...\n");
  nTotalFailures += runTest<LA32_misaligned>(8, MemorySize);

  printf("RGB32_misaligned...\n");
  nTotalFailures += runTest<RGB32_misaligned>(12, MemorySize);

  printf("RGBA32_misaligned...\n");
  nTotalFailures += runTest<RGBA32_misaligned>(16, MemorySize);

  printf("Testing aligned types...\n");
  printf("RGBA8...\n");
  nTotalFailures += runTest<RGBA8>(4, MemorySize);

  printf("I32...\n");
  nTotalFailures += runTest<I32>(4, MemorySize);

  printf("LA32...\n");
  nTotalFailures += runTest<LA32>(8, MemorySize);

  printf("RGB32...\n");
  nTotalFailures += runTest<RGB32>(12, MemorySize);

  printf("RGBA32...\n");
  nTotalFailures += runTest<RGBA32>(16, MemorySize);

  printf("RGBA32_2...\n");
  nTotalFailures += runTest<RGBA32_2>(32, MemorySize);

  printf("\n[alignedTypes] -> Test Results: %d Failures\n", nTotalFailures);

  printf("Shutting down...\n");
  checkCudaErrors(cudaFree(d_idata));
  checkCudaErrors(cudaFree(d_odata));
  free(h_odataGPU);
  free(h_idataCPU);

  sdkDeleteTimer(&hTimer);

  if (nTotalFailures != 0) {
    printf("Test failed!\n");
    exit(EXIT_FAILURE);
  }

  printf("Test passed\n");
  exit(EXIT_SUCCESS);
}