// Mirror of https://github.com/NVIDIA/cuda-samples.git
// (synced 2024-11-28 14:19:15 +08:00; 315 lines, 10 KiB, plaintext)
/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
* contributors may be used to endorse or promote products derived
|
|
* from this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
/*
 * This is a simple test showing the large access-speed gap
 * between aligned and misaligned structures
 * (those declared with and without the __align__ keyword).
 * It measures per-element copy throughput for
 * aligned and misaligned structures on
 * big chunks of data.
 */
|
|
|
|
// includes, system
|
|
#include <math.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
// includes, project
|
|
#include <helper_cuda.h> // helper functions for CUDA error checking and initialization
|
|
#include <helper_functions.h> // helper utility functions
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Misaligned types
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Single-byte element type used by the copy test.
typedef unsigned char uint8;

// Unsigned short element type used by the copy test.
typedef unsigned short uint16;
|
|
|
|
// Four unsigned char channels, declared without any alignment specifier.
typedef struct {
  unsigned char r;
  unsigned char g;
  unsigned char b;
  unsigned char a;
} RGBA8_misaligned;
|
|
|
|
// Two unsigned int fields, declared without any alignment specifier.
typedef struct {
  unsigned int l;
  unsigned int a;
} LA32_misaligned;
|
|
|
|
// Three unsigned int channels, declared without any alignment specifier.
typedef struct {
  unsigned int r;
  unsigned int g;
  unsigned int b;
} RGB32_misaligned;
|
|
|
|
// Four unsigned int channels, declared without any alignment specifier.
typedef struct {
  unsigned int r;
  unsigned int g;
  unsigned int b;
  unsigned int a;
} RGBA32_misaligned;
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Aligned types
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Four unsigned char channels, declared with 4-byte alignment.
typedef struct __align__(4) {
  unsigned char r;
  unsigned char g;
  unsigned char b;
  unsigned char a;
} RGBA8;
|
|
|
|
// Scalar unsigned int element type; main() runs the copy test on it with a
// packed element size of 4 bytes.
typedef unsigned int I32;
|
|
|
|
// Two unsigned int fields, declared with 8-byte alignment.
typedef struct __align__(8) {
  unsigned int l;
  unsigned int a;
} LA32;
|
|
|
|
// Three unsigned int channels, declared with 16-byte alignment
// (so the element occupies more storage than its three fields).
typedef struct __align__(16) {
  unsigned int r;
  unsigned int g;
  unsigned int b;
} RGB32;
|
|
|
|
// Four unsigned int channels, declared with 16-byte alignment.
typedef struct __align__(16) {
  unsigned int r;
  unsigned int g;
  unsigned int b;
  unsigned int a;
} RGBA32;
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
// Because G80 class hardware natively supports global memory operations
// only with data elements of 4, 8 and 16 bytes, a structure whose size
// exceeds 16 bytes can't be read or written efficiently: more than one
// non-coalescable global memory load/store instruction will be generated,
// even if the __align__ option is supplied.
// The "structure of arrays" storage strategy offers the best performance
// in the general case. See section 5.1.2 of the Programming Guide.
////////////////////////////////////////////////////////////////////////////////
|
|
// Pair of RGBA32 values stored back to back, declared with 16-byte alignment.
typedef struct __align__(16) {
  RGBA32 c1;
  RGBA32 c2;
} RGBA32_2;
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Common host and device functions
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Integer division a / b, bumped up by one whenever the division leaves a
// non-zero remainder.
int iDivUp(int a, int b) {
  const int quotient = a / b;
  if (a % b != 0) {
    return quotient + 1;
  }
  return quotient;
}
|
|
|
|
// Plain integer division a / b (any remainder is discarded).
int iDivDown(int a, int b) {
  const int quotient = a / b;
  return quotient;
}
|
|
|
|
// Returns a unchanged when it is already a multiple of b; otherwise removes
// the remainder and adds one full step of b.
int iAlignUp(int a, int b) {
  const int remainder = a % b;
  if (remainder == 0) {
    return a;
  }
  return a - remainder + b;
}
|
|
|
|
// Strips the remainder of a / b from a, yielding a multiple of b.
int iAlignDown(int a, int b) {
  const int remainder = a % b;
  return a - remainder;
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
// Element-wise copy kernel.
// Each thread starts at its global index and advances by the total number
// of launched threads (blockDim.x * gridDim.x) until it passes numElements.
// Whole TData elements are assigned, so for padded structures the copy
// is per-element rather than per-byte.
////////////////////////////////////////////////////////////////////////////////
template <class TData>
__global__ void testKernel(TData *d_odata, TData *d_idata, int numElements) {
  const int stride = blockDim.x * gridDim.x;
  int idx = blockDim.x * blockIdx.x + threadIdx.x;

  while (idx < numElements) {
    d_odata[idx] = d_idata[idx];
    idx += stride;
  }
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
// Host-side check of the copy kernel's output.
// Only the first packedElementSize bytes of every element are compared
// (number_of_fields * sizeof(simple_type)), i.e. the part holding user data;
// any padding bytes after that are ignored because their contents
// carry no user data.
// Returns 1 when all compared bytes match, 0 on the first mismatch.
////////////////////////////////////////////////////////////////////////////////
template <class TData>
int testCPU(TData *h_odata, TData *h_idata, int numElements,
            int packedElementSize) {
  for (int elem = 0; elem < numElements; ++elem) {
    // View both elements as raw bytes.
    const char *expected = (const char *)&h_idata[elem];
    const char *actual = (const char *)&h_odata[elem];

    for (int byte = 0; byte < packedElementSize; ++byte) {
      if (expected[byte] != actual[byte]) {
        return 0;
      }
    }
  }

  return 1;
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Data configuration
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Size in bytes of the memory chunk; the same chunk is reused by every test
const int MEM_SIZE = 50000000;
// Number of kernel launches that are timed and averaged per test
const int NUM_ITERATIONS = 32;

// GPU input buffer
unsigned char *d_idata;
// GPU output buffer
unsigned char *d_odata;
// CPU copy of the input data
unsigned char *h_idataCPU;
// CPU copy of the GPU output data
unsigned char *h_odataGPU;
// Timer used to measure the kernel launches
StopWatchInterface *hTimer = NULL;
|
|
|
|
// Times the copy kernel for element type TData and validates its output.
//
// packedElementSize: number of leading bytes of each element that are
//                    compared during validation.
// memory_size:       number of bytes cleared in d_odata, copied back to the
//                    host, and used to derive the element count.
//
// Uses the file-level buffers d_idata, d_odata, h_idataCPU, h_odataGPU and
// the timer hTimer. Returns 0 when validation passes and 1 when it fails.
template <class TData>
int runTest(int packedElementSize, int memory_size) {
  const int elementCount = iDivDown(memory_size, sizeof(TData));
  const int alignedBytes = iAlignDown(memory_size, sizeof(TData));

  TData *const dst = (TData *)d_odata;
  TData *const src = (TData *)d_idata;

  // Start every test from a zeroed output buffer
  checkCudaErrors(cudaMemset(d_odata, 0, memory_size));

  // Make sure the device is idle before the timer starts
  checkCudaErrors(cudaDeviceSynchronize());
  sdkResetTimer(&hTimer);
  sdkStartTimer(&hTimer);

  for (int iter = 0; iter < NUM_ITERATIONS; ++iter) {
    testKernel<TData><<<64, 256>>>(dst, src, elementCount);
    getLastCudaError("testKernel() execution failed\n");
  }

  // Wait for all launches to finish before stopping the timer
  checkCudaErrors(cudaDeviceSynchronize());
  sdkStopTimer(&hTimer);

  double avgTimeMs = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS;
  double throughput =
      (double)alignedBytes / (avgTimeMs * 0.001 * 1073741824.0);
  printf("Avg. time: %f ms / Copy throughput: %f GB/s.\n", avgTimeMs,
         throughput);

  // Fetch the GPU output and compare it against the host input
  checkCudaErrors(
      cudaMemcpy(h_odataGPU, d_odata, memory_size, cudaMemcpyDeviceToHost));
  int passed = testCPU((TData *)h_odataGPU, (TData *)h_idataCPU, elementCount,
                       packedElementSize);

  if (passed) {
    printf("\tTEST OK\n");
  } else {
    printf("\tTEST FAILURE\n");
  }

  return passed ? 0 : 1;
}
|
|
|
|
// Program entry point.
//
// Selects a CUDA device, derives the working buffer size from the device's
// core count, allocates and fills the host/device buffers, runs the copy
// test for every misaligned and aligned element type, and reports the total
// number of failed tests. Exits with EXIT_SUCCESS when every test passed and
// EXIT_FAILURE otherwise (including when host allocation fails).
int main(int argc, char **argv) {
  int i, nTotalFailures = 0;

  int devID;
  cudaDeviceProp deviceProp;
  printf("[%s] - Starting...\n", argv[0]);

  // find first CUDA device
  devID = findCudaDevice(argc, (const char **)argv);

  // get number of SMs on this GPU
  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
  printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n", deviceProp.name,
         deviceProp.multiProcessorCount,
         _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
         _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
             deviceProp.multiProcessorCount);

  // Anything that is less than 192 Cores will have a scaled down workload
  float scale_factor =
      max((192.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
                     (float)deviceProp.multiProcessorCount)),
          1.0f);

  int MemorySize = (int)(MEM_SIZE / scale_factor) &
                   0xffffff00;  // force multiple of 256 bytes

  printf("> Compute scaling value = %4.2f\n", scale_factor);
  printf("> Memory Size = %d\n", MemorySize);

  sdkCreateTimer(&hTimer);

  printf("Allocating memory...\n");
  h_idataCPU = (unsigned char *)malloc(MemorySize);
  h_odataGPU = (unsigned char *)malloc(MemorySize);

  // Both host buffers are written to / copied into below, so a failed
  // allocation must stop the program here instead of dereferencing NULL.
  if (h_idataCPU == NULL || h_odataGPU == NULL) {
    fprintf(stderr, "Failed to allocate %d bytes of host memory!\n",
            MemorySize);
    free(h_idataCPU);
    free(h_odataGPU);
    sdkDeleteTimer(&hTimer);
    exit(EXIT_FAILURE);
  }

  checkCudaErrors(cudaMalloc((void **)&d_idata, MemorySize));
  checkCudaErrors(cudaMalloc((void **)&d_odata, MemorySize));

  printf("Generating host input data array...\n");

  // Repeating byte pattern; the value wraps around in the unsigned char
  for (i = 0; i < MemorySize; i++) {
    h_idataCPU[i] = (i & 0xFF) + 1;
  }

  printf("Uploading input data to GPU memory...\n");
  checkCudaErrors(
      cudaMemcpy(d_idata, h_idataCPU, MemorySize, cudaMemcpyHostToDevice));

  // The first runTest argument is the packed size in bytes of one element,
  // i.e. the number of bytes per element that validation compares.
  printf("Testing misaligned types...\n");
  printf("uint8...\n");
  nTotalFailures += runTest<uint8>(1, MemorySize);

  printf("uint16...\n");
  nTotalFailures += runTest<uint16>(2, MemorySize);

  printf("RGBA8_misaligned...\n");
  nTotalFailures += runTest<RGBA8_misaligned>(4, MemorySize);

  printf("LA32_misaligned...\n");
  nTotalFailures += runTest<LA32_misaligned>(8, MemorySize);

  printf("RGB32_misaligned...\n");
  nTotalFailures += runTest<RGB32_misaligned>(12, MemorySize);

  printf("RGBA32_misaligned...\n");
  nTotalFailures += runTest<RGBA32_misaligned>(16, MemorySize);

  printf("Testing aligned types...\n");
  printf("RGBA8...\n");
  nTotalFailures += runTest<RGBA8>(4, MemorySize);

  printf("I32...\n");
  nTotalFailures += runTest<I32>(4, MemorySize);

  printf("LA32...\n");
  nTotalFailures += runTest<LA32>(8, MemorySize);

  printf("RGB32...\n");
  nTotalFailures += runTest<RGB32>(12, MemorySize);

  printf("RGBA32...\n");
  nTotalFailures += runTest<RGBA32>(16, MemorySize);

  printf("RGBA32_2...\n");
  nTotalFailures += runTest<RGBA32_2>(32, MemorySize);

  printf("\n[alignedTypes] -> Test Results: %d Failures\n", nTotalFailures);

  printf("Shutting down...\n");
  checkCudaErrors(cudaFree(d_idata));
  checkCudaErrors(cudaFree(d_odata));
  free(h_odataGPU);
  free(h_idataCPU);

  sdkDeleteTimer(&hTimer);

  if (nTotalFailures != 0) {
    printf("Test failed!\n");
    exit(EXIT_FAILURE);
  }

  printf("Test passed\n");
  exit(EXIT_SUCCESS);
}
|