mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2024-11-28 16:29:17 +08:00
231 lines
7.8 KiB
C++
231 lines
7.8 KiB
C++
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
* contributors may be used to endorse or promote products derived
|
|
* from this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
/*
|
|
* This sample implements 64-bin and 256-bin histogram calculation
|
|
* of arbitrary-sized 8-bit data array
|
|
*/
|
|
|
|
// CUDA Runtime
|
|
#include <cuda_runtime.h>
|
|
|
|
// Utility and system includes
|
|
#include <helper_cuda.h>
|
|
#include <helper_functions.h> // helper for shared that are common to CUDA Samples
|
|
|
|
// project include
|
|
#include "histogram_common.h"
|
|
|
|
// Number of timed iterations averaged for each histogram variant in main().
const int numRuns = 16;

// Sample tag printed in the start-up banner and in the test summary.
static const char *sSDKsample = "[histogram]\0";
|
|
|
|
int main(int argc, char **argv) {
|
|
uchar *h_Data;
|
|
uint *h_HistogramCPU, *h_HistogramGPU;
|
|
uchar *d_Data;
|
|
uint *d_Histogram;
|
|
StopWatchInterface *hTimer = NULL;
|
|
int PassFailFlag = 1;
|
|
uint byteCount = 64 * 1048576;
|
|
uint uiSizeMult = 1;
|
|
|
|
cudaDeviceProp deviceProp;
|
|
deviceProp.major = 0;
|
|
deviceProp.minor = 0;
|
|
|
|
// set logfile name and start logs
|
|
printf("[%s] - Starting...\n", sSDKsample);
|
|
|
|
// Use command-line specified CUDA device, otherwise use device with highest
|
|
// Gflops/s
|
|
int dev = findCudaDevice(argc, (const char **)argv);
|
|
|
|
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
|
|
|
|
printf("CUDA device [%s] has %d Multi-Processors, Compute %d.%d\n",
|
|
deviceProp.name, deviceProp.multiProcessorCount, deviceProp.major,
|
|
deviceProp.minor);
|
|
|
|
sdkCreateTimer(&hTimer);
|
|
|
|
// Optional Command-line multiplier to increase size of array to histogram
|
|
if (checkCmdLineFlag(argc, (const char **)argv, "sizemult")) {
|
|
uiSizeMult = getCmdLineArgumentInt(argc, (const char **)argv, "sizemult");
|
|
uiSizeMult = MAX(1, MIN(uiSizeMult, 10));
|
|
byteCount *= uiSizeMult;
|
|
}
|
|
|
|
printf("Initializing data...\n");
|
|
printf("...allocating CPU memory.\n");
|
|
h_Data = (uchar *)malloc(byteCount);
|
|
h_HistogramCPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));
|
|
h_HistogramGPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));
|
|
|
|
printf("...generating input data\n");
|
|
srand(2009);
|
|
|
|
for (uint i = 0; i < byteCount; i++) {
|
|
h_Data[i] = rand() % 256;
|
|
}
|
|
|
|
printf("...allocating GPU memory and copying input data\n\n");
|
|
checkCudaErrors(cudaMalloc((void **)&d_Data, byteCount));
|
|
checkCudaErrors(
|
|
cudaMalloc((void **)&d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint)));
|
|
checkCudaErrors(
|
|
cudaMemcpy(d_Data, h_Data, byteCount, cudaMemcpyHostToDevice));
|
|
|
|
{
|
|
printf("Starting up 64-bin histogram...\n\n");
|
|
initHistogram64();
|
|
|
|
printf("Running 64-bin GPU histogram for %u bytes (%u runs)...\n\n",
|
|
byteCount, numRuns);
|
|
|
|
for (int iter = -1; iter < numRuns; iter++) {
|
|
// iter == -1 -- warmup iteration
|
|
if (iter == 0) {
|
|
cudaDeviceSynchronize();
|
|
sdkResetTimer(&hTimer);
|
|
sdkStartTimer(&hTimer);
|
|
}
|
|
|
|
histogram64(d_Histogram, d_Data, byteCount);
|
|
}
|
|
|
|
cudaDeviceSynchronize();
|
|
sdkStopTimer(&hTimer);
|
|
double dAvgSecs =
|
|
1.0e-3 * (double)sdkGetTimerValue(&hTimer) / (double)numRuns;
|
|
printf("histogram64() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs,
|
|
((double)byteCount * 1.0e-6) / dAvgSecs);
|
|
printf(
|
|
"histogram64, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, "
|
|
"NumDevsUsed = %u, Workgroup = %u\n",
|
|
(1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1,
|
|
HISTOGRAM64_THREADBLOCK_SIZE);
|
|
|
|
printf("\nValidating GPU results...\n");
|
|
printf(" ...reading back GPU results\n");
|
|
checkCudaErrors(cudaMemcpy(h_HistogramGPU, d_Histogram,
|
|
HISTOGRAM64_BIN_COUNT * sizeof(uint),
|
|
cudaMemcpyDeviceToHost));
|
|
|
|
printf(" ...histogram64CPU()\n");
|
|
histogram64CPU(h_HistogramCPU, h_Data, byteCount);
|
|
|
|
printf(" ...comparing the results...\n");
|
|
|
|
for (uint i = 0; i < HISTOGRAM64_BIN_COUNT; i++)
|
|
if (h_HistogramGPU[i] != h_HistogramCPU[i]) {
|
|
PassFailFlag = 0;
|
|
}
|
|
|
|
printf(PassFailFlag ? " ...64-bin histograms match\n\n"
|
|
: " ***64-bin histograms do not match!!!***\n\n");
|
|
|
|
printf("Shutting down 64-bin histogram...\n\n\n");
|
|
closeHistogram64();
|
|
}
|
|
|
|
{
|
|
printf("Initializing 256-bin histogram...\n");
|
|
initHistogram256();
|
|
|
|
printf("Running 256-bin GPU histogram for %u bytes (%u runs)...\n\n",
|
|
byteCount, numRuns);
|
|
|
|
for (int iter = -1; iter < numRuns; iter++) {
|
|
// iter == -1 -- warmup iteration
|
|
if (iter == 0) {
|
|
checkCudaErrors(cudaDeviceSynchronize());
|
|
sdkResetTimer(&hTimer);
|
|
sdkStartTimer(&hTimer);
|
|
}
|
|
|
|
histogram256(d_Histogram, d_Data, byteCount);
|
|
}
|
|
|
|
cudaDeviceSynchronize();
|
|
sdkStopTimer(&hTimer);
|
|
double dAvgSecs =
|
|
1.0e-3 * (double)sdkGetTimerValue(&hTimer) / (double)numRuns;
|
|
printf("histogram256() time (average) : %.5f sec, %.4f MB/sec\n\n",
|
|
dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs);
|
|
printf(
|
|
"histogram256, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, "
|
|
"NumDevsUsed = %u, Workgroup = %u\n",
|
|
(1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1,
|
|
HISTOGRAM256_THREADBLOCK_SIZE);
|
|
|
|
printf("\nValidating GPU results...\n");
|
|
printf(" ...reading back GPU results\n");
|
|
checkCudaErrors(cudaMemcpy(h_HistogramGPU, d_Histogram,
|
|
HISTOGRAM256_BIN_COUNT * sizeof(uint),
|
|
cudaMemcpyDeviceToHost));
|
|
|
|
printf(" ...histogram256CPU()\n");
|
|
histogram256CPU(h_HistogramCPU, h_Data, byteCount);
|
|
|
|
printf(" ...comparing the results\n");
|
|
|
|
for (uint i = 0; i < HISTOGRAM256_BIN_COUNT; i++)
|
|
if (h_HistogramGPU[i] != h_HistogramCPU[i]) {
|
|
PassFailFlag = 0;
|
|
}
|
|
|
|
printf(PassFailFlag ? " ...256-bin histograms match\n\n"
|
|
: " ***256-bin histograms do not match!!!***\n\n");
|
|
|
|
printf("Shutting down 256-bin histogram...\n\n\n");
|
|
closeHistogram256();
|
|
}
|
|
|
|
printf("Shutting down...\n");
|
|
sdkDeleteTimer(&hTimer);
|
|
checkCudaErrors(cudaFree(d_Histogram));
|
|
checkCudaErrors(cudaFree(d_Data));
|
|
free(h_HistogramGPU);
|
|
free(h_HistogramCPU);
|
|
free(h_Data);
|
|
|
|
printf(
|
|
"\nNOTE: The CUDA Samples are not meant for performance measurements. "
|
|
"Results may vary when GPU Boost is enabled.\n\n");
|
|
|
|
printf("%s - Test Summary\n", sSDKsample);
|
|
|
|
// pass or fail (for both 64 bit and 256 bit histograms)
|
|
if (!PassFailFlag) {
|
|
printf("Test failed!\n");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
printf("Test passed\n");
|
|
exit(EXIT_SUCCESS);
|
|
}
|