cuda-samples/Samples/histogram/histogram64.cu
2021-10-21 16:34:49 +05:30

206 lines
8.0 KiB
Plaintext

/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include <helper_cuda.h>
#include "histogram_common.h"
////////////////////////////////////////////////////////////////////////////////
// GPU-specific common definitions
////////////////////////////////////////////////////////////////////////////////
// Data type used for input data fetches
typedef uint4 data_t;
// May change on future hardware, so better parametrize the code
#define SHARED_MEMORY_BANKS 16
////////////////////////////////////////////////////////////////////////////////
// Main computation pass: compute gridDim.x partial histograms
////////////////////////////////////////////////////////////////////////////////
// Count a byte into shared-memory storage
inline __device__ void addByte(uchar *s_ThreadBase, uint data) {
s_ThreadBase[UMUL(data, HISTOGRAM64_THREADBLOCK_SIZE)]++;
}
// Count four bytes of a word
inline __device__ void addWord(uchar *s_ThreadBase, uint data) {
// Only higher 6 bits of each byte matter, as this is a 64-bin histogram
addByte(s_ThreadBase, (data >> 2) & 0x3FU);
addByte(s_ThreadBase, (data >> 10) & 0x3FU);
addByte(s_ThreadBase, (data >> 18) & 0x3FU);
addByte(s_ThreadBase, (data >> 26) & 0x3FU);
}
__global__ void histogram64Kernel(uint *d_PartialHistograms, data_t *d_Data,
uint dataCount) {
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
// Encode thread index in order to avoid bank conflicts in s_Hist[] access:
// each group of SHARED_MEMORY_BANKS threads accesses consecutive shared
// memory banks
// and the same bytes [0..3] within the banks
// Because of this permutation block size should be a multiple of 4 *
// SHARED_MEMORY_BANKS
const uint threadPos = ((threadIdx.x & ~(SHARED_MEMORY_BANKS * 4 - 1)) << 0) |
((threadIdx.x & (SHARED_MEMORY_BANKS - 1)) << 2) |
((threadIdx.x & (SHARED_MEMORY_BANKS * 3)) >> 4);
// Per-thread histogram storage
__shared__ uchar s_Hist[HISTOGRAM64_THREADBLOCK_SIZE * HISTOGRAM64_BIN_COUNT];
uchar *s_ThreadBase = s_Hist + threadPos;
// Initialize shared memory (writing 32-bit words)
#pragma unroll
for (uint i = 0; i < (HISTOGRAM64_BIN_COUNT / 4); i++) {
((uint *)s_Hist)[threadIdx.x + i * HISTOGRAM64_THREADBLOCK_SIZE] = 0;
}
// Read data from global memory and submit to the shared-memory histogram
// Since histogram counters are byte-sized, every single thread can't do more
// than 255 submission
cg::sync(cta);
for (uint pos = UMAD(blockIdx.x, blockDim.x, threadIdx.x); pos < dataCount;
pos += UMUL(blockDim.x, gridDim.x)) {
data_t data = d_Data[pos];
addWord(s_ThreadBase, data.x);
addWord(s_ThreadBase, data.y);
addWord(s_ThreadBase, data.z);
addWord(s_ThreadBase, data.w);
}
// Accumulate per-thread histograms into per-block and write to global memory
cg::sync(cta);
if (threadIdx.x < HISTOGRAM64_BIN_COUNT) {
uchar *s_HistBase =
s_Hist + UMUL(threadIdx.x, HISTOGRAM64_THREADBLOCK_SIZE);
uint sum = 0;
uint pos = 4 * (threadIdx.x & (SHARED_MEMORY_BANKS - 1));
#pragma unroll
for (uint i = 0; i < (HISTOGRAM64_THREADBLOCK_SIZE / 4); i++) {
sum += s_HistBase[pos + 0] + s_HistBase[pos + 1] + s_HistBase[pos + 2] +
s_HistBase[pos + 3];
pos = (pos + 4) & (HISTOGRAM64_THREADBLOCK_SIZE - 1);
}
d_PartialHistograms[blockIdx.x * HISTOGRAM64_BIN_COUNT + threadIdx.x] = sum;
}
}
////////////////////////////////////////////////////////////////////////////////
// Merge histogram64() output
// Run one threadblock per bin; each threadbock adds up the same bin counter
// from every partial histogram. Reads are uncoalesced, but mergeHistogram64
// takes only a fraction of total processing time
////////////////////////////////////////////////////////////////////////////////
#define MERGE_THREADBLOCK_SIZE 256
__global__ void mergeHistogram64Kernel(uint *d_Histogram,
uint *d_PartialHistograms,
uint histogramCount) {
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ uint data[MERGE_THREADBLOCK_SIZE];
uint sum = 0;
for (uint i = threadIdx.x; i < histogramCount; i += MERGE_THREADBLOCK_SIZE) {
sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM64_BIN_COUNT];
}
data[threadIdx.x] = sum;
for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1) {
cg::sync(cta);
if (threadIdx.x < stride) {
data[threadIdx.x] += data[threadIdx.x + stride];
}
}
if (threadIdx.x == 0) {
d_Histogram[blockIdx.x] = data[0];
}
}
////////////////////////////////////////////////////////////////////////////////
// CPU interface to GPU histogram calculator
////////////////////////////////////////////////////////////////////////////////
// histogram64kernel() intermediate results buffer
// MAX_PARTIAL_HISTOGRAM64_COUNT == 32768 and HISTOGRAM64_THREADBLOCK_SIZE == 64
// amounts to max. 480MB of input data
static const uint MAX_PARTIAL_HISTOGRAM64_COUNT = 32768;
static uint *d_PartialHistograms;
// Internal memory allocation
extern "C" void initHistogram64(void) {
assert(HISTOGRAM64_THREADBLOCK_SIZE % (4 * SHARED_MEMORY_BANKS) == 0);
checkCudaErrors(cudaMalloc(
(void **)&d_PartialHistograms,
MAX_PARTIAL_HISTOGRAM64_COUNT * HISTOGRAM64_BIN_COUNT * sizeof(uint)));
}
// Internal memory deallocation
extern "C" void closeHistogram64(void) {
checkCudaErrors(cudaFree(d_PartialHistograms));
}
// Round a / b to nearest higher integer value
inline uint iDivUp(uint a, uint b) {
return (a % b != 0) ? (a / b + 1) : (a / b);
}
// Snap a to nearest lower multiple of b
inline uint iSnapDown(uint a, uint b) { return a - a % b; }
extern "C" void histogram64(uint *d_Histogram, void *d_Data, uint byteCount) {
const uint histogramCount = iDivUp(
byteCount, HISTOGRAM64_THREADBLOCK_SIZE * iSnapDown(255, sizeof(data_t)));
assert(byteCount % sizeof(data_t) == 0);
assert(histogramCount <= MAX_PARTIAL_HISTOGRAM64_COUNT);
histogram64Kernel<<<histogramCount, HISTOGRAM64_THREADBLOCK_SIZE>>>(
d_PartialHistograms, (data_t *)d_Data, byteCount / sizeof(data_t));
getLastCudaError("histogram64Kernel() execution failed\n");
mergeHistogram64Kernel<<<HISTOGRAM64_BIN_COUNT, MERGE_THREADBLOCK_SIZE>>>(
d_Histogram, d_PartialHistograms, histogramCount);
getLastCudaError("mergeHistogram64() execution failed\n");
}