cuda-samples/Samples/shfl_scan/shfl_scan.cu

419 lines
14 KiB
Plaintext
Raw Normal View History

/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Shuffle intrinsics CUDA Sample
// This sample demonstrates the use of the shuffle intrinsic
// First, a simple example of a prefix sum using the shuffle to
// perform a scan operation is provided.
// Secondly, a more involved example of computing an integral image
// using the shuffle intrinsic is provided, where the shuffle
// scan operation and shuffle xor operations are used
#include <stdio.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_functions.h>
#include "shfl_integral_image.cuh"
// Scan using shfl - takes log2(n) steps
// This function demonstrates basic use of the shuffle intrinsic, __shfl_up,
// to perform a scan operation across a block.
// First, it performs a scan (prefix sum in this case) inside a warp
// Then to continue the scan operation across the block,
// each warp's sum is placed into shared memory. A single warp
// then performs a shuffle scan on that shared memory. The results
// are then uniformly added to each warp's threads.
// This pyramid type approach is continued by placing each block's
// final sum in global memory and prefix summing that via another kernel call,
// then uniformly adding across the input data via the uniform_add<<<>>> kernel.
__global__ void shfl_scan_test(int *data, int width, int *partial_sums = NULL) {
extern __shared__ int sums[];
int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
int lane_id = id % warpSize;
// determine a warp_id within a block
int warp_id = threadIdx.x / warpSize;
// Below is the basic structure of using a shfl instruction
// for a scan.
// Record "value" as a variable - we accumulate it along the way
int value = data[id];
// Now accumulate in log steps up the chain
// compute sums, with another thread's value who is
// distance delta away (i). Note
// those threads where the thread 'i' away would have
// been out of bounds of the warp are unaffected. This
// creates the scan sum.
#pragma unroll
for (int i = 1; i <= width; i *= 2) {
unsigned int mask = 0xffffffff;
int n = __shfl_up_sync(mask, value, i, width);
if (lane_id >= i) value += n;
}
// value now holds the scan value for the individual thread
// next sum the largest values for each warp
// write the sum of the warp to smem
if (threadIdx.x % warpSize == warpSize - 1) {
sums[warp_id] = value;
}
__syncthreads();
//
// scan sum the warp sums
// the same shfl scan operation, but performed on warp sums
//
if (warp_id == 0 && lane_id < (blockDim.x / warpSize)) {
int warp_sum = sums[lane_id];
int mask = (1 << (blockDim.x / warpSize)) - 1;
for (int i = 1; i <= (blockDim.x / warpSize); i *= 2) {
int n = __shfl_up_sync(mask, warp_sum, i, (blockDim.x / warpSize));
if (lane_id >= i) warp_sum += n;
}
sums[lane_id] = warp_sum;
}
__syncthreads();
// perform a uniform add across warps in the block
// read neighbouring warp's sum and add it to threads value
int blockSum = 0;
if (warp_id > 0) {
blockSum = sums[warp_id - 1];
}
value += blockSum;
// Now write out our result
data[id] = value;
// last thread has sum, write write out the block's sum
if (partial_sums != NULL && threadIdx.x == blockDim.x - 1) {
partial_sums[blockIdx.x] = value;
}
}
// Uniform add: add partial sums array
__global__ void uniform_add(int *data, int *partial_sums, int len) {
__shared__ int buf;
int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
if (id > len) return;
if (threadIdx.x == 0) {
buf = partial_sums[blockIdx.x];
}
__syncthreads();
data[id] += buf;
}
static unsigned int iDivUp(unsigned int dividend, unsigned int divisor) {
return ((dividend % divisor) == 0) ? (dividend / divisor)
: (dividend / divisor + 1);
}
// This function verifies the shuffle scan result, for the simple
// prefix sum case.
bool CPUverify(int *h_data, int *h_result, int n_elements) {
// cpu verify
for (int i = 0; i < n_elements - 1; i++) {
h_data[i + 1] = h_data[i] + h_data[i + 1];
}
int diff = 0;
for (int i = 0; i < n_elements; i++) {
diff += h_data[i] - h_result[i];
}
printf("CPU verify result diff (GPUvsCPU) = %d\n", diff);
bool bTestResult = false;
if (diff == 0) bTestResult = true;
StopWatchInterface *hTimer = NULL;
sdkCreateTimer(&hTimer);
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
for (int j = 0; j < 100; j++)
for (int i = 0; i < n_elements - 1; i++) {
h_data[i + 1] = h_data[i] + h_data[i + 1];
}
sdkStopTimer(&hTimer);
double cput = sdkGetTimerValue(&hTimer);
printf("CPU sum (naive) took %f ms\n", cput / 100);
return bTestResult;
}
// this verifies the row scan result for synthetic data of all 1's
unsigned int verifyDataRowSums(unsigned int *h_image, int w, int h) {
unsigned int diff = 0;
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
int gold = i + 1;
diff +=
abs(static_cast<int>(gold) - static_cast<int>(h_image[j * w + i]));
}
}
return diff;
}
bool shuffle_simple_test(int argc, char **argv) {
int *h_data, *h_partial_sums, *h_result;
int *d_data, *d_partial_sums;
const int n_elements = 65536;
int sz = sizeof(int) * n_elements;
int cuda_device = 0;
printf("Starting shfl_scan\n");
// use command-line specified CUDA device, otherwise use device with highest
// Gflops/s
cuda_device = findCudaDevice(argc, (const char **)argv);
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDevice(&cuda_device));
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
// __shfl intrinsic needs SM 3.0 or higher
if (deviceProp.major < 3) {
printf("> __shfl() intrinsic requires device SM 3.0+\n");
printf("> Waiving test.\n");
exit(EXIT_WAIVED);
}
checkCudaErrors(cudaMallocHost(reinterpret_cast<void **>(&h_data),
sizeof(int) * n_elements));
checkCudaErrors(cudaMallocHost(reinterpret_cast<void **>(&h_result),
sizeof(int) * n_elements));
// initialize data:
printf("Computing Simple Sum test\n");
printf("---------------------------------------------------\n");
printf("Initialize test data [1, 1, 1...]\n");
for (int i = 0; i < n_elements; i++) {
h_data[i] = 1;
}
int blockSize = 256;
int gridSize = n_elements / blockSize;
int nWarps = blockSize / 32;
int shmem_sz = nWarps * sizeof(int);
int n_partialSums = n_elements / blockSize;
int partial_sz = n_partialSums * sizeof(int);
printf("Scan summation for %d elements, %d partial sums\n", n_elements,
n_elements / blockSize);
int p_blockSize = min(n_partialSums, blockSize);
int p_gridSize = iDivUp(n_partialSums, p_blockSize);
printf("Partial summing %d elements with %d blocks of size %d\n",
n_partialSums, p_gridSize, p_blockSize);
// initialize a timer
cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
float et = 0;
float inc = 0;
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_data), sz));
checkCudaErrors(
cudaMalloc(reinterpret_cast<void **>(&d_partial_sums), partial_sz));
checkCudaErrors(cudaMemset(d_partial_sums, 0, partial_sz));
checkCudaErrors(
cudaMallocHost(reinterpret_cast<void **>(&h_partial_sums), partial_sz));
checkCudaErrors(cudaMemcpy(d_data, h_data, sz, cudaMemcpyHostToDevice));
checkCudaErrors(cudaEventRecord(start, 0));
shfl_scan_test<<<gridSize, blockSize, shmem_sz>>>(d_data, 32, d_partial_sums);
shfl_scan_test<<<p_gridSize, p_blockSize, shmem_sz>>>(d_partial_sums, 32);
uniform_add<<<gridSize - 1, blockSize>>>(d_data + blockSize, d_partial_sums,
n_elements);
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&inc, start, stop));
et += inc;
checkCudaErrors(cudaMemcpy(h_result, d_data, sz, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(h_partial_sums, d_partial_sums, partial_sz,
cudaMemcpyDeviceToHost));
printf("Test Sum: %d\n", h_partial_sums[n_partialSums - 1]);
printf("Time (ms): %f\n", et);
printf("%d elements scanned in %f ms -> %f MegaElements/s\n", n_elements, et,
n_elements / (et / 1000.0f) / 1000000.0f);
bool bTestResult = CPUverify(h_data, h_result, n_elements);
checkCudaErrors(cudaFreeHost(h_data));
checkCudaErrors(cudaFreeHost(h_result));
checkCudaErrors(cudaFreeHost(h_partial_sums));
checkCudaErrors(cudaFree(d_data));
checkCudaErrors(cudaFree(d_partial_sums));
return bTestResult;
}
// This function tests creation of an integral image using
// synthetic data, of size 1920x1080 pixels greyscale.
bool shuffle_integral_image_test() {
char *d_data;
unsigned int *h_image;
unsigned int *d_integral_image;
int w = 1920;
int h = 1080;
int n_elements = w * h;
int sz = sizeof(unsigned int) * n_elements;
printf("\nComputing Integral Image Test on size %d x %d synthetic data\n", w,
h);
printf("---------------------------------------------------\n");
checkCudaErrors(cudaMallocHost(reinterpret_cast<void **>(&h_image), sz));
// fill test "image" with synthetic 1's data
memset(h_image, 0, sz);
// each thread handles 16 values, use 1 block/row
int blockSize = iDivUp(w, 16);
// launch 1 block / row
int gridSize = h;
// Create a synthetic image for testing
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_data), sz));
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_integral_image),
n_elements * sizeof(int) * 4));
checkCudaErrors(cudaMemset(d_data, 1, sz));
checkCudaErrors(cudaMemset(d_integral_image, 0, sz));
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
float et = 0;
unsigned int err;
// Execute scan line prefix sum kernel, and time it
cudaEventRecord(start);
shfl_intimage_rows<<<gridSize, blockSize>>>(
reinterpret_cast<uint4 *>(d_data),
reinterpret_cast<uint4 *>(d_integral_image));
cudaEventRecord(stop);
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&et, start, stop));
printf("Method: Fast Time (GPU Timer): %f ms ", et);
// verify the scan line results
checkCudaErrors(
cudaMemcpy(h_image, d_integral_image, sz, cudaMemcpyDeviceToHost));
err = verifyDataRowSums(h_image, w, h);
printf("Diff = %d\n", err);
// Execute column prefix sum kernel and time it
dim3 blockSz(32, 8);
dim3 testGrid(w / blockSz.x, 1);
cudaEventRecord(start);
shfl_vertical_shfl<<<testGrid, blockSz>>>((unsigned int *)d_integral_image, w,
h);
cudaEventRecord(stop);
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&et, start, stop));
printf("Method: Vertical Scan Time (GPU Timer): %f ms ", et);
// Verify the column results
checkCudaErrors(
cudaMemcpy(h_image, d_integral_image, sz, cudaMemcpyDeviceToHost));
printf("\n");
int finalSum = h_image[w * h - 1];
printf("CheckSum: %d, (expect %dx%d=%d)\n", finalSum, w, h, w * h);
checkCudaErrors(cudaFree(d_data));
checkCudaErrors(cudaFree(d_integral_image));
checkCudaErrors(cudaFreeHost(h_image));
// verify final sum: if the final value in the corner is the same as the size
// of the buffer (all 1's) then the integral image was generated successfully
return (finalSum == w * h) ? true : false;
}
int main(int argc, char *argv[]) {
// Initialization. The shuffle intrinsic is not available on SM < 3.0
// so waive the test if the hardware is not present.
int cuda_device = 0;
printf("Starting shfl_scan\n");
// use command-line specified CUDA device, otherwise use device with highest
// Gflops/s
cuda_device = findCudaDevice(argc, (const char **)argv);
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDevice(&cuda_device));
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
// __shfl intrinsic needs SM 3.0 or higher
if (deviceProp.major < 3) {
printf("> __shfl() intrinsic requires device SM 3.0+\n");
printf("> Waiving test.\n");
exit(EXIT_WAIVED);
}
bool bTestResult = true;
bool simpleTest = shuffle_simple_test(argc, argv);
bool intTest = shuffle_integral_image_test();
bTestResult = simpleTest & intTest;
exit((bTestResult) ? EXIT_SUCCESS : EXIT_FAILURE);
}