/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // Shuffle intrinsics CUDA Sample // This sample demonstrates the use of the shuffle intrinsic // First, a simple example of a prefix sum using the shuffle to // perform a scan operation is provided. 
// Secondly, a more involved example of computing an integral image
// using the shuffle intrinsic is provided, where the shuffle
// scan operation and shuffle xor operations are used

#include <stdio.h>

#include <cuda_runtime.h>

// CUDA SDK sample helpers: timers, device selection, checkCudaErrors, ...
#include <helper_cuda.h>
#include <helper_functions.h>

#include "shfl_integral_image.cuh"

// Scan using shfl - takes log2(n) steps
// This function demonstrates basic use of the shuffle intrinsic, __shfl_up,
// to perform a scan operation across a block.
// First, it performs a scan (prefix sum in this case) inside a warp
// Then to continue the scan operation across the block,
// each warp's sum is placed into shared memory. A single warp
// then performs a shuffle scan on that shared memory. The results
// are then uniformly added to each warp's threads.
// This pyramid type approach is continued by placing each block's
// final sum in global memory and prefix summing that via another kernel call,
// then uniformly adding across the input data via the uniform_add<<<>>> kernel.
//
// Preconditions (established by the caller, not checked here):
//   - data holds at least gridDim.x * blockDim.x elements (no bounds guard)
//   - dynamic shared memory of (blockDim.x / warpSize) * sizeof(int) bytes
//   - width is the shuffle segment width (32 in this sample)
// requires SM30+ (__shfl_up_sync)
__global__ void shfl_scan_test(int *data, int width, int *partial_sums = NULL) {
  extern __shared__ int sums[];
  int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
  int lane_id = id % warpSize;
  // determine a warp_id within a block
  int warp_id = threadIdx.x / warpSize;

  // Below is the basic structure of using a shfl instruction
  // for a scan.
  // Record "value" as a variable - we accumulate it along the way
  int value = data[id];

  // Now accumulate in log steps up the chain
  // compute sums, with another thread's value who is
  // distance delta away (i). Note
  // those threads where the thread 'i' away would have
  // been out of bounds of the warp are unaffected. This
  // creates the scan sum.
#pragma unroll
  for (int i = 1; i <= width; i *= 2) {
    unsigned int mask = 0xffffffff;
    int n = __shfl_up_sync(mask, value, i, width);

    if (lane_id >= i) value += n;
  }

  // value now holds the scan value for the individual thread
  // next sum the largest values for each warp

  // write the sum of the warp to smem
  if (threadIdx.x % warpSize == warpSize - 1) {
    sums[warp_id] = value;
  }

  __syncthreads();

  //
  // scan sum the warp sums
  // the same shfl scan operation, but performed on warp sums
  //
  if (warp_id == 0 && lane_id < (blockDim.x / warpSize)) {
    int warp_sum = sums[lane_id];

    // Lanes 0 .. nwarps-1 participate; build their mask without invoking
    // undefined behaviour when nwarps == 32 (1 << 32 is UB in C++).
    int nwarps = blockDim.x / warpSize;
    unsigned int mask = (nwarps >= 32) ? 0xffffffffu : ((1u << nwarps) - 1u);

    for (int i = 1; i <= nwarps; i *= 2) {
      int n = __shfl_up_sync(mask, warp_sum, i, nwarps);

      if (lane_id >= i) warp_sum += n;
    }

    sums[lane_id] = warp_sum;
  }

  __syncthreads();

  // perform a uniform add across warps in the block
  // read neighbouring warp's sum and add it to threads value
  int blockSum = 0;

  if (warp_id > 0) {
    blockSum = sums[warp_id - 1];
  }

  value += blockSum;

  // Now write out our result
  data[id] = value;

  // last thread has sum, write out the block's sum
  if (partial_sums != NULL && threadIdx.x == blockDim.x - 1) {
    partial_sums[blockIdx.x] = value;
  }
}

// Uniform add: add each block's partial sum (the scanned total of all
// preceding blocks) to every in-range element the block covers.
// len is the total element count guarding the data[] access.
__global__ void uniform_add(int *data, int *partial_sums, int len) {
  __shared__ int buf;
  int id = ((blockIdx.x * blockDim.x) + threadIdx.x);

  // Broadcast this block's offset through shared memory. The barrier must
  // be reached by all threads, so keep it outside any divergent early-return
  // (the original returned before __syncthreads() for out-of-range ids).
  if (threadIdx.x == 0) {
    buf = partial_sums[blockIdx.x];
  }

  __syncthreads();

  // Bounds guard: id == len is already out of range (the original used
  // "id > len", an off-by-one that permitted a one-element overrun).
  if (id < len) data[id] += buf;
}

// Integer ceiling division: smallest k such that k * divisor >= dividend.
static unsigned int iDivUp(unsigned int dividend, unsigned int divisor) {
  return ((dividend % divisor) == 0) ? (dividend / divisor)
                                     : (dividend / divisor + 1);
}

// This function verifies the shuffle scan result, for the simple
// prefix sum case.
// CPU reference check for the simple prefix-sum test: computes an in-place
// inclusive prefix sum over h_data (h_data is overwritten!) and compares it
// element-wise against the GPU result h_result. Also times a naive CPU scan.
// Returns true when GPU and CPU results match exactly.
bool CPUverify(int *h_data, int *h_result, int n_elements) {
  // cpu verify
  for (int i = 0; i < n_elements - 1; i++) {
    h_data[i + 1] = h_data[i] + h_data[i + 1];
  }

  int diff = 0;

  // Accumulate absolute differences: a plain signed sum could let +k and -k
  // mismatches cancel out and falsely report success.
  for (int i = 0; i < n_elements; i++) {
    diff += abs(h_data[i] - h_result[i]);
  }

  printf("CPU verify result diff (GPUvsCPU) = %d\n", diff);
  bool bTestResult = (diff == 0);

  // Time 100 repetitions of the naive CPU scan for a rough comparison point.
  StopWatchInterface *hTimer = NULL;
  sdkCreateTimer(&hTimer);
  sdkResetTimer(&hTimer);
  sdkStartTimer(&hTimer);

  for (int j = 0; j < 100; j++)
    for (int i = 0; i < n_elements - 1; i++) {
      h_data[i + 1] = h_data[i] + h_data[i + 1];
    }

  sdkStopTimer(&hTimer);
  double cput = sdkGetTimerValue(&hTimer);
  printf("CPU sum (naive) took %f ms\n", cput / 100);
  sdkDeleteTimer(&hTimer);  // release the timer (previously leaked)
  return bTestResult;
}

// this verifies the row scan result for synthetic data of all 1's:
// element i of each row must equal i + 1. Returns the accumulated
// absolute difference (0 on success).
unsigned int verifyDataRowSums(unsigned int *h_image, int w, int h) {
  unsigned int diff = 0;

  for (int j = 0; j < h; j++) {
    for (int i = 0; i < w; i++) {
      int gold = i + 1;
      diff +=
          abs(static_cast<int>(gold) - static_cast<int>(h_image[j * w + i]));
    }
  }

  return diff;
}

// Runs the simple shuffle-scan prefix sum over 65536 ones using the
// three-pass pyramid (block scans -> scan of block sums -> uniform add),
// times it, and verifies against a CPU scan. Returns true on success.
bool shuffle_simple_test(int argc, char **argv) {
  int *h_data, *h_partial_sums, *h_result;
  int *d_data, *d_partial_sums;
  const int n_elements = 65536;
  int sz = sizeof(int) * n_elements;
  int cuda_device = 0;

  printf("Starting shfl_scan\n");

  // use command-line specified CUDA device, otherwise use device with highest
  // Gflops/s
  cuda_device = findCudaDevice(argc, (const char **)argv);

  cudaDeviceProp deviceProp;
  checkCudaErrors(cudaGetDevice(&cuda_device));
  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));

  printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
         deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);

  // __shfl intrinsic needs SM 3.0 or higher
  if (deviceProp.major < 3) {
    printf("> __shfl() intrinsic requires device SM 3.0+\n");
    printf("> Waiving test.\n");
    exit(EXIT_WAIVED);
  }

  // Pinned host buffers for fast transfers
  checkCudaErrors(cudaMallocHost(reinterpret_cast<void **>(&h_data),
                                 sizeof(int) * n_elements));
  checkCudaErrors(cudaMallocHost(reinterpret_cast<void **>(&h_result),
                                 sizeof(int) * n_elements));

  // initialize data:
  printf("Computing Simple Sum test\n");
  printf("---------------------------------------------------\n");
  printf("Initialize test data [1, 1, 1...]\n");

  for (int i = 0; i < n_elements; i++) {
    h_data[i] = 1;
  }

  int blockSize = 256;
  int gridSize = n_elements / blockSize;
  int nWarps = blockSize / 32;
  int shmem_sz = nWarps * sizeof(int);  // one int of smem per warp
  int n_partialSums = n_elements / blockSize;
  int partial_sz = n_partialSums * sizeof(int);

  printf("Scan summation for %d elements, %d partial sums\n", n_elements,
         n_elements / blockSize);

  int p_blockSize = min(n_partialSums, blockSize);
  int p_gridSize = iDivUp(n_partialSums, p_blockSize);
  printf("Partial summing %d elements with %d blocks of size %d\n",
         n_partialSums, p_gridSize, p_blockSize);

  // initialize a timer
  cudaEvent_t start, stop;
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));
  float et = 0;
  float inc = 0;

  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_data), sz));
  checkCudaErrors(
      cudaMalloc(reinterpret_cast<void **>(&d_partial_sums), partial_sz));
  checkCudaErrors(cudaMemset(d_partial_sums, 0, partial_sz));

  checkCudaErrors(
      cudaMallocHost(reinterpret_cast<void **>(&h_partial_sums), partial_sz));
  checkCudaErrors(cudaMemcpy(d_data, h_data, sz, cudaMemcpyHostToDevice));

  checkCudaErrors(cudaEventRecord(start, 0));

  // Pass 1: per-block scans; each block's total lands in d_partial_sums.
  shfl_scan_test<<<gridSize, blockSize, shmem_sz>>>(d_data, 32,
                                                    d_partial_sums);
  // Pass 2: scan the block totals themselves.
  shfl_scan_test<<<p_gridSize, p_blockSize, shmem_sz>>>(d_partial_sums, 32);
  // Pass 3: uniformly add the scanned totals. Block 0 needs no offset, so
  // start at d_data + blockSize and launch gridSize - 1 blocks.
  uniform_add<<<gridSize - 1, blockSize>>>(d_data + blockSize, d_partial_sums,
                                           n_elements);
  checkCudaErrors(cudaGetLastError());  // surface launch-config errors

  checkCudaErrors(cudaEventRecord(stop, 0));
  checkCudaErrors(cudaEventSynchronize(stop));
  checkCudaErrors(cudaEventElapsedTime(&inc, start, stop));
  et += inc;

  checkCudaErrors(cudaMemcpy(h_result, d_data, sz, cudaMemcpyDeviceToHost));
  checkCudaErrors(cudaMemcpy(h_partial_sums, d_partial_sums, partial_sz,
                             cudaMemcpyDeviceToHost));

  printf("Test Sum: %d\n", h_partial_sums[n_partialSums - 1]);
  printf("Time (ms): %f\n", et);
  printf("%d elements scanned in %f ms -> %f MegaElements/s\n", n_elements, et,
         n_elements / (et / 1000.0f) / 1000000.0f);

  bool bTestResult = CPUverify(h_data, h_result, n_elements);

  // Release timing events and all host/device allocations.
  checkCudaErrors(cudaEventDestroy(start));
  checkCudaErrors(cudaEventDestroy(stop));
  checkCudaErrors(cudaFreeHost(h_data));
  checkCudaErrors(cudaFreeHost(h_result));
  checkCudaErrors(cudaFreeHost(h_partial_sums));
  checkCudaErrors(cudaFree(d_data));
  checkCudaErrors(cudaFree(d_partial_sums));
  return bTestResult;
}

// This function tests creation of an integral image using
// synthetic data, of size 1920x1080 pixels greyscale.
// Row scans run first (1 block per row), then a vertical scan pass.
// Returns true when the bottom-right corner equals w*h (all-1's input).
bool shuffle_integral_image_test() {
  char *d_data;
  unsigned int *h_image;
  unsigned int *d_integral_image;
  int w = 1920;
  int h = 1080;
  int n_elements = w * h;
  int sz = sizeof(unsigned int) * n_elements;

  printf("\nComputing Integral Image Test on size %d x %d synthetic data\n", w,
         h);
  printf("---------------------------------------------------\n");
  checkCudaErrors(cudaMallocHost(reinterpret_cast<void **>(&h_image), sz));
  // fill test "image" with synthetic 1's data
  memset(h_image, 0, sz);

  // each thread handles 16 values, use 1 block/row
  int blockSize = iDivUp(w, 16);
  // launch 1 block / row
  int gridSize = h;

  // Create a synthetic image for testing: cudaMemset is byte-wise, so every
  // byte of d_data becomes 1 (the row kernel consumes it as byte data).
  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_data), sz));
  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_integral_image),
                             n_elements * sizeof(int) * 4));
  checkCudaErrors(cudaMemset(d_data, 1, sz));
  checkCudaErrors(cudaMemset(d_integral_image, 0, sz));

  cudaEvent_t start, stop;
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));
  float et = 0;
  unsigned int err;

  // Execute scan line prefix sum kernel, and time it
  checkCudaErrors(cudaEventRecord(start));
  shfl_intimage_rows<<<gridSize, blockSize>>>(
      reinterpret_cast<uint4 *>(d_data),
      reinterpret_cast<uint4 *>(d_integral_image));
  checkCudaErrors(cudaGetLastError());
  checkCudaErrors(cudaEventRecord(stop));
  checkCudaErrors(cudaEventSynchronize(stop));
  checkCudaErrors(cudaEventElapsedTime(&et, start, stop));
  printf("Method: Fast Time (GPU Timer): %f ms ", et);

  // verify the scan line results
  checkCudaErrors(
      cudaMemcpy(h_image, d_integral_image, sz, cudaMemcpyDeviceToHost));
  err = verifyDataRowSums(h_image, w, h);
  printf("Diff = %d\n", err);

  // Execute column prefix sum kernel and time it
  dim3 blockSz(32, 8);
  dim3 testGrid(w / blockSz.x, 1);

  checkCudaErrors(cudaEventRecord(start));
  shfl_vertical_shfl<<<testGrid, blockSz>>>((unsigned int *)d_integral_image,
                                            w, h);
  checkCudaErrors(cudaGetLastError());
  checkCudaErrors(cudaEventRecord(stop));
  checkCudaErrors(cudaEventSynchronize(stop));
  checkCudaErrors(cudaEventElapsedTime(&et, start, stop));
  printf("Method: Vertical Scan Time (GPU Timer): %f ms ", et);

  // Verify the column results
  checkCudaErrors(
      cudaMemcpy(h_image, d_integral_image, sz, cudaMemcpyDeviceToHost));
  printf("\n");

  int finalSum = h_image[w * h - 1];
  printf("CheckSum: %d, (expect %dx%d=%d)\n", finalSum, w, h, w * h);

  checkCudaErrors(cudaEventDestroy(start));
  checkCudaErrors(cudaEventDestroy(stop));
  checkCudaErrors(cudaFree(d_data));
  checkCudaErrors(cudaFree(d_integral_image));
  checkCudaErrors(cudaFreeHost(h_image));

  // verify final sum: if the final value in the corner is the same as the size
  // of the buffer (all 1's) then the integral image was generated successfully
  return (finalSum == w * h) ? true : false;
}

int main(int argc, char *argv[]) {
  // Initialization. The shuffle intrinsic is not available on SM < 3.0
  // so waive the test if the hardware is not present.
  int cuda_device = 0;

  printf("Starting shfl_scan\n");

  // use command-line specified CUDA device, otherwise use device with highest
  // Gflops/s
  cuda_device = findCudaDevice(argc, (const char **)argv);

  cudaDeviceProp deviceProp;
  checkCudaErrors(cudaGetDevice(&cuda_device));
  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));

  printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
         deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);

  // __shfl intrinsic needs SM 3.0 or higher
  if (deviceProp.major < 3) {
    printf("> __shfl() intrinsic requires device SM 3.0+\n");
    printf("> Waiving test.\n");
    exit(EXIT_WAIVED);
  }

  bool simpleTest = shuffle_simple_test(argc, argv);
  bool intTest = shuffle_integral_image_test();

  // logical && instead of bitwise & — identical result on bools, clearer
  bool bTestResult = simpleTest && intTest;

  exit((bTestResult) ? EXIT_SUCCESS : EXIT_FAILURE);
}