mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2024-11-24 19:29:14 +08:00
178 lines
6.0 KiB
Plaintext
178 lines
6.0 KiB
Plaintext
/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
* contributors may be used to endorse or promote products derived
|
|
* from this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
/**
|
|
*
|
|
* This sample is a simple code that illustrates basic usage of
|
|
* cooperative groups within the thread block. The code launches a single
|
|
* thread block, creates a cooperative group of all threads in the block,
|
|
* and a set of tiled partition cooperative groups. For each, it uses a
|
|
* generic reduction function to calculate the sum of all the ranks in
|
|
* that group. In each case the result is printed, together with the
|
|
* expected answer (which is calculated using the analytical formula
|
|
* (n-1)*n)/2, noting that the ranks start at zero).
|
|
*
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include <cooperative_groups.h>
|
|
|
|
using namespace cooperative_groups;
|
|
|
|
/**
|
|
* CUDA device function
|
|
*
|
|
* calculates the sum of val across the group g. The workspace array, x,
|
|
* must be large enough to contain g.size() integers.
|
|
*/
|
|
__device__ int sumReduction(thread_group g, int *x, int val) {
|
|
// rank of this thread in the group
|
|
int lane = g.thread_rank();
|
|
|
|
// for each iteration of this loop, the number of threads active in the
|
|
// reduction, i, is halved, and each active thread (with index [lane])
|
|
// performs a single summation of it's own value with that
|
|
// of a "partner" (with index [lane+i]).
|
|
for (int i = g.size() / 2; i > 0; i /= 2) {
|
|
// store value for this thread in temporary array
|
|
x[lane] = val;
|
|
|
|
// synchronize all threads in group
|
|
g.sync();
|
|
|
|
if (lane < i)
|
|
// active threads perform summation of their value with
|
|
// their partner's value
|
|
val += x[lane + i];
|
|
|
|
// synchronize all threads in group
|
|
g.sync();
|
|
}
|
|
|
|
// master thread in group returns result, and others return -1.
|
|
if (g.thread_rank() == 0)
|
|
return val;
|
|
else
|
|
return -1;
|
|
}
|
|
|
|
/**
|
|
* CUDA kernel device code
|
|
*
|
|
* Creates cooperative groups and performs reductions
|
|
*/
|
|
__global__ void cgkernel() {
|
|
// threadBlockGroup includes all threads in the block
|
|
thread_block threadBlockGroup = this_thread_block();
|
|
int threadBlockGroupSize = threadBlockGroup.size();
|
|
|
|
// workspace array in shared memory required for reduction
|
|
extern __shared__ int workspace[];
|
|
|
|
int input, output, expectedOutput;
|
|
|
|
// input to reduction, for each thread, is its' rank in the group
|
|
input = threadBlockGroup.thread_rank();
|
|
|
|
// expected output from analytical formula (n-1)(n)/2
|
|
// (noting that indexing starts at 0 rather than 1)
|
|
expectedOutput = (threadBlockGroupSize - 1) * threadBlockGroupSize / 2;
|
|
|
|
// perform reduction
|
|
output = sumReduction(threadBlockGroup, workspace, input);
|
|
|
|
// master thread in group prints out result
|
|
if (threadBlockGroup.thread_rank() == 0) {
|
|
printf(
|
|
" Sum of all ranks 0..%d in threadBlockGroup is %d (expected %d)\n\n",
|
|
(int)threadBlockGroup.size() - 1, output, expectedOutput);
|
|
|
|
printf(" Now creating %d groups, each of size 16 threads:\n\n",
|
|
(int)threadBlockGroup.size() / 16);
|
|
}
|
|
|
|
threadBlockGroup.sync();
|
|
|
|
// each tiledPartition16 group includes 16 threads
|
|
thread_block_tile<16> tiledPartition16 =
|
|
tiled_partition<16>(threadBlockGroup);
|
|
|
|
// This offset allows each group to have its own unique area in the workspace
|
|
// array
|
|
int workspaceOffset =
|
|
threadBlockGroup.thread_rank() - tiledPartition16.thread_rank();
|
|
|
|
// input to reduction, for each thread, is its' rank in the group
|
|
input = tiledPartition16.thread_rank();
|
|
|
|
// expected output from analytical formula (n-1)(n)/2
|
|
// (noting that indexing starts at 0 rather than 1)
|
|
expectedOutput = 15 * 16 / 2;
|
|
|
|
// Perform reduction
|
|
output = sumReduction(tiledPartition16, workspace + workspaceOffset, input);
|
|
|
|
// each master thread prints out result
|
|
if (tiledPartition16.thread_rank() == 0)
|
|
printf(
|
|
" Sum of all ranks 0..15 in this tiledPartition16 group is %d "
|
|
"(expected %d)\n",
|
|
output, expectedOutput);
|
|
|
|
return;
|
|
}
|
|
|
|
/**
|
|
* Host main routine
|
|
*/
|
|
int main() {
|
|
// Error code to check return values for CUDA calls
|
|
cudaError_t err;
|
|
|
|
// Launch the kernel
|
|
|
|
int blocksPerGrid = 1;
|
|
int threadsPerBlock = 64;
|
|
|
|
printf("\nLaunching a single block with %d threads...\n\n", threadsPerBlock);
|
|
|
|
// we use the optional third argument to specify the size
|
|
// of shared memory required in the kernel
|
|
cgkernel<<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(int)>>>();
|
|
err = cudaDeviceSynchronize();
|
|
|
|
if (err != cudaSuccess) {
|
|
fprintf(stderr, "Failed to launch kernel (error code %s)!\n",
|
|
cudaGetErrorString(err));
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
printf("\n...Done.\n\n");
|
|
|
|
return 0;
|
|
}
|