mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2026-06-04 00:06:52 +08:00
This is the release of the CUDA 13.3 samples, which include additions for CUDA Tile C++, and updated CCCL and Python samples.
189 lines
7.8 KiB
Plaintext
189 lines
7.8 KiB
Plaintext
/* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
* contributors may be used to endorse or promote products derived
|
|
* from this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
/* This sample demonstrates cub::DeviceSegmentedScan, added in CCCL 3.3.
 * Two operations are shown: ExclusiveSegmentedSum across three independent
 * segments, and InclusiveSegmentedScan with a custom binary operator
 * (running maximum via cuda::maximum<>). Each is verified against a
 * host reference implementation.
 */
|
|
|
|
/* Includes, system */
|
|
#include <algorithm>
|
|
#include <limits>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <vector>
|
|
|
|
/* Includes, cuda */
|
|
#include <cuda_runtime.h>
|
|
#include <helper_cuda.h>
|
|
|
|
/* Includes, cccl */
|
|
#include <cub/device/device_segmented_scan.cuh>
|
|
#include <cuda/functional>
|
|
#include <thrust/device_vector.h>
|
|
#include <thrust/host_vector.h>
|
|
|
|
/* Prints one labelled row of the form " <label>{ v0 v1 ... }" to stdout.
 *
 * label  text placed in a left-justified, 24-character-wide column
 * v      values to print; each element is converted to int and printed
 *        with %d, so element types wider than int are narrowed first
 */
template <typename T>
static void print_vec(const char *label, const std::vector<T> &v)
{
    printf(" %-24s{", label);
    for (const auto &element : v) {
        const int as_int = static_cast<int>(element);
        printf(" %d", as_int);
    }
    printf(" }\n");
}
|
|
|
|
/* Host reference for an exclusive segmented prefix sum.
 *
 * input    values to scan
 * offsets  segment boundaries: segment s covers the index range
 *          [offsets[s], offsets[s + 1]) of input, so offsets.size() - 1
 *          segments are processed (none when offsets has fewer than two
 *          entries)
 *
 * Returns a vector of input.size() elements. Within each segment, element i
 * holds the sum of the segment's elements that precede i, so the first
 * element of every segment is 0. Elements not covered by any segment stay 0.
 *
 * The end of every segment is clamped to input.size(), so an offset that
 * points past the end of the input cannot cause an out-of-bounds access.
 */
static std::vector<int> host_exclusive_segmented_sum(const std::vector<int> &input, const std::vector<size_t> &offsets)
{
    const size_t n = input.size();
    std::vector<int> out(n, 0);
    for (size_t s = 0; s + 1 < offsets.size(); ++s) {
        /* Clamp so that malformed offsets never index past input/out. */
        const size_t seg_end = std::min(offsets[s + 1], n);
        int running = 0; /* the running sum restarts at every segment */
        for (size_t i = offsets[s]; i < seg_end; ++i) {
            out[i] = running; /* exclusive: store before adding input[i] */
            running += input[i];
        }
    }
    return out;
}
|
|
|
|
/* Host reference for an inclusive segmented running maximum.
 *
 * input    values to scan
 * offsets  segment boundaries: segment s covers the index range
 *          [offsets[s], offsets[s + 1]) of input, so offsets.size() - 1
 *          segments are processed (none when offsets has fewer than two
 *          entries)
 *
 * Returns a vector of input.size() elements. Within each segment, element i
 * holds the largest value seen in that segment up to and including i.
 * Elements not covered by any segment stay 0.
 *
 * The end of every segment is clamped to input.size(), so an offset that
 * points past the end of the input cannot cause an out-of-bounds access.
 */
static std::vector<int> host_inclusive_segmented_max(const std::vector<int> &input, const std::vector<size_t> &offsets)
{
    const size_t n = input.size();
    std::vector<int> out(n, 0);
    for (size_t s = 0; s + 1 < offsets.size(); ++s) {
        /* Clamp so that malformed offsets never index past input/out. */
        const size_t seg_end = std::min(offsets[s + 1], n);
        /* Start from the smallest int so the first element always wins. */
        int running = std::numeric_limits<int>::min();
        for (size_t i = offsets[s]; i < seg_end; ++i) {
            running = std::max(running, input[i]);
            out[i] = running; /* inclusive: store after folding in input[i] */
        }
    }
    return out;
}
|
|
|
|
/* Runs cub::DeviceSegmentedScan::ExclusiveSegmentedSum on a small fixed
 * input, prints the input, offsets, device result and host reference, and
 * reports whether the device result equals the reference.
 *
 * Returns true when the two result vectors compare equal.
 */
static bool run_exclusive_segmented_sum()
{
    /* Three segments: [1,2,3] [4,5] [6,7,8] are expected to scan to
     * [0,1,3] [0,4] [0,6,13]. */
    thrust::device_vector<int> d_in = {1, 2, 3, 4, 5, 6, 7, 8};
    thrust::device_vector<size_t> d_offsets = {0, 3, 5, 8};
    thrust::device_vector<int> d_out(d_in.size());

    /* One offsets array serves both roles: the end-offset iterator is the
     * begin-offset iterator advanced by one entry. */
    auto seg_first = d_offsets.begin();
    auto seg_last = d_offsets.begin() + 1;
    const auto segment_count = d_offsets.size() - 1;

    /* First call passes a null temp-storage pointer; the byte count it
     * leaves in scratch_bytes is then used to size the scratch buffer. */
    size_t scratch_bytes = 0;
    checkCudaErrors(cub::DeviceSegmentedScan::ExclusiveSegmentedSum(
        nullptr, scratch_bytes, d_in.begin(), d_out.begin(), seg_first, seg_last, segment_count));

    /* Second call repeats the same arguments with the allocated buffer. */
    thrust::device_vector<char> scratch(scratch_bytes);
    char *d_scratch = thrust::raw_pointer_cast(scratch.data());
    checkCudaErrors(cub::DeviceSegmentedScan::ExclusiveSegmentedSum(
        d_scratch, scratch_bytes, d_in.begin(), d_out.begin(), seg_first, seg_last, segment_count));
    checkCudaErrors(cudaDeviceSynchronize());

    /* Copy everything to host vectors for printing and comparison. */
    std::vector<int> got(d_out.begin(), d_out.end());
    std::vector<int> h_in(d_in.begin(), d_in.end());
    std::vector<size_t> h_off(d_offsets.begin(), d_offsets.end());
    std::vector<int> expected = host_exclusive_segmented_sum(h_in, h_off);

    printf("cub::DeviceSegmentedScan::ExclusiveSegmentedSum\n");
    print_vec("input:", h_in);

    /* Offsets are size_t, so they are printed here with %zu rather than
     * through print_vec, which converts every element to int. */
    printf(" %-24s{", "offsets:");
    for (size_t k = 0; k < h_off.size(); ++k) {
        printf(" %zu", h_off[k]);
    }
    printf(" }\n");

    print_vec("got:", got);
    print_vec("expected:", expected);

    const bool matches = (got == expected);
    const char *verdict = matches ? "OK" : "FAIL";
    printf(" %s\n", verdict);
    return matches;
}
|
|
|
|
/* Runs cub::DeviceSegmentedScan::InclusiveSegmentedScan with a maximum
 * operator on a small fixed input, prints the input, offsets, device result
 * and host reference, and reports whether the device result equals the
 * reference.
 *
 * Returns true when the two result vectors compare equal.
 */
static bool run_inclusive_segmented_max()
{
    /* Three segments delimited by offsets {0, 3, 5, 8}; a running maximum
     * is computed independently inside each one. */
    thrust::device_vector<int> d_in = {3, 1, 4, 5, 2, 9, 7, 8};
    thrust::device_vector<size_t> d_offsets = {0, 3, 5, 8};
    thrust::device_vector<int> d_out(d_in.size());

    /* One offsets array serves both roles: the end-offset iterator is the
     * begin-offset iterator advanced by one entry. */
    auto seg_first = d_offsets.begin();
    auto seg_last = d_offsets.begin() + 1;
    const auto segment_count = d_offsets.size() - 1;

    /* Binary scan operator: forwards both operands to cuda::maximum<>. */
    auto running_max = [] __host__ __device__(int lhs, int rhs) -> int { return cuda::maximum<>{}(lhs, rhs); };

    /* First call passes a null temp-storage pointer; the byte count it
     * leaves in scratch_bytes is then used to size the scratch buffer. */
    size_t scratch_bytes = 0;
    checkCudaErrors(cub::DeviceSegmentedScan::InclusiveSegmentedScan(
        nullptr, scratch_bytes, d_in.begin(), d_out.begin(), seg_first, seg_last, segment_count, running_max));

    /* Second call repeats the same arguments with the allocated buffer. */
    thrust::device_vector<char> scratch(scratch_bytes);
    char *d_scratch = thrust::raw_pointer_cast(scratch.data());
    checkCudaErrors(cub::DeviceSegmentedScan::InclusiveSegmentedScan(
        d_scratch, scratch_bytes, d_in.begin(), d_out.begin(), seg_first, seg_last, segment_count, running_max));
    checkCudaErrors(cudaDeviceSynchronize());

    /* Copy everything to host vectors for printing and comparison. */
    std::vector<int> got(d_out.begin(), d_out.end());
    std::vector<int> h_in(d_in.begin(), d_in.end());
    std::vector<size_t> h_off(d_offsets.begin(), d_offsets.end());
    std::vector<int> expected = host_inclusive_segmented_max(h_in, h_off);

    printf("cub::DeviceSegmentedScan::InclusiveSegmentedScan (running max)\n");
    print_vec("input:", h_in);

    /* Offsets are size_t, so they are printed here with %zu rather than
     * through print_vec, which converts every element to int. */
    printf(" %-24s{", "offsets:");
    for (size_t k = 0; k < h_off.size(); ++k) {
        printf(" %zu", h_off[k]);
    }
    printf(" }\n");

    print_vec("got:", got);
    print_vec("expected:", expected);

    const bool matches = (got == expected);
    const char *verdict = matches ? "OK" : "FAIL";
    printf(" %s\n", verdict);
    return matches;
}
|
|
|
|
/* Entry point: selects a CUDA device via findCudaDevice, prints its name and
 * compute capability, then runs both segmented-scan demonstrations.
 *
 * Returns EXIT_SUCCESS when both demonstrations report success and
 * EXIT_FAILURE otherwise.
 */
int main(int argc, char **argv)
{
    const int devID = findCudaDevice(argc, const_cast<const char **>(argv));

    cudaDeviceProp props;
    checkCudaErrors(cudaGetDeviceProperties(&props, devID));
    printf("Device: %s (Compute Capability %d.%d)\n\n", props.name, props.major, props.minor);

    /* Both demonstrations always run, even if the first one fails. */
    const bool sum_ok = run_exclusive_segmented_sum();
    printf("\n");
    const bool max_ok = run_inclusive_segmented_max();

    const bool all_ok = sum_ok && max_ok;
    if (all_ok) {
        printf("\n%s\n", "Done");
        return EXIT_SUCCESS;
    }
    printf("\n%s\n", "FAILED");
    return EXIT_FAILURE;
}
|