/* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* This sample demonstrates cub::DeviceTransform in its N-input/M-output
 * form (extended in CCCL 3.3). A single device-wide call reads from
 * N input sequences and writes to M output sequences, driven by a
 * user-provided op that returns a tuple of M values. Two cases are
 * shown: N=3 -> 1 and N=2 -> 2. Results are verified against a host
 * reference.
 */

/* NOTE(review): the header names and template arguments in this file were
 * restored from the symbols and literal types used below (printf, std::vector,
 * EXIT_SUCCESS, checkCudaErrors/findCudaDevice, cub::DeviceTransform,
 * cuda::counting_iterator, cuda::std::tuple, thrust vectors) -- confirm
 * against the upstream sample.
 */

/* Includes, system */
#include <cstdio>
#include <cstdlib>
#include <vector>

/* Includes, cuda */
#include <cuda_runtime.h>
#include <helper_cuda.h>

/* Includes, cccl */
#include <cub/device/device_transform.cuh>
#include <cuda/iterator>
#include <cuda/std/tuple>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

/* N=3 inputs -> 1 output.
 *
 * Computes result[i] = (a[i] + b[i]) * c[i] on the device with a single
 * cub::DeviceTransform::Transform call, where a is an int vector, b is a
 * float vector and c is a counting iterator starting at 100. The device
 * result is copied back, printed, and compared element by element with the
 * same expression evaluated on the host.
 *
 * Returns true when every device element equals its host reference.
 */
static bool run_n_to_one_transform()
{
    /* result[i] = (a[i] + b[i]) * c[i], with c = counting_iterator(100). */
    thrust::device_vector<int>   a = {0, -2, 5, 3};
    thrust::device_vector<float> b = {5.2f, 3.1f, -1.1f, 3.0f};
    auto counting                  = cuda::counting_iterator<int>{100};
    thrust::device_vector<int> result(a.size());

    /* int + float promotes to float; the product is truncated back to int. */
    auto op = [] __host__ __device__(int x, float y, int z) -> int { return static_cast<int>((x + y) * z); };

    checkCudaErrors(cub::DeviceTransform::Transform(
        cuda::std::tuple{a.begin(), b.begin(), counting}, result.begin(), a.size(), op));
    checkCudaErrors(cudaDeviceSynchronize());

    /* Copy inputs and output back to the host to build and check the reference. */
    thrust::host_vector<int>   ha  = a;
    thrust::host_vector<float> hb  = b;
    thrust::host_vector<int>   got = result;

    std::vector<int> expected(a.size());
    for (size_t i = 0; i < a.size(); ++i) {
        /* 100 + i mirrors the i-th value produced by the counting iterator. */
        expected[i] = static_cast<int>((ha[i] + hb[i]) * static_cast<int>(100 + i));
    }

    bool ok = true;
    printf("cub::DeviceTransform::Transform (N=3 inputs -> 1 output)\n");
    printf(" result = (a + b) * c with c = counting_iterator(100)\n");
    printf(" got = {");
    for (size_t i = 0; i < got.size(); ++i) {
        printf(" %d", got[i]);
        if (got[i] != expected[i])
            ok = false;
    }
    printf(" }\n expected = {");
    for (size_t i = 0; i < expected.size(); ++i)
        printf(" %d", expected[i]);
    printf(" } %s\n", ok ? "OK" : "FAIL");
    return ok;
}

/* N=2 inputs -> M=2 outputs.
 *
 * Computes sum[i] = a[i] + b[i] and diff[i] = a[i] - b[i] with a single
 * cub::DeviceTransform::Transform call whose op returns a two-element
 * cuda::std::tuple; each tuple element is written to the matching output
 * sequence. Both outputs are copied back, printed, and compared with a host
 * reference.
 *
 * Returns true when both outputs match the host reference everywhere.
 */
static bool run_n_to_m_transform()
{
    /* (sum[i], diff[i]) = (a[i] + b[i], a[i] - b[i]) in one pass. */
    thrust::device_vector<int> a = {1, 5, 10, 7, 3};
    thrust::device_vector<int> b = {4, 2, 8, 1, 9};
    thrust::device_vector<int> sum(a.size());
    thrust::device_vector<int> diff(a.size());

    auto op = [] __host__ __device__(int x, int y) -> cuda::std::tuple<int, int> { return {x + y, x - y}; };

    checkCudaErrors(cub::DeviceTransform::Transform(cuda::std::tuple{a.begin(), b.begin()},
                                                    cuda::std::tuple{sum.begin(), diff.begin()},
                                                    a.size(),
                                                    op));
    checkCudaErrors(cudaDeviceSynchronize());

    /* Host copies of the inputs and of both device outputs. */
    thrust::host_vector<int> ha = a, hb = b, got_sum = sum, got_diff = diff;

    std::vector<int> exp_sum(a.size()), exp_diff(a.size());
    for (size_t i = 0; i < a.size(); ++i) {
        exp_sum[i]  = ha[i] + hb[i];
        exp_diff[i] = ha[i] - hb[i];
    }

    bool ok = true;
    printf("cub::DeviceTransform::Transform (N=2 inputs -> M=2 outputs)\n");
    printf(" op returns cuda::std::tuple{a + b, a - b}\n");
    printf(" sum = {");
    for (size_t i = 0; i < got_sum.size(); ++i) {
        printf(" %d", got_sum[i]);
        if (got_sum[i] != exp_sum[i])
            ok = false;
    }
    printf(" } expected = {");
    for (auto v : exp_sum)
        printf(" %d", v);
    printf(" }\n diff = {");
    for (size_t i = 0; i < got_diff.size(); ++i) {
        printf(" %d", got_diff[i]);
        if (got_diff[i] != exp_diff[i])
            ok = false;
    }
    printf(" } expected = {");
    for (auto v : exp_diff)
        printf(" %d", v);
    printf(" } %s\n", ok ? "OK" : "FAIL");
    return ok;
}

/* Program entry point.
 *
 * Selects a CUDA device via findCudaDevice, prints its name and compute
 * capability, then runs both transform demonstrations. The process exit
 * code is EXIT_SUCCESS only when both demonstrations verified correctly.
 */
int main(int argc, char **argv)
{
    int devID = findCudaDevice(argc, (const char **)argv);

    cudaDeviceProp props;
    checkCudaErrors(cudaGetDeviceProperties(&props, devID));
    printf("Device: %s (Compute Capability %d.%d)\n\n", props.name, props.major, props.minor);

    /* Both cases always run; ok accumulates their results. */
    bool ok = true;
    ok &= run_n_to_one_transform();
    printf("\n");
    ok &= run_n_to_m_transform();

    printf("\n%s\n", ok ? "Done" : "FAILED");
    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}