Dheemanth b7c5481c55
Release v13.3 of the CUDA samples with CUDA 13.3 Toolkit (#435)
This is the release of the CUDA 13.3 samples, which include additions for CUDA Tile C++, and updated CCCL and Python samples.
2026-05-27 16:50:59 -05:00

247 lines
9.9 KiB
Plaintext

/* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* This sample demonstrates two mdspan-centric features from CCCL 3.3:
*
* 1. DLPack <-> cuda::std::mdspan bridging through
* cuda::to_device_mdspan<T, Rank>(DLTensor) -> cuda::device_mdspan
* cuda::to_dlpack_tensor(device_mdspan) -> DLManagedTensor
* The DLPack format is the interchange protocol used by PyTorch,
* JAX, CuPy, and other frameworks; cuda::device_mdspan is the
* device-side view with rich shape/stride metadata for kernels.
*
* 2. cuda::shared_memory_mdspan: a multi-dimensional view over a
* shared-memory tile. The accessor guarantees shared-memory
* load/store instructions and adds address-space safety checks.
*
* A sample matrix is built on the device, wrapped in a DLTensor,
* converted to a cuda::device_mdspan, and two kernels run against it:
* scale_rows_kernel multiplies row i by (i + 1), and
* shared_tile_transpose_kernel uses a cuda::shared_memory_mdspan to
* transpose a block-sized tile through shared memory. The output
* mdspan is then converted back to DLPack metadata and printed.
*/
/* Includes, system */
#include <stdio.h>
#include <stdlib.h>
#include <vector>
/* Includes, cuda */
#include <cuda_runtime.h>
#include <helper_cuda.h>
/* Includes, cccl */
#include <cuda/mdspan>
#include <cuda/std/array>
#include <cuda/std/cstdint>
#include <cuda/std/mdspan>
/* Dimensions of the sample matrix: main() allocates ROWS * COLS floats for
 * both the input and the output buffer. */
#define ROWS 8
#define COLS 8
/* Edge length of the shared-memory tile and of the thread block used to
 * launch shared_tile_transpose_kernel. */
#define TILE 8 /* matches ROWS / COLS for simplicity */
/* Rank-2 run-time extents indexed with size_t; main() uses this alias for
 * the mdspans it constructs directly from raw device pointers. */
using extents2d = cuda::std::dextents<cuda::std::size_t, 2>;
/* Kernel A: in-place row scaling of a rank-2 mdspan.
 *
 * Launch layout: 2-D grid of 2-D blocks; x indexes columns and y indexes
 * rows. Every thread owns at most one element (row, col) and multiplies it
 * by (row + 1). Threads that fall outside the extents do nothing.
 *
 * The mdspan type is a template parameter so the kernel can take whatever
 * cuda::to_device_mdspan returns (layout_stride_relaxed with int64_t
 * extents) without a conversion. */
template <typename Tensor>
__global__ void scale_rows_kernel(Tensor tensor)
{
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    const int num_rows = static_cast<int>(tensor.extent(0));
    const int num_cols = static_cast<int>(tensor.extent(1));

    /* Tail threads of a grid that over-covers the matrix have no work. */
    if (row >= num_rows || col >= num_cols) {
        return;
    }

    tensor(row, col) *= static_cast<float>(row + 1);
}
/* Kernel B: tiled transpose staged through a cuda::shared_memory_mdspan.
 *
 * Launch layout: one TILE x TILE thread block per tile; blockIdx.x selects
 * the tile column of the input and blockIdx.y the tile row. The kernel
 * uses TILE * TILE floats of static shared memory.
 *
 * Phase 1: thread (y, x) copies in(row, col) into tile(y, x).
 * Phase 2: after a block-wide barrier, thread (y, x) reads tile(x, y) and
 *          stores it to the mirrored position in the output.
 * Both phases are bounds-checked against the respective mdspan extents. */
template <typename InTensor, typename OutTensor>
__global__ void shared_tile_transpose_kernel(InTensor in, OutTensor out)
{
    __shared__ float tile_storage[TILE * TILE];
    cuda::shared_memory_mdspan tile(tile_storage, cuda::std::dextents<cuda::std::size_t, 2>{TILE, TILE});

    const int local_row = threadIdx.y;
    const int local_col = threadIdx.x;

    /* Phase 1: global -> shared. */
    const int src_row = blockIdx.y * TILE + local_row;
    const int src_col = blockIdx.x * TILE + local_col;
    const bool src_in_bounds =
        src_row < static_cast<int>(in.extent(0)) && src_col < static_cast<int>(in.extent(1));
    if (src_in_bounds) {
        tile(local_row, local_col) = in(src_row, src_col);
    }

    /* The barrier sits outside both conditionals so that every thread of
     * the block reaches it, whether or not it loaded an element. */
    __syncthreads();

    /* Phase 2: shared -> global, with block indices swapped so the tile
     * lands at its transposed location. */
    const int dst_row = blockIdx.x * TILE + local_row;
    const int dst_col = blockIdx.y * TILE + local_col;
    const bool dst_in_bounds =
        dst_row < static_cast<int>(out.extent(0)) && dst_col < static_cast<int>(out.extent(1));
    if (dst_in_bounds) {
        out(dst_row, dst_col) = tile(local_col, local_row);
    }
}
/* Bundles a DLTensor with the shape/stride arrays it refers to.
 *
 * DLTensor::shape and DLTensor::strides are plain pointers, so the arrays
 * they point at must live at least as long as the tensor. Keeping them in
 * the same object ties the lifetimes together.
 *
 * Because the tensor may point into this very object, a member-wise copy
 * would leave the copy's tensor pointing at the *source* object's arrays
 * (dangling once the source is destroyed, e.g. when a function returns a
 * local DLTensorStorage by value and the compiler does not elide the copy).
 * The copy operations below therefore re-point shape/strides at the
 * destination's own arrays whenever the source was self-referential.
 * Pointers that refer to external storage (or are null) are copied as-is. */
struct DLTensorStorage
{
    ::DLTensor tensor{};
    cuda::std::array<cuda::std::int64_t, 2> shape{};
    cuda::std::array<cuda::std::int64_t, 2> strides{};

    DLTensorStorage() = default;

    DLTensorStorage(const DLTensorStorage &other)
        : tensor(other.tensor)
        , shape(other.shape)
        , strides(other.strides)
    {
        rebind_from(other);
    }

    DLTensorStorage &operator=(const DLTensorStorage &other)
    {
        if (this != &other) {
            tensor = other.tensor;
            shape = other.shape;
            strides = other.strides;
            rebind_from(other);
        }
        return *this;
    }

private:
    /* Redirect tensor.shape / tensor.strides to this object's arrays when
     * `other`'s tensor pointed at `other`'s own arrays. */
    void rebind_from(const DLTensorStorage &other)
    {
        if (other.tensor.shape == other.shape.data()) {
            tensor.shape = shape.data();
        }
        if (other.tensor.strides == other.strides.data()) {
            tensor.strides = strides.data();
        }
    }
};
/* Describe a dense row-major rows x cols float32 matrix in device memory
 * as a DLTensor.
 *
 * device_ptr     - start of the matrix data (stored as DLTensor::data)
 * rows, cols     - matrix shape; strides are {cols, 1} in elements
 * device_ordinal - CUDA device id recorded in DLTensor::device
 *
 * Returns a DLTensorStorage whose tensor.shape / tensor.strides are set to
 * the shape / strides arrays held in the same DLTensorStorage object. */
static DLTensorStorage make_row_major_dltensor(float *device_ptr, int rows, int cols, int device_ordinal)
{
    DLTensorStorage storage;

    /* Fill in the backing arrays first ... */
    storage.shape   = {rows, cols};
    storage.strides = {cols, 1};

    /* ... then describe them through the embedded DLTensor. */
    ::DLTensor &desc = storage.tensor;
    desc.data        = device_ptr;
    desc.device      = ::DLDevice{::kDLCUDA, device_ordinal};
    desc.ndim        = 2;
    desc.dtype       = ::DLDataType{::DLDataTypeCode::kDLFloat, 32, 1};
    desc.shape       = storage.shape.data();
    desc.strides     = storage.strides.data();
    desc.byte_offset = 0;

    return storage;
}
/* Sample driver.
 *
 *  1. Select a device and allocate two ROWS x COLS float buffers.
 *  2. Fill the input with host[r * COLS + c] = r * COLS + c; zero the output.
 *  3. Wrap both buffers in DLTensors and convert them to device mdspans
 *     with cuda::to_device_mdspan.
 *  4. Run scale_rows_kernel on the input and verify it on the host.
 *  5. Run shared_tile_transpose_kernel (input -> output) and verify that
 *     the output is the transpose of the scaled input.
 *  6. Convert the output mdspan back with cuda::to_dlpack_tensor and print
 *     the resulting DLPack metadata.
 *
 * Returns EXIT_SUCCESS when both verifications pass, EXIT_FAILURE
 * otherwise. Device buffers are released in either case. */
int main(int argc, char **argv)
{
    int devID = findCudaDevice(argc, (const char **)argv);

    cudaDeviceProp props;
    checkCudaErrors(cudaGetDeviceProperties(&props, devID));
    printf("Device: %s (Compute Capability %d.%d)\n\n", props.name, props.major, props.minor);

    /* ---- Allocate and initialise device buffers ------------------------ */
    float       *d_in  = nullptr;
    float       *d_out = nullptr;
    const size_t nelem = static_cast<size_t>(ROWS) * COLS;

    checkCudaErrors(cudaMalloc(&d_in, nelem * sizeof(float)));
    checkCudaErrors(cudaMalloc(&d_out, nelem * sizeof(float)));

    std::vector<float> host(nelem);
    for (int r = 0; r < ROWS; ++r) {
        for (int c = 0; c < COLS; ++c) {
            host[r * COLS + c] = static_cast<float>(r * COLS + c);
        }
    }

    checkCudaErrors(cudaMemcpy(d_in, host.data(), nelem * sizeof(float), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemset(d_out, 0, nelem * sizeof(float)));

    /* ---- DLPack -> device_mdspan ---------------------------------------- */
    DLTensorStorage in_dl  = make_row_major_dltensor(d_in, ROWS, COLS, devID);
    DLTensorStorage out_dl = make_row_major_dltensor(d_out, ROWS, COLS, devID);

    auto in_md  = cuda::to_device_mdspan<float, 2>(in_dl.tensor);
    auto out_md = cuda::to_device_mdspan<float, 2>(out_dl.tensor);

    /* The mdspan produced by cuda::to_device_mdspan uses int64_t extents,
     * so convert explicitly to the size_t that %zu expects. */
    printf("cuda::to_device_mdspan produced a 2-D device_mdspan of shape (%zu, %zu)\n\n",
           static_cast<size_t>(in_md.extent(0)),
           static_cast<size_t>(in_md.extent(1)));

    /* ---- Kernel A: scale rows in place ---------------------------------- */
    dim3 block(8, 8);
    dim3 grid((COLS + block.x - 1) / block.x, (ROWS + block.y - 1) / block.y);

    scale_rows_kernel<<<grid, block>>>(in_md);
    checkCudaErrors(cudaGetLastError());      /* launch-configuration errors */
    checkCudaErrors(cudaDeviceSynchronize()); /* errors raised while running */

    std::vector<float> scaled(nelem);
    checkCudaErrors(cudaMemcpy(scaled.data(), d_in, nelem * sizeof(float), cudaMemcpyDeviceToHost));

    /* Compare against the host reference; stop at the first mismatch. */
    bool scale_ok = true;
    for (int r = 0; r < ROWS && scale_ok; ++r) {
        for (int c = 0; c < COLS && scale_ok; ++c) {
            const float expect = static_cast<float>((r * COLS + c) * (r + 1));
            if (scaled[r * COLS + c] != expect) {
                printf("scale_rows mismatch at (%d,%d): got %g expected %g\n",
                       r,
                       c,
                       scaled[r * COLS + c],
                       expect);
                scale_ok = false;
            }
        }
    }
    if (scale_ok) {
        printf("scale_rows kernel: OK (row i scaled by i+1 via cuda::device_mdspan)\n");
    }

    /* ---- Kernel B: tile transpose through shared memory ----------------- */
    /* These mdspans are built directly from the raw pointers; the input
     * view is read-only (const float). */
    cuda::device_mdspan<const float, extents2d> in_md_const(d_in, extents2d{ROWS, COLS});
    cuda::device_mdspan<float, extents2d>       out_md_rw(d_out, extents2d{ROWS, COLS});

    dim3 tile_block(TILE, TILE);
    dim3 tile_grid((COLS + TILE - 1) / TILE, (ROWS + TILE - 1) / TILE);

    shared_tile_transpose_kernel<<<tile_grid, tile_block>>>(in_md_const, out_md_rw);
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());

    std::vector<float> transposed(nelem);
    checkCudaErrors(cudaMemcpy(transposed.data(), d_out, nelem * sizeof(float), cudaMemcpyDeviceToHost));

    /* The reference is the scaled input read back above, indexed with
     * row and column swapped. */
    bool tp_ok = true;
    for (int r = 0; r < ROWS && tp_ok; ++r) {
        for (int c = 0; c < COLS && tp_ok; ++c) {
            const float expect = scaled[c * COLS + r];
            if (transposed[r * COLS + c] != expect) {
                printf("transpose mismatch at (%d,%d): got %g expected %g\n",
                       r,
                       c,
                       transposed[r * COLS + c],
                       expect);
                tp_ok = false;
            }
        }
    }
    if (tp_ok) {
        printf("shared_tile_transpose kernel: OK (tile transpose via cuda::shared_memory_mdspan)\n");
    }

    /* ---- device_mdspan -> DLPack ---------------------------------------- */
    auto        dl_wrapper = cuda::to_dlpack_tensor(out_md);
    const auto &dltensor   = dl_wrapper.get();

    printf("\ncuda::to_dlpack_tensor metadata:\n");
    printf(" device : kDLCUDA (ordinal %d)\n", dltensor.device.device_id);
    printf(" ndim : %d\n", dltensor.ndim);
    printf(" dtype : code=%u bits=%u lanes=%u\n",
           static_cast<unsigned>(dltensor.dtype.code),
           static_cast<unsigned>(dltensor.dtype.bits),
           static_cast<unsigned>(dltensor.dtype.lanes));
    printf(" shape : [%lld, %lld]\n",
           static_cast<long long>(dltensor.shape[0]),
           static_cast<long long>(dltensor.shape[1]));
    /* strides is only printed when the tensor carries a strides pointer. */
    if (dltensor.strides != nullptr) {
        printf(" strides : [%lld, %lld]\n",
               static_cast<long long>(dltensor.strides[0]),
               static_cast<long long>(dltensor.strides[1]));
    }

    /* ---- Cleanup and exit status ---------------------------------------- */
    checkCudaErrors(cudaFree(d_in));
    checkCudaErrors(cudaFree(d_out));

    if (!scale_ok || !tp_ok) {
        return EXIT_FAILURE;
    }

    printf("\nDone\n");
    return EXIT_SUCCESS;
}