Dheemanth b7c5481c55
Release v13.3 of the CUDA samples with CUDA 13.3 Toolkit (#435)
This is the release of the CUDA 13.3 samples, which include additions for CUDA Tile C++, and updated CCCL and Python samples.
2026-05-27 16:50:59 -05:00

495 lines
20 KiB
Plaintext

/* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* This sample demonstrates sparse matrix-vector multiplication
* (SpMV) using CUDA Tile C++.
*
* y = A * x
*
* The matrix is built directly on the host in Sliced ELLPACK (SELL)
* format — the format the Tile kernel actually reads. SELL is the
* same idea as ELLPACK applied per-slice: rows are grouped into
* slices of SLICE_ROWS consecutive rows (sorted by length to
* minimize padding) and stored column-major so that "the k-th
* nonzero of every row in the slice" occupies contiguous memory.
*
* slice s contains rows row_perm[s*SLICE_ROWS .. s*SLICE_ROWS + SLICE_ROWS)
* sell_data[slice_offsets[s] + k * SLICE_ROWS + r] is the k-th
* nonzero of the r-th row of slice s, with padding (column = 0,
* value = 0.0f) when the row has fewer than slice_widths[s]
* nonzeros
*
* Each CTA processes one slice using a 2D tile of shape<ROWS, COLS>:
*
* - Dimension 0 (ROWS): the rows of the slice (one tile row per
* matrix row in the slice)
* - Dimension 1 (COLS): the next COLS nonzeros of every row in
* the slice, processed simultaneously
*
* Because of the column-major slice layout, "load the next COLS
* nonzeros for every row" is a contiguous, fully coalesced load.
* This is the property that distinguishes SELL from a row-by-row
* CSR kernel: CSR forces the kernel to gather column indices and
* values from disjoint per-row address streams. The kernel computes
* partial products against the x-vector (an irreducible gather),
* accumulates into a 2D tile, then reduces along the column
* dimension with 'cuda::tiles::sum(acc, 1_ic)' to produce one sum
* per row, and scatters the sums to y using row_perm.
*/
#include "helper_cuda.h"
#include "cuda_tile.h"
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <numeric>
#include <random>
#include <vector>
namespace ct = cuda::tiles;
//=============================================================================
// Tile SpMV kernel: 2D SELL SpMV
//
// Each CTA processes one slice of ROWS rows. The inner loop walks
// the slice's nonzeros COLS at a time. Because the SELL arrays are
// laid out column-major within a slice, the per-iteration load of
// 'sell_col_indices' and 'sell_values' is a contiguous load of
// ROWS*COLS contiguous elements — i.e. perfectly coalesced — even
// though the underlying rows have wildly different lengths.
//=============================================================================
template <int ROWS, int COLS, int OCCUPANCY = 8>
[[cutile::hint(0, occupancy = OCCUPANCY)]]
__tile_global__ void spmvSell(int num_rows,
const int* __restrict__ sell_col_indices,
const float* __restrict__ sell_values,
const int* __restrict__ slice_offsets,
const int* __restrict__ slice_widths,
const int* __restrict__ row_perm,
const float* __restrict__ vector_x,
float* __restrict__ vector_y) {
using namespace ct::literals;
using Tile2D = ct::tile<float, ct::shape<ROWS, COLS>>;
using RowI = ct::tile<int, ct::shape<ROWS>>;
using ColI = ct::tile<int, ct::shape<COLS>>;
sell_col_indices = ct::assume_aligned<16>(sell_col_indices);
sell_values = ct::assume_aligned<16>(sell_values);
vector_x = ct::assume_aligned<16>(vector_x);
int slice = ct::bid().x;
int row_base = slice * ROWS;
auto local_row = ct::iota<RowI>();
auto row_valid = (row_base + local_row) < num_rows;
/* destination row in y for each lane of the slice */
auto actual_row = ct::load_masked(row_perm + row_base + local_row,
row_valid, 0);
int offset = slice_offsets[slice];
int width = slice_widths[slice];
/* Build 2D index tiles. row_2d broadcasts the in-slice row index
* along the column dimension; col_base_2d broadcasts the COLS
* lanes along the row dimension. The SELL element address is
* offset + (k + col) * ROWS + r
* which becomes the body of the inner loop below. */
auto row_2d = ct::broadcast(
ct::reshape(local_row, ct::shape<ROWS, 1>{}), ct::shape<ROWS, COLS>{});
auto col_base = ct::iota<ColI>();
auto row_valid_2d = ct::broadcast(
ct::reshape(row_valid, ct::shape<ROWS, 1>{}), ct::shape<ROWS, COLS>{});
auto col_base_2d = ct::broadcast(
ct::reshape(col_base, ct::shape<1, COLS>{}), ct::shape<ROWS, COLS>{});
Tile2D acc = ct::zeros<Tile2D>();
/* Loop-split: 'full_width' iterations need no per-element column
* mask (every lane is within 'width'), and the optional trailing
* iteration uses a mask. Eliminating the mask from the hot loop
* saves a predicate evaluation per element per iteration. */
int full_width = (width / COLS) * COLS;
#pragma unroll 1
for (int k = 0; k < full_width; k += COLS) {
auto sell_idx = offset + (k + col_base_2d) * ROWS + row_2d;
auto cols_2d = ct::load_masked(sell_col_indices + sell_idx,
row_valid_2d, 0);
auto vals_2d = ct::load_masked(sell_values + sell_idx,
row_valid_2d, 0.0f);
auto x_2d = ct::load_masked(vector_x + cols_2d, row_valid_2d, 0.0f);
acc = acc + vals_2d * x_2d;
}
if (full_width < width) {
auto col_offsets_2d = full_width + col_base_2d;
auto valid = row_valid_2d & (col_offsets_2d < width);
auto sell_idx = offset + col_offsets_2d * ROWS + row_2d;
auto cols_2d = ct::load_masked(sell_col_indices + sell_idx, valid, 0);
auto vals_2d = ct::load_masked(sell_values + sell_idx, valid, 0.0f);
auto x_2d = ct::load_masked(vector_x + cols_2d, valid, 0.0f);
acc = acc + vals_2d * x_2d;
}
/* Reduce along the column dimension to get one sum per slice row,
* then scatter to the destination row in y. */
auto row_sums = ct::sum(acc, 1_ic);
ct::store_masked(vector_y + actual_row,
ct::reshape(row_sums, ct::shape<ROWS>{}), row_valid);
}
//=============================================================================
// Tile shape configuration
//
// The kernel is templated on (ROWS, COLS); ROWS is also the slice
// size in the SELL packing. We use a single shape sized for the
// random matrix generated below (~16 nonzeros per row on average).
//=============================================================================
constexpr int SLICE_ROWS = 64;
constexpr int TILE_COLS = 16;
//=============================================================================
// Sliced ELLPACK (SELL) matrix
//
// Layout:
// slice s covers row_perm[s*SLICE_ROWS .. s*SLICE_ROWS + SLICE_ROWS)
// slice_widths[s] = max( nnz_per_row[row] for row in slice s )
// slice_offsets[s] = sum_{t<s} slice_widths[t] * SLICE_ROWS
// sell_col_indices[slice_offsets[s] + k * SLICE_ROWS + r] =
// column index of the k-th nonzero of slice s, row r
// (or 0 if that slot is padding)
// sell_values[...] same but for the value
//=============================================================================
struct SellMatrix {
int num_rows = 0;
int num_cols = 0;
int num_slices = 0;
int nnz_total = 0; /* non-padding entries only */
std::size_t total_sell_entries = 0; /* including padding */
std::vector<int> row_perm; /* size num_slices * SLICE_ROWS */
std::vector<int> slice_offsets; /* size num_slices */
std::vector<int> slice_widths; /* size num_slices */
std::vector<int> row_lengths; /* size num_slices * SLICE_ROWS, padded */
std::vector<int> sell_col_indices;
std::vector<float> sell_values;
int* d_row_perm = nullptr;
int* d_slice_offsets = nullptr;
int* d_slice_widths = nullptr;
int* d_sell_col_indices = nullptr;
float* d_sell_values = nullptr;
void uploadToDevice() {
checkCudaErrors(cudaMalloc(&d_row_perm, row_perm.size() * sizeof(int)));
checkCudaErrors(cudaMalloc(&d_slice_offsets,
slice_offsets.size() * sizeof(int)));
checkCudaErrors(cudaMalloc(&d_slice_widths,
slice_widths.size() * sizeof(int)));
checkCudaErrors(cudaMalloc(&d_sell_col_indices,
sell_col_indices.size() * sizeof(int)));
checkCudaErrors(cudaMalloc(&d_sell_values,
sell_values.size() * sizeof(float)));
checkCudaErrors(cudaMemcpy(d_row_perm, row_perm.data(),
row_perm.size() * sizeof(int),
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_slice_offsets, slice_offsets.data(),
slice_offsets.size() * sizeof(int),
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_slice_widths, slice_widths.data(),
slice_widths.size() * sizeof(int),
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_sell_col_indices, sell_col_indices.data(),
sell_col_indices.size() * sizeof(int),
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_sell_values, sell_values.data(),
sell_values.size() * sizeof(float),
cudaMemcpyHostToDevice));
}
void freeDevice() {
if (d_row_perm) checkCudaErrors(cudaFree(d_row_perm));
if (d_slice_offsets) checkCudaErrors(cudaFree(d_slice_offsets));
if (d_slice_widths) checkCudaErrors(cudaFree(d_slice_widths));
if (d_sell_col_indices) checkCudaErrors(cudaFree(d_sell_col_indices));
if (d_sell_values) checkCudaErrors(cudaFree(d_sell_values));
d_row_perm = nullptr;
d_slice_offsets = nullptr;
d_slice_widths = nullptr;
d_sell_col_indices = nullptr;
d_sell_values = nullptr;
}
};
/* Build a SellMatrix from per-row column-index and value lists. Rows
* are sorted by ascending length before slicing so that each slice's
* longest row is close to its shortest — this minimizes the amount
* of zero-padding required inside the slice. */
static SellMatrix packSell(int num_rows, int num_cols,
const std::vector<int>& row_lengths,
const std::vector<int>& row_cols_flat,
const std::vector<float>& row_vals_flat) {
SellMatrix S;
S.num_rows = num_rows;
S.num_cols = num_cols;
S.num_slices = (num_rows + SLICE_ROWS - 1) / SLICE_ROWS;
/* prefix sums into the flattened row arrays */
std::vector<int> prefix(num_rows + 1, 0);
for (int r = 0; r < num_rows; ++r) {
prefix[r + 1] = prefix[r] + row_lengths[r];
}
S.nnz_total = prefix[num_rows];
/* sort row indices by ascending length */
std::vector<int> perm(num_rows);
std::iota(perm.begin(), perm.end(), 0);
std::sort(perm.begin(), perm.end(), [&](int a, int b) {
return row_lengths[a] < row_lengths[b];
});
/* pad the permutation up to a whole number of slices; the padding
* slots map to row 0, but those lanes are masked out by row_valid
* in the kernel and contribute nothing */
std::size_t padded_rows =
static_cast<std::size_t>(S.num_slices) * SLICE_ROWS;
S.row_perm.assign(padded_rows, 0);
S.row_lengths.assign(padded_rows, 0);
for (int i = 0; i < num_rows; ++i) {
S.row_perm[i] = perm[i];
S.row_lengths[i] = row_lengths[perm[i]];
}
/* per-slice width = max row length within the slice */
S.slice_widths.assign(S.num_slices, 0);
for (int s = 0; s < S.num_slices; ++s) {
int w = 0;
for (int r = 0; r < SLICE_ROWS; ++r) {
w = std::max(w, S.row_lengths[s * SLICE_ROWS + r]);
}
S.slice_widths[s] = w;
}
/* per-slice offset = prefix sum of slice_width * SLICE_ROWS */
S.slice_offsets.assign(S.num_slices, 0);
std::size_t running = 0;
for (int s = 0; s < S.num_slices; ++s) {
S.slice_offsets[s] = static_cast<int>(running);
running += static_cast<std::size_t>(S.slice_widths[s]) * SLICE_ROWS;
}
S.total_sell_entries = running;
/* pack into column-major slice layout */
S.sell_col_indices.assign(S.total_sell_entries, 0);
S.sell_values.assign(S.total_sell_entries, 0.0f);
for (int s = 0; s < S.num_slices; ++s) {
int offset = S.slice_offsets[s];
for (int r = 0; r < SLICE_ROWS; ++r) {
int global_idx = s * SLICE_ROWS + r;
if (global_idx >= num_rows) continue;
int row = S.row_perm[global_idx];
int row_start = prefix[row];
int row_len = row_lengths[row];
for (int k = 0; k < row_len; ++k) {
std::size_t dst = static_cast<std::size_t>(offset)
+ static_cast<std::size_t>(k) * SLICE_ROWS + r;
S.sell_col_indices[dst] = row_cols_flat[row_start + k];
S.sell_values[dst] = row_vals_flat[row_start + k];
}
/* the remaining k in [row_len, slice_widths[s]) stays as the
* zero-init padding written by the assign() calls above */
}
}
return S;
}
//=============================================================================
// Random matrix generator (produces SellMatrix directly)
//
// Each row has a Poisson-distributed number of nonzeros; the column
// indices within a row are uniform random, then sorted and
// de-duplicated. packSell() handles the slice layout.
//=============================================================================
static SellMatrix generateRandom(int num_rows, int num_cols,
int avg_nnz_per_row, unsigned seed) {
std::mt19937 rng(seed);
std::uniform_int_distribution<int> col_dist(0, num_cols - 1);
std::uniform_real_distribution<float> val_dist(-1.0f, 1.0f);
std::poisson_distribution<int> len_dist(static_cast<double>(avg_nnz_per_row));
std::vector<int> cols_flat;
std::vector<float> vals_flat;
std::vector<int> row_lengths;
row_lengths.reserve(num_rows);
std::vector<int> scratch_cols;
for (int r = 0; r < num_rows; ++r) {
int len = std::min(num_cols, std::max(1, len_dist(rng)));
scratch_cols.clear();
scratch_cols.reserve(len);
for (int k = 0; k < len; ++k) {
scratch_cols.push_back(col_dist(rng));
}
std::sort(scratch_cols.begin(), scratch_cols.end());
scratch_cols.erase(std::unique(scratch_cols.begin(), scratch_cols.end()),
scratch_cols.end());
for (int c : scratch_cols) {
cols_flat.push_back(c);
vals_flat.push_back(val_dist(rng));
}
row_lengths.push_back(static_cast<int>(scratch_cols.size()));
}
return packSell(num_rows, num_cols, row_lengths, cols_flat, vals_flat);
}
//=============================================================================
// CPU reference SpMV — reads SELL directly so the sample has no
// dependency on CSR or any external sparse-matrix library.
//=============================================================================
static void cpuSpMV(const SellMatrix& S, const std::vector<float>& x,
std::vector<float>& y) {
y.assign(S.num_rows, 0.0f);
for (int s = 0; s < S.num_slices; ++s) {
int offset = S.slice_offsets[s];
int width = S.slice_widths[s];
for (int r = 0; r < SLICE_ROWS; ++r) {
int global_idx = s * SLICE_ROWS + r;
if (global_idx >= S.num_rows) continue;
int dst_row = S.row_perm[global_idx];
int row_len = S.row_lengths[global_idx];
float sum = 0.0f;
for (int k = 0; k < row_len; ++k) {
std::size_t src = static_cast<std::size_t>(offset)
+ static_cast<std::size_t>(k) * SLICE_ROWS + r;
sum += S.sell_values[src] * x[S.sell_col_indices[src]];
}
y[dst_row] = sum;
(void)width;
}
}
}
/* Compare device result to the CPU reference. SpMV is performed in
* single precision and the device kernel reduces in a different
* order than the CPU reference, so we accept differences within a
* relative tolerance OR a small absolute tolerance — whichever is
* larger. */
static bool verify(const std::vector<float>& reference,
const std::vector<float>& result,
float rel_tol = 1e-2f, float abs_tol = 1e-4f) {
float max_err = 0.0f;
int bad_idx = -1;
for (std::size_t i = 0; i < reference.size(); ++i) {
float diff = std::fabs(reference[i] - result[i]);
float allowed = std::max(abs_tol, rel_tol * std::fabs(reference[i]));
float over = diff - allowed;
if (over > max_err) {
max_err = over;
bad_idx = static_cast<int>(i);
}
}
if (max_err > 0.0f) {
printf("Verification FAILED at index %d (ref=%g, got=%g, diff=%g)\n",
bad_idx, reference[bad_idx], result[bad_idx],
std::fabs(reference[bad_idx] - result[bad_idx]));
return false;
}
return true;
}
//=============================================================================
// Main
//=============================================================================
int main() {
/* Random sparse matrix: ~16 nonzeros per row on average, sized to
* match the chosen tile shape (SLICE_ROWS = 64, TILE_COLS = 16). */
SellMatrix S = generateRandom(/*num_rows=*/100000, /*num_cols=*/100000,
/*avg_nnz_per_row=*/16, /*seed=*/0xA5A5);
printf("Random sparse matrix: rows=%d, cols=%d, nnz=%d, "
"avg nnz/row=%.1f\n",
S.num_rows, S.num_cols, S.nnz_total,
static_cast<double>(S.nnz_total) / S.num_rows);
printf("Tile configuration: ROWS=%d, COLS=%d (%d slices)\n",
SLICE_ROWS, TILE_COLS, S.num_slices);
/* host inputs */
std::vector<float> h_x(S.num_cols);
std::mt19937 rng(0xC0FFEE);
std::uniform_real_distribution<float> x_dist(-1.0f, 1.0f);
for (float& v : h_x) v = x_dist(rng);
/* CPU reference */
std::vector<float> ref_y;
cpuSpMV(S, h_x, ref_y);
/* device allocations */
S.uploadToDevice();
float* d_x = nullptr;
float* d_y = nullptr;
checkCudaErrors(cudaMalloc(&d_x, S.num_cols * sizeof(float)));
checkCudaErrors(cudaMalloc(&d_y, S.num_rows * sizeof(float)));
checkCudaErrors(cudaMemcpy(d_x, h_x.data(), S.num_cols * sizeof(float),
cudaMemcpyHostToDevice));
/* Launch the SELL Tile kernel: one CTA per slice. */
spmvSell<SLICE_ROWS, TILE_COLS><<<S.num_slices>>>(
S.num_rows, S.d_sell_col_indices, S.d_sell_values,
S.d_slice_offsets, S.d_slice_widths, S.d_row_perm, d_x, d_y);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize());
/* copy result back and verify */
std::vector<float> h_y(S.num_rows);
checkCudaErrors(cudaMemcpy(h_y.data(), d_y, S.num_rows * sizeof(float),
cudaMemcpyDeviceToHost));
S.freeDevice();
checkCudaErrors(cudaFree(d_x));
checkCudaErrors(cudaFree(d_y));
if (!verify(ref_y, h_y)) {
return 1;
}
printf("Success! Tile SpMV matches the CPU reference.\n");
return 0;
}