Dheemanth b7c5481c55
Release v13.3 of the CUDA samples with CUDA 13.3 Toolkit (#435)
This is the release of the CUDA 13.3 samples, which include additions for CUDA Tile C++, and updated CCCL and Python samples.
2026-05-27 16:50:59 -05:00

332 lines
13 KiB
C++

/* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* CUDA Tile C++ matrix multiplication autotuner.
*
* This sample is the host-side driver for the tiled matrix multiplication
* kernel in matmul.cu. It compiles that kernel repeatedly with different
* TILE_BLOCK_M, TILE_BLOCK_N, TILE_BLOCK_K, LOAD_LATENCY, and STORE_LATENCY
* values configured in a search-space file, derives the launch grid from the
* selected tile size, and reports the fastest configuration for the requested
* problem size.
*
* Backend flow:
* - NVRTC compiles matmul.cu to TileIR, then invokes tileiras to produce a
* cubin image.
* - NVCC compiles matmul.cu as a standalone source file and reuses the
* generated Tile cubin artifact.
* - Both paths load the resulting cubin with the CUDA Driver API and launch
* the same matmul_tile entry point.
*/
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <random>
#include <string>
#include <vector>
#include <cuda.h>
#include <cuda_fp16.h>
#include "backend_common.h"
#include "backend_nvcc.h"
#include "backend_nvrtc.h"
#include "matmul_benchmark.h"
#include <helper_cuda_drvapi.h>
// Compute capability of device 0 encoded as major * 10 + minor.
// Written by setSMValue() and forwarded to the compile backends by
// compileFile(); it stays 0 until setSMValue() has run.
static int smValue = 0;
// Name of the kernel entry point looked up in every loaded module.
static constexpr const char *kMatmulKernelName = "matmul_tile";
/*
 * Translates the textual backend name given on the command line into a
 * CompilerBackend value.
 *
 * value: NUL-terminated backend name; "nvrtc" and "nvcc" are recognised
 *        (exact, case-sensitive match).
 *
 * Returns the matching backend. Any other name prints a diagnostic to stderr
 * and terminates the process with EXIT_FAILURE.
 */
CompilerBackend parseCompilerBackendValue(const char *value) {
    struct BackendName {
        const char *text;
        CompilerBackend backend;
    };
    static const BackendName kKnownBackends[] = {
        {"nvrtc", CompilerBackend::NVRTC},
        {"nvcc", CompilerBackend::NVCC},
    };

    for (const BackendName& candidate : kKnownBackends) {
        if (std::strcmp(value, candidate.text) == 0) {
            return candidate.backend;
        }
    }

    // no table entry matched: report the offending name and give up
    fprintf(stderr, "Error: unsupported backend '%s'\n", value);
    fprintf(stderr, "Expected 'nvrtc' or 'nvcc'.\n");
    exit(EXIT_FAILURE);
}
/*
 * Prints the help text for the backend-selection option to stdout, followed
 * by a blank line.
 */
void printCompilerOptions() {
    static const char kBackendUsage[] =
        "Backend options:\n"
        " --backend=nvrtc|nvcc Select backend (default: NVRTC)\n"
        "\n";
    printf("%s", kBackendUsage);
}
/*
 * Extracts the backend selection from the command line and collects every
 * other argument for the benchmark argument parser.
 *
 * argc, argv:     the process command line.
 * benchmark_argv: cleared, then filled with argv[0] followed by every
 *                 argument that is not a backend option. The pointers alias
 *                 argv; nothing is copied.
 *
 * Accepted forms are "--backend=<name>" and "--backend <name>"; when the
 * option is repeated the last occurrence wins. "--help" / "-h" print the
 * backend options and are still forwarded in benchmark_argv.
 *
 * Returns the selected backend (NVRTC when none is given). Exits with
 * EXIT_FAILURE when "--backend" is the final argument or names an
 * unsupported backend.
 */
CompilerBackend parseCompilerBackendArgs(int argc, char** argv, std::vector<char*>& benchmark_argv) {
    static const char kBackendPrefix[] = "--backend=";
    const size_t kBackendPrefixLen = sizeof(kBackendPrefix) - 1;

    CompilerBackend selected = CompilerBackend::NVRTC;

    benchmark_argv.clear();
    benchmark_argv.push_back(argv[0]);

    int next = 1;
    while (next < argc) {
        char *arg = argv[next++];

        const bool wants_help = std::strcmp(arg, "--help") == 0 ||
                                std::strcmp(arg, "-h") == 0;
        if (wants_help) {
            // show the backend options here, then forward the flag as well
            printCompilerOptions();
            benchmark_argv.push_back(arg);
            continue;
        }

        // "--backend=<name>": the value follows the prefix in the same token
        if (std::strncmp(arg, kBackendPrefix, kBackendPrefixLen) == 0) {
            selected = parseCompilerBackendValue(arg + kBackendPrefixLen);
            continue;
        }

        // anything that is not the bare "--backend" flag is forwarded as is
        if (std::strcmp(arg, "--backend") != 0) {
            benchmark_argv.push_back(arg);
            continue;
        }

        // "--backend <name>": the value is the following token
        if (next >= argc) {
            fprintf(stderr, "Error: %s requires an argument\n", arg);
            exit(EXIT_FAILURE);
        }
        selected = parseCompilerBackendValue(argv[next++]);
    }

    return selected;
}
void setSMValue() {
CUdevice device;
int major = 0, minor = 0;
// initialize the CUDA Driver API
checkCudaErrors(cuInit(0));
// get the first device (device 0)
checkCudaErrors(cuDeviceGet(&device, 0));
checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
printf("GPU Compute Capability: %d.%d\n", major, minor);
smValue = major * 10 + minor;
}
/*
 * Compiles a kernel source file with the requested backend.
 *
 * filename:                  path of the kernel source to compile.
 * block_m, block_n, block_k: tile dimensions forwarded to the backend.
 * compiler_backend:          NVCC selects the NVCC path; every other value
 *                            goes through NVRTC.
 * extra_flags:               additional compiler flags forwarded unchanged.
 *
 * The target architecture passed to the backend is the file-level smValue.
 * Returns whatever the selected backend produces.
 */
CompiledKernel compileFile(const char *filename,
                           int block_m, int block_n, int block_k,
                           CompilerBackend compiler_backend,
                           const std::vector<std::string>& extra_flags = {}) {
    switch (compiler_backend) {
    case CompilerBackend::NVCC:
        return compileFileWithNVCC(filename, smValue,
                                   block_m, block_n, block_k,
                                   extra_flags);
    default:
        return compileFileWithNVRTC(filename, smValue,
                                    block_m, block_n, block_k,
                                    extra_flags);
    }
}
void loadAndExecuteKernel(const CompiledKernel& compiled_kernel,
CUdeviceptr d_A, CUdeviceptr d_B, CUdeviceptr d_C,
int M, int N, int K,
unsigned int gridDimX, unsigned int gridDimY, unsigned int sMem) {
CUmodule module;
CUfunction kernel_addr;
void* args[] = {
(void*)&d_C,
(void*)&d_A,
(void*)&d_B,
(void*)&M,
(void*)&N,
(void*)&K
};
checkCudaErrors(cuModuleLoadData(&module, compiled_kernel.image.data()));
checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, kMatmulKernelName));
checkCudaErrors(cuLaunchKernel(kernel_addr,
gridDimX, gridDimY, 1, // grid dim
1, 1, 1, // block dim
sMem, 0, // shared mem, stream
args, // arguments
NULL));
checkCudaErrors(cuCtxSynchronize());
// cleanup
checkCudaErrors(cuModuleUnload(module));
}
void autotuner(int M, int N, int K,
const char *kernel_file,
const SearchSpace& search_space,
CompilerBackend compiler_backend) {
printf("\n=== Matrix: C[%dx%d] = A[%dx%d] x B[%dx%d] (FP16->FP32) ===\n",
M, N, M, K, K, N);
printf(" FLOPs: %.2f GFLOP\n", 2.0 * M * N * K / 1e9);
// allocate and initialize (FP16 for A and B)
std::vector<__half> h_A(M * K), h_B(K * N);
std::vector<float> h_C;
srand(42);
for (int i = 0; i < M * K; i++) h_A[i] = __float2half((float)rand() / RAND_MAX - 0.5f);
for (int i = 0; i < K * N; i++) h_B[i] = __float2half((float)rand() / RAND_MAX - 0.5f);
// compute the CPU reference unless validation is disabled
if (use_validation()) {
h_C.resize(M * N);
matmul_cpu(h_C.data(), h_A.data(), h_B.data(), M, N, K);
}
// allocate device memory
CUdeviceptr d_A, d_B, d_C;
checkCudaErrors(cuMemAlloc(&d_A, M * K * sizeof(__half)));
checkCudaErrors(cuMemAlloc(&d_B, K * N * sizeof(__half)));
checkCudaErrors(cuMemAlloc(&d_C, M * N * sizeof(float)));
checkCudaErrors(cuMemcpyHtoD(d_A, h_A.data(), M * K * sizeof(__half)));
checkCudaErrors(cuMemcpyHtoD(d_B, h_B.data(), K * N * sizeof(__half)));
// helper lambda to clear output
auto clear_output = [&]() {
std::vector<float> zeros(M * N, 0.0f);
checkCudaErrors(cuMemcpyHtoD(d_C, zeros.data(), M * N * sizeof(float)));
};
struct AutotuneResult {
int block_m;
int block_n;
int block_k;
int grid_x;
int grid_y;
int load_latency;
int store_latency;
BenchmarkResult result;
};
std::vector<AutotuneResult> autotune_results;
size_t config_count = 0;
size_t total_configs = search_space.tile_options.size() *
search_space.load_latency_options.size() *
search_space.store_latency_options.size();
for (const auto& tile : search_space.tile_options) {
int grid_x = ceilDiv(M, tile.block_m);
int grid_y = ceilDiv(N, tile.block_n);
for (int load_lat : search_space.load_latency_options) {
for (int store_lat : search_space.store_latency_options) {
config_count++;
printf(" [%zu/%zu] ", config_count, total_configs);
std::vector<std::string> compile_flags = {
"-DLOAD_LATENCY=" + std::to_string(load_lat),
"-DSTORE_LATENCY=" + std::to_string(store_lat)
};
CompiledKernel compiled_kernel = compileFile(kernel_file,
tile.block_m, tile.block_n, tile.block_k,
compiler_backend,
compile_flags);
clear_output();
std::string config_name = "bm=" + std::to_string(tile.block_m) +
",bn=" + std::to_string(tile.block_n) +
",bk=" + std::to_string(tile.block_k) +
",gx=" + std::to_string(grid_x) +
",gy=" + std::to_string(grid_y) +
",ld=" + std::to_string(load_lat) +
",st=" + std::to_string(store_lat);
auto result = run_benchmark(config_name.c_str(),
[&]() {
loadAndExecuteKernel(compiled_kernel, d_A, d_B, d_C, M, N, K,
grid_x, grid_y, 0);
},
[&]() {
std::vector<float> h_result(M * N);
checkCudaErrors(cuMemcpyDtoH(h_result.data(), d_C,
M * N * sizeof(float)));
return verify_matmul_result(config_name.c_str(),
h_result.data(), h_C.data(), M, N);
},
M, N, K);
print_result(result);
autotune_results.push_back({tile.block_m, tile.block_n, tile.block_k,
grid_x, grid_y, load_lat, store_lat, result});
}
}
}
// find the best configuration by GFLOPS
auto best = std::max_element(autotune_results.begin(), autotune_results.end(),
[](const AutotuneResult& a, const AutotuneResult& b) {
return a.result.gflops < b.result.gflops;
});
printf("\n *** BEST CONFIGURATION ***\n");
printf(" BLOCK_M=%d, BLOCK_N=%d, BLOCK_K=%d\n",
best->block_m, best->block_n, best->block_k);
printf(" LOAD_LATENCY=%d, STORE_LATENCY=%d, grid_x=%d, grid_y=%d\n",
best->load_latency, best->store_latency, best->grid_x, best->grid_y);
printf(" Performance: %.1f GFLOPS, %.3f ms, %.1f GB/s\n",
best->result.gflops, best->result.time_ms, best->result.bandwidth_gb_s);
checkCudaErrors(cuMemFree(d_A));
checkCudaErrors(cuMemFree(d_B));
checkCudaErrors(cuMemFree(d_C));
}
int main(int argc, char** argv) {
std::vector<char*> benchmark_argv;
CompilerBackend compiler_backend = parseCompilerBackendArgs(argc, argv, benchmark_argv);
parse_benchmark_args(static_cast<int>(benchmark_argv.size()), benchmark_argv.data());
print_device_info();
// initialize CUDA and get compute capability
setSMValue();
CUcontext context;
CUdevice cuDevice = 0;
checkCudaErrors(cuInit(0));
checkCudaErrors(cuDeviceGet(&cuDevice, 0));
checkCudaErrors(cuCtxCreate(&context, NULL, 0, cuDevice));
printf("\nMatrix Multiplication Autotuner (FP16 inputs, FP32 accumulate)\n");
printf("==============================================================\n");
printf("Backend: %s\n", compilerBackendName(compiler_backend));
char *kernel_file = findSampleFile("matmul.cu", argv[0]);
if (kernel_file == NULL) {
fprintf(stderr, "Error: unable to locate matmul.cu\n");
return 1;
}
char *search_space_file = findSampleFile(kSearchSpaceFileName, argv[0]);
if (search_space_file == NULL) {
fprintf(stderr, "Error: unable to locate %s\n", kSearchSpaceFileName);
free(kernel_file);
return 1;
}
SearchSpace search_space = loadSearchSpace(search_space_file);
printf("Search space: %s\n", search_space_file);
printf("Tuning for M=1024, N=4096, K=1024\n");
autotuner(1024, 4096, 1024, kernel_file, search_space, compiler_backend);
free(kernel_file);
free(search_space_file);
return 0;
}