/* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * This file provides common benchmark utilities for CUDA Tile C++ * microbenchmarks. */ #ifndef CUDA_TILE_BENCHMARK_H #define CUDA_TILE_BENCHMARK_H #include #include #include #include // benchmark configuration // global settings for benchmark behavior, controllable via command line struct BenchmarkConfig { bool use_validation = false; // --validate enables CPU cross-validation int warmup_iters = 5; // --warmup=N (0 to disable) int bench_iters = 20; // --iters=N, -i N }; inline BenchmarkConfig& bench_config() { static BenchmarkConfig config; return config; } // convenience accessors inline bool use_validation() { return bench_config().use_validation; } inline int warmup_iters() { return bench_config().warmup_iters; } inline int bench_iters() { return bench_config().bench_iters; } // Parse command line options. Call from main() before benchmarks. inline void parse_benchmark_args(int argc, char** argv) { for (int i = 1; i < argc; i++) { if (strcmp(argv[i], "--validate") == 0) { bench_config().use_validation = true; } else if (strcmp(argv[i], "--skip-warmup") == 0) { bench_config().warmup_iters = 0; } else if (strncmp(argv[i], "--warmup=", 9) == 0) { bench_config().warmup_iters = atoi(argv[i] + 9); } else if (strncmp(argv[i], "--iters=", 8) == 0) { bench_config().bench_iters = atoi(argv[i] + 8); } else if (strcmp(argv[i], "-i") == 0) { // Parse "-i N" where the next argument is the value. if (i + 1 < argc) { bench_config().bench_iters = atoi(argv[++i]); } else { fprintf(stderr, "Error: -i requires an argument\n"); exit(EXIT_FAILURE); } } else if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) { printf("Benchmark options:\n"); printf(" --validate Enable CPU cross-validation\n"); printf(" --skip-warmup Disable warmup iterations\n"); printf(" --warmup=N Warmup iterations (default: 5)\n"); printf(" -i N, --iters=N Benchmark iterations (default: 20)\n"); exit(0); } else if (argv[i][0] == '-') { fprintf(stderr, "Error: unknown option '%s'\n", argv[i]); fprintf(stderr, "Try '--help' for usage information.\n"); exit(EXIT_FAILURE); } } // print active configuration if (!bench_config().use_validation) { printf("Note: CPU cross-validation disabled\n"); } if (bench_config().warmup_iters == 0) { printf("Note: warmup disabled, iters=%d\n", bench_config().bench_iters); } else if (bench_config().warmup_iters != 5 || bench_config().bench_iters != 20) { printf("Note: warmup=%d, iters=%d\n", bench_config().warmup_iters, bench_config().bench_iters); } } // CUDA error checking #define CHECK_CUDA(call) do { \ cudaError_t err = call; \ if (err != cudaSuccess) { \ fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \ cudaGetErrorString(err)); \ exit(EXIT_FAILURE); \ } \ } while(0) // device information inline void print_device_info() { cudaDeviceProp prop; CHECK_CUDA(cudaGetDeviceProperties(&prop, 0)); printf("Device: %s\n", prop.name); int memoryClockRateKHz; cudaError_t status = cudaDeviceGetAttribute(&memoryClockRateKHz, cudaDevAttrMemoryClockRate, 0); if (status == cudaSuccess) { printf("Memory Bandwidth: %.0f GB/s (theoretical peak)\n", 2.0 * memoryClockRateKHz * (prop.memoryBusWidth / 8) / 1e6); } } // timing utilities class CudaTimer { public: CudaTimer() { CHECK_CUDA(cudaEventCreate(&start_)); CHECK_CUDA(cudaEventCreate(&stop_)); } ~CudaTimer() { cudaEventDestroy(start_); cudaEventDestroy(stop_); } void start() { CHECK_CUDA(cudaEventRecord(start_)); } void stop() { CHECK_CUDA(cudaEventRecord(stop_)); CHECK_CUDA(cudaEventSynchronize(stop_)); } float elapsed_ms() const { float ms; CHECK_CUDA(cudaEventElapsedTime(&ms, start_, stop_)); return ms; } private: cudaEvent_t start_, stop_; }; // Time a kernel launch, returning average time per iteration in milliseconds. // Uses global bench_config() for warmup/iteration counts. template inline double time_kernel(KernelFunc kernel_launch) { // warmup if (warmup_iters() > 0) { for (int i = 0; i < warmup_iters(); i++) { kernel_launch(); } CHECK_CUDA(cudaDeviceSynchronize()); } // benchmark CudaTimer timer; timer.start(); for (int i = 0; i < bench_iters(); i++) { kernel_launch(); } timer.stop(); return timer.elapsed_ms() / bench_iters(); } // benchmark result structure struct BenchmarkResult { const char* name; double time_ms; double bandwidth_gb_s; double gflops; bool correct; BenchmarkResult() : name(nullptr), time_ms(0), bandwidth_gb_s(0), gflops(0), correct(false) {} }; // result printing inline void print_result(const BenchmarkResult& r) { const char* status = use_validation() ? (r.correct ? "[OK]" : "[FAIL]") : ""; if (r.gflops > 0) { printf(" %-42s: %7.3f ms, %7.1f GB/s, %6.1f GFLOPS %s\n", r.name, r.time_ms, r.bandwidth_gb_s, r.gflops, status); } else { printf(" %-42s: %7.3f ms, %7.1f GB/s %s\n", r.name, r.time_ms, r.bandwidth_gb_s, status); } } #endif // CUDA_TILE_BENCHMARK_H