2026-05-27 21:03:57 +00:00

207 lines
7.2 KiB
C++

/* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* This file provides common benchmark utilities for CUDA Tile C++
* microbenchmarks.
*/
#ifndef CUDA_TILE_BENCHMARK_H
#define CUDA_TILE_BENCHMARK_H
#include <cuda_runtime.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// benchmark configuration
// global settings for benchmark behavior, controllable via command line
// Settings shared by all benchmarks. Each field has a built-in default that
// the command line options named below can override.
struct BenchmarkConfig {
    // --validate: turn on CPU cross-validation
    bool use_validation{false};
    // --warmup=N: number of warmup iterations (0 turns warmup off)
    int warmup_iters{5};
    // --iters=N or -i N: number of benchmark iterations
    int bench_iters{20};
};
// Returns the shared BenchmarkConfig by mutable reference. The object is a
// function-local static, so every call hands back the same instance.
inline BenchmarkConfig& bench_config() {
    static BenchmarkConfig instance;
    return instance;
}
// convenience accessors
inline bool use_validation() { return bench_config().use_validation; }
inline int warmup_iters() { return bench_config().warmup_iters; }
inline int bench_iters() { return bench_config().bench_iters; }
// Parse command line options. Call from main() before benchmarks.
inline void parse_benchmark_args(int argc, char** argv) {
for (int i = 1; i < argc; i++) {
if (strcmp(argv[i], "--validate") == 0) {
bench_config().use_validation = true;
} else if (strcmp(argv[i], "--skip-warmup") == 0) {
bench_config().warmup_iters = 0;
} else if (strncmp(argv[i], "--warmup=", 9) == 0) {
bench_config().warmup_iters = atoi(argv[i] + 9);
} else if (strncmp(argv[i], "--iters=", 8) == 0) {
bench_config().bench_iters = atoi(argv[i] + 8);
} else if (strcmp(argv[i], "-i") == 0) {
// Parse "-i N" where the next argument is the value.
if (i + 1 < argc) {
bench_config().bench_iters = atoi(argv[++i]);
} else {
fprintf(stderr, "Error: -i requires an argument\n");
exit(EXIT_FAILURE);
}
} else if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) {
printf("Benchmark options:\n");
printf(" --validate Enable CPU cross-validation\n");
printf(" --skip-warmup Disable warmup iterations\n");
printf(" --warmup=N Warmup iterations (default: 5)\n");
printf(" -i N, --iters=N Benchmark iterations (default: 20)\n");
exit(0);
} else if (argv[i][0] == '-') {
fprintf(stderr, "Error: unknown option '%s'\n", argv[i]);
fprintf(stderr, "Try '--help' for usage information.\n");
exit(EXIT_FAILURE);
}
}
// print active configuration
if (!bench_config().use_validation) {
printf("Note: CPU cross-validation disabled\n");
}
if (bench_config().warmup_iters == 0) {
printf("Note: warmup disabled, iters=%d\n", bench_config().bench_iters);
} else if (bench_config().warmup_iters != 5 || bench_config().bench_iters != 20) {
printf("Note: warmup=%d, iters=%d\n",
bench_config().warmup_iters, bench_config().bench_iters);
}
}
// CUDA error checking
// Evaluate a CUDA runtime call exactly once. If the result is not
// cudaSuccess, print the file, line and error string to stderr and terminate
// the process with EXIT_FAILURE.
//
// The result is stored in check_cuda_status_ rather than a short name such as
// `err`: with a temporary called `err`, the common pattern
//     cudaError_t err = cudaGetLastError();
//     CHECK_CUDA(err);
// would expand to `cudaError_t err = err;` and test an uninitialized value.
// The argument is also parenthesized so it is always treated as one
// expression.
#define CHECK_CUDA(call) do {                                                \
    const cudaError_t check_cuda_status_ = (call);                           \
    if (check_cuda_status_ != cudaSuccess) {                                 \
        fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,     \
                cudaGetErrorString(check_cuda_status_));                     \
        exit(EXIT_FAILURE);                                                  \
    }                                                                        \
} while(0)
// device information
inline void print_device_info() {
cudaDeviceProp prop;
CHECK_CUDA(cudaGetDeviceProperties(&prop, 0));
printf("Device: %s\n", prop.name);
int memoryClockRateKHz;
cudaError_t status = cudaDeviceGetAttribute(&memoryClockRateKHz, cudaDevAttrMemoryClockRate, 0);
if (status == cudaSuccess) {
printf("Memory Bandwidth: %.0f GB/s (theoretical peak)\n",
2.0 * memoryClockRateKHz * (prop.memoryBusWidth / 8) / 1e6);
}
}
// timing utilities
// Measures the time between start() and stop() with a pair of CUDA events.
//
// Usage: call start(), enqueue the work to be timed, call stop(), then read
// elapsed_ms(). Events are recorded with cudaEventRecord's default stream
// argument. Any failing CUDA call other than event destruction terminates the
// process via CHECK_CUDA.
//
// The class owns its two events, so it is non-copyable: a copy would share
// the same event handles and both destructors would destroy them.
class CudaTimer {
public:
    // Creates the start and stop events.
    CudaTimer() {
        CHECK_CUDA(cudaEventCreate(&start_));
        CHECK_CUDA(cudaEventCreate(&stop_));
    }

    // Destroys both events. Return values are not checked here, so a failure
    // during teardown does not terminate the process.
    ~CudaTimer() {
        cudaEventDestroy(start_);
        cudaEventDestroy(stop_);
    }

    CudaTimer(const CudaTimer&) = delete;
    CudaTimer& operator=(const CudaTimer&) = delete;

    // Records the start event.
    void start() {
        CHECK_CUDA(cudaEventRecord(start_));
    }

    // Records the stop event and blocks the host until it has completed.
    void stop() {
        CHECK_CUDA(cudaEventRecord(stop_));
        CHECK_CUDA(cudaEventSynchronize(stop_));
    }

    // Returns the time between the recorded start and stop events, in
    // milliseconds. Call only after start() and stop().
    float elapsed_ms() const {
        float ms = 0.0f;
        CHECK_CUDA(cudaEventElapsedTime(&ms, start_, stop_));
        return ms;
    }

private:
    cudaEvent_t start_, stop_;
};
// Time a kernel launch, returning average time per iteration in milliseconds.
// Uses global bench_config() for warmup/iteration counts.
template<typename KernelFunc>
inline double time_kernel(KernelFunc kernel_launch) {
// warmup
if (warmup_iters() > 0) {
for (int i = 0; i < warmup_iters(); i++) {
kernel_launch();
}
CHECK_CUDA(cudaDeviceSynchronize());
}
// benchmark
CudaTimer timer;
timer.start();
for (int i = 0; i < bench_iters(); i++) {
kernel_launch();
}
timer.stop();
return timer.elapsed_ms() / bench_iters();
}
// benchmark result structure
// Outcome of one benchmark, as consumed by print_result(). A
// default-constructed result has a null name, zero metrics and correct=false.
struct BenchmarkResult {
    // label printed at the start of the result line
    const char* name = nullptr;
    // time printed in the "ms" column
    double time_ms = 0;
    // value printed in the "GB/s" column
    double bandwidth_gb_s = 0;
    // value printed in the "GFLOPS" column; only printed when > 0
    double gflops = 0;
    // selects "[OK]" or "[FAIL]" when validation is enabled
    bool correct = false;

    BenchmarkResult() {}
};
// result printing
// Print one benchmark result as a single line on stdout.
//
//   r - result to print. When r.gflops > 0 the line includes a GFLOPS column;
//       otherwise only time and bandwidth are shown.
//
// The trailing status is "[OK]" or "[FAIL]" (from r.correct) when validation
// is enabled, and empty otherwise. A null r.name, which is what a
// default-constructed BenchmarkResult carries, is printed as "(unnamed)"
// because passing a null pointer to %s is undefined behaviour.
inline void print_result(const BenchmarkResult& r) {
    const char* label = (r.name != nullptr) ? r.name : "(unnamed)";
    const char* status = use_validation() ? (r.correct ? "[OK]" : "[FAIL]") : "";
    if (r.gflops > 0) {
        printf(" %-42s: %7.3f ms, %7.1f GB/s, %6.1f GFLOPS %s\n",
               label, r.time_ms, r.bandwidth_gb_s, r.gflops, status);
    } else {
        printf(" %-42s: %7.3f ms, %7.1f GB/s %s\n",
               label, r.time_ms, r.bandwidth_gb_s, status);
    }
}
#endif // CUDA_TILE_BENCHMARK_H