mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2026-06-04 00:06:52 +08:00
This is the release of the CUDA 13.3 samples, which include additions for CUDA Tile C++, and updated CCCL and Python samples.
207 lines
7.2 KiB
C++
207 lines
7.2 KiB
C++
/* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
* contributors may be used to endorse or promote products derived
|
|
* from this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
/*
|
|
* This file provides common benchmark utilities for CUDA Tile C++
|
|
* microbenchmarks.
|
|
*/
|
|
|
|
#ifndef CUDA_TILE_BENCHMARK_H
|
|
#define CUDA_TILE_BENCHMARK_H
|
|
|
|
#include <cuda_runtime.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
|
|
|
|
// benchmark configuration
|
|
// global settings for benchmark behavior, controllable via command line
|
|
struct BenchmarkConfig {
    // Set by --validate: turns on CPU cross-validation. Off by default.
    bool use_validation{false};

    // Set by --warmup=N; 0 disables warmup (--skip-warmup also sets it to 0).
    int warmup_iters{5};

    // Set by --iters=N or "-i N": number of timed iterations.
    int bench_iters{20};
};
|
|
|
|
// Returns a mutable reference to the shared benchmark configuration.
// The object is a function-local static, so it is created with its default
// values on the first call and the same instance is returned afterwards.
inline BenchmarkConfig& bench_config() {
    static BenchmarkConfig instance;
    return instance;
}
|
|
|
|
// convenience accessors
|
|
inline bool use_validation() { return bench_config().use_validation; }
|
|
inline int warmup_iters() { return bench_config().warmup_iters; }
|
|
inline int bench_iters() { return bench_config().bench_iters; }
|
|
|
|
// Parse command line options. Call from main() before benchmarks.
|
|
inline void parse_benchmark_args(int argc, char** argv) {
|
|
for (int i = 1; i < argc; i++) {
|
|
if (strcmp(argv[i], "--validate") == 0) {
|
|
bench_config().use_validation = true;
|
|
} else if (strcmp(argv[i], "--skip-warmup") == 0) {
|
|
bench_config().warmup_iters = 0;
|
|
} else if (strncmp(argv[i], "--warmup=", 9) == 0) {
|
|
bench_config().warmup_iters = atoi(argv[i] + 9);
|
|
} else if (strncmp(argv[i], "--iters=", 8) == 0) {
|
|
bench_config().bench_iters = atoi(argv[i] + 8);
|
|
} else if (strcmp(argv[i], "-i") == 0) {
|
|
// Parse "-i N" where the next argument is the value.
|
|
if (i + 1 < argc) {
|
|
bench_config().bench_iters = atoi(argv[++i]);
|
|
} else {
|
|
fprintf(stderr, "Error: -i requires an argument\n");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
} else if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) {
|
|
printf("Benchmark options:\n");
|
|
printf(" --validate Enable CPU cross-validation\n");
|
|
printf(" --skip-warmup Disable warmup iterations\n");
|
|
printf(" --warmup=N Warmup iterations (default: 5)\n");
|
|
printf(" -i N, --iters=N Benchmark iterations (default: 20)\n");
|
|
exit(0);
|
|
} else if (argv[i][0] == '-') {
|
|
fprintf(stderr, "Error: unknown option '%s'\n", argv[i]);
|
|
fprintf(stderr, "Try '--help' for usage information.\n");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
}
|
|
|
|
// print active configuration
|
|
if (!bench_config().use_validation) {
|
|
printf("Note: CPU cross-validation disabled\n");
|
|
}
|
|
if (bench_config().warmup_iters == 0) {
|
|
printf("Note: warmup disabled, iters=%d\n", bench_config().bench_iters);
|
|
} else if (bench_config().warmup_iters != 5 || bench_config().bench_iters != 20) {
|
|
printf("Note: warmup=%d, iters=%d\n",
|
|
bench_config().warmup_iters, bench_config().bench_iters);
|
|
}
|
|
}
|
|
|
|
// CUDA error checking
|
|
// Evaluate a CUDA runtime call once; if the result is not cudaSuccess, print
// the source location and cudaGetErrorString() text to stderr and terminate
// with EXIT_FAILURE.
// The argument is parenthesized, and the local uses a macro-private name so
// that an argument mentioning a caller variable named `err` (for example
// CHECK_CUDA(err)) is not captured by the macro's own declaration.
#define CHECK_CUDA(call) do {                                               \
    cudaError_t check_cuda_err_ = (call);                                   \
    if (check_cuda_err_ != cudaSuccess) {                                   \
        fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,    \
                cudaGetErrorString(check_cuda_err_));                       \
        exit(EXIT_FAILURE);                                                 \
    }                                                                       \
} while (0)
|
|
|
|
// device information
|
|
inline void print_device_info() {
|
|
cudaDeviceProp prop;
|
|
CHECK_CUDA(cudaGetDeviceProperties(&prop, 0));
|
|
printf("Device: %s\n", prop.name);
|
|
|
|
int memoryClockRateKHz;
|
|
cudaError_t status = cudaDeviceGetAttribute(&memoryClockRateKHz, cudaDevAttrMemoryClockRate, 0);
|
|
if (status == cudaSuccess) {
|
|
printf("Memory Bandwidth: %.0f GB/s (theoretical peak)\n",
|
|
2.0 * memoryClockRateKHz * (prop.memoryBusWidth / 8) / 1e6);
|
|
}
|
|
}
|
|
|
|
// timing utilities
|
|
class CudaTimer {
|
|
public:
|
|
CudaTimer() {
|
|
CHECK_CUDA(cudaEventCreate(&start_));
|
|
CHECK_CUDA(cudaEventCreate(&stop_));
|
|
}
|
|
|
|
~CudaTimer() {
|
|
cudaEventDestroy(start_);
|
|
cudaEventDestroy(stop_);
|
|
}
|
|
|
|
void start() {
|
|
CHECK_CUDA(cudaEventRecord(start_));
|
|
}
|
|
|
|
void stop() {
|
|
CHECK_CUDA(cudaEventRecord(stop_));
|
|
CHECK_CUDA(cudaEventSynchronize(stop_));
|
|
}
|
|
|
|
float elapsed_ms() const {
|
|
float ms;
|
|
CHECK_CUDA(cudaEventElapsedTime(&ms, start_, stop_));
|
|
return ms;
|
|
}
|
|
|
|
private:
|
|
cudaEvent_t start_, stop_;
|
|
};
|
|
|
|
// Time a kernel launch, returning average time per iteration in milliseconds.
|
|
// Uses global bench_config() for warmup/iteration counts.
|
|
template<typename KernelFunc>
|
|
inline double time_kernel(KernelFunc kernel_launch) {
|
|
// warmup
|
|
if (warmup_iters() > 0) {
|
|
for (int i = 0; i < warmup_iters(); i++) {
|
|
kernel_launch();
|
|
}
|
|
CHECK_CUDA(cudaDeviceSynchronize());
|
|
}
|
|
|
|
// benchmark
|
|
CudaTimer timer;
|
|
timer.start();
|
|
for (int i = 0; i < bench_iters(); i++) {
|
|
kernel_launch();
|
|
}
|
|
timer.stop();
|
|
|
|
return timer.elapsed_ms() / bench_iters();
|
|
}
|
|
|
|
// benchmark result structure
|
|
// One row of benchmark output. A default-constructed result has a null name,
// all numeric fields zero, and correct == false.
struct BenchmarkResult {
    const char* name = nullptr;    // label printed for this result
    double time_ms = 0;            // time in milliseconds
    double bandwidth_gb_s = 0;     // bandwidth in GB/s
    double gflops = 0;             // GFLOPS; print_result shows it only when > 0
    bool correct = false;          // validation outcome

    // Kept as a user-provided constructor; the member initializers above
    // supply the values.
    BenchmarkResult() {}
};
|
|
|
|
// result printing
|
|
inline void print_result(const BenchmarkResult& r) {
|
|
const char* status = use_validation() ? (r.correct ? "[OK]" : "[FAIL]") : "";
|
|
if (r.gflops > 0) {
|
|
printf(" %-42s: %7.3f ms, %7.1f GB/s, %6.1f GFLOPS %s\n",
|
|
r.name, r.time_ms, r.bandwidth_gb_s, r.gflops, status);
|
|
} else {
|
|
printf(" %-42s: %7.3f ms, %7.1f GB/s %s\n",
|
|
r.name, r.time_ms, r.bandwidth_gb_s, status);
|
|
}
|
|
}
|
|
|
|
#endif // CUDA_TILE_BENCHMARK_H
|