cuda-samples/Samples/6_Performance/LargeKernelParameter/LargeKernelParameter.cu

174 lines
5.7 KiB
Plaintext
Raw Normal View History

/* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* This is a simple test showing performance and usability
* improvements with large kernel parameters introduced in CUDA 12.1
*/
#include <chrono>
#include <iostream>
#include <cassert>
// Utility includes
#include <helper_cuda.h>
using namespace std;
using namespace std::chrono;
#define TEST_ITERATIONS (1000)
#define TOTAL_PARAMS (8000) // ints
#define KERNEL_PARAM_LIMIT (1024) // ints
#define CONST_COPIED_PARAMS (TOTAL_PARAMS - KERNEL_PARAM_LIMIT)
__constant__ int excess_params[CONST_COPIED_PARAMS];
typedef struct {
int param[KERNEL_PARAM_LIMIT];
} param_t;
typedef struct {
int param[TOTAL_PARAMS];
} param_large_t;
// Kernel with 4KB kernel parameter limit
__global__ void kernelDefault(__grid_constant__ const param_t p, int *result) {
int tmp = 0;
// accumulate kernel parameters
for (int i = 0; i < KERNEL_PARAM_LIMIT; ++i) {
tmp += p.param[i];
}
// accumulate excess values passed via const memory
for (int i = 0; i < CONST_COPIED_PARAMS; ++i) {
tmp += excess_params[i];
}
*result = tmp;
}
// Kernel with 32,764 byte kernel parameter limit
__global__ void kernelLargeParam(__grid_constant__ const param_large_t p, int *result) {
int tmp = 0;
// accumulate kernel parameters
for (int i = 0; i < TOTAL_PARAMS; ++i) {
tmp += p.param[i];
}
*result = tmp;
}
static void report_time(std::chrono::time_point<std::chrono::steady_clock> start,
std::chrono::time_point<std::chrono::steady_clock> end,
int iters) {
auto usecs = duration_cast<duration<float,
microseconds::period>>(end - start);
cout << usecs.count() / iters << endl;
}
int main() {
int rc;
cudaFree(0);
param_t p;
param_large_t p_large;
// pageable host memory that holds excess constants passed via constant memory
int *copied_params = (int *)malloc(CONST_COPIED_PARAMS * sizeof(int));
assert(copied_params);
// storage for computed result
int *d_result;
int h_result;
checkCudaErrors(cudaMalloc(&d_result, sizeof(int)));
int expected_result = 0;
// fill in data for validation
for (int i = 0; i < KERNEL_PARAM_LIMIT; ++i) {
p.param[i] = (i & 0xFF);
}
for (int i = KERNEL_PARAM_LIMIT; i < TOTAL_PARAMS; ++i) {
copied_params[i - KERNEL_PARAM_LIMIT] = (i & 0xFF);
}
for (int i = 0; i < TOTAL_PARAMS; ++i) {
p_large.param[i] = (i & 0xFF);
expected_result += (i & 0xFF);
}
// warmup, verify correctness
checkCudaErrors(cudaMemcpyToSymbol(excess_params, copied_params, CONST_COPIED_PARAMS * sizeof(int), 0, cudaMemcpyHostToDevice));
kernelDefault<<<1,1>>>(p, d_result);
checkCudaErrors(cudaMemcpy(&h_result, d_result, sizeof(int), cudaMemcpyDeviceToHost));
checkCudaErrors(cudaDeviceSynchronize());
if(h_result != expected_result) {
std::cout << "Test failed" << std::endl;
rc=-1;
goto Exit;
}
kernelLargeParam<<<1,1>>>(p_large, d_result);
checkCudaErrors(cudaMemcpy(&h_result, d_result, sizeof(int), cudaMemcpyDeviceToHost));
checkCudaErrors(cudaDeviceSynchronize());
if(h_result != expected_result) {
std::cout << "Test failed" << std::endl;
rc=-1;
goto Exit;
}
// benchmark default kernel parameter limit
{
auto start = steady_clock::now();
for (int i = 0; i < TEST_ITERATIONS; ++i) {
checkCudaErrors(cudaMemcpyToSymbol(excess_params, copied_params, CONST_COPIED_PARAMS * sizeof(int), 0, cudaMemcpyHostToDevice));
kernelDefault<<<1, 1>>>(p, d_result);
}
checkCudaErrors(cudaDeviceSynchronize());
auto end = steady_clock::now();
std::cout << "Kernel 4KB parameter limit - time (us):";
report_time(start, end, TEST_ITERATIONS);
// benchmark large kernel parameter limit
start = steady_clock::now();
for (int i = 0; i < TEST_ITERATIONS; ++i) {
kernelLargeParam<<<1, 1>>>(p_large, d_result);
}
checkCudaErrors(cudaDeviceSynchronize());
end = steady_clock::now();
std::cout << "Kernel 32,764 byte parameter limit - time (us):";
report_time(start, end, TEST_ITERATIONS);
}
std::cout << "Test passed!" << std::endl;
rc=0;
Exit:
// cleanup
cudaFree(d_result);
free(copied_params);
return rc;
}