Merge branch 'pr/58'

Winslohw 2022-04-04 11:35:58 +02:00
commit f1701cf2fc


@@ -1,4 +1,4 @@
-/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -74,8 +74,7 @@ void randomInit(float *, int);
 extern "C" void computeGold(float *, const float *, const float *, unsigned int,
                             unsigned int, unsigned int);
-static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
-                    int *blk_size);
+static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul);
 
 #ifndef FATBIN_FILE
 #define FATBIN_FILE "matrixMul_kernel64.fatbin"
@@ -112,9 +111,9 @@ int main(int argc, char **argv) {
 void runTest(int argc, char **argv) {
   // initialize CUDA
   CUfunction matrixMul = NULL;
-  int block_size = 0;
-  initCUDA(argc, argv, &matrixMul, &block_size);
+  int block_size = 32;
+  initCUDA(argc, argv, &matrixMul);
 
   // set seed for rand()
   srand(2006);
@@ -167,14 +166,14 @@ void runTest(int argc, char **argv) {
   if (1) {
     // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
-    // Launching (simplier method)
+    // Launching (simpler method)
     size_t Matrix_Width_A = (size_t)WA;
     size_t Matrix_Width_B = (size_t)WB;
     void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
 
     // new CUDA 4.0 Driver API Kernel launch call
     checkCudaErrors(cuLaunchKernel(
         matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
         2 * block_size * block_size * sizeof(float), NULL, args, NULL));
   } else {
     // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
     // Launching (advanced method)
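
For reference, the "simpler method" in the hunk above passes kernel arguments as an array of pointers to each argument, in the kernel's parameter order. A minimal standalone sketch of that launch pattern follows; the function and buffer names here are illustrative, not taken from the sample:

#include <cuda.h>

// Sketch only: assumes a valid, loaded CUfunction and existing device buffers.
void launchTiledMatrixMul(CUfunction f, CUdeviceptr d_C, CUdeviceptr d_A,
                          CUdeviceptr d_B, size_t wA, size_t wB,
                          unsigned gx, unsigned gy) {
  const unsigned block_size = 32;
  // cuLaunchKernel receives POINTERS to the arguments, not their values.
  void *args[5] = {&d_C, &d_A, &d_B, &wA, &wB};
  // Dynamic shared memory: two block_size x block_size float tiles (one for A,
  // one for B) -- the origin of 2 * block_size * block_size * sizeof(float).
  unsigned shmem = 2 * block_size * block_size * sizeof(float);
  cuLaunchKernel(f, gx, gy, 1,              // grid dimensions
                 block_size, block_size, 1, // block dimensions
                 shmem, /*stream=*/NULL, args, /*extra=*/NULL);
}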
@@ -190,8 +189,8 @@ void runTest(int argc, char **argv) {
     *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_B;
     offset += sizeof(d_B);
 
-    size_t Matrix_Width_A = (size_t)WA;
-    size_t Matrix_Width_B = (size_t)WB;
+    size_t Matrix_Width_A = (size_t)WA;
+    size_t Matrix_Width_B = (size_t)WB;
 
     *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_A;
     offset += sizeof(Matrix_Width_A);
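
The "advanced method" this hunk touches instead packs every argument into one byte buffer at hand-computed offsets and hands it to cuLaunchKernel through the extra parameter. A condensed sketch of that pattern (names illustrative; real code must also respect each parameter's alignment, which this sketch glosses over):

#include <cuda.h>
#include <cstring>

// Sketch only: the same launch as above, via CU_LAUNCH_PARAM_BUFFER_POINTER.
void launchPacked(CUfunction f, CUdeviceptr d_C, CUdeviceptr d_A,
                  CUdeviceptr d_B, size_t wA, size_t wB,
                  unsigned gx, unsigned gy, unsigned block_size) {
  char argBuffer[256];
  size_t offset = 0;
  // Copy each argument into the buffer in the kernel's parameter order.
  memcpy(argBuffer + offset, &d_C, sizeof(d_C)); offset += sizeof(d_C);
  memcpy(argBuffer + offset, &d_A, sizeof(d_A)); offset += sizeof(d_A);
  memcpy(argBuffer + offset, &d_B, sizeof(d_B)); offset += sizeof(d_B);
  memcpy(argBuffer + offset, &wA, sizeof(wA));   offset += sizeof(wA);
  memcpy(argBuffer + offset, &wB, sizeof(wB));   offset += sizeof(wB);

  // The driver reads the packed buffer instead of a kernelParams array.
  void *extra[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
                   CU_LAUNCH_PARAM_BUFFER_SIZE, &offset,
                   CU_LAUNCH_PARAM_END};
  cuLaunchKernel(f, gx, gy, 1, block_size, block_size, 1,
                 2 * block_size * block_size * sizeof(float),
                 /*stream=*/NULL, /*kernelParams=*/NULL, extra);
}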
@@ -230,8 +229,9 @@ void runTest(int argc, char **argv) {
   printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
 
-  printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
-         "Results may vary when GPU Boost is enabled.\n");
+  printf(
+      "\nNOTE: The CUDA Samples are not meant for performance measurements. "
+      "Results may vary when GPU Boost is enabled.\n");
 
   // clean up memory
   free(h_A);
@@ -250,9 +250,9 @@ void randomInit(float *data, int size) {
   }
 }
 
-static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
-                    int *blk_size) {
+static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul) {
   CUfunction cuFunction = 0;
+  CUresult status;
   int major = 0, minor = 0;
   char deviceName[100];
 
@@ -276,45 +276,26 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
   std::string module_path;
   std::ostringstream fatbin;
 
-  if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
-    exit(EXIT_FAILURE);
-  } else {
-    printf("> initCUDA loading module: <%s>\n", module_path.c_str());
+  if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin))
+  {
+    exit(EXIT_FAILURE);
+  }
+  else
+  {
+    printf("> initCUDA loading module: <%s>\n", module_path.c_str());
   }
 
-  if (!fatbin.str().size()) {
-    printf("fatbin file empty. exiting..\n");
-    exit(EXIT_FAILURE);
+  if (!fatbin.str().size())
+  {
+    printf("fatbin file empty. exiting..\n");
+    exit(EXIT_FAILURE);
   }
 
   // Create module from binary file (FATBIN)
   checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));
-
-  // select the suitable kernel function
-  const char *kernels[] = {"matrixMul_bs32_64bit", "matrixMul_bs16_64bit",
-                           "matrixMul_bs8_64bit"};
-
-  int idx = 0;
-  int block_size = 32;
-  while (idx < 3) {
-    int threadsPerBlock = 0;
-    int blocksPerGrid = 0;
-
-    checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, kernels[idx]));
-    checkCudaErrors(cuOccupancyMaxPotentialBlockSize(
-        &blocksPerGrid, &threadsPerBlock, cuFunction, 0,
-        2 * block_size * block_size * sizeof(float), 0));
-    if (block_size * block_size <= threadsPerBlock) {
-      printf("> %d block size selected\n", block_size);
-      break;
-    } else {
-      block_size /= 2;
-    }
-    idx++;
-  }
+  checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, "matrixMul_bs32_64bit"));
 
   *pMatrixMul = cuFunction;
-  *blk_size = block_size;
 
   return 0;
 }
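
Net effect of this merge: initCUDA no longer probes the three precompiled kernel variants with cuOccupancyMaxPotentialBlockSize; it loads matrixMul_bs32_64bit unconditionally, and runTest hardcodes block_size = 32 to match. For context, the removed selection logic boils down to the following sketch, reconstructed from the deleted lines (pickKernel, outFn, and outBlkSize are illustrative names; error handling simplified):

#include <cuda.h>

// Try the 32x32, 16x16, then 8x8 tile variants, keeping the first one whose
// block fits within the occupancy calculator's recommended threads per block.
static int pickKernel(CUmodule cuModule, CUfunction *outFn, int *outBlkSize) {
  const char *kernels[] = {"matrixMul_bs32_64bit", "matrixMul_bs16_64bit",
                           "matrixMul_bs8_64bit"};
  int block_size = 32;
  for (int idx = 0; idx < 3; ++idx, block_size /= 2) {
    CUfunction fn = 0;
    int blocksPerGrid = 0, threadsPerBlock = 0;
    if (cuModuleGetFunction(&fn, cuModule, kernels[idx]) != CUDA_SUCCESS)
      return -1;
    // Ask the driver which block size maximizes occupancy for this kernel,
    // given its dynamic shared-memory footprint (two block_size^2 float tiles).
    if (cuOccupancyMaxPotentialBlockSize(
            &blocksPerGrid, &threadsPerBlock, fn, /*smemSizeFn=*/0,
            2 * block_size * block_size * sizeof(float),
            /*blockSizeLimit=*/0) != CUDA_SUCCESS)
      return -1;
    if (block_size * block_size <= threadsPerBlock) {
      *outFn = fn;
      *outBlkSize = block_size;
      return 0; // this variant's block shape is achievable on this device
    }
  }
  return -1; // no variant fit
}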