mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2025-04-19 22:32:23 +08:00
Merge branch 'pr/58'
This commit is contained in:
commit
f1701cf2fc
|
@ -1,4 +1,4 @@
|
||||||
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
* modification, are permitted provided that the following conditions
|
* modification, are permitted provided that the following conditions
|
||||||
|
@ -74,8 +74,7 @@ void randomInit(float *, int);
|
||||||
extern "C" void computeGold(float *, const float *, const float *, unsigned int,
|
extern "C" void computeGold(float *, const float *, const float *, unsigned int,
|
||||||
unsigned int, unsigned int);
|
unsigned int, unsigned int);
|
||||||
|
|
||||||
static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
|
static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul);
|
||||||
int *blk_size);
|
|
||||||
|
|
||||||
#ifndef FATBIN_FILE
|
#ifndef FATBIN_FILE
|
||||||
#define FATBIN_FILE "matrixMul_kernel64.fatbin"
|
#define FATBIN_FILE "matrixMul_kernel64.fatbin"
|
||||||
|
@ -112,9 +111,9 @@ int main(int argc, char **argv) {
|
||||||
void runTest(int argc, char **argv) {
|
void runTest(int argc, char **argv) {
|
||||||
// initialize CUDA
|
// initialize CUDA
|
||||||
CUfunction matrixMul = NULL;
|
CUfunction matrixMul = NULL;
|
||||||
int block_size = 0;
|
int block_size = 32;
|
||||||
|
|
||||||
initCUDA(argc, argv, &matrixMul, &block_size);
|
initCUDA(argc, argv, &matrixMul);
|
||||||
|
|
||||||
// set seed for rand()
|
// set seed for rand()
|
||||||
srand(2006);
|
srand(2006);
|
||||||
|
@ -167,7 +166,7 @@ void runTest(int argc, char **argv) {
|
||||||
|
|
||||||
if (1) {
|
if (1) {
|
||||||
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
|
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
|
||||||
// Launching (simplier method)
|
// Launching (simpler method)
|
||||||
size_t Matrix_Width_A = (size_t)WA;
|
size_t Matrix_Width_A = (size_t)WA;
|
||||||
size_t Matrix_Width_B = (size_t)WB;
|
size_t Matrix_Width_B = (size_t)WB;
|
||||||
void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
|
void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
|
||||||
|
@ -230,7 +229,8 @@ void runTest(int argc, char **argv) {
|
||||||
|
|
||||||
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
|
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
|
||||||
|
|
||||||
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
|
printf(
|
||||||
|
"\nNOTE: The CUDA Samples are not meant for performance measurements. "
|
||||||
"Results may vary when GPU Boost is enabled.\n");
|
"Results may vary when GPU Boost is enabled.\n");
|
||||||
|
|
||||||
// clean up memory
|
// clean up memory
|
||||||
|
@ -250,9 +250,9 @@ void randomInit(float *data, int size) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
|
static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul) {
|
||||||
int *blk_size) {
|
|
||||||
CUfunction cuFunction = 0;
|
CUfunction cuFunction = 0;
|
||||||
|
CUresult status;
|
||||||
int major = 0, minor = 0;
|
int major = 0, minor = 0;
|
||||||
char deviceName[100];
|
char deviceName[100];
|
||||||
|
|
||||||
|
@ -276,45 +276,26 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
|
||||||
std::string module_path;
|
std::string module_path;
|
||||||
std::ostringstream fatbin;
|
std::ostringstream fatbin;
|
||||||
|
|
||||||
if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
|
if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin))
|
||||||
|
{
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
} else {
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
printf("> initCUDA loading module: <%s>\n", module_path.c_str());
|
printf("> initCUDA loading module: <%s>\n", module_path.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!fatbin.str().size()) {
|
if (!fatbin.str().size())
|
||||||
|
{
|
||||||
printf("fatbin file empty. exiting..\n");
|
printf("fatbin file empty. exiting..\n");
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create module from binary file (FATBIN)
|
// Create module from binary file (FATBIN)
|
||||||
checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));
|
checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));
|
||||||
|
checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, "matrixMul_bs32_64bit"));
|
||||||
// select the suitable kernel function
|
|
||||||
const char *kernels[] = {"matrixMul_bs32_64bit", "matrixMul_bs16_64bit",
|
|
||||||
"matrixMul_bs8_64bit"};
|
|
||||||
|
|
||||||
int idx = 0;
|
|
||||||
int block_size = 32;
|
|
||||||
while (idx < 3) {
|
|
||||||
int threadsPerBlock = 0;
|
|
||||||
int blocksPerGrid = 0;
|
|
||||||
|
|
||||||
checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, kernels[idx]));
|
|
||||||
checkCudaErrors(cuOccupancyMaxPotentialBlockSize(
|
|
||||||
&blocksPerGrid, &threadsPerBlock, cuFunction, 0,
|
|
||||||
2 * block_size * block_size * sizeof(float), 0));
|
|
||||||
if (block_size * block_size <= threadsPerBlock) {
|
|
||||||
printf("> %d block size selected\n", block_size);
|
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
block_size /= 2;
|
|
||||||
}
|
|
||||||
idx++;
|
|
||||||
}
|
|
||||||
|
|
||||||
*pMatrixMul = cuFunction;
|
*pMatrixMul = cuFunction;
|
||||||
*blk_size = block_size;
|
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user