Mirror of https://github.com/NVIDIA/cuda-samples.git
Synced 2025-04-12 20:38:31 +08:00

Merge branch 'pr/58'

This commit is contained in:
commit f1701cf2fc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -74,8 +74,7 @@ void randomInit(float *, int);
 extern "C" void computeGold(float *, const float *, const float *, unsigned int,
                             unsigned int, unsigned int);
 
-static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
-                    int *blk_size);
+static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul);
 
 #ifndef FATBIN_FILE
 #define FATBIN_FILE "matrixMul_kernel64.fatbin"
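Before the declaration above is ever called, initCUDA() has to stand up the driver API from scratch. A minimal sketch of that bring-up, with error checking elided and a hard-coded path standing in for the sample's runtime lookup of FATBIN_FILE (loadMatrixMul and fatbinPath are illustrative names, not from the sample):

#include <cuda.h>

// Hedged sketch, not the sample's code: load a fatbin and return the
// 32x32 kernel handle. Every cuXxx call here should really be wrapped
// in checkCudaErrors() the way the sample does.
static CUfunction loadMatrixMul(const char *fatbinPath /* assumed path */)
{
    CUdevice   dev;
    CUcontext  ctx;
    CUmodule   mod;
    CUfunction fn;

    cuInit(0);                  // must precede every other driver call
    cuDeviceGet(&dev, 0);       // first visible device
    cuCtxCreate(&ctx, 0, dev);  // context becomes current on this thread

    // The sample reads the file itself and passes the bytes to
    // cuModuleLoadData; cuModuleLoad is the shortcut that reads from disk.
    cuModuleLoad(&mod, fatbinPath);
    cuModuleGetFunction(&fn, mod, "matrixMul_bs32_64bit");
    return fn;
}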
@@ -112,9 +111,9 @@ int main(int argc, char **argv) {
 void runTest(int argc, char **argv) {
   // initialize CUDA
   CUfunction matrixMul = NULL;
-  int block_size = 0;
+  int block_size = 32;
 
-  initCUDA(argc, argv, &matrixMul, &block_size);
+  initCUDA(argc, argv, &matrixMul);
 
   // set seed for rand()
   srand(2006);
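block_size is no longer returned by initCUDA(); it is pinned to 32, which later shapes the launch geometry and the dynamic shared-memory request. Roughly, and hedged because the surrounding code is not part of this diff (WC/HC are assumed to be the sample's result-matrix width/height constants):

// Sketch of how the fixed block_size feeds the launch in runTest().
dim3 block(block_size, block_size, 1);           // 32x32 = 1024 threads/block
dim3 grid(WC / block_size, HC / block_size, 1);  // one output tile per block

// Two block_size x block_size float tiles (one of A, one of B) live in
// dynamic shared memory, hence 2 * bs * bs * sizeof(float) at launch.
size_t sharedMemBytes = 2 * block_size * block_size * sizeof(float);

Hardcoding 32 assumes the device supports 1024-thread blocks, which every GPU covered by current CUDA releases does; the fallback deleted in the last hunk existed for older limits.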
@@ -167,14 +166,14 @@ void runTest(int argc, char **argv) {
 
   if (1) {
     // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
-    // Launching (simplier method)
-    size_t Matrix_Width_A = (size_t)WA;
-    size_t Matrix_Width_B = (size_t)WB;
-    void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
-    // new CUDA 4.0 Driver API Kernel launch call
-    checkCudaErrors(cuLaunchKernel(
-        matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
-        2 * block_size * block_size * sizeof(float), NULL, args, NULL));
+    // Launching (simpler method)
+    size_t Matrix_Width_A = (size_t)WA;
+    size_t Matrix_Width_B = (size_t)WB;
+    void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
+    // new CUDA 4.0 Driver API Kernel launch call
+    checkCudaErrors(cuLaunchKernel(
+        matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
+        2 * block_size * block_size * sizeof(float), NULL, args, NULL));
   } else {
     // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
     // Launching (advanced method)
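On this "simple" path, kernelParams is an array of pointers, one per kernel argument in signature order; the driver dereferences each pointer at launch time, so the pointed-to variables must stay alive until cuLaunchKernel returns. A self-contained sketch under the same names the sample uses (the kernel signature is assumed from the argument order):

// Assumed signature: matrixMul(float *C, float *A, float *B,
//                              size_t wA, size_t wB)
size_t Matrix_Width_A = (size_t)WA;  // locals must outlive the launch call
size_t Matrix_Width_B = (size_t)WB;
void  *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};

checkCudaErrors(cuLaunchKernel(matrixMul,
                               grid.x, grid.y, grid.z,     // grid dims
                               block.x, block.y, block.z,  // block dims
                               2 * block_size * block_size * sizeof(float),
                               NULL,     // stream 0 (the default stream)
                               args,     // kernelParams
                               NULL));   // extra launch options unused
checkCudaErrors(cuCtxSynchronize());     // launches are asynchronous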
@@ -190,8 +189,8 @@ void runTest(int argc, char **argv) {
     *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_B;
     offset += sizeof(d_B);
 
-    size_t Matrix_Width_A = (size_t)WA;
-    size_t Matrix_Width_B = (size_t)WB;
+    size_t Matrix_Width_A = (size_t)WA;
+    size_t Matrix_Width_B = (size_t)WB;
 
     *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_A;
     offset += sizeof(Matrix_Width_A);
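This else-branch is the "advanced" method: all arguments are packed into one raw byte buffer at increasing offsets, then handed over through cuLaunchKernel's extra parameter instead of kernelParams. A hedged sketch of the whole mechanism (the real code must also respect each argument's alignment when advancing offset):

char   argBuffer[256];
size_t offset = 0;

// Pack arguments back to back, matching the kernel's parameter layout.
*reinterpret_cast<CUdeviceptr *>(&argBuffer[offset]) = d_C;
offset += sizeof(d_C);
*reinterpret_cast<CUdeviceptr *>(&argBuffer[offset]) = d_A;
offset += sizeof(d_A);
*reinterpret_cast<CUdeviceptr *>(&argBuffer[offset]) = d_B;
offset += sizeof(d_B);
*reinterpret_cast<size_t *>(&argBuffer[offset]) = Matrix_Width_A;
offset += sizeof(Matrix_Width_A);
*reinterpret_cast<size_t *>(&argBuffer[offset]) = Matrix_Width_B;
offset += sizeof(Matrix_Width_B);

// Describe the packed buffer through the "extra" option list;
// kernelParams must then be NULL.
void *extra[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
                 CU_LAUNCH_PARAM_BUFFER_SIZE,    &offset,
                 CU_LAUNCH_PARAM_END};
checkCudaErrors(cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z,
                               block.x, block.y, block.z,
                               2 * block_size * block_size * sizeof(float),
                               NULL,    // default stream
                               NULL,    // kernelParams unused on this path
                               extra));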
@@ -230,8 +229,9 @@ void runTest(int argc, char **argv) {
 
   printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
 
-  printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
-         "Results may vary when GPU Boost is enabled.\n");
+  printf(
+      "\nNOTE: The CUDA Samples are not meant for performance measurements. "
+      "Results may vary when GPU Boost is enabled.\n");
 
   // clean up memory
   free(h_A);
@@ -250,9 +250,9 @@ void randomInit(float *data, int size) {
   }
 }
 
-static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
-                    int *blk_size) {
+static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul) {
   CUfunction cuFunction = 0;
   CUresult status;
   int major = 0, minor = 0;
   char deviceName[100];
@@ -276,45 +276,26 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
   std::string module_path;
   std::ostringstream fatbin;
 
-  if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
-    exit(EXIT_FAILURE);
-  } else {
-    printf("> initCUDA loading module: <%s>\n", module_path.c_str());
-  }
+  if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin))
+  {
+    exit(EXIT_FAILURE);
+  }
+  else
+  {
+    printf("> initCUDA loading module: <%s>\n", module_path.c_str());
+  }
 
-  if (!fatbin.str().size()) {
-    printf("fatbin file empty. exiting..\n");
-    exit(EXIT_FAILURE);
-  }
+  if (!fatbin.str().size())
+  {
+    printf("fatbin file empty. exiting..\n");
+    exit(EXIT_FAILURE);
+  }
 
   // Create module from binary file (FATBIN)
   checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));
 
-  // select the suitable kernel function
-  const char *kernels[] = {"matrixMul_bs32_64bit", "matrixMul_bs16_64bit",
-                           "matrixMul_bs8_64bit"};
-
-  int idx = 0;
-  int block_size = 32;
-  while (idx < 3) {
-    int threadsPerBlock = 0;
-    int blocksPerGrid = 0;
-
-    checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, kernels[idx]));
-    checkCudaErrors(cuOccupancyMaxPotentialBlockSize(
-        &blocksPerGrid, &threadsPerBlock, cuFunction, 0,
-        2 * block_size * block_size * sizeof(float), 0));
-    if (block_size * block_size <= threadsPerBlock) {
-      printf("> %d block size selected\n", block_size);
-      break;
-    } else {
-      block_size /= 2;
-    }
-    idx++;
-  }
+  checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, "matrixMul_bs32_64bit"));
 
   *pMatrixMul = cuFunction;
-  *blk_size = block_size;
 
   return 0;
 }
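The removed loop tried the bs32, bs16, and bs8 kernel variants in turn, asking the occupancy calculator whether a block_size x block_size block fits within the block size it suggests, halving the tile until it did. A standalone sketch of one round of that query, hedged in that it only mirrors what the deleted code appears to do:

// For one candidate kernel, ask the driver for the occupancy-maximizing
// block size given our dynamic shared-memory footprint.
int blocksPerGrid = 0, threadsPerBlock = 0;
int block_size = 32;  // candidate tile edge

checkCudaErrors(
    cuModuleGetFunction(&cuFunction, cuModule, "matrixMul_bs32_64bit"));
checkCudaErrors(cuOccupancyMaxPotentialBlockSize(
    &blocksPerGrid,    // out: minimum grid size for full occupancy
    &threadsPerBlock,  // out: suggested threads per block
    cuFunction,
    0,                 // no block-size-to-smem callback needed
    2 * block_size * block_size * sizeof(float),  // dynamic smem bytes
    0));               // no upper block-size limit

if (block_size * block_size > threadsPerBlock) {
    // 1024 threads do not fit: the old code retried with the
    // matrixMul_bs16_64bit kernel and block_size = 16, then bs8 / 8.
}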