// Source: cuda-samples/Samples/matrixMulDrv/matrixMulDrv.cpp (437 lines, 14 KiB, C++)

/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Matrix multiplication: C = A * B.
* Host code.
*
* This sample implements matrix multiplication using the CUDA driver API.
* It has been written for clarity of exposition to illustrate various CUDA
* programming principles, not with the goal of providing the most
* performant generic kernel for matrix multiplication.
*
* CUBLAS provides high-performance matrix multiplication.
* See also:
* V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
* in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08),
* Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11.
*
* Volkov, V. 2010. Better performance at lower occupancy,
* GPU Technology Conference 2010 (GTC 2010).
*
*/
// includes, system
#include <builtin_types.h>
#include <cuda.h>
#include <drvapi_error_string.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
// includes, project
#include <helper_cuda_drvapi.h>
#include <helper_image.h>
#include <helper_string.h>
#include <helper_timer.h>
#include <cstring>
#include <iostream>
#include <string>
#include "matrixMul.h"
// Compile-time switch for the "64-bit address" code path. When true, runTest()
// first reserves four 1 GiB device allocations and, on devices that report
// more than 4 GiB of global memory, passes the matrix widths to the kernel as
// size_t instead of int.
const bool use_64bit_memory_address = false;
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
// Runs the complete sample: setup, kernel launch, verification and cleanup.
void runTest(int argc, char **argv);
// Fills a float buffer of the given length with values derived from rand().
void randomInit(float *, int);
// Declared with C linkage but neither defined nor called in the visible part
// of this file; presumably a host reference multiply provided by another
// translation unit -- TODO confirm.
extern "C" void computeGold(float *, const float *, const float *, unsigned int,
unsigned int, unsigned int);
// Selects the device, creates the context, loads the kernel module and
// returns the kernel handle through pMatrixMul.
static CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul);
// File names of the kernel module that initCUDA() searches for (PTX first,
// then cubin). The "64" variants are selected on _WIN64 / __LP64__ builds and
// the "32" variants on every other build.
#if defined(_WIN64) || defined(__LP64__)
#define PTX_FILE "matrixMul_kernel64.ptx"
#define CUBIN_FILE "matrixMul_kernel64.cubin"
#else
#define PTX_FILE "matrixMul_kernel32.ptx"
#define CUBIN_FILE "matrixMul_kernel32.cubin"
#endif
////////////////////////////////////////////////////////////////////////////////
// Globals
////////////////////////////////////////////////////////////////////////////////
// Device handle chosen by findCudaDeviceDRV() inside initCUDA().
CUdevice cuDevice;
// Context created by initCUDA(); destroyed at the end of runTest() or on the
// initCUDA() error path.
CUcontext cuContext;
// Module holding the matrix-multiply kernel, loaded by initCUDA().
CUmodule cuModule;
// Total global memory of the device in bytes, queried by initCUDA() and used
// to decide whether the device counts as having more than 4 GiB.
size_t totalGlobalMem;
// Banner text printed by main().
const char *sSDKsample = "matrixMulDrv (Driver API)";
// Writes the constant `val` into the first `size` elements of `data`.
// A zero or negative `size` leaves the buffer untouched.
void constantInit(float *data, int size, float val) {
  float *cursor = data;
  int remaining = size;

  while (remaining > 0) {
    *cursor = val;
    ++cursor;
    --remaining;
  }
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
// Entry point: prints the sample banner, then hands the command line to
// runTest(), which performs the whole demonstration.
int main(int argc, char **argv) {
  printf("[ %s ]\n", sSDKsample);

  runTest(argc, argv);

  return 0;
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) {
// initialize CUDA
CUfunction matrixMul = NULL;
int block_size = 32;
CUresult error_id = initCUDA(argc, argv, &matrixMul);
if (error_id != CUDA_SUCCESS) {
printf("initCUDA() returned %d\n-> %s\n", error_id,
getCudaDrvErrorString(error_id));
exit(EXIT_FAILURE);
}
// set seed for rand()
srand(2006);
// allocate host memory for matrices A and B
unsigned int size_A = WA * HA;
unsigned int mem_size_A = sizeof(float) * size_A;
float *h_A = reinterpret_cast<float *>(malloc(mem_size_A));
unsigned int size_B = WB * HB;
unsigned int mem_size_B = sizeof(float) * size_B;
float *h_B = reinterpret_cast<float *>(malloc(mem_size_B));
// initialize host memory
const float valB = 0.01f;
constantInit(h_A, size_A, 1.0f);
constantInit(h_B, size_B, valB);
// First reserve about 4GB of memory, so we ensure that all memory allocated
// afterwards is > 4GB
CUdeviceptr d_Mem[4];
if (use_64bit_memory_address) {
unsigned int mem_size = 1024 * 1024 * 1024;
checkCudaErrors(cuMemAlloc(&d_Mem[0], mem_size));
checkCudaErrors(cuMemAlloc(&d_Mem[1], mem_size));
checkCudaErrors(cuMemAlloc(&d_Mem[2], mem_size));
checkCudaErrors(cuMemAlloc(&d_Mem[3], mem_size));
}
// allocate device memory
CUdeviceptr d_A;
checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
CUdeviceptr d_B;
checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
// copy host memory to device
checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));
// allocate device memory for result
size_t size_C = WC * HC;
size_t mem_size_C = sizeof(float) * size_C;
CUdeviceptr d_C;
checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
// allocate mem for the result on host side
float *h_C = reinterpret_cast<float *>(malloc(mem_size_C));
// create and start timer
StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer);
// start the timer
sdkStartTimer(&timer);
// There are two ways to launch CUDA kernels via the Driver API.
// In this CUDA Sample, we illustrate both ways to pass parameters
// and specify parameters. By default we use the simpler method.
dim3 block(block_size, block_size, 1);
dim3 grid(WC / block_size, HC / block_size, 1);
if (1) {
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
// Launching (simplier method)
if (use_64bit_memory_address &&
(totalGlobalMem > (uint64_t)4 * 1024 * 1024 * 1024L)) {
size_t Matrix_Width_A = (size_t)WA;
size_t Matrix_Width_B = (size_t)WB;
void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
// new CUDA 4.0 Driver API Kernel launch call
checkCudaErrors(cuLaunchKernel(
matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
2 * block_size * block_size * sizeof(float), NULL, args, NULL));
} else {
int Matrix_Width_A = WA;
int Matrix_Width_B = WB;
void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
// new CUDA 4.0 Driver API Kernel launch call
checkCudaErrors(cuLaunchKernel(
matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
2 * block_size * block_size * sizeof(float), NULL, args, NULL));
}
} else {
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
// Launching (advanced method)
int offset = 0;
char argBuffer[256];
// pass in launch parameters (not actually de-referencing CUdeviceptr).
// CUdeviceptr is storing the value of the parameters
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_C;
offset += sizeof(d_C);
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_A;
offset += sizeof(d_A);
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_B;
offset += sizeof(d_B);
if (use_64bit_memory_address &&
(totalGlobalMem > (uint64_t)4 * 1024 * 1024 * 1024L)) {
size_t Matrix_Width_A = (size_t)WA;
size_t Matrix_Width_B = (size_t)WB;
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_A;
offset += sizeof(Matrix_Width_A);
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_B;
offset += sizeof(Matrix_Width_B);
} else {
int Matrix_Width_A = WA;
int Matrix_Width_B = WB;
*(reinterpret_cast<int *>(&argBuffer[offset])) = Matrix_Width_A;
offset += sizeof(Matrix_Width_A);
*(reinterpret_cast<int *>(&argBuffer[offset])) = Matrix_Width_B;
offset += sizeof(Matrix_Width_B);
}
void *kernel_launch_config[5] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
CU_LAUNCH_PARAM_BUFFER_SIZE, &offset,
CU_LAUNCH_PARAM_END};
// new CUDA 4.0 Driver API Kernel launch call
checkCudaErrors(cuLaunchKernel(
matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
2 * block_size * block_size * sizeof(float), NULL, NULL,
reinterpret_cast<void **>(&kernel_launch_config)));
}
// copy result from device to host
checkCudaErrors(cuMemcpyDtoH(reinterpret_cast<void *>(h_C), d_C, mem_size_C));
// stop and destroy timer
sdkStopTimer(&timer);
printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
sdkDeleteTimer(&timer);
printf("Checking computed result for correctness: ");
bool correct = true;
for (int i = 0; i < static_cast<int>(WC * HC); i++) {
if (fabs(h_C[i] - (WA * valB)) > 1e-5) {
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i,
h_C[i], WA * valB);
correct = false;
}
}
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
printf(
"\nNOTE: The CUDA Samples are not meant for performance measurements. "
"Results may vary when GPU Boost is enabled.\n");
// clean up memory
if (use_64bit_memory_address) {
cuMemFree(d_Mem[0]);
cuMemFree(d_Mem[1]);
cuMemFree(d_Mem[2]);
cuMemFree(d_Mem[3]);
}
free(h_A);
free(h_B);
free(h_C);
checkCudaErrors(cuMemFree(d_A));
checkCudaErrors(cuMemFree(d_B));
checkCudaErrors(cuMemFree(d_C));
checkCudaErrors(cuCtxDestroy(cuContext));
}
// Fills the first `size` elements of `data` with pseudo-random floats, each
// computed as rand() divided by RAND_MAX. No memory is allocated here; the
// caller owns the buffer. A zero or negative `size` writes nothing.
void randomInit(float *data, int size) {
  if (size <= 0) {
    return;
  }

  const float scale = static_cast<float>(RAND_MAX);
  float *const stop = data + size;

  for (float *slot = data; slot != stop; ++slot) {
    *slot = rand() / scale;
  }
}
// Resolves the location of a kernel module file and, for PTX files, loads the
// file contents.
//
// Parameters:
//   module_file  file name to look for (passed to sdkFindFilePath together
//                with argv[0]).
//   module_path  receives the resolved path on success.
//   argv         program arguments; only argv[0] is used.
//   ptx_source   receives the complete file contents when the resolved path
//                contains ".ptx"; left untouched otherwise.
//
// Returns true when the file was found (and, for PTX, read completely);
// returns false after printing a message when the file is missing or cannot
// be opened or read.
//
// NOTE(review): the buffer returned by sdkFindFilePath is not released here;
// confirm whether that helper transfers ownership to the caller.
bool inline findModulePath(const char *module_file, std::string &module_path,
                           char **argv, std::string &ptx_source) {
  char *actual_path = sdkFindFilePath(module_file, argv[0]);

  if (actual_path == NULL) {
    printf("> findModulePath file not found: <%s> \n", module_file);
    return false;
  }

  module_path = actual_path;

  if (module_path.empty()) {
    printf("> findModulePath file not found: <%s> \n", module_file);
    return false;
  }

  printf("> findModulePath <%s>\n", module_path.c_str());

  // Only PTX modules are read into memory; other files are loaded by path.
  if (module_path.rfind(".ptx") == std::string::npos) {
    return true;
  }

  FILE *fp = fopen(module_path.c_str(), "rb");

  if (fp == NULL) {
    printf("> findModulePath could not open <%s>\n", module_path.c_str());
    return false;
  }

  // Determine the file size by seeking to the end; every step can fail, and a
  // negative size must never reach the allocation or fread below.
  long file_size = -1;

  if (fseek(fp, 0, SEEK_END) == 0) {
    file_size = ftell(fp);
  }

  if (file_size < 0 || fseek(fp, 0, SEEK_SET) != 0) {
    printf("> findModulePath could not determine size of <%s>\n",
           module_path.c_str());
    fclose(fp);
    return false;
  }

  const size_t expected_bytes = static_cast<size_t>(file_size);
  char *buf = new char[expected_bytes + 1];
  const size_t bytes_read = fread(buf, sizeof(char), expected_bytes, fp);
  fclose(fp);

  if (bytes_read != expected_bytes) {
    printf("> findModulePath could not read <%s>\n", module_path.c_str());
    delete[] buf;
    return false;
  }

  // Terminate the text so it can be handed over as a C string.
  buf[expected_bytes] = '\0';
  ptx_source = buf;
  delete[] buf;

  return true;
}
// Prepares everything needed to launch the matrix-multiply kernel.
//
// Steps:
//   1. pick a device with findCudaDeviceDRV() and print its compute
//      capability and total global memory (stored in totalGlobalMem);
//   2. create the context (cuContext);
//   3. locate the kernel module, preferring PTX_FILE over CUBIN_FILE;
//   4. load it into cuModule -- PTX goes through cuModuleLoadDataEx with a
//      1024-byte info log and a register limit of 32, anything else through
//      cuModuleLoad;
//   5. look up the kernel entry point and return it through pMatrixMul.
//
// Parameters:
//   argc, argv   forwarded to findCudaDeviceDRV(); argv[0] is also used to
//                locate the module file.
//   pMatrixMul   receives the kernel handle on success; not written on
//                failure.
//
// Returns CUDA_SUCCESS on success. On failure returns the failing status
// (CUDA_ERROR_NOT_FOUND when no module file is found); if the context had
// already been created it is destroyed before returning.
static CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul) {
  CUfunction cuFunction = 0;
  CUresult status;
  int major = 0, minor = 0;
  char deviceName[100];
  std::string module_path, ptx_source;

  cuDevice = findCudaDeviceDRV(argc, (const char **)argv);

  // get compute capabilities and the devicename
  checkCudaErrors(cuDeviceGetAttribute(
      &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
  checkCudaErrors(cuDeviceGetAttribute(
      &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
  // The length passed to the driver must be the real size of the buffer.
  checkCudaErrors(cuDeviceGetName(
      deviceName, static_cast<int>(sizeof(deviceName)), cuDevice));
  printf("> GPU Device has SM %d.%d compute capability\n", major, minor);

  checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, cuDevice));
  printf(" Total amount of global memory: %llu bytes\n",
         (long long unsigned int)totalGlobalMem);
  printf(" 64-bit Memory Address: %s\n",
         (totalGlobalMem > (uint64_t)4 * 1024 * 1024 * 1024L) ? "YES" : "NO");

  status = cuCtxCreate(&cuContext, 0, cuDevice);

  if (CUDA_SUCCESS != status) {
    // No context exists yet, so there is nothing to destroy.
    return status;
  }

  // first search for the module path before we load the results
  if (!findModulePath(PTX_FILE, module_path, argv, ptx_source)) {
    if (!findModulePath(CUBIN_FILE, module_path, argv, ptx_source)) {
      printf(
          "> findModulePath could not find <matrixMul_kernel> ptx or cubin\n");
      status = CUDA_ERROR_NOT_FOUND;
      goto Error;
    }
  } else {
    printf("> initCUDA loading module: <%s>\n", module_path.c_str());
  }

  // Same ".ptx" test that findModulePath() uses to decide whether to read the
  // file into ptx_source.
  if (module_path.rfind(".ptx") != std::string::npos) {
    // in this branch we use compilation with parameters
    const unsigned int jitNumOptions = 3;
    CUjit_option jitOptions[jitNumOptions];
    void *jitOptVals[jitNumOptions];

    // Zero-filled so that printing the log is safe even when the driver
    // writes nothing into it.
    const size_t jitLogBufferSize = 1024;
    char jitLogBuffer[jitLogBufferSize] = {0};

    // set up size of compilation log buffer (passed by value in the pointer)
    jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
    jitOptVals[0] = reinterpret_cast<void *>(jitLogBufferSize);

    // set up pointer to the compilation log buffer
    jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
    jitOptVals[1] = jitLogBuffer;

    // set up pointer to set the Maximum # of registers for a particular kernel
    jitOptions[2] = CU_JIT_MAX_REGISTERS;
    const size_t jitRegCount = 32;
    jitOptVals[2] = reinterpret_cast<void *>(jitRegCount);

    status = cuModuleLoadDataEx(&cuModule, ptx_source.c_str(), jitNumOptions,
                                jitOptions, jitOptVals);

    // Guarantee termination before treating the log as a C string.
    jitLogBuffer[jitLogBufferSize - 1] = '\0';
    printf("> PTX JIT log:\n%s\n", jitLogBuffer);
  } else {
    status = cuModuleLoad(&cuModule, module_path.c_str());
  }

  if (CUDA_SUCCESS != status) {
    goto Error;
  }

  // Pick the kernel variant with the same condition runTest() uses to choose
  // between size_t and int width arguments.
  if (use_64bit_memory_address &&
      (totalGlobalMem > (uint64_t)4 * 1024 * 1024 * 1024L)) {
    status = cuModuleGetFunction(&cuFunction, cuModule, "matrixMul_bs32_64bit");
  } else {
    status = cuModuleGetFunction(&cuFunction, cuModule, "matrixMul_bs32_32bit");
  }

  if (CUDA_SUCCESS != status) {
    goto Error;
  }

  *pMatrixMul = cuFunction;

  return CUDA_SUCCESS;

Error:
  // Reached only after the context was created successfully.
  cuCtxDestroy(cuContext);
  return status;
}