cuda-samples/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/main.cu

899 lines
32 KiB
Plaintext
Raw Normal View History

/* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "cudla.h"
#include "cuda_runtime.h"
#include "cudlaExternalEtbl.hpp"
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <sys/stat.h>
#include <fstream>
#include <sstream>
#define MAX_FILENAME_LEN 200
#define RESERVED_SUFFIX_LEN 10
#define DPRINTF(...) printf(__VA_ARGS__)
/*
 * Prints every field of a single cuDLA tensor descriptor to stdout via DPRINTF.
 *
 * tensorDesc: descriptor to dump. A NULL pointer is reported and ignored
 *             instead of being dereferenced.
 *
 * NOTE(review): the format specifiers are kept exactly as they were; they
 * presumably match the field types declared in cudla.h - confirm there.
 */
static void printTensorDesc(cudlaModuleTensorDescriptor* tensorDesc) {
    if (tensorDesc == NULL) {
        DPRINTF("\tTENSOR DESCRIPTOR : (null)\n");
        return;
    }

    // Identification and overall size.
    DPRINTF("\tTENSOR NAME : %s\n", tensorDesc->name);
    DPRINTF("\tsize: %lu\n", tensorDesc->size);

    // Logical dimensions, printed in n, c, h, w order.
    DPRINTF("\tdims: [%lu, %lu, %lu, %lu]\n", tensorDesc->n, tensorDesc->c,
            tensorDesc->h, tensorDesc->w);

    // Format / type codes are printed as raw integers.
    DPRINTF("\tdata fmt: %d\n", tensorDesc->dataFormat);
    DPRINTF("\tdata type: %d\n", tensorDesc->dataType);
    DPRINTF("\tdata category: %d\n", tensorDesc->dataCategory);
    DPRINTF("\tpixel fmt: %d\n", tensorDesc->pixelFormat);
    DPRINTF("\tpixel mapping: %d\n", tensorDesc->pixelMapping);

    // First four stride entries.
    DPRINTF("\tstride[0]: %d\n", tensorDesc->stride[0]);
    DPRINTF("\tstride[1]: %d\n", tensorDesc->stride[1]);
    DPRINTF("\tstride[2]: %d\n", tensorDesc->stride[2]);
    DPRINTF("\tstride[3]: %d\n", tensorDesc->stride[3]);
}
// Bookkeeping record of every resource main() acquires, so that cleanUp()
// can release whatever has been set so far. main() zero-fills the record
// before use and cleanUp() skips every member that is still NULL.
typedef struct {
// cuDLA device handle; released with cudlaDestroyDevice.
cudlaDevHandle devHandle;
// Loaded cuDLA module; released with cudlaModuleUnload.
cudlaModule moduleHandle;
// Host copy of the loadable file contents (malloc'ed).
unsigned char* loadableData;
// CUDA stream used for copies and task submission; released with cudaStreamDestroy.
cudaStream_t stream;
// Element counts of the per-tensor arrays below; cleanUp uses them as loop bounds.
uint32_t numInputTensors;
uint32_t numOutputTensors;
uint32_t numOutputTaskStatistics;
// Arrays of malloc'ed host buffers, one entry per input / output / statistics tensor.
unsigned char** inputBuffer;
unsigned char** outputBuffer;
unsigned char** statisticsOutputBuffer;
// Arrays of device pointers obtained from cudaMalloc; each entry is released with cudaFree.
void** inputBufferGPU;
void** outputBufferGPU;
void** outputTaskStatisticsGPU;
// Array of malloc'ed buffers (numOutputTaskStatistics entries) holding the translated CSV text.
void **csv;
// malloc'ed tensor descriptor arrays filled in by cudlaModuleGetAttributes.
cudlaModuleTensorDescriptor* inputTensorDesc;
cudlaModuleTensorDescriptor* outputTensorDesc;
cudlaModuleTensorDescriptor* outputTaskStatisticsDesc;
// Arrays of the pointers written by cudlaMemRegister. cleanUp frees only the
// arrays themselves; it does not call cudlaMemUnregister on the entries.
uint64_t** inputBufferRegisteredPtr;
uint64_t** outputBufferRegisteredPtr;
uint64_t** outputTaskStatisticsRegisteredPtr;
// Freed by cleanUp when non-NULL. Presumably the combined output-tensor +
// task-statistics pointer list handed to the task - confirm against main().
uint64_t** outputStatisticsBufferRegisteredPtr;
} ResourceList;
void cleanUp(ResourceList* resourceList);

// Frees one malloc'ed pointer (if set) and resets it to NULL.
template <typename T>
static void releaseHostPtr(T*& ptr) {
    if (ptr == NULL) {
        return;
    }
    free(ptr);
    ptr = NULL;
}

// Frees every non-NULL malloc'ed entry of a host pointer table, then the table itself.
template <typename T>
static void releaseHostTable(T**& table, uint32_t count) {
    if (table == NULL) {
        return;
    }
    for (uint32_t slot = 0; slot < count; ++slot) {
        releaseHostPtr(table[slot]);
    }
    free(table);
    table = NULL;
}

// cudaFree()s every non-NULL entry of a device pointer table, then frees the table itself.
static void releaseDeviceTable(void**& table, uint32_t count) {
    if (table == NULL) {
        return;
    }
    for (uint32_t slot = 0; slot < count; ++slot) {
        if (table[slot] != NULL) {
            cudaFree(table[slot]);
            table[slot] = NULL;
        }
    }
    free(table);
    table = NULL;
}

/*
 * Releases everything recorded in resourceList and resets each member.
 *
 * Members that are still NULL are skipped, so the function can be called at
 * any point after the record has been zero-initialised. Return codes of the
 * CUDA / cuDLA release calls are not inspected. Registered pointers are not
 * unregistered here; only the arrays that hold them are freed.
 */
void cleanUp(ResourceList* resourceList) {
    ResourceList* const res = resourceList;

    // Descriptor arrays and the host copy of the loadable.
    releaseHostPtr(res->inputTensorDesc);
    releaseHostPtr(res->outputTensorDesc);
    releaseHostPtr(res->outputTaskStatisticsDesc);
    releaseHostPtr(res->loadableData);

    // cuDLA module first, then the device it was loaded on.
    if (res->moduleHandle != NULL) {
        cudlaModuleUnload(res->moduleHandle, 0);
        res->moduleHandle = NULL;
    }
    if (res->devHandle != NULL) {
        cudlaDestroyDevice(res->devHandle);
        res->devHandle = NULL;
    }

    // Device-side buffers.
    releaseDeviceTable(res->inputBufferGPU, res->numInputTensors);
    releaseDeviceTable(res->outputBufferGPU, res->numOutputTensors);
    releaseDeviceTable(res->outputTaskStatisticsGPU, res->numOutputTaskStatistics);

    // Host-side buffers, including the translated CSV streams.
    releaseHostTable(res->csv, res->numOutputTaskStatistics);
    releaseHostTable(res->inputBuffer, res->numInputTensors);
    releaseHostTable(res->outputBuffer, res->numOutputTensors);
    releaseHostTable(res->statisticsOutputBuffer, res->numOutputTaskStatistics);

    if (res->stream != NULL) {
        cudaStreamDestroy(res->stream);
        res->stream = NULL;
    }

    // Only the arrays holding the registered pointers are freed.
    releaseHostPtr(res->inputBufferRegisteredPtr);
    releaseHostPtr(res->outputBufferRegisteredPtr);
    releaseHostPtr(res->outputTaskStatisticsRegisteredPtr);
    releaseHostPtr(res->outputStatisticsBufferRegisteredPtr);

    // The counts are cleared last because the table releases above use them.
    res->numInputTensors = 0;
    res->numOutputTensors = 0;
    res->numOutputTaskStatistics = 0;
}
/*
 * Runs a cuDLA loadable once in hybrid (CUDLA_CUDA_DLA) mode and, when
 * requested, translates the layer-wise task statistics to CSV.
 *
 * Command line:
 *   argv[1]  path of the loadable file
 *   argv[2]  DLA frequency in MHz, forwarded to the statistics translator
 *   argv[3]  1 to request layer-wise statistics; any other value skips them
 *   argv[4]  optional filename prefix; statistics buffer i is written to
 *            "<prefix><i+1>.csv". Without it the CSV text is printed.
 *
 * Returns 0 on success and 1 on any failure. Once the loadable has been read,
 * every failure path releases the acquired resources through cleanUp().
 */
int main(int argc, char** argv) {
    cudlaDevHandle devHandle;
    cudlaModule moduleHandle;
    cudlaStatus err;
    uint32_t statSupport = 0;
    uint32_t dlaFreqInMHz = 0;
    FILE* fp = NULL;
    struct stat st;
    size_t file_size;
    size_t actually_read = 0;
    unsigned char* loadableData = NULL;
    char filename[MAX_FILENAME_LEN];
    const char* suffix = ".csv";
    cudaStream_t stream;
    cudaError_t result;
    const char* errPtr = NULL;
    ResourceList resourceList;

    memset(&resourceList, 0x00, sizeof(ResourceList));

    if ((argc != 4) && (argc != 5)) {
        DPRINTF("Usage : ./test_cudla_layerwise_stats_L0_hybrid_test1 <loadable> <freqMHZ> <statSupport> <filename prefix>\n");
        return 1;
    }

    if (argc == 5) {
        if ((strlen(argv[4])) > (MAX_FILENAME_LEN - RESERVED_SUFFIX_LEN)) {
            DPRINTF("Filename prefix length is too big, greater than maximum permissible prefix length of %u \n",
                    (unsigned)(MAX_FILENAME_LEN - RESERVED_SUFFIX_LEN));
            return 1;
        }
    }

    // Read loadable into buffer. The file handle is closed on every path.
    fp = fopen(argv[1], "rb");
    if (fp == NULL) {
        DPRINTF("Cannot open file %s\n", argv[1]);
        return 1;
    }

    if (stat(argv[1], &st) != 0) {
        DPRINTF("Cannot stat file\n");
        fclose(fp);
        return 1;
    }

    file_size = st.st_size;
    DPRINTF("The file size = %zu\n", file_size);

    dlaFreqInMHz = atoi(argv[2]);
    statSupport = atoi(argv[3]);

    loadableData = (unsigned char*)malloc(file_size);
    if (loadableData == NULL) {
        DPRINTF("Cannot Allocate memory for loadable\n");
        fclose(fp);
        return 1;
    }

    actually_read = fread(loadableData, 1, file_size, fp);
    fclose(fp);
    fp = NULL;
    if (actually_read != file_size) {
        free(loadableData);
        DPRINTF("Read wrong size\n");
        return 1;
    }
    resourceList.loadableData = loadableData;

    // Initialize CUDA.
    result = cudaFree(0);
    if (result != cudaSuccess) {
        errPtr = cudaGetErrorName(result);
        DPRINTF("Error in creating cudaFree = %s\n", errPtr);
        cleanUp(&resourceList);
        return 1;
    }

    result = cudaSetDevice(0);
    if (result != cudaSuccess) {
        errPtr = cudaGetErrorName(result);
        DPRINTF("Error in creating cudaSetDevice = %s\n", errPtr);
        cleanUp(&resourceList);
        return 1;
    }

    err = cudlaCreateDevice(0, &devHandle, CUDLA_CUDA_DLA);
    if (err != cudlaSuccess) {
        DPRINTF("Error in cuDLA create device = %d\n", err);
        cleanUp(&resourceList);
        return 1;
    }
    DPRINTF("Device created successfully\n");
    resourceList.devHandle = devHandle;

    err = cudlaModuleLoadFromMemory(devHandle, loadableData, file_size, &moduleHandle, 0);
    if (err != cudlaSuccess) {
        DPRINTF("Error in cudlaModuleLoadFromMemory = %d\n", err);
        cleanUp(&resourceList);
        return 1;
    } else {
        DPRINTF("Successfully loaded module\n");
    }
    resourceList.moduleHandle = moduleHandle;

    // Create CUDA stream.
    result = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    if (result != cudaSuccess) {
        errPtr = cudaGetErrorName(result);
        DPRINTF("Error in creating cuda stream = %s\n", errPtr);
        cleanUp(&resourceList);
        return 1;
    }
    resourceList.stream = stream;

    // Get tensor attributes.
    uint32_t numInputTensors = 0;
    uint32_t numOutputTensors = 0;
    uint32_t numOutputTaskStatistics = 0;
    cudlaModuleAttribute attribute;

    err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_INPUT_TENSORS, &attribute);
    if (err != cudlaSuccess) {
        DPRINTF("Error in getting numInputTensors = %d\n", err);
        cleanUp(&resourceList);
        return 1;
    }
    numInputTensors = attribute.numInputTensors;
    DPRINTF("numInputTensors = %u\n", numInputTensors);

    err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_OUTPUT_TENSORS, &attribute);
    if (err != cudlaSuccess) {
        DPRINTF("Error in getting numOutputTensors = %d\n", err);
        cleanUp(&resourceList);
        return 1;
    }
    numOutputTensors = attribute.numOutputTensors;
    DPRINTF("numOutputTensors = %u\n", numOutputTensors);

    // The task-statistics count is read back through the numOutputTensors
    // member of the same attribute object, so it is cleared first.
    attribute.numOutputTensors = 0;
    err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_OUTPUT_TASK_STATISTICS, &attribute);
    if (err != cudlaSuccess) {
        DPRINTF("Error in getting numOutputTaskStatistics = %d\n", err);
        cleanUp(&resourceList);
        return 1;
    }
    numOutputTaskStatistics = attribute.numOutputTensors;
    DPRINTF("numOutputTaskStatistics = %u\n", numOutputTaskStatistics);

    if (numOutputTaskStatistics == 0) {
        DPRINTF("Layerwise stats is not supported for this Loadable \n");
        cleanUp(&resourceList);
        return 1;
    }

    resourceList.numInputTensors = numInputTensors;
    resourceList.numOutputTensors = numOutputTensors;
    resourceList.numOutputTaskStatistics = numOutputTaskStatistics;

    // Descriptor arrays. Each is handed to resourceList right away so that
    // cleanUp() releases whichever allocations succeeded.
    cudlaModuleTensorDescriptor* inputTensorDesc =
        (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor) * numInputTensors);
    resourceList.inputTensorDesc = inputTensorDesc;
    cudlaModuleTensorDescriptor* outputTensorDesc =
        (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor) * numOutputTensors);
    resourceList.outputTensorDesc = outputTensorDesc;
    cudlaModuleTensorDescriptor* outputTaskStatisticsDesc =
        (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor) * numOutputTaskStatistics);
    resourceList.outputTaskStatisticsDesc = outputTaskStatisticsDesc;

    if ((inputTensorDesc == NULL) || (outputTensorDesc == NULL) || (outputTaskStatisticsDesc == NULL)) {
        DPRINTF("Error in allocating memory for tensor descriptors\n");
        cleanUp(&resourceList);
        return 1;
    }

    attribute.inputTensorDesc = inputTensorDesc;
    err = cudlaModuleGetAttributes(moduleHandle,
                                   CUDLA_INPUT_TENSOR_DESCRIPTORS,
                                   &attribute);
    if (err != cudlaSuccess) {
        DPRINTF("Error in getting input tensor descriptor = %d\n", err);
        cleanUp(&resourceList);
        return 1;
    }
    DPRINTF("Printing input tensor descriptor\n");
    printTensorDesc(inputTensorDesc);

    attribute.outputTensorDesc = outputTensorDesc;
    err = cudlaModuleGetAttributes(moduleHandle,
                                   CUDLA_OUTPUT_TENSOR_DESCRIPTORS,
                                   &attribute);
    if (err != cudlaSuccess) {
        DPRINTF("Error in getting output tensor descriptor = %d\n", err);
        cleanUp(&resourceList);
        return 1;
    }
    DPRINTF("Printing output tensor descriptor\n");
    printTensorDesc(outputTensorDesc);

    attribute.outputTensorDesc = outputTaskStatisticsDesc;
    err = cudlaModuleGetAttributes(moduleHandle,
                                   CUDLA_OUTPUT_TASK_STATISTICS_DESCRIPTORS,
                                   &attribute);
    if (err != cudlaSuccess) {
        DPRINTF("Error in getting task statistics descriptor = %d\n", err);
        cleanUp(&resourceList);
        return 1;
    }
    DPRINTF("Printing output task statistics descriptor size\n");
    for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
        DPRINTF("The size of %u descriptor is %lu\n", ii, outputTaskStatisticsDesc[ii].size);
    }

    // Setup the input and output buffers which will be used as an input to CUDA.
    unsigned char** inputBuffer = (unsigned char**)malloc(sizeof(unsigned char*) * numInputTensors);
    if (inputBuffer == NULL) {
        DPRINTF("Error in allocating memory for input buffer array\n");
        cleanUp(&resourceList);
        return 1;
    }
    memset(inputBuffer, 0x00, sizeof(unsigned char*) * numInputTensors);
    resourceList.inputBuffer = inputBuffer;
    for (uint32_t ii = 0; ii < numInputTensors; ii++) {
        inputBuffer[ii] = (unsigned char*)malloc(inputTensorDesc[ii].size);
        if (inputBuffer[ii] == NULL) {
            DPRINTF("Error in allocating input memory\n");
            cleanUp(&resourceList);
            return 1;
        }
        memset(inputBuffer[ii], 0x01, inputTensorDesc[ii].size);
    }

    unsigned char** outputBuffer = (unsigned char**)malloc(sizeof(unsigned char*) * numOutputTensors);
    if (outputBuffer == NULL) {
        DPRINTF("Error in allocating memory for output buffer array\n");
        cleanUp(&resourceList);
        return 1;
    }
    memset(outputBuffer, 0x00, sizeof(unsigned char*) * numOutputTensors);
    resourceList.outputBuffer = outputBuffer;
    for (uint32_t ii = 0; ii < numOutputTensors; ii++) {
        outputBuffer[ii] = (unsigned char*)malloc(outputTensorDesc[ii].size);
        if (outputBuffer[ii] == NULL) {
            DPRINTF("Error in allocating output memory\n");
            cleanUp(&resourceList);
            return 1;
        }
        memset(outputBuffer[ii], 0x00, outputTensorDesc[ii].size);
    }

    unsigned char** statisticsOutputBuffer =
        (unsigned char**)malloc(sizeof(unsigned char*) * numOutputTaskStatistics);
    if (statisticsOutputBuffer == NULL) {
        DPRINTF("Error in allocating memory for statistics output buffer array\n");
        cleanUp(&resourceList);
        return 1;
    }
    memset(statisticsOutputBuffer, 0x00, sizeof(unsigned char*) * numOutputTaskStatistics);
    resourceList.statisticsOutputBuffer = statisticsOutputBuffer;
    for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
        statisticsOutputBuffer[ii] = (unsigned char*)malloc(outputTaskStatisticsDesc[ii].size);
        // Check the buffer that was just allocated (not outputBuffer[ii]).
        if (statisticsOutputBuffer[ii] == NULL) {
            DPRINTF("Error in allocating statistics output memory\n");
            cleanUp(&resourceList);
            return 1;
        }
        memset(statisticsOutputBuffer[ii], 0x00, outputTaskStatisticsDesc[ii].size);
    }

    // Allocate memory on GPU.
    void** inputBufferGPU = (void**)malloc(sizeof(void*) * numInputTensors);
    if (inputBufferGPU == NULL) {
        DPRINTF("Error in allocating memory for input buffer GPU array\n");
        cleanUp(&resourceList);
        return 1;
    }
    memset(inputBufferGPU, 0x00, sizeof(void*) * numInputTensors);
    resourceList.inputBufferGPU = inputBufferGPU;
    for (uint32_t ii = 0; ii < numInputTensors; ii++) {
        result = cudaMalloc(&(inputBufferGPU[ii]), inputTensorDesc[ii].size);
        if (result != cudaSuccess) {
            DPRINTF("Error in allocating input memory on GPU\n");
            cleanUp(&resourceList);
            return 1;
        }
    }

    void** outputBufferGPU = (void**)malloc(sizeof(void*) * numOutputTensors);
    if (outputBufferGPU == NULL) {
        DPRINTF("Error in allocating memory for output buffer GPU array\n");
        cleanUp(&resourceList);
        return 1;
    }
    memset(outputBufferGPU, 0x00, sizeof(void*) * numOutputTensors);
    resourceList.outputBufferGPU = outputBufferGPU;
    for (uint32_t ii = 0; ii < numOutputTensors; ii++) {
        result = cudaMalloc(&(outputBufferGPU[ii]), outputTensorDesc[ii].size);
        if (result != cudaSuccess) {
            DPRINTF("Error in allocating output memory on GPU\n");
            cleanUp(&resourceList);
            return 1;
        }
    }

    void** outputTaskStatisticsGPU = (void**)malloc(sizeof(void*) * numOutputTaskStatistics);
    if (outputTaskStatisticsGPU == NULL) {
        DPRINTF("Error in allocating memory for output task statistics GPU array\n");
        cleanUp(&resourceList);
        return 1;
    }
    memset(outputTaskStatisticsGPU, 0x00, sizeof(void*) * numOutputTaskStatistics);
    resourceList.outputTaskStatisticsGPU = outputTaskStatisticsGPU;
    for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
        result = cudaMalloc(&(outputTaskStatisticsGPU[ii]), outputTaskStatisticsDesc[ii].size);
        if (result != cudaSuccess) {
            DPRINTF("Error in allocating task statistics memory on GPU\n");
            cleanUp(&resourceList);
            return 1;
        }
    }

    // Arrays that receive the registered pointers. They are recorded in
    // resourceList immediately so cleanUp() frees any that were allocated.
    uint64_t** inputBufferRegisteredPtr = (uint64_t**)malloc(sizeof(uint64_t*) * numInputTensors);
    resourceList.inputBufferRegisteredPtr = inputBufferRegisteredPtr;
    uint64_t** outputBufferRegisteredPtr = (uint64_t**)malloc(sizeof(uint64_t*) * numOutputTensors);
    resourceList.outputBufferRegisteredPtr = outputBufferRegisteredPtr;
    uint64_t** outputTaskStatisticsRegisteredPtr =
        (uint64_t**)malloc(sizeof(uint64_t*) * numOutputTaskStatistics);
    resourceList.outputTaskStatisticsRegisteredPtr = outputTaskStatisticsRegisteredPtr;

    if ((inputBufferRegisteredPtr == NULL) || (outputBufferRegisteredPtr == NULL) ||
        (outputTaskStatisticsRegisteredPtr == NULL)) {
        DPRINTF("Error in allocating memory for registered pointer arrays\n");
        cleanUp(&resourceList);
        return 1;
    }
    memset(inputBufferRegisteredPtr, 0x00, sizeof(uint64_t*) * numInputTensors);
    memset(outputBufferRegisteredPtr, 0x00, sizeof(uint64_t*) * numOutputTensors);
    memset(outputTaskStatisticsRegisteredPtr, 0x00, sizeof(uint64_t*) * numOutputTaskStatistics);

    // Register the CUDA-allocated buffers.
    for (uint32_t ii = 0; ii < numInputTensors; ii++) {
        err = cudlaMemRegister(devHandle,
                               (uint64_t*)(inputBufferGPU[ii]),
                               inputTensorDesc[ii].size,
                               &(inputBufferRegisteredPtr[ii]),
                               0);
        if (err != cudlaSuccess) {
            DPRINTF("Error in registering input memory = %d\n", err);
            cleanUp(&resourceList);
            return 1;
        }
    }

    for (uint32_t ii = 0; ii < numOutputTensors; ii++) {
        err = cudlaMemRegister(devHandle,
                               (uint64_t*)(outputBufferGPU[ii]),
                               outputTensorDesc[ii].size,
                               &(outputBufferRegisteredPtr[ii]),
                               0);
        if (err != cudlaSuccess) {
            DPRINTF("Error in registering output memory = %d\n", err);
            cleanUp(&resourceList);
            return 1;
        }
    }

    for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
        err = cudlaMemRegister(devHandle,
                               (uint64_t*)(outputTaskStatisticsGPU[ii]),
                               outputTaskStatisticsDesc[ii].size,
                               &(outputTaskStatisticsRegisteredPtr[ii]),
                               CUDLA_TASK_STATISTICS);
        if (err != cudlaSuccess) {
            DPRINTF("Error in registering statistics output memory = %d\n", err);
            cleanUp(&resourceList);
            return 1;
        }
    }
    DPRINTF("ALL MEMORY REGISTERED SUCCESSFULLY\n");

    // Copy data from CPU buffers to GPU buffers.
    for (uint32_t ii = 0; ii < numInputTensors; ii++) {
        result = cudaMemcpyAsync(inputBufferGPU[ii], inputBuffer[ii], inputTensorDesc[ii].size,
                                 cudaMemcpyHostToDevice, stream);
        if (result != cudaSuccess) {
            DPRINTF("Error in enqueueing memcpy for input\n");
            cleanUp(&resourceList);
            return 1;
        }
    }

    for (uint32_t ii = 0; ii < numOutputTensors; ii++) {
        result = cudaMemsetAsync(outputBufferGPU[ii], 0, outputTensorDesc[ii].size, stream);
        if (result != cudaSuccess) {
            DPRINTF("Error in enqueueing memset for output\n");
            cleanUp(&resourceList);
            return 1;
        }
    }

    for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
        result = cudaMemsetAsync(outputTaskStatisticsGPU[ii], 0, outputTaskStatisticsDesc[ii].size, stream);
        if (result != cudaSuccess) {
            DPRINTF("Error in enqueueing memset for statistics output\n");
            cleanUp(&resourceList);
            return 1;
        }
    }

    // Combined pointer list: output tensors first, task-statistics buffers
    // after them. Heap-allocated (instead of a variable-length array) and
    // owned by resourceList, which cleanUp() already knows how to free.
    const size_t numCombinedOutputs = (size_t)numOutputTensors + (size_t)numOutputTaskStatistics;
    uint64_t** outputStatisticsBufferRegisteredPtr =
        (uint64_t**)malloc(sizeof(uint64_t*) * numCombinedOutputs);
    if (outputStatisticsBufferRegisteredPtr == NULL) {
        DPRINTF("Error in allocating memory for combined output pointer array\n");
        cleanUp(&resourceList);
        return 1;
    }
    memset(outputStatisticsBufferRegisteredPtr, 0x00, sizeof(uint64_t*) * numCombinedOutputs);
    resourceList.outputStatisticsBufferRegisteredPtr = outputStatisticsBufferRegisteredPtr;

    uint32_t index = 0;
    for (; index < numOutputTensors; index++) {
        outputStatisticsBufferRegisteredPtr[index] = outputBufferRegisteredPtr[index];
    }
    for (uint32_t jj = 0; jj < numOutputTaskStatistics; jj++) {
        outputStatisticsBufferRegisteredPtr[index++] = outputTaskStatisticsRegisteredPtr[jj];
    }

    // Enqueue a cuDLA task.
    cudlaTask task;
    memset(&task, 0x00, sizeof(task));
    task.moduleHandle = moduleHandle;
    task.outputTensor = outputStatisticsBufferRegisteredPtr;
    if (statSupport == 1) {
        // Statistics buffers are only visible to the task when profiling is requested.
        task.numOutputTensors = (numOutputTensors + numOutputTaskStatistics);
        DPRINTF("Layerwise profiling is requested \n");
    } else {
        task.numOutputTensors = numOutputTensors;
        DPRINTF("Layerwise profiling is not requested \n");
    }
    task.numInputTensors = numInputTensors;
    task.inputTensor = inputBufferRegisteredPtr;
    task.waitEvents = NULL;
    task.signalEvents = NULL;

    err = cudlaSubmitTask(devHandle, &task, 1, stream, 0);
    if (err != cudlaSuccess) {
        DPRINTF("no of output tensor %u \n", (task.numOutputTensors));
        DPRINTF("Error in submitting task\n");
        cleanUp(&resourceList);
        return 1;
    }
    DPRINTF("SUBMIT IS DONE !!!\n");

    result = cudaStreamSynchronize(stream);
    if (result != cudaSuccess) {
        DPRINTF("Error in synchronizing stream = %s\n", cudaGetErrorName(result));
        cleanUp(&resourceList);
        return 1;
    }

    // Bring output buffers to the CPU and wait for the copies to finish.
    for (uint32_t ii = 0; ii < numOutputTensors; ii++) {
        result = cudaMemcpyAsync(outputBuffer[ii], outputBufferGPU[ii],
                                 outputTensorDesc[ii].size, cudaMemcpyDeviceToHost, stream);
        if (result != cudaSuccess) {
            DPRINTF("Error in bringing result back to CPU\n");
            cleanUp(&resourceList);
            return 1;
        }
    }

    result = cudaStreamSynchronize(stream);
    if (result != cudaSuccess) {
        DPRINTF("Error in synchronizing stream\n");
        cleanUp(&resourceList);
        return 1;
    }

    if (statSupport == 1) {
        // Copy statistics data to the CPU.
        for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
            result = cudaMemcpyAsync(statisticsOutputBuffer[ii], outputTaskStatisticsGPU[ii],
                                     outputTaskStatisticsDesc[ii].size, cudaMemcpyDeviceToHost, stream);
            if (result != cudaSuccess) {
                DPRINTF("Error in bringing result back to CPU\n");
                cleanUp(&resourceList);
                return 1;
            }
        }

        result = cudaStreamSynchronize(stream);
        if (result != cudaSuccess) {
            DPRINTF("Error in synchronizing stream\n");
            cleanUp(&resourceList);
            return 1;
        }

        const cudlaExternalEtbl* etbl = NULL;
        if (cudlaGetExternalExportTable(&etbl, 0) != cudlaSuccess) {
            DPRINTF("Error in getting export table\n");
            cleanUp(&resourceList);
            return 1;
        }

        void** csv = (void**)malloc(sizeof(void*) * numOutputTaskStatistics);
        if (csv == NULL) {
            DPRINTF("Error in allocating memory for csv stream\n");
            cleanUp(&resourceList);
            return 1;
        }
        memset(csv, 0x00, sizeof(void*) * numOutputTaskStatistics);
        resourceList.csv = csv;

        for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
            cudlaTranslateCsvAttribute csvAttribute;
            uint64_t csvStreamLength = 0;
            memset(&csvAttribute, 0x00, sizeof(csvAttribute));

            // First call: ask how many bytes the CSV stream needs. The status
            // is checked before the reported length is trusted.
            err = etbl->etiTranslateStats(devHandle, statisticsOutputBuffer[ii], dlaFreqInMHz, ii,
                                          CUDLA_GET_CSV_LENGTH, &csvAttribute);
            if (err != cudlaSuccess) {
                DPRINTF("Error in getting csv stream length = %d\n", err);
                cleanUp(&resourceList);
                return 1;
            }

            // Copy the length out before csvStreamStats is written below, as
            // the original code did. NOTE(review): the two members may share
            // storage - confirm against cudlaExternalEtbl.hpp.
            csvStreamLength = csvAttribute.csvStreamLength;
            DPRINTF("size for statistics buffer %u is %llu \n", ii, (unsigned long long)csvStreamLength);

            csv[ii] = (void*)malloc(csvStreamLength);
            if (csv[ii] == NULL) {
                DPRINTF("Error in allocating memory for csv stream\n");
                cleanUp(&resourceList);
                return 1;
            }
            memset(csv[ii], 0x00, csvStreamLength);

            // Second call: fill the buffer with the CSV text.
            csvAttribute.csvStreamStats = csv[ii];
            err = etbl->etiTranslateStats(devHandle, statisticsOutputBuffer[ii], dlaFreqInMHz, ii,
                                          CUDLA_GET_CSV_STATS, &csvAttribute);
            if (err != cudlaSuccess) {
                DPRINTF("Error in translating stats\n");
                cleanUp(&resourceList);
                return 1;
            }

            if (argc == 5) {
                // Bounded formatting: prefix + index + ".csv" must fit in filename.
                int nameLen = snprintf(filename, sizeof(filename), "%s%u%s", argv[4], (ii + 1), suffix);
                if ((nameLen < 0) || ((size_t)nameLen >= sizeof(filename))) {
                    DPRINTF("Output filename for statistics buffer %u does not fit in %u bytes\n",
                            ii, (unsigned)sizeof(filename));
                    cleanUp(&resourceList);
                    return 1;
                }

                fp = fopen(filename, "w+");
                if (fp == NULL) {
                    DPRINTF("Cannot open file %s\n", filename);
                    cleanUp(&resourceList);
                    return 1;
                }

                size_t written = fwrite(csv[ii], sizeof(char), csvStreamLength, fp);
                if (written != csvStreamLength) {
                    DPRINTF("number of elements written to file is %zu \n", written);
                    fclose(fp);
                    cleanUp(&resourceList);
                    return 1;
                }

                // Buffered write errors can surface only when the file is closed.
                if (fclose(fp) != 0) {
                    DPRINTF("Error in closing file %s\n", filename);
                    cleanUp(&resourceList);
                    return 1;
                }
                fp = NULL;
            } else {
                // Precision bounds the print to the stream length in case the
                // buffer is not NUL-terminated.
                DPRINTF("%.*s \n", (int)csvStreamLength, (char*)csv[ii]);
            }
        }
    }

    // Unregister the CUDA-allocated buffers.
    for (uint32_t ii = 0; ii < numInputTensors; ii++) {
        err = cudlaMemUnregister(devHandle,
                                 (inputBufferRegisteredPtr[ii]));
        if (err != cudlaSuccess) {
            DPRINTF("Error in unregistering input memory = %d\n", err);
            cleanUp(&resourceList);
            return 1;
        }
    }

    for (uint32_t ii = 0; ii < numOutputTensors; ii++) {
        err = cudlaMemUnregister(devHandle,
                                 (outputBufferRegisteredPtr[ii]));
        if (err != cudlaSuccess) {
            DPRINTF("Error in unregistering output memory = %d\n", err);
            cleanUp(&resourceList);
            return 1;
        }
    }

    for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
        err = cudlaMemUnregister(devHandle,
                                 (outputTaskStatisticsRegisteredPtr[ii]));
        if (err != cudlaSuccess) {
            DPRINTF("Error in unregistering statistics output memory = %d\n", err);
            cleanUp(&resourceList);
            return 1;
        }
    }
    DPRINTF("ALL MEMORY UNREGISTERED SUCCESSFULLY\n");

    // Tear down stream, module and device explicitly so failures are
    // reported; each handle is cleared so cleanUp() does not release it twice.
    result = cudaStreamDestroy(stream);
    if (result != cudaSuccess) {
        errPtr = cudaGetErrorName(result);
        DPRINTF("Error in destroying cuda stream = %s\n", errPtr);
        cleanUp(&resourceList);
        return 1;
    }
    resourceList.stream = NULL;

    err = cudlaModuleUnload(moduleHandle, 0);
    if (err != cudlaSuccess) {
        DPRINTF("Error in cudlaModuleUnload = %d\n", err);
        cleanUp(&resourceList);
        return 1;
    } else {
        DPRINTF("Successfully unloaded module\n");
    }
    resourceList.moduleHandle = NULL;

    err = cudlaDestroyDevice(devHandle);
    if (err != cudlaSuccess) {
        DPRINTF("Error in cuDLA destroy device = %d\n", err);
        // Release the remaining buffers, as the other failure paths do.
        cleanUp(&resourceList);
        return 1;
    }
    DPRINTF("Device destroyed successfully\n");
    resourceList.devHandle = NULL;

    cleanUp(&resourceList);

    DPRINTF("cuDLALayerwiseStatsHybrid DONE !!!\n");
    return 0;
}