cuda-samples/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/main.cpp

1349 lines
50 KiB
C++
Raw Normal View History

/* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "cudla.h"
#include "nvscierror.h"
#include "nvscibuf.h"
#include "nvscisync.h"
#include "cudlaExternalEtbl.hpp"
#include <unistd.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <sys/stat.h>
#include <fstream>
#include <sstream>
#define MAX_FILENAME_LEN 200
#define RESERVED_SUFFIX_LEN 10
#define DPRINTF(...) printf(__VA_ARGS__)
/*
 * Prints the fields of a cuDLA module tensor descriptor through DPRINTF:
 * name, size, the n/c/h/w dimensions, the format/type/category/pixel
 * fields and the first four stride entries.
 *
 * tensorDesc: descriptor to print. A NULL pointer is reported and ignored
 *             instead of being dereferenced.
 */
static void printTensorDesc(cudlaModuleTensorDescriptor* tensorDesc) {
    if (tensorDesc == NULL) {
        DPRINTF("\tTENSOR DESCRIPTOR : (null)\n");
        return;
    }

    DPRINTF("\tTENSOR NAME : %s\n", tensorDesc->name);

    // Size and dimensions are widened explicitly so the conversion specifier
    // matches the argument on every data model, not only where
    // `unsigned long` happens to be 64 bits wide.
    // NOTE(review): assumes these fields are unsigned integers of at most
    // 64 bits - confirm against cudla.h.
    DPRINTF("\tsize: %llu\n", (unsigned long long)tensorDesc->size);
    DPRINTF("\tdims: [%llu, %llu, %llu, %llu]\n",
            (unsigned long long)tensorDesc->n,
            (unsigned long long)tensorDesc->c,
            (unsigned long long)tensorDesc->h,
            (unsigned long long)tensorDesc->w);

    DPRINTF("\tdata fmt: %d\n", tensorDesc->dataFormat);
    DPRINTF("\tdata type: %d\n", tensorDesc->dataType);
    DPRINTF("\tdata category: %d\n", tensorDesc->dataCategory);
    DPRINTF("\tpixel fmt: %d\n", tensorDesc->pixelFormat);
    DPRINTF("\tpixel mapping: %d\n", tensorDesc->pixelMapping);

    // Strides are printed as unsigned values; only the first four entries
    // are reported, as before.
    // NOTE(review): assumes stride[] holds unsigned 32-bit values - confirm
    // against cudla.h.
    DPRINTF("\tstride[0]: %u\n", (unsigned int)tensorDesc->stride[0]);
    DPRINTF("\tstride[1]: %u\n", (unsigned int)tensorDesc->stride[1]);
    DPRINTF("\tstride[2]: %u\n", (unsigned int)tensorDesc->stride[2]);
    DPRINTF("\tstride[3]: %u\n", (unsigned int)tensorDesc->stride[3]);
}
// Aggregates every handle and heap allocation owned by the sample so that
// cleanUp() can release them from any exit path. main() zero-fills the
// structure with memset before use, and cleanUp() treats a NULL member as
// "nothing to release".
typedef struct {
// cuDLA device handle; destroyed with cudlaDestroyDevice() in cleanUp().
cudlaDevHandle devHandle;
// Loaded cuDLA module; unloaded with cudlaModuleUnload() in cleanUp().
cudlaModule moduleHandle;
// Heap buffer holding the loadable file contents read in main().
unsigned char* loadableData;
// Element counts of the per-tensor arrays below; cleanUp() uses them as loop
// bounds and resets them to 0 when it finishes.
uint32_t numInputTensors;
uint32_t numOutputTensors;
uint32_t numOutputTaskStatistics;
// Arrays of malloc'ed host buffers, one entry per tensor. cleanUp() frees
// each entry and then the array.
unsigned char** inputBuffer;
unsigned char** outputBuffer;
unsigned char** statisticsOutputBuffer;
// Heap arrays of tensor descriptors, filled through cudlaModuleGetAttributes()
// in main().
cudlaModuleTensorDescriptor* inputTensorDesc;
cudlaModuleTensorDescriptor* outputTensorDesc;
cudlaModuleTensorDescriptor* outputTaskStatisticsDesc;
// Arrays of NvSciBuf objects, one per tensor; cleanUp() releases each entry
// with NvSciBufObjFree().
NvSciBufObj* inputBufObj;
NvSciBufObj* outputBufObj;
NvSciBufObj* statisticsBufObj;
// NvSciBuf module handle; closed with NvSciBufModuleClose() in cleanUp().
NvSciBufModule bufModule;
// Per-tensor NvSciBuf attribute lists (unreconciled, reconciled and conflict)
// for inputs and outputs. cleanUp() frees each entry with
// NvSciBufAttrListFree() and then the array.
NvSciBufAttrList* inputAttrList;
NvSciBufAttrList* reconciledInputAttrList;
NvSciBufAttrList* inputConflictList;
NvSciBufAttrList* outputAttrList;
NvSciBufAttrList* reconciledOutputAttrList;
NvSciBufAttrList* outputConflictList;
// NvSciSync state: sync object, module, CPU wait context and the attribute
// lists, each released by its matching NvSciSync*Free/Close call in cleanUp().
NvSciSyncObj syncObj;
NvSciSyncModule syncModule;
NvSciSyncCpuWaitContext nvSciCtx;
NvSciSyncAttrList waiterAttrListObj;
NvSciSyncAttrList signalerAttrListObj;
NvSciSyncAttrList nvSciSyncConflictListObj;
NvSciSyncAttrList nvSciSyncReconciledListObj;
// Same three kinds of per-tensor attribute lists, for the task statistics
// outputs.
NvSciBufAttrList* statisticsOutputAttrList;
NvSciBufAttrList* reconciledStatisticsOutputAttrList;
NvSciBufAttrList* statisticsOutputConflictList;
// Arrays of pointers filled by cudlaImportExternalMemory() in main().
// cleanUp() frees only the arrays, not the pointers stored in them.
uint64_t** inputBufObjRegPtr;
uint64_t** outputBufObjRegPtr;
uint64_t** statisticsBufObjRegPtr;
// Heap array released with free() in cleanUp().
// NOTE(review): where it is populated is not visible in this part of the file.
uint64_t** devPtrs;
// Heap-allocated signal-event descriptor; cleanUp() frees its eofFences
// member before freeing the structure itself.
cudlaSignalEvents* signalEvents;
// Fence cleared with NvSciSyncFenceClear() in cleanUp().
NvSciSyncFence eofFence;
// Array of numOutputTaskStatistics heap pointers; cleanUp() frees each entry
// and then the array.
// NOTE(review): presumably per-statistics CSV data - confirm against the
// code that fills it.
void **csv;
} ResourceList;
void cleanUp(ResourceList* resourceList);
void cleanUp(ResourceList* resourceList) {
uint32_t ii = 0;
if (resourceList->inputTensorDesc != NULL) {
free(resourceList->inputTensorDesc);
resourceList->inputTensorDesc = NULL;
}
if (resourceList->outputTensorDesc != NULL) {
free(resourceList->outputTensorDesc);
resourceList->outputTensorDesc = NULL;
}
if (resourceList->outputTaskStatisticsDesc != NULL) {
free(resourceList->outputTaskStatisticsDesc);
resourceList->outputTaskStatisticsDesc = NULL;
}
if (resourceList->loadableData != NULL) {
free(resourceList->loadableData);
resourceList->loadableData = NULL;
}
if (resourceList->moduleHandle != NULL) {
cudlaModuleUnload(resourceList->moduleHandle, 0);
resourceList->moduleHandle = NULL;
}
if (resourceList->devHandle != NULL) {
cudlaDestroyDevice(resourceList->devHandle);
resourceList->devHandle = NULL;
}
if (resourceList->inputBufObj != NULL) {
for (ii = 0; ii < resourceList->numInputTensors; ii++) {
if((resourceList->inputBufObj)[ii] != NULL) {
NvSciBufObjFree((resourceList->inputBufObj)[ii]);
(resourceList->inputBufObj)[ii] = NULL;
}
}
}
if (resourceList->outputBufObj != NULL) {
for (ii = 0; ii < resourceList->numOutputTensors; ii++) {
if((resourceList->outputBufObj)[ii] != NULL) {
NvSciBufObjFree((resourceList->outputBufObj)[ii]);
(resourceList->outputBufObj)[ii] = NULL;
}
}
}
if (resourceList->statisticsBufObj != NULL) {
for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) {
if((resourceList->statisticsBufObj)[ii] != NULL) {
NvSciBufObjFree((resourceList->statisticsBufObj)[ii]);
(resourceList->statisticsBufObj)[ii] = NULL;
}
}
}
if (resourceList->inputBuffer != NULL) {
for (ii = 0; ii < resourceList->numInputTensors; ii++) {
if ((resourceList->inputBuffer)[ii] != NULL) {
free((resourceList->inputBuffer)[ii]);
(resourceList->inputBuffer)[ii] = NULL;
}
}
free(resourceList->inputBuffer);
resourceList->inputBuffer = NULL;
}
if (resourceList->outputBuffer != NULL) {
for (ii = 0; ii < resourceList->numOutputTensors; ii++) {
if ((resourceList->outputBuffer)[ii] != NULL) {
free((resourceList->outputBuffer)[ii]);
(resourceList->outputBuffer)[ii] = NULL;
}
}
free(resourceList->outputBuffer);
resourceList->outputBuffer = NULL;
}
if (resourceList->statisticsOutputBuffer != NULL) {
for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) {
if ((resourceList->statisticsOutputBuffer)[ii] != NULL) {
free((resourceList->statisticsOutputBuffer)[ii]);
(resourceList->statisticsOutputBuffer)[ii] = NULL;
}
}
free(resourceList->statisticsOutputBuffer);
resourceList->statisticsOutputBuffer = NULL;
}
if (resourceList->csv != NULL) {
for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) {
if ((resourceList->csv)[ii] != NULL) {
free((resourceList->csv)[ii]);
(resourceList->csv)[ii] = NULL;
}
}
free(resourceList->csv);
resourceList->csv = NULL;
}
if (resourceList->reconciledInputAttrList != NULL) {
for (ii = 0; ii < resourceList->numInputTensors; ii++) {
if((resourceList->reconciledInputAttrList)[ii] != NULL) {
NvSciBufAttrListFree((resourceList->reconciledInputAttrList)[ii]);
(resourceList->reconciledInputAttrList)[ii] = NULL;
}
}
free(resourceList->reconciledInputAttrList);
resourceList->reconciledInputAttrList = NULL;
}
if (resourceList->inputConflictList != NULL) {
for (ii = 0; ii < resourceList->numInputTensors; ii++) {
if((resourceList->inputConflictList)[ii] != NULL) {
NvSciBufAttrListFree((resourceList->inputConflictList)[ii]);
(resourceList->inputConflictList)[ii] = NULL;
}
}
free(resourceList->inputConflictList);
resourceList->inputConflictList = NULL;
}
if (resourceList->inputAttrList != NULL) {
for (ii = 0; ii < resourceList->numInputTensors; ii++) {
if((resourceList->inputAttrList)[ii] != NULL) {
NvSciBufAttrListFree((resourceList->inputAttrList)[ii]);
(resourceList->inputAttrList)[ii] = NULL;
}
}
free(resourceList->inputAttrList);
resourceList->inputAttrList = NULL;
}
if (resourceList->reconciledOutputAttrList != NULL) {
for (ii = 0; ii < resourceList->numOutputTensors; ii++) {
if((resourceList->reconciledOutputAttrList)[ii] != NULL) {
NvSciBufAttrListFree((resourceList->reconciledOutputAttrList)[ii]);
(resourceList->reconciledOutputAttrList)[ii] = NULL;
}
}
free(resourceList->reconciledOutputAttrList);
resourceList->reconciledOutputAttrList = NULL;
}
if (resourceList->outputConflictList != NULL) {
for (ii = 0; ii < resourceList->numOutputTensors; ii++) {
if((resourceList->outputConflictList)[ii] != NULL) {
NvSciBufAttrListFree((resourceList->outputConflictList)[ii]);
(resourceList->outputConflictList)[ii] = NULL;
}
}
free(resourceList->outputConflictList);
resourceList->outputConflictList = NULL;
}
if (resourceList->outputAttrList != NULL) {
for (ii = 0; ii < resourceList->numOutputTensors; ii++) {
if((resourceList->outputAttrList)[ii] != NULL) {
NvSciBufAttrListFree((resourceList->outputAttrList)[ii]);
(resourceList->outputAttrList)[ii] = NULL;
}
}
free(resourceList->outputAttrList);
resourceList->outputAttrList = NULL;
}
if (resourceList->reconciledStatisticsOutputAttrList != NULL) {
for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) {
if((resourceList->reconciledStatisticsOutputAttrList)[ii] != NULL) {
NvSciBufAttrListFree((resourceList->reconciledStatisticsOutputAttrList)[ii]);
(resourceList->reconciledStatisticsOutputAttrList)[ii] = NULL;
}
}
free(resourceList->reconciledStatisticsOutputAttrList);
resourceList->reconciledStatisticsOutputAttrList = NULL;
}
if (resourceList->statisticsOutputConflictList != NULL) {
for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) {
if((resourceList->statisticsOutputConflictList)[ii] != NULL) {
NvSciBufAttrListFree((resourceList->statisticsOutputConflictList)[ii]);
(resourceList->statisticsOutputConflictList)[ii] = NULL;
}
}
free(resourceList->statisticsOutputConflictList);
resourceList->statisticsOutputConflictList = NULL;
}
if (resourceList->statisticsOutputAttrList != NULL) {
for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) {
if((resourceList->statisticsOutputAttrList)[ii] != NULL) {
NvSciBufAttrListFree((resourceList->statisticsOutputAttrList)[ii]);
(resourceList->statisticsOutputAttrList)[ii] = NULL;
}
}
free(resourceList->statisticsOutputAttrList);
resourceList->statisticsOutputAttrList = NULL;
}
if (resourceList->outputBufObjRegPtr != NULL) {
free(resourceList->outputBufObjRegPtr);
resourceList->outputBufObjRegPtr = NULL;
}
if (resourceList->statisticsBufObjRegPtr != NULL) {
free(resourceList->statisticsBufObjRegPtr);
resourceList->statisticsBufObjRegPtr = NULL;
}
if (resourceList->inputBufObjRegPtr != NULL) {
free(resourceList->inputBufObjRegPtr);
resourceList->inputBufObjRegPtr = NULL;
}
if (resourceList->bufModule != NULL) {
NvSciBufModuleClose(resourceList->bufModule);
resourceList->bufModule = NULL;
}
NvSciSyncFenceClear(&(resourceList->eofFence));
if (resourceList->syncObj != NULL) {
NvSciSyncObjFree(resourceList->syncObj);
resourceList->syncObj = NULL;
}
if (resourceList->nvSciSyncConflictListObj != NULL) {
NvSciSyncAttrListFree(resourceList->nvSciSyncConflictListObj);
resourceList->nvSciSyncConflictListObj = NULL;
}
if (resourceList->nvSciSyncReconciledListObj != NULL) {
NvSciSyncAttrListFree(resourceList->nvSciSyncReconciledListObj);
resourceList->nvSciSyncReconciledListObj = NULL;
}
if (resourceList->signalerAttrListObj != NULL) {
NvSciSyncAttrListFree(resourceList->signalerAttrListObj);
resourceList->signalerAttrListObj = NULL;
}
if (resourceList->waiterAttrListObj != NULL) {
NvSciSyncAttrListFree(resourceList->waiterAttrListObj);
resourceList->waiterAttrListObj = NULL;
}
if (resourceList->nvSciCtx != NULL) {
NvSciSyncCpuWaitContextFree(resourceList->nvSciCtx);
resourceList->nvSciCtx = NULL;
}
if (resourceList->syncModule != NULL) {
NvSciSyncModuleClose(resourceList->syncModule);
resourceList->syncModule = NULL;
}
if (resourceList->signalEvents != NULL) {
if (resourceList->signalEvents->eofFences != NULL) {
free(resourceList->signalEvents->eofFences);
resourceList->signalEvents->eofFences = NULL;
}
free(resourceList->signalEvents);
resourceList->signalEvents = NULL;
}
if (resourceList->devPtrs != NULL) {
free(resourceList->devPtrs);
resourceList->devPtrs = NULL;
}
resourceList->numInputTensors = 0;
resourceList->numOutputTensors = 0;
resourceList->numOutputTaskStatistics = 0;
}
cudlaStatus createAndSetAttrList(NvSciBufModule module,
                                 uint64_t bufSize,
                                 NvSciBufAttrList *attrList);

/*
 * Creates an NvSciBuf attribute list on `module` and fills it with the
 * attributes used for every buffer in this sample: a one-dimensional Int8
 * tensor of `bufSize` elements with per-dimension alignment 1, base address
 * alignment 512, read/write permission and CPU access.
 *
 * module:   NvSciBuf module the list is created on.
 * bufSize:  size of the single tensor dimension.
 * attrList: receives the created attribute list.
 *
 * Returns cudlaSuccess, or cudlaErrorNvSci if creating the list or setting
 * its attributes fails. When setting the attributes fails the created list
 * is left in *attrList and is not freed here.
 */
cudlaStatus createAndSetAttrList(NvSciBufModule module,
                                 uint64_t bufSize,
                                 NvSciBufAttrList *attrList) {
    if (NvSciBufAttrListCreate(module, attrList) != NvSciError_Success) {
        DPRINTF("Error in creating NvSciBuf attribute list\n");
        return cudlaErrorNvSci;
    }

    // TODO: Refactor into multiple dimensions
    NvSciBufType bufType = NvSciBufType_Tensor;
    uint32_t elementType = NvSciDataType_Int8;
    uint32_t numDims = 1;
    uint64_t sizePerDim[] = {bufSize};
    uint32_t alignPerDim[] = {1};
    uint64_t baseAlign = 512;
    NvSciBufAttrValAccessPerm accessPerm = NvSciBufAccessPerm_ReadWrite;
    bool cpuAccess = true;

    // The values above must stay alive until NvSciBufAttrListSetAttrs()
    // returns, since the pairs only store their addresses.
    NvSciBufAttrKeyValuePair attrs[] = {
        {.key = NvSciBufGeneralAttrKey_Types,
         .value = &bufType,
         .len = sizeof(bufType)},
        {.key = NvSciBufTensorAttrKey_DataType,
         .value = &elementType,
         .len = sizeof(elementType)},
        {.key = NvSciBufTensorAttrKey_NumDims,
         .value = &numDims,
         .len = sizeof(numDims)},
        {.key = NvSciBufTensorAttrKey_SizePerDim,
         .value = &sizePerDim,
         .len = sizeof(sizePerDim)},
        {.key = NvSciBufTensorAttrKey_AlignmentPerDim,
         .value = &alignPerDim,
         .len = sizeof(alignPerDim)},
        {.key = NvSciBufTensorAttrKey_BaseAddrAlign,
         .value = &baseAlign,
         .len = sizeof(baseAlign)},
        {.key = NvSciBufGeneralAttrKey_RequiredPerm,
         .value = &accessPerm,
         .len = sizeof(accessPerm)},
        {.key = NvSciBufGeneralAttrKey_NeedCpuAccess,
         .value = &cpuAccess,
         .len = sizeof(cpuAccess)}};
    const size_t attrCount = sizeof(attrs) / sizeof(attrs[0]);

    if (NvSciBufAttrListSetAttrs(*attrList, attrs, attrCount) != NvSciError_Success) {
        DPRINTF("Error in setting NvSciBuf attribute list\n");
        return cudlaErrorNvSci;
    }

    return cudlaSuccess;
}
NvSciError fillCpuWaiterAttrList(NvSciSyncAttrList list);

/*
 * Sets the two attributes a CPU waiter needs on an NvSciSync attribute
 * list: CPU access enabled and wait-only permission.
 *
 * list: attribute list to fill.
 *
 * Returns the status reported by NvSciSyncAttrListSetAttrs().
 */
NvSciError fillCpuWaiterAttrList(NvSciSyncAttrList list) {
    bool needCpuAccess = true;
    NvSciSyncAccessPerm waitPerm = NvSciSyncAccessPerm_WaitOnly;

    // Zero the pairs first so any field not assigned below is cleared.
    NvSciSyncAttrKeyValuePair attrs[2];
    memset(attrs, 0, sizeof(attrs));

    NvSciSyncAttrKeyValuePair* cpuAccessAttr = &attrs[0];
    cpuAccessAttr->attrKey = NvSciSyncAttrKey_NeedCpuAccess;
    cpuAccessAttr->value = (void*) &needCpuAccess;
    cpuAccessAttr->len = sizeof(needCpuAccess);

    NvSciSyncAttrKeyValuePair* permAttr = &attrs[1];
    permAttr->attrKey = NvSciSyncAttrKey_RequiredPerm;
    permAttr->value = (void*) &waitPerm;
    permAttr->len = sizeof(waitPerm);

    return NvSciSyncAttrListSetAttrs(list, attrs, sizeof(attrs) / sizeof(attrs[0]));
}
int main(int argc, char** argv) {
cudlaDevHandle devHandle;
cudlaModule moduleHandle;
cudlaStatus err;
uint32_t statSupport = 0;
uint32_t dlaFreqInMHz = 0;
FILE* fp = NULL;
struct stat st;
size_t file_size;
size_t actually_read = 0;
unsigned char *loadableData = NULL;
char filename[MAX_FILENAME_LEN];
const char* suffix = ".csv";
ResourceList resourceList;
memset(&resourceList, 0x00, sizeof(ResourceList));
if ((argc != 4) && (argc != 5)) {
DPRINTF("Usage : ./test_cudla_layerwise_stats_L0_standalone_test1 <loadable> <freqMHZ> <statSupport> <filenamePrefix>\n");
return 1;
}
if (argc == 5) {
if((strlen(argv[4])) > (MAX_FILENAME_LEN - RESERVED_SUFFIX_LEN)) {
DPRINTF("Filename prefix length is too big, greater than maximum permissible prefix length of %u \n",(MAX_FILENAME_LEN - RESERVED_SUFFIX_LEN));
return 1;
}
}
// Read loadable into buffer.
fp = fopen(argv[1], "rb");
if (fp == NULL) {
DPRINTF("Cannot open file %s\n", argv[1]);
return 1;
}
if (stat(argv[1], &st) != 0) {
DPRINTF("Cannot stat file\n");
return 1;
}
file_size = st.st_size;
DPRINTF("The file size = %ld\n", file_size);
dlaFreqInMHz = atoi(argv[2]);
statSupport = atoi(argv[3]);
loadableData = (unsigned char *)malloc(file_size);
if (loadableData == NULL) {
DPRINTF("Cannot Allocate memory for loadable\n");
return 1;
}
actually_read = fread(loadableData, 1, file_size, fp);
if ( actually_read != file_size ) {
free(loadableData);
DPRINTF("Read wrong size\n");
return 1;
}
fclose(fp);
resourceList.loadableData = loadableData;
err = cudlaCreateDevice(0, &devHandle, CUDLA_STANDALONE);
if (err != cudlaSuccess) {
DPRINTF("Error in cuDLA create device = %d\n", err);
cleanUp(&resourceList);
return 1;
}
DPRINTF("Device created successfully\n");
resourceList.devHandle = devHandle;
err = cudlaModuleLoadFromMemory(devHandle, loadableData, file_size, &moduleHandle, 0);
if (err != cudlaSuccess) {
DPRINTF("Error in cudlaModuleLoadFromMemory = %d\n", err);
cleanUp(&resourceList);
return 1;
} else {
DPRINTF("Successfully loaded module\n");
}
resourceList.moduleHandle = moduleHandle;
// Get tensor attributes.
uint32_t numInputTensors = 0;
uint32_t numOutputTensors = 0;
uint32_t numOutputTaskStatistics = 0;
cudlaModuleAttribute attribute;
err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_INPUT_TENSORS, &attribute);
if (err != cudlaSuccess) {
DPRINTF("Error in getting numInputTensors = %d\n", err);
cleanUp(&resourceList);
return 1;
}
numInputTensors = attribute.numInputTensors;
DPRINTF("numInputTensors = %d\n", numInputTensors);
err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_OUTPUT_TENSORS, &attribute);
if (err != cudlaSuccess) {
DPRINTF("Error in getting numOutputTensors = %d\n", err);
cleanUp(&resourceList);
return 1;
}
numOutputTensors = attribute.numOutputTensors;
DPRINTF("numOutputTensors = %d\n", numOutputTensors);
// using the same attributes to get num_output_task_statistics_tensors
attribute.numOutputTensors = 0;
err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_OUTPUT_TASK_STATISTICS, &attribute);
if (err != cudlaSuccess) {
DPRINTF("Error in getting numOutputTensors = %d\n", err);
cleanUp(&resourceList);
return 1;
}
numOutputTaskStatistics = attribute.numOutputTensors;
DPRINTF("numOutputTaskStatistics = %d\n", numOutputTaskStatistics);
if(numOutputTaskStatistics == 0) {
DPRINTF("Layerwise stats is not supported for this Loadable \n");
cleanUp(&resourceList);
return 1;
}
resourceList.numInputTensors = numInputTensors;
resourceList.numOutputTensors = numOutputTensors;
resourceList.numOutputTaskStatistics = numOutputTaskStatistics;
cudlaModuleTensorDescriptor* inputTensorDesc =
(cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor)*numInputTensors);
cudlaModuleTensorDescriptor* outputTensorDesc =
(cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor)*numOutputTensors);
if ((inputTensorDesc == NULL) || (outputTensorDesc == NULL)) {
if (inputTensorDesc != NULL)
{
free(inputTensorDesc);
inputTensorDesc = NULL;
}
if (outputTensorDesc != NULL)
{
free(outputTensorDesc);
outputTensorDesc = NULL;
}
cleanUp(&resourceList);
return 1;
}
resourceList.inputTensorDesc = inputTensorDesc;
resourceList.outputTensorDesc = outputTensorDesc;
cudlaModuleTensorDescriptor* outputTaskStatisticsDesc =
(cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor)*numOutputTaskStatistics);
if (outputTaskStatisticsDesc == NULL) {
free(outputTaskStatisticsDesc);
outputTaskStatisticsDesc = NULL;
cleanUp(&resourceList);
return 1;
}
resourceList.outputTaskStatisticsDesc = outputTaskStatisticsDesc;
attribute.inputTensorDesc = inputTensorDesc;
err = cudlaModuleGetAttributes(moduleHandle,
CUDLA_INPUT_TENSOR_DESCRIPTORS,
&attribute);
if (err != cudlaSuccess) {
DPRINTF("Error in getting input tensor descriptor = %d\n", err);
cleanUp(&resourceList);
return 1;
}
DPRINTF("Printing input tensor descriptor\n");
printTensorDesc(inputTensorDesc);
attribute.outputTensorDesc = outputTensorDesc;
err = cudlaModuleGetAttributes(moduleHandle,
CUDLA_OUTPUT_TENSOR_DESCRIPTORS,
&attribute);
if (err != cudlaSuccess) {
DPRINTF("Error in getting output tensor descriptor = %d\n", err);
cleanUp(&resourceList);
return 1;
}
DPRINTF("Printing output tensor descriptor\n");
printTensorDesc(outputTensorDesc);
attribute.outputTensorDesc = outputTaskStatisticsDesc;
err = cudlaModuleGetAttributes(moduleHandle,
CUDLA_OUTPUT_TASK_STATISTICS_DESCRIPTORS,
&attribute);
if (err != cudlaSuccess) {
DPRINTF("Error in getting task statistics descriptor = %d\n", err);
cleanUp(&resourceList);
return 1;
}
DPRINTF("Printing output task statistics descriptor size\n");
for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
DPRINTF("The size of %u descriptor is %lu\n", ii,outputTaskStatisticsDesc[ii].size);
}
// Setup the input and output buffers.
unsigned char** inputBuffer = (unsigned char **)malloc(sizeof(unsigned char *)*numInputTensors);
if (inputBuffer == NULL) {
DPRINTF("Error in allocating memory for input buffer array\n");
cleanUp(&resourceList);
return 1;
}
memset(inputBuffer, 0x00, sizeof(unsigned char *)*numInputTensors);
resourceList.inputBuffer = inputBuffer;
for (uint32_t ii = 0; ii < numInputTensors; ii++) {
inputBuffer[ii] = (unsigned char* )malloc(inputTensorDesc[ii].size);
if (inputBuffer[ii] == NULL) {
DPRINTF("Error in allocating input memory\n");
cleanUp(&resourceList);
return 1;
}
memset(inputBuffer[ii], 0x01, inputTensorDesc[ii].size);
}
unsigned char** outputBuffer = (unsigned char **)malloc(sizeof(unsigned char *)*numOutputTensors);
if (outputBuffer == NULL) {
DPRINTF("Error in allocating memory for output buffer array\n");
cleanUp(&resourceList);
return 1;
}
memset(outputBuffer, 0x00, sizeof(unsigned char *)*numOutputTensors);
resourceList.outputBuffer = outputBuffer;
for (uint32_t ii = 0; ii < numOutputTensors; ii++) {
outputBuffer[ii] = (unsigned char* )malloc(outputTensorDesc[ii].size);
if (outputBuffer[ii] == NULL) {
DPRINTF("Error in allocating output memory\n");
cleanUp(&resourceList);
return 1;
}
memset(outputBuffer[ii], 0x00, outputTensorDesc[ii].size);
}
unsigned char** statisticsOutputBuffer = (unsigned char **)malloc(sizeof(unsigned char *)*numOutputTaskStatistics);
if (statisticsOutputBuffer == NULL) {
DPRINTF("Error in allocating memory for output buffer array\n");
cleanUp(&resourceList);
return 1;
}
memset(statisticsOutputBuffer, 0x00, sizeof(unsigned char *)*numOutputTaskStatistics);
resourceList.statisticsOutputBuffer = statisticsOutputBuffer;
for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
statisticsOutputBuffer[ii] = (unsigned char* )malloc(outputTaskStatisticsDesc[ii].size);
if (outputBuffer[ii] == NULL) {
DPRINTF("Error in allocating output memory\n");
cleanUp(&resourceList);
return 1;
}
memset(statisticsOutputBuffer[ii], 0x00, outputTaskStatisticsDesc[ii].size);
}
NvSciBufModule bufModule = NULL;
NvSciBufAttrList *inputAttrList = {NULL};
NvSciBufAttrList *outputAttrList = {NULL};
NvSciBufAttrList *statisticsOutputAttrList = {NULL};
NvSciBufAttrList *reconciledInputAttrList = {NULL};
NvSciBufAttrList *reconciledOutputAttrList = {NULL};
NvSciBufAttrList *reconciledStatisticsOutputAttrList = {NULL};
NvSciBufAttrList *inputConflictList = {NULL};
NvSciBufAttrList *outputConflictList = {NULL};
NvSciBufAttrList *statisticsOutputConflictList = {NULL};
NvSciError sciError = NvSciError_Success;
sciError = NvSciBufModuleOpen(&bufModule);
if (sciError != NvSciError_Success) {
DPRINTF("Error in initializing NvSciBufModule\n");
cleanUp(&resourceList);
return 1;
}
resourceList.bufModule = bufModule;
// creating and setting input attribute list
inputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numInputTensors);
if (inputAttrList == NULL) {
DPRINTF("Error in allocating memory for input buffer array\n");
cleanUp(&resourceList);
return 1;
}
memset(inputAttrList, 0x00, sizeof(NvSciBufAttrList)*numInputTensors);
resourceList.inputAttrList = inputAttrList;
reconciledInputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numInputTensors);
if (reconciledInputAttrList == NULL) {
DPRINTF("Error in allocating memory for input buffer array\n");
cleanUp(&resourceList);
return 1;
}
memset(reconciledInputAttrList, 0x00, sizeof(NvSciBufAttrList)*numInputTensors);
resourceList.reconciledInputAttrList = reconciledInputAttrList;
inputConflictList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numInputTensors);
if (inputConflictList == NULL) {
DPRINTF("Error in allocating memory for input buffer array\n");
cleanUp(&resourceList);
return 1;
}
memset(inputConflictList, 0x00, sizeof(NvSciBufAttrList)*numInputTensors);
resourceList.inputConflictList = inputConflictList;
for (uint32_t ii = 0; ii < numInputTensors; ii++) {
err = createAndSetAttrList(bufModule,
inputTensorDesc[ii].size,
&inputAttrList[ii]);
if (err != cudlaSuccess) {
DPRINTF("Error in creating NvSciBuf attribute list for input attribute\n");
cleanUp(&resourceList);
return 1;
}
sciError = NvSciBufAttrListReconcile(&inputAttrList[ii],
1,
&reconciledInputAttrList[ii],
&inputConflictList[ii]);
if (sciError != NvSciError_Success) {
DPRINTF("Error in reconciling NvSciBuf attribute list for input attribute\n");
cleanUp(&resourceList);
return 1;
}
}
outputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTensors);
if (outputAttrList == NULL) {
DPRINTF("Error in allocating memory for input buffer array\n");
cleanUp(&resourceList);
return 1;
}
memset(outputAttrList, 0x00, sizeof(NvSciBufAttrList)*numOutputTensors);
resourceList.outputAttrList = outputAttrList;
reconciledOutputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTensors);
if (reconciledOutputAttrList == NULL) {
DPRINTF("Error in allocating memory for input buffer array\n");
cleanUp(&resourceList);
return 1;
}
memset(reconciledOutputAttrList, 0x00, sizeof(NvSciBufAttrList)*numOutputTensors);
resourceList.reconciledOutputAttrList = reconciledOutputAttrList;
outputConflictList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTensors);
if (outputConflictList == NULL) {
DPRINTF("Error in allocating memory for input buffer array\n");
cleanUp(&resourceList);
return 1;
}
memset(outputConflictList, 0x00, sizeof(NvSciBufAttrList)*numOutputTensors);
resourceList.outputConflictList = outputConflictList;
// creating and setting output attribute list
for (uint32_t ii = 0; ii < numOutputTensors; ii++) {
err = createAndSetAttrList(bufModule,
outputTensorDesc[ii].size,
&outputAttrList[ii]);
if (err != cudlaSuccess) {
DPRINTF("Error in creating NvSciBuf attribute list for output attibute\n");
cleanUp(&resourceList);
return 1;
}
sciError = NvSciBufAttrListReconcile(&outputAttrList[ii],
1,
&reconciledOutputAttrList[ii],
&outputConflictList[ii]);
if (sciError != NvSciError_Success) {
DPRINTF("Error in reconciling NvSciBuf attribute list for output attribute\n");
cleanUp(&resourceList);
return 1;
}
}
statisticsOutputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTaskStatistics);
if (statisticsOutputAttrList == NULL) {
DPRINTF("Error in allocating memory for input buffer array\n");
cleanUp(&resourceList);
return 1;
}
memset(statisticsOutputAttrList, 0x00, sizeof(NvSciBufAttrList)*numOutputTaskStatistics);
resourceList.statisticsOutputAttrList = statisticsOutputAttrList;
reconciledStatisticsOutputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTaskStatistics);
if (reconciledStatisticsOutputAttrList == NULL) {
DPRINTF("Error in allocating memory for input buffer array\n");
cleanUp(&resourceList);
return 1;
}
memset(reconciledStatisticsOutputAttrList, 0x00, sizeof(NvSciBufAttrList)*numOutputTaskStatistics);
resourceList.reconciledStatisticsOutputAttrList = reconciledStatisticsOutputAttrList;
statisticsOutputConflictList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTaskStatistics);
if (statisticsOutputConflictList == NULL) {
DPRINTF("Error in allocating memory for input buffer array\n");
cleanUp(&resourceList);
return 1;
}
memset(statisticsOutputConflictList, 0x00, sizeof(NvSciBufAttrList)*numOutputTaskStatistics);
resourceList.statisticsOutputConflictList = statisticsOutputConflictList;
// creating and setting statistics output attribute list
for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
err = createAndSetAttrList(bufModule,
outputTaskStatisticsDesc[ii].size,
&statisticsOutputAttrList[ii]);
if (err != cudlaSuccess) {
DPRINTF("Error in creating NvSciBuf attribute list\n");
cleanUp(&resourceList);
return 1;
}
sciError = NvSciBufAttrListReconcile(&statisticsOutputAttrList[ii],
1,
&reconciledStatisticsOutputAttrList[ii],
&statisticsOutputConflictList[ii]);
if (sciError != NvSciError_Success) {
DPRINTF("Error in reconciling NvSciBuf attribute list\n");
cleanUp(&resourceList);
return 1;
}
}
NvSciBufObj *inputBufObj = (NvSciBufObj *)malloc(sizeof(NvSciBufObj)*numInputTensors);
NvSciBufObj *outputBufObj = (NvSciBufObj *)malloc(sizeof(NvSciBufObj)*numOutputTensors);
NvSciBufObj *statisticsBufObj = (NvSciBufObj *)malloc(sizeof(NvSciBufObj)*numOutputTaskStatistics);
resourceList.inputBufObj = inputBufObj;
resourceList.outputBufObj = outputBufObj;
resourceList.statisticsBufObj = statisticsBufObj;
for (uint32_t ii = 0; ii < numInputTensors; ii++) {
sciError = NvSciBufObjAlloc(reconciledInputAttrList[ii], &inputBufObj[ii]);
if (sciError != NvSciError_Success) {
DPRINTF("Error in allocating NvSciBuf object\n");
cleanUp(&resourceList);
return 1;
}
}
for (uint32_t ii = 0; ii < numOutputTensors; ii++) {
sciError = NvSciBufObjAlloc(reconciledOutputAttrList[ii], &outputBufObj[ii]);
if (sciError != NvSciError_Success) {
DPRINTF("Error in allocating NvSciBuf object\n");
cleanUp(&resourceList);
return 1;
}
}
for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
sciError = NvSciBufObjAlloc(reconciledStatisticsOutputAttrList[ii], &statisticsBufObj[ii]);
if (sciError != NvSciError_Success) {
DPRINTF("Error in allocating NvSciBuf object\n");
cleanUp(&resourceList);
return 1;
}
}
uint64_t** inputBufObjRegPtr = (uint64_t **)malloc(sizeof(uint64_t*)*numInputTensors);
uint64_t** outputBufObjRegPtr = (uint64_t **)malloc(sizeof(uint64_t*)*numOutputTensors);
uint64_t** statisticsBufObjRegPtr = (uint64_t **)malloc(sizeof(uint64_t*)*numOutputTaskStatistics);
if ((inputBufObjRegPtr == NULL) || (outputBufObjRegPtr == NULL) || (statisticsBufObjRegPtr == NULL)) {
if (inputBufObjRegPtr != NULL) {
free(inputBufObjRegPtr);
inputBufObjRegPtr = NULL;
}
if (outputBufObjRegPtr != NULL) {
free(outputBufObjRegPtr);
outputBufObjRegPtr = NULL;
}
if (statisticsBufObjRegPtr != NULL) {
free(statisticsBufObjRegPtr);
statisticsBufObjRegPtr = NULL;
}
cleanUp(&resourceList);
return 1;
}
resourceList.inputBufObjRegPtr = inputBufObjRegPtr;
resourceList.outputBufObjRegPtr = outputBufObjRegPtr;
resourceList.statisticsBufObjRegPtr = statisticsBufObjRegPtr;
void **inputBufObjBuffer = (void **)malloc(sizeof(void*)*numInputTensors);
void **outputBufObjBuffer = (void **)malloc(sizeof(void*)*numOutputTensors);
void **statisticsBufObjBuffer = (void **)malloc(sizeof(void*)*numOutputTaskStatistics);
cudlaExternalMemoryHandleDesc memDesc = { 0 };
// importing external memory
for (uint32_t ii = 0; ii < numInputTensors; ii++) {
memset(&memDesc, 0, sizeof(memDesc));
memDesc.extBufObject = (void *)inputBufObj[ii];
memDesc.size = inputTensorDesc[ii].size;
err = cudlaImportExternalMemory(devHandle, &memDesc, &inputBufObjRegPtr[ii], 0);
if (err != cudlaSuccess) {
DPRINTF("Error in importing external memory = %d\n", err);
cleanUp(&resourceList);
return 1;
}
sciError = NvSciBufObjGetCpuPtr(inputBufObj[ii], &inputBufObjBuffer[ii]);
if (sciError != NvSciError_Success) {
DPRINTF("Error in getting NvSciBuf CPU pointer\n");
cleanUp(&resourceList);
return 1;
}
memcpy(inputBufObjBuffer[ii], inputBuffer[ii], inputTensorDesc[ii].size);
}
for (uint32_t ii = 0; ii < numOutputTensors; ii++) {
memset(&memDesc, 0, sizeof(memDesc));
memDesc.extBufObject = (void *)outputBufObj[ii];
memDesc.size = outputTensorDesc[ii].size;
err = cudlaImportExternalMemory(devHandle, &memDesc, &outputBufObjRegPtr[ii], 0);
if (err != cudlaSuccess) {
DPRINTF("Error in importing external memory = %d\n", err);
cleanUp(&resourceList);
return 1;
}
sciError = NvSciBufObjGetCpuPtr(outputBufObj[ii], &outputBufObjBuffer[ii]);
if (sciError != NvSciError_Success) {
DPRINTF("Error in getting NvSciBuf CPU pointer\n");
cleanUp(&resourceList);
return 1;
}
memset(outputBufObjBuffer[ii], 0, outputTensorDesc[ii].size);
}
for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
memset(&memDesc, 0, sizeof(memDesc));
memDesc.extBufObject = (void *)statisticsBufObj[ii];
memDesc.size = outputTaskStatisticsDesc[ii].size;
err = cudlaImportExternalMemory(devHandle, &memDesc, &statisticsBufObjRegPtr[ii], CUDLA_TASK_STATISTICS);
if (err != cudlaSuccess) {
DPRINTF("Error in importing external memory = %d\n", err);
cleanUp(&resourceList);
return 1;
}
sciError = NvSciBufObjGetCpuPtr(statisticsBufObj[ii], &statisticsBufObjBuffer[ii]);
if (sciError != NvSciError_Success) {
DPRINTF("Error in getting NvSciBuf CPU pointer\n");
cleanUp(&resourceList);
return 1;
}
memset(statisticsBufObjBuffer[ii], 0, outputTaskStatisticsDesc[ii].size);
}
uint64_t *outputStatisticsBufferRegisteredPtr[numOutputTensors + numOutputTaskStatistics] = {0} ;
uint32_t index = 0;
for (; index < numOutputTensors ; index++) {
outputStatisticsBufferRegisteredPtr[index] = ((outputBufObjRegPtr[index]));
}
for (uint32_t jj=0; jj < numOutputTaskStatistics ; jj++) {
outputStatisticsBufferRegisteredPtr[index++] = ((statisticsBufObjRegPtr[jj]));
}
NvSciSyncObj syncObj;
NvSciSyncModule syncModule;
NvSciSyncAttrList syncAttrListObj[2];
NvSciSyncCpuWaitContext nvSciCtx;
NvSciSyncAttrList waiterAttrListObj = NULL;
NvSciSyncAttrList signalerAttrListObj = NULL;
NvSciSyncAttrList nvSciSyncConflictListObj;
NvSciSyncAttrList nvSciSyncReconciledListObj;
sciError = NvSciSyncModuleOpen(&syncModule);
if (sciError != NvSciError_Success) {
DPRINTF("Error in initializing NvSciSyncModuleOpen\n");
cleanUp(&resourceList);
return 1;
}
resourceList.syncModule = syncModule;
sciError = NvSciSyncCpuWaitContextAlloc(syncModule, &nvSciCtx);
if (sciError != NvSciError_Success) {
DPRINTF("Error in allocating cpu wait context NvSciSyncCpuWaitContextAlloc\n");
cleanUp(&resourceList);
return 1;
}
resourceList.nvSciCtx = nvSciCtx;
sciError = NvSciSyncAttrListCreate(syncModule, &signalerAttrListObj);
if (sciError != NvSciError_Success) {
DPRINTF("Error in creating NvSciSync attribute list\n");
cleanUp(&resourceList);
return 1;
}
resourceList.signalerAttrListObj = signalerAttrListObj;
sciError = NvSciSyncAttrListCreate(syncModule, &waiterAttrListObj);
if (sciError != NvSciError_Success) {
DPRINTF("Error in creating NvSciSync attribute list\n");
cleanUp(&resourceList);
return 1;
}
resourceList.waiterAttrListObj = waiterAttrListObj;
err = cudlaGetNvSciSyncAttributes(reinterpret_cast<uint64_t*>(signalerAttrListObj),
CUDLA_NVSCISYNC_ATTR_SIGNAL);
if (err != cudlaSuccess) {
DPRINTF("Error in getting cuDLA's NvSciSync attributes\n");
cleanUp(&resourceList);
return 1;
}
sciError = fillCpuWaiterAttrList(waiterAttrListObj);
if (sciError != NvSciError_Success) {
DPRINTF("Error in setting NvSciSync attribute list\n");
cleanUp(&resourceList);
return 1;
}
syncAttrListObj[0] = signalerAttrListObj;
syncAttrListObj[1] = waiterAttrListObj;
sciError = NvSciSyncAttrListReconcile(syncAttrListObj,
2,
&nvSciSyncReconciledListObj,
&nvSciSyncConflictListObj);
if (sciError != NvSciError_Success) {
DPRINTF("Error in reconciling NvSciSync's attribute lists\n");
cleanUp(&resourceList);
return 1;
}
resourceList.nvSciSyncConflictListObj = nvSciSyncConflictListObj;
resourceList.nvSciSyncReconciledListObj = nvSciSyncReconciledListObj;
sciError = NvSciSyncObjAlloc(nvSciSyncReconciledListObj, &syncObj);
if (sciError != NvSciError_Success) {
DPRINTF("Error in allocating NvSciSync object\n");
cleanUp(&resourceList);
return 1;
}
resourceList.syncObj = syncObj;
// importing external semaphore
uint64_t* nvSciSyncObjRegPtr = NULL;
cudlaExternalSemaphoreHandleDesc semaMemDesc = { 0 };
memset(&semaMemDesc, 0, sizeof(semaMemDesc));
semaMemDesc.extSyncObject = syncObj;
err = cudlaImportExternalSemaphore(devHandle,
&semaMemDesc,
&nvSciSyncObjRegPtr,
0);
if (err != cudlaSuccess) {
DPRINTF("Error in importing external semaphore = %d\n", err);
cleanUp(&resourceList);
return 1;
}
DPRINTF("ALL MEMORY REGISTERED SUCCESSFULLY\n");
// Signal Events
cudlaSignalEvents* signalEvents;
signalEvents = (cudlaSignalEvents *)malloc(sizeof(cudlaSignalEvents));
if (signalEvents == NULL) {
DPRINTF("Error in allocating signal events\n");
cleanUp(&resourceList);
return 1;
}
signalEvents->numEvents = 1;
uint64_t** devPtrs = (uint64_t **)malloc(signalEvents->numEvents *
sizeof(uint64_t *));
if (devPtrs == NULL) {
DPRINTF("Error in allocating output pointer's array of registered objects\n");
cleanUp(&resourceList);
return 1;
}
devPtrs[0] = nvSciSyncObjRegPtr;
signalEvents->devPtrs = devPtrs;
resourceList.devPtrs = devPtrs;
signalEvents->eofFences = (CudlaFence *)malloc(signalEvents->numEvents *
sizeof(CudlaFence));
if (signalEvents->eofFences == NULL) {
DPRINTF("Error in allocating eofFence array\n");
cleanUp(&resourceList);
return 1;
}
NvSciSyncFence eofFence = NvSciSyncFenceInitializer;
signalEvents->eofFences[0].fence = &eofFence;
signalEvents->eofFences[0].type = CUDLA_NVSCISYNC_FENCE;
resourceList.signalEvents = signalEvents;
resourceList.eofFence = eofFence;
// Enqueue a cuDLA task.
cudlaTask task;
task.moduleHandle = moduleHandle;
task.outputTensor = (uint64_t * const*)&outputStatisticsBufferRegisteredPtr;
if(statSupport == 1) {
task.numOutputTensors = (numOutputTensors + numOutputTaskStatistics);
DPRINTF("Layerwise profiling is requested \n");
} else {
task.numOutputTensors = numOutputTensors;
DPRINTF("Layerwise profiling is not requested \n");
}
task.numInputTensors = numInputTensors;
task.inputTensor = inputBufObjRegPtr;
task.waitEvents = NULL;
task.signalEvents = signalEvents;
err = cudlaSubmitTask(devHandle, &task, 1, NULL, 0);
if (err != cudlaSuccess) {
DPRINTF("Error in submitting task\n");
cleanUp(&resourceList);
return 1;
}
DPRINTF("SUBMIT IS DONE !!!\n");
// Wait for operations to finish and bring output buffer to CPU.
sciError = NvSciSyncFenceWait(reinterpret_cast<NvSciSyncFence*>(signalEvents->eofFences[0].fence),
nvSciCtx, -1);
if (sciError != NvSciError_Success) {
DPRINTF("Error in waiting on NvSciSyncFence\n");
cleanUp(&resourceList);
return 1;
}
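// Copy the output tensors (and, when layerwise statistics are requested, the statistics buffers) out of the NvSciBuf CPU mappings.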
for (uint32_t ii = 0; ii < numOutputTensors; ii++) {
memcpy(outputBuffer[ii], outputBufObjBuffer[ii], outputTensorDesc[ii].size);
}
if(statSupport == 1) {
for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
memcpy(statisticsOutputBuffer[ii], statisticsBufObjBuffer[ii], outputTaskStatisticsDesc[ii].size);
}
const cudlaExternalEtbl* etbl = NULL;
if (cudlaGetExternalExportTable(&etbl,0) != cudlaSuccess) {
DPRINTF("Error in getting export table\n");
cleanUp(&resourceList);
return 1;
}
void** csv = (void **)malloc(sizeof(void *)*numOutputTaskStatistics);
if (csv == NULL) {
DPRINTF("Error in allocating memory for csv stream\n");
cleanUp(&resourceList);
return 1;
}
memset(csv, 0x00, sizeof(void *)*numOutputTaskStatistics);
resourceList.csv = csv;
for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
cudlaTranslateCsvAttribute csvAttribute;
uint64_t csvStreamLength = 0;
err = etbl->etiTranslateStats(devHandle,statisticsOutputBuffer[ii],dlaFreqInMHz,ii,CUDLA_GET_CSV_LENGTH,&csvAttribute);
csv[ii] = (void* )malloc(csvAttribute.csvStreamLength);
csvStreamLength = csvAttribute.csvStreamLength;
DPRINTF("size for statistics buffer %u is %lu \n",ii,csvStreamLength);
if (csv[ii] == NULL) {
DPRINTF("Error in allocating memory for csv stream\n");
cleanUp(&resourceList);
return 1;
}
memset(csv[ii], 0x00, csvAttribute.csvStreamLength);
csvAttribute.csvStreamStats = csv[ii];
err = etbl->etiTranslateStats(devHandle,statisticsOutputBuffer[ii],dlaFreqInMHz,ii,CUDLA_GET_CSV_STATS,&csvAttribute);
if (err != cudlaSuccess) {
DPRINTF("Error in translating stats\n");
cleanUp(&resourceList);
return 1;
}
if (argc == 5) {
sprintf(filename,"%s%u%s", argv[4],(ii+1),suffix);
fp = fopen(filename, "w+");
if (fp == NULL) {
DPRINTF("Cannot open file %s\n", filename);
cleanUp(&resourceList);
return 1;
}
uint32_t ret_val = fwrite(csv[ii],sizeof(char),csvStreamLength,fp);
if(ret_val != csvStreamLength) {
DPRINTF("number of elements written to file is %u \n", ret_val);
cleanUp(&resourceList);
return 1;
}
fclose(fp);
} else {
DPRINTF("%s \n",(char *)csv[ii]);
}
}
}
// Unregister the NvSciBuf-allocated buffers that were imported into cuDLA.
for (uint32_t ii = 0; ii < numInputTensors; ii++) {
err = cudlaMemUnregister(devHandle,
(inputBufObjRegPtr[ii]));
if (err != cudlaSuccess) {
DPRINTF("Error in registering input memory = %d\n", err);
cleanUp(&resourceList);
return 1;
}
}
for (uint32_t ii = 0; ii < numOutputTensors; ii++) {
err = cudlaMemUnregister(devHandle,
(outputBufObjRegPtr[ii]));
if (err != cudlaSuccess) {
DPRINTF("Error in registering output memory = %d\n", err);
cleanUp(&resourceList);
return 1;
}
}
for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
err = cudlaMemUnregister(devHandle,
(statisticsBufObjRegPtr[ii]));
if (err != cudlaSuccess) {
DPRINTF("Error in registering output memory = %d\n", err);
cleanUp(&resourceList);
return 1;
}
}
err = cudlaMemUnregister(devHandle, nvSciSyncObjRegPtr);
if (err != cudlaSuccess) {
DPRINTF("Error in unregistering external semaphore = %d\n", err);
cleanUp(&resourceList);
return 1;
}
DPRINTF("ALL MEMORY UNREGISTERED SUCCESSFULLY\n");
err = cudlaModuleUnload(moduleHandle, 0);
if (err != cudlaSuccess) {
DPRINTF("Error in cudlaModuleUnload = %d\n", err);
cleanUp(&resourceList);
return 1;
} else {
DPRINTF("Successfully unloaded module\n");
}
resourceList.moduleHandle = NULL;
err = cudlaDestroyDevice(devHandle);
if (err != cudlaSuccess) {
DPRINTF("Error in cuDLA destroy device = %d\n", err);
return 1;
}
DPRINTF("Device destroyed successfully\n");
resourceList.devHandle = NULL;
cleanUp(&resourceList);
DPRINTF("cuDLALayerwiseStatsStandalone DONE !!!\n");
return 0;
}