// Mirror of https://github.com/NVIDIA/cuda-samples.git
// (synced 2024-11-28 13:49:16 +08:00; 602 lines, 22 KiB, C++)
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
|
|
|
|
#include "cudaNvSci.h"
|
|
#include <cuda.h>
|
|
#include <condition_variable>
|
|
#include <iostream>
|
|
#include <thread>
|
|
|
|
// Host-side hand-off between the two worker threads defined in this file:
// thread_rotateAndSignal() sets workSubmitted and notifies m_condVar while
// holding m_mutex; thread_waitAndGrayscale() blocks on m_condVar until the
// flag is true.
std::mutex m_mutex;
std::condition_variable m_condVar;
bool workSubmitted{false};
|
|
|
|
class cudaNvSciSignal {
|
|
private:
|
|
NvSciSyncModule m_syncModule;
|
|
NvSciBufModule m_bufModule;
|
|
|
|
NvSciSyncAttrList m_syncAttrList;
|
|
NvSciSyncFence *m_fence;
|
|
|
|
NvSciBufAttrList m_rawBufAttrList;
|
|
NvSciBufAttrList m_imageBufAttrList;
|
|
NvSciBufAttrList m_buffAttrListOut[2];
|
|
NvSciBufAttrKeyValuePair pairArrayOut[10];
|
|
|
|
cudaExternalMemory_t extMemRawBuf, extMemImageBuf;
|
|
cudaMipmappedArray_t d_mipmapArray;
|
|
cudaArray_t d_mipLevelArray;
|
|
cudaTextureObject_t texObject;
|
|
cudaExternalSemaphore_t signalSem;
|
|
|
|
cudaStream_t streamToRun;
|
|
int m_cudaDeviceId;
|
|
CUuuid m_devUUID;
|
|
uint64_t m_imageWidth;
|
|
uint64_t m_imageHeight;
|
|
void *d_outputBuf;
|
|
size_t m_bufSize;
|
|
|
|
public:
|
|
cudaNvSciSignal(NvSciBufModule bufModule, NvSciSyncModule syncModule,
|
|
int cudaDeviceId, int bufSize, uint64_t imageWidth,
|
|
uint64_t imageHeight, NvSciSyncFence *fence)
|
|
: m_syncModule(syncModule),
|
|
m_bufModule(bufModule),
|
|
m_cudaDeviceId(cudaDeviceId),
|
|
d_outputBuf(NULL),
|
|
m_bufSize(bufSize),
|
|
m_imageWidth(imageWidth),
|
|
m_imageHeight(imageHeight),
|
|
m_fence(fence) {
|
|
initCuda();
|
|
|
|
checkNvSciErrors(NvSciSyncAttrListCreate(m_syncModule, &m_syncAttrList));
|
|
checkNvSciErrors(NvSciBufAttrListCreate(m_bufModule, &m_rawBufAttrList));
|
|
checkNvSciErrors(NvSciBufAttrListCreate(m_bufModule, &m_imageBufAttrList));
|
|
|
|
setRawBufAttrList(m_bufSize);
|
|
setImageBufAttrList(m_imageWidth, m_imageHeight);
|
|
|
|
checkCudaErrors(cudaDeviceGetNvSciSyncAttributes(
|
|
m_syncAttrList, m_cudaDeviceId, cudaNvSciSyncAttrSignal));
|
|
}
|
|
|
|
~cudaNvSciSignal() {
|
|
checkCudaErrors(cudaSetDevice(m_cudaDeviceId));
|
|
checkCudaErrors(cudaFreeMipmappedArray(d_mipmapArray));
|
|
checkCudaErrors(cudaFree(d_outputBuf));
|
|
checkCudaErrors(cudaDestroyExternalSemaphore(signalSem));
|
|
checkCudaErrors(cudaDestroyExternalMemory(extMemRawBuf));
|
|
checkCudaErrors(cudaDestroyExternalMemory(extMemImageBuf));
|
|
checkCudaErrors(cudaDestroyTextureObject(texObject));
|
|
checkCudaErrors(cudaStreamDestroy(streamToRun));
|
|
}
|
|
|
|
void initCuda() {
|
|
checkCudaErrors(cudaSetDevice(m_cudaDeviceId));
|
|
checkCudaErrors(
|
|
cudaStreamCreateWithFlags(&streamToRun, cudaStreamNonBlocking));
|
|
|
|
int major = 0, minor = 0;
|
|
checkCudaErrors(cudaDeviceGetAttribute(
|
|
&major, cudaDevAttrComputeCapabilityMajor, m_cudaDeviceId));
|
|
checkCudaErrors(cudaDeviceGetAttribute(
|
|
&minor, cudaDevAttrComputeCapabilityMinor, m_cudaDeviceId));
|
|
printf(
|
|
"[cudaNvSciSignal] GPU Device %d: \"%s\" with compute capability "
|
|
"%d.%d\n\n",
|
|
m_cudaDeviceId, _ConvertSMVer2ArchName(major, minor), major, minor);
|
|
|
|
CUresult res = cuDeviceGetUuid(&m_devUUID, m_cudaDeviceId);
|
|
if (res != CUDA_SUCCESS) {
|
|
fprintf(stderr, "Driver API error = %04d \n", res);
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
}
|
|
|
|
void setRawBufAttrList(uint64_t size) {
|
|
NvSciBufType bufType = NvSciBufType_RawBuffer;
|
|
bool cpuAccess = false;
|
|
NvSciBufAttrValAccessPerm perm = NvSciBufAccessPerm_ReadWrite;
|
|
NvSciBufAttrKeyValuePair rawBufAttrs[] = {
|
|
{NvSciBufRawBufferAttrKey_Size, &size, sizeof(size)},
|
|
{NvSciBufGeneralAttrKey_Types, &bufType, sizeof(bufType)},
|
|
{NvSciBufGeneralAttrKey_NeedCpuAccess, &cpuAccess, sizeof(cpuAccess)},
|
|
{NvSciBufGeneralAttrKey_RequiredPerm, &perm, sizeof(perm)},
|
|
{NvSciBufGeneralAttrKey_GpuId, &m_devUUID, sizeof(m_devUUID)},
|
|
};
|
|
|
|
checkNvSciErrors(NvSciBufAttrListSetAttrs(
|
|
m_rawBufAttrList, rawBufAttrs,
|
|
sizeof(rawBufAttrs) / sizeof(NvSciBufAttrKeyValuePair)));
|
|
}
|
|
|
|
void setImageBufAttrList(uint32_t width, uint32_t height) {
|
|
NvSciBufType bufType = NvSciBufType_Image;
|
|
NvSciBufAttrValImageLayoutType layout = NvSciBufImage_BlockLinearType;
|
|
NvSciBufAttrValAccessPerm perm = NvSciBufAccessPerm_ReadWrite;
|
|
|
|
uint32_t planeCount = 1;
|
|
uint32_t planeWidths[] = {width};
|
|
uint32_t planeHeights[] = {height};
|
|
uint64_t lrpad = 0, tbpad = 100;
|
|
|
|
bool cpuAccessFlag = false;
|
|
|
|
NvSciBufAttrValColorFmt planecolorfmts[] = {NvSciColor_B8G8R8A8};
|
|
NvSciBufAttrValColorStd planecolorstds[] = {NvSciColorStd_SRGB};
|
|
NvSciBufAttrValImageScanType planescantype[] = {NvSciBufScan_InterlaceType};
|
|
|
|
NvSciBufAttrKeyValuePair imgBufAttrs[] = {
|
|
{NvSciBufGeneralAttrKey_Types, &bufType, sizeof(bufType)},
|
|
{NvSciBufImageAttrKey_PlaneCount, &planeCount, sizeof(planeCount)},
|
|
{NvSciBufImageAttrKey_Layout, &layout, sizeof(layout)},
|
|
{NvSciBufImageAttrKey_TopPadding, &tbpad, sizeof(tbpad)},
|
|
{NvSciBufImageAttrKey_BottomPadding, &tbpad, sizeof(tbpad)},
|
|
{NvSciBufImageAttrKey_LeftPadding, &lrpad, sizeof(lrpad)},
|
|
{NvSciBufImageAttrKey_RightPadding, &lrpad, sizeof(lrpad)},
|
|
{NvSciBufImageAttrKey_PlaneColorFormat, planecolorfmts,
|
|
sizeof(planecolorfmts)},
|
|
{NvSciBufImageAttrKey_PlaneColorStd, planecolorstds,
|
|
sizeof(planecolorstds)},
|
|
{NvSciBufImageAttrKey_PlaneWidth, planeWidths, sizeof(planeWidths)},
|
|
{NvSciBufImageAttrKey_PlaneHeight, planeHeights, sizeof(planeHeights)},
|
|
{NvSciBufGeneralAttrKey_NeedCpuAccess, &cpuAccessFlag,
|
|
sizeof(cpuAccessFlag)},
|
|
{NvSciBufGeneralAttrKey_RequiredPerm, &perm, sizeof(perm)},
|
|
{NvSciBufImageAttrKey_PlaneScanType, planescantype,
|
|
sizeof(planescantype)},
|
|
{NvSciBufGeneralAttrKey_GpuId, &m_devUUID, sizeof(m_devUUID)},
|
|
};
|
|
|
|
checkNvSciErrors(NvSciBufAttrListSetAttrs(
|
|
m_imageBufAttrList, imgBufAttrs,
|
|
sizeof(imgBufAttrs) / sizeof(NvSciBufAttrKeyValuePair)));
|
|
}
|
|
|
|
NvSciSyncAttrList getNvSciSyncAttrList() { return m_syncAttrList; }
|
|
|
|
NvSciBufAttrList getNvSciRawBufAttrList() { return m_rawBufAttrList; }
|
|
|
|
NvSciBufAttrList getNvSciImageBufAttrList() { return m_imageBufAttrList; }
|
|
|
|
void runRotateImageAndSignal(unsigned char *imageData) {
|
|
int numOfGPUs = 0;
|
|
checkCudaErrors(cudaGetDeviceCount(&numOfGPUs)); // For cuda init purpose
|
|
checkCudaErrors(cudaSetDevice(m_cudaDeviceId));
|
|
|
|
copyDataToImageArray(imageData);
|
|
createTexture();
|
|
|
|
float angle = 0.5f; // angle to rotate image by (in radians)
|
|
rotateKernel(texObject, angle, (unsigned int *)d_outputBuf, m_imageWidth,
|
|
m_imageHeight, streamToRun);
|
|
|
|
signalExternalSemaphore();
|
|
}
|
|
|
|
void cudaImportNvSciSemaphore(NvSciSyncObj syncObj) {
|
|
checkCudaErrors(cudaSetDevice(m_cudaDeviceId));
|
|
|
|
cudaExternalSemaphoreHandleDesc extSemDesc;
|
|
memset(&extSemDesc, 0, sizeof(extSemDesc));
|
|
extSemDesc.type = cudaExternalSemaphoreHandleTypeNvSciSync;
|
|
extSemDesc.handle.nvSciSyncObj = (void *)syncObj;
|
|
|
|
checkCudaErrors(cudaImportExternalSemaphore(&signalSem, &extSemDesc));
|
|
}
|
|
|
|
void signalExternalSemaphore() {
|
|
cudaExternalSemaphoreSignalParams signalParams;
|
|
memset(&signalParams, 0, sizeof(signalParams));
|
|
// For cross-process signaler-waiter applications need to use NvSciIpc
|
|
// and NvSciSync[Export|Import] utilities to share the NvSciSyncFence
|
|
// across process. This step is optional in single-process.
|
|
signalParams.params.nvSciSync.fence = (void *)m_fence;
|
|
signalParams.flags = 0;
|
|
|
|
checkCudaErrors(cudaSignalExternalSemaphoresAsync(&signalSem, &signalParams,
|
|
1, streamToRun));
|
|
}
|
|
|
|
void cudaImportNvSciRawBuf(NvSciBufObj inputBufObj) {
|
|
checkCudaErrors(cudaSetDevice(m_cudaDeviceId));
|
|
checkNvSciErrors(
|
|
NvSciBufObjGetAttrList(inputBufObj, &m_buffAttrListOut[0]));
|
|
|
|
memset(pairArrayOut, 0, sizeof(NvSciBufAttrKeyValuePair) * 10);
|
|
pairArrayOut[0].key = NvSciBufRawBufferAttrKey_Size;
|
|
|
|
checkNvSciErrors(
|
|
NvSciBufAttrListGetAttrs(m_buffAttrListOut[0], pairArrayOut, 1));
|
|
|
|
uint64_t size = *(uint64_t *)pairArrayOut[0].value;
|
|
|
|
cudaExternalMemoryHandleDesc memHandleDesc;
|
|
memset(&memHandleDesc, 0, sizeof(memHandleDesc));
|
|
memHandleDesc.type = cudaExternalMemoryHandleTypeNvSciBuf;
|
|
memHandleDesc.handle.nvSciBufObject = inputBufObj;
|
|
memHandleDesc.size = size;
|
|
checkCudaErrors(cudaImportExternalMemory(&extMemRawBuf, &memHandleDesc));
|
|
|
|
cudaExternalMemoryBufferDesc bufferDesc;
|
|
memset(&bufferDesc, 0, sizeof(bufferDesc));
|
|
bufferDesc.offset = 0;
|
|
bufferDesc.size = size;
|
|
m_bufSize = size;
|
|
checkCudaErrors(cudaExternalMemoryGetMappedBuffer(
|
|
&d_outputBuf, extMemRawBuf, &bufferDesc));
|
|
}
|
|
|
|
void cudaImportNvSciImage(NvSciBufObj inputBufObj) {
|
|
checkCudaErrors(cudaSetDevice(m_cudaDeviceId));
|
|
checkNvSciErrors(
|
|
NvSciBufObjGetAttrList(inputBufObj, &m_buffAttrListOut[1]));
|
|
|
|
memset(pairArrayOut, 0, sizeof(NvSciBufAttrKeyValuePair) * 10);
|
|
pairArrayOut[0].key = NvSciBufImageAttrKey_Size;
|
|
pairArrayOut[1].key = NvSciBufImageAttrKey_Alignment;
|
|
pairArrayOut[2].key = NvSciBufImageAttrKey_PlaneCount;
|
|
pairArrayOut[3].key = NvSciBufImageAttrKey_PlaneWidth;
|
|
pairArrayOut[4].key = NvSciBufImageAttrKey_PlaneHeight;
|
|
|
|
checkNvSciErrors(
|
|
NvSciBufAttrListGetAttrs(m_buffAttrListOut[1], pairArrayOut, 5));
|
|
|
|
uint64_t size = *(uint64_t *)pairArrayOut[0].value;
|
|
uint64_t alignment = *(uint64_t *)pairArrayOut[1].value;
|
|
uint64_t planeCount = *(uint64_t *)pairArrayOut[2].value;
|
|
uint64_t imageWidth = *(uint64_t *)pairArrayOut[3].value;
|
|
uint64_t imageHeight = *(uint64_t *)pairArrayOut[4].value;
|
|
|
|
cudaExternalMemoryHandleDesc memHandleDesc;
|
|
memset(&memHandleDesc, 0, sizeof(memHandleDesc));
|
|
memHandleDesc.type = cudaExternalMemoryHandleTypeNvSciBuf;
|
|
memHandleDesc.handle.nvSciBufObject = inputBufObj;
|
|
memHandleDesc.size = size;
|
|
checkCudaErrors(cudaImportExternalMemory(&extMemImageBuf, &memHandleDesc));
|
|
|
|
cudaExtent extent = {};
|
|
memset(&extent, 0, sizeof(extent));
|
|
extent.width = imageWidth;
|
|
extent.height = imageHeight;
|
|
extent.depth = 0;
|
|
|
|
cudaChannelFormatDesc desc;
|
|
desc.x = 8;
|
|
desc.y = 8;
|
|
desc.z = 8;
|
|
desc.w = 8;
|
|
desc.f = cudaChannelFormatKindUnsigned;
|
|
|
|
cudaExternalMemoryMipmappedArrayDesc mipmapDesc = {0};
|
|
mipmapDesc.offset = 0;
|
|
mipmapDesc.formatDesc = desc;
|
|
mipmapDesc.extent = extent;
|
|
mipmapDesc.flags = 0;
|
|
|
|
mipmapDesc.numLevels = 1;
|
|
checkCudaErrors(cudaExternalMemoryGetMappedMipmappedArray(
|
|
&d_mipmapArray, extMemImageBuf, &mipmapDesc));
|
|
}
|
|
|
|
void copyDataToImageArray(unsigned char *imageData) {
|
|
uint32_t mipLevelId = 0;
|
|
checkCudaErrors(cudaGetMipmappedArrayLevel(&d_mipLevelArray, d_mipmapArray,
|
|
mipLevelId));
|
|
|
|
checkCudaErrors(cudaMemcpy2DToArrayAsync(
|
|
d_mipLevelArray, 0, 0, imageData, m_imageWidth * sizeof(unsigned int),
|
|
m_imageWidth * sizeof(unsigned int), m_imageHeight,
|
|
cudaMemcpyHostToDevice, streamToRun));
|
|
}
|
|
|
|
void createTexture() {
|
|
cudaResourceDesc texRes;
|
|
memset(&texRes, 0, sizeof(cudaResourceDesc));
|
|
|
|
texRes.resType = cudaResourceTypeArray;
|
|
texRes.res.array.array = d_mipLevelArray;
|
|
|
|
cudaTextureDesc texDescr;
|
|
memset(&texDescr, 0, sizeof(cudaTextureDesc));
|
|
|
|
texDescr.normalizedCoords = true;
|
|
texDescr.filterMode = cudaFilterModeLinear;
|
|
texDescr.addressMode[0] = cudaAddressModeWrap;
|
|
texDescr.addressMode[1] = cudaAddressModeWrap;
|
|
texDescr.readMode = cudaReadModeNormalizedFloat;
|
|
|
|
checkCudaErrors(
|
|
cudaCreateTextureObject(&texObject, &texRes, &texDescr, NULL));
|
|
}
|
|
};
|
|
|
|
class cudaNvSciWait {
|
|
private:
|
|
NvSciSyncModule m_syncModule;
|
|
NvSciBufModule m_bufModule;
|
|
|
|
NvSciSyncAttrList m_syncAttrList;
|
|
NvSciBufAttrList m_rawBufAttrList;
|
|
NvSciBufAttrList m_buffAttrListOut;
|
|
NvSciBufAttrKeyValuePair pairArrayOut[10];
|
|
NvSciSyncFence *m_fence;
|
|
|
|
cudaExternalMemory_t extMemRawBuf;
|
|
cudaExternalSemaphore_t waitSem;
|
|
cudaStream_t streamToRun;
|
|
int m_cudaDeviceId;
|
|
CUuuid m_devUUID;
|
|
void *d_outputBuf;
|
|
size_t m_bufSize;
|
|
size_t imageWidth;
|
|
size_t imageHeight;
|
|
|
|
public:
|
|
cudaNvSciWait(NvSciBufModule bufModule, NvSciSyncModule syncModule,
|
|
int cudaDeviceId, int bufSize, NvSciSyncFence *fence)
|
|
: m_bufModule(bufModule),
|
|
m_syncModule(syncModule),
|
|
m_cudaDeviceId(cudaDeviceId),
|
|
m_bufSize(bufSize),
|
|
m_fence(fence) {
|
|
initCuda();
|
|
checkNvSciErrors(NvSciSyncAttrListCreate(m_syncModule, &m_syncAttrList));
|
|
checkNvSciErrors(NvSciBufAttrListCreate(m_bufModule, &m_rawBufAttrList));
|
|
|
|
setRawBufAttrList(m_bufSize);
|
|
checkCudaErrors(cudaDeviceGetNvSciSyncAttributes(
|
|
m_syncAttrList, m_cudaDeviceId, cudaNvSciSyncAttrWait));
|
|
}
|
|
|
|
~cudaNvSciWait() {
|
|
checkCudaErrors(cudaStreamDestroy(streamToRun));
|
|
checkCudaErrors(cudaDestroyExternalSemaphore(waitSem));
|
|
checkCudaErrors(cudaDestroyExternalMemory(extMemRawBuf));
|
|
checkCudaErrors(cudaFree(d_outputBuf));
|
|
}
|
|
|
|
void initCuda() {
|
|
checkCudaErrors(cudaSetDevice(m_cudaDeviceId));
|
|
checkCudaErrors(
|
|
cudaStreamCreateWithFlags(&streamToRun, cudaStreamNonBlocking));
|
|
CUresult res = cuDeviceGetUuid(&m_devUUID, m_cudaDeviceId);
|
|
if (res != CUDA_SUCCESS) {
|
|
fprintf(stderr, "Driver API error = %04d \n", res);
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
int major = 0, minor = 0;
|
|
checkCudaErrors(cudaDeviceGetAttribute(
|
|
&major, cudaDevAttrComputeCapabilityMajor, m_cudaDeviceId));
|
|
checkCudaErrors(cudaDeviceGetAttribute(
|
|
&minor, cudaDevAttrComputeCapabilityMinor, m_cudaDeviceId));
|
|
printf(
|
|
"[cudaNvSciWait] GPU Device %d: \"%s\" with compute capability "
|
|
"%d.%d\n\n",
|
|
m_cudaDeviceId, _ConvertSMVer2ArchName(major, minor), major, minor);
|
|
}
|
|
|
|
void setRawBufAttrList(uint64_t size) {
|
|
NvSciBufType bufType = NvSciBufType_RawBuffer;
|
|
bool cpuAccess = false;
|
|
NvSciBufAttrValAccessPerm perm = NvSciBufAccessPerm_ReadWrite;
|
|
NvSciBufAttrKeyValuePair rawBufAttrs[] = {
|
|
{NvSciBufRawBufferAttrKey_Size, &size, sizeof(size)},
|
|
{NvSciBufGeneralAttrKey_Types, &bufType, sizeof(bufType)},
|
|
{NvSciBufGeneralAttrKey_NeedCpuAccess, &cpuAccess, sizeof(cpuAccess)},
|
|
{NvSciBufGeneralAttrKey_RequiredPerm, &perm, sizeof(perm)},
|
|
{NvSciBufGeneralAttrKey_GpuId, &m_devUUID, sizeof(m_devUUID)},
|
|
};
|
|
|
|
checkNvSciErrors(NvSciBufAttrListSetAttrs(
|
|
m_rawBufAttrList, rawBufAttrs,
|
|
sizeof(rawBufAttrs) / sizeof(NvSciBufAttrKeyValuePair)));
|
|
}
|
|
|
|
NvSciSyncAttrList getNvSciSyncAttrList() { return m_syncAttrList; }
|
|
|
|
NvSciBufAttrList getNvSciRawBufAttrList() { return m_rawBufAttrList; }
|
|
|
|
void runImageGrayscale(std::string image_filename, size_t imageWidth,
|
|
size_t imageHeight) {
|
|
int numOfGPUs = 0;
|
|
checkCudaErrors(cudaGetDeviceCount(&numOfGPUs)); // For cuda init purpose
|
|
checkCudaErrors(cudaSetDevice(m_cudaDeviceId));
|
|
|
|
waitExternalSemaphore();
|
|
launchGrayScaleKernel((unsigned int *)d_outputBuf, image_filename,
|
|
imageWidth, imageHeight, streamToRun);
|
|
}
|
|
|
|
void cudaImportNvSciSemaphore(NvSciSyncObj syncObj) {
|
|
checkCudaErrors(cudaSetDevice(m_cudaDeviceId));
|
|
|
|
cudaExternalSemaphoreHandleDesc extSemDesc;
|
|
memset(&extSemDesc, 0, sizeof(extSemDesc));
|
|
extSemDesc.type = cudaExternalSemaphoreHandleTypeNvSciSync;
|
|
extSemDesc.handle.nvSciSyncObj = (void *)syncObj;
|
|
|
|
checkCudaErrors(cudaImportExternalSemaphore(&waitSem, &extSemDesc));
|
|
}
|
|
|
|
void waitExternalSemaphore() {
|
|
checkCudaErrors(cudaSetDevice(m_cudaDeviceId));
|
|
|
|
cudaExternalSemaphoreWaitParams waitParams;
|
|
memset(&waitParams, 0, sizeof(waitParams));
|
|
// For cross-process signaler-waiter applications need to use NvSciIpc
|
|
// and NvSciSync[Export|Import] utilities to share the NvSciSyncFence
|
|
// across process. This step is optional in single-process.
|
|
waitParams.params.nvSciSync.fence = (void *)m_fence;
|
|
waitParams.flags = 0;
|
|
|
|
checkCudaErrors(
|
|
cudaWaitExternalSemaphoresAsync(&waitSem, &waitParams, 1, streamToRun));
|
|
}
|
|
|
|
void cudaImportNvSciRawBuf(NvSciBufObj inputBufObj) {
|
|
checkCudaErrors(cudaSetDevice(m_cudaDeviceId));
|
|
|
|
checkNvSciErrors(NvSciBufObjGetAttrList(inputBufObj, &m_buffAttrListOut));
|
|
|
|
memset(pairArrayOut, 0, sizeof(NvSciBufAttrKeyValuePair) * 10);
|
|
pairArrayOut[0].key = NvSciBufRawBufferAttrKey_Size;
|
|
|
|
checkNvSciErrors(
|
|
NvSciBufAttrListGetAttrs(m_buffAttrListOut, pairArrayOut, 1));
|
|
|
|
uint64_t size = *(uint64_t *)pairArrayOut[0].value;
|
|
|
|
cudaExternalMemoryHandleDesc memHandleDesc;
|
|
memset(&memHandleDesc, 0, sizeof(memHandleDesc));
|
|
memHandleDesc.type = cudaExternalMemoryHandleTypeNvSciBuf;
|
|
memHandleDesc.handle.nvSciBufObject = inputBufObj;
|
|
memHandleDesc.size = size;
|
|
checkCudaErrors(cudaImportExternalMemory(&extMemRawBuf, &memHandleDesc));
|
|
|
|
cudaExternalMemoryBufferDesc bufferDesc;
|
|
memset(&bufferDesc, 0, sizeof(bufferDesc));
|
|
bufferDesc.offset = 0;
|
|
bufferDesc.size = size;
|
|
m_bufSize = size;
|
|
|
|
checkCudaErrors(cudaExternalMemoryGetMappedBuffer(
|
|
&d_outputBuf, extMemRawBuf, &bufferDesc));
|
|
}
|
|
};
|
|
|
|
// Producer thread body: with m_mutex held, submits the rotate-and-signal
// work through the given object, then raises workSubmitted and wakes one
// thread blocked on m_condVar.
void thread_rotateAndSignal(cudaNvSciSignal *cudaNvSciSignalObj,
                            unsigned char *imageData) {
  std::unique_lock<std::mutex> submitLock(m_mutex);

  cudaNvSciSignalObj->runRotateImageAndSignal(imageData);

  workSubmitted = true;
  m_condVar.notify_one();
  // submitLock releases m_mutex when it goes out of scope, i.e. after the
  // notification.
}
|
|
|
|
// Consumer thread body: blocks on m_condVar until workSubmitted is true,
// then, still holding m_mutex, submits the wait-and-grayscale work through
// the given object.
void thread_waitAndGrayscale(cudaNvSciWait *cudaNvSciWaitObj,
                             std::string image_filename, size_t imageWidth,
                             size_t imageHeight) {
  std::unique_lock<std::mutex> handoffLock(m_mutex);

  // Re-check the flag after every wake-up so spurious wake-ups are ignored.
  while (!workSubmitted) {
    m_condVar.wait(handoffLock);
  }

  cudaNvSciWaitObj->runImageGrayscale(image_filename, imageWidth, imageHeight);
}
|
|
|
|
// Stores the image description and chooses which CUDA device runs each half
// of the pipeline.
//
//   isMultiGPU - non-zero: signaler runs on deviceIds[0] and waiter on
//                deviceIds[1]; zero: both run on deviceIds[0].
//   deviceIds  - candidate device ordinals; must hold at least two entries
//                in multi-GPU mode and at least one otherwise.
//   imageData  - host image pixels, stored as-is (not copied).
//   width/height - image dimensions; the shared buffer size is computed as
//                width * height * sizeof(unsigned int) bytes.
//
// Terminates the process if deviceIds is too short, instead of indexing past
// the end of the vector.
cudaNvSci::cudaNvSci(int isMultiGPU, std::vector<int> &deviceIds,
                     unsigned char *imageData, size_t width, size_t height)
    : m_isMultiGPU(isMultiGPU),
      image_data(imageData),
      imageWidth(width),
      imageHeight(height) {
  const size_t requiredDevices = isMultiGPU ? 2 : 1;
  if (deviceIds.size() < requiredDevices) {
    fprintf(stderr,
            "cudaNvSci: %lu device id(s) required but only %lu provided\n",
            static_cast<unsigned long>(requiredDevices),
            static_cast<unsigned long>(deviceIds.size()));
    exit(EXIT_FAILURE);
  }

  if (isMultiGPU) {
    m_cudaNvSciSignalDeviceId = deviceIds[0];
    m_cudaNvSciWaitDeviceId = deviceIds[1];
  } else {
    m_cudaNvSciSignalDeviceId = m_cudaNvSciWaitDeviceId = deviceIds[0];
  }

  m_bufSize = imageWidth * imageHeight * sizeof(unsigned int);
}
|
|
|
|
// Opens the NvSciSync and NvSciBuf modules and allocates the zero-filled
// NvSciSyncFence later handed to the signaler and waiter objects.
// Terminates the process if the fence cannot be allocated, so a NULL fence
// is never passed on to CUDA.
void cudaNvSci::initNvSci() {
  checkNvSciErrors(NvSciSyncModuleOpen(&syncModule));
  checkNvSciErrors(NvSciBufModuleOpen(&buffModule));

  fence = (NvSciSyncFence *)calloc(1, sizeof(NvSciSyncFence));
  if (fence == NULL) {
    fprintf(stderr, "cudaNvSci: failed to allocate NvSciSyncFence\n");
    exit(EXIT_FAILURE);
  }
}
|
|
|
|
void cudaNvSci::runCudaNvSci(std::string &image_filename) {
|
|
initNvSci();
|
|
|
|
cudaNvSciSignal rotateAndSignal(buffModule, syncModule,
|
|
m_cudaNvSciSignalDeviceId, m_bufSize,
|
|
imageWidth, imageHeight, fence);
|
|
cudaNvSciWait waitAndGrayscale(buffModule, syncModule,
|
|
m_cudaNvSciWaitDeviceId, m_bufSize, fence);
|
|
|
|
rawBufUnreconciledList[0] = rotateAndSignal.getNvSciRawBufAttrList();
|
|
rawBufUnreconciledList[1] = waitAndGrayscale.getNvSciRawBufAttrList();
|
|
|
|
createNvSciRawBufObj();
|
|
|
|
imageBufUnreconciledList[0] = rotateAndSignal.getNvSciImageBufAttrList();
|
|
|
|
createNvSciBufImageObj();
|
|
|
|
rotateAndSignal.cudaImportNvSciRawBuf(rawBufObj);
|
|
rotateAndSignal.cudaImportNvSciImage(imageBufObj);
|
|
|
|
waitAndGrayscale.cudaImportNvSciRawBuf(rawBufObj);
|
|
|
|
syncUnreconciledList[0] = rotateAndSignal.getNvSciSyncAttrList();
|
|
syncUnreconciledList[1] = waitAndGrayscale.getNvSciSyncAttrList();
|
|
|
|
createNvSciSyncObj();
|
|
|
|
rotateAndSignal.cudaImportNvSciSemaphore(syncObj);
|
|
waitAndGrayscale.cudaImportNvSciSemaphore(syncObj);
|
|
|
|
std::thread rotateThread(&thread_rotateAndSignal, &rotateAndSignal,
|
|
image_data);
|
|
|
|
std::thread grayscaleThread(&thread_waitAndGrayscale, &waitAndGrayscale,
|
|
image_filename, imageWidth, imageHeight);
|
|
|
|
rotateThread.join();
|
|
grayscaleThread.join();
|
|
}
|
|
|
|
// Reconciles the two entries of rawBufUnreconciledList into
// rawBufReconciledList (conflicts go to buffConflictList) and allocates
// rawBufObj from the reconciled list.
void cudaNvSci::createNvSciRawBufObj() {
  const int kRawBufListCount = 2;

  checkNvSciErrors(NvSciBufAttrListReconcile(
      rawBufUnreconciledList, kRawBufListCount, &rawBufReconciledList,
      &buffConflictList));

  checkNvSciErrors(NvSciBufObjAlloc(rawBufReconciledList, &rawBufObj));

  printf("created NvSciBufObj\n");
}
|
|
|
|
// Reconciles the single entry of imageBufUnreconciledList into
// imageBufReconciledList (conflicts go to imageBufConflictList) and
// allocates imageBufObj from the reconciled list.
void cudaNvSci::createNvSciBufImageObj() {
  const int kImageListCount = 1;

  checkNvSciErrors(NvSciBufAttrListReconcile(
      imageBufUnreconciledList, kImageListCount, &imageBufReconciledList,
      &imageBufConflictList));

  checkNvSciErrors(NvSciBufObjAlloc(imageBufReconciledList, &imageBufObj));

  printf("created NvSciBufImageObj\n");
}
|
|
|
|
// Reconciles the two entries of syncUnreconciledList into syncReconciledList
// (conflicts go to syncConflictList) and allocates syncObj from the
// reconciled list.
void cudaNvSci::createNvSciSyncObj() {
  const int kSyncListCount = 2;

  checkNvSciErrors(NvSciSyncAttrListReconcile(
      syncUnreconciledList, kSyncListCount, &syncReconciledList,
      &syncConflictList));

  checkNvSciErrors(NvSciSyncObjAlloc(syncReconciledList, &syncObj));

  printf("created NvSciSyncObj\n");
}
|