cuda-samples/Samples/cudaNvSciNvMedia/cuda_consumer.cu
Mahesh Doijade 92b0568792 Add cudaNvSciNvMedia sample with/without nvsci* APIs, it takes RGBA image as input and produces YUV via nvmedia
this YUV is consumed by cuda which converts it to grayscale image which is written to file as output
2020-11-24 16:28:04 +05:30

373 lines
18 KiB
Plaintext

/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <iostream>
#include <cuda_runtime.h>
#include "cuda_consumer.h"
#include <helper_image.h>
#include "nvmedia_image_nvscibuf.h"
#include "nvmedia_utils/cmdline.h"
// Enable this to 1 if require cuda processed output to ppm file.
#define WRITE_OUTPUT_IMAGE 0
#define checkNvSciErrors(call) \
do { \
NvSciError _status = call; \
if (NvSciError_Success != _status) { \
printf( \
"NVSCI call in file '%s' in line %i returned" \
" %d, expected %d\n", \
__FILE__, __LINE__, _status, NvSciError_Success); \
fflush(stdout); \
exit(EXIT_FAILURE); \
} \
} while (0)
__global__ static void yuvToGrayscale(cudaSurfaceObject_t surfaceObject, unsigned int *dstImage, int32_t imageWidth, int32_t imageHeight)
{
size_t x = blockIdx.x*blockDim.x + threadIdx.x;
size_t y = blockIdx.y*blockDim.y + threadIdx.y;
uchar4* dstImageUchar4 = (uchar4*)dstImage;
for ( ; x < imageWidth && y < imageHeight; x += gridDim.x*blockDim.x, y += gridDim.y*blockDim.y)
{
int colInBytes = x * sizeof(unsigned char);
unsigned char luma = surf2Dread<unsigned char>(surfaceObject, colInBytes, y);
uchar4 grayscalePix = make_uchar4(luma, luma, luma, 0);
dstImageUchar4[y*imageWidth + x] = grayscalePix;
}
}
static void cudaImportNvSciSync(cudaExternalSemaphore_t &extSem, NvSciSyncObj &syncObj)
{
cudaExternalSemaphoreHandleDesc extSemDesc;
memset(&extSemDesc, 0, sizeof(extSemDesc));
extSemDesc.type = cudaExternalSemaphoreHandleTypeNvSciSync;
extSemDesc.handle.nvSciSyncObj = (void *)syncObj;
checkCudaErrors(cudaImportExternalSemaphore(&extSem, &extSemDesc));
}
static void waitExternalSemaphore(cudaExternalSemaphore_t &waitSem, NvSciSyncFence *fence,
cudaStream_t stream) {
cudaExternalSemaphoreWaitParams waitParams;
memset(&waitParams, 0, sizeof(waitParams));
// For cross-process signaler-waiter applications need to use NvSciIpc
// and NvSciSync[Export|Import] utilities to share the NvSciSyncFence
// across process. This step is optional in single-process.
waitParams.params.nvSciSync.fence = (void *)fence;
waitParams.flags = 0;
checkCudaErrors(cudaWaitExternalSemaphoresAsync(&waitSem, &waitParams, 1, stream));
}
static void signalExternalSemaphore(cudaExternalSemaphore_t &signalSem, NvSciSyncFence *fence,
cudaStream_t stream) {
cudaExternalSemaphoreSignalParams signalParams;
memset(&signalParams, 0, sizeof(signalParams));
// For cross-process signaler-waiter applications need to use NvSciIpc
// and NvSciSync[Export|Import] utilities to share the NvSciSyncFence
// across process. This step is optional in single-process.
signalParams.params.nvSciSync.fence = (void *)fence;
signalParams.flags = 0;
checkCudaErrors(cudaSignalExternalSemaphoresAsync(&signalSem, &signalParams,
1, stream));
}
static void yuvToGrayscaleCudaKernel(cudaExternalResInterop& cudaExtResObj, int32_t imageWidth, int32_t imageHeight)
{
#if WRITE_OUTPUT_IMAGE
unsigned int *h_dstImage;
checkCudaErrors(cudaMallocHost(&h_dstImage, sizeof(unsigned int)*imageHeight*imageWidth));
#endif
dim3 block(16, 16, 1);
dim3 grid((imageWidth/block.x)+1, (imageHeight/block.y)+1, 1);
yuvToGrayscale<<<grid, block, 0, cudaExtResObj.stream>>>(cudaExtResObj.cudaSurfaceNvmediaBuf[0], cudaExtResObj.d_outputImage, imageWidth, imageHeight);
#if WRITE_OUTPUT_IMAGE
checkCudaErrors(cudaMemcpyAsync(h_dstImage, cudaExtResObj.d_outputImage, sizeof(unsigned int)*imageHeight*imageWidth, cudaMemcpyDeviceToHost, cudaExtResObj.stream));
checkCudaErrors(cudaStreamSynchronize(cudaExtResObj.stream));
char outputFilename[1024];
std::string image_filename = "Grayscale";
strcpy(outputFilename, image_filename.c_str());
strcpy(outputFilename + image_filename.length(), "_nvsci_out.ppm");
sdkSavePPM4ub(outputFilename, (unsigned char *)h_dstImage, imageWidth, imageHeight);
printf("Wrote '%s'\n", outputFilename);
checkCudaErrors(cudaFreeHost(h_dstImage));
#endif
}
static void cudaImportNvSciImage(cudaExternalResInterop &cudaExtResObj, NvSciBufObj& inputBufObj)
{
NvSciBufModule module = NULL;
NvSciBufAttrList attrlist = NULL;
NvSciBufAttrKeyValuePair pairArrayOut[10];
checkNvSciErrors(NvSciBufModuleOpen(&module));
checkNvSciErrors(NvSciBufAttrListCreate(module, &attrlist));
checkNvSciErrors(NvSciBufObjGetAttrList(inputBufObj, &attrlist));
memset(pairArrayOut, 0, sizeof(NvSciBufAttrKeyValuePair) * 10);
int numAttrs = 0;
pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_Size;
pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlaneChannelCount;
pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlaneCount;
pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlaneWidth;
pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlaneHeight;
pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_Layout;
pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlaneBitsPerPixel;
checkNvSciErrors(NvSciBufAttrListGetAttrs(attrlist, pairArrayOut, numAttrs));
uint64_t size = *(uint64_t *)pairArrayOut[0].value;
uint8_t channelCount = *(uint8_t *)pairArrayOut[1].value;
cudaExtResObj.planeCount = *(int32_t *)pairArrayOut[2].value;
cudaExtResObj.imageWidth = (int32_t*) malloc(sizeof(int32_t)*cudaExtResObj.planeCount);
cudaExtResObj.imageHeight = (int32_t*) malloc(sizeof(int32_t)*cudaExtResObj.planeCount);
memcpy(cudaExtResObj.imageWidth, (int32_t *)pairArrayOut[3].value, cudaExtResObj.planeCount * sizeof(int32_t));
memcpy(cudaExtResObj.imageHeight, (int32_t *)pairArrayOut[4].value, cudaExtResObj.planeCount * sizeof(int32_t));
NvSciBufAttrValImageLayoutType layout = *(NvSciBufAttrValImageLayoutType *)pairArrayOut[5].value;
uint32_t bitsPerPixel = *(uint32_t*)pairArrayOut[6].value;
if (layout != NvSciBufImage_BlockLinearType) {
printf("Image layout is not block linear.. waiving execution\n");
exit(EXIT_WAIVED);
}
cudaExternalMemoryHandleDesc memHandleDesc;
memset(&memHandleDesc, 0, sizeof(memHandleDesc));
memHandleDesc.type = cudaExternalMemoryHandleTypeNvSciBuf;
memHandleDesc.handle.nvSciBufObject = inputBufObj;
memHandleDesc.size = size;
checkCudaErrors(cudaImportExternalMemory(&cudaExtResObj.extMemImageBuf, &memHandleDesc));
cudaExtResObj.d_mipmapArray = (cudaMipmappedArray_t*) malloc(sizeof(cudaMipmappedArray_t) * cudaExtResObj.planeCount);
for (int i = 0; i < cudaExtResObj.planeCount; i++) {
cudaExtent extent = {};
memset(&extent, 0, sizeof(extent));
extent.width = cudaExtResObj.imageWidth[i];
extent.height = cudaExtResObj.imageHeight[i];
extent.depth = 0;
cudaChannelFormatDesc desc;
switch (channelCount) {
case 1:
default:
desc = cudaCreateChannelDesc(bitsPerPixel, 0, 0, 0, cudaChannelFormatKindUnsigned);
break;
case 2:
desc = cudaCreateChannelDesc(bitsPerPixel, bitsPerPixel, 0, 0, cudaChannelFormatKindUnsigned);
break;
case 3:
desc = cudaCreateChannelDesc(bitsPerPixel, bitsPerPixel, bitsPerPixel, 0, cudaChannelFormatKindUnsigned);
break;
case 4:
desc = cudaCreateChannelDesc(bitsPerPixel, bitsPerPixel, bitsPerPixel, bitsPerPixel, cudaChannelFormatKindUnsigned);
break;
}
cudaExternalMemoryMipmappedArrayDesc mipmapDesc = {0};
mipmapDesc.offset = 0;
mipmapDesc.formatDesc = desc;
mipmapDesc.extent = extent;
mipmapDesc.flags = 0;
mipmapDesc.numLevels = 1;
checkCudaErrors(cudaExternalMemoryGetMappedMipmappedArray(&cudaExtResObj.d_mipmapArray[i], cudaExtResObj.extMemImageBuf, &mipmapDesc));
}
}
static cudaSurfaceObject_t createCudaSurface(cudaArray_t &d_mipLevelArray)
{
cudaResourceDesc resourceDesc;
memset(&resourceDesc, 0, sizeof(resourceDesc));
resourceDesc.resType = cudaResourceTypeArray;
resourceDesc.res.array.array = d_mipLevelArray;
cudaSurfaceObject_t surfaceObject;
checkCudaErrors(cudaCreateSurfaceObject(&surfaceObject, &resourceDesc));
return surfaceObject;
}
static cudaStream_t createCudaStream(int deviceId)
{
checkCudaErrors(cudaSetDevice(deviceId));
cudaStream_t stream;
checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
return stream;
}
// CUDA setup buffers/synchronization objects for interop via NvSci API.
void setupCuda(cudaExternalResInterop& cudaExtResObj, NvSciBufObj& inputBufObj,
NvSciSyncObj &syncObj, NvSciSyncObj &cudaSignalerSyncObj, int deviceId)
{
checkCudaErrors(cudaSetDevice(deviceId));
cudaImportNvSciSync(cudaExtResObj.waitSem, syncObj);
cudaImportNvSciSync(cudaExtResObj.signalSem, cudaSignalerSyncObj);
cudaImportNvSciImage(cudaExtResObj, inputBufObj);
cudaExtResObj.d_mipLevelArray = (cudaArray_t *) malloc(sizeof(cudaArray_t) * cudaExtResObj.planeCount);
cudaExtResObj.cudaSurfaceNvmediaBuf = (cudaSurfaceObject_t *) malloc(sizeof(cudaSurfaceObject_t) * cudaExtResObj.planeCount);
for (int i = 0; i < cudaExtResObj.planeCount; ++i) {
uint32_t mipLevelId = 0;
checkCudaErrors(cudaGetMipmappedArrayLevel(&cudaExtResObj.d_mipLevelArray[i], cudaExtResObj.d_mipmapArray[i], mipLevelId));
cudaExtResObj.cudaSurfaceNvmediaBuf[i] = createCudaSurface(cudaExtResObj.d_mipLevelArray[i]);
}
cudaExtResObj.stream = createCudaStream(deviceId);
checkCudaErrors(cudaMalloc(&cudaExtResObj.d_outputImage, sizeof(unsigned int) * cudaExtResObj.imageWidth[0] * cudaExtResObj.imageHeight[0]));
}
// CUDA clean up buffers used **with** NvSci API.
void cleanupCuda(cudaExternalResInterop& cudaExtResObj)
{
for (int i=0; i < cudaExtResObj.planeCount; i++) {
checkCudaErrors(cudaDestroySurfaceObject(cudaExtResObj.cudaSurfaceNvmediaBuf[i]));
checkCudaErrors(cudaFreeMipmappedArray(cudaExtResObj.d_mipmapArray[i]));
}
free(cudaExtResObj.d_mipmapArray);
free(cudaExtResObj.d_mipLevelArray);
free(cudaExtResObj.cudaSurfaceNvmediaBuf);
free(cudaExtResObj.imageWidth);
free(cudaExtResObj.imageHeight);
checkCudaErrors(cudaDestroyExternalSemaphore(cudaExtResObj.waitSem));
checkCudaErrors(cudaDestroyExternalSemaphore(cudaExtResObj.signalSem));
checkCudaErrors(cudaDestroyExternalMemory(cudaExtResObj.extMemImageBuf));
checkCudaErrors(cudaStreamDestroy(cudaExtResObj.stream));
checkCudaErrors(cudaFree(cudaExtResObj.d_outputImage));
}
void runCudaOperation(cudaExternalResInterop& cudaExtResObj, NvSciSyncFence *cudaWaitFence,
NvSciSyncFence *cudaSignalFence, int deviceId, int iterations)
{
checkCudaErrors(cudaSetDevice(deviceId));
static int64_t launch = 0;
waitExternalSemaphore(cudaExtResObj.waitSem, cudaWaitFence, cudaExtResObj.stream);
// run cuda kernel over surface object of the LUMA surface part to extract grayscale.
yuvToGrayscaleCudaKernel(cudaExtResObj, cudaExtResObj.imageWidth[0], cudaExtResObj.imageHeight[0]);
// signal fence till the second last iterations for NvMedia2DBlit to wait for cuda signal
// and for final iteration as there is no corresponding NvMedia operation pending
// therefore we end with cudaStreamSynchronize()
if (launch < iterations-1) {
signalExternalSemaphore(cudaExtResObj.signalSem, cudaSignalFence, cudaExtResObj.stream);
}
else {
checkCudaErrors(cudaStreamSynchronize(cudaExtResObj.stream));
}
launch++;
}
// CUDA imports and operates on NvSci buffer/synchronization objects
void setupCuda(Blit2DTest *ctx, cudaResources &cudaResObj, int deviceId)
{
checkCudaErrors(cudaSetDevice(deviceId));
cudaResObj.d_yuvArray = (cudaArray_t *) malloc(sizeof(cudaArray_t) * ctx->numSurfaces);
cudaResObj.cudaSurfaceNvmediaBuf = (cudaSurfaceObject_t*) malloc(sizeof(cudaSurfaceObject_t) * ctx->numSurfaces);
cudaChannelFormatDesc channelDesc;
switch (ctx->bytesPerPixel) {
case 1:
default:
channelDesc = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned);
break;
}
for(int k = 0; k < ctx->numSurfaces; k++) {
checkCudaErrors(cudaMallocArray(&cudaResObj.d_yuvArray[k], &channelDesc, ctx->widthSurface * ctx->xScalePtr[k] * ctx->bytesPerPixel,
ctx->heightSurface * ctx->yScalePtr[k]));
cudaResObj.cudaSurfaceNvmediaBuf[k] = createCudaSurface(cudaResObj.d_yuvArray[k]);
}
checkCudaErrors(cudaMalloc(&cudaResObj.d_outputImage, sizeof(unsigned int) * ctx->widthSurface * ctx->heightSurface));
cudaResObj.stream = createCudaStream(deviceId);
}
// CUDA clean up buffers used **without** NvSci API.
void cleanupCuda(Blit2DTest *ctx, cudaResources &cudaResObj)
{
for(int k = 0; k < ctx->numSurfaces; k++) {
checkCudaErrors(cudaDestroySurfaceObject(cudaResObj.cudaSurfaceNvmediaBuf[k]));
checkCudaErrors(cudaFreeArray(cudaResObj.d_yuvArray[k]));
}
free(cudaResObj.cudaSurfaceNvmediaBuf);
checkCudaErrors(cudaStreamDestroy(cudaResObj.stream));
checkCudaErrors(cudaFree(cudaResObj.d_outputImage));
}
static void yuvToGrayscaleCudaKernelNonNvSci(cudaResources &cudaResObj, int deviceId, int32_t imageWidth, int32_t imageHeight)
{
#if WRITE_OUTPUT_IMAGE
unsigned int *h_dstImage;
checkCudaErrors(cudaMallocHost(&h_dstImage, sizeof(unsigned int)*imageHeight*imageWidth));
#endif
dim3 block(16, 16, 1);
dim3 grid((imageWidth/block.x)+1, (imageHeight/block.y)+1, 1);
yuvToGrayscale<<<grid, block, 0, cudaResObj.stream>>>(cudaResObj.cudaSurfaceNvmediaBuf[0], cudaResObj.d_outputImage, imageWidth, imageHeight);
#if WRITE_OUTPUT_IMAGE
checkCudaErrors(cudaMemcpyAsync(h_dstImage, cudaResObj.d_outputImage, sizeof(unsigned int)*imageHeight*imageWidth, cudaMemcpyDeviceToHost, cudaResObj.stream));
checkCudaErrors(cudaStreamSynchronize(cudaResObj.stream));
char outputFilename[1024];
std::string image_filename = "Grayscale";
strcpy(outputFilename, image_filename.c_str());
strcpy(outputFilename + image_filename.length(), "_non-nvsci_out.ppm");
sdkSavePPM4ub(outputFilename, (unsigned char *)h_dstImage, imageWidth, imageHeight);
printf("Wrote '%s'\n", outputFilename);
checkCudaErrors(cudaFreeHost(h_dstImage));
#else
checkCudaErrors(cudaStreamSynchronize(cudaResObj.stream));
#endif
}
// CUDA operates **without** NvSci APIs buffer/synchronization objects.
void runCudaOperation(Blit2DTest *ctx, cudaResources &cudaResObj, int deviceId)
{
for(int k = 0; k < ctx->numSurfaces; k++) {
checkCudaErrors(cudaMemcpy2DToArray(cudaResObj.d_yuvArray[k], 0, 0, ctx->dstBuff[k],
ctx->widthSurface * ctx->xScalePtr[k] * ctx->bytesPerPixel,
ctx->widthSurface * ctx->xScalePtr[k] * ctx->bytesPerPixel,
ctx->heightSurface * ctx->yScalePtr[k], cudaMemcpyHostToDevice));
}
// run cuda kernel over surface object of the LUMA surface part to extract grayscale.
yuvToGrayscaleCudaKernelNonNvSci(cudaResObj, deviceId, ctx->widthSurface, ctx->heightSurface);
}