/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

// Implements BGR 3 progressive planars frames batch resize

#include <cuda.h>
#include <cuda_runtime.h>

#include "resize_convert.h"

/**
 * Resizes a batch of 3-plane float frames by sampling a source texture.
 *
 * Launch layout: 2D thread blocks, one thread per destination pixel (x, y).
 * blockIdx.z selects the first frame handled by a thread; each thread then
 * advances by gridDim.z until `batch` frames have been written, so any
 * grid Z extent >= 1 covers the whole batch. No shared memory is used.
 *
 * Source layout (as addressed below): all planes are stacked vertically in
 * one texture, plane `channel` of frame `i` starting at row
 * (3 * i + channel) * nSrcHeight.
 * Destination layout: plane `channel` of frame `i` starts at element
 * (3 * i + channel) * nDstPitch * nDstHeight.
 *
 * @param texSrc      texture object covering the stacked source planes
 * @param pDst        destination buffer (float elements)
 * @param nDstPitch   destination row pitch, in float elements
 * @param nDstHeight  destination plane height, in rows
 * @param nSrcHeight  source plane height, in rows
 * @param batch       number of frames to process
 * @param scaleX      source pixels per destination pixel, horizontally
 * @param scaleY      source pixels per destination pixel, vertically
 * @param cropX       left edge of the source crop window
 * @param cropY       top edge of the source crop window
 * @param cropW       width of the source crop window
 * @param cropH       height of the source crop window
 */
__global__ void resizeBGRplanarBatchKernel(cudaTextureObject_t texSrc,
    float *pDst, int nDstPitch, int nDstHeight, int nSrcHeight,
    int batch, float scaleX, float scaleY,
    int cropX, int cropY, int cropW, int cropH) {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    // Only pixels inside the scaled crop window are written.
    if (x >= (int)(cropW / scaleX) || y >= (int)(cropH / scaleY))
        return;

    // Offsets are computed in size_t: frame * plane * pitch * height can
    // exceed the range of a 32-bit int for large batches or frames.
    const size_t frameSize = (size_t)nDstPitch * nDstHeight;
    const size_t pixelOffset = (size_t)y * nDstPitch + x;

    for (int i = blockIdx.z; i < batch; i += gridDim.z) {
#pragma unroll
        for (int channel = 0; channel < 3; channel++) {
            float *p = pDst + ((size_t)i * 3 + channel) * frameSize + pixelOffset;
            *p = tex2D<float>(texSrc, x * scaleX + cropX,
                              ((3 * i + channel) * nSrcHeight + y * scaleY + cropY));
        }
    }
}

/**
 * Host-side driver: binds the source buffer to one or two pitch-2D textures
 * (linear filtering) and launches resizeBGRplanarBatchKernel on `stream`.
 *
 * The stacked source height (nSrcHeight * 3 * nBatchSize) is split into
 * tiles of at most 65536 rows each (presumably the 2D texture height limit
 * -- confirm against the target device). Inputs that would need more than
 * two tiles are silently ignored, as is an empty batch.
 *
 * If cropW or cropH is 0 the crop window defaults to the full source frame.
 * When whSameResizeRatio is true the larger of the two scale factors is used
 * for both axes.
 *
 * Pitches (nSrcPitch, nDstPitch) are in float elements, not bytes.
 */
static void resizeBGRplanarBatchCore(
    float *dpSrc, int nSrcPitch, int nSrcWidth, int nSrcHeight,
    float *dpDst, int nDstPitch, int nDstWidth, int nDstHeight,
    int nBatchSize, cudaStream_t stream, bool whSameResizeRatio,
    int cropX, int cropY, int cropW, int cropH) {
    const int kMaxTiles = 2;
    const int kMaxTileRows = 65536;
    cudaTextureObject_t texSrc[kMaxTiles];

    // Nothing to resize; avoids creating a zero-height texture.
    if (nBatchSize <= 0) return;

    int nTiles = 1;
    const int h = nSrcHeight * 3 * nBatchSize;
    while ((h + nTiles - 1) / nTiles > kMaxTileRows) nTiles++;
    if (nTiles > kMaxTiles) return;

    const int batchTile = nBatchSize / nTiles;
    const int batchTileLast = nBatchSize - batchTile * (nTiles - 1);

    // A zero-sized crop means "use the whole source frame".
    if (cropW == 0 || cropH == 0) {
        cropX = 0;
        cropY = 0;
        cropW = nSrcWidth;
        cropH = nSrcHeight;
    }

    float scaleX = (cropW * 1.0f / nDstWidth);
    float scaleY = (cropH * 1.0f / nDstHeight);
    if (whSameResizeRatio == true)
        scaleX = scaleY = scaleX > scaleY ? scaleX : scaleY;

    const dim3 block(32, 32, 1);

    for (int iTile = 0; iTile < nTiles; ++iTile) {
        const int bs = (iTile == nTiles - 1) ? batchTileLast : batchTile;

        // Element offsets are computed in size_t to avoid 32-bit overflow.
        float *dpSrcNew =
            dpSrc + (size_t)iTile * batchTile * 3 * nSrcHeight * nSrcPitch;
        float *dpDstNew =
            dpDst + (size_t)iTile * batchTile * 3 * nDstHeight * nDstPitch;

        cudaResourceDesc resDesc = {};
        resDesc.resType = cudaResourceTypePitch2D;
        resDesc.res.pitch2D.devPtr = dpSrcNew;
        resDesc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
        resDesc.res.pitch2D.width = nSrcWidth;
        resDesc.res.pitch2D.height = bs * 3 * nSrcHeight;
        resDesc.res.pitch2D.pitchInBytes = nSrcPitch * sizeof(float);

        cudaTextureDesc texDesc = {};
        texDesc.filterMode = cudaFilterModeLinear;
        texDesc.readMode = cudaReadModeElementType;

        checkCudaErrors(
            cudaCreateTextureObject(&texSrc[iTile], &resDesc, &texDesc, NULL));

        // Restricting blocks in Z-dim till 32 to not launch too many blocks;
        // the kernel loops over the remaining frames.
        const unsigned int gridDimZ = (bs > 32) ? 32u : (unsigned int)bs;

        dim3 grid((cropW * 1.0f / scaleX + block.x - 1) / block.x,
                  (cropH * 1.0f / scaleY + block.y - 1) / block.y, gridDimZ);

        resizeBGRplanarBatchKernel<<<grid, block, 0, stream>>>(
            texSrc[iTile], dpDstNew, nDstPitch, nDstHeight, nSrcHeight, bs,
            scaleX, scaleY, cropX, cropY, cropW, cropH);
        // Kernel launches do not return a status; surface bad launch
        // configurations here instead of at some later, unrelated call.
        checkCudaErrors(cudaGetLastError());
    }

    for (int iTile = 0; iTile < nTiles; ++iTile)
        checkCudaErrors(cudaDestroyTextureObject(texSrc[iTile]));
}

/**
 * Public entry point: resizes `nBatchSize` planar 3-channel float frames
 * from dpSrc into dpDst on `stream`.
 *
 * Forwards to resizeBGRplanarBatchCore; only the parameter order differs
 * (whSameResizeRatio comes last here). See that function for the crop and
 * scaling rules.
 */
void resizeBGRplanarBatch(
    float *dpSrc, int nSrcPitch, int nSrcWidth, int nSrcHeight,
    float *dpDst, int nDstPitch, int nDstWidth, int nDstHeight,
    int nBatchSize, cudaStream_t stream,
    int cropX, int cropY, int cropW, int cropH, bool whSameResizeRatio) {
    resizeBGRplanarBatchCore(dpSrc, nSrcPitch, nSrcWidth, nSrcHeight,
                             dpDst, nDstPitch, nDstWidth, nDstHeight,
                             nBatchSize, stream, whSameResizeRatio,
                             cropX, cropY, cropW, cropH);
}