cuda-samples/Samples/NV12toBGRandResize/bgr_resize.cu

/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */


// Implements BGR 3 progressive planars frames batch resize

#include <cuda.h>
#include <cuda_runtime.h>
#include "resize_convert.h"

// Resizes a batch of planar 3-channel float frames by sampling a 2D texture.
//
// Launch layout: each thread handles one destination pixel (x, y) derived
// from the x/y block and thread indices; the batch is walked starting at
// blockIdx.z in steps of gridDim.z, so any gridDim.z >= 1 covers all frames.
// No shared memory is used.
//
// Parameters:
//   texSrc      texture holding the source planes stacked vertically: plane
//               (3 * i + channel) starts at texture row
//               (3 * i + channel) * nSrcHeight.
//   pDst        destination buffer; plane (3 * i + channel) starts at element
//               (3 * i + channel) * nDstPitch * nDstHeight.
//   nDstPitch   destination row stride, in float elements.
//   nDstHeight  rows per destination plane.
//   nSrcHeight  rows per source plane inside the texture.
//   batch       number of frames to process.
//   scaleX/Y    source-pixels-per-destination-pixel factors.
//   cropX/Y     offset of the crop rectangle inside each source plane.
//   cropW/H     size of the crop rectangle; threads whose (x, y) fall outside
//               (int)(cropW/scaleX) x (int)(cropH/scaleY) write nothing.
__global__ void resizeBGRplanarBatchKernel(cudaTextureObject_t texSrc,
    float *pDst, int nDstPitch, int nDstHeight, int nSrcHeight,
    int batch, float scaleX, float scaleY,
    int cropX, int cropY, int cropW, int cropH) {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    // Grid tail guard: the grid is rounded up to whole blocks.
    if (x >= (int)(cropW/scaleX) || y >= (int)(cropH/scaleY))
        return;

    // Offsets are computed in size_t: plane index * plane size can exceed the
    // range of a 32-bit int for large destination frames and batches.
    const size_t frameSize = (size_t)nDstPitch * nDstHeight;
    const size_t pixelOffset = (size_t)y * nDstPitch + x;

    for (int i = blockIdx.z; i < batch; i += gridDim.z) {
        #pragma unroll
        for (int channel=0; channel < 3; channel++){
            const size_t plane = (size_t)i * 3 + channel;
            // The texture row selects the source plane first, then the
            // scaled and crop-shifted row inside that plane.
            pDst[plane * frameSize + pixelOffset] =
                tex2D<float>(texSrc, x * scaleX + cropX,
                                ((3 * i + channel) * nSrcHeight + y * scaleY + cropY));
        }
    }
}


// Host-side driver for the batched planar resize.
//
// Wraps the source buffer in one or two pitch-2D texture objects with linear
// filtering (the planes of all frames are stacked vertically, and the stack
// is split into tiles so that a tile is at most 65536 rows high), then
// launches resizeBGRplanarBatchKernel once per tile on `stream`.
//
// Parameters:
//   dpSrc, nSrcPitch, nSrcWidth, nSrcHeight
//       source buffer, row stride in float elements, and plane size.
//   dpDst, nDstPitch, nDstWidth, nDstHeight
//       destination buffer, row stride in float elements, and plane size.
//   nBatchSize          number of frames (3 planes each).
//   stream              stream the kernels are launched on.
//   whSameResizeRatio   when true, both axes use the larger of the two scale
//                       factors.
//   cropX/Y/W/H         crop rectangle in the source; a zero width or height
//                       selects the whole source plane.
//
// The function returns without doing anything when the batch or destination
// is empty, when the output area rounds to zero blocks, or when more than two
// tiles would be required.
static void resizeBGRplanarBatchCore(
        float *dpSrc, int nSrcPitch, int nSrcWidth, int nSrcHeight,
        float *dpDst, int nDstPitch, int nDstWidth, int nDstHeight,
        int nBatchSize, cudaStream_t stream, bool whSameResizeRatio,
        int cropX, int cropY, int cropW, int cropH) {
    // Nothing to resize; also avoids a zero-height texture, a zero-sized
    // grid and a division by zero in the scale factors below.
    if (nBatchSize <= 0 || nDstWidth <= 0 || nDstHeight <= 0)
        return;

    cudaTextureObject_t texSrc[2];
    int nTiles = 1, h, iTile;

    // Total number of texture rows when every plane is stacked vertically.
    h = nSrcHeight * 3 * nBatchSize;
    while ((h + nTiles - 1) / nTiles > 65536)
        nTiles++;

    // NOTE(review): inputs that need more than two tiles are dropped
    // silently; callers get no indication that nothing was written.
    if (nTiles > 2)
        return;

    int batchTile = nBatchSize / nTiles;
    int batchTileLast = nBatchSize - batchTile * (nTiles-1);

    // A zero-sized crop means "use the full source plane".
    if(cropW == 0 || cropH == 0) {
        cropX = 0;
        cropY = 0;
        cropW = nSrcWidth;
        cropH = nSrcHeight;
    }

    float scaleX = (cropW*1.0f / nDstWidth);
    float scaleY = (cropH*1.0f / nDstHeight);

    if(whSameResizeRatio == true)
        scaleX = scaleY = scaleX > scaleY ? scaleX : scaleY;

    // The x/y launch shape is the same for every tile.
    const dim3 block(32, 32, 1);
    const unsigned int gridX = (cropW*1.0f/scaleX + block.x - 1) / block.x;
    const unsigned int gridY = (cropH*1.0f/scaleY + block.y - 1) / block.y;

    // The output area rounds down to zero blocks: there is nothing to write,
    // and a zero-sized grid would be an invalid launch configuration.
    if (gridX == 0 || gridY == 0)
        return;

    // Element offsets between consecutive tiles, kept in size_t so that the
    // products cannot overflow a 32-bit int.
    const size_t srcTileStride = (size_t)batchTile * 3 * nSrcHeight * nSrcPitch;
    const size_t dstTileStride = (size_t)batchTile * 3 * nDstHeight * nDstPitch;

    for (iTile = 0; iTile < nTiles; ++iTile) {
        // The last tile also takes the remainder of the batch.
        int bs = (iTile == nTiles - 1) ? batchTileLast : batchTile;
        float *dpSrcNew = dpSrc + iTile * srcTileStride;

        cudaResourceDesc resDesc = {};
        resDesc.resType = cudaResourceTypePitch2D;
        resDesc.res.pitch2D.devPtr = dpSrcNew;
        resDesc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
        resDesc.res.pitch2D.width = nSrcWidth;
        resDesc.res.pitch2D.height = bs * 3 * nSrcHeight;
        resDesc.res.pitch2D.pitchInBytes = nSrcPitch * sizeof(float);
        cudaTextureDesc texDesc = {};
        texDesc.filterMode = cudaFilterModeLinear;
        texDesc.readMode = cudaReadModeElementType;

        checkCudaErrors(cudaCreateTextureObject(&texSrc[iTile], &resDesc, &texDesc, NULL));
        float *dpDstNew = dpDst + iTile * dstTileStride;

        // Restricting blocks in Z-dim till 32 to not launch too many blocks;
        // the kernel strides over the batch by gridDim.z.
        const unsigned int gridZ = (bs > 32) ? 32 : bs;
        dim3 grid(gridX, gridY, gridZ);

        resizeBGRplanarBatchKernel<<<grid, block, 0, stream>>>
                (texSrc[iTile], dpDstNew, nDstPitch, nDstHeight, nSrcHeight,
                bs, scaleX, scaleY, cropX, cropY, cropW, cropH);
        // Kernel launches do not return a status; pick up launch-time
        // failures (e.g. a bad configuration) here instead of losing them.
        checkCudaErrors(cudaGetLastError());
    }

    // NOTE(review): the texture objects are destroyed right after the
    // asynchronous launches, without synchronizing `stream` — confirm this
    // is safe for the targeted CUDA versions.
    for (iTile = 0; iTile < nTiles; ++iTile)
        checkCudaErrors(cudaDestroyTextureObject(texSrc[iTile]));
}

// Public entry point for the batched planar resize.
//
// Forwards every argument unchanged to resizeBGRplanarBatchCore; the only
// difference between the two signatures is the position of
// whSameResizeRatio, which comes last here and before the crop rectangle in
// the core routine.
void resizeBGRplanarBatch(
        float *dpSrc, int nSrcPitch, int nSrcWidth, int nSrcHeight,
        float *dpDst, int nDstPitch, int nDstWidth, int nDstHeight,
        int nBatchSize, cudaStream_t stream,
        int cropX, int cropY, int cropW, int cropH, bool whSameResizeRatio) {
    resizeBGRplanarBatchCore(
        // source buffer and geometry
        dpSrc, nSrcPitch, nSrcWidth, nSrcHeight,
        // destination buffer and geometry
        dpDst, nDstPitch, nDstWidth, nDstHeight,
        // batch and launch controls
        nBatchSize, stream, whSameResizeRatio,
        // crop rectangle
        cropX, cropY, cropW, cropH);
}
add and update samples for CUDA 11.5 2021-10-21 19:04:49 +08:00			`/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.`
Add and update samples with CUDA 10.1 Update 1 support 2019-04-10 22:42:09 +08:00			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* * Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* * Neither the name of NVIDIA CORPORATION nor the names of its`
			`* contributors may be used to endorse or promote products derived`
			`* from this software without specific prior written permission.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
			`* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*/`


			`// Implements BGR 3 progressive planars frames batch resize`

			`#include <cuda.h>`
			`#include <cuda_runtime.h>`
			`#include "resize_convert.h"`

			`__global__ void resizeBGRplanarBatchKernel(cudaTextureObject_t texSrc,`
			`float *pDst, int nDstPitch, int nDstHeight, int nSrcHeight,`
			`int batch, float scaleX, float scaleY,`
			`int cropX, int cropY, int cropW, int cropH) {`
			`int x = threadIdx.x + blockIdx.x * blockDim.x;`
			`int y = threadIdx.y + blockIdx.y * blockDim.y;`

			`if (x >= (int)(cropW/scaleX) \|\| y >= (int)(cropH/scaleY))`
			`return;`

			`int frameSize = nDstPitch*nDstHeight;`
			`float *p = NULL;`
			`for (int i = blockIdx.z; i < batch; i += gridDim.z) {`
			`#pragma unroll`
			`for (int channel=0; channel < 3; channel++){`
			`p = pDst + i * 3 * frameSize + y * nDstPitch + x + channel * frameSize;`
			`*p = tex2D<float>(texSrc, x * scaleX + cropX,`
			`((3 * i + channel) * nSrcHeight + y * scaleY + cropY));`
			`}`
			`}`
			`}`


			`static void resizeBGRplanarBatchCore(`
			`float *dpSrc, int nSrcPitch, int nSrcWidth, int nSrcHeight,`
			`float *dpDst, int nDstPitch, int nDstWidth, int nDstHeight,`
			`int nBatchSize, cudaStream_t stream, bool whSameResizeRatio,`
			`int cropX, int cropY, int cropW, int cropH) {`
			`cudaTextureObject_t texSrc[2];`
			`int nTiles = 1, h, iTile;`

			`h = nSrcHeight * 3 * nBatchSize;`
			`while ((h + nTiles - 1) / nTiles > 65536)`
			`nTiles++;`

			`if (nTiles > 2)`
			`return;`

			`int batchTile = nBatchSize / nTiles;`
			`int batchTileLast = nBatchSize - batchTile * (nTiles-1);`

			`for (iTile = 0; iTile < nTiles; ++iTile) {`
			`int bs = (iTile == nTiles - 1) ? batchTileLast : batchTile;`
			`float *dpSrcNew = dpSrc +`
			`iTile * (batchTile * 3 * nSrcHeight * nSrcPitch);`

			`cudaResourceDesc resDesc = {};`
			`resDesc.resType = cudaResourceTypePitch2D;`
			`resDesc.res.pitch2D.devPtr = dpSrcNew;`
			`resDesc.res.pitch2D.desc = cudaCreateChannelDesc<float>();`
			`resDesc.res.pitch2D.width = nSrcWidth;`
			`resDesc.res.pitch2D.height = bs * 3 * nSrcHeight;`
			`resDesc.res.pitch2D.pitchInBytes = nSrcPitch * sizeof(float);`
			`cudaTextureDesc texDesc = {};`
			`texDesc.filterMode = cudaFilterModeLinear;`
			`texDesc.readMode = cudaReadModeElementType;`

			`checkCudaErrors(cudaCreateTextureObject(&texSrc[iTile], &resDesc, &texDesc, NULL));`
			`float *dpDstNew = dpDst +`
			`iTile * (batchTile * 3 * nDstHeight * nDstPitch);`

			`if(cropW == 0 \|\| cropH == 0) {`
			`cropX = 0;`
			`cropY = 0;`
			`cropW = nSrcWidth;`
			`cropH = nSrcHeight;`
			`}`

			`float scaleX = (cropW*1.0f / nDstWidth);`
			`float scaleY = (cropH*1.0f / nDstHeight);`

			`if(whSameResizeRatio == true)`
			`scaleX = scaleY = scaleX > scaleY ? scaleX : scaleY;`
			`dim3 block(32, 32, 1);`

			`size_t blockDimZ = bs;`
			`// Restricting blocks in Z-dim till 32 to not launch too many blocks`
			`blockDimZ = (blockDimZ > 32) ? 32 : blockDimZ;`
			`dim3 grid((cropW*1.0f/scaleX + block.x - 1) / block.x,`
			`(cropH*1.0f/scaleY + block.y - 1) / block.y, blockDimZ);`

			`resizeBGRplanarBatchKernel<<<grid, block, 0, stream>>>`
			`(texSrc[iTile], dpDstNew, nDstPitch, nDstHeight, nSrcHeight,`
			`bs, scaleX, scaleY, cropX, cropY, cropW, cropH);`

			`}`

			`for (iTile = 0; iTile < nTiles; ++iTile)`
			`checkCudaErrors(cudaDestroyTextureObject(texSrc[iTile]));`
			`}`

			`void resizeBGRplanarBatch(`
			`float *dpSrc, int nSrcPitch, int nSrcWidth, int nSrcHeight,`
			`float *dpDst, int nDstPitch, int nDstWidth, int nDstHeight,`
			`int nBatchSize, cudaStream_t stream,`
			`int cropX, int cropY, int cropW, int cropH, bool whSameResizeRatio) {`
			`resizeBGRplanarBatchCore(dpSrc, nSrcPitch, nSrcWidth, nSrcHeight,`
			`dpDst, nDstPitch, nDstWidth, nDstHeight, nBatchSize, stream,`
			`whSameResizeRatio, cropX, cropY, cropW, cropH);`
			`}`