// Mirror of https://github.com/NVIDIA/cuda-samples.git (synced 2025-07-02 05:10:31 +08:00)
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
* contributors may be used to endorse or promote products derived
|
|
* from this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
|
|
// Implements batched resizing of progressive BGR frames stored as 3 separate planes (planar layout)
|
|
|
|
#include <cstdio>

#include <cuda.h>
#include <cuda_runtime.h>

#include "resize_convert.h"
|
|
|
|
/**
 * Resizes (and optionally crops) a batch of planar 3-channel float images,
 * reading the source through a texture object and writing one float per
 * destination pixel and channel.
 *
 * Source layout (as addressed by the texture fetch): the planes of all images
 * are stacked vertically, so channel `c` of image `i` starts at texture row
 * `(3 * i + c) * nSrcHeight`.
 *
 * Destination layout: plane `3 * i + c` starts at
 * `pDst + (3 * i + c) * nDstPitch * nDstHeight`; rows are `nDstPitch` floats
 * apart.
 *
 * Launch layout: a 2D block/grid over x/y covers the output pixels; the z
 * dimension of the grid is spread over the batch, each z-block handling images
 * `blockIdx.z, blockIdx.z + gridDim.z, ...`. No shared memory is used.
 *
 * Only pixels with `x < (int)(cropW / scaleX)` and `y < (int)(cropH / scaleY)`
 * are written; everything else in the destination is left untouched.
 *
 * @param texSrc      texture object over the stacked source planes
 * @param pDst        destination buffer (device memory)
 * @param nDstPitch   destination row pitch, in floats
 * @param nDstHeight  destination plane height, in rows
 * @param nSrcHeight  source plane height, in rows
 * @param batch       number of images covered by `texSrc` / `pDst`
 * @param scaleX      source pixels per destination pixel, horizontally
 * @param scaleY      source pixels per destination pixel, vertically
 * @param cropX       left edge of the source crop rectangle
 * @param cropY       top edge of the source crop rectangle
 * @param cropW       width of the source crop rectangle
 * @param cropH       height of the source crop rectangle
 */
__global__ void resizeBGRplanarBatchKernel(cudaTextureObject_t texSrc,
                                           float              *pDst,
                                           int                 nDstPitch,
                                           int                 nDstHeight,
                                           int                 nSrcHeight,
                                           int                 batch,
                                           float               scaleX,
                                           float               scaleY,
                                           int                 cropX,
                                           int                 cropY,
                                           int                 cropW,
                                           int                 cropH)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    // Threads outside the resized region have nothing to write.
    if (x >= (int)(cropW / scaleX) || y >= (int)(cropH / scaleY))
        return;

    // 64-bit offsets: (3 * batch) planes of nDstPitch * nDstHeight floats can
    // exceed the range of a 32-bit int for large batches or large outputs.
    const long long frameSize   = (long long)nDstPitch * nDstHeight;
    const long long pixelOffset = (long long)y * nDstPitch + x;

    // The horizontal sampling position is the same for every image and channel.
    // NOTE(review): coordinates are passed without a +0.5f texel-centre offset;
    // confirm this matches the filter mode the texture object was created with.
    const float srcX = x * scaleX + cropX;

    for (int i = blockIdx.z; i < batch; i += gridDim.z) {
#pragma unroll
        for (int channel = 0; channel < 3; channel++) {
            const int plane = 3 * i + channel;
            pDst[plane * frameSize + pixelOffset] =
                tex2D<float>(texSrc, srcX, (plane * nSrcHeight + y * scaleY + cropY));
        }
    }
}
|
|
|
|
|
|
/**
 * Host-side driver for resizeBGRplanarBatchKernel.
 *
 * Binds the source batch to one or two pitch-2D texture objects ("tiles") with
 * linear filtering, launches the resize kernel for each tile on `stream`, and
 * destroys the texture objects again. Tiling exists because all planes of a
 * tile are stacked into a single texture whose height is kept within
 * kMaxTextureRows.
 *
 * The call does nothing when the batch or any image dimension is non-positive.
 * When the batch cannot be covered by kMaxTiles tiles a diagnostic is printed
 * to stderr and nothing is resized.
 *
 * The function does not synchronize with `stream`; kernels are only enqueued.
 *
 * @param dpSrc              source planes (device memory), images and channels stacked row-wise
 * @param nSrcPitch          source row pitch, in floats
 * @param nSrcWidth          source width, in pixels
 * @param nSrcHeight         source plane height, in rows
 * @param dpDst              destination planes (device memory)
 * @param nDstPitch          destination row pitch, in floats
 * @param nDstWidth          destination width, in pixels
 * @param nDstHeight         destination plane height, in rows
 * @param nBatchSize         number of images in the batch
 * @param stream             stream the kernels are launched on
 * @param whSameResizeRatio  when true, the larger of the two scale factors is
 *                           used for both axes, so the resized region may be
 *                           smaller than the destination in one dimension
 * @param cropX, cropY       top-left corner of the source crop rectangle
 * @param cropW, cropH       size of the source crop rectangle; if either is 0
 *                           the whole source frame is used
 */
static void resizeBGRplanarBatchCore(float       *dpSrc,
                                     int          nSrcPitch,
                                     int          nSrcWidth,
                                     int          nSrcHeight,
                                     float       *dpDst,
                                     int          nDstPitch,
                                     int          nDstWidth,
                                     int          nDstHeight,
                                     int          nBatchSize,
                                     cudaStream_t stream,
                                     bool         whSameResizeRatio,
                                     int          cropX,
                                     int          cropY,
                                     int          cropW,
                                     int          cropH)
{
    // Row budget for a single stacked texture and the number of textures this
    // routine is prepared to create per call.
    // NOTE(review): the actual device limit is reported by
    // cudaDeviceProp::maxTexture2DLinear; confirm 65536 is valid for the target GPUs.
    const int kMaxTextureRows = 65536;
    const int kMaxTiles       = 2;

    // Empty batch or degenerate geometry: nothing to resize.
    if (nBatchSize <= 0 || nSrcWidth <= 0 || nSrcHeight <= 0 || nDstWidth <= 0 || nDstHeight <= 0)
        return;

    // Tiles are split on image boundaries, so size them in whole images rather
    // than in rows; otherwise an uneven split can push one tile over the budget.
    const long long rowsPerImage     = 3LL * nSrcHeight;
    const long long maxImagesPerTile = kMaxTextureRows / rowsPerImage;
    const long long tilesNeeded =
        (maxImagesPerTile > 0) ? (nBatchSize + maxImagesPerTile - 1) / maxImagesPerTile : kMaxTiles + 1LL;

    if (tilesNeeded > kMaxTiles) {
        fprintf(stderr,
                "resizeBGRplanarBatch: batch of %d frames with source height %d does not fit into "
                "%d textures of at most %d rows; nothing was resized\n",
                nBatchSize,
                nSrcHeight,
                kMaxTiles,
                kMaxTextureRows);
        return;
    }

    const int nTiles    = (int)tilesNeeded;
    const int batchTile = (nBatchSize + nTiles - 1) / nTiles; // images per tile (the last tile may hold fewer)

    // A zero-sized crop rectangle selects the whole source frame.
    if (cropW == 0 || cropH == 0) {
        cropX = 0;
        cropY = 0;
        cropW = nSrcWidth;
        cropH = nSrcHeight;
    }

    float scaleX = (cropW * 1.0f / nDstWidth);
    float scaleY = (cropH * 1.0f / nDstHeight);

    // Keep the aspect ratio by shrinking both axes with the larger factor.
    if (whSameResizeRatio)
        scaleX = scaleY = scaleX > scaleY ? scaleX : scaleY;

    // Size of the region the kernel actually writes (same expression as its bounds check).
    const int outW = (int)(cropW / scaleX);
    const int outH = (int)(cropH / scaleY);
    if (outW <= 0 || outH <= 0)
        return; // no destination pixel would be written; avoid a zero-sized grid

    const dim3 block(32, 32, 1);

    cudaTextureObject_t texSrc[kMaxTiles];

    for (int iTile = 0; iTile < nTiles; ++iTile) {
        const int firstImage = iTile * batchTile;
        const int remaining  = nBatchSize - firstImage;
        const int bs         = (remaining < batchTile) ? remaining : batchTile;

        // 64-bit offsets: image index * planes * rows * pitch can exceed INT_MAX.
        float *dpSrcNew = dpSrc + (size_t)firstImage * 3 * nSrcHeight * nSrcPitch;
        float *dpDstNew = dpDst + (size_t)firstImage * 3 * nDstHeight * nDstPitch;

        // All planes of the tile are exposed as one tall pitch-2D float texture.
        cudaResourceDesc resDesc         = {};
        resDesc.resType                  = cudaResourceTypePitch2D;
        resDesc.res.pitch2D.devPtr       = dpSrcNew;
        resDesc.res.pitch2D.desc         = cudaCreateChannelDesc<float>();
        resDesc.res.pitch2D.width        = nSrcWidth;
        resDesc.res.pitch2D.height       = (size_t)bs * 3 * nSrcHeight;
        resDesc.res.pitch2D.pitchInBytes = nSrcPitch * sizeof(float);

        cudaTextureDesc texDesc = {};
        texDesc.filterMode      = cudaFilterModeLinear;
        texDesc.readMode        = cudaReadModeElementType;

        checkCudaErrors(cudaCreateTextureObject(&texSrc[iTile], &resDesc, &texDesc, NULL));

        // Cap the grid's Z extent at 32 so large batches do not launch too many
        // blocks; the kernel loops over the remaining images.
        const unsigned int gridZ = (bs > 32) ? 32u : (unsigned int)bs;
        const dim3         grid((outW + block.x - 1) / block.x, (outH + block.y - 1) / block.y, gridZ);

        resizeBGRplanarBatchKernel<<<grid, block, 0, stream>>>(
            texSrc[iTile], dpDstNew, nDstPitch, nDstHeight, nSrcHeight, bs, scaleX, scaleY, cropX, cropY, cropW, cropH);
        // Launch-configuration errors are only visible through cudaGetLastError().
        checkCudaErrors(cudaGetLastError());
    }

    // NOTE(review): the texture objects are destroyed without waiting for the
    // kernels enqueued above, as in the original code; confirm this is safe for
    // asynchronous callers.
    for (int iTile = 0; iTile < nTiles; ++iTile)
        checkCudaErrors(cudaDestroyTextureObject(texSrc[iTile]));
}
|
|
|
|
/**
 * Public entry point for batched planar BGR resizing.
 *
 * Forwards every argument unchanged to resizeBGRplanarBatchCore. The only
 * difference between the two signatures is the position of
 * `whSameResizeRatio`: it comes last here, but precedes the crop rectangle in
 * the core routine.
 */
void resizeBGRplanarBatch(float *dpSrc, int nSrcPitch, int nSrcWidth, int nSrcHeight,
                          float *dpDst, int nDstPitch, int nDstWidth, int nDstHeight,
                          int nBatchSize, cudaStream_t stream,
                          int cropX, int cropY, int cropW, int cropH,
                          bool whSameResizeRatio)
{
    resizeBGRplanarBatchCore(dpSrc, nSrcPitch, nSrcWidth, nSrcHeight,
                             dpDst, nDstPitch, nDstWidth, nDstHeight,
                             nBatchSize, stream,
                             whSameResizeRatio, // moved ahead of the crop rectangle
                             cropX, cropY, cropW, cropH);
}
|