cuda-samples/Samples/5_Domain_Specific/HSOpticalFlow/flowCUDA.cu
2022-01-13 11:35:24 +05:30

217 lines
7.8 KiB
Plaintext

/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "common.h"
// include kernels
#include "downscaleKernel.cuh"
#include "upscaleKernel.cuh"
#include "warpingKernel.cuh"
#include "derivativesKernel.cuh"
#include "solverKernel.cuh"
#include "addKernel.cuh"
///////////////////////////////////////////////////////////////////////////////
/// \brief method logic
///
/// handles memory allocations, control flow
/// \param[in] I0 source image
/// \param[in] I1 tracked image
/// \param[in] width images width
/// \param[in] height images height
/// \param[in] stride images stride
/// \param[in] alpha degree of displacement field smoothness
/// \param[in] nLevels number of levels in a pyramid
/// \param[in] nWarpIters number of warping iterations per pyramid level
/// \param[in] nSolverIters number of solver iterations (Jacobi iterations)
/// \param[out] u horizontal displacement
/// \param[out] v vertical displacement
///////////////////////////////////////////////////////////////////////////////
void ComputeFlowCUDA(const float *I0, const float *I1, int width, int height,
int stride, float alpha, int nLevels, int nWarpIters,
int nSolverIters, float *u, float *v) {
printf("Computing optical flow on GPU...\n");
// pI0 and pI1 will hold device pointers
const float **pI0 = new const float *[nLevels];
const float **pI1 = new const float *[nLevels];
int *pW = new int[nLevels];
int *pH = new int[nLevels];
int *pS = new int[nLevels];
// device memory pointers
float *d_tmp;
float *d_du0;
float *d_dv0;
float *d_du1;
float *d_dv1;
float *d_Ix;
float *d_Iy;
float *d_Iz;
float *d_u;
float *d_v;
float *d_nu;
float *d_nv;
const int dataSize = stride * height * sizeof(float);
checkCudaErrors(cudaMalloc(&d_tmp, dataSize));
checkCudaErrors(cudaMalloc(&d_du0, dataSize));
checkCudaErrors(cudaMalloc(&d_dv0, dataSize));
checkCudaErrors(cudaMalloc(&d_du1, dataSize));
checkCudaErrors(cudaMalloc(&d_dv1, dataSize));
checkCudaErrors(cudaMalloc(&d_Ix, dataSize));
checkCudaErrors(cudaMalloc(&d_Iy, dataSize));
checkCudaErrors(cudaMalloc(&d_Iz, dataSize));
checkCudaErrors(cudaMalloc(&d_u, dataSize));
checkCudaErrors(cudaMalloc(&d_v, dataSize));
checkCudaErrors(cudaMalloc(&d_nu, dataSize));
checkCudaErrors(cudaMalloc(&d_nv, dataSize));
// prepare pyramid
int currentLevel = nLevels - 1;
// allocate GPU memory for input images
checkCudaErrors(cudaMalloc(pI0 + currentLevel, dataSize));
checkCudaErrors(cudaMalloc(pI1 + currentLevel, dataSize));
checkCudaErrors(cudaMemcpy((void *)pI0[currentLevel], I0, dataSize,
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy((void *)pI1[currentLevel], I1, dataSize,
cudaMemcpyHostToDevice));
pW[currentLevel] = width;
pH[currentLevel] = height;
pS[currentLevel] = stride;
for (; currentLevel > 0; --currentLevel) {
int nw = pW[currentLevel] / 2;
int nh = pH[currentLevel] / 2;
int ns = iAlignUp(nw);
checkCudaErrors(
cudaMalloc(pI0 + currentLevel - 1, ns * nh * sizeof(float)));
checkCudaErrors(
cudaMalloc(pI1 + currentLevel - 1, ns * nh * sizeof(float)));
Downscale(pI0[currentLevel], pW[currentLevel], pH[currentLevel],
pS[currentLevel], nw, nh, ns, (float *)pI0[currentLevel - 1]);
Downscale(pI1[currentLevel], pW[currentLevel], pH[currentLevel],
pS[currentLevel], nw, nh, ns, (float *)pI1[currentLevel - 1]);
pW[currentLevel - 1] = nw;
pH[currentLevel - 1] = nh;
pS[currentLevel - 1] = ns;
}
checkCudaErrors(cudaMemset(d_u, 0, stride * height * sizeof(float)));
checkCudaErrors(cudaMemset(d_v, 0, stride * height * sizeof(float)));
// compute flow
for (; currentLevel < nLevels; ++currentLevel) {
for (int warpIter = 0; warpIter < nWarpIters; ++warpIter) {
checkCudaErrors(cudaMemset(d_du0, 0, dataSize));
checkCudaErrors(cudaMemset(d_dv0, 0, dataSize));
checkCudaErrors(cudaMemset(d_du1, 0, dataSize));
checkCudaErrors(cudaMemset(d_dv1, 0, dataSize));
// on current level we compute optical flow
// between frame 0 and warped frame 1
WarpImage(pI1[currentLevel], pW[currentLevel], pH[currentLevel],
pS[currentLevel], d_u, d_v, d_tmp);
ComputeDerivatives(pI0[currentLevel], d_tmp, pW[currentLevel],
pH[currentLevel], pS[currentLevel], d_Ix, d_Iy, d_Iz);
for (int iter = 0; iter < nSolverIters; ++iter) {
SolveForUpdate(d_du0, d_dv0, d_Ix, d_Iy, d_Iz, pW[currentLevel],
pH[currentLevel], pS[currentLevel], alpha, d_du1, d_dv1);
Swap(d_du0, d_du1);
Swap(d_dv0, d_dv1);
}
// update u, v
Add(d_u, d_du0, pH[currentLevel] * pS[currentLevel], d_u);
Add(d_v, d_dv0, pH[currentLevel] * pS[currentLevel], d_v);
}
if (currentLevel != nLevels - 1) {
// prolongate solution
float scaleX = (float)pW[currentLevel + 1] / (float)pW[currentLevel];
Upscale(d_u, pW[currentLevel], pH[currentLevel], pS[currentLevel],
pW[currentLevel + 1], pH[currentLevel + 1], pS[currentLevel + 1],
scaleX, d_nu);
float scaleY = (float)pH[currentLevel + 1] / (float)pH[currentLevel];
Upscale(d_v, pW[currentLevel], pH[currentLevel], pS[currentLevel],
pW[currentLevel + 1], pH[currentLevel + 1], pS[currentLevel + 1],
scaleY, d_nv);
Swap(d_u, d_nu);
Swap(d_v, d_nv);
}
}
checkCudaErrors(cudaMemcpy(u, d_u, dataSize, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(v, d_v, dataSize, cudaMemcpyDeviceToHost));
// cleanup
for (int i = 0; i < nLevels; ++i) {
checkCudaErrors(cudaFree((void *)pI0[i]));
checkCudaErrors(cudaFree((void *)pI1[i]));
}
delete[] pI0;
delete[] pI1;
delete[] pW;
delete[] pH;
delete[] pS;
checkCudaErrors(cudaFree(d_tmp));
checkCudaErrors(cudaFree(d_du0));
checkCudaErrors(cudaFree(d_dv0));
checkCudaErrors(cudaFree(d_du1));
checkCudaErrors(cudaFree(d_dv1));
checkCudaErrors(cudaFree(d_Ix));
checkCudaErrors(cudaFree(d_Iy));
checkCudaErrors(cudaFree(d_Iz));
checkCudaErrors(cudaFree(d_nu));
checkCudaErrors(cudaFree(d_nv));
checkCudaErrors(cudaFree(d_u));
checkCudaErrors(cudaFree(d_v));
}