/*
 * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "common.h"

// include kernels
#include "downscaleKernel.cuh"
#include "upscaleKernel.cuh"
#include "warpingKernel.cuh"
#include "derivativesKernel.cuh"
#include "solverKernel.cuh"
#include "addKernel.cuh"

///////////////////////////////////////////////////////////////////////////////
/// \brief method logic
///
/// handles memory allocations, control flow
/// \param[in]  I0           source image
/// \param[in]  I1           tracked image
/// \param[in]  width        images width
/// \param[in]  height       images height
/// \param[in]  stride       images stride
/// \param[in]  alpha        degree of displacement field smoothness
/// \param[in]  nLevels      number of levels in a pyramid
/// \param[in]  nWarpIters   number of warping iterations per pyramid level
/// \param[in]  nSolverIters number of solver iterations (Jacobi iterations)
/// \param[out] u            horizontal displacement
/// \param[out] v            vertical displacement
///////////////////////////////////////////////////////////////////////////////
void ComputeFlowCUDA(const float *I0, const float *I1, int width, int height,
                     int stride, float alpha, int nLevels, int nWarpIters,
                     int nSolverIters, float *u, float *v) {
  printf("Computing optical flow on GPU...\n");

  // pI0 and pI1 will hold device pointers
  const float **pI0 = new const float *[nLevels];
  const float **pI1 = new const float *[nLevels];

  int *pW = new int[nLevels];
  int *pH = new int[nLevels];
  int *pS = new int[nLevels];

  // device memory pointers
  float *d_tmp;
  float *d_du0;
  float *d_dv0;
  float *d_du1;
  float *d_dv1;

  float *d_Ix;
  float *d_Iy;
  float *d_Iz;

  float *d_u;
  float *d_v;
  float *d_nu;
  float *d_nv;

  const int dataSize = stride * height * sizeof(float);

  checkCudaErrors(cudaMalloc(&d_tmp, dataSize));
  checkCudaErrors(cudaMalloc(&d_du0, dataSize));
  checkCudaErrors(cudaMalloc(&d_dv0, dataSize));
  checkCudaErrors(cudaMalloc(&d_du1, dataSize));
  checkCudaErrors(cudaMalloc(&d_dv1, dataSize));

  checkCudaErrors(cudaMalloc(&d_Ix, dataSize));
  checkCudaErrors(cudaMalloc(&d_Iy, dataSize));
  checkCudaErrors(cudaMalloc(&d_Iz, dataSize));
  checkCudaErrors(cudaMalloc(&d_u, dataSize));
  checkCudaErrors(cudaMalloc(&d_v, dataSize));
  checkCudaErrors(cudaMalloc(&d_nu, dataSize));
  checkCudaErrors(cudaMalloc(&d_nv, dataSize));

  // prepare pyramid
  int currentLevel = nLevels - 1;

  // allocate GPU memory for input images
  checkCudaErrors(cudaMalloc(pI0 + currentLevel, dataSize));
  checkCudaErrors(cudaMalloc(pI1 + currentLevel, dataSize));

  checkCudaErrors(cudaMemcpy((void *)pI0[currentLevel], I0, dataSize,
                             cudaMemcpyHostToDevice));
  checkCudaErrors(cudaMemcpy((void *)pI1[currentLevel], I1, dataSize,
                             cudaMemcpyHostToDevice));

  pW[currentLevel] = width;
  pH[currentLevel] = height;
  pS[currentLevel] = stride;

  // downscale both images level by level; level 0 ends up the coarsest
  for (; currentLevel > 0; --currentLevel) {
    int nw = pW[currentLevel] / 2;
    int nh = pH[currentLevel] / 2;
    int ns = iAlignUp(nw);

    checkCudaErrors(
        cudaMalloc(pI0 + currentLevel - 1, ns * nh * sizeof(float)));
    checkCudaErrors(
        cudaMalloc(pI1 + currentLevel - 1, ns * nh * sizeof(float)));

    Downscale(pI0[currentLevel], pW[currentLevel], pH[currentLevel],
              pS[currentLevel], nw, nh, ns, (float *)pI0[currentLevel - 1]);

    Downscale(pI1[currentLevel], pW[currentLevel], pH[currentLevel],
              pS[currentLevel], nw, nh, ns, (float *)pI1[currentLevel - 1]);

    pW[currentLevel - 1] = nw;
    pH[currentLevel - 1] = nh;
    pS[currentLevel - 1] = ns;
  }

  checkCudaErrors(cudaMemset(d_u, 0, stride * height * sizeof(float)));
  checkCudaErrors(cudaMemset(d_v, 0, stride * height * sizeof(float)));

  // compute flow, proceeding coarse-to-fine through the pyramid
  for (; currentLevel < nLevels; ++currentLevel) {
    for (int warpIter = 0; warpIter < nWarpIters; ++warpIter) {
      checkCudaErrors(cudaMemset(d_du0, 0, dataSize));
      checkCudaErrors(cudaMemset(d_dv0, 0, dataSize));

      checkCudaErrors(cudaMemset(d_du1, 0, dataSize));
      checkCudaErrors(cudaMemset(d_dv1, 0, dataSize));

      // on current level we compute optical flow
      // between frame 0 and warped frame 1
      WarpImage(pI1[currentLevel], pW[currentLevel], pH[currentLevel],
                pS[currentLevel], d_u, d_v, d_tmp);

      ComputeDerivatives(pI0[currentLevel], d_tmp, pW[currentLevel],
                         pH[currentLevel], pS[currentLevel], d_Ix, d_Iy, d_Iz);

      // Jacobi iterations, ping-ponging between the (du0, dv0)
      // and (du1, dv1) buffers
      for (int iter = 0; iter < nSolverIters; ++iter) {
        SolveForUpdate(d_du0, d_dv0, d_Ix, d_Iy, d_Iz, pW[currentLevel],
                       pH[currentLevel], pS[currentLevel], alpha, d_du1,
                       d_dv1);
        Swap(d_du0, d_du1);
        Swap(d_dv0, d_dv1);
      }

      // update u, v
      Add(d_u, d_du0, pH[currentLevel] * pS[currentLevel], d_u);
      Add(d_v, d_dv0, pH[currentLevel] * pS[currentLevel], d_v);
    }

    if (currentLevel != nLevels - 1) {
      // prolongate solution to the next finer level,
      // scaling the displacements by the resolution ratio
      float scaleX = (float)pW[currentLevel + 1] / (float)pW[currentLevel];

      Upscale(d_u, pW[currentLevel], pH[currentLevel], pS[currentLevel],
              pW[currentLevel + 1], pH[currentLevel + 1], pS[currentLevel + 1],
              scaleX, d_nu);

      float scaleY = (float)pH[currentLevel + 1] / (float)pH[currentLevel];

      Upscale(d_v, pW[currentLevel], pH[currentLevel], pS[currentLevel],
              pW[currentLevel + 1], pH[currentLevel + 1], pS[currentLevel + 1],
              scaleY, d_nv);

      Swap(d_u, d_nu);
      Swap(d_v, d_nv);
    }
  }

  checkCudaErrors(cudaMemcpy(u, d_u, dataSize, cudaMemcpyDeviceToHost));
  checkCudaErrors(cudaMemcpy(v, d_v, dataSize, cudaMemcpyDeviceToHost));

  // cleanup
  for (int i = 0; i < nLevels; ++i) {
    checkCudaErrors(cudaFree((void *)pI0[i]));
    checkCudaErrors(cudaFree((void *)pI1[i]));
  }

  delete[] pI0;
  delete[] pI1;
  delete[] pW;
  delete[] pH;
  delete[] pS;

  checkCudaErrors(cudaFree(d_tmp));
  checkCudaErrors(cudaFree(d_du0));
  checkCudaErrors(cudaFree(d_dv0));
  checkCudaErrors(cudaFree(d_du1));
  checkCudaErrors(cudaFree(d_dv1));
  checkCudaErrors(cudaFree(d_Ix));
  checkCudaErrors(cudaFree(d_Iy));
  checkCudaErrors(cudaFree(d_Iz));
  checkCudaErrors(cudaFree(d_nu));
  checkCudaErrors(cudaFree(d_nv));
  checkCudaErrors(cudaFree(d_u));
  checkCudaErrors(cudaFree(d_v));
}
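
///////////////////////////////////////////////////////////////////////////////
// Usage sketch (illustrative only, kept in a comment so it does not collide
// with the sample's own host driver). It shows how a caller might invoke
// ComputeFlowCUDA. The image dimensions and the parameter values below are
// assumptions chosen for illustration, not values mandated by the sample;
// iAlignUp comes from common.h and pads the row stride for the kernels.
//
//   #include <vector>
//
//   const int width  = 640;
//   const int height = 480;
//   const int stride = iAlignUp(width);  // padded row pitch, in elements
//
//   // single-channel float frames, laid out with the padded stride
//   std::vector<float> I0(stride * height), I1(stride * height);
//   std::vector<float> u(stride * height), v(stride * height);  // output flow
//
//   // alpha = 0.2f smoothness weight, 5 pyramid levels,
//   // 3 warping iterations per level, 200 Jacobi iterations per warp
//   ComputeFlowCUDA(I0.data(), I1.data(), width, height, stride,
//                   0.2f, 5, 3, 200, u.data(), v.data());
///////////////////////////////////////////////////////////////////////////////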