mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2025-01-19 22:05:49 +08:00
217 lines
7.8 KiB
Plaintext
217 lines
7.8 KiB
Plaintext
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
* contributors may be used to endorse or promote products derived
|
|
* from this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#include "common.h"
|
|
|
|
// include kernels
|
|
#include "downscaleKernel.cuh"
|
|
#include "upscaleKernel.cuh"
|
|
#include "warpingKernel.cuh"
|
|
#include "derivativesKernel.cuh"
|
|
#include "solverKernel.cuh"
|
|
#include "addKernel.cuh"
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
/// \brief method logic
|
|
///
|
|
/// handles memory allocations, control flow
|
|
/// \param[in] I0 source image
|
|
/// \param[in] I1 tracked image
|
|
/// \param[in] width images width
|
|
/// \param[in] height images height
|
|
/// \param[in] stride images stride
|
|
/// \param[in] alpha degree of displacement field smoothness
|
|
/// \param[in] nLevels number of levels in a pyramid
|
|
/// \param[in] nWarpIters number of warping iterations per pyramid level
|
|
/// \param[in] nSolverIters number of solver iterations (Jacobi iterations)
|
|
/// \param[out] u horizontal displacement
|
|
/// \param[out] v vertical displacement
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
void ComputeFlowCUDA(const float *I0, const float *I1, int width, int height,
|
|
int stride, float alpha, int nLevels, int nWarpIters,
|
|
int nSolverIters, float *u, float *v) {
|
|
printf("Computing optical flow on GPU...\n");
|
|
|
|
// pI0 and pI1 will hold device pointers
|
|
const float **pI0 = new const float *[nLevels];
|
|
const float **pI1 = new const float *[nLevels];
|
|
|
|
int *pW = new int[nLevels];
|
|
int *pH = new int[nLevels];
|
|
int *pS = new int[nLevels];
|
|
|
|
// device memory pointers
|
|
float *d_tmp;
|
|
float *d_du0;
|
|
float *d_dv0;
|
|
float *d_du1;
|
|
float *d_dv1;
|
|
|
|
float *d_Ix;
|
|
float *d_Iy;
|
|
float *d_Iz;
|
|
|
|
float *d_u;
|
|
float *d_v;
|
|
float *d_nu;
|
|
float *d_nv;
|
|
|
|
const int dataSize = stride * height * sizeof(float);
|
|
|
|
checkCudaErrors(cudaMalloc(&d_tmp, dataSize));
|
|
checkCudaErrors(cudaMalloc(&d_du0, dataSize));
|
|
checkCudaErrors(cudaMalloc(&d_dv0, dataSize));
|
|
checkCudaErrors(cudaMalloc(&d_du1, dataSize));
|
|
checkCudaErrors(cudaMalloc(&d_dv1, dataSize));
|
|
|
|
checkCudaErrors(cudaMalloc(&d_Ix, dataSize));
|
|
checkCudaErrors(cudaMalloc(&d_Iy, dataSize));
|
|
checkCudaErrors(cudaMalloc(&d_Iz, dataSize));
|
|
|
|
checkCudaErrors(cudaMalloc(&d_u, dataSize));
|
|
checkCudaErrors(cudaMalloc(&d_v, dataSize));
|
|
checkCudaErrors(cudaMalloc(&d_nu, dataSize));
|
|
checkCudaErrors(cudaMalloc(&d_nv, dataSize));
|
|
|
|
// prepare pyramid
|
|
|
|
int currentLevel = nLevels - 1;
|
|
// allocate GPU memory for input images
|
|
checkCudaErrors(cudaMalloc(pI0 + currentLevel, dataSize));
|
|
checkCudaErrors(cudaMalloc(pI1 + currentLevel, dataSize));
|
|
|
|
checkCudaErrors(cudaMemcpy((void *)pI0[currentLevel], I0, dataSize,
|
|
cudaMemcpyHostToDevice));
|
|
checkCudaErrors(cudaMemcpy((void *)pI1[currentLevel], I1, dataSize,
|
|
cudaMemcpyHostToDevice));
|
|
|
|
pW[currentLevel] = width;
|
|
pH[currentLevel] = height;
|
|
pS[currentLevel] = stride;
|
|
|
|
for (; currentLevel > 0; --currentLevel) {
|
|
int nw = pW[currentLevel] / 2;
|
|
int nh = pH[currentLevel] / 2;
|
|
int ns = iAlignUp(nw);
|
|
|
|
checkCudaErrors(
|
|
cudaMalloc(pI0 + currentLevel - 1, ns * nh * sizeof(float)));
|
|
checkCudaErrors(
|
|
cudaMalloc(pI1 + currentLevel - 1, ns * nh * sizeof(float)));
|
|
|
|
Downscale(pI0[currentLevel], pW[currentLevel], pH[currentLevel],
|
|
pS[currentLevel], nw, nh, ns, (float *)pI0[currentLevel - 1]);
|
|
|
|
Downscale(pI1[currentLevel], pW[currentLevel], pH[currentLevel],
|
|
pS[currentLevel], nw, nh, ns, (float *)pI1[currentLevel - 1]);
|
|
|
|
pW[currentLevel - 1] = nw;
|
|
pH[currentLevel - 1] = nh;
|
|
pS[currentLevel - 1] = ns;
|
|
}
|
|
|
|
checkCudaErrors(cudaMemset(d_u, 0, stride * height * sizeof(float)));
|
|
checkCudaErrors(cudaMemset(d_v, 0, stride * height * sizeof(float)));
|
|
|
|
// compute flow
|
|
for (; currentLevel < nLevels; ++currentLevel) {
|
|
for (int warpIter = 0; warpIter < nWarpIters; ++warpIter) {
|
|
checkCudaErrors(cudaMemset(d_du0, 0, dataSize));
|
|
checkCudaErrors(cudaMemset(d_dv0, 0, dataSize));
|
|
|
|
checkCudaErrors(cudaMemset(d_du1, 0, dataSize));
|
|
checkCudaErrors(cudaMemset(d_dv1, 0, dataSize));
|
|
|
|
// on current level we compute optical flow
|
|
// between frame 0 and warped frame 1
|
|
WarpImage(pI1[currentLevel], pW[currentLevel], pH[currentLevel],
|
|
pS[currentLevel], d_u, d_v, d_tmp);
|
|
|
|
ComputeDerivatives(pI0[currentLevel], d_tmp, pW[currentLevel],
|
|
pH[currentLevel], pS[currentLevel], d_Ix, d_Iy, d_Iz);
|
|
|
|
for (int iter = 0; iter < nSolverIters; ++iter) {
|
|
SolveForUpdate(d_du0, d_dv0, d_Ix, d_Iy, d_Iz, pW[currentLevel],
|
|
pH[currentLevel], pS[currentLevel], alpha, d_du1, d_dv1);
|
|
|
|
Swap(d_du0, d_du1);
|
|
Swap(d_dv0, d_dv1);
|
|
}
|
|
|
|
// update u, v
|
|
Add(d_u, d_du0, pH[currentLevel] * pS[currentLevel], d_u);
|
|
Add(d_v, d_dv0, pH[currentLevel] * pS[currentLevel], d_v);
|
|
}
|
|
|
|
if (currentLevel != nLevels - 1) {
|
|
// prolongate solution
|
|
float scaleX = (float)pW[currentLevel + 1] / (float)pW[currentLevel];
|
|
|
|
Upscale(d_u, pW[currentLevel], pH[currentLevel], pS[currentLevel],
|
|
pW[currentLevel + 1], pH[currentLevel + 1], pS[currentLevel + 1],
|
|
scaleX, d_nu);
|
|
|
|
float scaleY = (float)pH[currentLevel + 1] / (float)pH[currentLevel];
|
|
|
|
Upscale(d_v, pW[currentLevel], pH[currentLevel], pS[currentLevel],
|
|
pW[currentLevel + 1], pH[currentLevel + 1], pS[currentLevel + 1],
|
|
scaleY, d_nv);
|
|
|
|
Swap(d_u, d_nu);
|
|
Swap(d_v, d_nv);
|
|
}
|
|
}
|
|
|
|
checkCudaErrors(cudaMemcpy(u, d_u, dataSize, cudaMemcpyDeviceToHost));
|
|
checkCudaErrors(cudaMemcpy(v, d_v, dataSize, cudaMemcpyDeviceToHost));
|
|
|
|
// cleanup
|
|
for (int i = 0; i < nLevels; ++i) {
|
|
checkCudaErrors(cudaFree((void *)pI0[i]));
|
|
checkCudaErrors(cudaFree((void *)pI1[i]));
|
|
}
|
|
|
|
delete[] pI0;
|
|
delete[] pI1;
|
|
delete[] pW;
|
|
delete[] pH;
|
|
delete[] pS;
|
|
|
|
checkCudaErrors(cudaFree(d_tmp));
|
|
checkCudaErrors(cudaFree(d_du0));
|
|
checkCudaErrors(cudaFree(d_dv0));
|
|
checkCudaErrors(cudaFree(d_du1));
|
|
checkCudaErrors(cudaFree(d_dv1));
|
|
checkCudaErrors(cudaFree(d_Ix));
|
|
checkCudaErrors(cudaFree(d_Iy));
|
|
checkCudaErrors(cudaFree(d_Iz));
|
|
checkCudaErrors(cudaFree(d_nu));
|
|
checkCudaErrors(cudaFree(d_nv));
|
|
checkCudaErrors(cudaFree(d_u));
|
|
checkCudaErrors(cudaFree(d_v));
|
|
}
|