cuda-samples/Samples/HSOpticalFlow/flowCUDA.cu

/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "common.h"

// include kernels
#include "downscaleKernel.cuh"
#include "upscaleKernel.cuh"
#include "warpingKernel.cuh"
#include "derivativesKernel.cuh"
#include "solverKernel.cuh"
#include "addKernel.cuh"

///////////////////////////////////////////////////////////////////////////////
/// \brief method logic
///
/// handles memory allocations, control flow
///
/// Builds an image pyramid on the device by repeatedly halving the two input
/// frames, then walks the pyramid from the coarsest level to the finest one.
/// On every level it performs nWarpIters passes of: warp I1 with the current
/// flow, compute derivatives against I0, run nSolverIters solver iterations,
/// and add the resulting update to the flow. Between levels the flow is
/// upscaled to the resolution of the next finer level. The final flow is
/// copied back into u and v.
///
/// CUDA runtime calls made here are wrapped in checkCudaErrors.
///
/// If nLevels, width, height or stride is not positive, a message is written
/// to stderr and the function returns without touching u and v.
/// If halving the image would yield an empty level before nLevels levels are
/// built, the pyramid is truncated at the last non-empty level and a message
/// is written to stderr.
///
/// \param[in]  I0           source image (host memory, stride * height floats)
/// \param[in]  I1           tracked image (host memory, stride * height floats)
/// \param[in]  width        images width
/// \param[in]  height       images height
/// \param[in]  stride       images stride
/// \param[in]  alpha        degree of displacement field smoothness
/// \param[in]  nLevels      number of levels in a pyramid
/// \param[in]  nWarpIters   number of warping iterations per pyramid level
/// \param[in]  nSolverIters number of solver iterations (Jacobi iterations)
/// \param[out] u            horizontal displacement
///                          (host memory, stride * height floats)
/// \param[out] v            vertical displacement
///                          (host memory, stride * height floats)
///////////////////////////////////////////////////////////////////////////////
void ComputeFlowCUDA(const float *I0, const float *I1, int width, int height,
                     int stride, float alpha, int nLevels, int nWarpIters,
                     int nSolverIters, float *u, float *v) {
  printf("Computing optical flow on GPU...\n");

  // Reject arguments that would make the level tables empty (and indexed at
  // -1 below) or the buffer size zero/negative.
  if (nLevels < 1 || width < 1 || height < 1 || stride < 1) {
    fprintf(stderr,
            "ComputeFlowCUDA: invalid arguments "
            "(width=%d, height=%d, stride=%d, nLevels=%d)\n",
            width, height, stride, nLevels);
    return;
  }

  // pI0 and pI1 will hold device pointers, one per pyramid level.
  // Value-initialized to nullptr so that levels which are never allocated
  // (truncated pyramid) can still be handed to cudaFree during cleanup.
  const float **pI0 = new const float *[nLevels]();
  const float **pI1 = new const float *[nLevels]();

  // per-level width, height and stride
  int *pW = new int[nLevels];
  int *pH = new int[nLevels];
  int *pS = new int[nLevels];

  // device memory pointers
  float *d_tmp;  // warped I1
  float *d_du0;  // flow update, solver input
  float *d_dv0;
  float *d_du1;  // flow update, solver output
  float *d_dv1;

  float *d_Ix;  // derivatives produced by ComputeDerivatives
  float *d_Iy;
  float *d_Iz;

  float *d_u;  // current flow
  float *d_v;
  float *d_nu;  // upscaled flow for the next level
  float *d_nv;

  // Size in bytes of one full-resolution buffer. Computed in size_t so that
  // stride * height cannot overflow int for large images.
  const size_t dataSize =
      static_cast<size_t>(stride) * static_cast<size_t>(height) * sizeof(float);

  // All working buffers are sized for the finest level and reused on the
  // coarser ones.
  checkCudaErrors(cudaMalloc(&d_tmp, dataSize));
  checkCudaErrors(cudaMalloc(&d_du0, dataSize));
  checkCudaErrors(cudaMalloc(&d_dv0, dataSize));
  checkCudaErrors(cudaMalloc(&d_du1, dataSize));
  checkCudaErrors(cudaMalloc(&d_dv1, dataSize));

  checkCudaErrors(cudaMalloc(&d_Ix, dataSize));
  checkCudaErrors(cudaMalloc(&d_Iy, dataSize));
  checkCudaErrors(cudaMalloc(&d_Iz, dataSize));

  checkCudaErrors(cudaMalloc(&d_u, dataSize));
  checkCudaErrors(cudaMalloc(&d_v, dataSize));
  checkCudaErrors(cudaMalloc(&d_nu, dataSize));
  checkCudaErrors(cudaMalloc(&d_nv, dataSize));

  // prepare pyramid

  // The finest level lives at index nLevels - 1; index 0 is the coarsest.
  int currentLevel = nLevels - 1;
  // allocate GPU memory for input images
  checkCudaErrors(cudaMalloc(pI0 + currentLevel, dataSize));
  checkCudaErrors(cudaMalloc(pI1 + currentLevel, dataSize));

  checkCudaErrors(cudaMemcpy((void *)pI0[currentLevel], I0, dataSize,
                             cudaMemcpyHostToDevice));
  checkCudaErrors(cudaMemcpy((void *)pI1[currentLevel], I1, dataSize,
                             cudaMemcpyHostToDevice));

  pW[currentLevel] = width;
  pH[currentLevel] = height;
  pS[currentLevel] = stride;

  for (; currentLevel > 0; --currentLevel) {
    const int nw = pW[currentLevel] / 2;
    const int nh = pH[currentLevel] / 2;

    // Stop before creating an empty level: it would mean a zero-byte
    // allocation and a division by zero when the upscale factors are
    // computed further down.
    if (nw < 1 || nh < 1) {
      fprintf(stderr,
              "ComputeFlowCUDA: image too small for %d pyramid levels, "
              "using %d\n",
              nLevels, nLevels - currentLevel);
      break;
    }

    const int ns = iAlignUp(nw);
    const size_t levelSize =
        static_cast<size_t>(ns) * static_cast<size_t>(nh) * sizeof(float);

    checkCudaErrors(cudaMalloc(pI0 + currentLevel - 1, levelSize));
    checkCudaErrors(cudaMalloc(pI1 + currentLevel - 1, levelSize));

    Downscale(pI0[currentLevel], pW[currentLevel], pH[currentLevel],
              pS[currentLevel], nw, nh, ns, (float *)pI0[currentLevel - 1]);

    Downscale(pI1[currentLevel], pW[currentLevel], pH[currentLevel],
              pS[currentLevel], nw, nh, ns, (float *)pI1[currentLevel - 1]);

    pW[currentLevel - 1] = nw;
    pH[currentLevel - 1] = nh;
    pS[currentLevel - 1] = ns;
  }
  // currentLevel is now the index of the coarsest level that was built.

  // start from a zero flow field
  checkCudaErrors(cudaMemset(d_u, 0, dataSize));
  checkCudaErrors(cudaMemset(d_v, 0, dataSize));

  // compute flow, coarsest level first
  for (; currentLevel < nLevels; ++currentLevel) {
    for (int warpIter = 0; warpIter < nWarpIters; ++warpIter) {
      // every warping pass starts from a zero update
      checkCudaErrors(cudaMemset(d_du0, 0, dataSize));
      checkCudaErrors(cudaMemset(d_dv0, 0, dataSize));

      checkCudaErrors(cudaMemset(d_du1, 0, dataSize));
      checkCudaErrors(cudaMemset(d_dv1, 0, dataSize));

      // on current level we compute optical flow
      // between frame 0 and warped frame 1
      WarpImage(pI1[currentLevel], pW[currentLevel], pH[currentLevel],
                pS[currentLevel], d_u, d_v, d_tmp);

      ComputeDerivatives(pI0[currentLevel], d_tmp, pW[currentLevel],
                         pH[currentLevel], pS[currentLevel], d_Ix, d_Iy, d_Iz);

      for (int iter = 0; iter < nSolverIters; ++iter) {
        SolveForUpdate(d_du0, d_dv0, d_Ix, d_Iy, d_Iz, pW[currentLevel],
                       pH[currentLevel], pS[currentLevel], alpha, d_du1, d_dv1);

        // the freshly written update becomes the input of the next iteration
        Swap(d_du0, d_du1);
        Swap(d_dv0, d_dv1);
      }

      // update u, v
      Add(d_u, d_du0, pH[currentLevel] * pS[currentLevel], d_u);
      Add(d_v, d_dv0, pH[currentLevel] * pS[currentLevel], d_v);
    }

    if (currentLevel != nLevels - 1) {
      // prolongate solution to the next finer level
      float scaleX = (float)pW[currentLevel + 1] / (float)pW[currentLevel];

      Upscale(d_u, pW[currentLevel], pH[currentLevel], pS[currentLevel],
              pW[currentLevel + 1], pH[currentLevel + 1], pS[currentLevel + 1],
              scaleX, d_nu);

      float scaleY = (float)pH[currentLevel + 1] / (float)pH[currentLevel];

      Upscale(d_v, pW[currentLevel], pH[currentLevel], pS[currentLevel],
              pW[currentLevel + 1], pH[currentLevel + 1], pS[currentLevel + 1],
              scaleY, d_nv);

      // the upscaled flow becomes the current flow
      Swap(d_u, d_nu);
      Swap(d_v, d_nv);
    }
  }

  checkCudaErrors(cudaMemcpy(u, d_u, dataSize, cudaMemcpyDeviceToHost));
  checkCudaErrors(cudaMemcpy(v, d_v, dataSize, cudaMemcpyDeviceToHost));

  // cleanup
  // Entries of levels that were never allocated are still nullptr, which
  // cudaFree accepts.
  for (int i = 0; i < nLevels; ++i) {
    checkCudaErrors(cudaFree((void *)pI0[i]));
    checkCudaErrors(cudaFree((void *)pI1[i]));
  }

  delete[] pI0;
  delete[] pI1;
  delete[] pW;
  delete[] pH;
  delete[] pS;

  checkCudaErrors(cudaFree(d_tmp));
  checkCudaErrors(cudaFree(d_du0));
  checkCudaErrors(cudaFree(d_dv0));
  checkCudaErrors(cudaFree(d_du1));
  checkCudaErrors(cudaFree(d_dv1));
  checkCudaErrors(cudaFree(d_Ix));
  checkCudaErrors(cudaFree(d_Iy));
  checkCudaErrors(cudaFree(d_Iz));
  checkCudaErrors(cudaFree(d_nu));
  checkCudaErrors(cudaFree(d_nv));
  checkCudaErrors(cudaFree(d_u));
  checkCudaErrors(cudaFree(d_v));
}
add and update samples for CUDA 11.5 2021-10-21 19:04:49 +08:00			`/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.`
			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* * Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* * Neither the name of NVIDIA CORPORATION nor the names of its`
			`* contributors may be used to endorse or promote products derived`
			`* from this software without specific prior written permission.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
			`* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*/`

			`#include "common.h"`

			`// include kernels`
			`#include "downscaleKernel.cuh"`
			`#include "upscaleKernel.cuh"`
			`#include "warpingKernel.cuh"`
			`#include "derivativesKernel.cuh"`
			`#include "solverKernel.cuh"`
			`#include "addKernel.cuh"`

			`///////////////////////////////////////////////////////////////////////////////`
			`/// \brief method logic`
			`///`
			`/// handles memory allocations, control flow`
			`/// \param[in] I0 source image`
			`/// \param[in] I1 tracked image`
			`/// \param[in] width images width`
			`/// \param[in] height images height`
			`/// \param[in] stride images stride`
			`/// \param[in] alpha degree of displacement field smoothness`
			`/// \param[in] nLevels number of levels in a pyramid`
			`/// \param[in] nWarpIters number of warping iterations per pyramid level`
			`/// \param[in] nSolverIters number of solver iterations (Jacobi iterations)`
			`/// \param[out] u horizontal displacement`
			`/// \param[out] v vertical displacement`
			`///////////////////////////////////////////////////////////////////////////////`
			`void ComputeFlowCUDA(const float *I0, const float *I1, int width, int height,`
			`int stride, float alpha, int nLevels, int nWarpIters,`
			`int nSolverIters, float *u, float *v) {`
			`printf("Computing optical flow on GPU...\n");`

			`// pI0 and pI1 will hold device pointers`
			`const float **pI0 = new const float *[nLevels];`
			`const float **pI1 = new const float *[nLevels];`

			`int *pW = new int[nLevels];`
			`int *pH = new int[nLevels];`
			`int *pS = new int[nLevels];`

			`// device memory pointers`
			`float *d_tmp;`
			`float *d_du0;`
			`float *d_dv0;`
			`float *d_du1;`
			`float *d_dv1;`

			`float *d_Ix;`
			`float *d_Iy;`
			`float *d_Iz;`

			`float *d_u;`
			`float *d_v;`
			`float *d_nu;`
			`float *d_nv;`

			`const int dataSize = stride * height * sizeof(float);`

			`checkCudaErrors(cudaMalloc(&d_tmp, dataSize));`
			`checkCudaErrors(cudaMalloc(&d_du0, dataSize));`
			`checkCudaErrors(cudaMalloc(&d_dv0, dataSize));`
			`checkCudaErrors(cudaMalloc(&d_du1, dataSize));`
			`checkCudaErrors(cudaMalloc(&d_dv1, dataSize));`

			`checkCudaErrors(cudaMalloc(&d_Ix, dataSize));`
			`checkCudaErrors(cudaMalloc(&d_Iy, dataSize));`
			`checkCudaErrors(cudaMalloc(&d_Iz, dataSize));`

			`checkCudaErrors(cudaMalloc(&d_u, dataSize));`
			`checkCudaErrors(cudaMalloc(&d_v, dataSize));`
			`checkCudaErrors(cudaMalloc(&d_nu, dataSize));`
			`checkCudaErrors(cudaMalloc(&d_nv, dataSize));`

			`// prepare pyramid`

			`int currentLevel = nLevels - 1;`
			`// allocate GPU memory for input images`
			`checkCudaErrors(cudaMalloc(pI0 + currentLevel, dataSize));`
			`checkCudaErrors(cudaMalloc(pI1 + currentLevel, dataSize));`

			`checkCudaErrors(cudaMemcpy((void *)pI0[currentLevel], I0, dataSize,`
			`cudaMemcpyHostToDevice));`
			`checkCudaErrors(cudaMemcpy((void *)pI1[currentLevel], I1, dataSize,`
			`cudaMemcpyHostToDevice));`

			`pW[currentLevel] = width;`
			`pH[currentLevel] = height;`
			`pS[currentLevel] = stride;`

			`for (; currentLevel > 0; --currentLevel) {`
			`int nw = pW[currentLevel] / 2;`
			`int nh = pH[currentLevel] / 2;`
			`int ns = iAlignUp(nw);`

			`checkCudaErrors(`
			`cudaMalloc(pI0 + currentLevel - 1, ns * nh * sizeof(float)));`
			`checkCudaErrors(`
			`cudaMalloc(pI1 + currentLevel - 1, ns * nh * sizeof(float)));`

			`Downscale(pI0[currentLevel], pW[currentLevel], pH[currentLevel],`
			`pS[currentLevel], nw, nh, ns, (float *)pI0[currentLevel - 1]);`

			`Downscale(pI1[currentLevel], pW[currentLevel], pH[currentLevel],`
			`pS[currentLevel], nw, nh, ns, (float *)pI1[currentLevel - 1]);`

			`pW[currentLevel - 1] = nw;`
			`pH[currentLevel - 1] = nh;`
			`pS[currentLevel - 1] = ns;`
			`}`

			`checkCudaErrors(cudaMemset(d_u, 0, stride * height * sizeof(float)));`
			`checkCudaErrors(cudaMemset(d_v, 0, stride * height * sizeof(float)));`

			`// compute flow`
			`for (; currentLevel < nLevels; ++currentLevel) {`
			`for (int warpIter = 0; warpIter < nWarpIters; ++warpIter) {`
			`checkCudaErrors(cudaMemset(d_du0, 0, dataSize));`
			`checkCudaErrors(cudaMemset(d_dv0, 0, dataSize));`

			`checkCudaErrors(cudaMemset(d_du1, 0, dataSize));`
			`checkCudaErrors(cudaMemset(d_dv1, 0, dataSize));`

			`// on current level we compute optical flow`
			`// between frame 0 and warped frame 1`
			`WarpImage(pI1[currentLevel], pW[currentLevel], pH[currentLevel],`
			`pS[currentLevel], d_u, d_v, d_tmp);`

			`ComputeDerivatives(pI0[currentLevel], d_tmp, pW[currentLevel],`
			`pH[currentLevel], pS[currentLevel], d_Ix, d_Iy, d_Iz);`

			`for (int iter = 0; iter < nSolverIters; ++iter) {`
			`SolveForUpdate(d_du0, d_dv0, d_Ix, d_Iy, d_Iz, pW[currentLevel],`
			`pH[currentLevel], pS[currentLevel], alpha, d_du1, d_dv1);`

			`Swap(d_du0, d_du1);`
			`Swap(d_dv0, d_dv1);`
			`}`

			`// update u, v`
			`Add(d_u, d_du0, pH[currentLevel] * pS[currentLevel], d_u);`
			`Add(d_v, d_dv0, pH[currentLevel] * pS[currentLevel], d_v);`
			`}`

			`if (currentLevel != nLevels - 1) {`
			`// prolongate solution`
			`float scaleX = (float)pW[currentLevel + 1] / (float)pW[currentLevel];`

			`Upscale(d_u, pW[currentLevel], pH[currentLevel], pS[currentLevel],`
			`pW[currentLevel + 1], pH[currentLevel + 1], pS[currentLevel + 1],`
			`scaleX, d_nu);`

			`float scaleY = (float)pH[currentLevel + 1] / (float)pH[currentLevel];`

			`Upscale(d_v, pW[currentLevel], pH[currentLevel], pS[currentLevel],`
			`pW[currentLevel + 1], pH[currentLevel + 1], pS[currentLevel + 1],`
			`scaleY, d_nv);`

			`Swap(d_u, d_nu);`
			`Swap(d_v, d_nv);`
			`}`
			`}`

			`checkCudaErrors(cudaMemcpy(u, d_u, dataSize, cudaMemcpyDeviceToHost));`
			`checkCudaErrors(cudaMemcpy(v, d_v, dataSize, cudaMemcpyDeviceToHost));`

			`// cleanup`
			`for (int i = 0; i < nLevels; ++i) {`
			`checkCudaErrors(cudaFree((void *)pI0[i]));`
			`checkCudaErrors(cudaFree((void *)pI1[i]));`
			`}`

			`delete[] pI0;`
			`delete[] pI1;`
			`delete[] pW;`
			`delete[] pH;`
			`delete[] pS;`

			`checkCudaErrors(cudaFree(d_tmp));`
			`checkCudaErrors(cudaFree(d_du0));`
			`checkCudaErrors(cudaFree(d_dv0));`
			`checkCudaErrors(cudaFree(d_du1));`
			`checkCudaErrors(cudaFree(d_dv1));`
			`checkCudaErrors(cudaFree(d_Ix));`
			`checkCudaErrors(cudaFree(d_Iy));`
			`checkCudaErrors(cudaFree(d_Iz));`
			`checkCudaErrors(cudaFree(d_nu));`
			`checkCudaErrors(cudaFree(d_nv));`
			`checkCudaErrors(cudaFree(d_u));`
			`checkCudaErrors(cudaFree(d_v));`
			`}`