cuda-samples/Samples/HSOpticalFlow/flowGold.cpp

/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "common.h"
#include "flowGold.h"

///////////////////////////////////////////////////////////////////////////////
/// \brief host texture fetch
///
/// read from arbitrary position within image using bilinear interpolation
/// out of range coords are mirrored
/// \param[in]  t   texture raw data
/// \param[in]  w   texture width
/// \param[in]  h   texture height
/// \param[in]  s   texture stride
/// \param[in]  x   x coord of the point to fetch value at
/// \param[in]  y   y coord of the point to fetch value at
/// \return fetched value
///////////////////////////////////////////////////////////////////////////////
inline float Tex2D(const float *t, int w, int h, int s, float x, float y) {
  // integer parts in floating point format
  float intPartX, intPartY;

  // get fractional parts of coordinates
  float dx = fabsf(modff(x, &intPartX));
  float dy = fabsf(modff(y, &intPartY));

  // assume pixels are squares
  // one of the corners
  int ix0 = (int)intPartX;
  int iy0 = (int)intPartY;

  // mirror out-of-range position
  if (ix0 < 0) ix0 = abs(ix0 + 1);

  if (iy0 < 0) iy0 = abs(iy0 + 1);

  if (ix0 >= w) ix0 = w * 2 - ix0 - 1;

  if (iy0 >= h) iy0 = h * 2 - iy0 - 1;

  // corner which is opposite to (ix0, iy0)
  int ix1 = ix0 + 1;
  int iy1 = iy0 + 1;

  if (ix1 >= w) ix1 = w * 2 - ix1 - 1;

  if (iy1 >= h) iy1 = h * 2 - iy1 - 1;

  float res = t[ix0 + iy0 * s] * (1.0f - dx) * (1.0f - dy);
  res += t[ix1 + iy0 * s] * dx * (1.0f - dy);
  res += t[ix0 + iy1 * s] * (1.0f - dx) * dy;
  res += t[ix1 + iy1 * s] * dx * dy;

  return res;
}

///////////////////////////////////////////////////////////////////////////////
/// \brief host texture fetch
///
/// read specific texel value
/// out of range coords are mirrored
/// \param[in]  t   texture raw data
/// \param[in]  w   texture width
/// \param[in]  h   texture height
/// \param[in]  s   texture stride
/// \param[in]  x   x coord of the point to fetch value at
/// \param[in]  y   y coord of the point to fetch value at
/// \return fetched value
///////////////////////////////////////////////////////////////////////////////
inline float Tex2Di(const float *src, int w, int h, int s, int x, int y) {
  if (x < 0) x = abs(x + 1);

  if (y < 0) y = abs(y + 1);

  if (x >= w) x = w * 2 - x - 1;

  if (y >= h) y = h * 2 - y - 1;

  return src[x + y * s];
}

///////////////////////////////////////////////////////////////////////////////
/// \brief resize image
/// \param[in]  src         image to downscale
/// \param[in]  width       image width
/// \param[in]  height      image height
/// \param[in]  stride      image stride
/// \param[in]  newWidth    image new width
/// \param[in]  newHeight   image new height
/// \param[in]  newStride   image new stride
/// \param[out] out         downscaled image data
///////////////////////////////////////////////////////////////////////////////
static void Downscale(const float *src, int width, int height, int stride,
                      int newWidth, int newHeight, int newStride, float *out) {
  for (int i = 0; i < newHeight; ++i) {
    for (int j = 0; j < newWidth; ++j) {
      const int srcX = j * 2;
      const int srcY = i * 2;
      // average 4 neighbouring pixels
      float sum;
      sum = Tex2Di(src, width, height, stride, srcX + 0, srcY + 0);
      sum += Tex2Di(src, width, height, stride, srcX + 0, srcY + 1);
      sum += Tex2Di(src, width, height, stride, srcX + 1, srcY + 0);
      sum += Tex2Di(src, width, height, stride, srcX + 1, srcY + 1);
      // normalize
      sum *= 0.25f;
      out[j + i * newStride] = sum;
    }
  }
}

///////////////////////////////////////////////////////////////////////////////
/// \brief upscale one component of a displacement field
/// \param[in]  src         field component to upscale
/// \param[in]  width       field current width
/// \param[in]  height      field current height
/// \param[in]  stride      field current stride
/// \param[in]  newWidth    field new width
/// \param[in]  newHeight   field new height
/// \param[in]  newStride   field new stride
/// \param[in]  scale       value scale factor (multiplier)
/// \param[out] out         upscaled field component
///////////////////////////////////////////////////////////////////////////////
static void Upscale(const float *src, int width, int height, int stride,
                    int newWidth, int newHeight, int newStride, float scale,
                    float *out) {
  for (int i = 0; i < newHeight; ++i) {
    for (int j = 0; j < newWidth; ++j) {
      // position within smaller image
      float x = ((float)j - 0.5f) * 0.5f;
      float y = ((float)i - 0.5f) * 0.5f;

      out[j + i * newStride] = Tex2D(src, width, height, stride, x, y) * scale;
    }
  }
}

///////////////////////////////////////////////////////////////////////////////
/// \brief warp image with provided vector field
///
/// For each output pixel there is a vector which tells which pixel
/// from a source image should be mapped to this particular output
/// pixel.
/// It is assumed that images and the vector field have the same stride and
/// resolution.
/// \param[in]  src source image
/// \param[in]  w   width
/// \param[in]  h   height
/// \param[in]  s   stride
/// \param[in]  u   horizontal displacement
/// \param[in]  v   vertical displacement
/// \param[out] out warped image
///////////////////////////////////////////////////////////////////////////////
static void WarpImage(const float *src, int w, int h, int s, const float *u,
                      const float *v, float *out) {
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const int pos = j + i * s;
      // warped coords
      float x = (float)j + u[pos];
      float y = (float)i + v[pos];

      out[pos] = Tex2D(src, w, h, s, x, y);
    }
  }
}

///////////////////////////////////////////////////////////////////////////////
/// \brief computes image derivatives for a pair of images
/// \param[in]  I0  source image
/// \param[in]  I1  tracked image
/// \param[in]  w   images width
/// \param[in]  h   images height
/// \param[in]  s   images stride
/// \param[out] Ix  x derivative
/// \param[out] Iy  y derivative
/// \param[out] Iz  temporal derivative
///////////////////////////////////////////////////////////////////////////////
static void ComputeDerivatives(const float *I0, const float *I1, int w, int h,
                               int s, float *Ix, float *Iy, float *Iz) {
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const int pos = j + i * s;
      float t0, t1;
      // derivative filter is (1, -8, 0, 8, -1)/12
      // x derivative
      t0 = Tex2Di(I0, w, h, s, j - 2, i);
      t0 -= Tex2Di(I0, w, h, s, j - 1, i) * 8.0f;
      t0 += Tex2Di(I0, w, h, s, j + 1, i) * 8.0f;
      t0 -= Tex2Di(I0, w, h, s, j + 2, i);
      t0 /= 12.0f;

      t1 = Tex2Di(I1, w, h, s, j - 2, i);
      t1 -= Tex2Di(I1, w, h, s, j - 1, i) * 8.0f;
      t1 += Tex2Di(I1, w, h, s, j + 1, i) * 8.0f;
      t1 -= Tex2Di(I1, w, h, s, j + 2, i);
      t1 /= 12.0f;

      // spatial derivatives are averaged
      Ix[pos] = (t0 + t1) * 0.5f;

      // t derivative
      Iz[pos] = I1[pos] - I0[pos];

      // y derivative
      t0 = Tex2Di(I0, w, h, s, j, i - 2);
      t0 -= Tex2Di(I0, w, h, s, j, i - 1) * 8.0f;
      t0 += Tex2Di(I0, w, h, s, j, i + 1) * 8.0f;
      t0 -= Tex2Di(I0, w, h, s, j, i + 2);
      t0 /= 12.0f;

      t1 = Tex2Di(I1, w, h, s, j, i - 2);
      t1 -= Tex2Di(I1, w, h, s, j, i - 1) * 8.0f;
      t1 += Tex2Di(I1, w, h, s, j, i + 1) * 8.0f;
      t1 -= Tex2Di(I1, w, h, s, j, i + 2);
      t1 /= 12.0f;

      Iy[pos] = (t0 + t1) * 0.5f;
    }
  }
}

///////////////////////////////////////////////////////////////////////////////
/// \brief one iteration of classical Horn-Schunck method
///
/// It is one iteration of Jacobi method for a corresponding linear system
/// \param[in]  du0     current horizontal displacement approximation
/// \param[in]  dv0     current vertical displacement approximation
/// \param[in]  Ix      image x derivative
/// \param[in]  Iy      image y derivative
/// \param[in]  Iz      temporal derivative
/// \param[in]  w       width
/// \param[in]  h       height
/// \param[in]  s       stride
/// \param[in]  alpha   degree of smoothness
/// \param[out] du1     new horizontal displacement approximation
/// \param[out] dv1     new vertical displacement approximation
///////////////////////////////////////////////////////////////////////////////
static void SolveForUpdate(const float *du0, const float *dv0, const float *Ix,
                           const float *Iy, const float *Iz, int w, int h,
                           int s, float alpha, float *du1, float *dv1) {
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const int pos = j + i * s;
      int left, right, up, down;

      // handle borders
      if (j != 0)
        left = pos - 1;
      else
        left = pos;

      if (j != w - 1)
        right = pos + 1;
      else
        right = pos;

      if (i != 0)
        down = pos - s;
      else
        down = pos;

      if (i != h - 1)
        up = pos + s;
      else
        up = pos;

      float sumU = (du0[left] + du0[right] + du0[up] + du0[down]) * 0.25f;
      float sumV = (dv0[left] + dv0[right] + dv0[up] + dv0[down]) * 0.25f;

      float frac = (Ix[pos] * sumU + Iy[pos] * sumV + Iz[pos]) /
                   (Ix[pos] * Ix[pos] + Iy[pos] * Iy[pos] + alpha);

      du1[pos] = sumU - Ix[pos] * frac;
      dv1[pos] = sumV - Iy[pos] * frac;
    }
  }
}

///////////////////////////////////////////////////////////////////////////////
/// \brief method logic
///
/// handles memory allocation and control flow
/// \param[in]  I0           source image
/// \param[in]  I1           tracked image
/// \param[in]  width        images width
/// \param[in]  height       images height
/// \param[in]  stride       images stride
/// \param[in]  alpha        degree of displacement field smoothness
/// \param[in]  nLevels      number of levels in a pyramid
/// \param[in]  nWarpIters   number of warping iterations per pyramid level
/// \param[in]  nSolverIters number of solver iterations (Jacobi iterations)
/// \param[out] u            horizontal displacement
/// \param[out] v            vertical displacement
///////////////////////////////////////////////////////////////////////////////
void ComputeFlowGold(const float *I0, const float *I1, int width, int height,
                     int stride, float alpha, int nLevels, int nWarpIters,
                     int nSolverIters, float *u, float *v) {
  printf("Computing optical flow on CPU...\n");

  float *u0 = u;
  float *v0 = v;

  const float **pI0 = new const float *[nLevels];
  const float **pI1 = new const float *[nLevels];

  int *pW = new int[nLevels];
  int *pH = new int[nLevels];
  int *pS = new int[nLevels];

  const int pixelCountAligned = height * stride;

  float *tmp = new float[pixelCountAligned];
  float *du0 = new float[pixelCountAligned];
  float *dv0 = new float[pixelCountAligned];
  float *du1 = new float[pixelCountAligned];
  float *dv1 = new float[pixelCountAligned];
  float *Ix = new float[pixelCountAligned];
  float *Iy = new float[pixelCountAligned];
  float *Iz = new float[pixelCountAligned];
  float *nu = new float[pixelCountAligned];
  float *nv = new float[pixelCountAligned];

  // prepare pyramid
  int currentLevel = nLevels - 1;
  pI0[currentLevel] = I0;
  pI1[currentLevel] = I1;

  pW[currentLevel] = width;
  pH[currentLevel] = height;
  pS[currentLevel] = stride;

  for (; currentLevel > 0; --currentLevel) {
    int nw = pW[currentLevel] / 2;
    int nh = pH[currentLevel] / 2;
    int ns = iAlignUp(nw);
    pI0[currentLevel - 1] = new float[ns * nh];
    pI1[currentLevel - 1] = new float[ns * nh];

    Downscale(pI0[currentLevel], pW[currentLevel], pH[currentLevel],
              pS[currentLevel], nw, nh, ns, (float *)pI0[currentLevel - 1]);

    Downscale(pI1[currentLevel], pW[currentLevel], pH[currentLevel],
              pS[currentLevel], nw, nh, ns, (float *)pI1[currentLevel - 1]);

    pW[currentLevel - 1] = nw;
    pH[currentLevel - 1] = nh;
    pS[currentLevel - 1] = ns;
  }

  // initial approximation
  memset(u, 0, stride * height * sizeof(float));
  memset(v, 0, stride * height * sizeof(float));

  // compute flow
  for (; currentLevel < nLevels; ++currentLevel) {
    for (int warpIter = 0; warpIter < nWarpIters; ++warpIter) {
      memset(du0, 0, pixelCountAligned * sizeof(float));
      memset(dv0, 0, pixelCountAligned * sizeof(float));

      memset(du1, 0, pixelCountAligned * sizeof(float));
      memset(dv1, 0, pixelCountAligned * sizeof(float));

      WarpImage(pI1[currentLevel], pW[currentLevel], pH[currentLevel],
                pS[currentLevel], u, v, tmp);

      // on current level we compute optical flow
      // between frame 0 and warped frame 1
      ComputeDerivatives(pI0[currentLevel], tmp, pW[currentLevel],
                         pH[currentLevel], pS[currentLevel], Ix, Iy, Iz);

      for (int iter = 0; iter < nSolverIters; ++iter) {
        SolveForUpdate(du0, dv0, Ix, Iy, Iz, pW[currentLevel], pH[currentLevel],
                       pS[currentLevel], alpha, du1, dv1);
        Swap(du0, du1);
        Swap(dv0, dv1);
      }

      // update u, v
      for (int i = 0; i < pH[currentLevel] * pS[currentLevel]; ++i) {
        u[i] += du0[i];
        v[i] += dv0[i];
      }
    }  // end for (int warpIter = 0; warpIter < nWarpIters; ++warpIter)

    if (currentLevel != nLevels - 1) {
      // prolongate solution
      float scaleX = (float)pW[currentLevel + 1] / (float)pW[currentLevel];

      Upscale(u, pW[currentLevel], pH[currentLevel], pS[currentLevel],
              pW[currentLevel + 1], pH[currentLevel + 1], pS[currentLevel + 1],
              scaleX, nu);

      float scaleY = (float)pH[currentLevel + 1] / (float)pH[currentLevel];

      Upscale(v, pW[currentLevel], pH[currentLevel], pS[currentLevel],
              pW[currentLevel + 1], pH[currentLevel + 1], pS[currentLevel + 1],
              scaleY, nv);

      Swap(u, nu);
      Swap(v, nv);
    }
  }  // end for (; currentLevel < nLevels; ++currentLevel)

  if (u != u0) {
    // solution is not in the specified array
    // copy
    memcpy(u0, u, pixelCountAligned * sizeof(float));
    memcpy(v0, v, pixelCountAligned * sizeof(float));
    Swap(u, nu);
    Swap(v, nv);
  }

  // cleanup
  // last level is not being freed here
  // because it refers to input images
  for (int i = 0; i < nLevels - 1; ++i) {
    delete[] pI0[i];
    delete[] pI1[i];
  }

  delete[] pI0;
  delete[] pI1;
  delete[] pW;
  delete[] pH;
  delete[] pS;
  delete[] tmp;
  delete[] du0;
  delete[] dv0;
  delete[] du1;
  delete[] dv1;
  delete[] Ix;
  delete[] Iy;
  delete[] Iz;
  delete[] nu;
  delete[] nv;
}
add and update samples for CUDA 11.5 2021-10-21 19:04:49 +08:00			`/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.`
			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* * Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* * Neither the name of NVIDIA CORPORATION nor the names of its`
			`* contributors may be used to endorse or promote products derived`
			`* from this software without specific prior written permission.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
			`* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*/`

			`#include "common.h"`
			`#include "flowGold.h"`

			`///////////////////////////////////////////////////////////////////////////////`
			`/// \brief host texture fetch`
			`///`
			`/// read from arbitrary position within image using bilinear interpolation`
			`/// out of range coords are mirrored`
			`/// \param[in] t texture raw data`
			`/// \param[in] w texture width`
			`/// \param[in] h texture height`
			`/// \param[in] s texture stride`
			`/// \param[in] x x coord of the point to fetch value at`
			`/// \param[in] y y coord of the point to fetch value at`
			`/// \return fetched value`
			`///////////////////////////////////////////////////////////////////////////////`
			`inline float Tex2D(const float *t, int w, int h, int s, float x, float y) {`
			`// integer parts in floating point format`
			`float intPartX, intPartY;`

			`// get fractional parts of coordinates`
			`float dx = fabsf(modff(x, &intPartX));`
			`float dy = fabsf(modff(y, &intPartY));`

			`// assume pixels are squares`
			`// one of the corners`
			`int ix0 = (int)intPartX;`
			`int iy0 = (int)intPartY;`

			`// mirror out-of-range position`
			`if (ix0 < 0) ix0 = abs(ix0 + 1);`

			`if (iy0 < 0) iy0 = abs(iy0 + 1);`

			`if (ix0 >= w) ix0 = w * 2 - ix0 - 1;`

			`if (iy0 >= h) iy0 = h * 2 - iy0 - 1;`

			`// corner which is opposite to (ix0, iy0)`
			`int ix1 = ix0 + 1;`
			`int iy1 = iy0 + 1;`

			`if (ix1 >= w) ix1 = w * 2 - ix1 - 1;`

			`if (iy1 >= h) iy1 = h * 2 - iy1 - 1;`

			`float res = t[ix0 + iy0 * s] * (1.0f - dx) * (1.0f - dy);`
			`res += t[ix1 + iy0 * s] * dx * (1.0f - dy);`
			`res += t[ix0 + iy1 * s] * (1.0f - dx) * dy;`
			`res += t[ix1 + iy1 * s] * dx * dy;`

			`return res;`
			`}`

			`///////////////////////////////////////////////////////////////////////////////`
			`/// \brief host texture fetch`
			`///`
			`/// read specific texel value`
			`/// out of range coords are mirrored`
			`/// \param[in] t texture raw data`
			`/// \param[in] w texture width`
			`/// \param[in] h texture height`
			`/// \param[in] s texture stride`
			`/// \param[in] x x coord of the point to fetch value at`
			`/// \param[in] y y coord of the point to fetch value at`
			`/// \return fetched value`
			`///////////////////////////////////////////////////////////////////////////////`
			`inline float Tex2Di(const float *src, int w, int h, int s, int x, int y) {`
			`if (x < 0) x = abs(x + 1);`

			`if (y < 0) y = abs(y + 1);`

			`if (x >= w) x = w * 2 - x - 1;`

			`if (y >= h) y = h * 2 - y - 1;`

			`return src[x + y * s];`
			`}`

			`///////////////////////////////////////////////////////////////////////////////`
			`/// \brief resize image`
			`/// \param[in] src image to downscale`
			`/// \param[in] width image width`
			`/// \param[in] height image height`
			`/// \param[in] stride image stride`
			`/// \param[in] newWidth image new width`
			`/// \param[in] newHeight image new height`
			`/// \param[in] newStride image new stride`
			`/// \param[out] out downscaled image data`
			`///////////////////////////////////////////////////////////////////////////////`
			`static void Downscale(const float *src, int width, int height, int stride,`
			`int newWidth, int newHeight, int newStride, float *out) {`
			`for (int i = 0; i < newHeight; ++i) {`
			`for (int j = 0; j < newWidth; ++j) {`
			`const int srcX = j * 2;`
			`const int srcY = i * 2;`
			`// average 4 neighbouring pixels`
			`float sum;`
			`sum = Tex2Di(src, width, height, stride, srcX + 0, srcY + 0);`
			`sum += Tex2Di(src, width, height, stride, srcX + 0, srcY + 1);`
			`sum += Tex2Di(src, width, height, stride, srcX + 1, srcY + 0);`
			`sum += Tex2Di(src, width, height, stride, srcX + 1, srcY + 1);`
			`// normalize`
			`sum *= 0.25f;`
			`out[j + i * newStride] = sum;`
			`}`
			`}`
			`}`

			`///////////////////////////////////////////////////////////////////////////////`
			`/// \brief upscale one component of a displacement field`
			`/// \param[in] src field component to upscale`
			`/// \param[in] width field current width`
			`/// \param[in] height field current height`
			`/// \param[in] stride field current stride`
			`/// \param[in] newWidth field new width`
			`/// \param[in] newHeight field new height`
			`/// \param[in] newStride field new stride`
			`/// \param[in] scale value scale factor (multiplier)`
			`/// \param[out] out upscaled field component`
			`///////////////////////////////////////////////////////////////////////////////`
			`static void Upscale(const float *src, int width, int height, int stride,`
			`int newWidth, int newHeight, int newStride, float scale,`
			`float *out) {`
			`for (int i = 0; i < newHeight; ++i) {`
			`for (int j = 0; j < newWidth; ++j) {`
			`// position within smaller image`
			`float x = ((float)j - 0.5f) * 0.5f;`
			`float y = ((float)i - 0.5f) * 0.5f;`

			`out[j + i * newStride] = Tex2D(src, width, height, stride, x, y) * scale;`
			`}`
			`}`
			`}`

			`///////////////////////////////////////////////////////////////////////////////`
			`/// \brief warp image with provided vector field`
			`///`
			`/// For each output pixel there is a vector which tells which pixel`
			`/// from a source image should be mapped to this particular output`
			`/// pixel.`
			`/// It is assumed that images and the vector field have the same stride and`
			`/// resolution.`
			`/// \param[in] src source image`
			`/// \param[in] w width`
			`/// \param[in] h height`
			`/// \param[in] s stride`
			`/// \param[in] u horizontal displacement`
			`/// \param[in] v vertical displacement`
			`/// \param[out] out warped image`
			`///////////////////////////////////////////////////////////////////////////////`
			`static void WarpImage(const float src, int w, int h, int s, const float u,`
			`const float v, float out) {`
			`for (int i = 0; i < h; ++i) {`
			`for (int j = 0; j < w; ++j) {`
			`const int pos = j + i * s;`
			`// warped coords`
			`float x = (float)j + u[pos];`
			`float y = (float)i + v[pos];`

			`out[pos] = Tex2D(src, w, h, s, x, y);`
			`}`
			`}`
			`}`

			`///////////////////////////////////////////////////////////////////////////////`
			`/// \brief computes image derivatives for a pair of images`
			`/// \param[in] I0 source image`
			`/// \param[in] I1 tracked image`
			`/// \param[in] w images width`
			`/// \param[in] h images height`
			`/// \param[in] s images stride`
			`/// \param[out] Ix x derivative`
			`/// \param[out] Iy y derivative`
			`/// \param[out] Iz temporal derivative`
			`///////////////////////////////////////////////////////////////////////////////`
			`static void ComputeDerivatives(const float I0, const float I1, int w, int h,`
			`int s, float Ix, float Iy, float *Iz) {`
			`for (int i = 0; i < h; ++i) {`
			`for (int j = 0; j < w; ++j) {`
			`const int pos = j + i * s;`
			`float t0, t1;`
			`// derivative filter is (1, -8, 0, 8, -1)/12`
			`// x derivative`
			`t0 = Tex2Di(I0, w, h, s, j - 2, i);`
			`t0 -= Tex2Di(I0, w, h, s, j - 1, i) * 8.0f;`
			`t0 += Tex2Di(I0, w, h, s, j + 1, i) * 8.0f;`
			`t0 -= Tex2Di(I0, w, h, s, j + 2, i);`
			`t0 /= 12.0f;`

			`t1 = Tex2Di(I1, w, h, s, j - 2, i);`
			`t1 -= Tex2Di(I1, w, h, s, j - 1, i) * 8.0f;`
			`t1 += Tex2Di(I1, w, h, s, j + 1, i) * 8.0f;`
			`t1 -= Tex2Di(I1, w, h, s, j + 2, i);`
			`t1 /= 12.0f;`

			`// spatial derivatives are averaged`
			`Ix[pos] = (t0 + t1) * 0.5f;`

			`// t derivative`
			`Iz[pos] = I1[pos] - I0[pos];`

			`// y derivative`
			`t0 = Tex2Di(I0, w, h, s, j, i - 2);`
			`t0 -= Tex2Di(I0, w, h, s, j, i - 1) * 8.0f;`
			`t0 += Tex2Di(I0, w, h, s, j, i + 1) * 8.0f;`
			`t0 -= Tex2Di(I0, w, h, s, j, i + 2);`
			`t0 /= 12.0f;`

			`t1 = Tex2Di(I1, w, h, s, j, i - 2);`
			`t1 -= Tex2Di(I1, w, h, s, j, i - 1) * 8.0f;`
			`t1 += Tex2Di(I1, w, h, s, j, i + 1) * 8.0f;`
			`t1 -= Tex2Di(I1, w, h, s, j, i + 2);`
			`t1 /= 12.0f;`

			`Iy[pos] = (t0 + t1) * 0.5f;`
			`}`
			`}`
			`}`

			`///////////////////////////////////////////////////////////////////////////////`
			`/// \brief one iteration of classical Horn-Schunck method`
			`///`
			`/// It is one iteration of Jacobi method for a corresponding linear system`
			`/// \param[in] du0 current horizontal displacement approximation`
			`/// \param[in] dv0 current vertical displacement approximation`
			`/// \param[in] Ix image x derivative`
			`/// \param[in] Iy image y derivative`
			`/// \param[in] Iz temporal derivative`
			`/// \param[in] w width`
			`/// \param[in] h height`
			`/// \param[in] s stride`
			`/// \param[in] alpha degree of smoothness`
			`/// \param[out] du1 new horizontal displacement approximation`
			`/// \param[out] dv1 new vertical displacement approximation`
			`///////////////////////////////////////////////////////////////////////////////`
			`static void SolveForUpdate(const float du0, const float dv0, const float *Ix,`
			`const float Iy, const float Iz, int w, int h,`
			`int s, float alpha, float du1, float dv1) {`
			`for (int i = 0; i < h; ++i) {`
			`for (int j = 0; j < w; ++j) {`
			`const int pos = j + i * s;`
			`int left, right, up, down;`

			`// handle borders`
			`if (j != 0)`
			`left = pos - 1;`
			`else`
			`left = pos;`

			`if (j != w - 1)`
			`right = pos + 1;`
			`else`
			`right = pos;`

			`if (i != 0)`
			`down = pos - s;`
			`else`
			`down = pos;`

			`if (i != h - 1)`
			`up = pos + s;`
			`else`
			`up = pos;`

			`float sumU = (du0[left] + du0[right] + du0[up] + du0[down]) * 0.25f;`
			`float sumV = (dv0[left] + dv0[right] + dv0[up] + dv0[down]) * 0.25f;`

			`float frac = (Ix[pos] * sumU + Iy[pos] * sumV + Iz[pos]) /`
			`(Ix[pos] * Ix[pos] + Iy[pos] * Iy[pos] + alpha);`

			`du1[pos] = sumU - Ix[pos] * frac;`
			`dv1[pos] = sumV - Iy[pos] * frac;`
			`}`
			`}`
			`}`

			`///////////////////////////////////////////////////////////////////////////////`
			`/// \brief method logic`
			`///`
			`/// handles memory allocation and control flow`
			`/// \param[in] I0 source image`
			`/// \param[in] I1 tracked image`
			`/// \param[in] width images width`
			`/// \param[in] height images height`
			`/// \param[in] stride images stride`
			`/// \param[in] alpha degree of displacement field smoothness`
			`/// \param[in] nLevels number of levels in a pyramid`
			`/// \param[in] nWarpIters number of warping iterations per pyramid level`
			`/// \param[in] nSolverIters number of solver iterations (Jacobi iterations)`
			`/// \param[out] u horizontal displacement`
			`/// \param[out] v vertical displacement`
			`///////////////////////////////////////////////////////////////////////////////`
			`void ComputeFlowGold(const float I0, const float I1, int width, int height,`
			`int stride, float alpha, int nLevels, int nWarpIters,`
			`int nSolverIters, float u, float v) {`
			`printf("Computing optical flow on CPU...\n");`

			`float *u0 = u;`
			`float *v0 = v;`

			`const float *pI0 = new const float [nLevels];`
			`const float *pI1 = new const float [nLevels];`

			`int *pW = new int[nLevels];`
			`int *pH = new int[nLevels];`
			`int *pS = new int[nLevels];`

			`const int pixelCountAligned = height * stride;`

			`float *tmp = new float[pixelCountAligned];`
			`float *du0 = new float[pixelCountAligned];`
			`float *dv0 = new float[pixelCountAligned];`
			`float *du1 = new float[pixelCountAligned];`
			`float *dv1 = new float[pixelCountAligned];`
			`float *Ix = new float[pixelCountAligned];`
			`float *Iy = new float[pixelCountAligned];`
			`float *Iz = new float[pixelCountAligned];`
			`float *nu = new float[pixelCountAligned];`
			`float *nv = new float[pixelCountAligned];`

			`// prepare pyramid`
			`int currentLevel = nLevels - 1;`
			`pI0[currentLevel] = I0;`
			`pI1[currentLevel] = I1;`

			`pW[currentLevel] = width;`
			`pH[currentLevel] = height;`
			`pS[currentLevel] = stride;`

			`for (; currentLevel > 0; --currentLevel) {`
			`int nw = pW[currentLevel] / 2;`
			`int nh = pH[currentLevel] / 2;`
			`int ns = iAlignUp(nw);`
			`pI0[currentLevel - 1] = new float[ns * nh];`
			`pI1[currentLevel - 1] = new float[ns * nh];`

			`Downscale(pI0[currentLevel], pW[currentLevel], pH[currentLevel],`
			`pS[currentLevel], nw, nh, ns, (float *)pI0[currentLevel - 1]);`

			`Downscale(pI1[currentLevel], pW[currentLevel], pH[currentLevel],`
			`pS[currentLevel], nw, nh, ns, (float *)pI1[currentLevel - 1]);`

			`pW[currentLevel - 1] = nw;`
			`pH[currentLevel - 1] = nh;`
			`pS[currentLevel - 1] = ns;`
			`}`

			`// initial approximation`
			`memset(u, 0, stride * height * sizeof(float));`
			`memset(v, 0, stride * height * sizeof(float));`

			`// compute flow`
			`for (; currentLevel < nLevels; ++currentLevel) {`
			`for (int warpIter = 0; warpIter < nWarpIters; ++warpIter) {`
			`memset(du0, 0, pixelCountAligned * sizeof(float));`
			`memset(dv0, 0, pixelCountAligned * sizeof(float));`

			`memset(du1, 0, pixelCountAligned * sizeof(float));`
			`memset(dv1, 0, pixelCountAligned * sizeof(float));`

			`WarpImage(pI1[currentLevel], pW[currentLevel], pH[currentLevel],`
			`pS[currentLevel], u, v, tmp);`

			`// on current level we compute optical flow`
			`// between frame 0 and warped frame 1`
			`ComputeDerivatives(pI0[currentLevel], tmp, pW[currentLevel],`
			`pH[currentLevel], pS[currentLevel], Ix, Iy, Iz);`

			`for (int iter = 0; iter < nSolverIters; ++iter) {`
			`SolveForUpdate(du0, dv0, Ix, Iy, Iz, pW[currentLevel], pH[currentLevel],`
			`pS[currentLevel], alpha, du1, dv1);`
			`Swap(du0, du1);`
			`Swap(dv0, dv1);`
			`}`

			`// update u, v`
			`for (int i = 0; i < pH[currentLevel] * pS[currentLevel]; ++i) {`
			`u[i] += du0[i];`
			`v[i] += dv0[i];`
			`}`
			`} // end for (int warpIter = 0; warpIter < nWarpIters; ++warpIter)`

			`if (currentLevel != nLevels - 1) {`
			`// prolongate solution`
			`float scaleX = (float)pW[currentLevel + 1] / (float)pW[currentLevel];`

			`Upscale(u, pW[currentLevel], pH[currentLevel], pS[currentLevel],`
			`pW[currentLevel + 1], pH[currentLevel + 1], pS[currentLevel + 1],`
			`scaleX, nu);`

			`float scaleY = (float)pH[currentLevel + 1] / (float)pH[currentLevel];`

			`Upscale(v, pW[currentLevel], pH[currentLevel], pS[currentLevel],`
			`pW[currentLevel + 1], pH[currentLevel + 1], pS[currentLevel + 1],`
			`scaleY, nv);`

			`Swap(u, nu);`
			`Swap(v, nv);`
			`}`
			`} // end for (; currentLevel < nLevels; ++currentLevel)`

			`if (u != u0) {`
			`// solution is not in the specified array`
			`// copy`
			`memcpy(u0, u, pixelCountAligned * sizeof(float));`
			`memcpy(v0, v, pixelCountAligned * sizeof(float));`
			`Swap(u, nu);`
			`Swap(v, nv);`
			`}`

			`// cleanup`
			`// last level is not being freed here`
			`// because it refers to input images`
			`for (int i = 0; i < nLevels - 1; ++i) {`
			`delete[] pI0[i];`
			`delete[] pI1[i];`
			`}`

			`delete[] pI0;`
			`delete[] pI1;`
			`delete[] pW;`
			`delete[] pH;`
			`delete[] pS;`
			`delete[] tmp;`
			`delete[] du0;`
			`delete[] dv0;`
			`delete[] du1;`
			`delete[] dv1;`
			`delete[] Ix;`
			`delete[] Iy;`
			`delete[] Iz;`
			`delete[] nu;`
			`delete[] nv;`
			`}`