/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

const static char *const sSDKsample = "HSOpticalFlow";

// CPU-GPU discrepancy threshold for self-test
const float THRESHOLD = 0.05f;

#include <cuda_runtime.h>

#include "common.h"
#include "flowGold.h"
#include "flowCUDA.h"

#include <helper_functions.h>

///////////////////////////////////////////////////////////////////////////////
/// \brief save optical flow in format described on vision.middlebury.edu/flow
/// \param[in] name output file name
/// \param[in] w    optical flow field width
/// \param[in] h    optical flow field height
/// \param[in] s    optical flow field row stride
/// \param[in] u    horizontal displacement
/// \param[in] v    vertical displacement
///////////////////////////////////////////////////////////////////////////////
void WriteFloFile(const char *name, int w, int h, int s, const float *u,
                  const float *v) {
  FILE *stream;
  stream = fopen(name, "wb");

  if (stream == 0) {
    printf("Could not save flow to \"%s\"\n", name);
    return;
  }

  float data = 202021.25f;
  fwrite(&data, sizeof(float), 1, stream);
  fwrite(&w, sizeof(w), 1, stream);
  fwrite(&h, sizeof(h), 1, stream);

  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const int pos = j + i * s;
      fwrite(u + pos, sizeof(float), 1, stream);
      fwrite(v + pos, sizeof(float), 1, stream);
    }
  }

  fclose(stream);
}

///////////////////////////////////////////////////////////////////////////////
/// \brief
/// load 4-channel unsigned byte image
/// and convert it to single channel FP32 image
/// \param[out] img_data pointer to raw image data
/// \param[out] img_w    image width
/// \param[out] img_h    image height
/// \param[out] img_s    image row stride
/// \param[in]  name     image file name
/// \param[in]  exePath  executable file path
/// \return true if image is successfully loaded or false otherwise
///////////////////////////////////////////////////////////////////////////////
bool LoadImageAsFP32(float *&img_data, int &img_w, int &img_h, int &img_s,
                     const char *name, const char *exePath) {
  printf("Loading \"%s\" ...\n", name);
  char *name_ = sdkFindFilePath(name, exePath);

  if (!name_) {
    printf("File not found\n");
    return false;
  }

  unsigned char *data = 0;
  unsigned int w = 0, h = 0;
  bool result = sdkLoadPPM4ub(name_, &data, &w, &h);

  if (result == false) {
    printf("Invalid file format\n");
    return false;
  }

  img_w = w;
  img_h = h;
  img_s = iAlignUp(img_w);

  img_data = new float[img_s * h];

  // source is 4 channel image
  const int widthStep = 4 * img_w;

  for (int i = 0; i < img_h; ++i) {
    for (int j = 0; j < img_w; ++j) {
      img_data[j + i * img_s] = ((float)data[j * 4 + i * widthStep]) / 255.0f;
    }
  }

  return true;
}

///////////////////////////////////////////////////////////////////////////////
/// \brief compare given flow field with gold (L1 norm)
/// \param[in] width    optical flow field width
/// \param[in] height   optical flow field height
/// \param[in] stride   optical flow field row stride
/// \param[in] h_uGold  horizontal displacement, gold
/// \param[in] h_vGold  vertical displacement, gold
/// \param[in] h_u      horizontal displacement
/// \param[in] h_v      vertical displacement
/// \return true if discrepancy is lower than a given threshold
///////////////////////////////////////////////////////////////////////////////
bool CompareWithGold(int width, int height, int stride, const float *h_uGold,
                     const float *h_vGold, const float *h_u, const float *h_v) {
  float error = 0.0f;

  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j) {
      const int pos = j + i * stride;
      error += fabsf(h_u[pos] - h_uGold[pos]) + fabsf(h_v[pos] - h_vGold[pos]);
    }
  }

  error /= (float)(width * height);

  printf("L1 error : %.6f\n", error);

  return (error < THRESHOLD);
}

///////////////////////////////////////////////////////////////////////////////
/// application entry point
///////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  // welcome message
  printf("%s Starting...\n\n", sSDKsample);

  // pick GPU
  findCudaDevice(argc, (const char **)argv);

  // find images
  const char *const sourceFrameName = "frame10.ppm";
  const char *const targetFrameName = "frame11.ppm";

  // image dimensions
  int width;
  int height;
  // row access stride
  int stride;

  // flow is computed from source image to target image
  float *h_source;  // source image, host memory
  float *h_target;  // target image, host memory

  // load image from file
  if (!LoadImageAsFP32(h_source, width, height, stride, sourceFrameName,
                       argv[0])) {
    exit(EXIT_FAILURE);
  }

  if (!LoadImageAsFP32(h_target, width, height, stride, targetFrameName,
                       argv[0])) {
    exit(EXIT_FAILURE);
  }

  // allocate host memory for CPU results
  float *h_uGold = new float[stride * height];
  float *h_vGold = new float[stride * height];

  // allocate host memory for GPU results
  float *h_u = new float[stride * height];
  float *h_v = new float[stride * height];

  // smoothness
  // if image brightness is not within [0,1]
  // this paramter should be scaled appropriately
  const float alpha = 0.2f;

  // number of pyramid levels
  const int nLevels = 5;

  // number of solver iterations on each level
  const int nSolverIters = 500;

  // number of warping iterations
  const int nWarpIters = 3;

  ComputeFlowGold(h_source, h_target, width, height, stride, alpha, nLevels,
                  nWarpIters, nSolverIters, h_uGold, h_vGold);

  ComputeFlowCUDA(h_source, h_target, width, height, stride, alpha, nLevels,
                  nWarpIters, nSolverIters, h_u, h_v);

  // compare results (L1 norm)
  bool status =
      CompareWithGold(width, height, stride, h_uGold, h_vGold, h_u, h_v);

  WriteFloFile("FlowGPU.flo", width, height, stride, h_u, h_v);

  WriteFloFile("FlowCPU.flo", width, height, stride, h_uGold, h_vGold);

  // free resources
  delete[] h_uGold;
  delete[] h_vGold;

  delete[] h_u;
  delete[] h_v;

  delete[] h_source;
  delete[] h_target;

  // report self-test status
  exit(status ? EXIT_SUCCESS : EXIT_FAILURE);
}