/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Recursive Gaussian filter
 */

#ifndef _RECURSIVEGAUSSIAN_KERNEL_CU_
#define _RECURSIVEGAUSSIAN_KERNEL_CU_

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cooperative_groups.h>

namespace cg = cooperative_groups;
#include <helper_math.h>
#include <helper_functions.h>

#define BLOCK_DIM 16
#define CLAMP_TO_EDGE 1

// Transpose kernel (see transpose CUDA Sample for details)
__global__ void d_transpose(uint *odata, uint *idata, int width, int height) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  __shared__ uint block[BLOCK_DIM][BLOCK_DIM + 1];

  // read the matrix tile into shared memory
  unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
  unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;

  if ((xIndex < width) && (yIndex < height)) {
    unsigned int index_in = yIndex * width + xIndex;
    block[threadIdx.y][threadIdx.x] = idata[index_in];
  }

  cg::sync(cta);

  // write the transposed matrix tile to global memory
  xIndex = blockIdx.y * BLOCK_DIM + threadIdx.x;
  yIndex = blockIdx.x * BLOCK_DIM + threadIdx.y;

  if ((xIndex < height) && (yIndex < width)) {
    unsigned int index_out = yIndex * height + xIndex;
    odata[index_out] = block[threadIdx.x][threadIdx.y];
  }
}

// RGBA version
// reads from 32-bit uint array holding 8-bit RGBA

// convert floating point rgba color to 32-bit integer
__device__ uint rgbaFloatToInt(float4 rgba) {
  rgba.x = __saturatef(rgba.x);  // clamp to [0.0, 1.0]
  rgba.y = __saturatef(rgba.y);
  rgba.z = __saturatef(rgba.z);
  rgba.w = __saturatef(rgba.w);
  return (uint(rgba.w * 255) << 24) | (uint(rgba.z * 255) << 16) |
         (uint(rgba.y * 255) << 8) | uint(rgba.x * 255);
}

// convert from 32-bit int to float4
__device__ float4 rgbaIntToFloat(uint c) {
  float4 rgba;
  rgba.x = (c & 0xff) / 255.0f;
  rgba.y = ((c >> 8) & 0xff) / 255.0f;
  rgba.z = ((c >> 16) & 0xff) / 255.0f;
  rgba.w = ((c >> 24) & 0xff) / 255.0f;
  return rgba;
}

/*
  simple 1st order recursive filter - processes one image column per thread

  parameters:
    id - pointer to input data (RGBA image packed into 32-bit integers)
    od - pointer to output data
    w  - image width
    h  - image height
    a  - blur parameter
*/
__global__ void d_simpleRecursive_rgba(uint *id, uint *od, int w, int h,
                                       float a) {
  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;

  if (x >= w) return;

  id += x;  // advance pointers to correct column
  od += x;

  // forward pass
  float4 yp = rgbaIntToFloat(*id);  // previous output

  for (int y = 0; y < h; y++) {
    float4 xc = rgbaIntToFloat(*id);
    float4 yc = xc + a * (yp - xc);  // simple lerp between current and previous value
    *od = rgbaFloatToInt(yc);
    id += w;  // move to next row
    od += w;
    yp = yc;
  }

  // reset pointers to point to last element in column
  id -= w;
  od -= w;

  // reverse pass
  // ensures response is symmetrical
  yp = rgbaIntToFloat(*id);

  for (int y = h - 1; y >= 0; y--) {
    float4 xc = rgbaIntToFloat(*id);
    float4 yc = xc + a * (yp - xc);
    *od = rgbaFloatToInt((rgbaIntToFloat(*od) + yc) * 0.5f);
    id -= w;  // move to previous row
    od -= w;
    yp = yc;
  }
}
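
/*
  Illustrative host-side launcher for d_simpleRecursive_rgba (a sketch added
  for clarity, not part of the original kernel file). The kernel maps one
  thread per image column, so the grid only needs to cover `width` threads.
  The recurrence y[n] = x[n] + a*(y[n-1] - x[n]) = (1-a)*x[n] + a*y[n-1] is an
  exponential moving average, so `a` acts as a smoothing weight in [0, 1];
  deriving it from a Gaussian sigma as a = exp(-1.695/sigma) is an assumption
  consistent with the Deriche coefficient setup sketched at the end of this
  file. The buffer names d_src/d_dest are placeholders.
*/
static void simpleRecursiveGaussianRGBA(uint *d_src, uint *d_dest, int width,
                                        int height, float sigma,
                                        int nthreads = 64) {
  float a = expf(-1.695f / sigma);                  // smoothing weight (assumed mapping)
  int nblocks = (width + nthreads - 1) / nthreads;  // ceil(width / nthreads)
  d_simpleRecursive_rgba<<<nblocks, nthreads>>>(d_src, d_dest, width, height, a);
}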
/*
  recursive Gaussian filter

  parameters:
    id - pointer to input data (RGBA image packed into 32-bit integers)
    od - pointer to output data
    w  - image width
    h  - image height
    a0-a3, b1, b2, coefp, coefn - filter parameters
*/
__global__ void d_recursiveGaussian_rgba(uint *id, uint *od, int w, int h,
                                         float a0, float a1, float a2,
                                         float a3, float b1, float b2,
                                         float coefp, float coefn) {
  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;

  if (x >= w) return;

  id += x;  // advance pointers to correct column
  od += x;

  // forward pass
  float4 xp = make_float4(0.0f);  // previous input
  float4 yp = make_float4(0.0f);  // previous output
  float4 yb = make_float4(0.0f);  // previous output by 2
#if CLAMP_TO_EDGE
  xp = rgbaIntToFloat(*id);
  yb = coefp * xp;
  yp = yb;
#endif

  for (int y = 0; y < h; y++) {
    float4 xc = rgbaIntToFloat(*id);
    float4 yc = a0 * xc + a1 * xp - b1 * yp - b2 * yb;
    *od = rgbaFloatToInt(yc);
    id += w;  // move to next row
    od += w;
    xp = xc;
    yb = yp;
    yp = yc;
  }

  // reset pointers to point to last element in column
  id -= w;
  od -= w;

  // reverse pass
  // ensures response is symmetrical
  float4 xn = make_float4(0.0f);
  float4 xa = make_float4(0.0f);
  float4 yn = make_float4(0.0f);
  float4 ya = make_float4(0.0f);
#if CLAMP_TO_EDGE
  xn = xa = rgbaIntToFloat(*id);
  yn = coefn * xn;
  ya = yn;
#endif

  for (int y = h - 1; y >= 0; y--) {
    float4 xc = rgbaIntToFloat(*id);
    float4 yc = a2 * xn + a3 * xa - b1 * yn - b2 * ya;
    xa = xn;
    xn = xc;
    ya = yn;
    yn = yc;
    *od = rgbaFloatToInt(rgbaIntToFloat(*od) + yc);
    id -= w;  // move to previous row
    od -= w;
  }
}

#endif  // #ifndef _RECURSIVEGAUSSIAN_KERNEL_CU_
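
/*
  Host-side sketch of the full separable pipeline (illustrative; the buffer
  names d_src/d_temp/d_dest and the iDivUp helper are assumptions, not part of
  the kernel file above). The recursive Gaussian is separable: filter every
  column, transpose, filter the columns of the transposed image (i.e. the
  original rows), then transpose back. The a0-a3, b1, b2, coefp, coefn values
  computed below are the 0th-order (pure smoothing) Deriche coefficients that
  d_recursiveGaussian_rgba consumes.
*/
#include <math.h>

static int iDivUp(int a, int b) { return (a + b - 1) / b; }

static void recursiveGaussianRGBA(uint *d_src, uint *d_temp, uint *d_dest,
                                  int width, int height, float sigma,
                                  int nthreads = 64) {
  // Deriche coefficients for a 0th-order recursive Gaussian of width sigma.
  const float nsigma = sigma < 0.1f ? 0.1f : sigma;  // guard tiny sigmas
  const float alpha = 1.695f / nsigma;
  const float ema = expf(-alpha);
  const float ema2 = expf(-2.0f * alpha);
  const float b1 = -2.0f * ema;
  const float b2 = ema2;
  const float k =
      (1.0f - ema) * (1.0f - ema) / (1.0f + 2.0f * alpha * ema - ema2);
  const float a0 = k;
  const float a1 = k * (alpha - 1.0f) * ema;
  const float a2 = k * (alpha + 1.0f) * ema;
  const float a3 = -k * ema2;
  // Edge coefficients used by the CLAMP_TO_EDGE initialization in the kernel.
  const float coefp = (a0 + a1) / (1.0f + b1 + b2);
  const float coefn = (a2 + a3) / (1.0f + b1 + b2);

  const dim3 threads(BLOCK_DIM, BLOCK_DIM);

  // Pass 1: filter the image columns.
  d_recursiveGaussian_rgba<<<iDivUp(width, nthreads), nthreads>>>(
      d_src, d_temp, width, height, a0, a1, a2, a3, b1, b2, coefp, coefn);

  // Transpose so the original rows become columns.
  d_transpose<<<dim3(iDivUp(width, BLOCK_DIM), iDivUp(height, BLOCK_DIM)),
                threads>>>(d_dest, d_temp, width, height);

  // Pass 2: filter the transposed image's columns (the original rows).
  d_recursiveGaussian_rgba<<<iDivUp(height, nthreads), nthreads>>>(
      d_dest, d_temp, height, width, a0, a1, a2, a3, b1, b2, coefp, coefn);

  // Transpose back; the final result ends up in d_dest.
  d_transpose<<<dim3(iDivUp(height, BLOCK_DIM), iDivUp(width, BLOCK_DIM)),
                threads>>>(d_dest, d_temp, height, width);
}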