cuda-samples/Samples/dct8x8/dct8x8_kernel_short.cuh

/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
**************************************************************************
* \file dct8x8_kernel_short.cuh
* \brief Contains kernel implementations of DCT and IDCT routines for 16-bit
* integers, used in JPEG internal data processing. Optimized device code.
*
* This code implements the traditional approach to the forward and inverse
* Discrete Cosine Transform applied to 8x8 blocks of image pixels, as in the
* JPEG standard. The data processing is performed using the short data type.
* The routine that performs quantization of coefficients can be found in
* dct8x8_kernel_quantization.cu file.
*/
#pragma once
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include "Common.h"
/**
* Width of data block (short kernel)
*/
#define KERS_BLOCK_WIDTH 32
/**
* Height of data block (short kernel)
*/
#define KERS_BLOCK_HEIGHT 32
/**
* LOG2 of width of data block (short kernel)
*/
#define KERS_BW_LOG2 5
/**
* LOG2 of height of data block (short kernel)
*/
#define KERS_BH_LOG2 5
/**
* Stride of shared memory buffer (short kernel)
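* The 2-element padding makes the row stride 34 shorts (17 32-bit words), an odd
* number of words, so column-wise (strided) accesses spread across different
* shared memory banks.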
*/
#define KERS_SMEMBLOCK_STRIDE (KERS_BLOCK_WIDTH + 2)
/**
* Half of data block width (short kernel)
*/
#define KERS_BLOCK_WIDTH_HALF (KERS_BLOCK_WIDTH / 2)
#define SIN_1_4 0x5A82
#define COS_1_4 0x5A82
#define SIN_1_8 0x30FC
#define COS_1_8 0x7642
#define OSIN_1_16 0x063E
#define OSIN_3_16 0x11C7
#define OSIN_5_16 0x1A9B
#define OSIN_7_16 0x1F63
#define OCOS_1_16 0x1F63
#define OCOS_3_16 0x1A9B
#define OCOS_5_16 0x11C7
#define OCOS_7_16 0x063E
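/*
 * Note: the SIN_ and COS_ constants above are the corresponding trigonometric
 * values in Q15 fixed point (scaled by 2^15), e.g.
 *   SIN_1_4 = sin(pi/4) * 2^15 ~= 23170 = 0x5A82,
 * while the OSIN_ and OCOS_ constants are the same values in Q13 (scaled by
 * 2^13), e.g.
 *   OCOS_1_16 = cos(pi/16) * 2^13 ~= 8035 = 0x1F63,
 * matching the intermediate precision produced by unfixo() below.
 */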
/**
* Packs 2 shorts into 1 int so that shared memory I/O can be performed with
* 32-bit words, avoiding bank conflicts
*/
union PackedShorts {
struct __align__(8) {
short hShort1;
short hShort2;
};
unsigned int hInt;
};
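// Example: a single 32-bit shared memory read fills hInt with two horizontally
// adjacent shorts; hShort1 is the element at the lower address (the low half of
// hInt on the little-endian devices CUDA targets).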
/**
* Converts a fixed-point value to a short value (rounding to nearest)
*/
__device__ inline short unfixh(int x) { return (short)((x + 0x8000) >> 16); }
/**
* Converts a fixed-point value to the intermediate (reduced-precision)
* fixed-point format used for the odd-part rotations
*/
__device__ inline int unfixo(int x) { return (x + 0x1000) >> 13; }
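/*
 * unfixh() adds 0x8000 for round-to-nearest and shifts right by 16, so a
 * fixed-point product with a Q15 constant comes back as roughly value * trig / 2
 * in integer form. unfixo() rounds with 0x1000 and shifts right by 13, leaving
 * the intermediate scaled by 4; the callers compensate by shifting the terms
 * that bypass this rotation left by 2.
 */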
/**
**************************************************************************
* Performs an in-place DCT of a vector of 8 elements (used to access columns
* in shared memory).
*
* \param SrcDst [IN/OUT] - Pointer to the first element of the vector
* \param Stride [IN] - Distance, in elements, between consecutive vector elements
*
* \return None
*/
__device__ void CUDAshortInplaceDCT(short *SrcDst, int Stride) {
int in0, in1, in2, in3, in4, in5, in6, in7;
int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int tmp10, tmp11, tmp12, tmp13;
int tmp14, tmp15, tmp16, tmp17;
int tmp25, tmp26;
int DoubleStride = Stride << 1;
short *DstPtr = SrcDst;
in0 = *DstPtr;
DstPtr += Stride;
in1 = *DstPtr;
DstPtr += Stride;
in2 = *DstPtr;
DstPtr += Stride;
in3 = *DstPtr;
DstPtr += Stride;
in4 = *DstPtr;
DstPtr += Stride;
in5 = *DstPtr;
DstPtr += Stride;
in6 = *DstPtr;
DstPtr += Stride;
in7 = *DstPtr;
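// Stage 1: butterflies (sums and differences) of mirrored input pairs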
tmp0 = in7 + in0;
tmp1 = in6 + in1;
tmp2 = in5 + in2;
tmp3 = in4 + in3;
tmp4 = in3 - in4;
tmp5 = in2 - in5;
tmp6 = in1 - in6;
tmp7 = in0 - in7;
tmp10 = tmp3 + tmp0;
tmp11 = tmp2 + tmp1;
tmp12 = tmp1 - tmp2;
tmp13 = tmp0 - tmp3;
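// Odd part: the sum and difference of tmp5 and tmp6 are scaled by sqrt(1/2) in
// intermediate precision; tmp4 and tmp7 are scaled by 4 to match the unfixo()
// output scale.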
tmp16 = unfixo(FMUL(tmp6 + tmp5, SIN_1_4));
tmp15 = unfixo(FMUL(tmp6 - tmp5, COS_1_4));
tmp4 <<= 2;
tmp7 <<= 2;
tmp14 = tmp4 + tmp15;
tmp25 = tmp4 - tmp15;
tmp26 = tmp7 - tmp16;
tmp17 = tmp7 + tmp16;
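// Even-frequency coefficients are written to rows 0, 2, 4, 6; odd-frequency
// coefficients to rows 1, 3, 5, 7.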
DstPtr = SrcDst;
*DstPtr = unfixh(FMUL(tmp10 + tmp11, SIN_1_4));
DstPtr += DoubleStride;
*DstPtr = unfixh(FMUL(tmp13, COS_1_8) + FMUL(tmp12, SIN_1_8));
DstPtr += DoubleStride;
*DstPtr = unfixh(FMUL(tmp10 - tmp11, COS_1_4));
DstPtr += DoubleStride;
*DstPtr = unfixh(FMUL(tmp13, SIN_1_8) - FMUL(tmp12, COS_1_8));
DstPtr = SrcDst + Stride;
*DstPtr = unfixh(FMUL(tmp17, OCOS_1_16) + FMUL(tmp14, OSIN_1_16));
DstPtr += DoubleStride;
*DstPtr = unfixh(FMUL(tmp26, OCOS_3_16) - FMUL(tmp25, OSIN_3_16));
DstPtr += DoubleStride;
*DstPtr = unfixh(FMUL(tmp26, OCOS_5_16) + FMUL(tmp25, OSIN_5_16));
DstPtr += DoubleStride;
*DstPtr = unfixh(FMUL(tmp17, OCOS_7_16) - FMUL(tmp14, OSIN_7_16));
}
/**
**************************************************************************
* Performs an in-place DCT of a vector of 8 elements (used to access rows in
* shared memory).
*
* \param V8 [IN/OUT] - Pointer to the vector of 8 shorts, accessed as 4 packed
* unsigned ints
*
* \return None
*/
__device__ void CUDAshortInplaceDCT(unsigned int *V8) {
int in0, in1, in2, in3, in4, in5, in6, in7;
int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int tmp10, tmp11, tmp12, tmp13;
int tmp14, tmp15, tmp16, tmp17;
int tmp25, tmp26;
PackedShorts sh0, sh1, sh2, sh3;
sh0.hInt = V8[0];
sh1.hInt = V8[1];
sh2.hInt = V8[2];
sh3.hInt = V8[3];
in0 = sh0.hShort1;
in1 = sh0.hShort2;
in2 = sh1.hShort1;
in3 = sh1.hShort2;
in4 = sh2.hShort1;
in5 = sh2.hShort2;
in6 = sh3.hShort1;
in7 = sh3.hShort2;
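// Same butterfly network as the column-wise DCT above, now applied to a packed row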
tmp0 = in7 + in0;
tmp1 = in6 + in1;
tmp2 = in5 + in2;
tmp3 = in4 + in3;
tmp4 = in3 - in4;
tmp5 = in2 - in5;
tmp6 = in1 - in6;
tmp7 = in0 - in7;
tmp10 = tmp3 + tmp0;
tmp11 = tmp2 + tmp1;
tmp12 = tmp1 - tmp2;
tmp13 = tmp0 - tmp3;
sh0.hShort1 = unfixh(FMUL(tmp10 + tmp11, SIN_1_4));
sh2.hShort1 = unfixh(FMUL(tmp10 - tmp11, COS_1_4));
sh1.hShort1 = unfixh(FMUL(tmp13, COS_1_8) + FMUL(tmp12, SIN_1_8));
sh3.hShort1 = unfixh(FMUL(tmp13, SIN_1_8) - FMUL(tmp12, COS_1_8));
tmp16 = unfixo(FMUL(tmp6 + tmp5, SIN_1_4));
tmp15 = unfixo(FMUL(tmp6 - tmp5, COS_1_4));
tmp4 <<= 2;
tmp7 <<= 2;
tmp14 = tmp4 + tmp15;
tmp25 = tmp4 - tmp15;
tmp26 = tmp7 - tmp16;
tmp17 = tmp7 + tmp16;
sh0.hShort2 = unfixh(FMUL(tmp17, OCOS_1_16) + FMUL(tmp14, OSIN_1_16));
sh3.hShort2 = unfixh(FMUL(tmp17, OCOS_7_16) - FMUL(tmp14, OSIN_7_16));
sh2.hShort2 = unfixh(FMUL(tmp26, OCOS_5_16) + FMUL(tmp25, OSIN_5_16));
sh1.hShort2 = unfixh(FMUL(tmp26, OCOS_3_16) - FMUL(tmp25, OSIN_3_16));
V8[0] = sh0.hInt;
V8[1] = sh1.hInt;
V8[2] = sh2.hInt;
V8[3] = sh3.hInt;
}
/**
**************************************************************************
* Performs an in-place IDCT of a vector of 8 elements (used to access columns
* in shared memory).
*
* \param SrcDst [IN/OUT] - Pointer to the first element of the vector
* \param Stride [IN] - Distance, in elements, between consecutive vector elements
*
* \return None
*/
__device__ void CUDAshortInplaceIDCT(short *SrcDst, int Stride) {
int in0, in1, in2, in3, in4, in5, in6, in7;
int tmp10, tmp11, tmp12, tmp13;
int tmp20, tmp21, tmp22, tmp23;
int tmp30, tmp31;
int tmp40, tmp41, tmp42, tmp43;
int tmp50, tmp51, tmp52, tmp53;
short *DstPtr = SrcDst;
in0 = *DstPtr;
DstPtr += Stride;
in1 = *DstPtr;
DstPtr += Stride;
in2 = *DstPtr;
DstPtr += Stride;
in3 = *DstPtr;
DstPtr += Stride;
in4 = *DstPtr;
DstPtr += Stride;
in5 = *DstPtr;
DstPtr += Stride;
in6 = *DstPtr;
DstPtr += Stride;
in7 = *DstPtr;
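// The even part is built from the even-index inputs (in0, in2, in4, in6); the
// odd-index inputs feed the final butterflies through tmp50..tmp53 below.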
tmp10 = FMUL(in0 + in4, COS_1_4);
tmp11 = FMUL(in0 - in4, COS_1_4);
tmp12 = FMUL(in2, SIN_1_8) - FMUL(in6, COS_1_8);
tmp13 = FMUL(in6, SIN_1_8) + FMUL(in2, COS_1_8);
tmp20 = tmp10 + tmp13;
tmp21 = tmp11 + tmp12;
tmp22 = tmp11 - tmp12;
tmp23 = tmp10 - tmp13;
tmp30 = unfixo(FMUL(in3 + in5, COS_1_4));
tmp31 = unfixo(FMUL(in3 - in5, COS_1_4));
in1 <<= 2;
in7 <<= 2;
tmp40 = in1 + tmp30;
tmp41 = in7 + tmp31;
tmp42 = in1 - tmp30;
tmp43 = in7 - tmp31;
tmp50 = FMUL(tmp40, OCOS_1_16) + FMUL(tmp41, OSIN_1_16);
tmp51 = FMUL(tmp40, OSIN_1_16) - FMUL(tmp41, OCOS_1_16);
tmp52 = FMUL(tmp42, OCOS_5_16) + FMUL(tmp43, OSIN_5_16);
tmp53 = FMUL(tmp42, OSIN_5_16) - FMUL(tmp43, OCOS_5_16);
DstPtr = SrcDst;
*DstPtr = unfixh(tmp20 + tmp50);
DstPtr += Stride;
*DstPtr = unfixh(tmp21 + tmp53);
DstPtr += Stride;
*DstPtr = unfixh(tmp22 + tmp52);
DstPtr += Stride;
*DstPtr = unfixh(tmp23 + tmp51);
DstPtr += Stride;
*DstPtr = unfixh(tmp23 - tmp51);
DstPtr += Stride;
*DstPtr = unfixh(tmp22 - tmp52);
DstPtr += Stride;
*DstPtr = unfixh(tmp21 - tmp53);
DstPtr += Stride;
*DstPtr = unfixh(tmp20 - tmp50);
}
/**
**************************************************************************
* Performs an in-place IDCT of a vector of 8 elements (used to access rows in
* shared memory).
*
* \param V8 [IN/OUT] - Pointer to the vector of 8 shorts, accessed as 4 packed
* unsigned ints
*
* \return None
*/
__device__ void CUDAshortInplaceIDCT(unsigned int *V8) {
int in0, in1, in2, in3, in4, in5, in6, in7;
int tmp10, tmp11, tmp12, tmp13;
int tmp20, tmp21, tmp22, tmp23;
int tmp30, tmp31;
int tmp40, tmp41, tmp42, tmp43;
int tmp50, tmp51, tmp52, tmp53;
PackedShorts sh0, sh1, sh2, sh3;
sh0.hInt = V8[0];
sh1.hInt = V8[1];
sh2.hInt = V8[2];
sh3.hInt = V8[3];
in0 = sh0.hShort1;
in1 = sh0.hShort2;
in2 = sh1.hShort1;
in3 = sh1.hShort2;
in4 = sh2.hShort1;
in5 = sh2.hShort2;
in6 = sh3.hShort1;
in7 = sh3.hShort2;
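// Same IDCT butterfly network as the column-wise version above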
tmp10 = FMUL(in0 + in4, COS_1_4);
tmp11 = FMUL(in0 - in4, COS_1_4);
tmp12 = FMUL(in2, SIN_1_8) - FMUL(in6, COS_1_8);
tmp13 = FMUL(in6, SIN_1_8) + FMUL(in2, COS_1_8);
tmp20 = tmp10 + tmp13;
tmp21 = tmp11 + tmp12;
tmp22 = tmp11 - tmp12;
tmp23 = tmp10 - tmp13;
tmp30 = unfixo(FMUL(in3 + in5, COS_1_4));
tmp31 = unfixo(FMUL(in3 - in5, COS_1_4));
in1 <<= 2;
in7 <<= 2;
tmp40 = in1 + tmp30;
tmp41 = in7 + tmp31;
tmp42 = in1 - tmp30;
tmp43 = in7 - tmp31;
tmp50 = FMUL(tmp40, OCOS_1_16) + FMUL(tmp41, OSIN_1_16);
tmp51 = FMUL(tmp40, OSIN_1_16) - FMUL(tmp41, OCOS_1_16);
tmp52 = FMUL(tmp42, OCOS_5_16) + FMUL(tmp43, OSIN_5_16);
tmp53 = FMUL(tmp42, OSIN_5_16) - FMUL(tmp43, OCOS_5_16);
sh0.hShort1 = unfixh(tmp20 + tmp50);
sh0.hShort2 = unfixh(tmp21 + tmp53);
sh1.hShort1 = unfixh(tmp22 + tmp52);
sh1.hShort2 = unfixh(tmp23 + tmp51);
sh2.hShort1 = unfixh(tmp23 - tmp51);
sh2.hShort2 = unfixh(tmp22 - tmp52);
sh3.hShort1 = unfixh(tmp21 - tmp53);
sh3.hShort2 = unfixh(tmp20 - tmp50);
V8[0] = sh0.hInt;
V8[1] = sh1.hInt;
V8[2] = sh2.hInt;
V8[3] = sh3.hInt;
}
/**
**************************************************************************
* Performs an 8x8 block-wise Forward Discrete Cosine Transform of the given
* image plane and writes the resulting coefficients back in place. Short
* (16-bit integer) implementation.
* The kernel processes the image in tiles of 8x8 blocks sized to keep warps
* fully occupied, assigning 8 threads to each 8x8 block.
*
* \param SrcDst [IN/OUT] - Image plane on input, coefficients plane on output
* \param ImgStride [IN] - Stride of SrcDst, in elements
*
* \return None
*/
#define IMAD(a, b, c) (((a) * (b)) + (c))
#define IMUL(a, b) ((a) * (b))
__global__ void CUDAkernelShortDCT(short *SrcDst, int ImgStride) {
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ short block[KERS_BLOCK_HEIGHT * KERS_SMEMBLOCK_STRIDE];
int OffsThreadInRow = FMUL(threadIdx.y, BLOCK_SIZE) + threadIdx.x;
int OffsThreadInCol = FMUL(threadIdx.z, BLOCK_SIZE);
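// Rotate the low 5 bits of the row offset left by one so that neighboring
// threads work on interleaved columns during the column-wise pass, which helps
// avoid shared memory bank conflicts.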
int OffsThrRowPermuted =
(OffsThreadInRow & 0xFFFFFFE0) |
((OffsThreadInRow << 1) | (OffsThreadInRow >> 4) & 0x1) & 0x1F;
SrcDst +=
IMAD(IMAD(blockIdx.y, KERS_BLOCK_HEIGHT, OffsThreadInCol), ImgStride,
IMAD(blockIdx.x, KERS_BLOCK_WIDTH, OffsThreadInRow * 2));
short *bl_ptr =
block + IMAD(OffsThreadInCol, KERS_SMEMBLOCK_STRIDE, OffsThreadInRow * 2);
// Load data into shared memory: only the first half of the threads in each
// row moves data, and each of those threads moves 2 shorts (one 32-bit word).
if (OffsThreadInRow < KERS_BLOCK_WIDTH_HALF) {
#pragma unroll
for (int i = 0; i < BLOCK_SIZE; i++)
((int *)bl_ptr)[i * (KERS_SMEMBLOCK_STRIDE / 2)] =
((int *)SrcDst)[i * (ImgStride / 2)];
}
cg::sync(cta);
CUDAshortInplaceDCT(
block + OffsThreadInCol * KERS_SMEMBLOCK_STRIDE + OffsThrRowPermuted,
KERS_SMEMBLOCK_STRIDE);
cg::sync(cta);
CUDAshortInplaceDCT((unsigned int *)(block +
OffsThreadInRow * KERS_SMEMBLOCK_STRIDE +
OffsThreadInCol));
cg::sync(cta);
// Store data to global memory: only the first half of the threads in each
// row moves data, and each of those threads moves 2 shorts (one 32-bit word).
if (OffsThreadInRow < KERS_BLOCK_WIDTH_HALF) {
#pragma unroll
for (int i = 0; i < BLOCK_SIZE; i++)
((int *)SrcDst)[i * (ImgStride / 2)] =
((int *)bl_ptr)[i * (KERS_SMEMBLOCK_STRIDE / 2)];
}
}
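/*
 * A minimal host-side launch sketch (not part of this header; the names d_plane,
 * StrideShort, ImgWidth and ImgHeight are hypothetical). The indexing above
 * implies an 8 x 4 x 4 thread block covering one 32x32 tile of 8x8 blocks:
 *
 *   dim3 threads(BLOCK_SIZE, KERS_BLOCK_WIDTH / BLOCK_SIZE,
 *                KERS_BLOCK_HEIGHT / BLOCK_SIZE);
 *   dim3 grid(ImgWidth / KERS_BLOCK_WIDTH, ImgHeight / KERS_BLOCK_HEIGHT);
 *   CUDAkernelShortDCT<<<grid, threads>>>(d_plane, StrideShort);
 *
 * assuming ImgWidth and ImgHeight are multiples of 32 and StrideShort is the
 * pitch of the short plane in elements (even, so the 32-bit accesses stay
 * aligned). The same geometry applies to CUDAkernelShortIDCT below.
 */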
/**
**************************************************************************
* Performs an 8x8 block-wise Inverse Discrete Cosine Transform of the given
* plane of coefficients and writes the reconstructed image samples back in
* place. Short (16-bit integer) implementation.
* The kernel processes the image in tiles of 8x8 blocks sized to keep warps
* fully occupied, assigning 8 threads to each 8x8 block.
*
* \param SrcDst [IN/OUT] - Coefficients plane on input, image plane on output
* \param ImgStride [IN] - Stride of SrcDst, in elements
*
* \return None
*/
__global__ void CUDAkernelShortIDCT(short *SrcDst, int ImgStride) {
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ short block[KERS_BLOCK_HEIGHT * KERS_SMEMBLOCK_STRIDE];
int OffsThreadInRow = IMAD(threadIdx.y, BLOCK_SIZE, threadIdx.x);
int OffsThreadInCol = IMUL(threadIdx.z, BLOCK_SIZE);
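// Rotate the low 5 bits of the row offset, as in CUDAkernelShortDCT above, to
// interleave the columns processed by neighboring threads.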
int OffsThrRowPermuted =
(OffsThreadInRow & 0xFFFFFFE0) |
((OffsThreadInRow << 1) | (OffsThreadInRow >> 4) & 0x1) & 0x1F;
SrcDst +=
IMAD(IMAD(blockIdx.y, KERS_BLOCK_HEIGHT, OffsThreadInCol), ImgStride,
IMAD(blockIdx.x, KERS_BLOCK_WIDTH, OffsThreadInRow * 2));
short *bl_ptr =
block + IMAD(OffsThreadInCol, KERS_SMEMBLOCK_STRIDE, OffsThreadInRow * 2);
// Load data into shared memory: only the first half of the threads in each
// row moves data, and each of those threads moves 2 shorts (one 32-bit word).
if (OffsThreadInRow < KERS_BLOCK_WIDTH_HALF) {
#pragma unroll
for (int i = 0; i < BLOCK_SIZE; i++)
((int *)bl_ptr)[i * (KERS_SMEMBLOCK_STRIDE / 2)] =
((int *)SrcDst)[i * (ImgStride / 2)];
}
cg::sync(cta);
CUDAshortInplaceIDCT(
block + OffsThreadInCol * KERS_SMEMBLOCK_STRIDE + OffsThrRowPermuted,
KERS_SMEMBLOCK_STRIDE);
cg::sync(cta);
CUDAshortInplaceIDCT(
(unsigned int *)(block + OffsThreadInRow * KERS_SMEMBLOCK_STRIDE +
OffsThreadInCol));
cg::sync(cta);
// Store data to global memory: only the first half of the threads in each
// row moves data, and each of those threads moves 2 shorts (one 32-bit word).
if (OffsThreadInRow < KERS_BLOCK_WIDTH_HALF) {
#pragma unroll
for (int i = 0; i < BLOCK_SIZE; i++)
((int *)SrcDst)[i * (ImgStride / 2)] =
((int *)bl_ptr)[i * (KERS_SMEMBLOCK_STRIDE / 2)];
}
}