cuda-samples/Samples/5_Domain_Specific/bicubicTexture/bicubicTexture_kernel.cuh

/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
    Bicubic filtering
    See GPU Gems 2: "Fast Third-Order Texture Filtering", Sigg & Hadwiger
    https://developer.nvidia.com/gpugems/gpugems2/part-iii-high-quality-rendering/chapter-20-fast-third-order-texture-filtering

    Reformulation thanks to Keenan Crane
*/

#ifndef _BICUBICTEXTURE_KERNEL_CUH_
#define _BICUBICTEXTURE_KERNEL_CUH_

// Identifiers for the available filtering modes. The values are spelled out
// explicitly; they are the same 0..4 numbering the enumerators receive by
// declaration order.
enum Mode {
  MODE_NEAREST = 0,
  MODE_BILINEAR = 1,
  MODE_BICUBIC = 2,
  MODE_FAST_BICUBIC = 3,
  MODE_CATROM = 4
};

// Texture object handles with static storage duration. Nothing in this file
// assigns them; presumably the host code creates one with point filtering and
// one with linear filtering - confirm against the host-side texture setup.
// NOTE(review): these are definitions, not declarations, so including this
// header from more than one translation unit would define them twice.
cudaTextureObject_t texObjPoint;
cudaTextureObject_t texObjLinear;

// w0, w1, w2, and w3 are the four cubic B-spline basis functions
__host__ __device__ float w0(float a) {
  // Expanded form: (1/6) * (-a^3 + 3a^2 - 3a + 1).
  // Evaluated with nested multiplications to keep the operation count low.
  const float inner = a * (-a + 3.0f) - 3.0f;
  const float poly = a * inner + 1.0f;
  return (1.0f / 6.0f) * poly;
}

__host__ __device__ float w1(float a) {
  // Expanded form: (1/6) * (3a^3 - 6a^2 + 4).
  const float aSquared = a * a;
  const float linear = 3.0f * a - 6.0f;
  return (1.0f / 6.0f) * (aSquared * linear + 4.0f);
}

__host__ __device__ float w2(float a) {
  // Expanded form: (1/6) * (-3a^3 + 3a^2 + 3a + 1), built up one nesting
  // level at a time.
  float poly = -3.0f * a + 3.0f;
  poly = a * poly + 3.0f;
  poly = a * poly + 1.0f;
  return (1.0f / 6.0f) * poly;
}

__host__ __device__ float w3(float a) {
  // (1/6) * a^3
  const float cube = a * a * a;
  return (1.0f / 6.0f) * cube;
}

// g0 and g1 are the two amplitude functions
__device__ float g0(float a) {
  // Sum of the first two basis weights.
  const float first = w0(a);
  const float second = w1(a);
  return first + second;
}

__device__ float g1(float a) {
  // Sum of the last two basis weights.
  const float third = w2(a);
  const float fourth = w3(a);
  return third + fourth;
}

// h0 and h1 are the two offset functions
__device__ float h0(float a) {
  // Share of w1 within the (w0 + w1) pair.
  const float share = w1(a) / (w0(a) + w1(a));
  // The trailing +0.5 compensates for the CUDA linear filtering convention.
  return -1.0f + share + 0.5f;
}

__device__ float h1(float a) {
  // Share of w3 within the (w2 + w3) pair.
  const float share = w3(a) / (w2(a) + w3(a));
  // Same +0.5 compensation that h0 applies.
  return 1.0f + share + 0.5f;
}

// Blend four values with the cubic B-spline weights w0..w3 evaluated at x.
// T must support multiplication by float and operator+=.
template <class T>
__device__ T cubicFilter(float x, T c0, T c1, T c2, T c3) {
  const float k0 = w0(x);
  const float k1 = w1(x);
  const float k2 = w2(x);
  const float k3 = w3(x);

  // Accumulate in the order c0, c1, c2, c3.
  T acc = c0 * k0;
  acc += c1 * k1;
  acc += c2 * k2;
  acc += c3 * k3;
  return acc;
}

// Bicubic B-spline lookup built from 16 individual texture fetches (slow but
// precise).
//
// T is the texture data type (not used in the body); R is the type requested
// from tex2D and also the return type.
// Fetches are issued at whole-number coordinates around floor(x - 0.5),
// floor(y - 0.5), so this presumably expects a point-sampled texture with
// non-normalized coordinates - confirm against the host-side texture setup.
template <class T, class R>  // texture data type, return type
__device__ R tex2DBicubic(const cudaTextureObject_t tex, float x, float y) {
  x -= 0.5f;
  y -= 0.5f;
  const float baseX = floorf(x);
  const float baseY = floorf(y);
  const float fracX = x - baseX;
  const float fracY = y - baseY;

  // Coordinates of the four columns and four rows of the 4x4 neighbourhood.
  const float colA = baseX - 1.0f;
  const float colB = baseX;
  const float colC = baseX + 1.0f;
  const float colD = baseX + 2.0f;
  const float rowA = baseY - 1.0f;
  const float rowB = baseY;
  const float rowC = baseY + 1.0f;
  const float rowD = baseY + 2.0f;

  // Filter each row horizontally ...
  const R filteredA =
      cubicFilter<R>(fracX, tex2D<R>(tex, colA, rowA), tex2D<R>(tex, colB, rowA),
                     tex2D<R>(tex, colC, rowA), tex2D<R>(tex, colD, rowA));
  const R filteredB =
      cubicFilter<R>(fracX, tex2D<R>(tex, colA, rowB), tex2D<R>(tex, colB, rowB),
                     tex2D<R>(tex, colC, rowB), tex2D<R>(tex, colD, rowB));
  const R filteredC =
      cubicFilter<R>(fracX, tex2D<R>(tex, colA, rowC), tex2D<R>(tex, colB, rowC),
                     tex2D<R>(tex, colC, rowC), tex2D<R>(tex, colD, rowC));
  const R filteredD =
      cubicFilter<R>(fracX, tex2D<R>(tex, colA, rowD), tex2D<R>(tex, colB, rowD),
                     tex2D<R>(tex, colC, rowD), tex2D<R>(tex, colD, rowD));

  // ... then filter the four row results vertically.
  return cubicFilter<R>(fracY, filteredA, filteredB, filteredC, filteredD);
}

// Fast bicubic B-spline lookup using 4 bilinear lookups instead of 16 point
// lookups.
//
// Assumes the texture uses non-normalized coordinates and LINEAR filtering:
// the four fetches are issued at fractional offsets (h0/h1) so that each
// hardware-filtered fetch already blends a pair of neighbouring texels, and
// h0/h1 include a +0.5 term for the CUDA linear filtering convention. With a
// point-sampled texture the fractional offsets would be discarded and the
// result would not be a bicubic filter.
//
// T is the texture data type (not used in the body); R is the type requested
// from tex2D and also the return type.
template <class T, class R>  // texture data type, return type
__device__ R tex2DFastBicubic(const cudaTextureObject_t tex, float x, float y) {
  x -= 0.5f;
  y -= 0.5f;
  float px = floorf(x);
  float py = floorf(y);
  float fx = x - px;
  float fy = y - py;

  // note: we could store these functions in a lookup table texture, but maths
  // is cheap
  // Amplitudes (g) and sampling offsets (h) along each axis.
  float g0x = g0(fx);
  float g1x = g1(fx);
  float h0x = h0(fx);
  float h1x = h1(fx);
  float g0y = g0(fy);
  float g1y = g1(fy);
  float h0y = h0(fy);
  float h1y = h1(fy);

  // Weighted sum of the four filtered fetches: blend horizontally within each
  // row pair, then blend the two results vertically.
  R r = g0y * (g0x * tex2D<R>(tex, px + h0x, py + h0y) +
               g1x * tex2D<R>(tex, px + h1x, py + h0y)) +
        g1y * (g0x * tex2D<R>(tex, px + h0x, py + h1y) +
               g1x * tex2D<R>(tex, px + h1x, py + h1y));
  return r;
}

// Bilinear lookup computed manually from four texture fetches and lerp()
// (described in the original as a higher-precision 2D bilinear lookup).
//
// T is the texture data type (not used in the body); R is the type requested
// from tex2D and also the return type. lerp() is not defined in this file.
template <class T, class R>  // texture data type, return type
__device__ R tex2DBilinear(const cudaTextureObject_t tex, float x, float y) {
  x -= 0.5f;
  y -= 0.5f;
  const float wholeX = floorf(x);  // integer position
  const float wholeY = floorf(y);
  const float fracX = x - wholeX;  // fractional position
  const float fracY = y - wholeY;

  // Fetch coordinates: integer position plus 0.5, and one unit further.
  const float nearX = wholeX + 0.5f;
  const float nearY = wholeY + 0.5f;
  const float farX = nearX + 1.0f;
  const float farY = nearY + 1.0f;

  // Blend horizontally on each of the two rows, then vertically.
  const R upper =
      lerp(tex2D<R>(tex, nearX, nearY), tex2D<R>(tex, farX, nearY), fracX);
  const R lower =
      lerp(tex2D<R>(tex, nearX, farY), tex2D<R>(tex, farX, farY), fracX);
  return lerp(upper, lower, fracY);
}

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 200

/*
  bilinear 2D texture lookup using tex2Dgather function
  - tex2Dgather() returns the four neighbouring samples in a single texture
   lookup
  - it requires compute capability 2.0 (Fermi) or later
  - you can select which component to fetch using the "comp" parameter
  - it can be used to efficiently implement custom texture filters

  The samples are returned in a 4-vector in the following order:
  x: (0, 1)
  y: (1, 1)
  z: (1, 0)
  w: (0, 0)
*/

// Bilinear lookup built on a single tex2Dgather fetch.
// T is the texture data type (not used in the body); R is the 4-component
// type returned by tex2Dgather. comp selects which component is gathered.
// The result is always returned as float.
template <class T, class R>  // texture data type, return type
__device__ float tex2DBilinearGather(const cudaTextureObject_t tex, float x,
                                     float y, int comp = 0) {
  x -= 0.5f;
  y -= 0.5f;
  const float wholeX = floorf(x);  // integer position
  const float wholeY = floorf(y);
  const float fracX = x - wholeX;  // fractional position
  const float fracY = y - wholeY;

  const R gathered = tex2Dgather<R>(tex, wholeX + 0.5f, wholeY + 0.5f, comp);

  // Sample order is w:(0,0), z:(1,0), x:(0,1), y:(1,1).
  const float row0 = lerp((float)gathered.w, (float)gathered.z, fracX);
  const float row1 = lerp((float)gathered.x, (float)gathered.y, fracX);
  return lerp(row0, row1, fracY);
}

#endif

// Catmull-Rom interpolation

__host__ __device__ float catrom_w0(float a) {
  // Expanded form: -0.5a + a^2 - 0.5a^3
  const float inner = 1.0f - 0.5f * a;
  const float middle = -0.5f + a * inner;
  return a * middle;
}

__host__ __device__ float catrom_w1(float a) {
  // Expanded form: 1 - 2.5a^2 + 1.5a^3
  const float aSquared = a * a;
  const float linear = -2.5f + 1.5f * a;
  return 1.0f + aSquared * linear;
}

__host__ __device__ float catrom_w2(float a) {
  // Expanded form: 0.5a + 2a^2 - 1.5a^3
  const float inner = 2.0f - 1.5f * a;
  const float middle = 0.5f + a * inner;
  return a * middle;
}

__host__ __device__ float catrom_w3(float a) {
  // Expanded form: -0.5a^2 + 0.5a^3
  const float aSquared = a * a;
  const float linear = -0.5f + 0.5f * a;
  return aSquared * linear;
}

// Blend four values with the Catmull-Rom weights catrom_w0..catrom_w3
// evaluated at x. T must support multiplication by float and operator+=.
template <class T>
__device__ T catRomFilter(float x, T c0, T c1, T c2, T c3) {
  const float k0 = catrom_w0(x);
  const float k1 = catrom_w1(x);
  const float k2 = catrom_w2(x);
  const float k3 = catrom_w3(x);

  // Accumulate in the order c0, c1, c2, c3.
  T acc = c0 * k0;
  acc += c1 * k1;
  acc += c2 * k2;
  acc += c3 * k3;
  return acc;
}

// Catmull-Rom lookup built from 16 individual texture fetches.
// Note - can't use bilinear trick here because of negative lobes
//
// T is the texture data type (not used in the body); R is the type requested
// from tex2D and also the return type.
// Fetches are issued at whole-number coordinates around floor(x - 0.5),
// floor(y - 0.5), so this presumably expects a point-sampled texture with
// non-normalized coordinates - confirm against the host-side texture setup.
template <class T, class R>  // texture data type, return type
__device__ R tex2DCatRom(const cudaTextureObject_t tex, float x, float y) {
  x -= 0.5f;
  y -= 0.5f;
  const float baseX = floorf(x);
  const float baseY = floorf(y);
  const float fracX = x - baseX;
  const float fracY = y - baseY;

  // Coordinates of the four columns and four rows of the 4x4 neighbourhood.
  const float colA = baseX - 1.0f;
  const float colB = baseX;
  const float colC = baseX + 1.0f;
  const float colD = baseX + 2.0f;
  const float rowA = baseY - 1.0f;
  const float rowB = baseY;
  const float rowC = baseY + 1.0f;
  const float rowD = baseY + 2.0f;

  // Filter each row horizontally ...
  const R filteredA =
      catRomFilter<R>(fracX, tex2D<R>(tex, colA, rowA), tex2D<R>(tex, colB, rowA),
                      tex2D<R>(tex, colC, rowA), tex2D<R>(tex, colD, rowA));
  const R filteredB =
      catRomFilter<R>(fracX, tex2D<R>(tex, colA, rowB), tex2D<R>(tex, colB, rowB),
                      tex2D<R>(tex, colC, rowB), tex2D<R>(tex, colD, rowB));
  const R filteredC =
      catRomFilter<R>(fracX, tex2D<R>(tex, colA, rowC), tex2D<R>(tex, colB, rowC),
                      tex2D<R>(tex, colC, rowC), tex2D<R>(tex, colD, rowC));
  const R filteredD =
      catRomFilter<R>(fracX, tex2D<R>(tex, colA, rowD), tex2D<R>(tex, colB, rowD),
                      tex2D<R>(tex, colC, rowD), tex2D<R>(tex, colD, rowD));

  // ... then filter the four row results vertically.
  return catRomFilter<R>(fracY, filteredA, filteredB, filteredC, filteredD);
}

// test functions

// Render the image using a normal (hardware) bilinear texture lookup: one
// tex2D fetch per output pixel.
//
// Launch layout: 2D grid of 2D blocks, one thread per output pixel; threads
// that fall outside width x height do nothing.
//
//   d_output       output image, written at index y * width + x
//   width, height  output size in pixels
//   tx, ty         translation added to the sampling position
//   scale          zoom factor applied about (cx, cy)
//   cx, cy         centre of the zoom
//   texObj         source texture; the fetched value is multiplied by 255, so
//                  it is presumably read as normalized float in [0, 1] -
//                  confirm against the host-side texture setup
__global__ void d_render(uchar4 *d_output, uint width, uint height, float tx,
                         float ty, float scale, float cx, float cy,
                         cudaTextureObject_t texObj) {
  // Plain 32-bit multiplies: __umul24 only uses the low 24 bits of each
  // operand and is emulated (slower) on compute capability 2.x and later.
  const uint x = blockIdx.x * blockDim.x + threadIdx.x;
  const uint y = blockIdx.y * blockDim.y + threadIdx.y;

  if ((x >= width) || (y >= height)) {
    return;
  }

  const float u = (x - cx) * scale + cx + tx;
  const float v = (y - cy) * scale + cy + ty;

  // write output color
  const float c = tex2D<float>(texObj, u, v);
  // float c = tex2DBilinear<uchar, float>(tex, u, v);
  // float c = tex2DBilinearGather<uchar, uchar4>(tex2, u, v, 0) / 255.0f;

  // Index in size_t so y * width cannot wrap for very large images.
  const size_t i = (size_t)y * width + x;
  d_output[i] = make_uchar4(c * 0xff, c * 0xff, c * 0xff, 0);
}

// Render the image using the 16-fetch bicubic lookup (tex2DBicubic).
//
// Launch layout: 2D grid of 2D blocks, one thread per output pixel; threads
// that fall outside width x height do nothing.
//
//   d_output       output image, written at index y * width + x
//   width, height  output size in pixels
//   tx, ty         translation added to the sampling position
//   scale          zoom factor applied about (cx, cy)
//   cx, cy         centre of the zoom
//   texObj         source texture, passed straight to tex2DBicubic
__global__ void d_renderBicubic(uchar4 *d_output, uint width, uint height,
                                float tx, float ty, float scale, float cx,
                                float cy, cudaTextureObject_t texObj) {
  // Plain 32-bit multiplies: __umul24 only uses the low 24 bits of each
  // operand and is emulated (slower) on compute capability 2.x and later.
  const uint x = blockIdx.x * blockDim.x + threadIdx.x;
  const uint y = blockIdx.y * blockDim.y + threadIdx.y;

  if ((x >= width) || (y >= height)) {
    return;
  }

  const float u = (x - cx) * scale + cx + tx;
  const float v = (y - cy) * scale + cy + ty;

  // write output color
  const float c = tex2DBicubic<uchar, float>(texObj, u, v);

  // Index in size_t so y * width cannot wrap for very large images.
  const size_t i = (size_t)y * width + x;
  d_output[i] = make_uchar4(c * 0xff, c * 0xff, c * 0xff, 0);
}

// Render the image using the 4-fetch fast bicubic lookup (tex2DFastBicubic).
//
// Launch layout: 2D grid of 2D blocks, one thread per output pixel; threads
// that fall outside width x height do nothing.
//
//   d_output       output image, written at index y * width + x
//   width, height  output size in pixels
//   tx, ty         translation added to the sampling position
//   scale          zoom factor applied about (cx, cy)
//   cx, cy         centre of the zoom
//   texObj         source texture, passed straight to tex2DFastBicubic
__global__ void d_renderFastBicubic(uchar4 *d_output, uint width, uint height,
                                    float tx, float ty, float scale, float cx,
                                    float cy, cudaTextureObject_t texObj) {
  // Plain 32-bit multiplies: __umul24 only uses the low 24 bits of each
  // operand and is emulated (slower) on compute capability 2.x and later.
  const uint x = blockIdx.x * blockDim.x + threadIdx.x;
  const uint y = blockIdx.y * blockDim.y + threadIdx.y;

  if ((x >= width) || (y >= height)) {
    return;
  }

  const float u = (x - cx) * scale + cx + tx;
  const float v = (y - cy) * scale + cy + ty;

  // write output color
  const float c = tex2DFastBicubic<uchar, float>(texObj, u, v);

  // Index in size_t so y * width cannot wrap for very large images.
  const size_t i = (size_t)y * width + x;
  d_output[i] = make_uchar4(c * 0xff, c * 0xff, c * 0xff, 0);
}

// Render the image using the Catmull-Rom lookup (tex2DCatRom).
//
// Launch layout: 2D grid of 2D blocks, one thread per output pixel; threads
// that fall outside width x height do nothing.
//
//   d_output       output image, written at index y * width + x
//   width, height  output size in pixels
//   tx, ty         translation added to the sampling position
//   scale          zoom factor applied about (cx, cy)
//   cx, cy         centre of the zoom
//   texObj         source texture, passed straight to tex2DCatRom
__global__ void d_renderCatRom(uchar4 *d_output, uint width, uint height,
                               float tx, float ty, float scale, float cx,
                               float cy, cudaTextureObject_t texObj) {
  // Plain 32-bit multiplies: __umul24 only uses the low 24 bits of each
  // operand and is emulated (slower) on compute capability 2.x and later.
  const uint x = blockIdx.x * blockDim.x + threadIdx.x;
  const uint y = blockIdx.y * blockDim.y + threadIdx.y;

  if ((x >= width) || (y >= height)) {
    return;
  }

  const float u = (x - cx) * scale + cx + tx;
  const float v = (y - cy) * scale + cy + ty;

  // The Catmull-Rom weights have negative lobes, so the filtered value can
  // fall below 0 or rise above 1. Clamp to [0, 1] before scaling: converting
  // an out-of-range float to unsigned char is undefined behaviour.
  const float c = __saturatef(tex2DCatRom<uchar, float>(texObj, u, v));

  // write output color
  // Index in size_t so y * width cannot wrap for very large images.
  const size_t i = (size_t)y * width + x;
  d_output[i] = make_uchar4(c * 0xff, c * 0xff, c * 0xff, 0);
}

#endif  // _BICUBICTEXTURE_KERNEL_CUH_
add and update samples for CUDA 11.6 2022-01-13 14:05:24 +08:00			`/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.`
add and update samples for CUDA 11.5 2021-10-21 19:04:49 +08:00			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* * Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* * Neither the name of NVIDIA CORPORATION nor the names of its`
			`* contributors may be used to endorse or promote products derived`
			`* from this software without specific prior written permission.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
			`* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*/`

			`/*`
			`Bicubic filtering`
			`See GPU Gems 2: "Fast Third-Order Texture Filtering", Sigg & Hadwiger`
			`https://developer.nvidia.com/gpugems/gpugems2/part-iii-high-quality-rendering/chapter-20-fast-third-order-texture-filtering`

			`Reformulation thanks to Keenan Crane`
			`*/`

			`#ifndef _BICUBICTEXTURE_KERNEL_CUH_`
			`#define _BICUBICTEXTURE_KERNEL_CUH_`

			`enum Mode {`
			`MODE_NEAREST,`
			`MODE_BILINEAR,`
			`MODE_BICUBIC,`
			`MODE_FAST_BICUBIC,`
			`MODE_CATROM`
			`};`

			`cudaTextureObject_t texObjPoint, texObjLinear;`

			`// w0, w1, w2, and w3 are the four cubic B-spline basis functions`
			`__host__ __device__ float w0(float a) {`
			`// return (1.0f/6.0f)(-aaa + 3.0faa - 3.0fa + 1.0f);`
			`return (1.0f / 6.0f) * (a * (a * (-a + 3.0f) - 3.0f) + 1.0f); // optimized`
			`}`

			`__host__ __device__ float w1(float a) {`
			`// return (1.0f/6.0f)(3.0faaa - 6.0faa + 4.0f);`
			`return (1.0f / 6.0f) * (a * a * (3.0f * a - 6.0f) + 4.0f);`
			`}`

			`__host__ __device__ float w2(float a) {`
			`// return (1.0f/6.0f)(-3.0faaa + 3.0faa + 3.0f*a + 1.0f);`
			`return (1.0f / 6.0f) * (a * (a * (-3.0f * a + 3.0f) + 3.0f) + 1.0f);`
			`}`

			`__host__ __device__ float w3(float a) { return (1.0f / 6.0f) * (a * a * a); }`

			`// g0 and g1 are the two amplitude functions`
			`__device__ float g0(float a) { return w0(a) + w1(a); }`

			`__device__ float g1(float a) { return w2(a) + w3(a); }`

			`// h0 and h1 are the two offset functions`
			`__device__ float h0(float a) {`
			`// note +0.5 offset to compensate for CUDA linear filtering convention`
			`return -1.0f + w1(a) / (w0(a) + w1(a)) + 0.5f;`
			`}`

			`__device__ float h1(float a) { return 1.0f + w3(a) / (w2(a) + w3(a)) + 0.5f; }`

			`// filter 4 values using cubic splines`
			`template <class T>`
			`__device__ T cubicFilter(float x, T c0, T c1, T c2, T c3) {`
			`T r;`
			`r = c0 * w0(x);`
			`r += c1 * w1(x);`
			`r += c2 * w2(x);`
			`r += c3 * w3(x);`
			`return r;`
			`}`

			`// slow but precise bicubic lookup using 16 texture lookups`
			`template <class T, class R> // texture data type, return type`
			`__device__ R tex2DBicubic(const cudaTextureObject_t tex, float x, float y) {`
			`x -= 0.5f;`
			`y -= 0.5f;`
			`float px = floorf(x);`
			`float py = floorf(y);`
			`float fx = x - px;`
			`float fy = y - py;`

			`return cubicFilter<R>(`
			`fy, cubicFilter<R>(`
			`fx, tex2D<R>(tex, px - 1, py - 1), tex2D<R>(tex, px, py - 1),`
			`tex2D<R>(tex, px + 1, py - 1), tex2D<R>(tex, px + 2, py - 1)),`
			`cubicFilter<R>(fx, tex2D<R>(tex, px - 1, py), tex2D<R>(tex, px, py),`
			`tex2D<R>(tex, px + 1, py), tex2D<R>(tex, px + 2, py)),`
			`cubicFilter<R>(fx, tex2D<R>(tex, px - 1, py + 1),`
			`tex2D<R>(tex, px, py + 1), tex2D<R>(tex, px + 1, py + 1),`
			`tex2D<R>(tex, px + 2, py + 1)),`
			`cubicFilter<R>(fx, tex2D<R>(tex, px - 1, py + 2),`
			`tex2D<R>(tex, px, py + 2), tex2D<R>(tex, px + 1, py + 2),`
			`tex2D<R>(tex, px + 2, py + 2)));`
			`}`

			`// fast bicubic texture lookup using 4 bilinear lookups`
			`// assumes texture is set to non-normalized coordinates, point sampling`
			`template <class T, class R> // texture data type, return type`
			`__device__ R tex2DFastBicubic(const cudaTextureObject_t tex, float x, float y) {`
			`x -= 0.5f;`
			`y -= 0.5f;`
			`float px = floorf(x);`
			`float py = floorf(y);`
			`float fx = x - px;`
			`float fy = y - py;`

			`// note: we could store these functions in a lookup table texture, but maths`
			`// is cheap`
			`float g0x = g0(fx);`
			`float g1x = g1(fx);`
			`float h0x = h0(fx);`
			`float h1x = h1(fx);`
			`float h0y = h0(fy);`
			`float h1y = h1(fy);`

			`R r = g0(fy) * (g0x * tex2D<R>(tex, px + h0x, py + h0y) +`
			`g1x * tex2D<R>(tex, px + h1x, py + h0y)) +`
			`g1(fy) * (g0x * tex2D<R>(tex, px + h0x, py + h1y) +`
			`g1x * tex2D<R>(tex, px + h1x, py + h1y));`
			`return r;`
			`}`

			`// higher-precision 2D bilinear lookup`
			`template <class T, class R> // texture data type, return type`
			`__device__ R tex2DBilinear(const cudaTextureObject_t tex, float x, float y) {`
			`x -= 0.5f;`
			`y -= 0.5f;`
			`float px = floorf(x); // integer position`
			`float py = floorf(y);`
			`float fx = x - px; // fractional position`
			`float fy = y - py;`
			`px += 0.5f;`
			`py += 0.5f;`

			`return lerp(lerp(tex2D<R>(tex, px, py), tex2D<R>(tex, px + 1.0f, py), fx),`
			`lerp(tex2D<R>(tex, px, py + 1.0f),`
			`tex2D<R>(tex, px + 1.0f, py + 1.0f), fx),`
			`fy);`
			`}`

			`#if !defined(__CUDA_ARCH__) \|\| __CUDA_ARCH__ >= 200`

			`/*`
			`bilinear 2D texture lookup using tex2Dgather function`
			`- tex2Dgather() returns the four neighbouring samples in a single texture`
			`lookup`
			`- it is only supported on the Fermi architecture`
			`- you can select which component to fetch using the "comp" parameter`
			`- it can be used to efficiently implement custom texture filters`

			`The samples are returned in a 4-vector in the following order:`
			`x: (0, 1)`
			`y: (1, 1)`
			`z: (1, 0)`
			`w: (0, 0)`
			`*/`

			`template <class T, class R> // texture data type, return type`
			`__device__ float tex2DBilinearGather(const cudaTextureObject_t tex, float x,`
			`float y, int comp = 0) {`
			`x -= 0.5f;`
			`y -= 0.5f;`
			`float px = floorf(x); // integer position`
			`float py = floorf(y);`
			`float fx = x - px; // fractional position`
			`float fy = y - py;`

			`R samples = tex2Dgather<R>(tex, px + 0.5f, py + 0.5f, comp);`

			`return lerp(lerp((float)samples.w, (float)samples.z, fx),`
			`lerp((float)samples.x, (float)samples.y, fx), fy);`
			`}`

			`#endif`

			`// Catmull-Rom interpolation`

			`__host__ __device__ float catrom_w0(float a) {`
			`// return -0.5fa + aa - 0.5faa*a;`
			`return a * (-0.5f + a * (1.0f - 0.5f * a));`
			`}`

			`__host__ __device__ float catrom_w1(float a) {`
			`// return 1.0f - 2.5faa + 1.5faa*a;`
			`return 1.0f + a * a * (-2.5f + 1.5f * a);`
			`}`

			`__host__ __device__ float catrom_w2(float a) {`
			`// return 0.5fa + 2.0faa - 1.5faaa;`
			`return a * (0.5f + a * (2.0f - 1.5f * a));`
			`}`

			`__host__ __device__ float catrom_w3(float a) {`
			`// return -0.5faa + 0.5faa*a;`
			`return a * a * (-0.5f + 0.5f * a);`
			`}`

			`template <class T>`
			`__device__ T catRomFilter(float x, T c0, T c1, T c2, T c3) {`
			`T r;`
			`r = c0 * catrom_w0(x);`
			`r += c1 * catrom_w1(x);`
			`r += c2 * catrom_w2(x);`
			`r += c3 * catrom_w3(x);`
			`return r;`
			`}`

			`// Note - can't use bilinear trick here because of negative lobes`
			`template <class T, class R> // texture data type, return type`
			`__device__ R tex2DCatRom(const cudaTextureObject_t tex, float x, float y) {`
			`x -= 0.5f;`
			`y -= 0.5f;`
			`float px = floorf(x);`
			`float py = floorf(y);`
			`float fx = x - px;`
			`float fy = y - py;`

			`return catRomFilter<R>(`
			`fy, catRomFilter<R>(`
			`fx, tex2D<R>(tex, px - 1, py - 1), tex2D<R>(tex, px, py - 1),`
			`tex2D<R>(tex, px + 1, py - 1), tex2D<R>(tex, px + 2, py - 1)),`
			`catRomFilter<R>(fx, tex2D<R>(tex, px - 1, py), tex2D<R>(tex, px, py),`
			`tex2D<R>(tex, px + 1, py), tex2D<R>(tex, px + 2, py)),`
			`catRomFilter<R>(fx, tex2D<R>(tex, px - 1, py + 1),`
			`tex2D<R>(tex, px, py + 1), tex2D<R>(tex, px + 1, py + 1),`
			`tex2D<R>(tex, px + 2, py + 1)),`
			`catRomFilter<R>(fx, tex2D<R>(tex, px - 1, py + 2),`
			`tex2D<R>(tex, px, py + 2), tex2D<R>(tex, px + 1, py + 2),`
			`tex2D<R>(tex, px + 2, py + 2)));`
			`}`

			`// test functions`

			`// render image using normal bilinear texture lookup`
			`__global__ void d_render(uchar4 *d_output, uint width, uint height, float tx,`
			`float ty, float scale, float cx, float cy,`
			`cudaTextureObject_t texObj) {`
			`uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;`
			`uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;`
			`uint i = __umul24(y, width) + x;`

			`float u = (x - cx) * scale + cx + tx;`
			`float v = (y - cy) * scale + cy + ty;`

			`if ((x < width) && (y < height)) {`
			`// write output color`
			`float c = tex2D<float>(texObj, u, v);`
			`// float c = tex2DBilinear<uchar, float>(tex, u, v);`
			`// float c = tex2DBilinearGather<uchar, uchar4>(tex2, u, v, 0) / 255.0f;`
			`d_output[i] = make_uchar4(c * 0xff, c * 0xff, c * 0xff, 0);`
			`}`
			`}`

			`// render image using bicubic texture lookup`
			`__global__ void d_renderBicubic(uchar4 *d_output, uint width, uint height,`
			`float tx, float ty, float scale, float cx,`
			`float cy, cudaTextureObject_t texObj) {`
			`uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;`
			`uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;`
			`uint i = __umul24(y, width) + x;`

			`float u = (x - cx) * scale + cx + tx;`
			`float v = (y - cy) * scale + cy + ty;`

			`if ((x < width) && (y < height)) {`
			`// write output color`
			`float c = tex2DBicubic<uchar, float>(texObj, u, v);`
			`d_output[i] = make_uchar4(c * 0xff, c * 0xff, c * 0xff, 0);`
			`}`
			`}`

			`// render image using fast bicubic texture lookup`
			`__global__ void d_renderFastBicubic(uchar4 *d_output, uint width, uint height,`
			`float tx, float ty, float scale, float cx,`
			`float cy, cudaTextureObject_t texObj) {`
			`uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;`
			`uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;`
			`uint i = __umul24(y, width) + x;`

			`float u = (x - cx) * scale + cx + tx;`
			`float v = (y - cy) * scale + cy + ty;`

			`if ((x < width) && (y < height)) {`
			`// write output color`
			`float c = tex2DFastBicubic<uchar, float>(texObj, u, v);`
			`d_output[i] = make_uchar4(c * 0xff, c * 0xff, c * 0xff, 0);`
			`}`
			`}`

			`// render image using Catmull-Rom texture lookup`
			`__global__ void d_renderCatRom(uchar4 *d_output, uint width, uint height,`
			`float tx, float ty, float scale, float cx,`
			`float cy, cudaTextureObject_t texObj) {`
			`uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;`
			`uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;`
			`uint i = __umul24(y, width) + x;`

			`float u = (x - cx) * scale + cx + tx;`
			`float v = (y - cy) * scale + cy + ty;`

			`if ((x < width) && (y < height)) {`
			`// write output color`
			`float c = tex2DCatRom<uchar, float>(texObj, u, v);`
			`d_output[i] = make_uchar4(c * 0xff, c * 0xff, c * 0xff, 0);`
			`}`
			`}`

			`#endif // _BICUBICTEXTURE_KERNEL_CUH_`