cuda-samples/Samples/dxtc/dxtc.cu

/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// Utilities and system includes
#include <cooperative_groups.h>

namespace cg = cooperative_groups;

#include <helper_functions.h>
#include <helper_cuda.h>

#include <helper_math.h>
#include <float.h>  // for FLT_MAX

#include "CudaMath.h"
#include "dds.h"
#include "permutations.h"

// Definitions
#define INPUT_IMAGE "lena_std.ppm"
#define REFERENCE_IMAGE "lena_ref.dds"

#define ERROR_THRESHOLD 0.02f

#define NUM_THREADS 64  // Number of threads per block.

#define __debugsync()

template <class T>
__device__ inline void swap(T &a, T &b) {
  T tmp = a;
  a = b;
  b = tmp;
}

//__constant__ float3 kColorMetric = { 0.2126f, 0.7152f, 0.0722f };
__constant__ float3 kColorMetric = {1.0f, 1.0f, 1.0f};

////////////////////////////////////////////////////////////////////////////////
// Sort colors
////////////////////////////////////////////////////////////////////////////////
__device__ void sortColors(const float *values, int *ranks,
                           cg::thread_group tile) {
  const int tid = threadIdx.x;

  int rank = 0;

#pragma unroll

  for (int i = 0; i < 16; i++) {
    rank += (values[i] < values[tid]);
  }

  ranks[tid] = rank;

  cg::sync(tile);

  // Resolve elements with the same index.
  for (int i = 0; i < 15; i++) {
    if (tid > i && ranks[tid] == ranks[i]) {
      ++ranks[tid];
    }
    cg::sync(tile);
  }
}

////////////////////////////////////////////////////////////////////////////////
// Load color block to shared mem
////////////////////////////////////////////////////////////////////////////////
__device__ void loadColorBlock(const uint *image, float3 colors[16],
                               float3 sums[16], int xrefs[16], int blockOffset,
                               cg::thread_block cta) {
  const int bid = blockIdx.x + blockOffset;
  const int idx = threadIdx.x;

  __shared__ float dps[16];

  float3 tmp;

  cg::thread_group tile = cg::tiled_partition(cta, 16);

  if (idx < 16) {
    // Read color and copy to shared mem.
    uint c = image[(bid)*16 + idx];

    colors[idx].x = ((c >> 0) & 0xFF) * (1.0f / 255.0f);
    colors[idx].y = ((c >> 8) & 0xFF) * (1.0f / 255.0f);
    colors[idx].z = ((c >> 16) & 0xFF) * (1.0f / 255.0f);

    cg::sync(tile);
    // Sort colors along the best fit line.
    colorSums(colors, sums, tile);

    cg::sync(tile);

    float3 axis = bestFitLine(colors, sums[0], tile);

    cg::sync(tile);

    dps[idx] = dot(colors[idx], axis);

    cg::sync(tile);

    sortColors(dps, xrefs, tile);

    cg::sync(tile);

    tmp = colors[idx];

    cg::sync(tile);

    colors[xrefs[idx]] = tmp;
  }
}

////////////////////////////////////////////////////////////////////////////////
// Round color to RGB565 and expand
////////////////////////////////////////////////////////////////////////////////
inline __device__ float3 roundAndExpand(float3 v, ushort *w) {
  v.x = rintf(__saturatef(v.x) * 31.0f);
  v.y = rintf(__saturatef(v.y) * 63.0f);
  v.z = rintf(__saturatef(v.z) * 31.0f);

  *w = ((ushort)v.x << 11) | ((ushort)v.y << 5) | (ushort)v.z;
  v.x *= 0.03227752766457f;  // approximate integer bit expansion.
  v.y *= 0.01583151765563f;
  v.z *= 0.03227752766457f;
  return v;
}

__constant__ float alphaTable4[4] = {9.0f, 0.0f, 6.0f, 3.0f};
__constant__ float alphaTable3[4] = {4.0f, 0.0f, 2.0f, 2.0f};
__constant__ const int prods4[4] = {0x090000, 0x000900, 0x040102, 0x010402};
__constant__ const int prods3[4] = {0x040000, 0x000400, 0x040101, 0x010401};

#define USE_TABLES 1

////////////////////////////////////////////////////////////////////////////////
// Evaluate permutations
////////////////////////////////////////////////////////////////////////////////
static __device__ float evalPermutation4(const float3 *colors, uint permutation,
                                         ushort *start, ushort *end,
                                         float3 color_sum) {
// Compute endpoints using least squares.
#if USE_TABLES
  float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);

  int akku = 0;

  // Compute alpha & beta for this permutation.
  for (int i = 0; i < 16; i++) {
    const uint bits = permutation >> (2 * i);

    alphax_sum += alphaTable4[bits & 3] * colors[i];
    akku += prods4[bits & 3];
  }

  float alpha2_sum = float(akku >> 16);
  float beta2_sum = float((akku >> 8) & 0xff);
  float alphabeta_sum = float((akku >> 0) & 0xff);
  float3 betax_sum = (9.0f * color_sum) - alphax_sum;
#else
  float alpha2_sum = 0.0f;
  float beta2_sum = 0.0f;
  float alphabeta_sum = 0.0f;
  float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);

  // Compute alpha & beta for this permutation.
  for (int i = 0; i < 16; i++) {
    const uint bits = permutation >> (2 * i);

    float beta = (bits & 1);

    if (bits & 2) {
      beta = (1 + beta) * (1.0f / 3.0f);
    }

    float alpha = 1.0f - beta;

    alpha2_sum += alpha * alpha;
    beta2_sum += beta * beta;
    alphabeta_sum += alpha * beta;
    alphax_sum += alpha * colors[i];
  }

  float3 betax_sum = color_sum - alphax_sum;
#endif

  // alpha2, beta2, alphabeta and factor could be precomputed for each
  // permutation, but it's faster to recompute them.
  const float factor =
      1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);

  float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
  float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;

  // Round a, b to the closest 5-6-5 color and expand...
  a = roundAndExpand(a, start);
  b = roundAndExpand(b, end);

  // compute the error
  float3 e = a * a * alpha2_sum + b * b * beta2_sum +
             2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);

  return (0.111111111111f) * dot(e, kColorMetric);
}

static __device__ float evalPermutation3(const float3 *colors, uint permutation,
                                         ushort *start, ushort *end,
                                         float3 color_sum) {
// Compute endpoints using least squares.
#if USE_TABLES
  float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);

  int akku = 0;

  // Compute alpha & beta for this permutation.
  for (int i = 0; i < 16; i++) {
    const uint bits = permutation >> (2 * i);

    alphax_sum += alphaTable3[bits & 3] * colors[i];
    akku += prods3[bits & 3];
  }

  float alpha2_sum = float(akku >> 16);
  float beta2_sum = float((akku >> 8) & 0xff);
  float alphabeta_sum = float((akku >> 0) & 0xff);
  float3 betax_sum = (4.0f * color_sum) - alphax_sum;
#else
  float alpha2_sum = 0.0f;
  float beta2_sum = 0.0f;
  float alphabeta_sum = 0.0f;
  float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);

  // Compute alpha & beta for this permutation.
  for (int i = 0; i < 16; i++) {
    const uint bits = permutation >> (2 * i);

    float beta = (bits & 1);

    if (bits & 2) {
      beta = 0.5f;
    }

    float alpha = 1.0f - beta;

    alpha2_sum += alpha * alpha;
    beta2_sum += beta * beta;
    alphabeta_sum += alpha * beta;
    alphax_sum += alpha * colors[i];
  }

  float3 betax_sum = color_sum - alphax_sum;
#endif

  const float factor =
      1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);

  float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
  float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;

  // Round a, b to the closest 5-6-5 color and expand...
  a = roundAndExpand(a, start);
  b = roundAndExpand(b, end);

  // compute the error
  float3 e = a * a * alpha2_sum + b * b * beta2_sum +
             2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);

  return (0.25f) * dot(e, kColorMetric);
}

__device__ void evalAllPermutations(const float3 *colors,
                                    const uint *permutations, ushort &bestStart,
                                    ushort &bestEnd, uint &bestPermutation,
                                    float *errors, float3 color_sum,
                                    cg::thread_block cta) {
  const int idx = threadIdx.x;

  float bestError = FLT_MAX;

  __shared__ uint s_permutations[160];

  for (int i = 0; i < 16; i++) {
    int pidx = idx + NUM_THREADS * i;

    if (pidx >= 992) {
      break;
    }

    ushort start, end;
    uint permutation = permutations[pidx];

    if (pidx < 160) {
      s_permutations[pidx] = permutation;
    }

    float error =
        evalPermutation4(colors, permutation, &start, &end, color_sum);

    if (error < bestError) {
      bestError = error;
      bestPermutation = permutation;
      bestStart = start;
      bestEnd = end;
    }
  }

  if (bestStart < bestEnd) {
    swap(bestEnd, bestStart);
    bestPermutation ^= 0x55555555;  // Flip indices.
  }

  cg::sync(cta);  // Sync here to ensure s_permutations is valid going forward

  for (int i = 0; i < 3; i++) {
    int pidx = idx + NUM_THREADS * i;

    if (pidx >= 160) {
      break;
    }

    ushort start, end;
    uint permutation = s_permutations[pidx];
    float error =
        evalPermutation3(colors, permutation, &start, &end, color_sum);

    if (error < bestError) {
      bestError = error;
      bestPermutation = permutation;
      bestStart = start;
      bestEnd = end;

      if (bestStart > bestEnd) {
        swap(bestEnd, bestStart);
        bestPermutation ^=
            (~bestPermutation >> 1) & 0x55555555;  // Flip indices.
      }
    }
  }

  errors[idx] = bestError;
}

////////////////////////////////////////////////////////////////////////////////
// Find index with minimum error
////////////////////////////////////////////////////////////////////////////////
__device__ int findMinError(float *errors, cg::thread_block cta) {
  const int idx = threadIdx.x;
  __shared__ int indices[NUM_THREADS];
  indices[idx] = idx;

  cg::sync(cta);

  for (int d = NUM_THREADS / 2; d > 0; d >>= 1) {
    float err0 = errors[idx];
    float err1 = (idx + d) < NUM_THREADS ? errors[idx + d] : FLT_MAX;
    int index1 = (idx + d) < NUM_THREADS ? indices[idx + d] : 0;

    cg::sync(cta);

    if (err1 < err0) {
      errors[idx] = err1;
      indices[idx] = index1;
    }

    cg::sync(cta);
  }

  return indices[0];
}

////////////////////////////////////////////////////////////////////////////////
// Save DXT block
////////////////////////////////////////////////////////////////////////////////
__device__ void saveBlockDXT1(ushort start, ushort end, uint permutation,
                              int xrefs[16], uint2 *result, int blockOffset) {
  const int bid = blockIdx.x + blockOffset;

  if (start == end) {
    permutation = 0;
  }

  // Reorder permutation.
  uint indices = 0;

  for (int i = 0; i < 16; i++) {
    int ref = xrefs[i];
    indices |= ((permutation >> (2 * ref)) & 3) << (2 * i);
  }

  // Write endpoints.
  result[bid].x = (end << 16) | start;

  // Write palette indices.
  result[bid].y = indices;
}

////////////////////////////////////////////////////////////////////////////////
// Compress color block
////////////////////////////////////////////////////////////////////////////////
__global__ void compress(const uint *permutations, const uint *image,
                         uint2 *result, int blockOffset) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();

  const int idx = threadIdx.x;

  __shared__ float3 colors[16];
  __shared__ float3 sums[16];
  __shared__ int xrefs[16];

  loadColorBlock(image, colors, sums, xrefs, blockOffset, cta);

  cg::sync(cta);

  ushort bestStart, bestEnd;
  uint bestPermutation;

  __shared__ float errors[NUM_THREADS];

  evalAllPermutations(colors, permutations, bestStart, bestEnd, bestPermutation,
                      errors, sums[0], cta);

  // Use a parallel reduction to find minimum error.
  const int minIdx = findMinError(errors, cta);

  cg::sync(cta);

  // Only write the result of the winner thread.
  if (idx == minIdx) {
    saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs, result,
                  blockOffset);
  }
}

// Helper structs and functions to validate the output of the compressor.
// We cannot simply do a bitwise compare, because different compilers produce
// different
// results for different targets due to floating point arithmetic.

union Color32 {
  struct {
    unsigned char b, g, r, a;
  };
  unsigned int u;
};

union Color16 {
  struct {
    unsigned short b : 5;
    unsigned short g : 6;
    unsigned short r : 5;
  };
  unsigned short u;
};

struct BlockDXT1 {
  Color16 col0;
  Color16 col1;
  union {
    unsigned char row[4];
    unsigned int indices;
  };

  void decompress(Color32 colors[16]) const;
};

void BlockDXT1::decompress(Color32 *colors) const {
  Color32 palette[4];

  // Does bit expansion before interpolation.
  palette[0].b = (col0.b << 3) | (col0.b >> 2);
  palette[0].g = (col0.g << 2) | (col0.g >> 4);
  palette[0].r = (col0.r << 3) | (col0.r >> 2);
  palette[0].a = 0xFF;

  palette[1].r = (col1.r << 3) | (col1.r >> 2);
  palette[1].g = (col1.g << 2) | (col1.g >> 4);
  palette[1].b = (col1.b << 3) | (col1.b >> 2);
  palette[1].a = 0xFF;

  if (col0.u > col1.u) {
    // Four-color block: derive the other two colors.
    palette[2].r = (2 * palette[0].r + palette[1].r) / 3;
    palette[2].g = (2 * palette[0].g + palette[1].g) / 3;
    palette[2].b = (2 * palette[0].b + palette[1].b) / 3;
    palette[2].a = 0xFF;

    palette[3].r = (2 * palette[1].r + palette[0].r) / 3;
    palette[3].g = (2 * palette[1].g + palette[0].g) / 3;
    palette[3].b = (2 * palette[1].b + palette[0].b) / 3;
    palette[3].a = 0xFF;
  } else {
    // Three-color block: derive the other color.
    palette[2].r = (palette[0].r + palette[1].r) / 2;
    palette[2].g = (palette[0].g + palette[1].g) / 2;
    palette[2].b = (palette[0].b + palette[1].b) / 2;
    palette[2].a = 0xFF;

    palette[3].r = 0x00;
    palette[3].g = 0x00;
    palette[3].b = 0x00;
    palette[3].a = 0x00;
  }

  for (int i = 0; i < 16; i++) {
    colors[i] = palette[(indices >> (2 * i)) & 0x3];
  }
}

static int compareColors(const Color32 *b0, const Color32 *b1) {
  int sum = 0;

  for (int i = 0; i < 16; i++) {
    int r = (b0[i].r - b1[i].r);
    int g = (b0[i].g - b1[i].g);
    int b = (b0[i].b - b1[i].b);
    sum += r * r + g * g + b * b;
  }

  return sum;
}

static int compareBlock(const BlockDXT1 *b0, const BlockDXT1 *b1) {
  Color32 colors0[16];
  Color32 colors1[16];

  if (memcmp(b0, b1, sizeof(BlockDXT1)) == 0) {
    return 0;
  } else {
    b0->decompress(colors0);
    b1->decompress(colors1);

    return compareColors(colors0, colors1);
  }
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  printf("%s Starting...\n\n", argv[0]);

  // use command-line specified CUDA device, otherwise use device with highest
  // Gflops/s
  findCudaDevice(argc, (const char **)argv);

  // Load input image.
  unsigned char *data = NULL;
  uint W, H;

  char *image_path = sdkFindFilePath(INPUT_IMAGE, argv[0]);

  if (image_path == 0) {
    printf("Error, unable to find source image  <%s>\n", image_path);
    exit(EXIT_FAILURE);
  }

  if (!sdkLoadPPM4ub(image_path, &data, &W, &H)) {
    printf("Error, unable to open source image file <%s>\n", image_path);

    exit(EXIT_FAILURE);
  }

  uint w = W, h = H;

  printf("Image Loaded '%s', %d x %d pixels\n\n", image_path, w, h);

  // Allocate input image.
  const uint memSize = w * h * 4;
  assert(0 != memSize);
  uint *block_image = (uint *)malloc(memSize);

  // Convert linear image to block linear.
  for (uint by = 0; by < h / 4; by++) {
    for (uint bx = 0; bx < w / 4; bx++) {
      for (int i = 0; i < 16; i++) {
        const int x = i & 3;
        const int y = i / 4;
        block_image[(by * w / 4 + bx) * 16 + i] =
            ((uint *)data)[(by * 4 + y) * 4 * (W / 4) + bx * 4 + x];
      }
    }
  }

  // copy into global mem
  uint *d_data = NULL;
  checkCudaErrors(cudaMalloc((void **)&d_data, memSize));

  // Result
  uint *d_result = NULL;
  const uint compressedSize = (w / 4) * (h / 4) * 8;
  checkCudaErrors(cudaMalloc((void **)&d_result, compressedSize));
  uint *h_result = (uint *)malloc(compressedSize);

  // Compute permutations.
  uint permutations[1024];
  computePermutations(permutations);

  // Copy permutations host to devie.
  uint *d_permutations = NULL;
  checkCudaErrors(cudaMalloc((void **)&d_permutations, 1024 * sizeof(uint)));
  checkCudaErrors(cudaMemcpy(d_permutations, permutations, 1024 * sizeof(uint),
                             cudaMemcpyHostToDevice));

  // create a timer
  StopWatchInterface *timer = NULL;
  sdkCreateTimer(&timer);

  // Copy image from host to device
  checkCudaErrors(
      cudaMemcpy(d_data, block_image, memSize, cudaMemcpyHostToDevice));

  // Determine launch configuration and run timed computation numIterations
  // times
  uint blocks = ((w + 3) / 4) *
                ((h + 3) / 4);  // rounds up by 1 block in each dim if %4 != 0

  int devID;
  cudaDeviceProp deviceProp;

  // get number of SMs on this GPU
  checkCudaErrors(cudaGetDevice(&devID));
  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

  // Restrict the numbers of blocks to launch on low end GPUs to avoid kernel
  // timeout
  int blocksPerLaunch = min(blocks, 768 * deviceProp.multiProcessorCount);

  printf("Running DXT Compression on %u x %u image...\n", w, h);
  printf("\n%u Blocks, %u Threads per Block, %u Threads in Grid...\n\n", blocks,
         NUM_THREADS, blocks * NUM_THREADS);
  int numIterations = 1;

  for (int i = -1; i < numIterations; ++i) {
    if (i == 0) {
      checkCudaErrors(cudaDeviceSynchronize());
      sdkStartTimer(&timer);
    }

    for (int j = 0; j < (int)blocks; j += blocksPerLaunch) {
      compress<<<min(blocksPerLaunch, blocks - j), NUM_THREADS>>>(
          d_permutations, d_data, (uint2 *)d_result, j);
    }
  }

  getLastCudaError("compress");

  // sync to host, stop timer, record perf
  checkCudaErrors(cudaDeviceSynchronize());
  sdkStopTimer(&timer);
  double dAvgTime = 1.0e-3 * sdkGetTimerValue(&timer) / (double)numIterations;
  printf(
      "dxtc, Throughput = %.4f MPixels/s, Time = %.5f s, Size = %u Pixels, "
      "NumDevsUsed = %i, Workgroup = %d\n",
      (1.0e-6 * (double)(W * H) / dAvgTime), dAvgTime, (W * H), 1, NUM_THREADS);

  // copy result data from device to host
  checkCudaErrors(
      cudaMemcpy(h_result, d_result, compressedSize, cudaMemcpyDeviceToHost));

  // Write out result data to DDS file
  char output_filename[1024];
  strcpy(output_filename, image_path);
  strcpy(output_filename + strlen(image_path) - 3, "dds");
  FILE *fp = fopen(output_filename, "wb");

  if (fp == 0) {
    printf("Error, unable to open output image <%s>\n", output_filename);
    exit(EXIT_FAILURE);
  }

  DDSHeader header;
  header.fourcc = FOURCC_DDS;
  header.size = 124;
  header.flags = (DDSD_WIDTH | DDSD_HEIGHT | DDSD_CAPS | DDSD_PIXELFORMAT |
                  DDSD_LINEARSIZE);
  header.height = h;
  header.width = w;
  header.pitch = compressedSize;
  header.depth = 0;
  header.mipmapcount = 0;
  memset(header.reserved, 0, sizeof(header.reserved));
  header.pf.size = 32;
  header.pf.flags = DDPF_FOURCC;
  header.pf.fourcc = FOURCC_DXT1;
  header.pf.bitcount = 0;
  header.pf.rmask = 0;
  header.pf.gmask = 0;
  header.pf.bmask = 0;
  header.pf.amask = 0;
  header.caps.caps1 = DDSCAPS_TEXTURE;
  header.caps.caps2 = 0;
  header.caps.caps3 = 0;
  header.caps.caps4 = 0;
  header.notused = 0;
  fwrite(&header, sizeof(DDSHeader), 1, fp);
  fwrite(h_result, compressedSize, 1, fp);
  fclose(fp);

  // Make sure the generated image is correct.
  const char *reference_image_path = sdkFindFilePath(REFERENCE_IMAGE, argv[0]);

  if (reference_image_path == 0) {
    printf("Error, unable to find reference image\n");

    exit(EXIT_FAILURE);
  }

  fp = fopen(reference_image_path, "rb");

  if (fp == 0) {
    printf("Error, unable to open reference image\n");

    exit(EXIT_FAILURE);
  }

  fseek(fp, sizeof(DDSHeader), SEEK_SET);
  uint referenceSize = (W / 4) * (H / 4) * 8;
  uint *reference = (uint *)malloc(referenceSize);
  fread(reference, referenceSize, 1, fp);
  fclose(fp);

  printf("\nChecking accuracy...\n");
  float rms = 0;

  for (uint y = 0; y < h; y += 4) {
    for (uint x = 0; x < w; x += 4) {
      uint referenceBlockIdx = ((y / 4) * (W / 4) + (x / 4));
      uint resultBlockIdx = ((y / 4) * (w / 4) + (x / 4));

      int cmp = compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx,
                             ((BlockDXT1 *)reference) + referenceBlockIdx);

      if (cmp != 0.0f) {
        printf("Deviation at (%4d,%4d):\t%f rms\n", x / 4, y / 4,
               float(cmp) / 16 / 3);
      }

      rms += cmp;
    }
  }

  rms /= w * h * 3;

  // Free allocated resources and exit
  checkCudaErrors(cudaFree(d_permutations));
  checkCudaErrors(cudaFree(d_data));
  checkCudaErrors(cudaFree(d_result));
  free(image_path);
  free(data);
  free(block_image);
  free(h_result);
  free(reference);
  sdkDeleteTimer(&timer);

  printf("RMS(reference, result) = %f\n\n", rms);
  printf(rms <= ERROR_THRESHOLD ? "Test passed\n" : "Test failed!\n");
  /* Return zero if test passed, one otherwise */
  return rms > ERROR_THRESHOLD;
}