mirror of
				https://github.com/NVIDIA/cuda-samples.git
				synced 2025-10-31 19:47:48 +08:00 
			
		
		
		
	
		
			
				
	
	
		
			663 lines
		
	
	
		
			23 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			663 lines
		
	
	
		
			23 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 | |
|  *
 | |
|  * Redistribution and use in source and binary forms, with or without
 | |
|  * modification, are permitted provided that the following conditions
 | |
|  * are met:
 | |
|  *  * Redistributions of source code must retain the above copyright
 | |
|  *    notice, this list of conditions and the following disclaimer.
 | |
|  *  * Redistributions in binary form must reproduce the above copyright
 | |
|  *    notice, this list of conditions and the following disclaimer in the
 | |
|  *    documentation and/or other materials provided with the distribution.
 | |
|  *  * Neither the name of NVIDIA CORPORATION nor the names of its
 | |
|  *    contributors may be used to endorse or promote products derived
 | |
|  *    from this software without specific prior written permission.
 | |
|  *
 | |
|  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 | |
|  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | |
|  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 | |
|  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 | |
|  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 | |
|  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 | |
|  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 | |
|  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 | |
|  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 | |
|  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 | |
|  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | |
|  */
 | |
| 
 | |
| /**
 | |
| **************************************************************************
 | |
| * \file dct8x8.cu
 | |
| * \brief Contains entry point, wrappers to host and device code and benchmark.
 | |
| *
 | |
| * This sample implements forward and inverse Discrete Cosine Transform to blocks
 | |
| * of image pixels (of 8x8 size), as in JPEG standard. The typical work flow is
 | |
| * as follows:
 | |
| * 1. Run CPU version (Host code) and measure execution time;
 | |
| * 2. Run CUDA version (Device code) and measure execution time;
 | |
| * 3. Output execution timings and calculate CUDA speedup.
 | |
| */
 | |
| 
 | |
| #include "Common.h"
 | |
| #include "DCT8x8_Gold.h"
 | |
| #include "BmpUtil.h"
 | |
| 
 | |
| /**
 | |
| *  The number of timed DCT calls in the Gold 1, Gold 2 and CUDA 1 benchmarks
 | |
| */
 | |
| #define BENCHMARK_SIZE 10
 | |
| 
 | |
| /**
 | |
| *  The PSNR values over this threshold indicate images equality
 | |
| */
 | |
| #define PSNR_THRESHOLD_EQUAL 40
 | |
| 
 | |
| // includes kernels
 | |
| #include "dct8x8_kernel1.cuh"
 | |
| #include "dct8x8_kernel2.cuh"
 | |
| #include "dct8x8_kernel_short.cuh"
 | |
| #include "dct8x8_kernel_quantization.cuh"
 | |
| 
 | |
/**
**************************************************************************
*  Host (CPU) reference pipeline, variant 1: forward DCT, quantization and
*  inverse DCT. Only the forward DCT call is timed.
*
* \param ImgSrc         [IN]  - Source byte image plane
* \param ImgDst         [OUT] - Byte image plane that receives the quantized,
*                               reconstructed result
* \param Stride         [IN]  - Stride for both source and result planes
* \param Size           [IN]  - Size of both planes
*
* \return Average forward-DCT time over BENCHMARK_SIZE runs, in milliseconds
*/
float WrapperGold1(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size) {
  // two float work planes; both allocations report their stride through the
  // same variable
  int floatStride;
  float *planeA = MallocPlaneFloat(Size.width, Size.height, &floatStride);
  float *planeB = MallocPlaneFloat(Size.width, Size.height, &floatStride);

  // bytes -> floats, then apply the -128 offset
  CopyByte2Float(ImgSrc, Stride, planeA, floatStride, Size);
  AddFloatPlane(-128.0f, planeA, floatStride, Size);

  // host-side stopwatch used for the benchmark
  StopWatchInterface *stopwatch = NULL;
  sdkCreateTimer(&stopwatch);
  sdkResetTimer(&stopwatch);

  // time BENCHMARK_SIZE forward transforms planeA -> planeB
  int runsLeft = BENCHMARK_SIZE;
  while (runsLeft > 0) {
    sdkStartTimer(&stopwatch);
    computeDCT8x8Gold1(planeA, planeB, floatStride, Size);
    sdkStopTimer(&stopwatch);
    --runsLeft;
  }

  // read the average before the stopwatch is destroyed
  const float averageTime = sdkGetAverageTimerValue(&stopwatch);
  sdkDeleteTimer(&stopwatch);

  // quantize the coefficients in place, then transform back planeB -> planeA
  quantizeGoldFloat(planeB, floatStride, Size);
  computeIDCT8x8Gold1(planeB, planeA, floatStride, Size);

  // undo the offset and store the result as bytes
  AddFloatPlane(128.0f, planeA, floatStride, Size);
  CopyFloat2Byte(planeA, floatStride, ImgDst, Stride, Size);

  // release the work planes
  FreePlane(planeA);
  FreePlane(planeB);

  return averageTime;
}
 | |
| 
 | |
/**
**************************************************************************
*  Host (CPU) reference pipeline, variant 2: forward DCT, quantization and
*  inverse DCT. Only the forward DCT call is timed.
*
* \param ImgSrc         [IN]  - Source byte image plane
* \param ImgDst         [OUT] - Byte image plane that receives the quantized,
*                               reconstructed result
* \param Stride         [IN]  - Stride for both source and result planes
* \param Size           [IN]  - Size of both planes
*
* \return Average forward-DCT time over BENCHMARK_SIZE runs, in milliseconds
*/
float WrapperGold2(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size) {
  // two float work planes; both allocations report their stride through the
  // same variable
  int floatStride;
  float *planeA = MallocPlaneFloat(Size.width, Size.height, &floatStride);
  float *planeB = MallocPlaneFloat(Size.width, Size.height, &floatStride);

  // bytes -> floats, then apply the -128 offset
  CopyByte2Float(ImgSrc, Stride, planeA, floatStride, Size);
  AddFloatPlane(-128.0f, planeA, floatStride, Size);

  // host-side stopwatch used for the benchmark
  StopWatchInterface *stopwatch = NULL;
  sdkCreateTimer(&stopwatch);
  sdkResetTimer(&stopwatch);

  // time BENCHMARK_SIZE forward transforms planeA -> planeB
  int runsLeft = BENCHMARK_SIZE;
  while (runsLeft > 0) {
    sdkStartTimer(&stopwatch);
    computeDCT8x8Gold2(planeA, planeB, floatStride, Size);
    sdkStopTimer(&stopwatch);
    --runsLeft;
  }

  // read the average before the stopwatch is destroyed
  const float averageTime = sdkGetAverageTimerValue(&stopwatch);
  sdkDeleteTimer(&stopwatch);

  // quantize the coefficients in place, then transform back planeB -> planeA
  quantizeGoldFloat(planeB, floatStride, Size);
  computeIDCT8x8Gold2(planeB, planeA, floatStride, Size);

  // undo the offset and store the result as bytes
  AddFloatPlane(128.0f, planeA, floatStride, Size);
  CopyFloat2Byte(planeA, floatStride, ImgDst, Stride, Size);

  // release the work planes
  FreePlane(planeA);
  FreePlane(planeB);

  return averageTime;
}
 | |
| 
 | |
/**
**************************************************************************
*  GPU pipeline, variant 1: the source plane is read through a texture object
*  bound to a CUDA array, and results are written to a pitched linear buffer.
*  Runs forward DCT (timed), quantization and inverse DCT.
*
* \param ImgSrc         [IN]  - Source byte image plane
* \param ImgDst         [OUT] - Byte image plane that receives the quantized,
*                               reconstructed result
* \param Stride         [IN]  - Stride for both source and result planes
* \param Size           [IN]  - Size of both planes
*
* \return Average forward-DCT kernel time (launch plus device synchronize)
*         over BENCHMARK_SIZE runs, in milliseconds
*/
float WrapperCUDA1(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size) {
  // number of payload bytes in one image row of floats
  const size_t rowBytes = Size.width * sizeof(float);

  // device storage: CUDA array (texture source) and pitched output buffer
  cudaChannelFormatDesc floatChannel = cudaCreateChannelDesc<float>();
  cudaArray *texArray;
  float *devOut;
  size_t devOutStride;
  checkCudaErrors(
      cudaMallocArray(&texArray, &floatChannel, Size.width, Size.height));
  checkCudaErrors(
      cudaMallocPitch((void **)(&devOut), &devOutStride, rowBytes, Size.height));
  devOutStride /= sizeof(float);  // pitch in bytes -> stride in floats

  // host staging plane: bytes -> floats with the -128 offset applied
  int hostStride;
  float *hostPlane = MallocPlaneFloat(Size.width, Size.height, &hostStride);
  CopyByte2Float(ImgSrc, Stride, hostPlane, hostStride, Size);
  AddFloatPlane(-128.0f, hostPlane, hostStride, Size);

  // upload the staging plane into the CUDA array
  checkCudaErrors(cudaMemcpy2DToArray(texArray, 0, 0, hostPlane,
                                      hostStride * sizeof(float), rowBytes,
                                      Size.height, cudaMemcpyHostToDevice));

  // one BLOCK_SIZE x BLOCK_SIZE thread block per image block
  dim3 blockDims(BLOCK_SIZE, BLOCK_SIZE);
  dim3 gridDims(Size.width / BLOCK_SIZE, Size.height / BLOCK_SIZE);

  // host-side stopwatch used for the benchmark
  StopWatchInterface *stopwatch = NULL;
  sdkCreateTimer(&stopwatch);
  sdkResetTimer(&stopwatch);

  // describe the CUDA array as the texture's backing resource
  cudaResourceDesc resourceDesc;
  memset(&resourceDesc, 0, sizeof(cudaResourceDesc));
  resourceDesc.resType = cudaResourceTypeArray;
  resourceDesc.res.array.array = texArray;

  // unnormalized coordinates, linear filtering, wrap addressing, raw floats
  cudaTextureDesc textureDesc;
  memset(&textureDesc, 0, sizeof(cudaTextureDesc));
  textureDesc.normalizedCoords = false;
  textureDesc.filterMode = cudaFilterModeLinear;
  textureDesc.addressMode[0] = cudaAddressModeWrap;
  textureDesc.addressMode[1] = cudaAddressModeWrap;
  textureDesc.readMode = cudaReadModeElementType;

  cudaTextureObject_t srcTexture;
  checkCudaErrors(
      cudaCreateTextureObject(&srcTexture, &resourceDesc, &textureDesc, NULL));

  // timed forward DCT; the synchronize keeps the kernel inside the interval
  int runsLeft = BENCHMARK_SIZE;
  while (runsLeft > 0) {
    sdkStartTimer(&stopwatch);
    CUDAkernel1DCT<<<gridDims, blockDims>>>(devOut, (int)devOutStride, 0, 0,
                                            srcTexture);
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&stopwatch);
    --runsLeft;
  }

  getLastCudaError("Kernel execution failed");

  // read the average before the stopwatch is destroyed
  const float averageTime = sdkGetAverageTimerValue(&stopwatch);
  sdkDeleteTimer(&stopwatch);

  // quantize the coefficients in place in the linear buffer
  CUDAkernelQuantizationFloat<<<gridDims, blockDims>>>(devOut,
                                                       (int)devOutStride);
  getLastCudaError("Kernel execution failed");

  // device-to-device copy: put the quantized coefficients into the CUDA array
  // so that the inverse transform can read them through the texture
  checkCudaErrors(cudaMemcpy2DToArray(texArray, 0, 0, devOut,
                                      devOutStride * sizeof(float), rowBytes,
                                      Size.height, cudaMemcpyDeviceToDevice));

  // inverse DCT back into the linear buffer
  CUDAkernel1IDCT<<<gridDims, blockDims>>>(devOut, (int)devOutStride, 0, 0,
                                           srcTexture);
  getLastCudaError("Kernel execution failed");

  // download the reconstructed plane into the host staging buffer
  checkCudaErrors(cudaMemcpy2D(hostPlane, hostStride * sizeof(float), devOut,
                               devOutStride * sizeof(float), rowBytes,
                               Size.height, cudaMemcpyDeviceToHost));

  // undo the offset and store the result as bytes
  AddFloatPlane(128.0f, hostPlane, hostStride, Size);
  CopyFloat2Byte(hostPlane, hostStride, ImgDst, Stride, Size);

  // release texture, device buffers and the host staging plane
  checkCudaErrors(cudaDestroyTextureObject(srcTexture));
  checkCudaErrors(cudaFreeArray(texArray));
  checkCudaErrors(cudaFree(devOut));
  FreePlane(hostPlane);

  return averageTime;
}
 | |
| 
 | |
/**
**************************************************************************
*  GPU pipeline, variant 2: works on two pitched linear device buffers.
*  Runs forward DCT (timed over 100 launches after one warm-up launch),
*  quantization and inverse DCT, and prints the measured throughput.
*
* \param ImgSrc         [IN]  - Source byte image plane
* \param ImgDst         [OUT] - Byte image plane that receives the quantized,
*                               reconstructed result
* \param Stride         [IN]  - Stride for both source and result planes
* \param Size           [IN]  - Size of both planes
*
* \return Average forward-DCT kernel time per launch, in milliseconds
*/

float WrapperCUDA2(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size) {
  // number of payload bytes in one image row of floats
  const size_t rowBytes = Size.width * sizeof(float);

  // host staging plane: bytes -> floats with the -128 offset applied
  int hostStride;
  float *hostPlane = MallocPlaneFloat(Size.width, Size.height, &hostStride);
  CopyByte2Float(ImgSrc, Stride, hostPlane, hostStride, Size);
  AddFloatPlane(-128.0f, hostPlane, hostStride, Size);

  // two pitched device planes; the pitch reported by the second allocation
  // is the one used for both from here on
  float *devIn, *devOut;
  size_t devStride;
  checkCudaErrors(
      cudaMallocPitch((void **)&devIn, &devStride, rowBytes, Size.height));
  checkCudaErrors(
      cudaMallocPitch((void **)&devOut, &devStride, rowBytes, Size.height));
  devStride /= sizeof(float);  // pitch in bytes -> stride in floats

  // upload the staging plane
  checkCudaErrors(cudaMemcpy2D(devIn, devStride * sizeof(float), hostPlane,
                               hostStride * sizeof(float), rowBytes,
                               Size.height, cudaMemcpyHostToDevice));

  // host-side stopwatch used for the benchmark
  StopWatchInterface *stopwatch = NULL;
  sdkCreateTimer(&stopwatch);

  // launch geometry: grid in KER2_BLOCK_WIDTH x KER2_BLOCK_HEIGHT tiles
  dim3 tileGrid(Size.width / KER2_BLOCK_WIDTH, Size.height / KER2_BLOCK_HEIGHT,
                1);
  dim3 tileThreads(8, KER2_BLOCK_WIDTH / 8, KER2_BLOCK_HEIGHT / 8);

  const int numIterations = 100;

  // untimed warm-up launch
  CUDAkernel2DCT<<<tileGrid, tileThreads>>>(devOut, devIn, (int)devStride);
  getLastCudaError("Kernel execution failed");

  // drain the warm-up, then time numIterations back-to-back launches
  checkCudaErrors(cudaDeviceSynchronize());
  sdkResetTimer(&stopwatch);
  sdkStartTimer(&stopwatch);

  for (int launch = 0; launch < numIterations; ++launch) {
    CUDAkernel2DCT<<<tileGrid, tileThreads>>>(devOut, devIn, (int)devStride);
    getLastCudaError("Kernel execution failed");
  }

  checkCudaErrors(cudaDeviceSynchronize());
  sdkStopTimer(&stopwatch);

  // per-launch average and throughput report
  float avgTime = (float)sdkGetTimerValue(&stopwatch) / (float)numIterations;
  sdkDeleteTimer(&stopwatch);

  const double megaPixels = 1E-6 * (float)Size.width * (float)Size.height;
  const double seconds = 1E-3 * avgTime;
  printf("%f MPix/s //%f ms\n", megaPixels / seconds, avgTime);

  // quantization runs on BLOCK_SIZE x BLOCK_SIZE thread blocks
  dim3 smallThreads(BLOCK_SIZE, BLOCK_SIZE);
  dim3 smallGrid(Size.width / BLOCK_SIZE, Size.height / BLOCK_SIZE);

  CUDAkernelQuantizationFloat<<<smallGrid, smallThreads>>>(devOut,
                                                           (int)devStride);
  getLastCudaError("Kernel execution failed");

  // inverse DCT: devOut -> devIn
  CUDAkernel2IDCT<<<tileGrid, tileThreads>>>(devIn, devOut, (int)devStride);
  checkCudaErrors(cudaDeviceSynchronize());
  getLastCudaError("Kernel execution failed");

  // download the reconstructed plane into the host staging buffer
  checkCudaErrors(cudaMemcpy2D(hostPlane, hostStride * sizeof(float), devIn,
                               devStride * sizeof(float), rowBytes, Size.height,
                               cudaMemcpyDeviceToHost));

  // undo the offset and store the result as bytes
  AddFloatPlane(128.0f, hostPlane, hostStride, Size);
  CopyFloat2Byte(hostPlane, hostStride, ImgDst, Stride, Size);

  // release device buffers and the host staging plane
  checkCudaErrors(cudaFree(devOut));
  checkCudaErrors(cudaFree(devIn));
  FreePlane(hostPlane);

  return avgTime;
}
 | |
| 
 | |
/**
**************************************************************************
*  GPU pipeline operating on 16-bit samples in a single pitched device
*  buffer that is transformed in place. Runs forward DCT (timed, one launch),
*  quantization and inverse DCT.
*
* \param ImgSrc         [IN]  - Source byte image plane
* \param ImgDst         [OUT] - Byte image plane that receives the quantized,
*                               reconstructed result
* \param Stride         [IN]  - Stride for both source and result planes
* \param Size           [IN]  - Size of both planes
*
* \return Time of the single forward-DCT launch (including the device
*         synchronize), in milliseconds
*/
float WrapperCUDAshort(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size) {
  // number of payload bytes in one image row of shorts
  const size_t rowBytes = Size.width * sizeof(short);

  // host staging plane of shorts
  int hostStride;
  short *hostPlane = MallocPlaneShort(Size.width, Size.height, &hostStride);

  // bytes -> shorts with 128 subtracted from every sample
  for (int y = 0; y < Size.height; y++) {
    const byte *srcRow = ImgSrc + y * Stride;
    short *stageRow = hostPlane + y * hostStride;

    for (int x = 0; x < Size.width; x++) {
      stageRow[x] = (short)srcRow[x] - 128;
    }
  }

  // single pitched device plane, used as both input and output
  short *devPlane;
  size_t devStride;
  checkCudaErrors(
      cudaMallocPitch((void **)(&devPlane), &devStride, rowBytes, Size.height));
  devStride /= sizeof(short);  // pitch in bytes -> stride in shorts

  // upload the staging plane
  checkCudaErrors(cudaMemcpy2D(devPlane, devStride * sizeof(short), hostPlane,
                               hostStride * sizeof(short), rowBytes,
                               Size.height, cudaMemcpyHostToDevice));

  // host-side stopwatch used for the benchmark
  StopWatchInterface *stopwatch = NULL;
  sdkCreateTimer(&stopwatch);
  sdkResetTimer(&stopwatch);

  // launch geometry: grid in KERS_BLOCK_WIDTH x KERS_BLOCK_HEIGHT tiles
  dim3 tileGrid(Size.width / KERS_BLOCK_WIDTH, Size.height / KERS_BLOCK_HEIGHT,
                1);
  dim3 tileThreads(8, KERS_BLOCK_WIDTH / 8, KERS_BLOCK_HEIGHT / 8);

  // timed forward DCT; the synchronize keeps the kernel inside the interval
  sdkStartTimer(&stopwatch);
  CUDAkernelShortDCT<<<tileGrid, tileThreads>>>(devPlane, (int)devStride);
  checkCudaErrors(cudaDeviceSynchronize());
  sdkStopTimer(&stopwatch);
  getLastCudaError("Kernel execution failed");

  // read the measurement before the stopwatch is destroyed
  const float elapsed = sdkGetAverageTimerValue(&stopwatch);
  sdkDeleteTimer(&stopwatch);

  // quantization runs on BLOCK_SIZE x BLOCK_SIZE thread blocks
  dim3 smallThreads(BLOCK_SIZE, BLOCK_SIZE);
  dim3 smallGrid(Size.width / BLOCK_SIZE, Size.height / BLOCK_SIZE);

  CUDAkernelQuantizationShort<<<smallGrid, smallThreads>>>(devPlane,
                                                           (int)devStride);
  getLastCudaError("Kernel execution failed");

  // inverse DCT, again in place
  CUDAkernelShortIDCT<<<tileGrid, tileThreads>>>(devPlane, (int)devStride);
  checkCudaErrors(cudaDeviceSynchronize());
  getLastCudaError("Kernel execution failed");

  // download the reconstructed plane into the host staging buffer
  checkCudaErrors(cudaMemcpy2D(hostPlane, hostStride * sizeof(short), devPlane,
                               devStride * sizeof(short), rowBytes, Size.height,
                               cudaMemcpyDeviceToHost));

  // shorts -> bytes: add 128 back and clamp through clamp_0_255
  for (int y = 0; y < Size.height; y++) {
    const short *stageRow = hostPlane + y * hostStride;
    byte *dstRow = ImgDst + y * Stride;

    for (int x = 0; x < Size.width; x++) {
      dstRow[x] = clamp_0_255(stageRow[x] + 128);
    }
  }

  // release the device plane and the host staging plane
  checkCudaErrors(cudaFree(devPlane));
  FreePlane(hostPlane);

  return elapsed;
}
 | |
| 
 | |
/**
**************************************************************************
*  Program entry point. Loads the sample image, runs the two CPU and three
*  CUDA DCT/quantization/IDCT pipelines on it, dumps every result as a BMP,
*  prints the CUDA timings and PSNR figures, and exits with a status that
*  reflects whether each CUDA result matches its CPU reference.
*
* \param argc       [IN] - Number of command-line arguments
* \param argv       [IN] - Array of command-line arguments
*
* \return Status code (the process terminates through exit(): EXIT_SUCCESS
*         when all compared PSNR values exceed PSNR_THRESHOLD_EQUAL,
*         EXIT_FAILURE otherwise or on any setup error)
*/

int main(int argc, char **argv) {
  //
  // Sample initialization
  //
  printf("%s Starting...\n\n", argv[0]);

  // initialize CUDA
  findCudaDevice(argc, (const char **)argv);

  // source and results image filenames
  char SampleImageFname[] = "teapot512.bmp";
  char SampleImageFnameResGold1[] = "teapot512_gold1.bmp";
  char SampleImageFnameResGold2[] = "teapot512_gold2.bmp";
  char SampleImageFnameResCUDA1[] = "teapot512_cuda1.bmp";
  char SampleImageFnameResCUDA2[] = "teapot512_cuda2.bmp";
  char SampleImageFnameResCUDAshort[] = "teapot512_cuda_short.bmp";

  char *pSampleImageFpath = sdkFindFilePath(SampleImageFname, argv[0]);

  if (pSampleImageFpath == NULL) {
    // Report the file name that was searched for: the path pointer is NULL
    // in this branch and must not be handed to a %s conversion.
    printf("dct8x8 could not locate Sample Image <%s>\nExiting...\n",
           SampleImageFname);
    exit(EXIT_FAILURE);
  }

  // preload image (acquire dimensions)
  int ImgWidth, ImgHeight;
  ROI ImgSize;
  int res = PreLoadBmp(pSampleImageFpath, &ImgWidth, &ImgHeight);
  ImgSize.width = ImgWidth;
  ImgSize.height = ImgHeight;

  // CONSOLE INFORMATION: saying hello to user
  printf("CUDA sample DCT/IDCT implementation\n");
  printf("===================================\n");
  printf("Loading test image: %s... ", SampleImageFname);

  if (res) {
    printf("\nError: Image file not found or invalid!\n");
    exit(EXIT_FAILURE);
  }

  // check image dimensions are multiples of BLOCK_SIZE
  if (ImgWidth % BLOCK_SIZE != 0 || ImgHeight % BLOCK_SIZE != 0) {
    printf("\nError: Input image dimensions must be multiples of 8!\n");
    exit(EXIT_FAILURE);
  }

  // WrapperCUDA2 and WrapperCUDAshort size their grids with a truncating
  // division by the KER2_* / KERS_* tile dimensions, so a remainder in either
  // direction would leave part of the image without a thread block. Reject
  // such images up front instead of producing partially processed output.
  if (ImgWidth % KER2_BLOCK_WIDTH != 0 || ImgHeight % KER2_BLOCK_HEIGHT != 0 ||
      ImgWidth % KERS_BLOCK_WIDTH != 0 || ImgHeight % KERS_BLOCK_HEIGHT != 0) {
    printf(
        "\nError: Input image dimensions must be multiples of the CUDA kernel "
        "tile sizes!\n");
    exit(EXIT_FAILURE);
  }

  printf("[%d x %d]... ", ImgWidth, ImgHeight);

  // allocate image buffers (every call reports its stride through ImgStride)
  int ImgStride;
  byte *ImgSrc = MallocPlaneByte(ImgWidth, ImgHeight, &ImgStride);
  byte *ImgDstGold1 = MallocPlaneByte(ImgWidth, ImgHeight, &ImgStride);
  byte *ImgDstGold2 = MallocPlaneByte(ImgWidth, ImgHeight, &ImgStride);
  byte *ImgDstCUDA1 = MallocPlaneByte(ImgWidth, ImgHeight, &ImgStride);
  byte *ImgDstCUDA2 = MallocPlaneByte(ImgWidth, ImgHeight, &ImgStride);
  byte *ImgDstCUDAshort = MallocPlaneByte(ImgWidth, ImgHeight, &ImgStride);

  // load sample image
  LoadBmpAsGray(pSampleImageFpath, ImgStride, ImgSize, ImgSrc);

  //
  // RUNNING WRAPPERS
  //

  // compute Gold 1 version of DCT/quantization/IDCT
  // (the CPU timings are not reported, so the return value is not kept)
  printf("Success\nRunning Gold 1 (CPU) version... ");
  WrapperGold1(ImgSrc, ImgDstGold1, ImgStride, ImgSize);

  // compute Gold 2 version of DCT/quantization/IDCT
  printf("Success\nRunning Gold 2 (CPU) version... ");
  WrapperGold2(ImgSrc, ImgDstGold2, ImgStride, ImgSize);

  // compute CUDA 1 version of DCT/quantization/IDCT
  printf("Success\nRunning CUDA 1 (GPU) version... ");
  float TimeCUDA1 = WrapperCUDA1(ImgSrc, ImgDstCUDA1, ImgStride, ImgSize);

  // compute CUDA 2 version of DCT/quantization/IDCT
  printf("Success\nRunning CUDA 2 (GPU) version... ");
  float TimeCUDA2 = WrapperCUDA2(ImgSrc, ImgDstCUDA2, ImgStride, ImgSize);

  // compute CUDA short version of DCT/quantization/IDCT
  printf("Success\nRunning CUDA short (GPU) version... ");
  float TimeCUDAshort =
      WrapperCUDAshort(ImgSrc, ImgDstCUDAshort, ImgStride, ImgSize);

  //
  // Execution statistics, result saving and validation
  //

  // dump result of Gold 1 processing
  printf("Success\nDumping result to %s... ", SampleImageFnameResGold1);
  DumpBmpAsGray(SampleImageFnameResGold1, ImgDstGold1, ImgStride, ImgSize);

  // dump result of Gold 2 processing
  printf("Success\nDumping result to %s... ", SampleImageFnameResGold2);
  DumpBmpAsGray(SampleImageFnameResGold2, ImgDstGold2, ImgStride, ImgSize);

  // dump result of CUDA 1 processing
  printf("Success\nDumping result to %s... ", SampleImageFnameResCUDA1);
  DumpBmpAsGray(SampleImageFnameResCUDA1, ImgDstCUDA1, ImgStride, ImgSize);

  // dump result of CUDA 2 processing
  printf("Success\nDumping result to %s... ", SampleImageFnameResCUDA2);
  DumpBmpAsGray(SampleImageFnameResCUDA2, ImgDstCUDA2, ImgStride, ImgSize);

  // dump result of CUDA short processing
  printf("Success\nDumping result to %s... ", SampleImageFnameResCUDAshort);
  DumpBmpAsGray(SampleImageFnameResCUDAshort, ImgDstCUDAshort, ImgStride,
                ImgSize);

  // print speed info
  printf("Success\n");

  printf("Processing time (CUDA 1)    : %f ms \n", TimeCUDA1);
  printf("Processing time (CUDA 2)    : %f ms \n", TimeCUDA2);
  printf("Processing time (CUDA short): %f ms \n", TimeCUDAshort);

  // calculate PSNR between each pair of images
  float PSNR_Src_DstGold1 =
      CalculatePSNR(ImgSrc, ImgDstGold1, ImgStride, ImgSize);
  float PSNR_Src_DstGold2 =
      CalculatePSNR(ImgSrc, ImgDstGold2, ImgStride, ImgSize);
  float PSNR_Src_DstCUDA1 =
      CalculatePSNR(ImgSrc, ImgDstCUDA1, ImgStride, ImgSize);
  float PSNR_Src_DstCUDA2 =
      CalculatePSNR(ImgSrc, ImgDstCUDA2, ImgStride, ImgSize);
  float PSNR_Src_DstCUDAshort =
      CalculatePSNR(ImgSrc, ImgDstCUDAshort, ImgStride, ImgSize);
  float PSNR_DstGold1_DstCUDA1 =
      CalculatePSNR(ImgDstGold1, ImgDstCUDA1, ImgStride, ImgSize);
  float PSNR_DstGold2_DstCUDA2 =
      CalculatePSNR(ImgDstGold2, ImgDstCUDA2, ImgStride, ImgSize);
  float PSNR_DstGold2_DstCUDA16b =
      CalculatePSNR(ImgDstGold2, ImgDstCUDAshort, ImgStride, ImgSize);

  printf("PSNR Original    <---> CPU(Gold 1)    : %f\n", PSNR_Src_DstGold1);
  printf("PSNR Original    <---> CPU(Gold 2)    : %f\n", PSNR_Src_DstGold2);
  printf("PSNR Original    <---> GPU(CUDA 1)    : %f\n", PSNR_Src_DstCUDA1);
  printf("PSNR Original    <---> GPU(CUDA 2)    : %f\n", PSNR_Src_DstCUDA2);
  printf("PSNR Original    <---> GPU(CUDA short): %f\n", PSNR_Src_DstCUDAshort);
  printf("PSNR CPU(Gold 1) <---> GPU(CUDA 1)    : %f\n",
         PSNR_DstGold1_DstCUDA1);
  printf("PSNR CPU(Gold 2) <---> GPU(CUDA 2)    : %f\n",
         PSNR_DstGold2_DstCUDA2);
  printf("PSNR CPU(Gold 2) <---> GPU(CUDA short): %f\n",
         PSNR_DstGold2_DstCUDA16b);

  // the test passes only when every CUDA result is close enough to the CPU
  // reference it is paired with
  bool bTestResult = (PSNR_DstGold1_DstCUDA1 > PSNR_THRESHOLD_EQUAL &&
                      PSNR_DstGold2_DstCUDA2 > PSNR_THRESHOLD_EQUAL &&
                      PSNR_DstGold2_DstCUDA16b > PSNR_THRESHOLD_EQUAL);

  //
  // Finalization
  //

  // release byte planes
  FreePlane(ImgSrc);
  FreePlane(ImgDstGold1);
  FreePlane(ImgDstGold2);
  FreePlane(ImgDstCUDA1);
  FreePlane(ImgDstCUDA2);
  FreePlane(ImgDstCUDAshort);

  // finalize
  printf("\nTest Summary...\n");

  if (!bTestResult) {
    printf("Test failed!\n");
    exit(EXIT_FAILURE);
  }

  printf("Test passed\n");
  exit(EXIT_SUCCESS);
}
 |