cuda-samples/Samples/stereoDisparity/stereoDisparity.cu
2021-10-21 16:34:49 +05:30

283 lines
9.6 KiB
Plaintext

/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* A CUDA program that demonstrates how to compute a stereo disparity map using
* SIMD SAD (Sum of Absolute Difference) intrinsics
*/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, kernels
#include <cuda_runtime.h>
#include "stereoDisparity_kernel.cuh"
// includes, project
#include <helper_functions.h> // helper for shared that are common to CUDA Samples
#include <helper_cuda.h> // helper for checking cuda initialization and error checking
#include <helper_string.h> // helper functions for string parsing
static const char *sSDKsample = "[stereoDisparity]\0";
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
printf("%s Starting...\n\n", sSDKsample);
runTest(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
//! CUDA Sample for calculating depth maps
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) {
cudaDeviceProp deviceProp;
deviceProp.major = 0;
deviceProp.minor = 0;
int dev = 0;
// This will pick the best possible CUDA capable device
dev = findCudaDevice(argc, (const char **)argv);
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
// Statistics about the GPU device
printf(
"> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);
StopWatchInterface *timer;
sdkCreateTimer(&timer);
// Search parameters
int minDisp = -16;
int maxDisp = 0;
// Load image data
// allocate mem for the images on host side
// initialize pointers to NULL to request lib call to allocate as needed
// PPM images are loaded into 4 byte/pixel memory (RGBX)
unsigned char *h_img0 = NULL;
unsigned char *h_img1 = NULL;
unsigned int w, h;
char *fname0 = sdkFindFilePath("stereo.im0.640x533.ppm", argv[0]);
char *fname1 = sdkFindFilePath("stereo.im1.640x533.ppm", argv[0]);
printf("Loaded <%s> as image 0\n", fname0);
if (!sdkLoadPPM4ub(fname0, &h_img0, &w, &h)) {
fprintf(stderr, "Failed to load <%s>\n", fname0);
}
printf("Loaded <%s> as image 1\n", fname1);
if (!sdkLoadPPM4ub(fname1, &h_img1, &w, &h)) {
fprintf(stderr, "Failed to load <%s>\n", fname1);
}
dim3 numThreads = dim3(blockSize_x, blockSize_y, 1);
dim3 numBlocks = dim3(iDivUp(w, numThreads.x), iDivUp(h, numThreads.y));
unsigned int numData = w * h;
unsigned int memSize = sizeof(int) * numData;
// allocate mem for the result on host side
unsigned int *h_odata = (unsigned int *)malloc(memSize);
// initialize the memory
for (unsigned int i = 0; i < numData; i++) h_odata[i] = 0;
// allocate device memory for result
unsigned int *d_odata, *d_img0, *d_img1;
checkCudaErrors(cudaMalloc((void **)&d_odata, memSize));
checkCudaErrors(cudaMalloc((void **)&d_img0, memSize));
checkCudaErrors(cudaMalloc((void **)&d_img1, memSize));
// copy host memory to device to initialize to zeros
checkCudaErrors(cudaMemcpy(d_img0, h_img0, memSize, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_img1, h_img1, memSize, cudaMemcpyHostToDevice));
checkCudaErrors(
cudaMemcpy(d_odata, h_odata, memSize, cudaMemcpyHostToDevice));
cudaChannelFormatDesc ca_desc0 = cudaCreateChannelDesc<unsigned int>();
cudaChannelFormatDesc ca_desc1 = cudaCreateChannelDesc<unsigned int>();
cudaTextureObject_t tex2Dleft, tex2Dright;
cudaResourceDesc texRes;
memset(&texRes, 0, sizeof(cudaResourceDesc));
texRes.resType = cudaResourceTypePitch2D;
texRes.res.pitch2D.devPtr = d_img0;
texRes.res.pitch2D.desc = ca_desc0;
texRes.res.pitch2D.width = w;
texRes.res.pitch2D.height = h;
texRes.res.pitch2D.pitchInBytes = w * 4;
cudaTextureDesc texDescr;
memset(&texDescr, 0, sizeof(cudaTextureDesc));
texDescr.normalizedCoords = false;
texDescr.filterMode = cudaFilterModePoint;
texDescr.addressMode[0] = cudaAddressModeClamp;
texDescr.addressMode[1] = cudaAddressModeClamp;
texDescr.readMode = cudaReadModeElementType;
checkCudaErrors(
cudaCreateTextureObject(&tex2Dleft, &texRes, &texDescr, NULL));
memset(&texRes, 0, sizeof(cudaResourceDesc));
texRes.resType = cudaResourceTypePitch2D;
texRes.res.pitch2D.devPtr = d_img1;
texRes.res.pitch2D.desc = ca_desc1;
texRes.res.pitch2D.width = w;
texRes.res.pitch2D.height = h;
texRes.res.pitch2D.pitchInBytes = w * 4;
memset(&texDescr, 0, sizeof(cudaTextureDesc));
texDescr.normalizedCoords = false;
texDescr.filterMode = cudaFilterModePoint;
texDescr.addressMode[0] = cudaAddressModeClamp;
texDescr.addressMode[1] = cudaAddressModeClamp;
texDescr.readMode = cudaReadModeElementType;
checkCudaErrors(
cudaCreateTextureObject(&tex2Dright, &texRes, &texDescr, NULL));
// First run the warmup kernel (which we'll use to get the GPU in the correct
// max power state
stereoDisparityKernel<<<numBlocks, numThreads>>>(
d_img0, d_img1, d_odata, w, h, minDisp, maxDisp, tex2Dleft, tex2Dright);
cudaDeviceSynchronize();
// Allocate CUDA events that we'll use for timing
cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
printf("Launching CUDA stereoDisparityKernel()\n");
// Record the start event
checkCudaErrors(cudaEventRecord(start, NULL));
// launch the stereoDisparity kernel
stereoDisparityKernel<<<numBlocks, numThreads>>>(
d_img0, d_img1, d_odata, w, h, minDisp, maxDisp, tex2Dleft, tex2Dright);
// Record the stop event
checkCudaErrors(cudaEventRecord(stop, NULL));
// Wait for the stop event to complete
checkCudaErrors(cudaEventSynchronize(stop));
// Check to make sure the kernel didn't fail
getLastCudaError("Kernel execution failed");
float msecTotal = 0.0f;
checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));
// Copy result from device to host for verification
checkCudaErrors(
cudaMemcpy(h_odata, d_odata, memSize, cudaMemcpyDeviceToHost));
printf("Input Size [%dx%d], ", w, h);
printf("Kernel size [%dx%d], ", (2 * RAD + 1), (2 * RAD + 1));
printf("Disparities [%d:%d]\n", minDisp, maxDisp);
printf("GPU processing time : %.4f (ms)\n", msecTotal);
printf("Pixel throughput : %.3f Mpixels/sec\n",
((float)(w * h * 1000.f) / msecTotal) / 1000000);
// calculate sum of resultant GPU image
unsigned int checkSum = 0;
for (unsigned int i = 0; i < w * h; i++) {
checkSum += h_odata[i];
}
printf("GPU Checksum = %u, ", checkSum);
// write out the resulting disparity image.
unsigned char *dispOut = (unsigned char *)malloc(numData);
int mult = 20;
const char *fnameOut = "output_GPU.pgm";
for (unsigned int i = 0; i < numData; i++) {
dispOut[i] = (int)h_odata[i] * mult;
}
printf("GPU image: <%s>\n", fnameOut);
sdkSavePGM(fnameOut, dispOut, w, h);
// compute reference solution
printf("Computing CPU reference...\n");
cpu_gold_stereo((unsigned int *)h_img0, (unsigned int *)h_img1,
(unsigned int *)h_odata, w, h, minDisp, maxDisp);
unsigned int cpuCheckSum = 0;
for (unsigned int i = 0; i < w * h; i++) {
cpuCheckSum += h_odata[i];
}
printf("CPU Checksum = %u, ", cpuCheckSum);
const char *cpuFnameOut = "output_CPU.pgm";
for (unsigned int i = 0; i < numData; i++) {
dispOut[i] = (int)h_odata[i] * mult;
}
printf("CPU image: <%s>\n", cpuFnameOut);
sdkSavePGM(cpuFnameOut, dispOut, w, h);
// cleanup memory
checkCudaErrors(cudaFree(d_odata));
checkCudaErrors(cudaFree(d_img0));
checkCudaErrors(cudaFree(d_img1));
if (h_odata != NULL) free(h_odata);
if (h_img0 != NULL) free(h_img0);
if (h_img1 != NULL) free(h_img1);
if (dispOut != NULL) free(dispOut);
sdkDeleteTimer(&timer);
exit((checkSum == cpuCheckSum) ? EXIT_SUCCESS : EXIT_FAILURE);
}