// Mirror of https://github.com/NVIDIA/cuda-samples.git
// (synced 2025-01-20 02:45:53 +08:00; 283 lines, 9.6 KiB, plaintext)
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
|
|
|
|
/* A CUDA program that demonstrates how to compute a stereo disparity map using
 * SIMD SAD (Sum of Absolute Difference) intrinsics.
 */
|
|
|
|
// includes, system
|
|
#include <stdlib.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <math.h>
|
|
|
|
// includes, kernels
|
|
#include <cuda_runtime.h>
|
|
#include "stereoDisparity_kernel.cuh"
|
|
|
|
// includes, project
|
|
#include <helper_functions.h> // helper for shared that are common to CUDA Samples
|
|
#include <helper_cuda.h> // helper for checking cuda initialization and error checking
|
|
#include <helper_string.h> // helper functions for string parsing
|
|
|
|
// Sample name tag; main() prints it in the start-up banner.
static const char *sSDKsample = "[stereoDisparity]\0";
|
|
|
|
// Divides a by b and adds one to the truncated quotient whenever the
// division leaves a non-zero remainder. Used to size the launch grid so
// that it covers the whole image. b must be non-zero.
int iDivUp(int a, int b) {
  const int quotient = a / b;
  const bool hasRemainder = (a % b) != 0;
  return hasRemainder ? quotient + 1 : quotient;
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// declaration, forward
|
|
// Forward declaration so that main() can call runTest() before its
// definition further down in this file.
void runTest(int argc, char **argv);
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Program main
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
int main(int argc, char **argv) {
  // Print the sample banner, then hand the command line to runTest(),
  // which performs the whole demonstration.
  printf("%s Starting...\n\n", sSDKsample);

  runTest(argc, argv);

  return 0;
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
//! CUDA Sample for calculating depth maps
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Runs the stereo disparity demonstration end to end:
//   1. selects a CUDA device and prints its properties,
//   2. loads the two input PPM images,
//   3. uploads them, binds them to texture objects and runs
//      stereoDisparityKernel (one warm-up launch, one timed launch),
//   4. writes the GPU result to "output_GPU.pgm",
//   5. computes the CPU reference with cpu_gold_stereo and writes it to
//      "output_CPU.pgm",
//   6. terminates the process with EXIT_SUCCESS when the GPU and CPU
//      checksums agree, EXIT_FAILURE otherwise.
//
// argc/argv are forwarded to findCudaDevice(); argv[0] is also used to
// locate the input images. The function never returns: it always ends in
// exit(), including on any input or allocation failure.
void runTest(int argc, char **argv) {
  cudaDeviceProp deviceProp;
  deviceProp.major = 0;
  deviceProp.minor = 0;
  int dev = 0;

  // This will pick the best possible CUDA capable device
  dev = findCudaDevice(argc, (const char **)argv);

  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));

  // Statistics about the GPU device
  printf(
      "> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
      deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);

  StopWatchInterface *timer;
  sdkCreateTimer(&timer);

  // Search parameters
  int minDisp = -16;
  int maxDisp = 0;

  // Load image data
  // allocate mem for the images on host side
  // initialize pointers to NULL to request lib call to allocate as needed
  // PPM images are loaded into 4 byte/pixel memory (RGBX)
  unsigned char *h_img0 = NULL;
  unsigned char *h_img1 = NULL;
  unsigned int w = 0, h = 0;
  unsigned int w1 = 0, h1 = 0;
  char *fname0 = sdkFindFilePath("stereo.im0.640x533.ppm", argv[0]);
  char *fname1 = sdkFindFilePath("stereo.im1.640x533.ppm", argv[0]);

  // A NULL path must not reach printf("%s") or the loader.
  if (fname0 == NULL || fname1 == NULL) {
    fprintf(stderr, "Failed to locate the input stereo image pair\n");
    exit(EXIT_FAILURE);
  }

  // Stop on a failed load: continuing would dereference NULL image
  // pointers and use meaningless dimensions.
  if (!sdkLoadPPM4ub(fname0, &h_img0, &w, &h)) {
    fprintf(stderr, "Failed to load <%s>\n", fname0);
    exit(EXIT_FAILURE);
  }

  printf("Loaded <%s> as image 0\n", fname0);

  if (!sdkLoadPPM4ub(fname1, &h_img1, &w1, &h1)) {
    fprintf(stderr, "Failed to load <%s>\n", fname1);
    exit(EXIT_FAILURE);
  }

  printf("Loaded <%s> as image 1\n", fname1);

  // Every buffer below is sized from a single (w, h); a mismatched pair
  // would make the host-to-device copies read past the smaller image.
  if (w1 != w || h1 != h) {
    fprintf(stderr, "Image size mismatch: [%ux%u] vs [%ux%u]\n", w, h, w1, h1);
    exit(EXIT_FAILURE);
  }

  dim3 numThreads = dim3(blockSize_x, blockSize_y, 1);
  dim3 numBlocks = dim3(iDivUp(w, numThreads.x), iDivUp(h, numThreads.y));
  unsigned int numData = w * h;
  unsigned int memSize = sizeof(int) * numData;

  // allocate mem for the result on host side
  unsigned int *h_odata = (unsigned int *)malloc(memSize);

  if (h_odata == NULL) {
    fprintf(stderr, "Failed to allocate %u bytes for the host result\n",
            memSize);
    exit(EXIT_FAILURE);
  }

  // initialize the memory
  memset(h_odata, 0, memSize);

  // allocate device memory for result
  unsigned int *d_odata, *d_img0, *d_img1;

  checkCudaErrors(cudaMalloc((void **)&d_odata, memSize));
  checkCudaErrors(cudaMalloc((void **)&d_img0, memSize));
  checkCudaErrors(cudaMalloc((void **)&d_img1, memSize));

  // upload both images and clear the device result buffer with zeros
  checkCudaErrors(cudaMemcpy(d_img0, h_img0, memSize, cudaMemcpyHostToDevice));
  checkCudaErrors(cudaMemcpy(d_img1, h_img1, memSize, cudaMemcpyHostToDevice));
  checkCudaErrors(
      cudaMemcpy(d_odata, h_odata, memSize, cudaMemcpyHostToDevice));

  // Both textures share the same layout (one unsigned int per pixel,
  // rows tightly packed) and the same sampling setup; only the device
  // pointer differs between the left and the right image.
  cudaChannelFormatDesc ca_desc = cudaCreateChannelDesc<unsigned int>();

  cudaTextureObject_t tex2Dleft, tex2Dright;
  cudaResourceDesc texRes;
  memset(&texRes, 0, sizeof(cudaResourceDesc));

  texRes.resType = cudaResourceTypePitch2D;
  texRes.res.pitch2D.devPtr = d_img0;
  texRes.res.pitch2D.desc = ca_desc;
  texRes.res.pitch2D.width = w;
  texRes.res.pitch2D.height = h;
  // NOTE(review): pitch2D resources require the pitch to satisfy the
  // device's texturePitchAlignment; w * 4 is presumably fine for the
  // bundled 640-pixel-wide images, and checkCudaErrors reports it if not.
  texRes.res.pitch2D.pitchInBytes = w * 4;

  cudaTextureDesc texDescr;
  memset(&texDescr, 0, sizeof(cudaTextureDesc));

  texDescr.normalizedCoords = false;
  texDescr.filterMode = cudaFilterModePoint;
  texDescr.addressMode[0] = cudaAddressModeClamp;
  texDescr.addressMode[1] = cudaAddressModeClamp;
  texDescr.readMode = cudaReadModeElementType;

  checkCudaErrors(
      cudaCreateTextureObject(&tex2Dleft, &texRes, &texDescr, NULL));

  texRes.res.pitch2D.devPtr = d_img1;

  checkCudaErrors(
      cudaCreateTextureObject(&tex2Dright, &texRes, &texDescr, NULL));

  // First run the warmup kernel (which we'll use to get the GPU in the correct
  // max power state)
  stereoDisparityKernel<<<numBlocks, numThreads>>>(
      d_img0, d_img1, d_odata, w, h, minDisp, maxDisp, tex2Dleft, tex2Dright);
  // Report a bad launch or an in-kernel fault here instead of letting it
  // surface at some later, unrelated API call.
  getLastCudaError("Warm-up kernel launch failed");
  checkCudaErrors(cudaDeviceSynchronize());

  // Allocate CUDA events that we'll use for timing
  cudaEvent_t start, stop;
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));

  printf("Launching CUDA stereoDisparityKernel()\n");

  // Record the start event
  checkCudaErrors(cudaEventRecord(start, NULL));

  // launch the stereoDisparity kernel
  stereoDisparityKernel<<<numBlocks, numThreads>>>(
      d_img0, d_img1, d_odata, w, h, minDisp, maxDisp, tex2Dleft, tex2Dright);

  // Record the stop event
  checkCudaErrors(cudaEventRecord(stop, NULL));

  // Wait for the stop event to complete
  checkCudaErrors(cudaEventSynchronize(stop));

  // Check to make sure the kernel didn't fail
  getLastCudaError("Kernel execution failed");

  float msecTotal = 0.0f;
  checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));

  // Copy result from device to host for verification
  checkCudaErrors(
      cudaMemcpy(h_odata, d_odata, memSize, cudaMemcpyDeviceToHost));

  printf("Input Size [%dx%d], ", w, h);
  printf("Kernel size [%dx%d], ", (2 * RAD + 1), (2 * RAD + 1));
  printf("Disparities [%d:%d]\n", minDisp, maxDisp);

  printf("GPU processing time : %.4f (ms)\n", msecTotal);
  printf("Pixel throughput    : %.3f Mpixels/sec\n",
         ((float)(w * h * 1000.f) / msecTotal) / 1000000);

  // calculate sum of resultant GPU image
  unsigned int checkSum = 0;

  for (unsigned int i = 0; i < w * h; i++) {
    checkSum += h_odata[i];
  }

  printf("GPU Checksum = %u, ", checkSum);

  // write out the resulting disparity image.
  unsigned char *dispOut = (unsigned char *)malloc(numData);

  if (dispOut == NULL) {
    fprintf(stderr, "Failed to allocate %u bytes for the output image\n",
            numData);
    exit(EXIT_FAILURE);
  }

  // Scale factor applied to each disparity before it is stored as an
  // 8-bit grey value in the output image.
  int mult = 20;
  const char *fnameOut = "output_GPU.pgm";

  for (unsigned int i = 0; i < numData; i++) {
    dispOut[i] = (int)h_odata[i] * mult;
  }

  printf("GPU image: <%s>\n", fnameOut);
  sdkSavePGM(fnameOut, dispOut, w, h);

  // compute reference solution (overwrites h_odata with the CPU result)
  printf("Computing CPU reference...\n");
  cpu_gold_stereo((unsigned int *)h_img0, (unsigned int *)h_img1,
                  (unsigned int *)h_odata, w, h, minDisp, maxDisp);
  unsigned int cpuCheckSum = 0;

  for (unsigned int i = 0; i < w * h; i++) {
    cpuCheckSum += h_odata[i];
  }

  printf("CPU Checksum = %u, ", cpuCheckSum);
  const char *cpuFnameOut = "output_CPU.pgm";

  for (unsigned int i = 0; i < numData; i++) {
    dispOut[i] = (int)h_odata[i] * mult;
  }

  printf("CPU image: <%s>\n", cpuFnameOut);
  sdkSavePGM(cpuFnameOut, dispOut, w, h);

  // cleanup: release the texture objects and events before the device
  // buffers the textures refer to
  checkCudaErrors(cudaDestroyTextureObject(tex2Dleft));
  checkCudaErrors(cudaDestroyTextureObject(tex2Dright));
  checkCudaErrors(cudaEventDestroy(start));
  checkCudaErrors(cudaEventDestroy(stop));

  checkCudaErrors(cudaFree(d_odata));
  checkCudaErrors(cudaFree(d_img0));
  checkCudaErrors(cudaFree(d_img1));

  if (h_odata != NULL) free(h_odata);

  if (h_img0 != NULL) free(h_img0);

  if (h_img1 != NULL) free(h_img1);

  if (dispOut != NULL) free(dispOut);

  sdkDeleteTimer(&timer);

  // The sample passes when the GPU and CPU disparity maps sum to the same
  // value.
  exit((checkSum == cpuCheckSum) ? EXIT_SUCCESS : EXIT_FAILURE);
}
|