/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#define WINDOWS_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#pragma warning(disable : 4819)
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <fstream>
#include <string>

#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_string.h>
#include <npp.h>

// Note:  If you want to view these images we HIGHLY recommend using imagej
//        which is free on the internet and works on most platforms
//        because it is one of the few image viewing apps that can display 32
//        bit integer image data.  While it normalizes the data to floating
//        point values for viewing it still provides a good representation of
//        the relative brightness of each label value.  Note that label
//        compression output results in smaller differences between label values
//        making it visually more difficult to detect differences in labeled
//        regions.  If you have an editor that can display hex values you can
//        see what the exact values of each label is, every 4 bytes represents 1
//        32 bit integer label value.
//
//        The files read and written by this sample app use RAW image format,
//        that is, only the image data itself exists in the files with no image
//        format information.   When viewing RAW files with imagej just enter
//        the image size and bit depth values that are part of the file name
//        when requested by imagej.
//
//        This sample app works in 2 stages, first it processes all of the
//        images individually then it processes them all again in 1 batch using
//        the Batch_Advanced versions of the NPP batch functions which allow
//        each image to have its own ROI.  The 2 stages are completely
//        separable but in this sample the second stage takes advantage of some
//        of the data that has already been initialized.
//
//        Note that there is a small amount of variability in the number of
//        unique label markers generated from one run to the next by the UF
//        algorithm.
//
//        Performance of ALL NPP image batch functions is limited by the maximum
//        ROI height in the list of images.

// Batched label compression support is only available on NPP versions > 11.0, // comment out if using NPP 11.0 #define USE_BATCHED_LABEL_COMPRESSION 1 #define NUMBER_OF_IMAGES 5 Npp8u *pInputImageDev[NUMBER_OF_IMAGES]; Npp8u *pInputImageHost[NUMBER_OF_IMAGES]; Npp8u *pUFGenerateLabelsScratchBufferDev[NUMBER_OF_IMAGES]; Npp8u *pUFCompressedLabelsScratchBufferDev[NUMBER_OF_IMAGES]; Npp32u *pUFLabelDev[NUMBER_OF_IMAGES]; Npp32u *pUFLabelHost[NUMBER_OF_IMAGES]; NppiImageDescriptor *pUFBatchSrcImageListDev = 0; NppiImageDescriptor *pUFBatchSrcDstImageListDev = 0; NppiImageDescriptor *pUFBatchSrcImageListHost = 0; NppiImageDescriptor *pUFBatchSrcDstImageListHost = 0; NppiBufferDescriptor *pUFBatchSrcDstScratchBufferListDev = 0; // from nppi_filtering_functions.h NppiBufferDescriptor *pUFBatchSrcDstScratchBufferListHost = 0; Npp32u *pUFBatchPerImageCompressedCountListDev = 0; Npp32u *pUFBatchPerImageCompressedCountListHost = 0; void tearDown() // Clean up and tear down { if (pUFBatchPerImageCompressedCountListDev != 0) cudaFree(pUFBatchPerImageCompressedCountListDev); if (pUFBatchSrcDstScratchBufferListDev != 0) cudaFree(pUFBatchSrcDstScratchBufferListDev); if (pUFBatchSrcDstImageListDev != 0) cudaFree(pUFBatchSrcDstImageListDev); if (pUFBatchSrcImageListDev != 0) cudaFree(pUFBatchSrcImageListDev); if (pUFBatchPerImageCompressedCountListHost != 0) cudaFreeHost(pUFBatchPerImageCompressedCountListHost); if (pUFBatchSrcDstScratchBufferListHost != 0) cudaFreeHost(pUFBatchSrcDstScratchBufferListHost); if (pUFBatchSrcDstImageListHost != 0) cudaFreeHost(pUFBatchSrcDstImageListHost); if (pUFBatchSrcImageListHost != 0) cudaFreeHost(pUFBatchSrcImageListHost); for (int j = 0; j < NUMBER_OF_IMAGES; j++) { if (pUFCompressedLabelsScratchBufferDev[j] != 0) cudaFree(pUFCompressedLabelsScratchBufferDev[j]); if (pUFGenerateLabelsScratchBufferDev[j] != 0) cudaFree(pUFGenerateLabelsScratchBufferDev[j]); if (pUFLabelDev[j] != 0) cudaFree(pUFLabelDev[j]); if (pInputImageDev[j] != 0) 
cudaFree(pInputImageDev[j]); if (pUFLabelHost[j] != 0) cudaFreeHost(pUFLabelHost[j]); if (pInputImageHost[j] != 0) cudaFreeHost(pInputImageHost[j]); } } const std::string &LabelMarkersOutputFile0 = "Lena_LabelMarkersUF_8Way_512x512_32u.raw"; const std::string &LabelMarkersOutputFile1 = "CT_skull_LabelMarkersUF_8Way_512x512_32u.raw"; const std::string &LabelMarkersOutputFile2 = "PCB_METAL_LabelMarkersUF_8Way_509x335_32u.raw"; const std::string &LabelMarkersOutputFile3 = "PCB2_LabelMarkersUF_8Way_1024x683_32u.raw"; const std::string &LabelMarkersOutputFile4 = "PCB_LabelMarkersUF_8Way_1280x720_32u.raw"; const std::string &CompressedMarkerLabelsOutputFile0 = "Lena_CompressedMarkerLabelsUF_8Way_512x512_32u.raw"; const std::string &CompressedMarkerLabelsOutputFile1 = "CT_skull_CompressedMarkerLabelsUF_8Way_512x512_32u.raw"; const std::string &CompressedMarkerLabelsOutputFile2 = "PCB_METAL_CompressedMarkerLabelsUF_8Way_509x335_32u.raw"; const std::string &CompressedMarkerLabelsOutputFile3 = "PCB2_CompressedMarkerLabelsUF_8Way_1024x683_32u.raw"; const std::string &CompressedMarkerLabelsOutputFile4 = "PCB_CompressedMarkerLabelsUF_8Way_1280x720_32u.raw"; const std::string &LabelMarkersBatchOutputFile0 = "Lena_LabelMarkersUFBatch_8Way_512x512_32u.raw"; const std::string &LabelMarkersBatchOutputFile1 = "CT_skull_LabelMarkersUFBatch_8Way_512x512_32u.raw"; const std::string &LabelMarkersBatchOutputFile2 = "PCB_METAL_LabelMarkersUFBatch_8Way_509x335_32u.raw"; const std::string &LabelMarkersBatchOutputFile3 = "PCB2_LabelMarkersUFBatch_8Way_1024x683_32u.raw"; const std::string &LabelMarkersBatchOutputFile4 = "PCB_LabelMarkersUFBatch_8Way_1280x720_32u.raw"; const std::string &CompressedMarkerLabelsBatchOutputFile0 = "Lena_CompressedMarkerLabelsUFBatch_8Way_512x512_32u.raw"; const std::string &CompressedMarkerLabelsBatchOutputFile1 = "CT_skull_CompressedMarkerLabelsUFBatch_8Way_512x512_32u.raw"; const std::string &CompressedMarkerLabelsBatchOutputFile2 = 
"PCB_METAL_CompressedMarkerLabelsUFBatch_8Way_509x335_32u.raw"; const std::string &CompressedMarkerLabelsBatchOutputFile3 = "PCB2_CompressedMarkerLabelsUFBatch_8Way_1024x683_32u.raw"; const std::string &CompressedMarkerLabelsBatchOutputFile4 = "PCB_CompressedMarkerLabelsUFBatch_8Way_1280x720_32u.raw"; int loadRaw8BitImage(Npp8u *pImage, int nWidth, int nHeight, int nImage) { FILE *bmpFile; size_t nSize; if (nImage == 0) { if (nWidth != 512 || nHeight != 512) return -1; const char *fileName = "lena_512x512_8u.raw"; const char *InputFile = sdkFindFilePath(fileName, "."); if (InputFile == NULL) { printf("%s file not found.. exiting\n", fileName); exit(EXIT_WAIVED); } FOPEN(bmpFile, InputFile, "rb"); } else if (nImage == 1) { if (nWidth != 512 || nHeight != 512) return -1; const char *fileName = "CT_skull_512x512_8u.raw"; const char *InputFile = sdkFindFilePath(fileName, "."); if (InputFile == NULL) { printf("%s file not found.. exiting\n", fileName); exit(EXIT_WAIVED); } FOPEN(bmpFile, InputFile, "rb"); } else if (nImage == 2) { if (nWidth != 509 || nHeight != 335) return -1; const char *fileName = "PCB_METAL_509x335_8u.raw"; const char *InputFile = sdkFindFilePath(fileName, "."); if (InputFile == NULL) { printf("%s file not found.. exiting\n", fileName); exit(EXIT_WAIVED); } FOPEN(bmpFile, InputFile, "rb"); } else if (nImage == 3) { if (nWidth != 1024 || nHeight != 683) return -1; const char *fileName = "PCB2_1024x683_8u.raw"; const char *InputFile = sdkFindFilePath(fileName, "."); if (InputFile == NULL) { printf("%s file not found.. exiting\n", fileName); exit(EXIT_WAIVED); } FOPEN(bmpFile, InputFile, "rb"); } else if (nImage == 4) { if (nWidth != 1280 || nHeight != 720) return -1; const char *fileName = "PCB_1280x720_8u.raw"; const char *InputFile = sdkFindFilePath(fileName, "."); if (InputFile == NULL) { printf("%s file not found.. 
exiting\n", fileName); exit(EXIT_WAIVED); } FOPEN(bmpFile, InputFile, "rb"); } else { printf("Input file load failed.\n"); return -1; } if (bmpFile == NULL) return -1; nSize = fread(pImage, 1, nWidth * nHeight, bmpFile); if (nSize < nWidth * nHeight) { fclose(bmpFile); return -1; } fclose(bmpFile); printf("Input file load succeeded.\n"); return 0; } int main(int argc, char **argv) { int aGenerateLabelsScratchBufferSize[NUMBER_OF_IMAGES]; int aCompressLabelsScratchBufferSize[NUMBER_OF_IMAGES]; int nCompressedLabelCount = 0; cudaError_t cudaError; NppStatus nppStatus; NppStreamContext nppStreamCtx; FILE *bmpFile; for (int j = 0; j < NUMBER_OF_IMAGES; j++) { pInputImageDev[j] = 0; pInputImageHost[j] = 0; pUFGenerateLabelsScratchBufferDev[j] = 0; pUFCompressedLabelsScratchBufferDev[j] = 0; pUFLabelDev[j] = 0; pUFLabelHost[j] = 0; } nppStreamCtx.hStream = 0; // The NULL stream by default, set this to whatever // your stream ID is if not the NULL stream. cudaError = cudaGetDevice(&nppStreamCtx.nCudaDeviceId); if (cudaError != cudaSuccess) { printf("CUDA error: no devices supporting CUDA.\n"); return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; } const NppLibraryVersion *libVer = nppGetLibVersion(); printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build); int driverVersion, runtimeVersion; cudaDriverGetVersion(&driverVersion); cudaRuntimeGetVersion(&runtimeVersion); printf("CUDA Driver Version: %d.%d\n", driverVersion / 1000, (driverVersion % 100) / 10); printf("CUDA Runtime Version: %d.%d\n\n", runtimeVersion / 1000, (runtimeVersion % 100) / 10); cudaError = cudaDeviceGetAttribute( &nppStreamCtx.nCudaDevAttrComputeCapabilityMajor, cudaDevAttrComputeCapabilityMajor, nppStreamCtx.nCudaDeviceId); if (cudaError != cudaSuccess) return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; cudaError = cudaDeviceGetAttribute( &nppStreamCtx.nCudaDevAttrComputeCapabilityMinor, cudaDevAttrComputeCapabilityMinor, nppStreamCtx.nCudaDeviceId); if (cudaError != cudaSuccess) 
return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; cudaError = cudaStreamGetFlags(nppStreamCtx.hStream, &nppStreamCtx.nStreamFlags); cudaDeviceProp oDeviceProperties; cudaError = cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId); nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount; nppStreamCtx.nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor; nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock; nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock; NppiSize oSizeROI[NUMBER_OF_IMAGES]; for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) { if (nImage == 0) { oSizeROI[nImage].width = 512; oSizeROI[nImage].height = 512; } else if (nImage == 1) { oSizeROI[nImage].width = 512; oSizeROI[nImage].height = 512; } else if (nImage == 2) { oSizeROI[nImage].width = 509; oSizeROI[nImage].height = 335; } else if (nImage == 3) { oSizeROI[nImage].width = 1024; oSizeROI[nImage].height = 683; } else if (nImage == 4) { oSizeROI[nImage].width = 1280; oSizeROI[nImage].height = 720; } // NOTE: While using cudaMallocPitch() to allocate device memory for NPP can // significantly improve the performance of many NPP functions, for UF // function label markers generation or compression DO NOT USE // cudaMallocPitch(). Doing so could result in incorrect output. cudaError = cudaMalloc( (void **)&pInputImageDev[nImage], oSizeROI[nImage].width * sizeof(Npp8u) * oSizeROI[nImage].height); if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR; // For images processed with UF label markers functions ROI width and height // for label markers generation output AND marker compression functions MUST // be the same AND line pitch MUST be equal to ROI.width * sizeof(Npp32u). // Also the image pointer used for label markers generation output must // start at the same position in the image as it does in the marker // compression function. 
Also note that actual input image size and ROI do // not necessarily need to be related other than ROI being less than or // equal to image size and image starting position does not necessarily have // to be at pixel 0 in the input image. cudaError = cudaMalloc( (void **)&pUFLabelDev[nImage], oSizeROI[nImage].width * sizeof(Npp32u) * oSizeROI[nImage].height); if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR; checkCudaErrors(cudaMallocHost( &(pInputImageHost[nImage]), oSizeROI[nImage].width * sizeof(Npp8u) * oSizeROI[nImage].height)); checkCudaErrors(cudaMallocHost( &(pUFLabelHost[nImage]), oSizeROI[nImage].width * sizeof(Npp32u) * oSizeROI[nImage].height)); // Use UF functions throughout this sample. nppStatus = nppiLabelMarkersUFGetBufferSize_32u_C1R( oSizeROI[nImage], &aGenerateLabelsScratchBufferSize[nImage]); // One at a time image processing cudaError = cudaMalloc((void **)&pUFGenerateLabelsScratchBufferDev[nImage], aGenerateLabelsScratchBufferSize[nImage]); if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR; if (loadRaw8BitImage(pInputImageHost[nImage], oSizeROI[nImage].width * sizeof(Npp8u), oSizeROI[nImage].height, nImage) == 0) { cudaError = cudaMemcpy2DAsync( pInputImageDev[nImage], oSizeROI[nImage].width * sizeof(Npp8u), pInputImageHost[nImage], oSizeROI[nImage].width * sizeof(Npp8u), oSizeROI[nImage].width * sizeof(Npp8u), oSizeROI[nImage].height, cudaMemcpyHostToDevice, nppStreamCtx.hStream); nppStatus = nppiLabelMarkersUF_8u32u_C1R_Ctx( pInputImageDev[nImage], oSizeROI[nImage].width * sizeof(Npp8u), pUFLabelDev[nImage], oSizeROI[nImage].width * sizeof(Npp32u), oSizeROI[nImage], nppiNormInf, pUFGenerateLabelsScratchBufferDev[nImage], nppStreamCtx); if (nppStatus != NPP_SUCCESS) { if (nImage == 0) printf("Lena_LabelMarkersUF_8Way_512x512_32u failed.\n"); else if (nImage == 1) printf("CT_skull_LabelMarkersUF_8Way_512x512_32u failed.\n"); else if (nImage == 2) printf("PCB_METAL_LabelMarkersUF_8Way_509x335_32u failed.\n"); else if 
(nImage == 3) printf("PCB2_LabelMarkersUF_8Way_1024x683_32u failed.\n"); else if (nImage == 4) printf("PCB_LabelMarkersUF_8Way_1280x720_32u failed.\n"); tearDown(); return -1; } cudaError = cudaMemcpy2DAsync( pUFLabelHost[nImage], oSizeROI[nImage].width * sizeof(Npp32u), pUFLabelDev[nImage], oSizeROI[nImage].width * sizeof(Npp32u), oSizeROI[nImage].width * sizeof(Npp32u), oSizeROI[nImage].height, cudaMemcpyDeviceToHost, nppStreamCtx.hStream); // Wait host image read backs to complete, not necessary if no need to // synchronize if ((cudaError = cudaStreamSynchronize(nppStreamCtx.hStream)) != cudaSuccess) { printf("Post label generation cudaStreamSynchronize failed\n"); tearDown(); return -1; } if (nImage == 0) FOPEN(bmpFile, LabelMarkersOutputFile0.c_str(), "wb"); else if (nImage == 1) FOPEN(bmpFile, LabelMarkersOutputFile1.c_str(), "wb"); else if (nImage == 2) FOPEN(bmpFile, LabelMarkersOutputFile2.c_str(), "wb"); else if (nImage == 3) FOPEN(bmpFile, LabelMarkersOutputFile3.c_str(), "wb"); else if (nImage == 4) FOPEN(bmpFile, LabelMarkersOutputFile4.c_str(), "wb"); if (bmpFile == NULL) return -1; size_t nSize = 0; for (int j = 0; j < oSizeROI[nImage].height; j++) { nSize += fwrite(&pUFLabelHost[nImage][j * oSizeROI[nImage].width], sizeof(Npp32u), oSizeROI[nImage].width, bmpFile); } fclose(bmpFile); nppStatus = nppiCompressMarkerLabelsGetBufferSize_32u_C1R( oSizeROI[nImage].width * oSizeROI[nImage].height, &aCompressLabelsScratchBufferSize[nImage]); if (nppStatus != NPP_NO_ERROR) return nppStatus; cudaError = cudaMalloc((void **)&pUFCompressedLabelsScratchBufferDev[nImage], aCompressLabelsScratchBufferSize[nImage]); if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR; nCompressedLabelCount = 0; nppStatus = nppiCompressMarkerLabelsUF_32u_C1IR( pUFLabelDev[nImage], oSizeROI[nImage].width * sizeof(Npp32u), oSizeROI[nImage], oSizeROI[nImage].width * oSizeROI[nImage].height, &nCompressedLabelCount, pUFCompressedLabelsScratchBufferDev[nImage]); if (nppStatus != 
NPP_SUCCESS) { if (nImage == 0) printf("Lena_CompressedLabelMarkersUF_8Way_512x512_32u failed.\n"); else if (nImage == 1) printf( "CT_Skull_CompressedLabelMarkersUF_8Way_512x512_32u failed.\n"); else if (nImage == 2) printf( "PCB_METAL_CompressedLabelMarkersUF_8Way_509x335_32u failed.\n"); else if (nImage == 3) printf("PCB2_CompressedLabelMarkersUF_8Way_1024x683_32u failed.\n"); else if (nImage == 4) printf("PCB_CompressedLabelMarkersUF_8Way_1280x720_32u failed.\n"); tearDown(); return -1; } cudaError = cudaMemcpy2DAsync( pUFLabelHost[nImage], oSizeROI[nImage].width * sizeof(Npp32u), pUFLabelDev[nImage], oSizeROI[nImage].width * sizeof(Npp32u), oSizeROI[nImage].width * sizeof(Npp32u), oSizeROI[nImage].height, cudaMemcpyDeviceToHost, nppStreamCtx.hStream); // Wait for host image read backs to finish, not necessary if no need to // synchronize if ((cudaError = cudaStreamSynchronize(nppStreamCtx.hStream)) != cudaSuccess || nCompressedLabelCount == 0) { printf("Post label compression cudaStreamSynchronize failed\n"); tearDown(); return -1; } if (nImage == 0) FOPEN(bmpFile, CompressedMarkerLabelsOutputFile0.c_str(), "wb"); else if (nImage == 1) FOPEN(bmpFile, CompressedMarkerLabelsOutputFile1.c_str(), "wb"); else if (nImage == 2) FOPEN(bmpFile, CompressedMarkerLabelsOutputFile2.c_str(), "wb"); else if (nImage == 3) FOPEN(bmpFile, CompressedMarkerLabelsOutputFile3.c_str(), "wb"); else if (nImage == 4) FOPEN(bmpFile, CompressedMarkerLabelsOutputFile4.c_str(), "wb"); if (bmpFile == NULL) return -1; nSize = 0; for (int j = 0; j < oSizeROI[nImage].height; j++) { nSize += fwrite(&pUFLabelHost[nImage][j * oSizeROI[nImage].width], sizeof(Npp32u), oSizeROI[nImage].width, bmpFile); } fclose(bmpFile); if (nImage == 0) printf( "Lena_CompressedMarkerLabelsUF_8Way_512x512_32u succeeded, " "compressed label count is %d.\n", nCompressedLabelCount); else if (nImage == 1) printf( "CT_Skull_CompressedMarkerLabelsUF_8Way_512x512_32u succeeded, " "compressed label count is %d.\n", 
nCompressedLabelCount); else if (nImage == 2) printf( "PCB_METAL_CompressedMarkerLabelsUF_8Way_509x335_32u succeeded, " "compressed label count is %d.\n", nCompressedLabelCount); else if (nImage == 3) printf( "PCB2_CompressedMarkerLabelsUF_8Way_1024x683_32u succeeded, " "compressed label count is %d.\n", nCompressedLabelCount); else if (nImage == 4) printf( "PCB_CompressedMarkerLabelsUF_8Way_1280x720_32u succeeded, " "compressed label count is %d.\n", nCompressedLabelCount); } } // Batch image processing // We want to allocate scratch buffers more efficiently for batch processing // so first we free up the scratch buffers for image 0 and reallocate them. // This is not required but helps cudaMalloc to work more efficiently. cudaFree(pUFCompressedLabelsScratchBufferDev[0]); int nTotalBatchedUFCompressLabelsScratchBufferDevSize = 0; for (int k = 0; k < NUMBER_OF_IMAGES; k++) nTotalBatchedUFCompressLabelsScratchBufferDevSize += aCompressLabelsScratchBufferSize[k]; cudaError = cudaMalloc((void **)&pUFCompressedLabelsScratchBufferDev[0], nTotalBatchedUFCompressLabelsScratchBufferDevSize); if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR; // Now allocate batch lists int nBatchImageListBytes = NUMBER_OF_IMAGES * sizeof(NppiImageDescriptor); cudaError = cudaMalloc((void **)&pUFBatchSrcImageListDev, nBatchImageListBytes); if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR; cudaError = cudaMalloc((void **)&pUFBatchSrcDstImageListDev, nBatchImageListBytes); if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR; checkCudaErrors( cudaMallocHost((void **)&pUFBatchSrcImageListHost, nBatchImageListBytes)); checkCudaErrors(cudaMallocHost((void **)&pUFBatchSrcDstImageListHost, nBatchImageListBytes)); NppiSize oMaxROISize = {0, 0}; for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) { pUFBatchSrcImageListHost[nImage].pData = pInputImageDev[nImage]; pUFBatchSrcImageListHost[nImage].nStep = oSizeROI[nImage].width * sizeof(Npp8u); // src image 
oSize parameter is ignored in these NPP functions pUFBatchSrcDstImageListHost[nImage].pData = pUFLabelDev[nImage]; pUFBatchSrcDstImageListHost[nImage].nStep = oSizeROI[nImage].width * sizeof(Npp32u); pUFBatchSrcDstImageListHost[nImage].oSize = oSizeROI[nImage]; if (oSizeROI[nImage].width > oMaxROISize.width) oMaxROISize.width = oSizeROI[nImage].width; if (oSizeROI[nImage].height > oMaxROISize.height) oMaxROISize.height = oSizeROI[nImage].height; } // Copy label generation batch lists from CPU to GPU cudaError = cudaMemcpyAsync(pUFBatchSrcImageListDev, pUFBatchSrcImageListHost, nBatchImageListBytes, cudaMemcpyHostToDevice, nppStreamCtx.hStream); if (cudaError != cudaSuccess) return NPP_MEMCPY_ERROR; cudaError = cudaMemcpyAsync(pUFBatchSrcDstImageListDev, pUFBatchSrcDstImageListHost, nBatchImageListBytes, cudaMemcpyHostToDevice, nppStreamCtx.hStream); if (cudaError != cudaSuccess) return NPP_MEMCPY_ERROR; // We use 8-way neighbor search throughout this example nppStatus = nppiLabelMarkersUFBatch_8u32u_C1R_Advanced_Ctx( pUFBatchSrcImageListDev, pUFBatchSrcDstImageListDev, NUMBER_OF_IMAGES, oMaxROISize, nppiNormInf, nppStreamCtx); if (nppStatus != NPP_SUCCESS) { printf("LabelMarkersUFBatch_8Way_8u32u failed.\n"); tearDown(); return -1; } // Now read back generated device images to the host for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) { cudaError = cudaMemcpy2DAsync( pUFLabelHost[nImage], oSizeROI[nImage].width * sizeof(Npp32u), pUFLabelDev[nImage], oSizeROI[nImage].width * sizeof(Npp32u), oSizeROI[nImage].width * sizeof(Npp32u), oSizeROI[nImage].height, cudaMemcpyDeviceToHost, nppStreamCtx.hStream); } // Wait for host image read backs to complete, not necessary if no need to // synchronize if ((cudaError = cudaStreamSynchronize(nppStreamCtx.hStream)) != cudaSuccess) { printf("Post label generation cudaStreamSynchronize failed\n"); tearDown(); return -1; } // Save output to files for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) { if (nImage == 0) 
FOPEN(bmpFile, LabelMarkersBatchOutputFile0.c_str(), "wb"); else if (nImage == 1) FOPEN(bmpFile, LabelMarkersBatchOutputFile1.c_str(), "wb"); else if (nImage == 2) FOPEN(bmpFile, LabelMarkersBatchOutputFile2.c_str(), "wb"); else if (nImage == 3) FOPEN(bmpFile, LabelMarkersBatchOutputFile3.c_str(), "wb"); else if (nImage == 4) FOPEN(bmpFile, LabelMarkersBatchOutputFile4.c_str(), "wb"); if (bmpFile == NULL) return -1; size_t nSize = 0; for (int j = 0; j < oSizeROI[nImage].height; j++) { nSize += fwrite(&pUFLabelHost[nImage][j * oSizeROI[nImage].width], sizeof(Npp32u), oSizeROI[nImage].width, bmpFile); } fclose(bmpFile); } #ifdef USE_BATCHED_LABEL_COMPRESSION // Now allocate scratch buffer memory for batched label compression cudaError = cudaMalloc((void **)&pUFBatchSrcDstScratchBufferListDev, NUMBER_OF_IMAGES * sizeof(NppiBufferDescriptor)); if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR; cudaError = cudaMalloc((void **)&pUFBatchPerImageCompressedCountListDev, NUMBER_OF_IMAGES * sizeof(Npp32u)); if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR; // Allocate host side scratch buffer point and size list and initialize with // device scratch buffer pointers checkCudaErrors( cudaMallocHost((void **)&pUFBatchSrcDstScratchBufferListHost, NUMBER_OF_IMAGES * sizeof(NppiBufferDescriptor))); checkCudaErrors( cudaMallocHost((void **)&pUFBatchPerImageCompressedCountListHost, +NUMBER_OF_IMAGES * sizeof(Npp32u))); // Start buffer pointer at beginning of full per image buffer list sized // pUFCompressedLabelsScratchBufferDev[0] Npp32u *pCurUFCompressedLabelsScratchBufferDev = reinterpret_cast(pUFCompressedLabelsScratchBufferDev[0]); int nMaxUFCompressedLabelsScratchBufferSize = 0; for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) { // This particular function works on in-place data and SrcDst image batch // list has already been initialized in batched label generation function // setup // Initialize each per image buffer descriptor 
pUFBatchSrcDstScratchBufferListHost[nImage].pData = reinterpret_cast(pCurUFCompressedLabelsScratchBufferDev); pUFBatchSrcDstScratchBufferListHost[nImage].nBufferSize = aCompressLabelsScratchBufferSize[nImage]; if (aCompressLabelsScratchBufferSize[nImage] > nMaxUFCompressedLabelsScratchBufferSize) nMaxUFCompressedLabelsScratchBufferSize = aCompressLabelsScratchBufferSize[nImage]; // Offset buffer pointer to next per image buffer Npp8u *pTempBuffer = reinterpret_cast(pCurUFCompressedLabelsScratchBufferDev); pTempBuffer += aCompressLabelsScratchBufferSize[nImage]; pCurUFCompressedLabelsScratchBufferDev = reinterpret_cast((void *)(pTempBuffer)); } // Copy compression batch scratch buffer list from CPU to GPU cudaError = cudaMemcpyAsync(pUFBatchSrcDstScratchBufferListDev, pUFBatchSrcDstScratchBufferListHost, NUMBER_OF_IMAGES * sizeof(NppiBufferDescriptor), cudaMemcpyHostToDevice, nppStreamCtx.hStream); if (cudaError != cudaSuccess) return NPP_MEMCPY_ERROR; nppStatus = nppiCompressMarkerLabelsUFBatch_32u_C1IR_Advanced_Ctx( pUFBatchSrcDstImageListDev, pUFBatchSrcDstScratchBufferListDev, pUFBatchPerImageCompressedCountListDev, NUMBER_OF_IMAGES, oMaxROISize, nMaxUFCompressedLabelsScratchBufferSize, nppStreamCtx); if (nppStatus != NPP_SUCCESS) { printf("BatchCompressedLabelMarkersUF_8Way_32u failed.\n"); tearDown(); return -1; } // Copy output compressed label images back to host for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) { cudaError = cudaMemcpy2DAsync( pUFLabelHost[nImage], oSizeROI[nImage].width * sizeof(Npp32u), pUFLabelDev[nImage], oSizeROI[nImage].width * sizeof(Npp32u), oSizeROI[nImage].width * sizeof(Npp32u), oSizeROI[nImage].height, cudaMemcpyDeviceToHost, nppStreamCtx.hStream); } // Wait for host image read backs to complete, not necessary if no need to // synchronize if ((cudaError = cudaStreamSynchronize(nppStreamCtx.hStream)) != cudaSuccess) { printf("Post label compression cudaStreamSynchronize failed\n"); tearDown(); return -1; } // Save 
compressed label images into files for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) { if (nImage == 0) FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile0.c_str(), "wb"); else if (nImage == 1) FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile1.c_str(), "wb"); else if (nImage == 2) FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile2.c_str(), "wb"); else if (nImage == 3) FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile3.c_str(), "wb"); else if (nImage == 4) FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile4.c_str(), "wb"); if (bmpFile == NULL) return -1; size_t nSize = 0; for (int j = 0; j < oSizeROI[nImage].height; j++) { nSize += fwrite(&pUFLabelHost[nImage][j * oSizeROI[nImage].width], sizeof(Npp32u), oSizeROI[nImage].width, bmpFile); } fclose(bmpFile); } // Read back per image compressed label count. cudaError = cudaMemcpyAsync(pUFBatchPerImageCompressedCountListHost, pUFBatchPerImageCompressedCountListDev, NUMBER_OF_IMAGES * sizeof(Npp32u), cudaMemcpyDeviceToHost, nppStreamCtx.hStream); if (cudaError != cudaSuccess) { tearDown(); return NPP_MEMCPY_ERROR; } // Wait for host read back to complete cudaError = cudaStreamSynchronize(nppStreamCtx.hStream); printf("\n\n"); for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) { if (nImage == 0) printf( "Lena_CompressedMarkerLabelsUFBatch_8Way_512x512_32u succeeded, " "compressed label count is %d.\n", pUFBatchPerImageCompressedCountListHost[nImage]); else if (nImage == 1) printf( "CT_Skull_CompressedMarkerLabelsUFBatch_8Way_512x512_32u succeeded, " "compressed label count is %d.\n", pUFBatchPerImageCompressedCountListHost[nImage]); else if (nImage == 2) printf( "PCB_METAL_CompressedMarkerLabelsUFBatch_8Way_509x335_32u succeeded, " "compressed label count is %d.\n", pUFBatchPerImageCompressedCountListHost[nImage]); else if (nImage == 3) printf( "PCB2_CompressedMarkerLabelsUFBatch_8Way_1024x683_32u succeeded, " "compressed label count is %d.\n", pUFBatchPerImageCompressedCountListHost[nImage]); 
else if (nImage == 4) printf( "PCB_CompressedMarkerLabelsUFBatch_8Way_1280x720_32u succeeded, " "compressed label count is %d.\n", pUFBatchPerImageCompressedCountListHost[nImage]); } #endif // USE_BATCHED_LABEL_COMPRESSION tearDown(); return 0; }