cuda-samples/Samples/4_CUDA_Libraries/watershedSegmentationNPP/watershedSegmentationNPP.cpp

/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */


#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#define WINDOWS_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#pragma warning(disable : 4819)
#endif

#include <fstream>
#include <helper_cuda.h>
#include <npp.h>
#include <stdio.h>
#include <string.h>

// Note:  If you want to view these images we HIGHLY recommend using imagej which is free on the internet and works on
// most platforms
//        because it is one of the few image viewing apps that can display 32 bit integer image data.  While it
//        normalizes the data to floating point values for viewing it still provides a good representation of the
//        relative brightness of each label value.
//
//        The files read and written by this sample app use RAW image format, that is, only the image data itself exists
//        in the files with no image format information.   When viewing RAW files with imagej just enter the image size
//        and bit depth values that are part of the file name when requested by imagej.
//

#define NUMBER_OF_IMAGES 3

Npp8u  *pInputImageDev[NUMBER_OF_IMAGES];
Npp8u  *pInputImageHost[NUMBER_OF_IMAGES];
Npp8u  *pSegmentationScratchBufferDev[NUMBER_OF_IMAGES];
Npp8u  *pSegmentsDev[NUMBER_OF_IMAGES];
Npp8u  *pSegmentsHost[NUMBER_OF_IMAGES];
Npp32u *pSegmentLabelsOutputBufferDev[NUMBER_OF_IMAGES];
Npp32u *pSegmentLabelsOutputBufferHost[NUMBER_OF_IMAGES];

void tearDown() // Clean up and tear down
{
    for (int j = 0; j < NUMBER_OF_IMAGES; j++) {
        if (pSegmentLabelsOutputBufferDev[j] != 0)
            cudaFree(pSegmentLabelsOutputBufferDev[j]);
        if (pSegmentationScratchBufferDev[j] != 0)
            cudaFree(pSegmentationScratchBufferDev[j]);
        if (pSegmentsDev[j] != 0)
            cudaFree(pSegmentsDev[j]);
        if (pInputImageDev[j] != 0)
            cudaFree(pInputImageDev[j]);
        if (pSegmentLabelsOutputBufferHost[j] != 0)
            free(pSegmentLabelsOutputBufferHost[j]);
        if (pSegmentsHost[j] != 0)
            free(pSegmentsHost[j]);
        if (pInputImageHost[j] != 0)
            free(pInputImageHost[j]);
    }
}

const std::string &SegmentsOutputFile0 = "teapot_Segments_8Way_512x512_8u.raw";
const std::string &SegmentsOutputFile1 = "CT_skull_Segments_8Way_512x512_8u.raw";
const std::string &SegmentsOutputFile2 = "Rocks_Segments_8Way_512x512_8u.raw";

const std::string &SegmentBoundariesOutputFile0 = "teapot_SegmentBoundaries_8Way_512x512_8u.raw";
const std::string &SegmentBoundariesOutputFile1 = "CT_skull_SegmentBoundaries_8Way_512x512_8u.raw";
const std::string &SegmentBoundariesOutputFile2 = "Rocks_SegmentBoundaries_8Way_512x512_8u.raw";

const std::string &SegmentsWithContrastingBoundariesOutputFile0 =
    "teapot_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw";
const std::string &SegmentsWithContrastingBoundariesOutputFile1 =
    "CT_skull_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw";
const std::string &SegmentsWithContrastingBoundariesOutputFile2 =
    "Rocks_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw";

const std::string &CompressedSegmentLabelsOutputFile0 = "teapot_CompressedSegmentLabels_8Way_512x512_32u.raw";
const std::string &CompressedSegmentLabelsOutputFile1 = "CT_skull_CompressedSegmentLabels_8Way_512x512_32u.raw";
const std::string &CompressedSegmentLabelsOutputFile2 = "Rocks_CompressedSegmentLabels_8Way_512x512_32u.raw";

int loadRaw8BitImage(Npp8u *pImage, int nWidth, int nHeight, int nImage)
{
    FILE  *bmpFile;
    size_t nSize;

    if (nImage == 0) {
        if (nWidth != 512 || nHeight != 512)
            return -1;
        const char *fileName  = "teapot_512x512_8u_Gray.raw";
        const char *InputFile = sdkFindFilePath(fileName, ".");
        if (InputFile == NULL) {
            printf("%s file not found.. exiting\n", fileName);
            exit(EXIT_WAIVED);
        }

        bmpFile = fopen(InputFile, "rb");
    }
    else if (nImage == 1) {
        if (nWidth != 512 || nHeight != 512)
            return -1;
        const char *fileName  = "CT_skull_512x512_8u_Gray.raw";
        const char *InputFile = sdkFindFilePath(fileName, ".");
        if (InputFile == NULL) {
            printf("%s file not found.. exiting\n", fileName);
            exit(EXIT_WAIVED);
        }

        bmpFile = fopen(InputFile, "rb");
    }
    else if (nImage == 2) {
        if (nWidth != 512 || nHeight != 512)
            return -1;
        const char *fileName  = "Rocks_512x512_8u_Gray.raw";
        const char *InputFile = sdkFindFilePath(fileName, ".");
        if (InputFile == NULL) {
            printf("%s file not found.. exiting\n", fileName);
            exit(EXIT_WAIVED);
        }

        bmpFile = fopen(InputFile, "rb");
    }
    else {
        printf("Input file load failed.\n");
        return -1;
    }

    if (bmpFile == NULL) {
        printf("Input file load failed.\n");
        return -1;
    }
    nSize = fread(pImage, 1, nWidth * nHeight, bmpFile);
    if (nSize < nWidth * nHeight) {
        printf("Input file load failed.\n");
        fclose(bmpFile);
        return -1;
    }
    fclose(bmpFile);

    printf("Input file load succeeded.\n");

    return 0;
}

int main(int argc, char **argv)
{

    size_t aSegmentationScratchBufferSize[NUMBER_OF_IMAGES];
    int    aSegmentLabelsOutputBufferSize[NUMBER_OF_IMAGES];

    cudaError_t      cudaError;
    NppStatus        nppStatus;
    NppStreamContext nppStreamCtx;
    FILE            *bmpFile;
    NppiNorm         eNorm = nppiNormInf; // default to 8 way neighbor search

    for (int j = 0; j < NUMBER_OF_IMAGES; j++) {
        pInputImageDev[j]                 = 0;
        pInputImageHost[j]                = 0;
        pSegmentationScratchBufferDev[j]  = 0;
        pSegmentLabelsOutputBufferDev[j]  = 0;
        pSegmentLabelsOutputBufferHost[j] = 0;
        pSegmentsDev[j]                   = 0;
        pSegmentsHost[j]                  = 0;
    }

    nppStreamCtx.hStream =
        0; // The NULL stream by default, set this to whatever your stream ID is if not the NULL stream.

    cudaError = cudaGetDevice(&nppStreamCtx.nCudaDeviceId);
    if (cudaError != cudaSuccess) {
        printf("CUDA error: no devices supporting CUDA.\n");
        return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
    }

    const NppLibraryVersion *libVer = nppGetLibVersion();

    printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build);

    int driverVersion, runtimeVersion;
    cudaDriverGetVersion(&driverVersion);
    cudaRuntimeGetVersion(&runtimeVersion);

    printf("CUDA Driver  Version: %d.%d\n", driverVersion / 1000, (driverVersion % 100) / 10);
    printf("CUDA Runtime Version: %d.%d\n\n", runtimeVersion / 1000, (runtimeVersion % 100) / 10);

    cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMajor,
                                       cudaDevAttrComputeCapabilityMajor,
                                       nppStreamCtx.nCudaDeviceId);
    if (cudaError != cudaSuccess)
        return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;

    cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMinor,
                                       cudaDevAttrComputeCapabilityMinor,
                                       nppStreamCtx.nCudaDeviceId);
    if (cudaError != cudaSuccess)
        return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;

    cudaError = cudaStreamGetFlags(nppStreamCtx.hStream, &nppStreamCtx.nStreamFlags);

    cudaDeviceProp oDeviceProperties;

    cudaError = cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId);

    nppStreamCtx.nMultiProcessorCount         = oDeviceProperties.multiProcessorCount;
    nppStreamCtx.nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor;
    nppStreamCtx.nMaxThreadsPerBlock          = oDeviceProperties.maxThreadsPerBlock;
    nppStreamCtx.nSharedMemPerBlock           = oDeviceProperties.sharedMemPerBlock;

    NppiSize oSizeROI[NUMBER_OF_IMAGES];

    for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) {
        if (nImage == 0) {
            oSizeROI[nImage].width  = 512;
            oSizeROI[nImage].height = 512;
        }
        else if (nImage == 1) {
            oSizeROI[nImage].width  = 512;
            oSizeROI[nImage].height = 512;
        }
        else if (nImage == 2) {
            oSizeROI[nImage].width  = 512;
            oSizeROI[nImage].height = 512;
        }

        // cudaMallocPitch OR cudaMalloc can be used here, in this sample case width == pitch.

        cudaError = cudaMalloc((void **)&pInputImageDev[nImage],
                               oSizeROI[nImage].width * sizeof(Npp8u) * oSizeROI[nImage].height);
        if (cudaError != cudaSuccess)
            return NPP_MEMORY_ALLOCATION_ERR;

        cudaError = cudaMalloc((void **)&pSegmentsDev[nImage],
                               oSizeROI[nImage].width * sizeof(Npp32u) * oSizeROI[nImage].height);
        if (cudaError != cudaSuccess)
            return NPP_MEMORY_ALLOCATION_ERR;

        pInputImageHost[nImage] =
            reinterpret_cast<Npp8u *>(malloc(oSizeROI[nImage].width * sizeof(Npp8u) * oSizeROI[nImage].height));
        pSegmentsHost[nImage] =
            reinterpret_cast<Npp8u *>(malloc(oSizeROI[nImage].width * sizeof(Npp32u) * oSizeROI[nImage].height));

        nppStatus = nppiSegmentWatershedGetBufferSize_8u_C1R(oSizeROI[nImage], &aSegmentationScratchBufferSize[nImage]);

        cudaError = cudaMalloc((void **)&pSegmentationScratchBufferDev[nImage], aSegmentationScratchBufferSize[nImage]);
        if (cudaError != cudaSuccess)
            return NPP_MEMORY_ALLOCATION_ERR;

        // Output label marker buffers are only needed if you want to same the generated segmentation labels, they ARE
        // compatible with NPP UF generated labels. Requesting segmentation output may slightly decrease segmentation
        // function performance.  Regardless of the pitch of the segmentation image the segment labels output buffer
        // will have a pitch of oSizeROI[nImage].width * sizeof(Npp32u).

        aSegmentLabelsOutputBufferSize[nImage] = oSizeROI[nImage].width * sizeof(Npp32u) * oSizeROI[nImage].height;

        cudaError = cudaMalloc((void **)&pSegmentLabelsOutputBufferDev[nImage], aSegmentLabelsOutputBufferSize[nImage]);
        if (cudaError != cudaSuccess)
            return NPP_MEMORY_ALLOCATION_ERR;

        pSegmentLabelsOutputBufferHost[nImage] =
            reinterpret_cast<Npp32u *>(malloc(oSizeROI[nImage].width * sizeof(Npp32u) * oSizeROI[nImage].height));

        if (loadRaw8BitImage(
                pInputImageHost[nImage], oSizeROI[nImage].width * sizeof(Npp8u), oSizeROI[nImage].height, nImage)
            == 0) {
            cudaError = cudaMemcpy2DAsync(pInputImageDev[nImage],
                                          oSizeROI[nImage].width * sizeof(Npp8u),
                                          pInputImageHost[nImage],
                                          oSizeROI[nImage].width * sizeof(Npp8u),
                                          oSizeROI[nImage].width * sizeof(Npp8u),
                                          oSizeROI[nImage].height,
                                          cudaMemcpyHostToDevice,
                                          nppStreamCtx.hStream);

            // Make a second copy of the unaltered input image since this function works in place and we want to reuse
            // the input image multiple times.
            cudaError = cudaMemcpy2DAsync(pSegmentsDev[nImage],
                                          oSizeROI[nImage].width * sizeof(Npp8u),
                                          pInputImageHost[nImage],
                                          oSizeROI[nImage].width * sizeof(Npp8u),
                                          oSizeROI[nImage].width * sizeof(Npp8u),
                                          oSizeROI[nImage].height,
                                          cudaMemcpyHostToDevice,
                                          nppStreamCtx.hStream);

            nppStatus = nppiSegmentWatershed_8u_C1IR_Ctx(pSegmentsDev[nImage],
                                                         oSizeROI[nImage].width * sizeof(Npp8u),
                                                         pSegmentLabelsOutputBufferDev[nImage],
                                                         oSizeROI[nImage].width * sizeof(Npp32u),
                                                         eNorm,
                                                         NPP_WATERSHED_SEGMENT_BOUNDARIES_NONE,
                                                         oSizeROI[nImage],
                                                         pSegmentationScratchBufferDev[nImage],
                                                         nppStreamCtx);

            if (nppStatus != NPP_SUCCESS) {
                if (nImage == 0)
                    printf("Lena segments 8Way 512x512 8u failed.\n");
                else if (nImage == 1)
                    printf("CT skull segments 8Way 512x512 8u failed.\n");
                else if (nImage == 2)
                    printf("Rocks segments 8Way 512x512 8u failed.\n");
                tearDown();
                return -1;
            }

            // Now compress the label markers output to make them easier to view.
            int    nCompressedLabelsScratchBufferSize;
            Npp8u *pCompressedLabelsScratchBufferDev;

            nppStatus = nppiCompressMarkerLabelsGetBufferSize_32u_C1R(oSizeROI[nImage].width * oSizeROI[nImage].height,
                                                                      &nCompressedLabelsScratchBufferSize);
            if (nppStatus != NPP_NO_ERROR)
                return nppStatus;

            cudaError = cudaMalloc((void **)&pCompressedLabelsScratchBufferDev, nCompressedLabelsScratchBufferSize);
            if (cudaError != cudaSuccess)
                return NPP_MEMORY_ALLOCATION_ERR;

            int nCompressedLabelCount = 0;

            nppStatus = nppiCompressMarkerLabelsUF_32u_C1IR_Ctx(pSegmentLabelsOutputBufferDev[nImage],
                                                                oSizeROI[nImage].width * sizeof(Npp32u),
                                                                oSizeROI[nImage],
                                                                oSizeROI[nImage].width * oSizeROI[nImage].height,
                                                                &nCompressedLabelCount,
                                                                pCompressedLabelsScratchBufferDev,
                                                                nppStreamCtx);

            if (nppStatus != NPP_SUCCESS) {
                if (nImage == 0)
                    printf("teapot_CompressedLabelMarkersUF_8Way_512x512_32u failed.\n");
                else if (nImage == 1)
                    printf("CT_Skull_CompressedLabelMarkersUF_8Way_512x512_32u failed.\n");
                else if (nImage == 2)
                    printf("Rocks_CompressedLabelMarkersUF_8Way_512x512_32u failed.\n");
                tearDown();
                return -1;
            }

            // Copy segmented image to host
            cudaError = cudaMemcpy2DAsync(pSegmentsHost[nImage],
                                          oSizeROI[nImage].width * sizeof(Npp8u),
                                          pSegmentsDev[nImage],
                                          oSizeROI[nImage].width * sizeof(Npp8u),
                                          oSizeROI[nImage].width * sizeof(Npp8u),
                                          oSizeROI[nImage].height,
                                          cudaMemcpyDeviceToHost,
                                          nppStreamCtx.hStream);

            // Copy segment labels image to host
            cudaError = cudaMemcpy2DAsync(pSegmentLabelsOutputBufferHost[nImage],
                                          oSizeROI[nImage].width * sizeof(Npp32u),
                                          pSegmentLabelsOutputBufferDev[nImage],
                                          oSizeROI[nImage].width * sizeof(Npp32u),
                                          oSizeROI[nImage].width * sizeof(Npp32u),
                                          oSizeROI[nImage].height,
                                          cudaMemcpyDeviceToHost,
                                          nppStreamCtx.hStream);

            // Wait host image read backs to complete, not necessary if no need to synchronize
            if ((cudaError = cudaStreamSynchronize(nppStreamCtx.hStream)) != cudaSuccess) {
                printf("Post segmentation cudaStreamSynchronize failed\n");
                tearDown();
                return -1;
            }

            // Free single image scratch buffer
            cudaFree(pCompressedLabelsScratchBufferDev);

            // Save default segments file.
            if (nImage == 0)
                bmpFile = fopen(SegmentsOutputFile0.c_str(), "wb");
            else if (nImage == 1)
                bmpFile = fopen(SegmentsOutputFile1.c_str(), "wb");
            else if (nImage == 2)
                bmpFile = fopen(SegmentsOutputFile2.c_str(), "wb");

            if (bmpFile == NULL)
                return -1;
            size_t nSize = 0;
            for (int j = 0; j < oSizeROI[nImage].height; j++) {
                nSize += fwrite(
                    &pSegmentsHost[nImage][j * oSizeROI[nImage].width], sizeof(Npp8u), oSizeROI[nImage].width, bmpFile);
            }
            fclose(bmpFile);

            if (nImage == 0)
                printf("teapot_Segments_8Way_512x512_8u succeeded.\n");
            else if (nImage == 1)
                printf("CT_Skull_Segments_8Way_512x512_8u succeeded.\n");
            else if (nImage == 2)
                printf("Rocks_Segments_8Way_512x512_8u succeeded.\n");

            // Save segment labels file.
            if (nImage == 0)
                bmpFile = fopen(CompressedSegmentLabelsOutputFile0.c_str(), "wb");
            else if (nImage == 1)
                bmpFile = fopen(CompressedSegmentLabelsOutputFile1.c_str(), "wb");
            else if (nImage == 2)
                bmpFile = fopen(CompressedSegmentLabelsOutputFile2.c_str(), "wb");

            if (bmpFile == NULL)
                return -1;
            nSize = 0;
            for (int j = 0; j < oSizeROI[nImage].height; j++) {
                nSize += fwrite(&pSegmentLabelsOutputBufferHost[nImage][j * oSizeROI[nImage].width],
                                sizeof(Npp32u),
                                oSizeROI[nImage].width,
                                bmpFile);
            }
            fclose(bmpFile);

            if (nImage == 0)
                printf("teapot_CompressedSegmentLabels_8Way_512x512_32u succeeded.\n");
            else if (nImage == 1)
                printf("CT_Skull_CompressedSegmentLabels_8Way_512x512_32u succeeded.\n");
            else if (nImage == 2)
                printf("Rocks_CompressedSegmentLabels_8Way_512x512_32u succeeded.\n");

            // Now generate a segment boundaries only output image

            // Make a second copy of the unaltered input image since this function works in place and we want to reuse
            // the input image multiple times.
            cudaError = cudaMemcpy2DAsync(pSegmentsDev[nImage],
                                          oSizeROI[nImage].width * sizeof(Npp8u),
                                          pInputImageHost[nImage],
                                          oSizeROI[nImage].width * sizeof(Npp8u),
                                          oSizeROI[nImage].width * sizeof(Npp8u),
                                          oSizeROI[nImage].height,
                                          cudaMemcpyHostToDevice,
                                          nppStreamCtx.hStream);

            // We already generated segment labels images to skip that this time
            nppStatus = nppiSegmentWatershed_8u_C1IR_Ctx(pSegmentsDev[nImage],
                                                         oSizeROI[nImage].width * sizeof(Npp8u),
                                                         0,
                                                         0,
                                                         eNorm,
                                                         NPP_WATERSHED_SEGMENT_BOUNDARIES_ONLY,
                                                         oSizeROI[nImage],
                                                         pSegmentationScratchBufferDev[nImage],
                                                         nppStreamCtx);

            if (nppStatus != NPP_SUCCESS) {
                if (nImage == 0)
                    printf("Lena segment boundaries 8Way 512x512 8u failed.\n");
                else if (nImage == 1)
                    printf("CT skull segment boundaries 8Way 512x512 8u failed.\n");
                else if (nImage == 2)
                    printf("Rocks segment boundaries 8Way 512x512 8u failed.\n");
                tearDown();
                return -1;
            }

            // Copy segment boundaries image to host
            cudaError = cudaMemcpy2DAsync(pSegmentsHost[nImage],
                                          oSizeROI[nImage].width * sizeof(Npp8u),
                                          pSegmentsDev[nImage],
                                          oSizeROI[nImage].width * sizeof(Npp8u),
                                          oSizeROI[nImage].width * sizeof(Npp8u),
                                          oSizeROI[nImage].height,
                                          cudaMemcpyDeviceToHost,
                                          nppStreamCtx.hStream);

            // Wait host image read backs to complete, not necessary if no need to synchronize
            if ((cudaError = cudaStreamSynchronize(nppStreamCtx.hStream)) != cudaSuccess) {
                printf("Post segmentation cudaStreamSynchronize failed\n");
                tearDown();
                return -1;
            }

            if (nImage == 0)
                bmpFile = fopen(SegmentBoundariesOutputFile0.c_str(), "wb");
            else if (nImage == 1)
                bmpFile = fopen(SegmentBoundariesOutputFile1.c_str(), "wb");
            else if (nImage == 2)
                bmpFile = fopen(SegmentBoundariesOutputFile2.c_str(), "wb");

            if (bmpFile == NULL)
                return -1;
            nSize = 0;
            for (int j = 0; j < oSizeROI[nImage].height; j++) {
                nSize += fwrite(
                    &pSegmentsHost[nImage][j * oSizeROI[nImage].width], sizeof(Npp8u), oSizeROI[nImage].width, bmpFile);
            }
            fclose(bmpFile);

            if (nImage == 0)
                printf("teapot_SegmentBoundaries_8Way_512x512_8u succeeded.\n");
            else if (nImage == 1)
                printf("CT_Skull_SegmentBoundaries_8Way_512x512_8u succeeded.\n");
            else if (nImage == 2)
                printf("Rocks_SegmentBoundaries_8Way_512x512_8u succeeded.\n");

            // Now generate a segmented with contrasting boundaries output image

            // Make a second copy of the unaltered input image since this function works in place and we want to reuse
            // the input image multiple times.
            cudaError = cudaMemcpy2DAsync(pSegmentsDev[nImage],
                                          oSizeROI[nImage].width * sizeof(Npp8u),
                                          pInputImageHost[nImage],
                                          oSizeROI[nImage].width * sizeof(Npp8u),
                                          oSizeROI[nImage].width * sizeof(Npp8u),
                                          oSizeROI[nImage].height,
                                          cudaMemcpyHostToDevice,
                                          nppStreamCtx.hStream);

            // We already generated segment labels images to skip that this time
            nppStatus = nppiSegmentWatershed_8u_C1IR_Ctx(pSegmentsDev[nImage],
                                                         oSizeROI[nImage].width * sizeof(Npp8u),
                                                         0,
                                                         0,
                                                         eNorm,
                                                         NPP_WATERSHED_SEGMENT_BOUNDARIES_CONTRAST,
                                                         oSizeROI[nImage],
                                                         pSegmentationScratchBufferDev[nImage],
                                                         nppStreamCtx);

            if (nppStatus != NPP_SUCCESS) {
                if (nImage == 0)
                    printf("Lena segments with contrasting boundaries 8Way 512x512 8u failed.\n");
                else if (nImage == 1)
                    printf("CT skull segments with contrasting boundaries 8Way 512x512 8u failed.\n");
                else if (nImage == 2)
                    printf("Rocks segments with contrasting boundaries 8Way 512x512 8u failed.\n");
                tearDown();
                return -1;
            }

            // Copy segment boundaries image to host
            cudaError = cudaMemcpy2DAsync(pSegmentsHost[nImage],
                                          oSizeROI[nImage].width * sizeof(Npp8u),
                                          pSegmentsDev[nImage],
                                          oSizeROI[nImage].width * sizeof(Npp8u),
                                          oSizeROI[nImage].width * sizeof(Npp8u),
                                          oSizeROI[nImage].height,
                                          cudaMemcpyDeviceToHost,
                                          nppStreamCtx.hStream);

            // Wait host image read backs to complete, not necessary if no need to synchronize
            if ((cudaError = cudaStreamSynchronize(nppStreamCtx.hStream)) != cudaSuccess) {
                printf("Post segmentation cudaStreamSynchronize failed\n");
                tearDown();
                return -1;
            }

            if (nImage == 0)
                bmpFile = fopen(SegmentsWithContrastingBoundariesOutputFile0.c_str(), "wb");
            else if (nImage == 1)
                bmpFile = fopen(SegmentsWithContrastingBoundariesOutputFile1.c_str(), "wb");
            else if (nImage == 2)
                bmpFile = fopen(SegmentsWithContrastingBoundariesOutputFile2.c_str(), "wb");

            if (bmpFile == NULL)
                return -1;
            nSize = 0;
            for (int j = 0; j < oSizeROI[nImage].height; j++) {
                nSize += fwrite(
                    &pSegmentsHost[nImage][j * oSizeROI[nImage].width], sizeof(Npp8u), oSizeROI[nImage].width, bmpFile);
            }
            fclose(bmpFile);

            if (nImage == 0)
                printf("teapot_SegmentsWithContrastingBoundaries_8Way_512x512_8u succeeded.\n");
            else if (nImage == 1)
                printf("CT_Skull_SegmentsWithContrastingBoundaries_8Way_512x512_8u succeeded.\n");
            else if (nImage == 2)
                printf("Rocks_SegmentsWithContrastingBoundaries_8Way_512x512_8u succeeded.\n");
        }
    }

    tearDown();

    return 0;
}