Revert "Bug 5034785: Update all non-ctx nppi APIs to ctx APIs as per latest change on NPP"

This reverts commit a9869fd6eaeecc748fc5f10f4b331fa41efbdaca
This commit is contained in:
Shawn Zeng 2025-02-27 02:48:03 -08:00
parent a9869fd6ea
commit acd3a015c8
18 changed files with 173 additions and 279 deletions

View File

@ -74,6 +74,26 @@ inline int cudaDeviceInit(int argc, const char **argv) {
return dev; return dev;
} }
bool printfNPPinfo(int argc, char *argv[]) {
const NppLibraryVersion *libVer = nppGetLibVersion();
printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor,
libVer->build);
int driverVersion, runtimeVersion;
cudaDriverGetVersion(&driverVersion);
cudaRuntimeGetVersion(&runtimeVersion);
printf(" CUDA Driver Version: %d.%d\n", driverVersion / 1000,
(driverVersion % 100) / 10);
printf(" CUDA Runtime Version: %d.%d\n", runtimeVersion / 1000,
(runtimeVersion % 100) / 10);
// Min spec is SM 1.0 devices
bool bVal = checkCudaCapabilities(1, 0);
return bVal;
}
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
printf("%s Starting...\n\n", argv[0]); printf("%s Starting...\n\n", argv[0]);
@ -84,50 +104,11 @@ int main(int argc, char *argv[]) {
cudaDeviceInit(argc, (const char **)argv); cudaDeviceInit(argc, (const char **)argv);
NppStreamContext nppStreamCtx; if (printfNPPinfo(argc, argv) == false) {
nppStreamCtx.hStream = 0; // The NULL stream by default, set this to whatever your stream ID is if not the NULL stream. cudaDeviceReset();
exit(EXIT_SUCCESS);
cudaError_t cudaError = cudaGetDevice(&nppStreamCtx.nCudaDeviceId);
if (cudaError != cudaSuccess)
{
printf("CUDA error: no devices supporting CUDA.\n");
return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
} }
const NppLibraryVersion *libVer = nppGetLibVersion();
printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build);
int driverVersion, runtimeVersion;
cudaDriverGetVersion(&driverVersion);
cudaRuntimeGetVersion(&runtimeVersion);
printf("CUDA Driver Version: %d.%d\n", driverVersion/1000, (driverVersion%100)/10);
printf("CUDA Runtime Version: %d.%d\n\n", runtimeVersion/1000, (runtimeVersion%100)/10);
cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMajor,
cudaDevAttrComputeCapabilityMajor,
nppStreamCtx.nCudaDeviceId);
if (cudaError != cudaSuccess)
return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMinor,
cudaDevAttrComputeCapabilityMinor,
nppStreamCtx.nCudaDeviceId);
if (cudaError != cudaSuccess)
return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
cudaError = cudaStreamGetFlags(nppStreamCtx.hStream, &nppStreamCtx.nStreamFlags);
cudaDeviceProp oDeviceProperties;
cudaError = cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId);
nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount;
nppStreamCtx.nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor;
nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock;
nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock;
char *filePath; char *filePath;
if (checkCmdLineFlag(argc, (const char **)argv, "input")) { if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
@ -205,11 +186,11 @@ int main(int argc, char *argv[]) {
npp::ImageNPP_16s_C1 oDeviceDstY(oSizeROI.width, oSizeROI.height); npp::ImageNPP_16s_C1 oDeviceDstY(oSizeROI.width, oSizeROI.height);
// run Prewitt edge detection gradient vector filter // run Prewitt edge detection gradient vector filter
NPP_CHECK_NPP(nppiGradientVectorPrewittBorder_8u16s_C1R_Ctx( NPP_CHECK_NPP(nppiGradientVectorPrewittBorder_8u16s_C1R(
oDeviceSrc.data(), oDeviceSrc.pitch(), oSrcSize, oSrcOffset, oDeviceSrc.data(), oDeviceSrc.pitch(), oSrcSize, oSrcOffset,
oDeviceDstX.data(), oDeviceDstX.pitch(), oDeviceDstY.data(), oDeviceDstX.data(), oDeviceDstX.pitch(), oDeviceDstY.data(),
oDeviceDstY.pitch(), 0, 0, 0, 0, oSizeROI, NPP_MASK_SIZE_3_X_3, oDeviceDstY.pitch(), 0, 0, 0, 0, oSizeROI, NPP_MASK_SIZE_3_X_3,
nppiNormL1, NPP_BORDER_REPLICATE, nppStreamCtx)); nppiNormL1, NPP_BORDER_REPLICATE));
// allocate device destination images of appropriatedly size // allocate device destination images of appropriatedly size
npp::ImageNPP_8u_C1 oDeviceDstOutX(oSizeROI.width, oSizeROI.height); npp::ImageNPP_8u_C1 oDeviceDstOutX(oSizeROI.width, oSizeROI.height);
@ -217,13 +198,13 @@ int main(int argc, char *argv[]) {
// convert 16s_C1 result images to binary 8u_C1 output images using constant // convert 16s_C1 result images to binary 8u_C1 output images using constant
// value to adjust amount of visible detail // value to adjust amount of visible detail
NPP_CHECK_NPP(nppiCompareC_16s_C1R_Ctx( NPP_CHECK_NPP(nppiCompareC_16s_C1R(
oDeviceDstX.data(), oDeviceDstX.pitch(), 32, oDeviceDstOutX.data(), oDeviceDstX.data(), oDeviceDstX.pitch(), 32, oDeviceDstOutX.data(),
oDeviceDstOutX.pitch(), oSizeROI, NPP_CMP_GREATER_EQ, nppStreamCtx)); oDeviceDstOutX.pitch(), oSizeROI, NPP_CMP_GREATER_EQ));
NPP_CHECK_NPP(nppiCompareC_16s_C1R_Ctx( NPP_CHECK_NPP(nppiCompareC_16s_C1R(
oDeviceDstY.data(), oDeviceDstY.pitch(), 32, oDeviceDstOutY.data(), oDeviceDstY.data(), oDeviceDstY.pitch(), 32, oDeviceDstOutY.data(),
oDeviceDstOutY.pitch(), oSizeROI, NPP_CMP_GREATER_EQ, nppStreamCtx)); oDeviceDstOutY.pitch(), oSizeROI, NPP_CMP_GREATER_EQ));
// create host images for the results // create host images for the results
npp::ImageCPU_8u_C1 oHostDstX(oDeviceDstOutX.size()); npp::ImageCPU_8u_C1 oHostDstX(oDeviceDstOutX.size());
@ -253,10 +234,10 @@ int main(int argc, char *argv[]) {
// copy and enlarge the original device source image and surround it with a // copy and enlarge the original device source image and surround it with a
// white edge (border) // white edge (border)
NPP_CHECK_NPP(nppiCopyConstBorder_8u_C1R_Ctx( NPP_CHECK_NPP(nppiCopyConstBorder_8u_C1R(
oDeviceSrc.data(), oDeviceSrc.pitch(), oSrcSize, oDeviceSrc.data(), oDeviceSrc.pitch(), oSrcSize,
oEnlargedDeviceSrc.data(), oEnlargedDeviceSrc.pitch(), oEnlargedSrcSize, oEnlargedDeviceSrc.data(), oEnlargedDeviceSrc.pitch(), oEnlargedSrcSize,
oMaskSize.width / 2, oMaskSize.height / 2, 255, nppStreamCtx)); oMaskSize.width / 2, oMaskSize.height / 2, 255));
// adjust oEnlargedDeviceSrc pixel pointer to point to the first pixel of // adjust oEnlargedDeviceSrc pixel pointer to point to the first pixel of
// the original source image in the enlarged source image // the original source image in the enlarged source image
@ -280,25 +261,23 @@ int main(int argc, char *argv[]) {
// run Prewitt edge detection gradient vector filter bypassing border // run Prewitt edge detection gradient vector filter bypassing border
// control due to enlarged source image // control due to enlarged source image
NPP_CHECK_NPP(nppiGradientVectorPrewittBorder_8u16s_C1R_Ctx( NPP_CHECK_NPP(nppiGradientVectorPrewittBorder_8u16s_C1R(
pAdjustedSrc, oEnlargedDeviceSrc.pitch(), oEnlargedSrcSize, oSrcOffset, pAdjustedSrc, oEnlargedDeviceSrc.pitch(), oEnlargedSrcSize, oSrcOffset,
oDeviceDstX.data(), oDeviceDstX.pitch(), oDeviceDstY.data(), oDeviceDstX.data(), oDeviceDstX.pitch(), oDeviceDstY.data(),
oDeviceDstY.pitch(), 0, 0, 0, 0, oSizeROI, NPP_MASK_SIZE_3_X_3, oDeviceDstY.pitch(), 0, 0, 0, 0, oSizeROI, NPP_MASK_SIZE_3_X_3,
nppiNormL1, NPP_BORDER_REPLICATE, nppStreamCtx)); nppiNormL1, NPP_BORDER_REPLICATE));
// convert 16s_C1 result images to binary 8u_C1 output images using constant // convert 16s_C1 result images to binary 8u_C1 output images using constant
// value to adjust amount of visible detail // value to adjust amount of visible detail
NPP_CHECK_NPP(nppiCompareC_16s_C1R_Ctx(oDeviceDstX.data(), oDeviceDstX.pitch(), NPP_CHECK_NPP(nppiCompareC_16s_C1R(oDeviceDstX.data(), oDeviceDstX.pitch(),
32, oDeviceDstOutXNoBorders.data(), 32, oDeviceDstOutXNoBorders.data(),
oDeviceDstOutXNoBorders.pitch(), oDeviceDstOutXNoBorders.pitch(),
oSizeROI, NPP_CMP_GREATER_EQ, oSizeROI, NPP_CMP_GREATER_EQ));
nppStreamCtx));
NPP_CHECK_NPP(nppiCompareC_16s_C1R_Ctx(oDeviceDstY.data(), oDeviceDstY.pitch(), NPP_CHECK_NPP(nppiCompareC_16s_C1R(oDeviceDstY.data(), oDeviceDstY.pitch(),
32, oDeviceDstOutYNoBorders.data(), 32, oDeviceDstOutYNoBorders.data(),
oDeviceDstOutYNoBorders.pitch(), oDeviceDstOutYNoBorders.pitch(),
oSizeROI, NPP_CMP_GREATER_EQ, oSizeROI, NPP_CMP_GREATER_EQ));
nppStreamCtx));
// create additional output files // create additional output files
std::string sResultXNoBordersFilename = sResultBaseFilename; std::string sResultXNoBordersFilename = sResultBaseFilename;
std::string sResultYNoBordersFilename = sResultBaseFilename; std::string sResultYNoBordersFilename = sResultBaseFilename;
@ -326,17 +305,15 @@ int main(int argc, char *argv[]) {
// diff the two 8u_C1 result images one with and one without border control // diff the two 8u_C1 result images one with and one without border control
NPP_CHECK_NPP(nppiAbsDiff_8u_C1R_Ctx( NPP_CHECK_NPP(nppiAbsDiff_8u_C1R(
oDeviceDstOutXNoBorders.data(), oDeviceDstOutXNoBorders.pitch(), oDeviceDstOutXNoBorders.data(), oDeviceDstOutXNoBorders.pitch(),
oDeviceDstOutX.data(), oDeviceDstOutX.pitch(), oDeviceDstOutX.data(), oDeviceDstOutX.pitch(),
oDeviceDstOutXDiff.data(), oDeviceDstOutXDiff.pitch(), oSizeROI, oDeviceDstOutXDiff.data(), oDeviceDstOutXDiff.pitch(), oSizeROI));
nppStreamCtx));
NPP_CHECK_NPP(nppiAbsDiff_8u_C1R_Ctx( NPP_CHECK_NPP(nppiAbsDiff_8u_C1R(
oDeviceDstOutYNoBorders.data(), oDeviceDstOutYNoBorders.pitch(), oDeviceDstOutYNoBorders.data(), oDeviceDstOutYNoBorders.pitch(),
oDeviceDstOutY.data(), oDeviceDstOutY.pitch(), oDeviceDstOutY.data(), oDeviceDstOutY.pitch(),
oDeviceDstOutYDiff.data(), oDeviceDstOutYDiff.pitch(), oSizeROI, oDeviceDstOutYDiff.data(), oDeviceDstOutYDiff.pitch(), oSizeROI));
nppStreamCtx));
// create additional output files // create additional output files
std::string sResultXDiffFilename = sResultBaseFilename; std::string sResultXDiffFilename = sResultBaseFilename;
@ -403,11 +380,11 @@ int main(int argc, char *argv[]) {
// run Prewitt edge detection gradient vector filter to generate the left // run Prewitt edge detection gradient vector filter to generate the left
// side of the output image // side of the output image
NPP_CHECK_NPP(nppiGradientVectorPrewittBorder_8u16s_C1R_Ctx( NPP_CHECK_NPP(nppiGradientVectorPrewittBorder_8u16s_C1R(
pAdjustedSrc, oEnlargedDeviceSrc.pitch(), oEnlargedSrcSize, oSrcOffset, pAdjustedSrc, oEnlargedDeviceSrc.pitch(), oEnlargedSrcSize, oSrcOffset,
oDeviceDstX.data(), oDeviceDstX.pitch(), oDeviceDstY.data(), oDeviceDstX.data(), oDeviceDstX.pitch(), oDeviceDstY.data(),
oDeviceDstY.pitch(), 0, 0, 0, 0, oSizeROI, NPP_MASK_SIZE_3_X_3, oDeviceDstY.pitch(), 0, 0, 0, 0, oSizeROI, NPP_MASK_SIZE_3_X_3,
nppiNormL1, NPP_BORDER_REPLICATE, nppStreamCtx)); nppiNormL1, NPP_BORDER_REPLICATE));
// now move the enlarged source pointer to the horizontal middle of the // now move the enlarged source pointer to the horizontal middle of the
// enlarged source image and tell the function where it was moved to // enlarged source image and tell the function where it was moved to
@ -424,26 +401,23 @@ int main(int argc, char *argv[]) {
// run Prewitt edge detection gradient vector filter to generate the right // run Prewitt edge detection gradient vector filter to generate the right
// side of the output image adjusting the destination image pointers // side of the output image adjusting the destination image pointers
// appropriately // appropriately
NPP_CHECK_NPP(nppiGradientVectorPrewittBorder_8u16s_C1R_Ctx( NPP_CHECK_NPP(nppiGradientVectorPrewittBorder_8u16s_C1R(
pAdjustedSrc, oEnlargedDeviceSrc.pitch(), oEnlargedSrcSize, oSrcOffset, pAdjustedSrc, oEnlargedDeviceSrc.pitch(), oEnlargedSrcSize, oSrcOffset,
oDeviceDstX.data() + nLeftWidth, oDeviceDstX.pitch(), oDeviceDstX.data() + nLeftWidth, oDeviceDstX.pitch(),
oDeviceDstY.data() + nLeftWidth, oDeviceDstY.pitch(), 0, 0, 0, 0, oDeviceDstY.data() + nLeftWidth, oDeviceDstY.pitch(), 0, 0, 0, 0,
oSizeROI, NPP_MASK_SIZE_3_X_3, nppiNormL1, NPP_BORDER_REPLICATE, oSizeROI, NPP_MASK_SIZE_3_X_3, nppiNormL1, NPP_BORDER_REPLICATE));
nppStreamCtx));
// convert 16s_C1 result images to binary 8u_C1 output images using constant // convert 16s_C1 result images to binary 8u_C1 output images using constant
// value to adjust amount of visible detail // value to adjust amount of visible detail
NPP_CHECK_NPP(nppiCompareC_16s_C1R_Ctx(oDeviceDstX.data(), oDeviceDstX.pitch(), NPP_CHECK_NPP(nppiCompareC_16s_C1R(oDeviceDstX.data(), oDeviceDstX.pitch(),
32, oDeviceDstOutXMixedBorders.data(), 32, oDeviceDstOutXMixedBorders.data(),
oDeviceDstOutXMixedBorders.pitch(), oDeviceDstOutXMixedBorders.pitch(),
oSizeROI, NPP_CMP_GREATER_EQ, oSizeROI, NPP_CMP_GREATER_EQ));
nppStreamCtx));
NPP_CHECK_NPP(nppiCompareC_16s_C1R_Ctx(oDeviceDstY.data(), oDeviceDstY.pitch(), NPP_CHECK_NPP(nppiCompareC_16s_C1R(oDeviceDstY.data(), oDeviceDstY.pitch(),
32, oDeviceDstOutYMixedBorders.data(), 32, oDeviceDstOutYMixedBorders.data(),
oDeviceDstOutYMixedBorders.pitch(), oDeviceDstOutYMixedBorders.pitch(),
oSizeROI, NPP_CMP_GREATER_EQ, oSizeROI, NPP_CMP_GREATER_EQ));
nppStreamCtx));
// create additional output files // create additional output files
std::string sResultXMixedBordersFilename = sResultBaseFilename; std::string sResultXMixedBordersFilename = sResultBaseFilename;
std::string sResultYMixedBordersFilename = sResultBaseFilename; std::string sResultYMixedBordersFilename = sResultBaseFilename;
@ -465,17 +439,15 @@ int main(int argc, char *argv[]) {
// diff the original 8u_C1 result images with border control and the mixed // diff the original 8u_C1 result images with border control and the mixed
// border control images, they should match (diff image will be all black) // border control images, they should match (diff image will be all black)
NPP_CHECK_NPP(nppiAbsDiff_8u_C1R_Ctx( NPP_CHECK_NPP(nppiAbsDiff_8u_C1R(
oDeviceDstOutXMixedBorders.data(), oDeviceDstOutXMixedBorders.pitch(), oDeviceDstOutXMixedBorders.data(), oDeviceDstOutXMixedBorders.pitch(),
oDeviceDstOutX.data(), oDeviceDstOutX.pitch(), oDeviceDstOutX.data(), oDeviceDstOutX.pitch(),
oDeviceDstOutXDiff.data(), oDeviceDstOutXDiff.pitch(), oSizeROI, oDeviceDstOutXDiff.data(), oDeviceDstOutXDiff.pitch(), oSizeROI));
nppStreamCtx));
NPP_CHECK_NPP(nppiAbsDiff_8u_C1R_Ctx( NPP_CHECK_NPP(nppiAbsDiff_8u_C1R(
oDeviceDstOutYMixedBorders.data(), oDeviceDstOutYMixedBorders.pitch(), oDeviceDstOutYMixedBorders.data(), oDeviceDstOutYMixedBorders.pitch(),
oDeviceDstOutY.data(), oDeviceDstOutY.pitch(), oDeviceDstOutY.data(), oDeviceDstOutY.pitch(),
oDeviceDstOutYDiff.data(), oDeviceDstOutYDiff.pitch(), oSizeROI, oDeviceDstOutYDiff.data(), oDeviceDstOutYDiff.pitch(), oSizeROI));
nppStreamCtx));
// create additional output files // create additional output files
std::string sResultXMixedDiffFilename = sResultBaseFilename; std::string sResultXMixedDiffFilename = sResultBaseFilename;

View File

@ -48,56 +48,39 @@
#include <helper_string.h> #include <helper_string.h>
bool printfNPPinfo(int argc, char *argv[]) {
const NppLibraryVersion *libVer = nppGetLibVersion();
printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor,
libVer->build);
int driverVersion, runtimeVersion;
cudaDriverGetVersion(&driverVersion);
cudaRuntimeGetVersion(&runtimeVersion);
printf(" CUDA Driver Version: %d.%d\n", driverVersion / 1000,
(driverVersion % 100) / 10);
printf(" CUDA Runtime Version: %d.%d\n", runtimeVersion / 1000,
(runtimeVersion % 100) / 10);
// Min spec is SM 1.0 devices
bool bVal = checkCudaCapabilities(1, 0);
return bVal;
}
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
printf("%s Starting...\n\n", argv[0]); printf("%s Starting...\n\n", argv[0]);
try { try {
std::string sFilename; std::string sFilename;
char *filePath; char *filePath;
NppStreamContext nppStreamCtx;
nppStreamCtx.hStream = 0; // The NULL stream by default, set this to whatever your stream ID is if not the NULL stream.
cudaError_t cudaError = cudaGetDevice(&nppStreamCtx.nCudaDeviceId); findCudaDevice(argc, (const char **)argv);
if (cudaError != cudaSuccess)
{ if (printfNPPinfo(argc, argv) == false) {
printf("CUDA error: no devices supporting CUDA.\n"); exit(EXIT_SUCCESS);
return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
} }
const NppLibraryVersion *libVer = nppGetLibVersion();
printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build);
int driverVersion, runtimeVersion;
cudaDriverGetVersion(&driverVersion);
cudaRuntimeGetVersion(&runtimeVersion);
printf("CUDA Driver Version: %d.%d\n", driverVersion/1000, (driverVersion%100)/10);
printf("CUDA Runtime Version: %d.%d\n\n", runtimeVersion/1000, (runtimeVersion%100)/10);
cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMajor,
cudaDevAttrComputeCapabilityMajor,
nppStreamCtx.nCudaDeviceId);
if (cudaError != cudaSuccess)
return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMinor,
cudaDevAttrComputeCapabilityMinor,
nppStreamCtx.nCudaDeviceId);
if (cudaError != cudaSuccess)
return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
cudaError = cudaStreamGetFlags(nppStreamCtx.hStream, &nppStreamCtx.nStreamFlags);
cudaDeviceProp oDeviceProperties;
cudaError = cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId);
nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount;
nppStreamCtx.nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor;
nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock;
nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock;
if (checkCmdLineFlag(argc, (const char **)argv, "input")) { if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
getCmdLineArgumentString(argc, (const char **)argv, "input", &filePath); getCmdLineArgumentString(argc, (const char **)argv, "input", &filePath);
} else { } else {
@ -171,10 +154,10 @@ int main(int argc, char *argv[]) {
NppiPoint oAnchor = {oMaskSize.width / 2, oMaskSize.height / 2}; NppiPoint oAnchor = {oMaskSize.width / 2, oMaskSize.height / 2};
// run box filter // run box filter
NPP_CHECK_NPP(nppiFilterBoxBorder_8u_C1R_Ctx( NPP_CHECK_NPP(nppiFilterBoxBorder_8u_C1R(
oDeviceSrc.data(), oDeviceSrc.pitch(), oSrcSize, oSrcOffset, oDeviceSrc.data(), oDeviceSrc.pitch(), oSrcSize, oSrcOffset,
oDeviceDst.data(), oDeviceDst.pitch(), oSizeROI, oMaskSize, oAnchor, oDeviceDst.data(), oDeviceDst.pitch(), oSizeROI, oMaskSize, oAnchor,
NPP_BORDER_REPLICATE, nppStreamCtx)); NPP_BORDER_REPLICATE));
// declare a host image for the result // declare a host image for the result
npp::ImageCPU_8u_C1 oHostDst(oDeviceDst.size()); npp::ImageCPU_8u_C1 oHostDst(oDeviceDst.size());

View File

@ -68,6 +68,26 @@ inline int cudaDeviceInit(int argc, const char **argv) {
return dev; return dev;
} }
bool printfNPPinfo(int argc, char *argv[]) {
const NppLibraryVersion *libVer = nppGetLibVersion();
printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor,
libVer->build);
int driverVersion, runtimeVersion;
cudaDriverGetVersion(&driverVersion);
cudaRuntimeGetVersion(&runtimeVersion);
printf(" CUDA Driver Version: %d.%d\n", driverVersion / 1000,
(driverVersion % 100) / 10);
printf(" CUDA Runtime Version: %d.%d\n", runtimeVersion / 1000,
(runtimeVersion % 100) / 10);
// Min spec is SM 1.0 devices
bool bVal = checkCudaCapabilities(1, 0);
return bVal;
}
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
printf("%s Starting...\n\n", argv[0]); printf("%s Starting...\n\n", argv[0]);
@ -77,50 +97,10 @@ int main(int argc, char *argv[]) {
cudaDeviceInit(argc, (const char **)argv); cudaDeviceInit(argc, (const char **)argv);
NppStreamContext nppStreamCtx; if (printfNPPinfo(argc, argv) == false) {
nppStreamCtx.hStream = 0; // The NULL stream by default, set this to whatever your stream ID is if not the NULL stream. exit(EXIT_SUCCESS);
cudaError_t cudaError = cudaGetDevice(&nppStreamCtx.nCudaDeviceId);
if (cudaError != cudaSuccess)
{
printf("CUDA error: no devices supporting CUDA.\n");
return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
} }
const NppLibraryVersion *libVer = nppGetLibVersion();
printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build);
int driverVersion, runtimeVersion;
cudaDriverGetVersion(&driverVersion);
cudaRuntimeGetVersion(&runtimeVersion);
printf("CUDA Driver Version: %d.%d\n", driverVersion/1000, (driverVersion%100)/10);
printf("CUDA Runtime Version: %d.%d\n\n", runtimeVersion/1000, (runtimeVersion%100)/10);
cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMajor,
cudaDevAttrComputeCapabilityMajor,
nppStreamCtx.nCudaDeviceId);
if (cudaError != cudaSuccess)
return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMinor,
cudaDevAttrComputeCapabilityMinor,
nppStreamCtx.nCudaDeviceId);
if (cudaError != cudaSuccess)
return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
cudaError = cudaStreamGetFlags(nppStreamCtx.hStream, &nppStreamCtx.nStreamFlags);
cudaDeviceProp oDeviceProperties;
cudaError = cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId);
nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount;
nppStreamCtx.nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor;
nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock;
nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock;
if (checkCmdLineFlag(argc, (const char **)argv, "input")) { if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
getCmdLineArgumentString(argc, (const char **)argv, "input", &filePath); getCmdLineArgumentString(argc, (const char **)argv, "input", &filePath);
} else { } else {
@ -210,11 +190,11 @@ int main(int argc, char *argv[]) {
Npp16s nHighThreshold = 256; Npp16s nHighThreshold = 256;
if ((nBufferSize > 0) && (pScratchBufferNPP != 0)) { if ((nBufferSize > 0) && (pScratchBufferNPP != 0)) {
NPP_CHECK_NPP(nppiFilterCannyBorder_8u_C1R_Ctx( NPP_CHECK_NPP(nppiFilterCannyBorder_8u_C1R(
oDeviceSrc.data(), oDeviceSrc.pitch(), oSrcSize, oSrcOffset, oDeviceSrc.data(), oDeviceSrc.pitch(), oSrcSize, oSrcOffset,
oDeviceDst.data(), oDeviceDst.pitch(), oSizeROI, NPP_FILTER_SOBEL, oDeviceDst.data(), oDeviceDst.pitch(), oSizeROI, NPP_FILTER_SOBEL,
NPP_MASK_SIZE_3_X_3, nLowThreshold, nHighThreshold, nppiNormL2, NPP_MASK_SIZE_3_X_3, nLowThreshold, nHighThreshold, nppiNormL2,
NPP_BORDER_REPLICATE, pScratchBufferNPP, nppStreamCtx)); NPP_BORDER_REPLICATE, pScratchBufferNPP));
} }
// free scratch buffer memory // free scratch buffer memory

View File

@ -70,6 +70,25 @@ inline int cudaDeviceInit(int argc, const char **argv) {
return dev; return dev;
} }
bool printfNPPinfo(int argc, char *argv[], int cudaVerMajor, int cudaVerMinor) {
const NppLibraryVersion *libVer = nppGetLibVersion();
printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor,
libVer->build);
int driverVersion, runtimeVersion;
cudaDriverGetVersion(&driverVersion);
cudaRuntimeGetVersion(&runtimeVersion);
printf(" CUDA Driver Version: %d.%d\n", driverVersion / 1000,
(driverVersion % 100) / 10);
printf(" CUDA Runtime Version: %d.%d\n", runtimeVersion / 1000,
(runtimeVersion % 100) / 10);
bool bVal = checkCudaCapabilities(cudaVerMajor, cudaVerMinor);
return bVal;
}
// Error handler for FreeImage library. // Error handler for FreeImage library.
// In case this handler is invoked, it throws an NPP exception. // In case this handler is invoked, it throws an NPP exception.
extern "C" void FreeImageErrorHandler(FREE_IMAGE_FORMAT oFif, extern "C" void FreeImageErrorHandler(FREE_IMAGE_FORMAT oFif,
@ -138,50 +157,11 @@ int main(int argc, char *argv[]) {
cudaDeviceInit(argc, (const char **)argv); cudaDeviceInit(argc, (const char **)argv);
NppStreamContext nppStreamCtx; // Min spec is SM 1.0 devices
nppStreamCtx.hStream = 0; // The NULL stream by default, set this to whatever your stream ID is if not the NULL stream. if (printfNPPinfo(argc, argv, 1, 0) == false) {
exit(EXIT_SUCCESS);
cudaError_t cudaError = cudaGetDevice(&nppStreamCtx.nCudaDeviceId);
if (cudaError != cudaSuccess)
{
printf("CUDA error: no devices supporting CUDA.\n");
return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
} }
const NppLibraryVersion *libVer = nppGetLibVersion();
printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build);
int driverVersion, runtimeVersion;
cudaDriverGetVersion(&driverVersion);
cudaRuntimeGetVersion(&runtimeVersion);
printf("CUDA Driver Version: %d.%d\n", driverVersion/1000, (driverVersion%100)/10);
printf("CUDA Runtime Version: %d.%d\n\n", runtimeVersion/1000, (runtimeVersion%100)/10);
cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMajor,
cudaDevAttrComputeCapabilityMajor,
nppStreamCtx.nCudaDeviceId);
if (cudaError != cudaSuccess)
return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMinor,
cudaDevAttrComputeCapabilityMinor,
nppStreamCtx.nCudaDeviceId);
if (cudaError != cudaSuccess)
return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
cudaError = cudaStreamGetFlags(nppStreamCtx.hStream, &nppStreamCtx.nStreamFlags);
cudaDeviceProp oDeviceProperties;
cudaError = cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId);
nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount;
nppStreamCtx.nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor;
nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock;
nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock;
if (checkCmdLineFlag(argc, (const char **)argv, "input")) { if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
getCmdLineArgumentString(argc, (const char **)argv, "input", &filePath); getCmdLineArgumentString(argc, (const char **)argv, "input", &filePath);
} else { } else {
@ -280,9 +260,9 @@ int main(int argc, char *argv[]) {
Npp8u *pDstImageCUDA = Npp8u *pDstImageCUDA =
nppiMalloc_8u_C1(oSizeROI.width, oSizeROI.height, &nDstPitchCUDA); nppiMalloc_8u_C1(oSizeROI.width, oSizeROI.height, &nDstPitchCUDA);
NPP_ASSERT_NOT_NULL(pDstImageCUDA); NPP_ASSERT_NOT_NULL(pDstImageCUDA);
NPP_CHECK_NPP(nppiFilterBox_8u_C1R_Ctx(pSrcImageCUDA, nSrcPitchCUDA, NPP_CHECK_NPP(nppiFilterBox_8u_C1R(pSrcImageCUDA, nSrcPitchCUDA,
pDstImageCUDA, nDstPitchCUDA, oSizeROI, pDstImageCUDA, nDstPitchCUDA, oSizeROI,
oMaskSize, oMaskAchnor, nppStreamCtx)); oMaskSize, oMaskAchnor));
// create the result image storage using FreeImage so we can easily // create the result image storage using FreeImage so we can easily
// save // save
FIBITMAP *pResultBitmap = FreeImage_Allocate( FIBITMAP *pResultBitmap = FreeImage_Allocate(

View File

@ -73,6 +73,26 @@ inline int cudaDeviceInit(int argc, const char **argv) {
return dev; return dev;
} }
bool printfNPPinfo(int argc, char *argv[]) {
const NppLibraryVersion *libVer = nppGetLibVersion();
printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor,
libVer->build);
int driverVersion, runtimeVersion;
cudaDriverGetVersion(&driverVersion);
cudaRuntimeGetVersion(&runtimeVersion);
printf(" CUDA Driver Version: %d.%d\n", driverVersion / 1000,
(driverVersion % 100) / 10);
printf(" CUDA Runtime Version: %d.%d\n", runtimeVersion / 1000,
(runtimeVersion % 100) / 10);
// Min spec is SM 1.1 devices
bool bVal = checkCudaCapabilities(1, 1);
return bVal;
}
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
printf("%s Starting...\n\n", argv[0]); printf("%s Starting...\n\n", argv[0]);
@ -82,50 +102,10 @@ int main(int argc, char *argv[]) {
cudaDeviceInit(argc, (const char **)argv); cudaDeviceInit(argc, (const char **)argv);
NppStreamContext nppStreamCtx; if (printfNPPinfo(argc, argv) == false) {
nppStreamCtx.hStream = 0; // The NULL stream by default, set this to whatever your stream ID is if not the NULL stream. exit(EXIT_SUCCESS);
cudaError_t cudaError = cudaGetDevice(&nppStreamCtx.nCudaDeviceId);
if (cudaError != cudaSuccess)
{
printf("CUDA error: no devices supporting CUDA.\n");
return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
} }
const NppLibraryVersion *libVer = nppGetLibVersion();
printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build);
int driverVersion, runtimeVersion;
cudaDriverGetVersion(&driverVersion);
cudaRuntimeGetVersion(&runtimeVersion);
printf("CUDA Driver Version: %d.%d\n", driverVersion/1000, (driverVersion%100)/10);
printf("CUDA Runtime Version: %d.%d\n\n", runtimeVersion/1000, (runtimeVersion%100)/10);
cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMajor,
cudaDevAttrComputeCapabilityMajor,
nppStreamCtx.nCudaDeviceId);
if (cudaError != cudaSuccess)
return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMinor,
cudaDevAttrComputeCapabilityMinor,
nppStreamCtx.nCudaDeviceId);
if (cudaError != cudaSuccess)
return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
cudaError = cudaStreamGetFlags(nppStreamCtx.hStream, &nppStreamCtx.nStreamFlags);
cudaDeviceProp oDeviceProperties;
cudaError = cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId);
nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount;
nppStreamCtx.nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor;
nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock;
nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock;
if (checkCmdLineFlag(argc, (const char **)argv, "input")) { if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
getCmdLineArgumentString(argc, (const char **)argv, "input", &filePath); getCmdLineArgumentString(argc, (const char **)argv, "input", &filePath);
} else { } else {
@ -202,9 +182,8 @@ int main(int argc, char *argv[]) {
(int)oDeviceSrc.height()}; // full image (int)oDeviceSrc.height()}; // full image
// create device scratch buffer for nppiHistogram // create device scratch buffer for nppiHistogram
size_t nDeviceBufferSize; size_t nDeviceBufferSize;
nppiHistogramEvenGetBufferSize_8u_C1R_Ctx(oSizeROI, levelCount, nppiHistogramEvenGetBufferSize_8u_C1R(oSizeROI, levelCount,
&nDeviceBufferSize, &nDeviceBufferSize);
nppStreamCtx);
Npp8u *pDeviceBuffer; Npp8u *pDeviceBuffer;
NPP_CHECK_CUDA(cudaMalloc((void **)&pDeviceBuffer, nDeviceBufferSize)); NPP_CHECK_CUDA(cudaMalloc((void **)&pDeviceBuffer, nDeviceBufferSize));
@ -212,9 +191,9 @@ int main(int argc, char *argv[]) {
Npp32s levelsHost[levelCount]; Npp32s levelsHost[levelCount];
NPP_CHECK_NPP(nppiEvenLevelsHost_32s(levelsHost, levelCount, 0, binCount)); NPP_CHECK_NPP(nppiEvenLevelsHost_32s(levelsHost, levelCount, 0, binCount));
// compute the histogram // compute the histogram
NPP_CHECK_NPP(nppiHistogramEven_8u_C1R_Ctx( NPP_CHECK_NPP(nppiHistogramEven_8u_C1R(
oDeviceSrc.data(), oDeviceSrc.pitch(), oSizeROI, histDevice, levelCount, oDeviceSrc.data(), oDeviceSrc.pitch(), oSizeROI, histDevice, levelCount,
0, binCount, pDeviceBuffer, nppStreamCtx)); 0, binCount, pDeviceBuffer));
// copy histogram and levels to host memory // copy histogram and levels to host memory
Npp32s histHost[binCount]; Npp32s histHost[binCount];
NPP_CHECK_CUDA(cudaMemcpy(histHost, histDevice, binCount * sizeof(Npp32s), NPP_CHECK_CUDA(cudaMemcpy(histHost, histDevice, binCount * sizeof(Npp32s),
@ -275,22 +254,20 @@ int main(int argc, char *argv[]) {
sizeof(Npp32s) * (levelCount), sizeof(Npp32s) * (levelCount),
cudaMemcpyHostToDevice)); cudaMemcpyHostToDevice));
NPP_CHECK_NPP(nppiLUT_Linear_8u_C1R_Ctx( NPP_CHECK_NPP(nppiLUT_Linear_8u_C1R(
oDeviceSrc.data(), oDeviceSrc.pitch(), oDeviceDst.data(), oDeviceSrc.data(), oDeviceSrc.pitch(), oDeviceDst.data(),
oDeviceDst.pitch(), oSizeROI, oDeviceDst.pitch(), oSizeROI,
lutDevice, // value and level arrays are in host memory lutDevice, // value and level arrays are in GPU device memory
lvlsDevice, levelCount, lvlsDevice, levelCount));
nppStreamCtx));
NPP_CHECK_CUDA(cudaFree(lutDevice)); NPP_CHECK_CUDA(cudaFree(lutDevice));
NPP_CHECK_CUDA(cudaFree(lvlsDevice)); NPP_CHECK_CUDA(cudaFree(lvlsDevice));
#else #else
NPP_CHECK_NPP(nppiLUT_Linear_8u_C1R_Ctx( NPP_CHECK_NPP(nppiLUT_Linear_8u_C1R(
oDeviceSrc.data(), oDeviceSrc.pitch(), oDeviceDst.data(), oDeviceSrc.data(), oDeviceSrc.pitch(), oDeviceDst.data(),
oDeviceDst.pitch(), oSizeROI, oDeviceDst.pitch(), oSizeROI,
lutHost, // value and level arrays are in host memory lutHost, // value and level arrays are in host memory
levelsHost, levelCount, levelsHost, levelCount));
nppStreamCtx));
#endif #endif
// copy the result image back into the storage that contained the // copy the result image back into the storage that contained the

View File

@ -328,10 +328,9 @@ main( int argc, char** argv )
int nCompressedLabelCount = 0; int nCompressedLabelCount = 0;
nppStatus = nppiCompressMarkerLabelsUF_32u_C1IR_Ctx(pSegmentLabelsOutputBufferDev[nImage], oSizeROI[nImage].width * sizeof(Npp32u), oSizeROI[nImage], nppStatus = nppiCompressMarkerLabelsUF_32u_C1IR(pSegmentLabelsOutputBufferDev[nImage], oSizeROI[nImage].width * sizeof(Npp32u), oSizeROI[nImage],
oSizeROI[nImage].width * oSizeROI[nImage].height, &nCompressedLabelCount, oSizeROI[nImage].width * oSizeROI[nImage].height, &nCompressedLabelCount,
pCompressedLabelsScratchBufferDev, pCompressedLabelsScratchBufferDev);
nppStreamCtx);
if (nppStatus != NPP_SUCCESS) if (nppStatus != NPP_SUCCESS)
{ {
@ -540,3 +539,6 @@ main( int argc, char** argv )
return 0; return 0;
} }