diff --git a/Samples/4_CUDA_Libraries/FilterBorderControlNPP/FilterBorderControlNPP.cpp b/Samples/4_CUDA_Libraries/FilterBorderControlNPP/FilterBorderControlNPP.cpp
index e79eb73b..d57bdd9b 100644
--- a/Samples/4_CUDA_Libraries/FilterBorderControlNPP/FilterBorderControlNPP.cpp
+++ b/Samples/4_CUDA_Libraries/FilterBorderControlNPP/FilterBorderControlNPP.cpp
@@ -74,26 +74,6 @@ inline int cudaDeviceInit(int argc, const char **argv) {
   return dev;
 }
 
-bool printfNPPinfo(int argc, char *argv[]) {
-  const NppLibraryVersion *libVer = nppGetLibVersion();
-
-  printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor,
-         libVer->build);
-
-  int driverVersion, runtimeVersion;
-  cudaDriverGetVersion(&driverVersion);
-  cudaRuntimeGetVersion(&runtimeVersion);
-
-  printf("  CUDA Driver  Version: %d.%d\n", driverVersion / 1000,
-         (driverVersion % 100) / 10);
-  printf("  CUDA Runtime Version: %d.%d\n", runtimeVersion / 1000,
-         (runtimeVersion % 100) / 10);
-
-  // Min spec is SM 1.0 devices
-  bool bVal = checkCudaCapabilities(1, 0);
-  return bVal;
-}
-
 int main(int argc, char *argv[]) {
   printf("%s Starting...\n\n", argv[0]);
 
@@ -104,11 +84,50 @@ int main(int argc, char *argv[]) {
 
     cudaDeviceInit(argc, (const char **)argv);
 
-    if (printfNPPinfo(argc, argv) == false) {
-      cudaDeviceReset();
-      exit(EXIT_SUCCESS);
+    NppStreamContext nppStreamCtx;
+    nppStreamCtx.hStream = 0; // The NULL stream by default, set this to whatever your stream ID is if not the NULL stream.
+
+    cudaError_t cudaError = cudaGetDevice(&nppStreamCtx.nCudaDeviceId);
+    if (cudaError != cudaSuccess)
+    {
+        printf("CUDA error: no devices supporting CUDA.\n");
+        return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
     }
 
+    const NppLibraryVersion *libVer   = nppGetLibVersion();
+
+    printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build);
+
+    int driverVersion, runtimeVersion;
+    cudaDriverGetVersion(&driverVersion);
+    cudaRuntimeGetVersion(&runtimeVersion);
+
+    printf("CUDA Driver  Version: %d.%d\n", driverVersion/1000, (driverVersion%100)/10);
+    printf("CUDA Runtime Version: %d.%d\n\n", runtimeVersion/1000, (runtimeVersion%100)/10);
+
+    cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMajor, 
+                                      cudaDevAttrComputeCapabilityMajor, 
+                                      nppStreamCtx.nCudaDeviceId);
+    if (cudaError != cudaSuccess)
+        return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
+
+    cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMinor, 
+                                      cudaDevAttrComputeCapabilityMinor, 
+                                      nppStreamCtx.nCudaDeviceId);
+    if (cudaError != cudaSuccess)
+        return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
+
+    cudaError = cudaStreamGetFlags(nppStreamCtx.hStream, &nppStreamCtx.nStreamFlags);
+
+    cudaDeviceProp oDeviceProperties;
+
+    cudaError = cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId);
+
+    nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount;
+    nppStreamCtx.nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor;
+    nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock;
+    nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock;
+
     char *filePath;
 
     if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
@@ -186,11 +205,11 @@ int main(int argc, char *argv[]) {
     npp::ImageNPP_16s_C1 oDeviceDstY(oSizeROI.width, oSizeROI.height);
 
     // run Prewitt edge detection gradient vector filter
-    NPP_CHECK_NPP(nppiGradientVectorPrewittBorder_8u16s_C1R(
+    NPP_CHECK_NPP(nppiGradientVectorPrewittBorder_8u16s_C1R_Ctx(
         oDeviceSrc.data(), oDeviceSrc.pitch(), oSrcSize, oSrcOffset,
         oDeviceDstX.data(), oDeviceDstX.pitch(), oDeviceDstY.data(),
         oDeviceDstY.pitch(), 0, 0, 0, 0, oSizeROI, NPP_MASK_SIZE_3_X_3,
-        nppiNormL1, NPP_BORDER_REPLICATE));
+        nppiNormL1, NPP_BORDER_REPLICATE, nppStreamCtx));
 
     // allocate device destination images of appropriatedly size
     npp::ImageNPP_8u_C1 oDeviceDstOutX(oSizeROI.width, oSizeROI.height);
@@ -198,13 +217,13 @@ int main(int argc, char *argv[]) {
 
     // convert 16s_C1 result images to binary 8u_C1 output images using constant
     // value to adjust amount of visible detail
-    NPP_CHECK_NPP(nppiCompareC_16s_C1R(
+    NPP_CHECK_NPP(nppiCompareC_16s_C1R_Ctx(
         oDeviceDstX.data(), oDeviceDstX.pitch(), 32, oDeviceDstOutX.data(),
-        oDeviceDstOutX.pitch(), oSizeROI, NPP_CMP_GREATER_EQ));
+        oDeviceDstOutX.pitch(), oSizeROI, NPP_CMP_GREATER_EQ, nppStreamCtx));
 
-    NPP_CHECK_NPP(nppiCompareC_16s_C1R(
+    NPP_CHECK_NPP(nppiCompareC_16s_C1R_Ctx(
         oDeviceDstY.data(), oDeviceDstY.pitch(), 32, oDeviceDstOutY.data(),
-        oDeviceDstOutY.pitch(), oSizeROI, NPP_CMP_GREATER_EQ));
+        oDeviceDstOutY.pitch(), oSizeROI, NPP_CMP_GREATER_EQ, nppStreamCtx));
 
     // create host images for the results
     npp::ImageCPU_8u_C1 oHostDstX(oDeviceDstOutX.size());
@@ -234,10 +253,10 @@ int main(int argc, char *argv[]) {
 
     // copy and enlarge the original device source image and surround it with a
     // white edge (border)
-    NPP_CHECK_NPP(nppiCopyConstBorder_8u_C1R(
+    NPP_CHECK_NPP(nppiCopyConstBorder_8u_C1R_Ctx(
         oDeviceSrc.data(), oDeviceSrc.pitch(), oSrcSize,
         oEnlargedDeviceSrc.data(), oEnlargedDeviceSrc.pitch(), oEnlargedSrcSize,
-        oMaskSize.width / 2, oMaskSize.height / 2, 255));
+        oMaskSize.width / 2, oMaskSize.height / 2, 255, nppStreamCtx));
 
     // adjust oEnlargedDeviceSrc pixel pointer to point to the first pixel of
     // the original source image in the enlarged source image
@@ -261,23 +280,25 @@ int main(int argc, char *argv[]) {
 
     // run Prewitt edge detection gradient vector filter bypassing border
     // control due to enlarged source image
-    NPP_CHECK_NPP(nppiGradientVectorPrewittBorder_8u16s_C1R(
+    NPP_CHECK_NPP(nppiGradientVectorPrewittBorder_8u16s_C1R_Ctx(
         pAdjustedSrc, oEnlargedDeviceSrc.pitch(), oEnlargedSrcSize, oSrcOffset,
         oDeviceDstX.data(), oDeviceDstX.pitch(), oDeviceDstY.data(),
         oDeviceDstY.pitch(), 0, 0, 0, 0, oSizeROI, NPP_MASK_SIZE_3_X_3,
-        nppiNormL1, NPP_BORDER_REPLICATE));
+        nppiNormL1, NPP_BORDER_REPLICATE, nppStreamCtx));
 
     // convert 16s_C1 result images to binary 8u_C1 output images using constant
     // value to adjust amount of visible detail
-    NPP_CHECK_NPP(nppiCompareC_16s_C1R(oDeviceDstX.data(), oDeviceDstX.pitch(),
-                                       32, oDeviceDstOutXNoBorders.data(),
-                                       oDeviceDstOutXNoBorders.pitch(),
-                                       oSizeROI, NPP_CMP_GREATER_EQ));
+    NPP_CHECK_NPP(nppiCompareC_16s_C1R_Ctx(oDeviceDstX.data(), oDeviceDstX.pitch(),
+                                           32, oDeviceDstOutXNoBorders.data(),
+                                           oDeviceDstOutXNoBorders.pitch(),
+                                           oSizeROI, NPP_CMP_GREATER_EQ,
+                                           nppStreamCtx));
 
-    NPP_CHECK_NPP(nppiCompareC_16s_C1R(oDeviceDstY.data(), oDeviceDstY.pitch(),
-                                       32, oDeviceDstOutYNoBorders.data(),
-                                       oDeviceDstOutYNoBorders.pitch(),
-                                       oSizeROI, NPP_CMP_GREATER_EQ));
+    NPP_CHECK_NPP(nppiCompareC_16s_C1R_Ctx(oDeviceDstY.data(), oDeviceDstY.pitch(),
+                                           32, oDeviceDstOutYNoBorders.data(),
+                                           oDeviceDstOutYNoBorders.pitch(),
+                                           oSizeROI, NPP_CMP_GREATER_EQ,
+                                           nppStreamCtx));
     // create additional output files
     std::string sResultXNoBordersFilename = sResultBaseFilename;
     std::string sResultYNoBordersFilename = sResultBaseFilename;
@@ -305,15 +326,17 @@ int main(int argc, char *argv[]) {
 
     // diff the two 8u_C1 result images one with and one without border control
 
-    NPP_CHECK_NPP(nppiAbsDiff_8u_C1R(
+    NPP_CHECK_NPP(nppiAbsDiff_8u_C1R_Ctx(
         oDeviceDstOutXNoBorders.data(), oDeviceDstOutXNoBorders.pitch(),
         oDeviceDstOutX.data(), oDeviceDstOutX.pitch(),
-        oDeviceDstOutXDiff.data(), oDeviceDstOutXDiff.pitch(), oSizeROI));
+        oDeviceDstOutXDiff.data(), oDeviceDstOutXDiff.pitch(), oSizeROI,
+        nppStreamCtx));
 
-    NPP_CHECK_NPP(nppiAbsDiff_8u_C1R(
+    NPP_CHECK_NPP(nppiAbsDiff_8u_C1R_Ctx(
         oDeviceDstOutYNoBorders.data(), oDeviceDstOutYNoBorders.pitch(),
         oDeviceDstOutY.data(), oDeviceDstOutY.pitch(),
-        oDeviceDstOutYDiff.data(), oDeviceDstOutYDiff.pitch(), oSizeROI));
+        oDeviceDstOutYDiff.data(), oDeviceDstOutYDiff.pitch(), oSizeROI,
+        nppStreamCtx));
 
     // create additional output files
     std::string sResultXDiffFilename = sResultBaseFilename;
@@ -380,11 +403,11 @@ int main(int argc, char *argv[]) {
 
     // run Prewitt edge detection gradient vector filter to generate the left
     // side of the output image
-    NPP_CHECK_NPP(nppiGradientVectorPrewittBorder_8u16s_C1R(
+    NPP_CHECK_NPP(nppiGradientVectorPrewittBorder_8u16s_C1R_Ctx(
         pAdjustedSrc, oEnlargedDeviceSrc.pitch(), oEnlargedSrcSize, oSrcOffset,
         oDeviceDstX.data(), oDeviceDstX.pitch(), oDeviceDstY.data(),
         oDeviceDstY.pitch(), 0, 0, 0, 0, oSizeROI, NPP_MASK_SIZE_3_X_3,
-        nppiNormL1, NPP_BORDER_REPLICATE));
+        nppiNormL1, NPP_BORDER_REPLICATE, nppStreamCtx));
 
     // now move the enlarged source pointer to the horizontal middle of the
     // enlarged source image and tell the function where it was moved to
@@ -401,23 +424,26 @@ int main(int argc, char *argv[]) {
     // run Prewitt edge detection gradient vector filter to generate the right
     // side of the output image adjusting the destination image pointers
     // appropriately
-    NPP_CHECK_NPP(nppiGradientVectorPrewittBorder_8u16s_C1R(
+    NPP_CHECK_NPP(nppiGradientVectorPrewittBorder_8u16s_C1R_Ctx(
         pAdjustedSrc, oEnlargedDeviceSrc.pitch(), oEnlargedSrcSize, oSrcOffset,
         oDeviceDstX.data() + nLeftWidth, oDeviceDstX.pitch(),
         oDeviceDstY.data() + nLeftWidth, oDeviceDstY.pitch(), 0, 0, 0, 0,
-        oSizeROI, NPP_MASK_SIZE_3_X_3, nppiNormL1, NPP_BORDER_REPLICATE));
+        oSizeROI, NPP_MASK_SIZE_3_X_3, nppiNormL1, NPP_BORDER_REPLICATE,
+        nppStreamCtx));
 
     // convert 16s_C1 result images to binary 8u_C1 output images using constant
     // value to adjust amount of visible detail
-    NPP_CHECK_NPP(nppiCompareC_16s_C1R(oDeviceDstX.data(), oDeviceDstX.pitch(),
-                                       32, oDeviceDstOutXMixedBorders.data(),
-                                       oDeviceDstOutXMixedBorders.pitch(),
-                                       oSizeROI, NPP_CMP_GREATER_EQ));
+    NPP_CHECK_NPP(nppiCompareC_16s_C1R_Ctx(oDeviceDstX.data(), oDeviceDstX.pitch(),
+                                           32, oDeviceDstOutXMixedBorders.data(),
+                                           oDeviceDstOutXMixedBorders.pitch(),
+                                           oSizeROI, NPP_CMP_GREATER_EQ,
+                                           nppStreamCtx));
 
-    NPP_CHECK_NPP(nppiCompareC_16s_C1R(oDeviceDstY.data(), oDeviceDstY.pitch(),
-                                       32, oDeviceDstOutYMixedBorders.data(),
-                                       oDeviceDstOutYMixedBorders.pitch(),
-                                       oSizeROI, NPP_CMP_GREATER_EQ));
+    NPP_CHECK_NPP(nppiCompareC_16s_C1R_Ctx(oDeviceDstY.data(), oDeviceDstY.pitch(),
+                                           32, oDeviceDstOutYMixedBorders.data(),
+                                           oDeviceDstOutYMixedBorders.pitch(),
+                                           oSizeROI, NPP_CMP_GREATER_EQ,
+                                           nppStreamCtx));
     // create additional output files
     std::string sResultXMixedBordersFilename = sResultBaseFilename;
     std::string sResultYMixedBordersFilename = sResultBaseFilename;
@@ -439,15 +465,17 @@ int main(int argc, char *argv[]) {
     // diff the original 8u_C1 result images with border control and the mixed
     // border control images, they should match (diff image will be all black)
 
-    NPP_CHECK_NPP(nppiAbsDiff_8u_C1R(
+    NPP_CHECK_NPP(nppiAbsDiff_8u_C1R_Ctx(
         oDeviceDstOutXMixedBorders.data(), oDeviceDstOutXMixedBorders.pitch(),
         oDeviceDstOutX.data(), oDeviceDstOutX.pitch(),
-        oDeviceDstOutXDiff.data(), oDeviceDstOutXDiff.pitch(), oSizeROI));
+        oDeviceDstOutXDiff.data(), oDeviceDstOutXDiff.pitch(), oSizeROI,
+        nppStreamCtx));
 
-    NPP_CHECK_NPP(nppiAbsDiff_8u_C1R(
+    NPP_CHECK_NPP(nppiAbsDiff_8u_C1R_Ctx(
         oDeviceDstOutYMixedBorders.data(), oDeviceDstOutYMixedBorders.pitch(),
         oDeviceDstOutY.data(), oDeviceDstOutY.pitch(),
-        oDeviceDstOutYDiff.data(), oDeviceDstOutYDiff.pitch(), oSizeROI));
+        oDeviceDstOutYDiff.data(), oDeviceDstOutYDiff.pitch(), oSizeROI,
+        nppStreamCtx));
 
     // create additional output files
     std::string sResultXMixedDiffFilename = sResultBaseFilename;
diff --git a/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512.pgm_gradientVectorPrewittBorderX_Vertical.pgm b/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512.pgm_gradientVectorPrewittBorderX_Vertical.pgm
new file mode 100644
index 00000000..cb16736e
Binary files /dev/null and b/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512.pgm_gradientVectorPrewittBorderX_Vertical.pgm differ
diff --git a/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512_gradientVectorPrewittBorderX_Vertical_BorderDiffs.pgm b/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512_gradientVectorPrewittBorderX_Vertical_BorderDiffs.pgm
new file mode 100644
index 00000000..693a9af6
Binary files /dev/null and b/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512_gradientVectorPrewittBorderX_Vertical_BorderDiffs.pgm differ
diff --git a/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512_gradientVectorPrewittBorderX_Vertical_MixedBorderDiffs.pgm b/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512_gradientVectorPrewittBorderX_Vertical_MixedBorderDiffs.pgm
new file mode 100644
index 00000000..1099979c
Binary files /dev/null and b/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512_gradientVectorPrewittBorderX_Vertical_MixedBorderDiffs.pgm differ
diff --git a/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512_gradientVectorPrewittBorderX_Vertical_WithMixedBorders.pgm b/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512_gradientVectorPrewittBorderX_Vertical_WithMixedBorders.pgm
new file mode 100644
index 00000000..d9963cc9
Binary files /dev/null and b/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512_gradientVectorPrewittBorderX_Vertical_WithMixedBorders.pgm differ
diff --git a/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512_gradientVectorPrewittBorderX_Vertical_WithNoSourceBorders.pgm b/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512_gradientVectorPrewittBorderX_Vertical_WithNoSourceBorders.pgm
new file mode 100644
index 00000000..b191e108
Binary files /dev/null and b/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512_gradientVectorPrewittBorderX_Vertical_WithNoSourceBorders.pgm differ
diff --git a/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512_gradientVectorPrewittBorderY_Horizontal.pgm b/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512_gradientVectorPrewittBorderY_Horizontal.pgm
new file mode 100644
index 00000000..dff35323
Binary files /dev/null and b/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512_gradientVectorPrewittBorderY_Horizontal.pgm differ
diff --git a/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512_gradientVectorPrewittBorderY_Horizontal_BorderDiffs.pgm b/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512_gradientVectorPrewittBorderY_Horizontal_BorderDiffs.pgm
new file mode 100644
index 00000000..c7ac2fd7
Binary files /dev/null and b/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512_gradientVectorPrewittBorderY_Horizontal_BorderDiffs.pgm differ
diff --git a/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512_gradientVectorPrewittBorderY_Horizontal_MixedBorderDiffs.pgm b/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512_gradientVectorPrewittBorderY_Horizontal_MixedBorderDiffs.pgm
new file mode 100644
index 00000000..30d47c98
Binary files /dev/null and b/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512_gradientVectorPrewittBorderY_Horizontal_MixedBorderDiffs.pgm differ
diff --git a/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512_gradientVectorPrewittBorderY_Horizontal_WithMixedBorders.pgm b/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512_gradientVectorPrewittBorderY_Horizontal_WithMixedBorders.pgm
new file mode 100644
index 00000000..34e88fe7
Binary files /dev/null and b/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512_gradientVectorPrewittBorderY_Horizontal_WithMixedBorders.pgm differ
diff --git a/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512_gradientVectorPrewittBorderY_Horizontal_WithNoSourceBorders.pgm b/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512_gradientVectorPrewittBorderY_Horizontal_WithNoSourceBorders.pgm
new file mode 100644
index 00000000..5037464d
Binary files /dev/null and b/Samples/4_CUDA_Libraries/FilterBorderControlNPP/data/teapot512_gradientVectorPrewittBorderY_Horizontal_WithNoSourceBorders.pgm differ
diff --git a/Samples/4_CUDA_Libraries/boxFilterNPP/boxFilterNPP.cpp b/Samples/4_CUDA_Libraries/boxFilterNPP/boxFilterNPP.cpp
index 3d177ce8..10cef4af 100644
--- a/Samples/4_CUDA_Libraries/boxFilterNPP/boxFilterNPP.cpp
+++ b/Samples/4_CUDA_Libraries/boxFilterNPP/boxFilterNPP.cpp
@@ -48,39 +48,56 @@
 #include <helper_string.h>
 
 
-bool printfNPPinfo(int argc, char *argv[]) {
-  const NppLibraryVersion *libVer = nppGetLibVersion();
-
-  printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor,
-         libVer->build);
-
-  int driverVersion, runtimeVersion;
-  cudaDriverGetVersion(&driverVersion);
-  cudaRuntimeGetVersion(&runtimeVersion);
-
-  printf("  CUDA Driver  Version: %d.%d\n", driverVersion / 1000,
-         (driverVersion % 100) / 10);
-  printf("  CUDA Runtime Version: %d.%d\n", runtimeVersion / 1000,
-         (runtimeVersion % 100) / 10);
-
-  // Min spec is SM 1.0 devices
-  bool bVal = checkCudaCapabilities(1, 0);
-  return bVal;
-}
-
 int main(int argc, char *argv[]) {
   printf("%s Starting...\n\n", argv[0]);
 
   try {
     std::string sFilename;
     char *filePath;
+    NppStreamContext nppStreamCtx;
+    nppStreamCtx.hStream = 0; // The NULL stream by default, set this to whatever your stream ID is if not the NULL stream.
 
-    findCudaDevice(argc, (const char **)argv);
-
-    if (printfNPPinfo(argc, argv) == false) {
-      exit(EXIT_SUCCESS);
+    cudaError_t cudaError = cudaGetDevice(&nppStreamCtx.nCudaDeviceId);
+    if (cudaError != cudaSuccess)
+    {
+        printf("CUDA error: no devices supporting CUDA.\n");
+        return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
     }
 
+    const NppLibraryVersion *libVer   = nppGetLibVersion();
+
+    printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build);
+
+    int driverVersion, runtimeVersion;
+    cudaDriverGetVersion(&driverVersion);
+    cudaRuntimeGetVersion(&runtimeVersion);
+
+    printf("CUDA Driver  Version: %d.%d\n", driverVersion/1000, (driverVersion%100)/10);
+    printf("CUDA Runtime Version: %d.%d\n\n", runtimeVersion/1000, (runtimeVersion%100)/10);
+
+    cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMajor, 
+                                      cudaDevAttrComputeCapabilityMajor, 
+                                      nppStreamCtx.nCudaDeviceId);
+    if (cudaError != cudaSuccess)
+        return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
+
+    cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMinor, 
+                                      cudaDevAttrComputeCapabilityMinor, 
+                                      nppStreamCtx.nCudaDeviceId);
+    if (cudaError != cudaSuccess)
+        return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
+
+    cudaError = cudaStreamGetFlags(nppStreamCtx.hStream, &nppStreamCtx.nStreamFlags);
+
+    cudaDeviceProp oDeviceProperties;
+
+    cudaError = cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId);
+
+    nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount;
+    nppStreamCtx.nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor;
+    nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock;
+    nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock;
+
     if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
       getCmdLineArgumentString(argc, (const char **)argv, "input", &filePath);
     } else {
@@ -154,10 +171,10 @@ int main(int argc, char *argv[]) {
     NppiPoint oAnchor = {oMaskSize.width / 2, oMaskSize.height / 2};
 
     // run box filter
-    NPP_CHECK_NPP(nppiFilterBoxBorder_8u_C1R(
+    NPP_CHECK_NPP(nppiFilterBoxBorder_8u_C1R_Ctx(
         oDeviceSrc.data(), oDeviceSrc.pitch(), oSrcSize, oSrcOffset,
         oDeviceDst.data(), oDeviceDst.pitch(), oSizeROI, oMaskSize, oAnchor,
-        NPP_BORDER_REPLICATE));
+        NPP_BORDER_REPLICATE, nppStreamCtx));
 
     // declare a host image for the result
     npp::ImageCPU_8u_C1 oHostDst(oDeviceDst.size());
diff --git a/Samples/4_CUDA_Libraries/boxFilterNPP/teapot512_boxFilter.pgm b/Samples/4_CUDA_Libraries/boxFilterNPP/teapot512_boxFilter.pgm
new file mode 100644
index 00000000..a681196d
Binary files /dev/null and b/Samples/4_CUDA_Libraries/boxFilterNPP/teapot512_boxFilter.pgm differ
diff --git a/Samples/4_CUDA_Libraries/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP.cpp b/Samples/4_CUDA_Libraries/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP.cpp
index e823e0b6..4a8ae7f7 100644
--- a/Samples/4_CUDA_Libraries/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP.cpp
+++ b/Samples/4_CUDA_Libraries/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP.cpp
@@ -68,26 +68,6 @@ inline int cudaDeviceInit(int argc, const char **argv) {
   return dev;
 }
 
-bool printfNPPinfo(int argc, char *argv[]) {
-  const NppLibraryVersion *libVer = nppGetLibVersion();
-
-  printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor,
-         libVer->build);
-
-  int driverVersion, runtimeVersion;
-  cudaDriverGetVersion(&driverVersion);
-  cudaRuntimeGetVersion(&runtimeVersion);
-
-  printf("  CUDA Driver  Version: %d.%d\n", driverVersion / 1000,
-         (driverVersion % 100) / 10);
-  printf("  CUDA Runtime Version: %d.%d\n", runtimeVersion / 1000,
-         (runtimeVersion % 100) / 10);
-
-  // Min spec is SM 1.0 devices
-  bool bVal = checkCudaCapabilities(1, 0);
-  return bVal;
-}
-
 int main(int argc, char *argv[]) {
   printf("%s Starting...\n\n", argv[0]);
 
@@ -97,10 +77,50 @@ int main(int argc, char *argv[]) {
 
     cudaDeviceInit(argc, (const char **)argv);
 
-    if (printfNPPinfo(argc, argv) == false) {
-      exit(EXIT_SUCCESS);
+    NppStreamContext nppStreamCtx;
+    nppStreamCtx.hStream = 0; // The NULL stream by default, set this to whatever your stream ID is if not the NULL stream.
+
+    cudaError_t cudaError = cudaGetDevice(&nppStreamCtx.nCudaDeviceId);
+    if (cudaError != cudaSuccess)
+    {
+        printf("CUDA error: no devices supporting CUDA.\n");
+        return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
     }
 
+    const NppLibraryVersion *libVer   = nppGetLibVersion();
+
+    printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build);
+
+    int driverVersion, runtimeVersion;
+    cudaDriverGetVersion(&driverVersion);
+    cudaRuntimeGetVersion(&runtimeVersion);
+
+    printf("CUDA Driver  Version: %d.%d\n", driverVersion/1000, (driverVersion%100)/10);
+    printf("CUDA Runtime Version: %d.%d\n\n", runtimeVersion/1000, (runtimeVersion%100)/10);
+
+    cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMajor, 
+                                      cudaDevAttrComputeCapabilityMajor, 
+                                      nppStreamCtx.nCudaDeviceId);
+    if (cudaError != cudaSuccess)
+        return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
+
+    cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMinor, 
+                                      cudaDevAttrComputeCapabilityMinor, 
+                                      nppStreamCtx.nCudaDeviceId);
+    if (cudaError != cudaSuccess)
+        return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
+
+    cudaError = cudaStreamGetFlags(nppStreamCtx.hStream, &nppStreamCtx.nStreamFlags);
+
+    cudaDeviceProp oDeviceProperties;
+
+    cudaError = cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId);
+
+    nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount;
+    nppStreamCtx.nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor;
+    nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock;
+    nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock;
+
     if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
       getCmdLineArgumentString(argc, (const char **)argv, "input", &filePath);
     } else {
@@ -190,11 +210,11 @@ int main(int argc, char *argv[]) {
     Npp16s nHighThreshold = 256;
 
     if ((nBufferSize > 0) && (pScratchBufferNPP != 0)) {
-      NPP_CHECK_NPP(nppiFilterCannyBorder_8u_C1R(
+      NPP_CHECK_NPP(nppiFilterCannyBorder_8u_C1R_Ctx(
           oDeviceSrc.data(), oDeviceSrc.pitch(), oSrcSize, oSrcOffset,
           oDeviceDst.data(), oDeviceDst.pitch(), oSizeROI, NPP_FILTER_SOBEL,
           NPP_MASK_SIZE_3_X_3, nLowThreshold, nHighThreshold, nppiNormL2,
-          NPP_BORDER_REPLICATE, pScratchBufferNPP));
+          NPP_BORDER_REPLICATE, pScratchBufferNPP, nppStreamCtx));
     }
 
     // free scratch buffer memory
diff --git a/Samples/4_CUDA_Libraries/cannyEdgeDetectorNPP/teapot512_cannyEdgeDetection.pgm b/Samples/4_CUDA_Libraries/cannyEdgeDetectorNPP/teapot512_cannyEdgeDetection.pgm
new file mode 100644
index 00000000..84b090d9
Binary files /dev/null and b/Samples/4_CUDA_Libraries/cannyEdgeDetectorNPP/teapot512_cannyEdgeDetection.pgm differ
diff --git a/Samples/4_CUDA_Libraries/freeImageInteropNPP/freeImageInteropNPP.cpp b/Samples/4_CUDA_Libraries/freeImageInteropNPP/freeImageInteropNPP.cpp
index f657f3b7..43aad888 100644
--- a/Samples/4_CUDA_Libraries/freeImageInteropNPP/freeImageInteropNPP.cpp
+++ b/Samples/4_CUDA_Libraries/freeImageInteropNPP/freeImageInteropNPP.cpp
@@ -70,25 +70,6 @@ inline int cudaDeviceInit(int argc, const char **argv) {
   return dev;
 }
 
-bool printfNPPinfo(int argc, char *argv[], int cudaVerMajor, int cudaVerMinor) {
-  const NppLibraryVersion *libVer = nppGetLibVersion();
-
-  printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor,
-         libVer->build);
-
-  int driverVersion, runtimeVersion;
-  cudaDriverGetVersion(&driverVersion);
-  cudaRuntimeGetVersion(&runtimeVersion);
-
-  printf("  CUDA Driver  Version: %d.%d\n", driverVersion / 1000,
-         (driverVersion % 100) / 10);
-  printf("  CUDA Runtime Version: %d.%d\n", runtimeVersion / 1000,
-         (runtimeVersion % 100) / 10);
-
-  bool bVal = checkCudaCapabilities(cudaVerMajor, cudaVerMinor);
-  return bVal;
-}
-
 // Error handler for FreeImage library.
 //  In case this handler is invoked, it throws an NPP exception.
 extern "C" void FreeImageErrorHandler(FREE_IMAGE_FORMAT oFif,
@@ -157,11 +138,50 @@ int main(int argc, char *argv[]) {
 
     cudaDeviceInit(argc, (const char **)argv);
 
-    // Min spec is SM 1.0 devices
-    if (printfNPPinfo(argc, argv, 1, 0) == false) {
-      exit(EXIT_SUCCESS);
+    NppStreamContext nppStreamCtx;
+    nppStreamCtx.hStream = 0; // The NULL stream by default, set this to whatever your stream ID is if not the NULL stream.
+
+    cudaError_t cudaError = cudaGetDevice(&nppStreamCtx.nCudaDeviceId);
+    if (cudaError != cudaSuccess)
+    {
+        printf("CUDA error: no devices supporting CUDA.\n");
+        return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
     }
 
+    const NppLibraryVersion *libVer   = nppGetLibVersion();
+
+    printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build);
+
+    int driverVersion, runtimeVersion;
+    cudaDriverGetVersion(&driverVersion);
+    cudaRuntimeGetVersion(&runtimeVersion);
+
+    printf("CUDA Driver  Version: %d.%d\n", driverVersion/1000, (driverVersion%100)/10);
+    printf("CUDA Runtime Version: %d.%d\n\n", runtimeVersion/1000, (runtimeVersion%100)/10);
+
+    cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMajor, 
+                                      cudaDevAttrComputeCapabilityMajor, 
+                                      nppStreamCtx.nCudaDeviceId);
+    if (cudaError != cudaSuccess)
+        return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
+
+    cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMinor, 
+                                      cudaDevAttrComputeCapabilityMinor, 
+                                      nppStreamCtx.nCudaDeviceId);
+    if (cudaError != cudaSuccess)
+        return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
+
+    cudaError = cudaStreamGetFlags(nppStreamCtx.hStream, &nppStreamCtx.nStreamFlags);
+
+    cudaDeviceProp oDeviceProperties;
+
+    cudaError = cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId);
+
+    nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount;
+    nppStreamCtx.nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor;
+    nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock;
+    nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock;
+
     if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
       getCmdLineArgumentString(argc, (const char **)argv, "input", &filePath);
     } else {
@@ -260,9 +280,9 @@ int main(int argc, char *argv[]) {
     Npp8u *pDstImageCUDA =
         nppiMalloc_8u_C1(oSizeROI.width, oSizeROI.height, &nDstPitchCUDA);
     NPP_ASSERT_NOT_NULL(pDstImageCUDA);
-    NPP_CHECK_NPP(nppiFilterBox_8u_C1R(pSrcImageCUDA, nSrcPitchCUDA,
-                                       pDstImageCUDA, nDstPitchCUDA, oSizeROI,
-                                       oMaskSize, oMaskAchnor));
+    NPP_CHECK_NPP(nppiFilterBox_8u_C1R_Ctx(pSrcImageCUDA, nSrcPitchCUDA,
+                                           pDstImageCUDA, nDstPitchCUDA, oSizeROI,
+                                           oMaskSize, oMaskAchnor, nppStreamCtx));
     // create the result image storage using FreeImage so we can easily
     // save
     FIBITMAP *pResultBitmap = FreeImage_Allocate(
diff --git a/Samples/4_CUDA_Libraries/histEqualizationNPP/histEqualizationNPP.cpp b/Samples/4_CUDA_Libraries/histEqualizationNPP/histEqualizationNPP.cpp
index ae897b27..1869feb7 100644
--- a/Samples/4_CUDA_Libraries/histEqualizationNPP/histEqualizationNPP.cpp
+++ b/Samples/4_CUDA_Libraries/histEqualizationNPP/histEqualizationNPP.cpp
@@ -73,26 +73,6 @@ inline int cudaDeviceInit(int argc, const char **argv) {
   return dev;
 }
 
-bool printfNPPinfo(int argc, char *argv[]) {
-  const NppLibraryVersion *libVer = nppGetLibVersion();
-
-  printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor,
-         libVer->build);
-
-  int driverVersion, runtimeVersion;
-  cudaDriverGetVersion(&driverVersion);
-  cudaRuntimeGetVersion(&runtimeVersion);
-
-  printf("  CUDA Driver  Version: %d.%d\n", driverVersion / 1000,
-         (driverVersion % 100) / 10);
-  printf("  CUDA Runtime Version: %d.%d\n", runtimeVersion / 1000,
-         (runtimeVersion % 100) / 10);
-
-  // Min spec is SM 1.1 devices
-  bool bVal = checkCudaCapabilities(1, 1);
-  return bVal;
-}
-
 int main(int argc, char *argv[]) {
   printf("%s Starting...\n\n", argv[0]);
 
@@ -102,10 +82,50 @@ int main(int argc, char *argv[]) {
 
     cudaDeviceInit(argc, (const char **)argv);
 
-    if (printfNPPinfo(argc, argv) == false) {
-      exit(EXIT_SUCCESS);
+    NppStreamContext nppStreamCtx;
+    nppStreamCtx.hStream = 0; // The NULL stream by default, set this to whatever your stream ID is if not the NULL stream.
+
+    cudaError_t cudaError = cudaGetDevice(&nppStreamCtx.nCudaDeviceId);
+    if (cudaError != cudaSuccess)
+    {
+        printf("CUDA error: no devices supporting CUDA.\n");
+        return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
     }
 
+    const NppLibraryVersion *libVer   = nppGetLibVersion();
+
+    printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build);
+
+    int driverVersion, runtimeVersion;
+    cudaDriverGetVersion(&driverVersion);
+    cudaRuntimeGetVersion(&runtimeVersion);
+
+    printf("CUDA Driver  Version: %d.%d\n", driverVersion/1000, (driverVersion%100)/10);
+    printf("CUDA Runtime Version: %d.%d\n\n", runtimeVersion/1000, (runtimeVersion%100)/10);
+
+    cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMajor, 
+                                      cudaDevAttrComputeCapabilityMajor, 
+                                      nppStreamCtx.nCudaDeviceId);
+    if (cudaError != cudaSuccess)
+        return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
+
+    cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMinor, 
+                                      cudaDevAttrComputeCapabilityMinor, 
+                                      nppStreamCtx.nCudaDeviceId);
+    if (cudaError != cudaSuccess)
+        return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
+
+    cudaError = cudaStreamGetFlags(nppStreamCtx.hStream, &nppStreamCtx.nStreamFlags);
+
+    cudaDeviceProp oDeviceProperties;
+
+    cudaError = cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId);
+
+    nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount;
+    nppStreamCtx.nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor;
+    nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock;
+    nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock;
+
     if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
       getCmdLineArgumentString(argc, (const char **)argv, "input", &filePath);
     } else {
@@ -182,8 +202,9 @@ int main(int argc, char *argv[]) {
                          (int)oDeviceSrc.height()};  // full image
     // create device scratch buffer for nppiHistogram
     size_t nDeviceBufferSize;
-    nppiHistogramEvenGetBufferSize_8u_C1R(oSizeROI, levelCount,
-                                          &nDeviceBufferSize);
+    nppiHistogramEvenGetBufferSize_8u_C1R_Ctx(oSizeROI, levelCount,
+                                              &nDeviceBufferSize,
+                                              nppStreamCtx);
     Npp8u *pDeviceBuffer;
     NPP_CHECK_CUDA(cudaMalloc((void **)&pDeviceBuffer, nDeviceBufferSize));
 
@@ -191,9 +212,9 @@ int main(int argc, char *argv[]) {
     Npp32s levelsHost[levelCount];
     NPP_CHECK_NPP(nppiEvenLevelsHost_32s(levelsHost, levelCount, 0, binCount));
     // compute the histogram
-    NPP_CHECK_NPP(nppiHistogramEven_8u_C1R(
+    NPP_CHECK_NPP(nppiHistogramEven_8u_C1R_Ctx(
         oDeviceSrc.data(), oDeviceSrc.pitch(), oSizeROI, histDevice, levelCount,
-        0, binCount, pDeviceBuffer));
+        0, binCount, pDeviceBuffer, nppStreamCtx));
     // copy histogram and levels to host memory
     Npp32s histHost[binCount];
     NPP_CHECK_CUDA(cudaMemcpy(histHost, histDevice, binCount * sizeof(Npp32s),
@@ -254,20 +275,22 @@ int main(int argc, char *argv[]) {
                               sizeof(Npp32s) * (levelCount),
                               cudaMemcpyHostToDevice));
 
-    NPP_CHECK_NPP(nppiLUT_Linear_8u_C1R(
+    NPP_CHECK_NPP(nppiLUT_Linear_8u_C1R_Ctx(
         oDeviceSrc.data(), oDeviceSrc.pitch(), oDeviceDst.data(),
         oDeviceDst.pitch(), oSizeROI,
-        lutDevice,  // value and level arrays are in GPU device memory
-        lvlsDevice, levelCount));
+        lutDevice,  // value and level arrays are in host memory
+        lvlsDevice, levelCount,
+        nppStreamCtx));
 
     NPP_CHECK_CUDA(cudaFree(lutDevice));
     NPP_CHECK_CUDA(cudaFree(lvlsDevice));
 #else
-    NPP_CHECK_NPP(nppiLUT_Linear_8u_C1R(
+    NPP_CHECK_NPP(nppiLUT_Linear_8u_C1R_Ctx(
         oDeviceSrc.data(), oDeviceSrc.pitch(), oDeviceDst.data(),
         oDeviceDst.pitch(), oSizeROI,
         lutHost,  // value and level arrays are in host memory
-        levelsHost, levelCount));
+        levelsHost, levelCount,
+        nppStreamCtx));
 #endif
 
     // copy the result image back into the storage that contained the
diff --git a/Samples/4_CUDA_Libraries/watershedSegmentationNPP/watershedSegmentationNPP.cpp b/Samples/4_CUDA_Libraries/watershedSegmentationNPP/watershedSegmentationNPP.cpp
index e6800ab2..825cea6f 100644
--- a/Samples/4_CUDA_Libraries/watershedSegmentationNPP/watershedSegmentationNPP.cpp
+++ b/Samples/4_CUDA_Libraries/watershedSegmentationNPP/watershedSegmentationNPP.cpp
@@ -328,9 +328,10 @@ main( int argc, char** argv )
 
             int nCompressedLabelCount = 0;
 
-            nppStatus = nppiCompressMarkerLabelsUF_32u_C1IR(pSegmentLabelsOutputBufferDev[nImage], oSizeROI[nImage].width * sizeof(Npp32u), oSizeROI[nImage], 
-                                                            oSizeROI[nImage].width * oSizeROI[nImage].height, &nCompressedLabelCount, 
-                                                            pCompressedLabelsScratchBufferDev);
+            nppStatus = nppiCompressMarkerLabelsUF_32u_C1IR_Ctx(pSegmentLabelsOutputBufferDev[nImage], oSizeROI[nImage].width * sizeof(Npp32u), oSizeROI[nImage], 
+                                                                oSizeROI[nImage].width * oSizeROI[nImage].height, &nCompressedLabelCount, 
+                                                                pCompressedLabelsScratchBufferDev,
+                                                                nppStreamCtx);
 
             if (nppStatus != NPP_SUCCESS)  
             {
@@ -539,6 +540,3 @@ main( int argc, char** argv )
 
     return 0;
 }
-
-
-