Merge pull request #10 from XSShawnZeng/Tegra_Samples_Cmake_Transition

Add Tegra sample cudaNvSciBufMultiplanar
2025-08-24 07:50:31 +08:00 · 2025-01-16 09:01:44 -08:00 · 2025-01-16 09:01:44 -08:00 · 1a466282da
commit 1a466282da
parent b518bfe9be 545194e7aa
12 changed files with 884 additions and 0 deletions
--- a/Samples/8_Platform_Specific/Tegra/CMakeLists.txt
+++ b/Samples/8_Platform_Specific/Tegra/CMakeLists.txt
@ -1,4 +1,5 @@
 # Tegra platform-specific samples: each entry adds one sample subdirectory to the build.
 add_subdirectory(cudaNvSciNvMedia)
 add_subdirectory(cudaNvSciBufMultiplanar)
 add_subdirectory(cuDLAErrorReporting)
 add_subdirectory(cuDLAHybridMode)
 add_subdirectory(cuDLALayerwiseStatsHybrid)
--- a/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/.vscode/c_cpp_properties.json
+++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/.vscode/c_cpp_properties.json
@ -0,0 +1,18 @@
 {
    "configurations": [
        {
            "name": "Linux",
            "includePath": [
                "${workspaceFolder}/**",
                "${workspaceFolder}/../../../../Common"
            ],
            "defines": [],
            "compilerPath": "/usr/local/cuda/bin/nvcc",
            "cStandard": "gnu17",
            "cppStandard": "gnu++14",
            "intelliSenseMode": "linux-gcc-x64",
            "configurationProvider": "ms-vscode.makefile-tools"
        }
    ],
    "version": 4
 }
--- a/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/.vscode/extensions.json
+++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/.vscode/extensions.json
@ -0,0 +1,7 @@
 {
    "recommendations": [
        "nvidia.nsight-vscode-edition",
        "ms-vscode.cpptools",
        "ms-vscode.makefile-tools"
    ]
 }
--- a/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/.vscode/launch.json
+++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/.vscode/launch.json
@ -0,0 +1,10 @@
 {
    "configurations": [
        {
            "name": "CUDA C++: Launch",
            "type": "cuda-gdb",
            "request": "launch",
            "program": "${workspaceFolder}/cudaNvSciBufMultiplanar"
        }
    ]
 }
--- a/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/.vscode/tasks.json
+++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/.vscode/tasks.json
@ -0,0 +1,15 @@
 {
    "version": "2.0.0",
    "tasks": [
        {
            "label": "sample",
            "type": "shell",
            "command": "make dbg=1",
            "problemMatcher": ["$nvcc"],
            "group": {
                "kind": "build",
                "isDefault": true
            }
        }
    ]
 }
--- a/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/CMakeLists.txt
+++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/CMakeLists.txt
@ -0,0 +1,74 @@
 # Build script for the cudaNvSciBufMultiplanar sample.
 # The sample is only built on Linux and only when the NvSciBuf/NvSciSync
 # libraries and headers can be located; otherwise a status message is printed.
 cmake_minimum_required(VERSION 3.20)
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../../cmake/Modules")
 project(cudaNvSciBufMultiplanar LANGUAGES C CXX CUDA)
 find_package(CUDAToolkit REQUIRED)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 set(CMAKE_CUDA_ARCHITECTURES 53 61 70 72 75 80 86 87 90)
 if(CMAKE_BUILD_TYPE STREQUAL "Debug")
    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # enable cuda-gdb (expensive)
 endif()
 # Include directories and libraries
 include_directories(../../../../Common)
 if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
    # Find the NVSCI libraries
    # use CMAKE_LIBRARY_PATH so that users can also specify the NVSCI lib path in cmake command
    set(CMAKE_LIBRARY_PATH "/usr/lib" ${CMAKE_LIBRARY_PATH})
    # CMAKE_LIBRARY_PATH is a list: build one glob pattern per entry so that
    # user-supplied directories are searched as well as /usr/lib.
    set(NVSCIBUF_GLOBS "")
    set(NVSCISYNC_GLOBS "")
    foreach(nvsci_lib_dir IN LISTS CMAKE_LIBRARY_PATH)
        list(APPEND NVSCIBUF_GLOBS "${nvsci_lib_dir}/*/libnvscibuf.so")
        list(APPEND NVSCISYNC_GLOBS "${nvsci_lib_dir}/*/libnvscisync.so")
    endforeach()
    file(GLOB_RECURSE NVSCIBUF_LIB ${NVSCIBUF_GLOBS})
    file(GLOB_RECURSE NVSCISYNC_LIB ${NVSCISYNC_GLOBS})
    # Find the NVSCI header files
    # use CMAKE_INCLUDE_PATH so that users can also specify the NVSCI include path in cmake command
    set(CMAKE_INCLUDE_PATH "/usr/include" ${CMAKE_INCLUDE_PATH})
    find_path(NVSCIBUF_INCLUDE_DIR nvscibuf.h PATHS ${CMAKE_INCLUDE_PATH})
    find_path(NVSCISYNC_INCLUDE_DIR nvscisync.h PATHS ${CMAKE_INCLUDE_PATH})
    if(NVSCIBUF_LIB AND NVSCISYNC_LIB AND NVSCIBUF_INCLUDE_DIR AND NVSCISYNC_INCLUDE_DIR)
        message(STATUS "FOUND NVSCI libs: ${NVSCIBUF_LIB} ${NVSCISYNC_LIB}")
        message(STATUS "Using NVSCI headers path: ${NVSCIBUF_INCLUDE_DIR} ${NVSCISYNC_INCLUDE_DIR}")
        # Source file
        # Add target for cudaNvSciBufMultiplanar
        add_executable(cudaNvSciBufMultiplanar imageKernels.cu cudaNvSciBufMultiplanar.cpp main.cpp)
        target_compile_options(cudaNvSciBufMultiplanar PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
        target_compile_features(cudaNvSciBufMultiplanar PRIVATE cxx_std_17 cuda_std_17)
        set_target_properties(cudaNvSciBufMultiplanar PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
        target_include_directories(cudaNvSciBufMultiplanar PUBLIC
            ${CUDAToolkit_INCLUDE_DIRS}
            ${NVSCIBUF_INCLUDE_DIR}
            ${NVSCISYNC_INCLUDE_DIR}
        )
        target_link_libraries(cudaNvSciBufMultiplanar
            CUDA::cuda_driver
            ${NVSCIBUF_LIB}
            ${NVSCISYNC_LIB}
        )
        # Copy yuv_planar_img1.yuv to the output directory
        add_custom_command(TARGET cudaNvSciBufMultiplanar POST_BUILD
            COMMAND ${CMAKE_COMMAND} -E copy_if_different
            ${CMAKE_CURRENT_SOURCE_DIR}/yuv_planar_img1.yuv ${CMAKE_CURRENT_BINARY_DIR}/yuv_planar_img1.yuv
        )
        # Specify additional clean files
        set_target_properties(cudaNvSciBufMultiplanar PROPERTIES
            ADDITIONAL_CLEAN_FILES "image_out.yuv"
        )
    else()
        message(STATUS "NvSCI not found - will not build sample 'cudaNvSciBufMultiplanar'")
    endif()
 else()
    message(STATUS "Will not build sample cudaNvSciBufMultiplanar - requires Linux OS")
 endif()
--- a/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/README.md
+++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/README.md
@ -0,0 +1,64 @@
 # cudaNvSciBufMultiplanar - CUDA NvSciBufMultiplanar Image Samples
 ## Description
 This sample demonstrates CUDA-NvSciBuf interop for multiplanar images. A buffer for a YUV 420 multiplanar image is allocated using NvSciBuf APIs and imported into CUDA with CUDA External Resource Interoperability. The bits of the input image are flipped on the CPU and the result is copied into the CUDA arrays mapped from that buffer. A CUDA surface is then created from each mapped CUDA array and the bits are flipped a second time on the surface. The result is copied back to a YUV image file, which is compared against the input.
 ## Key Concepts
 CUDA NvSci Interop, Data Parallel Algorithms, Image Processing
 ## Supported SM Architectures
 [SM 6.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)  [SM 8.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 8.6 ](https://developer.nvidia.com/cuda-gpus)  [SM 8.7 ](https://developer.nvidia.com/cuda-gpus)  [SM 8.9 ](https://developer.nvidia.com/cuda-gpus)  [SM 9.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 10.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 10.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 12.0 ](https://developer.nvidia.com/cuda-gpus)
 ## Supported OSes
 Linux
 ## Supported CPU Architecture
 aarch64
 ## CUDA APIs involved
 ### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
 cudaDeviceGetAttribute, cudaDestroyExternalMemory, cuDriverGetVersion, cuDeviceGetUuid, cudaSetDevice, cudaGetMipmappedArrayLevel, cudaFreeMipmappedArray, cudaImportExternalMemory, cudaCreateChannelDesc, cudaExternalMemoryGetMappedMipmappedArray, cuCtxSynchronize, cudaMemcpy2DToArray, cudaMemcpy2DFromArray
 ## Dependencies needed to build/run
 [NVSCI](../../../README.md#nvsci)
 ## Prerequisites
 Download and install the [CUDA Toolkit 12.8](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed.
 ## Build and Run
 ### Linux
 The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
 ```
 $ cd <sample_dir>
 $ make
 ```
 The samples makefiles can take advantage of certain options:
 *  **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are aarch64.
    By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
 `$ make TARGET_ARCH=aarch64` <br/>
    See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
 *   **dbg=1** - build with debug symbols
    ```
    $ make dbg=1
    ```
 *   **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
    ```
    $ make SMS="50 60"
    ```
 *  **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
 ```
    $ make HOST_COMPILER=g++
 ```
 ## References (for more details)
--- a/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/cudaNvSciBufMultiplanar.cpp
+++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/cudaNvSciBufMultiplanar.cpp
@ -0,0 +1,435 @@
 /* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #include "cudaNvSciBufMultiplanar.h"
 // NvSciBuf module handle; opened in the cudaNvSciBufMultiplanar constructor and
 // used by Caller::init() to create attribute lists.
 NvSciBufModule module;
 // Buffer object allocated in reconcileAttrList() and imported by each Caller.
 NvSciBufObj buffObj;
 // UUID of the selected CUDA device; filled in by initCuda() and copied into the
 // GpuId attribute in Caller::setAttrListImageMultiPlanes().
 CUuuid uuid;
 // Inverts every bit of the first `size` bytes of pBuff, in place.
 void flipBits(uint8_t *pBuff, uint32_t size) {
    uint8_t *const end = pBuff + size;
    for (uint8_t *cur = pBuff; cur != end; ++cur) {
        *cur = static_cast<uint8_t>(~(*cur));
    }
 }
 // Compare input and generated image files.
 // Both files are read byte by byte; the first mismatch (including one file
 // ending before the other) is reported as FAILURE, otherwise SUCCESS is printed.
 // The process exits if either file cannot be opened.
 void compareFiles(std::string &path1, std::string &path2) {
    FILE *first = fopen(path1.c_str(), "rb");
    FILE *second = fopen(path2.c_str(), "rb");
    if (first == NULL) {
        printf("File %s open failed in %s line %d\n", path1.c_str(), __FILE__, __LINE__);
        exit(EXIT_FAILURE);
    }
    if (second == NULL) {
        printf("File %s open failed in %s line %d\n", path2.c_str(), __FILE__, __LINE__);
        exit(EXIT_FAILURE);
    }

    bool identical = true;
    for (;;) {
        const int a = getc(first);
        const int b = getc(second);
        if (a != b) {
            identical = false;
            break;
        }
        // a == b here, so reaching EOF on one file means both are exhausted.
        if (a == EOF) {
            break;
        }
    }

    if (!identical) {
        printf("Input file : %s and output file : %s match FAILURE\n", path1.c_str(), path2.c_str());
    }
    else {
        printf("Input file : %s and output file : %s match SUCCESS\n", path1.c_str(), path2.c_str());
    }

    fclose(first);
    fclose(second);
 }
 // Prepares the caller: clears the output attribute list handle and creates
 // this caller's attribute list on the global NvSciBuf module.
 void Caller::init() {
    attrListOut = nullptr;
    checkNvSciErrors(NvSciBufAttrListCreate(module, &attrList));
 }
 // Releases the resources held by this caller.
 void Caller::deinit() {
    // Free the attribute list created in init().
    NvSciBufAttrListFree(attrList);
    // Destroy the external memory handle imported in copyExtMemToMultiPlanarArrays().
    checkCudaErrors(cudaDestroyExternalMemory(extMem));
 }
 // Set NvSciBufImage attribute values in the attribute list.
 //
 // Describes a three-plane (Y, V, U) block-linear YUV 420 image with read/write
 // permission and no CPU access, bound to the GPU identified by the global `uuid`.
 //   imageWidth  - width of the luma plane in pixels; chroma planes use half of it
 //   imageHeight - height of the luma plane in pixels; chroma planes use half of it
 // Exits the process if NvSciBufAttrListSetAttrs fails.
 void Caller::setAttrListImageMultiPlanes(int imageWidth, int imageHeight) {
    NvSciBufType bufType = NvSciBufType_Image;
    NvSciBufAttrValImageLayoutType layout = NvSciBufImage_BlockLinearType;
    bool cpuAccessFlag = false;
    NvSciBufAttrValAccessPerm perm = NvSciBufAccessPerm_ReadWrite;
    NvSciRmGpuId gpuid;
    bool vpr = false;
    int32_t planeCount = PLANAR_NUM_PLANES;
    // Dimensions of the imported image in the YUV 420 planar format
    int32_t planeWidths[] = {imageWidth, imageWidth/2, imageWidth/2};
    int32_t planeHeights[] = {imageHeight, imageHeight/2, imageHeight/2};
    NvSciBufAttrValColorFmt planeColorFmts[] =
            { NvSciColor_Y8, NvSciColor_V8, NvSciColor_U8 };
    NvSciBufAttrValImageScanType planeScanType[] =
            { NvSciBufScan_ProgressiveType };
    // The GPU id attribute carries the raw bytes of the CUDA device UUID.
    memcpy(&gpuid.bytes, &uuid.bytes, sizeof(uuid.bytes));
    NvSciBufAttrKeyValuePair imgBuffAttrsArr[] = {
        { NvSciBufGeneralAttrKey_Types, &bufType, sizeof(bufType) },
        { NvSciBufGeneralAttrKey_NeedCpuAccess, &cpuAccessFlag,
            sizeof(cpuAccessFlag) },
        { NvSciBufGeneralAttrKey_RequiredPerm, &perm, sizeof(perm) },
        { NvSciBufGeneralAttrKey_GpuId, &gpuid, sizeof(gpuid) },
        { NvSciBufImageAttrKey_Layout, &layout, sizeof(layout) },
        { NvSciBufImageAttrKey_VprFlag, &vpr, sizeof(vpr) },
        { NvSciBufImageAttrKey_PlaneCount, &planeCount, sizeof(planeCount) },
        { NvSciBufImageAttrKey_PlaneColorFormat, planeColorFmts,
            sizeof(planeColorFmts) },
        { NvSciBufImageAttrKey_PlaneWidth, planeWidths, sizeof(planeWidths) },
        { NvSciBufImageAttrKey_PlaneHeight, planeHeights,
            sizeof(planeHeights) },
        { NvSciBufImageAttrKey_PlaneScanType, planeScanType,
            sizeof(planeScanType) },
    };
    // Pass the array directly: no intermediate copy, so the number of attributes
    // is not limited by a separately sized scratch buffer.
    const size_t attrCount = sizeof(imgBuffAttrsArr) / sizeof(imgBuffAttrsArr[0]);
    checkNvSciErrors(NvSciBufAttrListSetAttrs(attrList, imgBuffAttrsArr, attrCount));
 }
 // Stores the image dimensions, opens the global NvSciBuf module and initializes
 // CUDA on the first device id in `deviceIds`.
 //   width, height - dimensions of the luma plane in pixels
 //   deviceIds     - candidate CUDA device ordinals; only the first one is used
 // Exits the process if `deviceIds` is empty or if module/CUDA setup fails.
 cudaNvSciBufMultiplanar::cudaNvSciBufMultiplanar(size_t width, size_t height, std::vector<int> &deviceIds)
    : imageWidth(width),
      imageHeight(height) {
        // Indexing an empty vector is undefined behaviour; fail with a message instead.
        if (deviceIds.empty()) {
            printf("No CUDA device id supplied in %s line %d\n", __FILE__, __LINE__);
            exit(EXIT_FAILURE);
        }
        mCudaDeviceId =  deviceIds[0];
        attrListReconciled = NULL;
        attrListConflict = NULL;
        checkNvSciErrors(NvSciBufModuleOpen(&module));
        initCuda(mCudaDeviceId);
    }
 void cudaNvSciBufMultiplanar::initCuda(int devId) {
    int major = 0, minor = 0, drvVersion;
    NvSciRmGpuId gpuid;
    checkCudaErrors(cudaSetDevice(mCudaDeviceId));
    checkCudaErrors(cudaDeviceGetAttribute(
        &major, cudaDevAttrComputeCapabilityMajor, mCudaDeviceId));
    checkCudaErrors(cudaDeviceGetAttribute(
        &minor, cudaDevAttrComputeCapabilityMinor, mCudaDeviceId));
    printf(
        "[cudaNvSciBufMultiplanar] GPU Device %d: \"%s\" with compute capability "
        "%d.%d\n\n",
        mCudaDeviceId, _ConvertSMVer2ArchName(major, minor), major, minor);
    checkCudaDrvErrors(cuDriverGetVersion(&drvVersion));
    if (drvVersion <= 11030) {
        checkCudaDrvErrors(cuDeviceGetUuid(&uuid, devId));
    } else {
        checkCudaDrvErrors(cuDeviceGetUuid_v2(&uuid, devId));
    }
 }
 /*
 Runs the whole sample in a single thread.
 Caller1 reads the input YUV file, flips its bits on the host and copies the result
 into the CUDA arrays mapped from the NvSciBuf object. Caller2 maps the same buffer
 object, flips the bits again through the surface kernel and writes the arrays to
 the output YUV file. The original image is then compared with the double
 bit-flipped image.
   imageFilename    - path of the input YUV 420 planar image
   imageFilenameOut - path of the output file to write
 */
 void cudaNvSciBufMultiplanar::runCudaNvSciBufPlanar(std::string &imageFilename, std::string &imageFilenameOut) {
    // Level-0 CUDA arrays of each plane, one set per caller.
    cudaArray_t levelArray1[PLANAR_NUM_PLANES];
    cudaArray_t levelArray2[PLANAR_NUM_PLANES];
    Caller caller1;
    Caller caller2;
    int numPlanes = PLANAR_NUM_PLANES;
    caller1.init();
    caller2.init();
    // Set NvSciBufImage attribute values in the attribute list
    caller1.setAttrListImageMultiPlanes(imageWidth, imageHeight);
    caller2.setAttrListImageMultiPlanes(imageWidth, imageHeight);
    // Reconcile attribute lists and allocate NvSciBuf object
    reconcileAttrList(&caller1.attrList, &caller2.attrList); 
    // Caller1: import the buffer, fetch the level-0 arrays, then upload the flipped image.
    caller1.copyExtMemToMultiPlanarArrays();
    for (int i = 0; i < numPlanes; i++) {
        checkCudaErrors(cudaGetMipmappedArrayLevel(&levelArray1[i], caller1.multiPlanarArray[i], 0)); 
    }
    caller1.copyYUVToCudaArrayAndFlipBits(imageFilename, levelArray1);
    // Caller2: import the same buffer object and fetch its own level-0 arrays.
    caller2.copyExtMemToMultiPlanarArrays();
    for (int i = 0; i < numPlanes; i++) {
        checkCudaErrors(cudaGetMipmappedArrayLevel(&levelArray2[i], caller2.multiPlanarArray[i], 0)); 
    }
    // Maps cudaArray to surface memory and launches a kernel to flip bits
    launchFlipSurfaceBitsKernel(levelArray2, caller2.multiPlanarWidth, caller2.multiPlanarHeight, numPlanes);
    // Synchronization can be done using nvSciSync when non CUDA callers and cross-process signaler-waiter 
    // applications are involved. Please refer to the cudaNvSci sample library for more details.
    checkCudaDrvErrors(cuCtxSynchronize());
    printf("Bit flip of the surface memory done\n");
    caller2.copyCudaArrayToYUV(imageFilenameOut, levelArray2);
    compareFiles(imageFilename, imageFilenameOut);
    // Release memory
    printf("Releasing memory\n");
    for (int i = 0; i < numPlanes; i++) {
        checkCudaErrors(cudaFreeMipmappedArray(caller1.multiPlanarArray[i]));
        checkCudaErrors(cudaFreeMipmappedArray(caller2.multiPlanarArray[i]));
    }
    // Frees the per-caller resources and the shared buffer object.
    tearDown(&caller1, &caller2);
 }
 // Map NvSciBufObj to cudaMipmappedArray
 void Caller::copyExtMemToMultiPlanarArrays() {
    checkNvSciErrors(NvSciBufObjGetAttrList(buffObj, &attrListOut));
    memset(pairArrayOut, 0, sizeof(NvSciBufAttrKeyValuePair) * PLANE_ATTR_SIZE);
    cudaExternalMemoryHandleDesc memHandleDesc;
    cudaExternalMemoryMipmappedArrayDesc mipmapDesc = {0};
    cudaChannelFormatDesc desc = {0};
    cudaExtent extent = {0};
    pairArrayOut[PLANE_SIZE].key   = NvSciBufImageAttrKey_Size;                       // Datatype: @c uint64_t
    pairArrayOut[PLANE_ALIGNED_SIZE].key   = NvSciBufImageAttrKey_PlaneAlignedSize;   // Datatype: @c uint64_t[]
    pairArrayOut[PLANE_OFFSET].key   = NvSciBufImageAttrKey_PlaneOffset;              // Datatype: @c uint64_t[]
    pairArrayOut[PLANE_HEIGHT].key   = NvSciBufImageAttrKey_PlaneHeight;              // Datatype: @c uint32_t[]
    pairArrayOut[PLANE_WIDTH].key   = NvSciBufImageAttrKey_PlaneWidth;                // Datatype: @c int32_t[]
    pairArrayOut[PLANE_CHANNEL_COUNT].key   = NvSciBufImageAttrKey_PlaneChannelCount; // Datatype: @c uint8_t
    pairArrayOut[PLANE_BITS_PER_PIXEL].key   = NvSciBufImageAttrKey_PlaneBitsPerPixel;// Datatype: @c uint32_t[]
    pairArrayOut[PLANE_COUNT].key   = NvSciBufImageAttrKey_PlaneCount;                // Datatype: @c uint32_t
    checkNvSciErrors(NvSciBufAttrListGetAttrs(attrListOut, pairArrayOut, (PLANE_ATTR_SIZE)));  
    uint64_t size = *(uint64_t*)pairArrayOut[PLANE_SIZE].value;
    uint64_t *planeAlignedSize = (uint64_t*)pairArrayOut[PLANE_ALIGNED_SIZE].value;
    int32_t *planeWidth = (int32_t*)pairArrayOut[PLANE_WIDTH].value;
    int32_t *planeHeight = (int32_t*)pairArrayOut[PLANE_HEIGHT].value;
    uint64_t *planeOffset = (uint64_t*)pairArrayOut[PLANE_OFFSET].value;
    uint8_t planeChannelCount = *(uint8_t*)pairArrayOut[PLANE_CHANNEL_COUNT].value;
    uint32_t *planeBitsPerPixel = (uint32_t*)pairArrayOut[PLANE_BITS_PER_PIXEL].value;
    uint32_t planeCount = *(uint32_t*)pairArrayOut[PLANE_COUNT].value;
    numPlanes = planeCount;
    for (int i = 0; i < numPlanes; i++) {
        multiPlanarWidth[i] = planeWidth[i];
        multiPlanarHeight[i] = planeHeight[i];
    }
    memset(&memHandleDesc, 0, sizeof(memHandleDesc));
    memHandleDesc.type = cudaExternalMemoryHandleTypeNvSciBuf;
    memHandleDesc.handle.nvSciBufObject = buffObj;
    memHandleDesc.size = size;
    checkCudaErrors(cudaImportExternalMemory(&extMem, &memHandleDesc));
    desc = cudaCreateChannelDesc(planeBitsPerPixel[0], 0, 0, 0, cudaChannelFormatKindUnsigned);
    memset(&mipmapDesc, 0, sizeof(mipmapDesc));
    mipmapDesc.numLevels = 1; 
    for (int i = 0; i < numPlanes; i++) {    
        memset(&extent, 0, sizeof(extent));
        extent.width = planeWidth[i];
        extent.height = planeHeight[i];
        extent.depth = 0;
        mipmapDesc.offset = planeOffset[i]; 
        mipmapDesc.formatDesc = desc;
        mipmapDesc.extent = extent;
        mipmapDesc.flags = cudaArraySurfaceLoadStore;;
        checkCudaErrors(cudaExternalMemoryGetMappedMipmappedArray(&multiPlanarArray[i], extMem, &mipmapDesc));
    }
 }
 // Reconciles the two callers' attribute lists and allocates the global buffer
 // object from the reconciled list.
 //   attrList1, attrList2 - attribute lists of the two callers
 // Exits the process if reconciliation fails, if the resulting list is not
 // reported as reconciled, or if the buffer allocation fails.
 void cudaNvSciBufMultiplanar::reconcileAttrList(NvSciBufAttrList *attrList1, NvSciBufAttrList *attrList2) {
    attrList[0] = *attrList1;
    attrList[1] = *attrList2;
    bool isReconciled = false;
    checkNvSciErrors(NvSciBufAttrListReconcile(attrList, 2, &attrListReconciled, &attrListConflict));
    checkNvSciErrors(NvSciBufAttrListIsReconciled(attrListReconciled, &isReconciled));
    // The query result was previously ignored; do not allocate from a list
    // that is not actually reconciled.
    if (!isReconciled) {
        printf("NvSciBufAttrList is not reconciled in %s line %d\n", __FILE__, __LINE__);
        exit(EXIT_FAILURE);
    }
    checkNvSciErrors(NvSciBufObjAlloc(attrListReconciled, &buffObj));
    printf("NvSciBufAttrList reconciled\n");
 }
 // YUV 420 image is flipped and copied to cuda Array which is mapped to nvsciBuf
 void Caller::copyYUVToCudaArrayAndFlipBits(std::string &path, cudaArray_t *cudaArr) {
    FILE *fp = NULL;
    uint8_t *pYBuff, *pUBuff, *pVBuff, *pChroma;
    uint8_t *pBuff = NULL; 
    uint32_t uvOffset[numPlanes] = {0}, copyWidthInBytes[numPlanes] = {0}, copyHeight[numPlanes] = {0};
    uint32_t width = multiPlanarWidth[0];
    uint32_t height = multiPlanarHeight[0];
    fp = fopen(path.c_str(), "rb");
    if (!fp) {
        printf("CudaProducer: Error opening file: %s in %s line %d\n", path.c_str(), __FILE__, __LINE__);
        exit(EXIT_FAILURE); 
    }
    pBuff = (uint8_t*)malloc((width * height * PLANAR_CHROMA_WIDTH_ORDER * PLANAR_CHROMA_HEIGHT_ORDER) * sizeof(unsigned char));
    if (!pBuff) {
        printf("CudaProducer: Failed to allocate image buffer in %s line %d\n", __FILE__, __LINE__);
        exit(EXIT_FAILURE); 
    }
    // Y V U order in the buffer. Fully planar formats use 
    // three planes to store the Y, Cb and Cr components separately.
    pYBuff = pBuff; 
    pVBuff = pYBuff + width * height;
    pUBuff = pVBuff + (width / PLANAR_CHROMA_WIDTH_ORDER)  * (height / PLANAR_CHROMA_HEIGHT_ORDER);
    for (uint32_t i = 0; i < height; i++) {
        if (fread(pYBuff, width, 1, fp) != 1) {
            printf("ReadYUVFrame: Error reading file: %s in %s line %d\n", path.c_str(), __FILE__, __LINE__);
            exit(EXIT_FAILURE); 
        }
        flipBits(pYBuff, width);
        pYBuff += width;
    }
    pChroma = pVBuff;
    for (uint32_t i = 0; i < height / PLANAR_CHROMA_HEIGHT_ORDER; i++) {
        if (fread(pChroma, width / PLANAR_CHROMA_WIDTH_ORDER, 1, fp) != 1) {
            printf("ReadYUVFrame: Error reading file: %s in %s line %d\n", path.c_str(), __FILE__, __LINE__);
            exit(EXIT_FAILURE);
        }
        flipBits(pChroma, width);
        pChroma += width / PLANAR_CHROMA_WIDTH_ORDER;
    }
    pChroma = pUBuff;
    for (uint32_t i = 0; i < height / PLANAR_CHROMA_HEIGHT_ORDER; i++) {
        if (fread(pChroma, width / PLANAR_CHROMA_WIDTH_ORDER, 1, fp) != 1) {
            printf("ReadYUVFrame: Error reading file: %s in %s line %d\n", path.c_str(), __FILE__, __LINE__);
            exit(EXIT_FAILURE);
        }
        flipBits(pChroma, width);
        pChroma += width / PLANAR_CHROMA_WIDTH_ORDER;
    }
    uvOffset[0] = 0;
    copyHeight[0] = height;
    copyHeight[1] = height / PLANAR_CHROMA_HEIGHT_ORDER;
    copyHeight[2] = height / PLANAR_CHROMA_HEIGHT_ORDER;
    copyWidthInBytes[0] = width;
    // Width of the second and third planes is half of the first plane.  
    copyWidthInBytes[1] = width / PLANAR_CHROMA_WIDTH_ORDER;        
    copyWidthInBytes[2] = width / PLANAR_CHROMA_WIDTH_ORDER;        
    uvOffset[1] = width * height;
    uvOffset[2] = uvOffset[1] + (width / PLANAR_CHROMA_WIDTH_ORDER) * (height / PLANAR_CHROMA_HEIGHT_ORDER);
    for (int i = 0; i < numPlanes; i++) {
        checkCudaDrvErrors(cuCtxSynchronize());
        checkCudaErrors(cudaMemcpy2DToArray(
        cudaArr[i], 0, 0, (void *)(pBuff + uvOffset[i]), copyWidthInBytes[i],
        copyWidthInBytes[i], copyHeight[i],
        cudaMemcpyHostToDevice));
    }
    if (fp) {
        fclose(fp);
        fp = NULL;
    }
    if (pBuff) {
        free(pBuff);
        pBuff = NULL;
    }
    printf("Image %s copied to CUDA Array and bit flip done\n", path.c_str());
 }
 // Copy Cuda Array in YUV 420 format to a file.
 //
 //   path    - output file; created or truncated
 //   cudaArr - one CUDA array per plane; plane 0 is full size, the others are half
 //             width and half height
 // Each plane is copied to a host staging buffer and appended to the file.
 // Exits the process on file, allocation or CUDA errors.
 void Caller::copyCudaArrayToYUV(std::string &path, cudaArray_t *cudaArr) {
    const uint32_t width = multiPlanarWidth[0];
    const uint32_t height = multiPlanarHeight[0];
    uint8_t *pCudaCopyMem = NULL;

    FILE *fp = fopen(path.c_str(), "wb+");
    if (!fp) {
        printf("WriteFrame: file open failed %s in %s line %d\n", path.c_str(), __FILE__, __LINE__);
        exit(EXIT_FAILURE);
    }

    // The first plane is the largest, so one buffer of that size serves all planes.
    if (numPlanes > 0) {
        pCudaCopyMem = (uint8_t *)malloc((size_t)width * height);
        if (pCudaCopyMem == NULL) {
            printf("pCudaCopyMem malloc failed in %s line %d\n", __FILE__, __LINE__);
            exit(EXIT_FAILURE);
        }
    }

    for (int i = 0; i < numPlanes; i++) {
        uint32_t copyWidthInBytes = width;
        uint32_t copyHeight = height;
        if (i != 0) {
            copyWidthInBytes = width / PLANAR_CHROMA_WIDTH_ORDER;
            copyHeight = height / PLANAR_CHROMA_HEIGHT_ORDER;
        }
        const size_t bufferSize = (size_t)copyWidthInBytes * copyHeight;

        memset(pCudaCopyMem, 0, bufferSize);
        checkCudaErrors(cudaMemcpy2DFromArray(
        (void *)pCudaCopyMem, copyWidthInBytes, cudaArr[i], 0, 0,
        copyWidthInBytes, copyHeight,
        cudaMemcpyDeviceToHost));
        checkCudaDrvErrors(cuCtxSynchronize());
        if (fwrite(pCudaCopyMem, bufferSize, 1, fp) != 1) {
            printf("Cuda consumer: output file write failed in %s line %d\n", __FILE__, __LINE__);
            exit(EXIT_FAILURE);
        }
    }
    printf("Output file : %s saved\n", path.c_str());

    // The staging buffer was previously never released.
    free(pCudaCopyMem);
    fclose(fp);
 }
 // Releases everything acquired for one run: the per-caller resources, the
 // shared buffer object and the attribute lists produced by reconciliation.
 //   caller1, caller2 - the two callers used by the run
 void cudaNvSciBufMultiplanar::tearDown(Caller *caller1, Caller *caller2) {
    caller1->deinit();
    caller2->deinit();
    NvSciBufObjFree(buffObj);
    // The lists returned by NvSciBufAttrListReconcile are owned by this object
    // and were previously never freed.
    if (attrListReconciled != NULL) {
        NvSciBufAttrListFree(attrListReconciled);
        attrListReconciled = NULL;
    }
    if (attrListConflict != NULL) {
        NvSciBufAttrListFree(attrListConflict);
        attrListConflict = NULL;
    }
 }
--- a/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/cudaNvSciBufMultiplanar.h
+++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/cudaNvSciBufMultiplanar.h
@ -0,0 +1,124 @@
 /* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef CUDA_NVSCIBUF_MULTIPLANAR_H
 #define CUDA_NVSCIBUF_MULTIPLANAR_H
 #include <cuda_runtime.h>
 #include <nvscibuf.h>
 #include <vector>
 #include <cuda.h>
 #include <helper_cuda.h>
 // Number of planes of the planar YUV 420 image handled by the sample.
 #define PLANAR_NUM_PLANES 3
 // Divisors applied to the first plane's width and height to obtain the
 // dimensions of the chroma planes.
 #define PLANAR_CHROMA_WIDTH_ORDER 2
 #define PLANAR_CHROMA_HEIGHT_ORDER 2
 // Capacity of the NvSciBufAttrKeyValuePair scratch arrays.
 #define ATTR_SIZE 20
 // NOTE(review): presumably the default CUDA device ordinal; not referenced in
 // the sources visible here - confirm against main.cpp.
 #define DEFAULT_GPU 0
 /* Evaluates an NvSci call once; if it does not return NvSciError_Success,
  * prints the file, line and status code and terminates the process. */
 #define checkNvSciErrors(call)                              \
  do {                                                      \
    NvSciError _status = (call);                            \
    if (NvSciError_Success != _status) {                    \
      printf(                                               \
          "NVSCI call in file '%s' in line %i returned"     \
          " %d, expected %d\n",                             \
          __FILE__, __LINE__, (int)_status,                 \
          (int)NvSciError_Success);                         \
      fflush(stdout);                                       \
      exit(EXIT_FAILURE);                                   \
    }                                                       \
  } while (0)
 /* Evaluates a CUDA driver API call once; if it does not return CUDA_SUCCESS,
  * prints the error code, its description, file and line, and terminates the
  * process. A fallback text is printed when the driver has no description. */
 #define checkCudaDrvErrors(call)                           \
  do {                                                     \
    CUresult _drvStatus = (call);                          \
    if (CUDA_SUCCESS != _drvStatus) {                      \
      const char *_drvErrorStr = NULL;                     \
      if (cuGetErrorString(_drvStatus, &_drvErrorStr) !=   \
              CUDA_SUCCESS ||                              \
          _drvErrorStr == NULL) {                          \
        _drvErrorStr = "unknown error";                    \
      }                                                    \
      printf(                                              \
              "checkCudaDrvErrors() Driver API error"      \
              " = %04d \"%s\" from file <%s>, "            \
              "line %i.\n",                                \
              (int)_drvStatus, _drvErrorStr, __FILE__,     \
              __LINE__);                                   \
      fflush(stdout);                                      \
      exit(EXIT_FAILURE);                                  \
    }                                                      \
  } while (0)
 // Maps the given CUDA arrays to surface memory and launches a kernel that flips
 // their bits (as described at the call site in runCudaNvSciBufPlanar).
 //   levelArray        - one CUDA array per plane
 //   multiPlanarWidth  - per-plane widths
 //   multiPlanarHeight - per-plane heights
 //   numPlanes         - number of entries in the arrays above
 // NOTE(review): the definition is not visible here - presumably in imageKernels.cu.
 extern void launchFlipSurfaceBitsKernel(cudaArray_t *levelArray, int32_t *multiPlanarWidth,
 int32_t *multiPlanarHeight, int numPlanes);
 // One participant in the buffer exchange. Each caller owns an NvSciBuf attribute
 // list, imports the shared buffer object into CUDA and maps it to per-plane
 // mipmapped arrays.
 class Caller {
 private:
    // Attribute list retrieved from the buffer object in copyExtMemToMultiPlanarArrays().
    NvSciBufAttrList attrListOut;
    // Key/value pairs used to query image attributes, indexed by NvSciBufImageAttributes.
    NvSciBufAttrKeyValuePair pairArrayOut[ATTR_SIZE];
    // External memory handle imported from the buffer object; destroyed in deinit().
    cudaExternalMemory_t extMem;
    // Plane count reported by the buffer's attributes.
    int32_t numPlanes;
 public:
    // Attribute list created in init() and filled by setAttrListImageMultiPlanes().
    NvSciBufAttrList attrList;
    // Mipmapped arrays mapped from the external memory, one per plane.
    cudaMipmappedArray_t multiPlanarArray[PLANAR_NUM_PLANES];
    // Per-plane width and height read from the buffer's attributes.
    int32_t multiPlanarWidth[PLANAR_NUM_PLANES];
    int32_t multiPlanarHeight[PLANAR_NUM_PLANES];
    // Creates attrList and clears attrListOut.
    void init();
    // Frees attrList and destroys extMem.
    void deinit();
    // Imports the buffer object and maps each plane to a mipmapped array.
    void copyExtMemToMultiPlanarArrays();
    // Reads a YUV file, flips its bits on the host and copies it into the given arrays.
    void copyYUVToCudaArrayAndFlipBits(std::string &image_filename, cudaArray_t *yuvPlanes);
    // Copies the given arrays to the host and writes them to a YUV file.
    void copyCudaArrayToYUV(std::string &image_filename, cudaArray_t *yuvPlanes);
    // Fills attrList with the attributes of a three-plane YUV 420 image.
    void setAttrListImageMultiPlanes(int imageWidth, int imageHeight);
 };
 // Drives the sample: sets up CUDA and NvSciBuf, reconciles the callers'
 // attribute lists, allocates the shared buffer and runs the flip/compare flow.
 class cudaNvSciBufMultiplanar {
 private:
    // Dimensions of the image's first plane, in pixels.
    size_t imageWidth;
    size_t imageHeight;
    // CUDA device ordinal taken from the first entry of the constructor's deviceIds.
    int mCudaDeviceId;
    // NOTE(review): not written or read in the sources visible here - confirm usage.
    int deviceCnt;
    // The two callers' attribute lists passed to reconciliation.
    NvSciBufAttrList attrList[2];
    // Outputs of NvSciBufAttrListReconcile.
    NvSciBufAttrList attrListReconciled;
    NvSciBufAttrList attrListConflict;
 public:
    // Opens the NvSciBuf module and initializes CUDA on deviceIds[0].
    cudaNvSciBufMultiplanar(size_t imageWidth, size_t imageHeight, std::vector<int> &deviceIds);
    // Selects the CUDA device, prints its properties and records its UUID.
    void initCuda(int devId);
    // Reconciles the two lists and allocates the shared buffer object.
    void reconcileAttrList(NvSciBufAttrList *attrList1, NvSciBufAttrList *attrList2);
    // Runs the full flip, flip-back and compare flow on the given files.
    void runCudaNvSciBufPlanar(std::string &image_filename, std::string &image_filename_out);
    // Releases the callers' resources and the shared buffer object.
    void tearDown(Caller *caller1, Caller *caller2);
 };
 // Indices into Caller::pairArrayOut, one per image attribute queried from the
 // buffer object. PLANE_ATTR_SIZE is the number of queried attributes.
 enum NvSciBufImageAttributes {
    PLANE_SIZE,  
    PLANE_ALIGNED_SIZE,
    PLANE_OFFSET,
    PLANE_HEIGHT, 
    PLANE_WIDTH, 
    PLANE_CHANNEL_COUNT, 
    PLANE_BITS_PER_PIXEL,
    PLANE_COUNT,
    PLANE_ATTR_SIZE
 };
 #endif  // CUDA_NVSCIBUF_MULTIPLANAR_H
--- a/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/imageKernels.cu
+++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/imageKernels.cu
@ -0,0 +1,64 @@
 /* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #include <cuda.h>
 #include <vector>
 #include <helper_cuda.h>
 // Inverts, in place, one 1-byte element of the surface per thread.
 //
 // Launch layout: 2D grid of 2D blocks; the thread's global (x, y) index picks
 // the element. Threads whose index falls outside width x height do nothing,
 // so the grid may overshoot the surface. No shared memory is used.
 // The x index is handed to the surface functions unscaled.
 // NOTE(review): assumes the bound surface holds 1-byte elements - confirm.
 static __global__ void flipSurfaceBits(cudaSurfaceObject_t surfObj, int width, int height) {
    const unsigned int col = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int row = blockDim.y * blockIdx.y + threadIdx.y;

    // Guard the grid tail: surplus threads exit without touching the surface.
    if (col >= width || row >= height) {
        return;
    }

    // Fetch the current value, then store its bitwise complement back to the
    // same location.
    const char original = surf2Dread<char>(surfObj, col, row);
    const char inverted = static_cast<char>(~original);
    surf2Dwrite(inverted, surfObj, col, row);
 }
 // Flip the bits of every plane in place: bind each cudaArray to a surface
 // object (no data is copied) and run flipSurfaceBits over it.
 //
 //   levelArray        - per-plane CUDA arrays, indexed 0 .. numPlanes-1
 //   multiPlanarWidth  - per-plane width, used as the kernel's x bound
 //   multiPlanarHeight - per-plane height, used as the kernel's y bound
 //   numPlanes         - number of entries in each array; <= 0 is a no-op
 //
 // Kernels are launched on the default stream with 16x16 thread blocks. The
 // function waits for them to finish before returning so that the surface
 // objects it created can be destroyed instead of leaked. Any CUDA failure is
 // reported through checkCudaErrors.
 void launchFlipSurfaceBitsKernel(
    cudaArray_t *levelArray,
    int32_t *multiPlanarWidth,
    int32_t *multiPlanarHeight,
    int numPlanes) {
    if (numPlanes <= 0) {
        return;
    }

    // Handles created below; kept so they can be released after the sync.
    std::vector<cudaSurfaceObject_t> surfObjects;
    surfObjects.reserve(numPlanes);

    const dim3 threadsPerBlock(16, 16);
    cudaResourceDesc resDesc;

    for (int i = 0; i < numPlanes; i++) {
        // An empty plane would produce a zero-sized grid, which is an invalid
        // launch configuration; there is nothing to flip, so skip it.
        if (multiPlanarWidth[i] <= 0 || multiPlanarHeight[i] <= 0) {
            continue;
        }

        memset(&resDesc, 0, sizeof(resDesc));
        resDesc.resType = cudaResourceTypeArray;
        resDesc.res.array.array = levelArray[i];

        cudaSurfaceObject_t surfObj = 0;
        checkCudaErrors(cudaCreateSurfaceObject(&surfObj, &resDesc));
        surfObjects.push_back(surfObj);

        // Ceil-divide so the grid covers planes whose size is not a multiple
        // of the block size; the kernel bounds-checks the overshoot.
        const dim3 numBlocks((multiPlanarWidth[i] + threadsPerBlock.x - 1) / threadsPerBlock.x,
                             (multiPlanarHeight[i] + threadsPerBlock.y - 1) / threadsPerBlock.y);
        flipSurfaceBits<<<numBlocks, threadsPerBlock>>>(surfObj, multiPlanarWidth[i], multiPlanarHeight[i]);
        // Kernel launches do not return a status; pick up launch errors here.
        checkCudaErrors(cudaGetLastError());
    }

    // The kernels still reference the surface objects, so wait once for all
    // of them before releasing the handles.
    checkCudaErrors(cudaDeviceSynchronize());
    for (size_t i = 0; i < surfObjects.size(); i++) {
        checkCudaErrors(cudaDestroySurfaceObject(surfObjects[i]));
    }
 }
--- a/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/main.cpp
+++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/main.cpp
@ -0,0 +1,72 @@
 /* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #include <cuda.h>
 #include <vector>
 #include "cudaNvSciBufMultiplanar.h"
 #include <helper_image.h>
 #define MAX_FILE_SIZE 100
 int main(int argc, const char **argv) {
  int numOfGPUs = 0;
  std::vector<int> deviceIds;
  (cudaGetDeviceCount(&numOfGPUs));
  printf("%d GPUs found\n", numOfGPUs);
  if (!numOfGPUs) {
    exit(EXIT_WAIVED);
  } else {
    for (int devID = 0; devID < numOfGPUs; devID++) {
      int major = 0, minor = 0;
      (cudaDeviceGetAttribute(
          &major, cudaDevAttrComputeCapabilityMajor, devID));
      (cudaDeviceGetAttribute(
          &minor, cudaDevAttrComputeCapabilityMinor, devID));
      if (major >= 6) {
        deviceIds.push_back(devID);
      }
    }
    if (deviceIds.size() == 0) {
      printf(
          "cudaNvSciBufMultiplanar requires one or more GPUs of Pascal(SM 6.0) or higher "
          "archs\nWaiving..\n");
      exit(EXIT_WAIVED);
    }
  }
  std::string image_filename = sdkFindFilePath("yuv_planar_img1.yuv", argv[0]);
  std::string image_filename_out = "image_out.yuv";
  uint32_t imageWidth = 720;
  uint32_t imageHeight = 480;
  printf("input image %s , width = %d, height = %d\n", image_filename.c_str(), imageWidth, imageHeight);
  cudaNvSciBufMultiplanar cudaNvSciBufMultiplanarApp(imageWidth, imageHeight, deviceIds);
  cudaNvSciBufMultiplanarApp.runCudaNvSciBufPlanar(image_filename, image_filename_out);
  return EXIT_SUCCESS;
 }
--- a/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/yuv_planar_img1.yuv
+++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/yuv_planar_img1.yuv