diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9375c1af..d99370f0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,6 @@
## Changelog
-### CUDA 12.1
+### CUDA 12.2
* libNVVM samples received updates
* Fixed jitLto Case issues
+* Enabled the HOST_COMPILER flag in the makefiles for GCC; this configuration is untested but may still work.
diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/c_cpp_properties.json b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/c_cpp_properties.json
new file mode 100644
index 00000000..f0066b0f
--- /dev/null
+++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/c_cpp_properties.json
@@ -0,0 +1,18 @@
+{
+ "configurations": [
+ {
+ "name": "Linux",
+ "includePath": [
+ "${workspaceFolder}/**",
+ "${workspaceFolder}/../../../Common"
+ ],
+ "defines": [],
+ "compilerPath": "/usr/local/cuda/bin/nvcc",
+ "cStandard": "gnu17",
+ "cppStandard": "gnu++14",
+ "intelliSenseMode": "linux-gcc-x64",
+ "configurationProvider": "ms-vscode.makefile-tools"
+ }
+ ],
+ "version": 4
+}
diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/extensions.json b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/extensions.json
new file mode 100644
index 00000000..c7eb54dc
--- /dev/null
+++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/extensions.json
@@ -0,0 +1,7 @@
+{
+ "recommendations": [
+ "nvidia.nsight-vscode-edition",
+ "ms-vscode.cpptools",
+ "ms-vscode.makefile-tools"
+ ]
+}
diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/launch.json b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/launch.json
new file mode 100644
index 00000000..65a931b5
--- /dev/null
+++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/launch.json
@@ -0,0 +1,10 @@
+{
+ "configurations": [
+ {
+ "name": "CUDA C++: Launch",
+ "type": "cuda-gdb",
+ "request": "launch",
+ "program": "${workspaceFolder}/cuDLALayerwiseStatsHybrid"
+ }
+ ]
+}
diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/tasks.json b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/tasks.json
new file mode 100644
index 00000000..4509aeb1
--- /dev/null
+++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/tasks.json
@@ -0,0 +1,15 @@
+{
+ "version": "2.0.0",
+ "tasks": [
+ {
+ "label": "sample",
+ "type": "shell",
+ "command": "make dbg=1",
+ "problemMatcher": ["$nvcc"],
+ "group": {
+ "kind": "build",
+ "isDefault": true
+ }
+ }
+ ]
+}
diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/Makefile b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/Makefile
new file mode 100644
index 00000000..8bbd3e20
--- /dev/null
+++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/Makefile
@@ -0,0 +1,400 @@
+################################################################################
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+#
+# Makefile project only supported on Mac OS X and Linux Platforms)
+#
+################################################################################
+
+# Location of the CUDA Toolkit
+CUDA_PATH ?= /usr/local/cuda
+
+##############################
+# start deprecated interface #
+##############################
+ifeq ($(x86_64),1)
+ $(info WARNING - x86_64 variable has been deprecated)
+ $(info WARNING - please use TARGET_ARCH=x86_64 instead)
+ TARGET_ARCH ?= x86_64
+endif
+ifeq ($(ARMv7),1)
+ $(info WARNING - ARMv7 variable has been deprecated)
+ $(info WARNING - please use TARGET_ARCH=armv7l instead)
+ TARGET_ARCH ?= armv7l
+endif
+ifeq ($(aarch64),1)
+ $(info WARNING - aarch64 variable has been deprecated)
+ $(info WARNING - please use TARGET_ARCH=aarch64 instead)
+ TARGET_ARCH ?= aarch64
+endif
+ifeq ($(ppc64le),1)
+ $(info WARNING - ppc64le variable has been deprecated)
+ $(info WARNING - please use TARGET_ARCH=ppc64le instead)
+ TARGET_ARCH ?= ppc64le
+endif
+ifneq ($(GCC),)
+ $(info WARNING - GCC variable has been deprecated)
+ $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
+ HOST_COMPILER ?= $(GCC)
+endif
+ifneq ($(abi),)
+ $(error ERROR - abi variable has been removed)
+endif
+############################
+# end deprecated interface #
+############################
+
+# architecture
+HOST_ARCH := $(shell uname -m)
+TARGET_ARCH ?= $(HOST_ARCH)
+ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
+ ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
+ TARGET_SIZE := 64
+ else ifneq (,$(filter $(TARGET_ARCH),armv7l))
+ TARGET_SIZE := 32
+ endif
+ else
+ TARGET_SIZE := $(shell getconf LONG_BIT)
+ endif
+else
+ $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
+endif
+
+# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
+ifeq ($(HOST_ARCH),aarch64)
+ ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
+ HOST_ARCH := sbsa
+ TARGET_ARCH := sbsa
+ endif
+endif
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+ ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
+ $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
+ endif
+endif
+
+# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
+ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
+ TARGET_ARCH = armv7l
+endif
+
+# operating system
+HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
+TARGET_OS ?= $(HOST_OS)
+ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
+ $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
+endif
+
+# host compiler
+ifdef HOST_COMPILER
+ CUSTOM_HOST_COMPILER = 1
+endif
+
+ifeq ($(TARGET_OS),darwin)
+ ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
+ HOST_COMPILER ?= clang++
+ endif
+else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+ ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
+ ifeq ($(TARGET_OS),linux)
+ HOST_COMPILER ?= arm-linux-gnueabihf-g++
+ else ifeq ($(TARGET_OS),qnx)
+ ifeq ($(QNX_HOST),)
+ $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+ endif
+ ifeq ($(QNX_TARGET),)
+ $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+ endif
+ export QNX_HOST
+ export QNX_TARGET
+ HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
+ else ifeq ($(TARGET_OS),android)
+ HOST_COMPILER ?= arm-linux-androideabi-g++
+ endif
+ else ifeq ($(TARGET_ARCH),aarch64)
+ ifeq ($(TARGET_OS), linux)
+ HOST_COMPILER ?= aarch64-linux-gnu-g++
+ else ifeq ($(TARGET_OS),qnx)
+ ifeq ($(QNX_HOST),)
+ $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+ endif
+ ifeq ($(QNX_TARGET),)
+ $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+ endif
+ export QNX_HOST
+ export QNX_TARGET
+ HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
+ else ifeq ($(TARGET_OS), android)
+ HOST_COMPILER ?= aarch64-linux-android-clang++
+ endif
+ else ifeq ($(TARGET_ARCH),sbsa)
+ HOST_COMPILER ?= aarch64-linux-gnu-g++
+ else ifeq ($(TARGET_ARCH),ppc64le)
+ HOST_COMPILER ?= powerpc64le-linux-gnu-g++
+ endif
+endif
+HOST_COMPILER ?= g++
+NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
+
+# internal flags
+NVCCFLAGS := -m${TARGET_SIZE}
+CCFLAGS :=
+LDFLAGS :=
+
+# build flags
+ifeq ($(TARGET_OS),darwin)
+ LDFLAGS += -rpath $(CUDA_PATH)/lib
+ CCFLAGS += -arch $(HOST_ARCH)
+else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
+ LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
+ CCFLAGS += -mfloat-abi=hard
+else ifeq ($(TARGET_OS),android)
+ LDFLAGS += -pie
+ CCFLAGS += -fpie -fpic -fexceptions
+endif
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+ ifneq ($(TARGET_FS),)
+ GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+ ifeq ($(GCCVERSIONLTEQ46),1)
+ CCFLAGS += --sysroot=$(TARGET_FS)
+ endif
+ LDFLAGS += --sysroot=$(TARGET_FS)
+ LDFLAGS += -rpath-link=$(TARGET_FS)/lib
+ LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
+ LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
+ endif
+ endif
+ ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+ ifneq ($(TARGET_FS),)
+ GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+ ifeq ($(GCCVERSIONLTEQ46),1)
+ CCFLAGS += --sysroot=$(TARGET_FS)
+ endif
+ LDFLAGS += --sysroot=$(TARGET_FS)
+ LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
+ LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
+ LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
+ LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
+ LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
+ CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
+ CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
+ endif
+ endif
+ ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+ NVCCFLAGS += -D_QNX_SOURCE
+ NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
+ CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
+ LDFLAGS += -lsocket
+ LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
+ CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
+ ifdef TARGET_OVERRIDE
+ LDFLAGS += -lslog2
+ endif
+
+ ifneq ($(TARGET_FS),)
+ LDFLAGS += -L$(TARGET_FS)/usr/lib
+ CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
+ LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
+ CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
+ CCFLAGS += -I$(TARGET_FS)/../include
+ endif
+ endif
+endif
+
+ifdef TARGET_OVERRIDE # cuda toolkit targets override
+ NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
+endif
+
+# Install directory of different arch
+CUDA_INSTALL_TARGET_DIR :=
+ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+ CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+ CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
+ CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
+ CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
+ CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
+ CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+ CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
+else ifeq ($(TARGET_ARCH),ppc64le)
+ CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
+endif
+
+# Debug build flags
+ifeq ($(dbg),1)
+ NVCCFLAGS += -g -G
+ BUILD_TYPE := debug
+else
+ BUILD_TYPE := release
+endif
+
+ALL_CCFLAGS :=
+ALL_CCFLAGS += $(NVCCFLAGS)
+ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
+
+SAMPLE_ENABLED := 1
+
+# This sample is not supported on Linux x86_64
+ifeq ($(TARGET_OS),linux)
+ ifeq ($(TARGET_ARCH),x86_64)
+ $(info >>> WARNING - cuDLALayerwiseStatsHybrid is not supported on Linux x86_64 - waiving sample <<<)
+ SAMPLE_ENABLED := 0
+ endif
+endif
+
+# This sample is not supported on Mac OSX
+ifeq ($(TARGET_OS),darwin)
+ $(info >>> WARNING - cuDLALayerwiseStatsHybrid is not supported on Mac OSX - waiving sample <<<)
+ SAMPLE_ENABLED := 0
+endif
+
+# This sample is not supported on ARMv7
+ifeq ($(TARGET_ARCH),armv7l)
+ $(info >>> WARNING - cuDLALayerwiseStatsHybrid is not supported on ARMv7 - waiving sample <<<)
+ SAMPLE_ENABLED := 0
+endif
+
+# This sample is not supported on sbsa
+ifeq ($(TARGET_ARCH),sbsa)
+ $(info >>> WARNING - cuDLALayerwiseStatsHybrid is not supported on sbsa - waiving sample <<<)
+ SAMPLE_ENABLED := 0
+endif
+
+ALL_LDFLAGS :=
+ALL_LDFLAGS += $(ALL_CCFLAGS)
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
+
+# Common includes and paths for CUDA
+INCLUDES := -I../../../Common
+LIBRARIES :=
+
+################################################################################
+
+#Detect if installed version of GCC supports required C++11
+ifeq ($(TARGET_OS),linux)
+ empty :=
+ space := $(empty) $(empty)
+ GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`)
+#Create version number without "."
+ GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.)
+ GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.)
+ GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.)
+# Make sure the version number has at least 3 decimals
+ GCCVERSION += 00
+# Remove spaces from the version number
+ GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION))
+#$(warning $(GCCVERSION))
+
+ IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 47000)
+ ifneq ($(CUSTOM_HOST_COMPILER), 1)
+ ifeq ($(IS_MIN_VERSION), 1)
+ $(info >>> GCC Version is greater or equal to 4.7.0 <<<)
+ else
+ $(info >>> Waiving build. Minimum GCC version required is 4.7.0<<<)
+ SAMPLE_ENABLED := 0
+ endif
+ else
+ $(warning >>> Custom HOST_COMPILER set; skipping GCC version check. This may lead to unintended behavior. Please note the minimum equivalent GCC version is 4.7.0 <<<)
+ endif
+endif
+
+# Gencode arguments
+ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
+SMS ?= 53 61 70 72 75 80 86 87 90
+else
+SMS ?= 50 52 60 61 70 75 80 86 89 90
+endif
+
+ifeq ($(SMS),)
+$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
+SAMPLE_ENABLED := 0
+endif
+
+ifeq ($(GENCODE_FLAGS),)
+# Generate SASS code for each SM architecture listed in $(SMS)
+$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
+
+# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
+HIGHEST_SM := $(lastword $(sort $(SMS)))
+ifneq ($(HIGHEST_SM),)
+GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
+endif
+endif
+
+ALL_CCFLAGS += --std=c++11 --threads 0
+
+LIBRARIES += -lcudla
+
+ifeq ($(SAMPLE_ENABLED),0)
+EXEC ?= @echo "[@]"
+endif
+
+################################################################################
+
+# Target rules
+all: build
+
+build: cuDLALayerwiseStatsHybrid
+
+check.deps:
+ifeq ($(SAMPLE_ENABLED),0)
+ @echo "Sample will be waived due to the above missing dependencies"
+else
+ @echo "Sample is ready - all dependencies have been met"
+endif
+
+main.o:main.cu
+ $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+cuDLALayerwiseStatsHybrid: main.o
+ $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
+ $(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+ $(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+
+run: build
+ $(EXEC) ./cuDLALayerwiseStatsHybrid
+
+testrun: build
+
+clean:
+ rm -f cuDLALayerwiseStatsHybrid main.o
+ rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/cuDLALayerwiseStatsHybrid
+
+clobber: clean
diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/NsightEclipse.xml b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/NsightEclipse.xml
new file mode 100644
index 00000000..c00592db
--- /dev/null
+++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/NsightEclipse.xml
@@ -0,0 +1,71 @@
+
+
+
+ cuDLALayerwiseStatsHybrid
+
+ --std=c++11
+
+
+ cudaStreamCreateWithFlags
+ cudaStreamDestroy
+ cudaFree
+ cudaGetErrorName
+ cudaSetDevice
+ cudaStreamSynchronize
+ cudaMalloc
+ cudaMemsetAsync
+ cudaMemcpyAsync
+
+
+ whole
+
+ ./
+ ../
+ ../../../Common
+
+
+ cuDLA
+ Data Parallel Algorithms
+ Image Processing
+
+
+ CUDA
+ CPP11
+
+
+ cudla
+
+
+
+ true
+ main.cu
+
+ 1:CUDA Advanced Topics
+ 1:cuDLA
+
+ sm60
+ sm61
+ sm70
+ sm72
+ sm75
+ sm80
+ sm86
+ sm87
+ sm89
+ sm90
+
+
+ aarch64
+ linux
+
+
+ aarch64
+ qnx
+
+
+
+ 6.0
+
+ cuDLA Layerwise statistics HybridMode
+ exe
+
diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/README.md b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/README.md
new file mode 100644
index 00000000..d3df1858
--- /dev/null
+++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/README.md
@@ -0,0 +1,60 @@
+# cuDLALayerwiseStatsHybrid - cuDLA Layerwise statistics HybridMode
+
+## Description
+
+This sample is used to provide layerwise statistics to the application in the cuDLA hybrid mode wherein DLA is programmed using CUDA.
+
+## Key Concepts
+
+cuDLA, Data Parallel Algorithms, Image Processing
+
+## Supported SM Architectures
+
+[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus) [SM 8.9 ](https://developer.nvidia.com/cuda-gpus) [SM 9.0 ](https://developer.nvidia.com/cuda-gpus)
+
+## Supported OSes
+
+Linux, QNX
+
+## Supported CPU Architecture
+
+aarch64
+
+## CUDA APIs involved
+
+### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
+cudaStreamCreateWithFlags, cudaStreamDestroy, cudaFree, cudaGetErrorName, cudaSetDevice, cudaStreamSynchronize, cudaMalloc, cudaMemsetAsync, cudaMemcpyAsync
+
+## Prerequisites
+
+Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+
+## Build and Run
+
+### Linux
+The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+The samples makefiles can take advantage of certain options:
+* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are aarch64.
+ By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=aarch64`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
+* **dbg=1** - build with debug symbols
+ ```
+ $ make dbg=1
+ ```
+* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
+ ```
+ $ make SMS="50 60"
+ ```
+
+* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
+```
+ $ make HOST_COMPILER=g++
+```
+
+## References (for more details)
+
diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/main.cu b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/main.cu
new file mode 100644
index 00000000..81e575c6
--- /dev/null
+++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/main.cu
@@ -0,0 +1,898 @@
+/* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of NVIDIA CORPORATION nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "cudla.h"
+#include "cuda_runtime.h"
+#include "cudlaExternalEtbl.hpp"
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define MAX_FILENAME_LEN 200
+#define RESERVED_SUFFIX_LEN 10
+
+#define DPRINTF(...) printf(__VA_ARGS__)
+
+static void printTensorDesc(cudlaModuleTensorDescriptor* tensorDesc) {
+ DPRINTF("\tTENSOR NAME : %s\n", tensorDesc->name);
+ DPRINTF("\tsize: %lu\n", tensorDesc->size);
+
+ DPRINTF("\tdims: [%lu, %lu, %lu, %lu]\n", tensorDesc->n, tensorDesc->c,
+ tensorDesc->h, tensorDesc->w);
+
+ DPRINTF("\tdata fmt: %d\n", tensorDesc->dataFormat);
+ DPRINTF("\tdata type: %d\n", tensorDesc->dataType);
+ DPRINTF("\tdata category: %d\n", tensorDesc->dataCategory);
+ DPRINTF("\tpixel fmt: %d\n", tensorDesc->pixelFormat);
+ DPRINTF("\tpixel mapping: %d\n", tensorDesc->pixelMapping);
+ DPRINTF("\tstride[0]: %d\n", tensorDesc->stride[0]);
+ DPRINTF("\tstride[1]: %d\n", tensorDesc->stride[1]);
+ DPRINTF("\tstride[2]: %d\n", tensorDesc->stride[2]);
+ DPRINTF("\tstride[3]: %d\n", tensorDesc->stride[3]);
+}
+
+typedef struct {
+ cudlaDevHandle devHandle;
+ cudlaModule moduleHandle;
+ unsigned char* loadableData;
+ cudaStream_t stream;
+ uint32_t numInputTensors;
+ uint32_t numOutputTensors;
+ uint32_t numOutputTaskStatistics;
+ unsigned char** inputBuffer;
+ unsigned char** outputBuffer;
+ unsigned char** statisticsOutputBuffer;
+ void** inputBufferGPU;
+ void** outputBufferGPU;
+ void** outputTaskStatisticsGPU;
+ void **csv;
+ cudlaModuleTensorDescriptor* inputTensorDesc;
+ cudlaModuleTensorDescriptor* outputTensorDesc;
+ cudlaModuleTensorDescriptor* outputTaskStatisticsDesc;
+ uint64_t** inputBufferRegisteredPtr;
+ uint64_t** outputBufferRegisteredPtr;
+ uint64_t** outputTaskStatisticsRegisteredPtr;
+ uint64_t** outputStatisticsBufferRegisteredPtr;
+} ResourceList;
+
+void cleanUp(ResourceList* resourceList);
+
+void cleanUp(ResourceList* resourceList) {
+ uint32_t ii = 0;
+ if (resourceList->inputTensorDesc != NULL) {
+ free(resourceList->inputTensorDesc);
+ resourceList->inputTensorDesc = NULL;
+ }
+
+ if (resourceList->outputTensorDesc != NULL) {
+ free(resourceList->outputTensorDesc);
+ resourceList->outputTensorDesc = NULL;
+ }
+
+ if (resourceList->outputTaskStatisticsDesc != NULL) {
+ free(resourceList->outputTaskStatisticsDesc);
+ resourceList->outputTaskStatisticsDesc = NULL;
+ }
+
+ if (resourceList->loadableData != NULL) {
+ free(resourceList->loadableData);
+ resourceList->loadableData = NULL;
+ }
+
+ if (resourceList->moduleHandle != NULL) {
+ cudlaModuleUnload(resourceList->moduleHandle, 0);
+ resourceList->moduleHandle = NULL;
+ }
+
+ if (resourceList->devHandle != NULL) {
+ cudlaDestroyDevice(resourceList->devHandle);
+ resourceList->devHandle = NULL;
+ }
+
+ if (resourceList->inputBufferGPU != NULL) {
+ for (ii = 0; ii < resourceList->numInputTensors; ii++) {
+ if ((resourceList->inputBufferGPU)[ii] != NULL) {
+ cudaFree((resourceList->inputBufferGPU)[ii]);
+ (resourceList->inputBufferGPU)[ii] = NULL;
+ }
+ }
+ free(resourceList->inputBufferGPU);
+ resourceList->inputBufferGPU = NULL;
+ }
+
+ if (resourceList->outputBufferGPU != NULL) {
+ for (ii = 0; ii < resourceList->numOutputTensors; ii++) {
+ if ((resourceList->outputBufferGPU)[ii] != NULL) {
+ cudaFree((resourceList->outputBufferGPU)[ii]);
+ (resourceList->outputBufferGPU)[ii] = NULL;
+ }
+ }
+ free(resourceList->outputBufferGPU);
+ resourceList->outputBufferGPU = NULL;
+ }
+
+ if (resourceList->outputTaskStatisticsGPU != NULL) {
+ for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) {
+ if ((resourceList->outputTaskStatisticsGPU)[ii] != NULL) {
+ cudaFree((resourceList->outputTaskStatisticsGPU)[ii]);
+ (resourceList->outputTaskStatisticsGPU)[ii] = NULL;
+ }
+ }
+ free(resourceList->outputTaskStatisticsGPU);
+ resourceList->outputTaskStatisticsGPU = NULL;
+ }
+
+ if (resourceList->csv != NULL) {
+ for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) {
+ if ((resourceList->csv)[ii] != NULL)
+ {
+ free((resourceList->csv)[ii]);
+ (resourceList->csv)[ii] = NULL;
+ }
+ }
+ free(resourceList->csv);
+ resourceList->csv = NULL;
+ }
+
+ if (resourceList->inputBuffer != NULL) {
+ for (ii = 0; ii < resourceList->numInputTensors; ii++) {
+ if ((resourceList->inputBuffer)[ii] != NULL) {
+ free((resourceList->inputBuffer)[ii]);
+ (resourceList->inputBuffer)[ii] = NULL;
+ }
+ }
+ free(resourceList->inputBuffer);
+ resourceList->inputBuffer = NULL;
+ }
+
+ if (resourceList->outputBuffer != NULL) {
+ for (ii = 0; ii < resourceList->numOutputTensors; ii++) {
+ if ((resourceList->outputBuffer)[ii] != NULL)
+ {
+ free((resourceList->outputBuffer)[ii]);
+ (resourceList->outputBuffer)[ii] = NULL;
+ }
+ }
+ free(resourceList->outputBuffer);
+ resourceList->outputBuffer = NULL;
+ }
+
+ if (resourceList->statisticsOutputBuffer != NULL) {
+ for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) {
+ if ((resourceList->statisticsOutputBuffer)[ii] != NULL) {
+ free((resourceList->statisticsOutputBuffer)[ii]);
+ (resourceList->statisticsOutputBuffer)[ii] = NULL;
+ }
+ }
+ free(resourceList->statisticsOutputBuffer);
+ resourceList->statisticsOutputBuffer = NULL;
+ }
+
+ if (resourceList->stream != NULL) {
+ cudaStreamDestroy(resourceList->stream);
+ resourceList->stream = NULL;
+ }
+
+ if (resourceList->inputBufferRegisteredPtr != NULL) {
+ free(resourceList->inputBufferRegisteredPtr);
+ resourceList->inputBufferRegisteredPtr = NULL;
+ }
+
+ if (resourceList->outputBufferRegisteredPtr != NULL) {
+ free(resourceList->outputBufferRegisteredPtr);
+ resourceList->outputBufferRegisteredPtr = NULL;
+ }
+
+ if (resourceList->outputTaskStatisticsRegisteredPtr != NULL) {
+ free(resourceList->outputTaskStatisticsRegisteredPtr);
+ resourceList->outputTaskStatisticsRegisteredPtr = NULL;
+ }
+
+ if (resourceList->outputStatisticsBufferRegisteredPtr != NULL) {
+ free(resourceList->outputStatisticsBufferRegisteredPtr);
+ resourceList->outputStatisticsBufferRegisteredPtr = NULL;
+ }
+
+ resourceList->numInputTensors = 0;
+ resourceList->numOutputTensors = 0;
+ resourceList->numOutputTaskStatistics = 0;
+}
+
+int main(int argc, char** argv) {
+ cudlaDevHandle devHandle;
+ cudlaModule moduleHandle;
+ cudlaStatus err;
+ uint32_t statSupport = 0;
+ uint32_t dlaFreqInMHz = 0;
+ FILE* fp = NULL;
+ struct stat st;
+ size_t file_size;
+ size_t actually_read = 0;
+ unsigned char *loadableData = NULL;
+ char filename[MAX_FILENAME_LEN];
+ const char* suffix = ".csv";
+
+ cudaStream_t stream;
+ cudaError_t result;
+ const char* errPtr = NULL;
+
+ ResourceList resourceList;
+
+ memset(&resourceList, 0x00, sizeof(ResourceList));
+
+ if ((argc != 4) && (argc != 5)) {
+ DPRINTF("Usage : ./test_cudla_layerwise_stats_L0_hybrid_test1 \n");
+ return 1;
+ }
+
+ if (argc == 5) {
+ if((strlen(argv[4])) > (MAX_FILENAME_LEN - RESERVED_SUFFIX_LEN))
+ {
+ DPRINTF("Filename prefix length is too big, greater than maximum permissible prefix length of %u \n",(MAX_FILENAME_LEN - RESERVED_SUFFIX_LEN));
+ return 1;
+ }
+ }
+
+ // Read loadable into buffer.
+ fp = fopen(argv[1], "rb");
+ if (fp == NULL) {
+ DPRINTF("Cannot open file %s\n", argv[1]);
+ return 1;
+ }
+
+ if (stat(argv[1], &st) != 0) {
+ DPRINTF("Cannot stat file\n");
+ return 1;
+ }
+
+ file_size = st.st_size;
+ DPRINTF("The file size = %ld\n", file_size);
+
+ dlaFreqInMHz = atoi(argv[2]);
+ statSupport = atoi(argv[3]);
+
+ loadableData = (unsigned char *)malloc(file_size);
+ if (loadableData == NULL) {
+ DPRINTF("Cannot Allocate memory for loadable\n");
+ return 1;
+ }
+
+ actually_read = fread(loadableData, 1, file_size, fp);
+ if ( actually_read != file_size ) {
+ free(loadableData);
+ DPRINTF("Read wrong size\n");
+ return 1;
+ }
+ fclose(fp);
+
+ resourceList.loadableData = loadableData;
+
+ // Initialize CUDA.
+ result = cudaFree(0);
+ if (result != cudaSuccess) {
+ errPtr = cudaGetErrorName(result);
+ DPRINTF("Error in creating cudaFree = %s\n", errPtr);
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ result = cudaSetDevice(0);
+ if (result != cudaSuccess) {
+ errPtr = cudaGetErrorName(result);
+ DPRINTF("Error in creating cudaSetDevice = %s\n", errPtr);
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ err = cudlaCreateDevice(0, &devHandle, CUDLA_CUDA_DLA);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in cuDLA create device = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ DPRINTF("Device created successfully\n");
+ resourceList.devHandle = devHandle;
+
+ err = cudlaModuleLoadFromMemory(devHandle, loadableData, file_size, &moduleHandle, 0);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in cudlaModuleLoadFromMemory = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ } else {
+ DPRINTF("Successfully loaded module\n");
+ }
+
+ resourceList.moduleHandle = moduleHandle;
+
+ // Create CUDA stream.
+ result = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
+
+ if (result != cudaSuccess) {
+ errPtr = cudaGetErrorName(result);
+ DPRINTF("Error in creating cuda stream = %s\n", errPtr);
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ resourceList.stream = stream;
+
+ // Get tensor attributes.
+ uint32_t numInputTensors = 0;
+ uint32_t numOutputTensors = 0;
+ uint32_t numOutputTaskStatistics = 0;
+
+ cudlaModuleAttribute attribute;
+
+ err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_INPUT_TENSORS, &attribute);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in getting numInputTensors = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ }
+ numInputTensors = attribute.numInputTensors;
+ DPRINTF("numInputTensors = %d\n", numInputTensors);
+
+ err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_OUTPUT_TENSORS, &attribute);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in getting numOutputTensors = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ }
+ numOutputTensors = attribute.numOutputTensors;
+ DPRINTF("numOutputTensors = %d\n", numOutputTensors);
+
+ // Reuse the same cudlaModuleAttribute struct to query the number of task-statistics tensors
+ attribute.numOutputTensors = 0;
+
+ err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_OUTPUT_TASK_STATISTICS, &attribute);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in getting numOutputTaskStatistics = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ }
+ numOutputTaskStatistics = attribute.numOutputTensors;
+ DPRINTF("numOutputTaskStatistics = %d\n", numOutputTaskStatistics);
+
+ if(numOutputTaskStatistics == 0) {
+ DPRINTF("Layerwise stats is not supported for this Loadable \n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ resourceList.numInputTensors = numInputTensors;
+ resourceList.numOutputTensors = numOutputTensors;
+ resourceList.numOutputTaskStatistics = numOutputTaskStatistics;
+
+ cudlaModuleTensorDescriptor* inputTensorDesc =
+ (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor)*numInputTensors);
+ cudlaModuleTensorDescriptor* outputTensorDesc =
+ (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor)*numOutputTensors);
+
+ if ((inputTensorDesc == NULL) || (outputTensorDesc == NULL)) {
+ if (inputTensorDesc != NULL) {
+ free(inputTensorDesc);
+ inputTensorDesc = NULL;
+ }
+
+ if (outputTensorDesc != NULL) {
+ free(outputTensorDesc);
+ outputTensorDesc = NULL;
+ }
+
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ resourceList.inputTensorDesc = inputTensorDesc;
+ resourceList.outputTensorDesc = outputTensorDesc;
+
+ cudlaModuleTensorDescriptor* outputTaskStatisticsDesc =
+ (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor)*numOutputTaskStatistics);
+ if (outputTaskStatisticsDesc == NULL) {
+ free(outputTaskStatisticsDesc);
+ outputTaskStatisticsDesc = NULL;
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ resourceList.outputTaskStatisticsDesc = outputTaskStatisticsDesc;
+
+ attribute.inputTensorDesc = inputTensorDesc;
+ err = cudlaModuleGetAttributes(moduleHandle,
+ CUDLA_INPUT_TENSOR_DESCRIPTORS,
+ &attribute);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in getting input tensor descriptor = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ }
+ DPRINTF("Printing input tensor descriptor\n");
+ printTensorDesc(inputTensorDesc);
+
+ attribute.outputTensorDesc = outputTensorDesc;
+ err = cudlaModuleGetAttributes(moduleHandle,
+ CUDLA_OUTPUT_TENSOR_DESCRIPTORS,
+ &attribute);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in getting output tensor descriptor = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ }
+ DPRINTF("Printing output tensor descriptor\n");
+ printTensorDesc(outputTensorDesc);
+
+ attribute.outputTensorDesc = outputTaskStatisticsDesc;
+ err = cudlaModuleGetAttributes(moduleHandle,
+ CUDLA_OUTPUT_TASK_STATISTICS_DESCRIPTORS,
+ &attribute);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in getting task statistics descriptor = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ DPRINTF("Printing output task statistics descriptor size\n");
+ for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
+ DPRINTF("The size of %u descriptor is %lu\n", ii,outputTaskStatisticsDesc[ii].size);
+ }
+
+ // Setup the input and output buffers which will be used as an input to CUDA.
+ unsigned char** inputBuffer = (unsigned char **)malloc(sizeof(unsigned char *)*numInputTensors);
+ if (inputBuffer == NULL) {
+ DPRINTF("Error in allocating memory for input buffer array\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(inputBuffer, 0x00, sizeof(unsigned char *)*numInputTensors);
+ resourceList.inputBuffer = inputBuffer;
+ for (uint32_t ii = 0; ii < numInputTensors; ii++) {
+ inputBuffer[ii] = (unsigned char* )malloc(inputTensorDesc[ii].size);
+ if (inputBuffer[ii] == NULL) {
+ DPRINTF("Error in allocating input memory\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(inputBuffer[ii], 0x01, inputTensorDesc[ii].size);
+ }
+
+ unsigned char** outputBuffer = (unsigned char **)malloc(sizeof(unsigned char *)*numOutputTensors);
+ if (outputBuffer == NULL) {
+ DPRINTF("Error in allocating memory for output buffer array\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(outputBuffer, 0x00, sizeof(unsigned char *)*numOutputTensors);
+ resourceList.outputBuffer = outputBuffer;
+
+ for (uint32_t ii = 0; ii < numOutputTensors; ii++) {
+ outputBuffer[ii] = (unsigned char* )malloc(outputTensorDesc[ii].size);
+ if (outputBuffer[ii] == NULL) {
+ DPRINTF("Error in allocating output memory\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(outputBuffer[ii], 0x00, outputTensorDesc[ii].size);
+ }
+
+ unsigned char** statisticsOutputBuffer = (unsigned char **)malloc(sizeof(unsigned char *)*numOutputTaskStatistics);
+ if (statisticsOutputBuffer == NULL) {
+ DPRINTF("Error in allocating memory for output buffer array\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(statisticsOutputBuffer, 0x00, sizeof(unsigned char *)*numOutputTaskStatistics);
+ resourceList.statisticsOutputBuffer = statisticsOutputBuffer;
+
+ for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
+ statisticsOutputBuffer[ii] = (unsigned char* )malloc(outputTaskStatisticsDesc[ii].size);
+ if (statisticsOutputBuffer[ii] == NULL) {
+ DPRINTF("Error in allocating output memory\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(statisticsOutputBuffer[ii], 0x00, outputTaskStatisticsDesc[ii].size);
+ }
+
+ // Allocate memory on GPU.
+ void** inputBufferGPU = (void **)malloc(sizeof(void *)*numInputTensors);
+ if (inputBufferGPU == NULL) {
+ DPRINTF("Error in allocating memory for input buffer GPU array\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(inputBufferGPU, 0x00, sizeof(void *)*numInputTensors);
+ resourceList.inputBufferGPU = inputBufferGPU;
+
+ for (uint32_t ii = 0; ii < numInputTensors; ii++) {
+ result = cudaMalloc(&(inputBufferGPU[ii]), inputTensorDesc[ii].size);
+ if (result != cudaSuccess)
+ {
+ DPRINTF("Error in allocating input memory on GPU\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ }
+
+ void** outputBufferGPU = (void **)malloc(sizeof(void *)*numOutputTensors);
+ if (outputBufferGPU == NULL) {
+ DPRINTF("Error in allocating memory for output buffer GPU array\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(outputBufferGPU, 0x00, sizeof(void *)*numOutputTensors);
+ resourceList.outputBufferGPU = outputBufferGPU;
+
+ for (uint32_t ii = 0; ii < numOutputTensors; ii++) {
+ result = cudaMalloc(&(outputBufferGPU[ii]), outputTensorDesc[ii].size);
+ if (result != cudaSuccess) {
+ DPRINTF("Error in allocating output memory on GPU\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ }
+
+ void** outputTaskStatisticsGPU = (void **)malloc(sizeof(void *)*numOutputTaskStatistics);
+ if (outputTaskStatisticsGPU == NULL) {
+ DPRINTF("Error in allocating memory for output task statistics GPU array\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(outputTaskStatisticsGPU, 0x00, sizeof(void *)*numOutputTaskStatistics);
+ resourceList.outputTaskStatisticsGPU = outputTaskStatisticsGPU;
+
+ for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
+ result = cudaMalloc(&(outputTaskStatisticsGPU[ii]), outputTaskStatisticsDesc[ii].size);
+ if (result != cudaSuccess) {
+ DPRINTF("Error in allocating task statistics memory on GPU\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ }
+
+ uint64_t** inputBufferRegisteredPtr = (uint64_t **)malloc(sizeof(uint64_t*)*numInputTensors);
+ uint64_t** outputBufferRegisteredPtr = (uint64_t **)malloc(sizeof(uint64_t*)*numOutputTensors);
+ uint64_t** outputTaskStatisticsRegisteredPtr = (uint64_t **)malloc(sizeof(uint64_t*)*numOutputTaskStatistics);
+
+ if ((inputBufferRegisteredPtr == NULL) || (outputBufferRegisteredPtr == NULL) || (outputTaskStatisticsRegisteredPtr == NULL)) {
+ if (inputBufferRegisteredPtr != NULL) {
+ free(inputBufferRegisteredPtr);
+ inputBufferRegisteredPtr = NULL;
+ }
+
+ if (outputBufferRegisteredPtr != NULL) {
+ free(outputBufferRegisteredPtr);
+ outputBufferRegisteredPtr = NULL;
+ }
+
+ if (outputTaskStatisticsRegisteredPtr != NULL) {
+ free(outputTaskStatisticsRegisteredPtr);
+ outputTaskStatisticsRegisteredPtr = NULL;
+ }
+
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ resourceList.inputBufferRegisteredPtr = inputBufferRegisteredPtr;
+ resourceList.outputBufferRegisteredPtr = outputBufferRegisteredPtr;
+ resourceList.outputTaskStatisticsRegisteredPtr = outputTaskStatisticsRegisteredPtr;
+
+ // Register the CUDA-allocated buffers.
+ for (uint32_t ii = 0; ii < numInputTensors; ii++) {
+ err = cudlaMemRegister(devHandle,
+ (uint64_t* )(inputBufferGPU[ii]),
+ inputTensorDesc[ii].size,
+ &(inputBufferRegisteredPtr[ii]),
+ 0);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in registering input memory = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ }
+ }
+
+ for (uint32_t ii = 0; ii < numOutputTensors; ii++) {
+ err = cudlaMemRegister(devHandle,
+ (uint64_t* )(outputBufferGPU[ii]),
+ outputTensorDesc[ii].size,
+ &(outputBufferRegisteredPtr[ii]),
+ 0);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in registering output memory = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ }
+ }
+
+ for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
+ err = cudlaMemRegister(devHandle,
+ (uint64_t* )(outputTaskStatisticsGPU[ii]),
+ outputTaskStatisticsDesc[ii].size,
+ &(outputTaskStatisticsRegisteredPtr[ii]),
+ CUDLA_TASK_STATISTICS);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in registering statistics output memory = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ }
+ }
+
+ DPRINTF("ALL MEMORY REGISTERED SUCCESSFULLY\n");
+
+ // Copy data from CPU buffers to GPU buffers.
+ for (uint32_t ii = 0; ii < numInputTensors; ii++) {
+ result = cudaMemcpyAsync(inputBufferGPU[ii], inputBuffer[ii], inputTensorDesc[ii].size, cudaMemcpyHostToDevice, stream);
+ if (result != cudaSuccess) {
+ DPRINTF("Error in enqueueing memcpy for input\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ }
+
+ for (uint32_t ii = 0; ii < numOutputTensors; ii++) {
+ result = cudaMemsetAsync(outputBufferGPU[ii], 0, outputTensorDesc[ii].size, stream);
+ if (result != cudaSuccess) {
+ DPRINTF("Error in enqueueing memset for output\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ }
+
+ for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
+ result = cudaMemsetAsync(outputTaskStatisticsGPU[ii], 0, outputTaskStatisticsDesc[ii].size, stream);
+ if (result != cudaSuccess) {
+ DPRINTF("Error in enqueueing memset for statistics output\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ }
+
+ uint64_t *outputStatisticsBufferRegisteredPtr[numOutputTensors + numOutputTaskStatistics] = {0};
+ uint32_t index = 0;
+ for (; index < numOutputTensors ; index++) {
+ outputStatisticsBufferRegisteredPtr[index] = ((outputBufferRegisteredPtr[index]));
+ }
+
+ for (uint32_t jj=0; jj < numOutputTaskStatistics ; jj++) {
+ outputStatisticsBufferRegisteredPtr[index++] = ((outputTaskStatisticsRegisteredPtr[jj]));
+ }
+
+ // Enqueue a cuDLA task.
+ cudlaTask task;
+ task.moduleHandle = moduleHandle;
+ task.outputTensor = (uint64_t * const*)&outputStatisticsBufferRegisteredPtr;
+
+ if(statSupport == 1) {
+ task.numOutputTensors = (numOutputTensors + numOutputTaskStatistics);
+ DPRINTF("Layerwise profiling is requested \n");
+ } else {
+ task.numOutputTensors = numOutputTensors;
+ DPRINTF("Layerwise profiling is not requested \n");
+ }
+
+ task.numInputTensors = numInputTensors;
+ task.inputTensor = inputBufferRegisteredPtr;
+ task.waitEvents = NULL;
+ task.signalEvents = NULL;
+
+ err = cudlaSubmitTask(devHandle, &task, 1, stream, 0);
+ if (err != cudlaSuccess) {
+ DPRINTF("no of output tensor %u \n",(task.numOutputTensors));
+ DPRINTF("Error in submitting task\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ DPRINTF("SUBMIT IS DONE !!!\n");
+
+ result = cudaStreamSynchronize(stream);
+ if (result != cudaSuccess) {
+ DPRINTF("Error in synchronizing stream = %s\n", cudaGetErrorName(result));
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ // Wait for stream operations to finish and bring output buffer to CPU.
+ for (uint32_t ii = 0; ii < numOutputTensors; ii++) {
+ result = cudaMemcpyAsync(outputBuffer[ii], outputBufferGPU[ii],
+ outputTensorDesc[ii].size, cudaMemcpyDeviceToHost, stream);
+ if (result != cudaSuccess) {
+ DPRINTF("Error in bringing result back to CPU\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ }
+
+ result = cudaStreamSynchronize(stream);
+ if (result != cudaSuccess) {
+ DPRINTF("Error in synchronizing stream\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ if(statSupport == 1) {
+ // copy statistics data to cpu
+ for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
+ result = cudaMemcpyAsync(statisticsOutputBuffer[ii], outputTaskStatisticsGPU[ii],
+ outputTaskStatisticsDesc[ii].size, cudaMemcpyDeviceToHost, stream);
+ if (result != cudaSuccess) {
+ DPRINTF("Error in bringing result back to CPU\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ }
+
+ result = cudaStreamSynchronize(stream);
+ if (result != cudaSuccess) {
+ DPRINTF("Error in synchronizing stream\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ // Compute the length of the filename prefix (argv[4]) into which statistics will be dumped
+ uint32_t index = 0;
+ if (argc == 5) {
+ while(argv[4][index]!='\0') {
+ index++;
+ }
+ }
+
+ const cudlaExternalEtbl* etbl = NULL;
+ if (cudlaGetExternalExportTable(&etbl,0) != cudlaSuccess) {
+ DPRINTF("Error in getting export table\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ void** csv = (void **)malloc(sizeof(void *)*numOutputTaskStatistics);
+ if (csv == NULL) {
+ DPRINTF("Error in allocating memory for csv stream\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(csv, 0x00, sizeof(void *)*numOutputTaskStatistics);
+ resourceList.csv = csv;
+
+ for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
+ cudlaTranslateCsvAttribute csvAttribute;
+ uint64_t csvStreamLength = 0;
+
+ err = etbl->etiTranslateStats(devHandle,statisticsOutputBuffer[ii],dlaFreqInMHz,ii,CUDLA_GET_CSV_LENGTH,&csvAttribute);
+ csv[ii] = (void* )malloc(csvAttribute.csvStreamLength);
+ csvStreamLength = csvAttribute.csvStreamLength;
+ DPRINTF("size for statistics buffer %u is %lu \n",ii,csvStreamLength);
+
+ if (csv[ii] == NULL) {
+ DPRINTF("Error in allocating memory for csv stream\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(csv[ii], 0x00, csvAttribute.csvStreamLength);
+
+ csvAttribute.csvStreamStats = csv[ii];
+ err = etbl->etiTranslateStats(devHandle,statisticsOutputBuffer[ii],dlaFreqInMHz,ii,CUDLA_GET_CSV_STATS,&csvAttribute);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in translating stats\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ if (argc == 5) {
+ sprintf(filename,"%s%u%s", argv[4],(ii+1),suffix);
+ fp = fopen(filename, "w+");
+ if (fp == NULL) {
+ DPRINTF("Cannot open file %s\n", filename);
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ uint32_t ret_val = fwrite(csv[ii],sizeof(char),csvStreamLength,fp);
+ if(ret_val != csvStreamLength) {
+ DPRINTF("number of elements written to file is %u \n", ret_val);
+ cleanUp(&resourceList);
+ return 1;
+ }
+ fclose(fp);
+ } else {
+ DPRINTF("%s \n",(char *)csv[ii]);
+ }
+ }
+ }
+
+ // Unregister the CUDA-allocated buffers.
+ for (uint32_t ii = 0; ii < numInputTensors; ii++) {
+ err = cudlaMemUnregister(devHandle,
+ (inputBufferRegisteredPtr[ii]));
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in unregistering input memory = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ }
+ }
+
+ for (uint32_t ii = 0; ii < numOutputTensors; ii++) {
+ err = cudlaMemUnregister(devHandle,
+ (outputBufferRegisteredPtr[ii]));
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in unregistering output memory = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ }
+ }
+
+ for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
+ err = cudlaMemUnregister(devHandle,
+ (outputTaskStatisticsRegisteredPtr[ii]));
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in unregistering task statistics memory = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ }
+ }
+
+ DPRINTF("ALL MEMORY UNREGISTERED SUCCESSFULLY\n");
+
+ result = cudaStreamDestroy(stream);
+ if (result != cudaSuccess) {
+ errPtr = cudaGetErrorName(result);
+ DPRINTF("Error in destroying cuda stream = %s\n", errPtr);
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ resourceList.stream = NULL;
+
+ err = cudlaModuleUnload(moduleHandle, 0);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in cudlaModuleUnload = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ } else {
+ DPRINTF("Successfully unloaded module\n");
+ }
+
+ resourceList.moduleHandle = NULL;
+
+ err = cudlaDestroyDevice(devHandle);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in cuDLA destroy device = %d\n", err);
+ return 1;
+ }
+ DPRINTF("Device destroyed successfully\n");
+
+ resourceList.devHandle = NULL;
+
+ cleanUp(&resourceList);
+
+ DPRINTF("cuDLALayerwiseStatsHybrid DONE !!!\n");
+
+ return 0;
+}
diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/c_cpp_properties.json b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/c_cpp_properties.json
new file mode 100644
index 00000000..f0066b0f
--- /dev/null
+++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/c_cpp_properties.json
@@ -0,0 +1,18 @@
+{
+ "configurations": [
+ {
+ "name": "Linux",
+ "includePath": [
+ "${workspaceFolder}/**",
+ "${workspaceFolder}/../../../Common"
+ ],
+ "defines": [],
+ "compilerPath": "/usr/local/cuda/bin/nvcc",
+ "cStandard": "gnu17",
+ "cppStandard": "gnu++14",
+ "intelliSenseMode": "linux-gcc-x64",
+ "configurationProvider": "ms-vscode.makefile-tools"
+ }
+ ],
+ "version": 4
+}
diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/extensions.json b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/extensions.json
new file mode 100644
index 00000000..c7eb54dc
--- /dev/null
+++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/extensions.json
@@ -0,0 +1,7 @@
+{
+ "recommendations": [
+ "nvidia.nsight-vscode-edition",
+ "ms-vscode.cpptools",
+ "ms-vscode.makefile-tools"
+ ]
+}
diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/launch.json b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/launch.json
new file mode 100644
index 00000000..a0aa79e8
--- /dev/null
+++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/launch.json
@@ -0,0 +1,10 @@
+{
+ "configurations": [
+ {
+ "name": "CUDA C++: Launch",
+ "type": "cuda-gdb",
+ "request": "launch",
+ "program": "${workspaceFolder}/cuDLALayerwiseStatsStandalone"
+ }
+ ]
+}
diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/tasks.json b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/tasks.json
new file mode 100644
index 00000000..4509aeb1
--- /dev/null
+++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/tasks.json
@@ -0,0 +1,15 @@
+{
+ "version": "2.0.0",
+ "tasks": [
+ {
+ "label": "sample",
+ "type": "shell",
+ "command": "make dbg=1",
+ "problemMatcher": ["$nvcc"],
+ "group": {
+ "kind": "build",
+ "isDefault": true
+ }
+ }
+ ]
+}
diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/Makefile b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/Makefile
new file mode 100644
index 00000000..aa97778f
--- /dev/null
+++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/Makefile
@@ -0,0 +1,403 @@
+################################################################################
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+#
+# Makefile project only supported on Mac OS X and Linux Platforms
+#
+################################################################################
+
+# Location of the CUDA Toolkit
+CUDA_PATH ?= /usr/local/cuda
+
+##############################
+# start deprecated interface #
+##############################
+ifeq ($(x86_64),1)
+ $(info WARNING - x86_64 variable has been deprecated)
+ $(info WARNING - please use TARGET_ARCH=x86_64 instead)
+ TARGET_ARCH ?= x86_64
+endif
+ifeq ($(ARMv7),1)
+ $(info WARNING - ARMv7 variable has been deprecated)
+ $(info WARNING - please use TARGET_ARCH=armv7l instead)
+ TARGET_ARCH ?= armv7l
+endif
+ifeq ($(aarch64),1)
+ $(info WARNING - aarch64 variable has been deprecated)
+ $(info WARNING - please use TARGET_ARCH=aarch64 instead)
+ TARGET_ARCH ?= aarch64
+endif
+ifeq ($(ppc64le),1)
+ $(info WARNING - ppc64le variable has been deprecated)
+ $(info WARNING - please use TARGET_ARCH=ppc64le instead)
+ TARGET_ARCH ?= ppc64le
+endif
+ifneq ($(GCC),)
+ $(info WARNING - GCC variable has been deprecated)
+ $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
+ HOST_COMPILER ?= $(GCC)
+endif
+ifneq ($(abi),)
+ $(error ERROR - abi variable has been removed)
+endif
+############################
+# end deprecated interface #
+############################
+
+# architecture
+HOST_ARCH := $(shell uname -m)
+TARGET_ARCH ?= $(HOST_ARCH)
+ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
+ ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
+ TARGET_SIZE := 64
+ else ifneq (,$(filter $(TARGET_ARCH),armv7l))
+ TARGET_SIZE := 32
+ endif
+ else
+ TARGET_SIZE := $(shell getconf LONG_BIT)
+ endif
+else
+ $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
+endif
+
+# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
+ifeq ($(HOST_ARCH),aarch64)
+ ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
+ HOST_ARCH := sbsa
+ TARGET_ARCH := sbsa
+ endif
+endif
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+ ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
+ $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
+ endif
+endif
+
+# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
+ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
+ TARGET_ARCH = armv7l
+endif
+
+# operating system
+HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
+TARGET_OS ?= $(HOST_OS)
+ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
+ $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
+endif
+
+# host compiler
+ifdef HOST_COMPILER
+ CUSTOM_HOST_COMPILER = 1
+endif
+
+ifeq ($(TARGET_OS),darwin)
+ ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
+ HOST_COMPILER ?= clang++
+ endif
+else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+ ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
+ ifeq ($(TARGET_OS),linux)
+ HOST_COMPILER ?= arm-linux-gnueabihf-g++
+ else ifeq ($(TARGET_OS),qnx)
+ ifeq ($(QNX_HOST),)
+ $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+ endif
+ ifeq ($(QNX_TARGET),)
+ $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+ endif
+ export QNX_HOST
+ export QNX_TARGET
+ HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
+ else ifeq ($(TARGET_OS),android)
+ HOST_COMPILER ?= arm-linux-androideabi-g++
+ endif
+ else ifeq ($(TARGET_ARCH),aarch64)
+ ifeq ($(TARGET_OS), linux)
+ HOST_COMPILER ?= aarch64-linux-gnu-g++
+ else ifeq ($(TARGET_OS),qnx)
+ ifeq ($(QNX_HOST),)
+ $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+ endif
+ ifeq ($(QNX_TARGET),)
+ $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+ endif
+ export QNX_HOST
+ export QNX_TARGET
+ HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
+ else ifeq ($(TARGET_OS), android)
+ HOST_COMPILER ?= aarch64-linux-android-clang++
+ endif
+ else ifeq ($(TARGET_ARCH),sbsa)
+ HOST_COMPILER ?= aarch64-linux-gnu-g++
+ else ifeq ($(TARGET_ARCH),ppc64le)
+ HOST_COMPILER ?= powerpc64le-linux-gnu-g++
+ endif
+endif
+HOST_COMPILER ?= g++
+NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
+
+# internal flags
+NVCCFLAGS := -m${TARGET_SIZE}
+CCFLAGS :=
+LDFLAGS :=
+
+# build flags
+ifeq ($(TARGET_OS),darwin)
+ LDFLAGS += -rpath $(CUDA_PATH)/lib
+ CCFLAGS += -arch $(HOST_ARCH)
+else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
+ LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
+ CCFLAGS += -mfloat-abi=hard
+else ifeq ($(TARGET_OS),android)
+ LDFLAGS += -pie
+ CCFLAGS += -fpie -fpic -fexceptions
+endif
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+ ifneq ($(TARGET_FS),)
+ GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+ ifeq ($(GCCVERSIONLTEQ46),1)
+ CCFLAGS += --sysroot=$(TARGET_FS)
+ endif
+ LDFLAGS += --sysroot=$(TARGET_FS)
+ LDFLAGS += -rpath-link=$(TARGET_FS)/lib
+ LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
+ LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
+ endif
+ endif
+ ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+ ifneq ($(TARGET_FS),)
+ GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+ ifeq ($(GCCVERSIONLTEQ46),1)
+ CCFLAGS += --sysroot=$(TARGET_FS)
+ endif
+ LDFLAGS += --sysroot=$(TARGET_FS)
+ LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
+ LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
+ LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
+ LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
+ LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
+ CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
+ CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
+ endif
+ endif
+ ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+ NVCCFLAGS += -D_QNX_SOURCE
+ NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
+ CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
+ LDFLAGS += -lsocket
+ LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
+ CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
+ ifdef TARGET_OVERRIDE
+ LDFLAGS += -lslog2
+ endif
+
+ ifneq ($(TARGET_FS),)
+ LDFLAGS += -L$(TARGET_FS)/usr/lib
+ CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
+ LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
+ CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
+ CCFLAGS += -I$(TARGET_FS)/../include
+ endif
+ endif
+endif
+
+ifdef TARGET_OVERRIDE # cuda toolkit targets override
+ NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
+endif
+
+# Install directory of different arch
+CUDA_INSTALL_TARGET_DIR :=
+ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+ CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+ CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
+ CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
+ CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
+ CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
+ CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+ CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
+else ifeq ($(TARGET_ARCH),ppc64le)
+ CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
+endif
+
+# Debug build flags
+ifeq ($(dbg),1)
+ NVCCFLAGS += -g -G
+ BUILD_TYPE := debug
+else
+ BUILD_TYPE := release
+endif
+
+ALL_CCFLAGS :=
+ALL_CCFLAGS += $(NVCCFLAGS)
+ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
+
+SAMPLE_ENABLED := 1
+
+# This sample is not supported on Linux x86_64
+ifeq ($(TARGET_OS),linux)
+ ifeq ($(TARGET_ARCH),x86_64)
+ $(info >>> WARNING - cuDLALayerwiseStatsStandalone is not supported on Linux x86_64 - waiving sample <<<)
+ SAMPLE_ENABLED := 0
+ endif
+endif
+
+# This sample is not supported on Mac OSX
+ifeq ($(TARGET_OS),darwin)
+ $(info >>> WARNING - cuDLALayerwiseStatsStandalone is not supported on Mac OSX - waiving sample <<<)
+ SAMPLE_ENABLED := 0
+endif
+
+# This sample is not supported on ARMv7
+ifeq ($(TARGET_ARCH),armv7l)
+ $(info >>> WARNING - cuDLALayerwiseStatsStandalone is not supported on ARMv7 - waiving sample <<<)
+ SAMPLE_ENABLED := 0
+endif
+
+# This sample is not supported on sbsa
+ifeq ($(TARGET_ARCH),sbsa)
+ $(info >>> WARNING - cuDLALayerwiseStatsStandalone is not supported on sbsa - waiving sample <<<)
+ SAMPLE_ENABLED := 0
+endif
+
+ALL_LDFLAGS :=
+ALL_LDFLAGS += $(ALL_CCFLAGS)
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
+
+# Common includes and paths for CUDA
+INCLUDES := -I../../../Common
+LIBRARIES :=
+
+################################################################################
+
+# Makefile include to help find NVSCI Libraries
+include ./findnvsci.mk
+
+#Detect if installed version of GCC supports required C++11
+ifeq ($(TARGET_OS),linux)
+ empty :=
+ space := $(empty) $(empty)
+ GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`)
+#Create version number without "."
+ GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.)
+ GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.)
+ GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.)
+# Make sure the version number has at least 3 decimals
+ GCCVERSION += 00
+# Remove spaces from the version number
+ GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION))
+#$(warning $(GCCVERSION))
+
+ IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 47000)
+ ifneq ($(CUSTOM_HOST_COMPILER), 1)
+ ifeq ($(IS_MIN_VERSION), 1)
+ $(info >>> GCC Version is greater or equal to 4.7.0 <<<)
+ else
+ $(info >>> Waiving build. Minimum GCC version required is 4.7.0<<<)
+ SAMPLE_ENABLED := 0
+ endif
+ else
+ $(warning >>> Custom HOST_COMPILER set; skipping GCC version check. This may lead to unintended behavior. Please note the minimum equivalent GCC version is 4.7.0 <<<)
+ endif
+endif
+
+# Gencode arguments
+ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
+SMS ?= 53 61 70 72 75 80 86 87 90
+else
+SMS ?= 50 52 60 61 70 75 80 86 89 90
+endif
+
+ifeq ($(SMS),)
+$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
+SAMPLE_ENABLED := 0
+endif
+
+ifeq ($(GENCODE_FLAGS),)
+# Generate SASS code for each SM architecture listed in $(SMS)
+$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
+
+# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
+HIGHEST_SM := $(lastword $(sort $(SMS)))
+ifneq ($(HIGHEST_SM),)
+GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
+endif
+endif
+
+ALL_CCFLAGS += --std=c++11 --threads 0
+
+LIBRARIES += -lcudla -lnvscibuf -lnvscisync
+
+ifeq ($(SAMPLE_ENABLED),0)
+EXEC ?= @echo "[@]"
+endif
+
+################################################################################
+
+# Target rules
+all: build
+
+build: cuDLALayerwiseStatsStandalone
+
+check.deps:
+ifeq ($(SAMPLE_ENABLED),0)
+ @echo "Sample will be waived due to the above missing dependencies"
+else
+ @echo "Sample is ready - all dependencies have been met"
+endif
+
+main.o:main.cpp
+ $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+cuDLALayerwiseStatsStandalone: main.o
+ $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
+ $(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+ $(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+
+run: build
+ $(EXEC) ./cuDLALayerwiseStatsStandalone
+
+testrun: build
+
+clean:
+ rm -f cuDLALayerwiseStatsStandalone main.o
+ rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/cuDLALayerwiseStatsStandalone
+
+clobber: clean
diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/NsightEclipse.xml b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/NsightEclipse.xml
new file mode 100644
index 00000000..341a6c76
--- /dev/null
+++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/NsightEclipse.xml
@@ -0,0 +1,65 @@
+
+
+
+ cuDLALayerwiseStatsStandalone
+
+ --std=c++11
+
+
+ whole
+
+ ./
+ ../
+ ../../../Common
+
+
+ cuDLA
+ Data Parallel Algorithms
+ Image Processing
+
+
+ CUDA
+ CPP11
+
+
+ cudla
+ nvscibuf
+ nvscisync
+
+
+
+ true
+ main.cpp
+
+ NVSCI
+
+
+ 1:CUDA Advanced Topics
+ 1:cuDLA
+
+ sm60
+ sm61
+ sm70
+ sm72
+ sm75
+ sm80
+ sm86
+ sm87
+ sm89
+ sm90
+
+
+ aarch64
+ linux
+
+
+ aarch64
+ qnx
+
+
+
+ 6.0
+
+ cuDLA Layerwise Statistics Standalone Mode
+ exe
+
diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/README.md b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/README.md
new file mode 100644
index 00000000..ec12c923
--- /dev/null
+++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/README.md
@@ -0,0 +1,61 @@
+# cuDLALayerwiseStatsStandalone - cuDLA Layerwise Statistics Standalone Mode
+
+## Description
+
+This sample is used to provide layerwise statistics to the application in cuDLA standalone mode where DLA is programmed without using CUDA.
+
+## Key Concepts
+
+cuDLA, Data Parallel Algorithms, Image Processing
+
+## Supported SM Architectures
+
+[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus) [SM 8.9 ](https://developer.nvidia.com/cuda-gpus) [SM 9.0 ](https://developer.nvidia.com/cuda-gpus)
+
+## Supported OSes
+
+Linux, QNX
+
+## Supported CPU Architecture
+
+aarch64
+
+## CUDA APIs involved
+
+## Dependencies needed to build/run
+[NVSCI](../../../README.md#nvsci)
+
+## Prerequisites
+
+Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Make sure the dependencies mentioned in [Dependencies](#dependencies-needed-to-buildrun) section above are installed.
+
+## Build and Run
+
+### Linux
+The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+The samples makefiles can take advantage of certain options:
+* **TARGET_ARCH=** - cross-compile targeting a specific architecture. Allowed architectures are aarch64.
+ By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=aarch64`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
+* **dbg=1** - build with debug symbols
+ ```
+ $ make dbg=1
+ ```
+* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
+ ```
+ $ make SMS="50 60"
+ ```
+
+* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
+```
+ $ make HOST_COMPILER=g++
+```
+
+## References (for more details)
+
diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/findnvsci.mk b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/findnvsci.mk
new file mode 100644
index 00000000..0d6d157c
--- /dev/null
+++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/findnvsci.mk
@@ -0,0 +1,144 @@
+################################################################################
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################
+# findnvsci.mk is used to find the NVSCI Libraries and headers
+#
+################################################################################
+
+# Determine OS platform and unix distribution
+ifeq ("$(TARGET_OS)","linux")
+ # first search lsb_release
+ DISTRO = $(shell lsb_release -i -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
+ ifeq ("$(DISTRO)","")
+ # second search and parse /etc/issue
+ DISTRO = $(shell more /etc/issue | awk '{print $$1}' | sed '1!d' | sed -e "/^$$/d" 2>/dev/null | tr "[:upper:]" "[:lower:]")
+ # ensure data from /etc/issue is valid
+ ifeq (,$(filter $(DISTRO),ubuntu fedora red rhel centos suse))
+ DISTRO =
+ endif
+ ifeq ("$(DISTRO)","")
+ # third, we can search in /etc/os-release or /etc/{distro}-release
+ DISTRO = $(shell awk '/ID/' /etc/*-release | sed 's/ID=//' | grep -v "VERSION" | grep -v "ID" | grep -v "DISTRIB")
+ endif
+ endif
+endif
+
+ifeq ("$(TARGET_OS)","linux")
+ # $(info) >> findegl.mk -> LINUX path <<<)
+ # Each set of Linux Distros have different paths for where to find their OpenGL libraries reside
+ UBUNTU = $(shell echo $(DISTRO) | grep -i ubuntu >/dev/null 2>&1; echo $$?)
+ FEDORA = $(shell echo $(DISTRO) | grep -i fedora >/dev/null 2>&1; echo $$?)
+ RHEL = $(shell echo $(DISTRO) | grep -i 'red\|rhel' >/dev/null 2>&1; echo $$?)
+ CENTOS = $(shell echo $(DISTRO) | grep -i centos >/dev/null 2>&1; echo $$?)
+ SUSE = $(shell echo $(DISTRO) | grep -i 'suse\|sles' >/dev/null 2>&1; echo $$?)
+ KYLIN = $(shell echo $(DISTRO) | grep -i kylin >/dev/null 2>&1; echo $$?)
+ ifeq ("$(UBUNTU)","0")
+ ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
+ GLPATH := /usr/arm-linux-gnueabihf/lib
+ GLLINK := -L/usr/arm-linux-gnueabihf/lib
+ ifneq ($(TARGET_FS),)
+ GLPATH += $(TARGET_FS)/usr/lib/arm-linux-gnueabihf
+ GLLINK += -L$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
+ endif
+ else ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-aarch64)
+ GLPATH := /usr/aarch64-linux-gnu/lib
+ GLLINK := -L/usr/aarch64-linux-gnu/lib
+ ifneq ($(TARGET_FS),)
+ GLPATH += $(TARGET_FS)/usr/lib
+ GLPATH += $(TARGET_FS)/usr/lib/aarch64-linux-gnu
+ GLLINK += -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
+ endif
+ else
+ UBUNTU_PKG_NAME = $(shell which dpkg >/dev/null 2>&1 && dpkg -l 'nvidia-*' | grep '^ii' | awk '{print $$2}' | head -1)
+ ifneq ("$(UBUNTU_PKG_NAME)","")
+ GLPATH ?= /usr/lib/$(UBUNTU_PKG_NAME)
+ GLLINK ?= -L/usr/lib/$(UBUNTU_PKG_NAME)
+ endif
+ DFLT_PATH ?= /usr/lib
+ endif
+ endif
+ ifeq ("$(SUSE)","0")
+ GLPATH ?= /usr/X11R6/lib64
+ GLLINK ?= -L/usr/X11R6/lib64
+ DFLT_PATH ?= /usr/lib64
+ endif
+ ifeq ("$(FEDORA)","0")
+ GLPATH ?= /usr/lib64/nvidia
+ GLLINK ?= -L/usr/lib64/nvidia
+ DFLT_PATH ?= /usr/lib64
+ endif
+ ifeq ("$(RHEL)","0")
+ GLPATH ?= /usr/lib64/nvidia
+ GLLINK ?= -L/usr/lib64/nvidia
+ DFLT_PATH ?= /usr/lib64
+ endif
+ ifeq ("$(CENTOS)","0")
+ GLPATH ?= /usr/lib64/nvidia
+ GLLINK ?= -L/usr/lib64/nvidia
+ DFLT_PATH ?= /usr/lib64
+ endif
+ ifeq ("$(KYLIN)","0")
+ GLPATH ?= /usr/lib64/nvidia
+ GLLINK ?= -L/usr/lib64/nvidia
+ DFLT_PATH ?= /usr/lib64
+ endif
+
+ NVSCIBUFLIB := $(shell find -L $(GLPATH) $(DFLT_PATH) -name libnvscibuf.so -print 2>/dev/null)
+ NVSCISYNCLIB := $(shell find -L $(GLPATH) $(DFLT_PATH) -name libnvscisync.so -print 2>/dev/null)
+
+ ifeq ("$(NVSCIBUFLIB)","")
+ $(info >>> WARNING - libnvscibuf.so not found, Waiving the sample <<<)
+ SAMPLE_ENABLED := 0
+ endif
+
+ ifeq ("$(NVSCISYNCLIB)","")
+ $(info >>> WARNING - libnvscisync.so not found, Waiving the sample <<<)
+ SAMPLE_ENABLED := 0
+ endif
+
+ HEADER_SEARCH_PATH ?= $(TARGET_FS)/usr/include
+ ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
+ HEADER_SEARCH_PATH += /usr/arm-linux-gnueabihf/include
+ else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-aarch64-linux)
+ HEADER_SEARCH_PATH += /usr/aarch64-linux-gnu/include
+ endif
+
+ NVSCIBUFHEADER := $(shell find -L $(HEADER_SEARCH_PATH) -name nvscibuf.h -print 2>/dev/null)
+ NVSCISYNCHEADER := $(shell find -L $(HEADER_SEARCH_PATH) -name nvscisync.h -print 2>/dev/null)
+
+ ifeq ("$(NVSCIBUFHEADER)","")
+ $(info >>> WARNING - nvscibuf.h not found, Waiving the sample <<<)
+ SAMPLE_ENABLED := 0
+ endif
+ ifeq ("$(NVSCISYNCHEADER)","")
+ $(info >>> WARNING - nvscisync.h not found, Waiving the sample <<<)
+ SAMPLE_ENABLED := 0
+ endif
+else
+endif
+
diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/main.cpp b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/main.cpp
new file mode 100644
index 00000000..639e7889
--- /dev/null
+++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/main.cpp
@@ -0,0 +1,1348 @@
+/* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of NVIDIA CORPORATION nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "cudla.h"
+#include "nvscierror.h"
+#include "nvscibuf.h"
+#include "nvscisync.h"
+#include "cudlaExternalEtbl.hpp"
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <sstream>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#define MAX_FILENAME_LEN 200
+#define RESERVED_SUFFIX_LEN 10
+
+#define DPRINTF(...) printf(__VA_ARGS__)
+
+static void printTensorDesc(cudlaModuleTensorDescriptor* tensorDesc) { // Debug dump of every field of one cuDLA tensor descriptor.
+ DPRINTF("\tTENSOR NAME : %s\n", tensorDesc->name);
+ DPRINTF("\tsize: %lu\n", tensorDesc->size);
+
+ DPRINTF("\tdims: [%lu, %lu, %lu, %lu]\n", // printed in [n, c, h, w] order
+ tensorDesc->n,
+ tensorDesc->c,
+ tensorDesc->h,
+ tensorDesc->w);
+
+ DPRINTF("\tdata fmt: %d\n", tensorDesc->dataFormat);
+ DPRINTF("\tdata type: %d\n", tensorDesc->dataType);
+ DPRINTF("\tdata category: %d\n", tensorDesc->dataCategory);
+ DPRINTF("\tpixel fmt: %d\n", tensorDesc->pixelFormat);
+ DPRINTF("\tpixel mapping: %d\n", tensorDesc->pixelMapping);
+ DPRINTF("\tstride[0]: %d\n", tensorDesc->stride[0]); // enum/stride fields printed as raw ints; see cudla.h for meanings
+ DPRINTF("\tstride[1]: %d\n", tensorDesc->stride[1]);
+ DPRINTF("\tstride[2]: %d\n", tensorDesc->stride[2]);
+ DPRINTF("\tstride[3]: %d\n", tensorDesc->stride[3]);
+}
+
+typedef struct { // Bag of every resource main() allocates, so cleanUp() can release them all on any exit path.
+ cudlaDevHandle devHandle; // from cudlaCreateDevice (CUDLA_STANDALONE)
+ cudlaModule moduleHandle; // from cudlaModuleLoadFromMemory
+ unsigned char* loadableData; // malloc'd copy of the DLA loadable file
+ uint32_t numInputTensors;
+ uint32_t numOutputTensors;
+ uint32_t numOutputTaskStatistics; // count of layerwise-statistics output tensors
+ unsigned char** inputBuffer; // per-tensor malloc'd CPU staging buffers
+ unsigned char** outputBuffer;
+ unsigned char** statisticsOutputBuffer;
+ cudlaModuleTensorDescriptor* inputTensorDesc; // malloc'd descriptor arrays queried via cudlaModuleGetAttributes
+ cudlaModuleTensorDescriptor* outputTensorDesc;
+ cudlaModuleTensorDescriptor* outputTaskStatisticsDesc;
+ NvSciBufObj* inputBufObj; // NvSciBuf objects backing each tensor
+ NvSciBufObj* outputBufObj;
+ NvSciBufObj* statisticsBufObj;
+ NvSciBufModule bufModule;
+ NvSciBufAttrList* inputAttrList; // unreconciled / reconciled / conflict lists per input tensor
+ NvSciBufAttrList* reconciledInputAttrList;
+ NvSciBufAttrList* inputConflictList;
+ NvSciBufAttrList* outputAttrList; // same triple for output tensors
+ NvSciBufAttrList* reconciledOutputAttrList;
+ NvSciBufAttrList* outputConflictList;
+ NvSciSyncObj syncObj;
+ NvSciSyncModule syncModule;
+ NvSciSyncCpuWaitContext nvSciCtx; // CPU-side wait context for fence waits
+ NvSciSyncAttrList waiterAttrListObj; // CPU waiter side
+ NvSciSyncAttrList signalerAttrListObj; // DLA signaler side
+ NvSciSyncAttrList nvSciSyncConflictListObj;
+ NvSciSyncAttrList nvSciSyncReconciledListObj;
+ NvSciBufAttrList* statisticsOutputAttrList; // attr-list triple for statistics tensors
+ NvSciBufAttrList* reconciledStatisticsOutputAttrList;
+ NvSciBufAttrList* statisticsOutputConflictList;
+ uint64_t** inputBufObjRegPtr; // device pointers returned when registering NvSciBuf objects with cuDLA
+ uint64_t** outputBufObjRegPtr;
+ uint64_t** statisticsBufObjRegPtr;
+ uint64_t** devPtrs; // flattened pointer array handed to the task submission
+ cudlaSignalEvents* signalEvents; // holds eofFences array (freed together)
+ NvSciSyncFence eofFence; // end-of-frame fence; cleared (not freed) in cleanUp
+ void **csv; // per-statistics-tensor CSV output buffers
+} ResourceList;
+
+void cleanUp(ResourceList* resourceList);
+
+void cleanUp(ResourceList* resourceList) { // Idempotent teardown: every branch NULLs what it frees, so this is safe to call after any partial initialization and safe to call twice.
+ uint32_t ii = 0;
+
+ if (resourceList->inputTensorDesc != NULL) {
+ free(resourceList->inputTensorDesc);
+ resourceList->inputTensorDesc = NULL;
+ }
+ if (resourceList->outputTensorDesc != NULL) {
+ free(resourceList->outputTensorDesc);
+ resourceList->outputTensorDesc = NULL;
+ }
+
+ if (resourceList->outputTaskStatisticsDesc != NULL) {
+ free(resourceList->outputTaskStatisticsDesc);
+ resourceList->outputTaskStatisticsDesc = NULL;
+ }
+
+ if (resourceList->loadableData != NULL) {
+ free(resourceList->loadableData);
+ resourceList->loadableData = NULL;
+ }
+
+ if (resourceList->moduleHandle != NULL) { // module is unloaded before its device is destroyed
+ cudlaModuleUnload(resourceList->moduleHandle, 0);
+ resourceList->moduleHandle = NULL;
+ }
+
+ if (resourceList->devHandle != NULL) {
+ cudlaDestroyDevice(resourceList->devHandle);
+ resourceList->devHandle = NULL;
+ }
+
+ if (resourceList->inputBufObj != NULL) { // NOTE(review): the BufObj element objects are freed, but the inputBufObj/outputBufObj/statisticsBufObj arrays themselves are never free()d here — verify against their allocation site (possible leak).
+ for (ii = 0; ii < resourceList->numInputTensors; ii++) {
+ if((resourceList->inputBufObj)[ii] != NULL) {
+ NvSciBufObjFree((resourceList->inputBufObj)[ii]);
+ (resourceList->inputBufObj)[ii] = NULL;
+ }
+ }
+ }
+
+ if (resourceList->outputBufObj != NULL) {
+ for (ii = 0; ii < resourceList->numOutputTensors; ii++) {
+ if((resourceList->outputBufObj)[ii] != NULL) {
+ NvSciBufObjFree((resourceList->outputBufObj)[ii]);
+ (resourceList->outputBufObj)[ii] = NULL;
+ }
+ }
+ }
+
+ if (resourceList->statisticsBufObj != NULL) {
+ for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) {
+ if((resourceList->statisticsBufObj)[ii] != NULL) {
+ NvSciBufObjFree((resourceList->statisticsBufObj)[ii]);
+ (resourceList->statisticsBufObj)[ii] = NULL;
+ }
+ }
+ }
+
+ if (resourceList->inputBuffer != NULL) { // staging buffers: free each element, then the pointer array
+ for (ii = 0; ii < resourceList->numInputTensors; ii++) {
+ if ((resourceList->inputBuffer)[ii] != NULL) {
+ free((resourceList->inputBuffer)[ii]);
+ (resourceList->inputBuffer)[ii] = NULL;
+ }
+ }
+ free(resourceList->inputBuffer);
+ resourceList->inputBuffer = NULL;
+ }
+
+ if (resourceList->outputBuffer != NULL) {
+ for (ii = 0; ii < resourceList->numOutputTensors; ii++) {
+ if ((resourceList->outputBuffer)[ii] != NULL) {
+ free((resourceList->outputBuffer)[ii]);
+ (resourceList->outputBuffer)[ii] = NULL;
+ }
+ }
+ free(resourceList->outputBuffer);
+ resourceList->outputBuffer = NULL;
+ }
+
+ if (resourceList->statisticsOutputBuffer != NULL) {
+ for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) {
+ if ((resourceList->statisticsOutputBuffer)[ii] != NULL) {
+ free((resourceList->statisticsOutputBuffer)[ii]);
+ (resourceList->statisticsOutputBuffer)[ii] = NULL;
+ }
+ }
+ free(resourceList->statisticsOutputBuffer);
+ resourceList->statisticsOutputBuffer = NULL;
+ }
+
+ if (resourceList->csv != NULL) { // one CSV buffer per statistics tensor
+ for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) {
+ if ((resourceList->csv)[ii] != NULL) {
+ free((resourceList->csv)[ii]);
+ (resourceList->csv)[ii] = NULL;
+ }
+ }
+ free(resourceList->csv);
+ resourceList->csv = NULL;
+ }
+
+ if (resourceList->reconciledInputAttrList != NULL) { // attr lists: reconciled, conflict, then unreconciled, per tensor class
+ for (ii = 0; ii < resourceList->numInputTensors; ii++) {
+ if((resourceList->reconciledInputAttrList)[ii] != NULL) {
+ NvSciBufAttrListFree((resourceList->reconciledInputAttrList)[ii]);
+ (resourceList->reconciledInputAttrList)[ii] = NULL;
+ }
+ }
+ free(resourceList->reconciledInputAttrList);
+ resourceList->reconciledInputAttrList = NULL;
+ }
+
+ if (resourceList->inputConflictList != NULL) {
+ for (ii = 0; ii < resourceList->numInputTensors; ii++) {
+ if((resourceList->inputConflictList)[ii] != NULL) {
+ NvSciBufAttrListFree((resourceList->inputConflictList)[ii]);
+ (resourceList->inputConflictList)[ii] = NULL;
+ }
+ }
+ free(resourceList->inputConflictList);
+ resourceList->inputConflictList = NULL;
+ }
+
+ if (resourceList->inputAttrList != NULL) {
+ for (ii = 0; ii < resourceList->numInputTensors; ii++) {
+ if((resourceList->inputAttrList)[ii] != NULL) {
+ NvSciBufAttrListFree((resourceList->inputAttrList)[ii]);
+ (resourceList->inputAttrList)[ii] = NULL;
+ }
+ }
+ free(resourceList->inputAttrList);
+ resourceList->inputAttrList = NULL;
+ }
+
+ if (resourceList->reconciledOutputAttrList != NULL) {
+ for (ii = 0; ii < resourceList->numOutputTensors; ii++) {
+ if((resourceList->reconciledOutputAttrList)[ii] != NULL) {
+ NvSciBufAttrListFree((resourceList->reconciledOutputAttrList)[ii]);
+ (resourceList->reconciledOutputAttrList)[ii] = NULL;
+ }
+ }
+ free(resourceList->reconciledOutputAttrList);
+ resourceList->reconciledOutputAttrList = NULL;
+ }
+
+ if (resourceList->outputConflictList != NULL) {
+ for (ii = 0; ii < resourceList->numOutputTensors; ii++) {
+ if((resourceList->outputConflictList)[ii] != NULL) {
+ NvSciBufAttrListFree((resourceList->outputConflictList)[ii]);
+ (resourceList->outputConflictList)[ii] = NULL;
+ }
+ }
+ free(resourceList->outputConflictList);
+ resourceList->outputConflictList = NULL;
+ }
+
+ if (resourceList->outputAttrList != NULL) {
+ for (ii = 0; ii < resourceList->numOutputTensors; ii++) {
+ if((resourceList->outputAttrList)[ii] != NULL) {
+ NvSciBufAttrListFree((resourceList->outputAttrList)[ii]);
+ (resourceList->outputAttrList)[ii] = NULL;
+ }
+ }
+ free(resourceList->outputAttrList);
+ resourceList->outputAttrList = NULL;
+ }
+
+ if (resourceList->reconciledStatisticsOutputAttrList != NULL) {
+ for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) {
+ if((resourceList->reconciledStatisticsOutputAttrList)[ii] != NULL) {
+ NvSciBufAttrListFree((resourceList->reconciledStatisticsOutputAttrList)[ii]);
+ (resourceList->reconciledStatisticsOutputAttrList)[ii] = NULL;
+ }
+ }
+ free(resourceList->reconciledStatisticsOutputAttrList);
+ resourceList->reconciledStatisticsOutputAttrList = NULL;
+ }
+
+ if (resourceList->statisticsOutputConflictList != NULL) {
+ for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) {
+ if((resourceList->statisticsOutputConflictList)[ii] != NULL) {
+ NvSciBufAttrListFree((resourceList->statisticsOutputConflictList)[ii]);
+ (resourceList->statisticsOutputConflictList)[ii] = NULL;
+ }
+ }
+ free(resourceList->statisticsOutputConflictList);
+ resourceList->statisticsOutputConflictList = NULL;
+ }
+
+ if (resourceList->statisticsOutputAttrList != NULL) {
+ for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) {
+ if((resourceList->statisticsOutputAttrList)[ii] != NULL) {
+ NvSciBufAttrListFree((resourceList->statisticsOutputAttrList)[ii]);
+ (resourceList->statisticsOutputAttrList)[ii] = NULL;
+ }
+ }
+ free(resourceList->statisticsOutputAttrList);
+ resourceList->statisticsOutputAttrList = NULL;
+ }
+
+ if (resourceList->outputBufObjRegPtr != NULL) { // registration-pointer arrays: only the arrays are freed; the pointed-to registrations are owned by cuDLA
+ free(resourceList->outputBufObjRegPtr);
+ resourceList->outputBufObjRegPtr = NULL;
+ }
+
+ if (resourceList->statisticsBufObjRegPtr != NULL) {
+ free(resourceList->statisticsBufObjRegPtr);
+ resourceList->statisticsBufObjRegPtr = NULL;
+ }
+
+ if (resourceList->inputBufObjRegPtr != NULL) {
+ free(resourceList->inputBufObjRegPtr);
+ resourceList->inputBufObjRegPtr = NULL;
+ }
+
+ if (resourceList->bufModule != NULL) { // module closed only after all its buf objects / attr lists are freed
+ NvSciBufModuleClose(resourceList->bufModule);
+ resourceList->bufModule = NULL;
+ }
+
+ NvSciSyncFenceClear(&(resourceList->eofFence)); // fence cleared before its sync object is freed
+ if (resourceList->syncObj != NULL) {
+ NvSciSyncObjFree(resourceList->syncObj);
+ resourceList->syncObj = NULL;
+ }
+
+ if (resourceList->nvSciSyncConflictListObj != NULL) {
+ NvSciSyncAttrListFree(resourceList->nvSciSyncConflictListObj);
+ resourceList->nvSciSyncConflictListObj = NULL;
+ }
+
+ if (resourceList->nvSciSyncReconciledListObj != NULL) {
+ NvSciSyncAttrListFree(resourceList->nvSciSyncReconciledListObj);
+ resourceList->nvSciSyncReconciledListObj = NULL;
+ }
+
+ if (resourceList->signalerAttrListObj != NULL) {
+ NvSciSyncAttrListFree(resourceList->signalerAttrListObj);
+ resourceList->signalerAttrListObj = NULL;
+ }
+
+ if (resourceList->waiterAttrListObj != NULL) {
+ NvSciSyncAttrListFree(resourceList->waiterAttrListObj);
+ resourceList->waiterAttrListObj = NULL;
+ }
+
+ if (resourceList->nvSciCtx != NULL) {
+ NvSciSyncCpuWaitContextFree(resourceList->nvSciCtx);
+ resourceList->nvSciCtx = NULL;
+ }
+
+ if (resourceList->syncModule != NULL) { // sync module closed last of the NvSciSync objects
+ NvSciSyncModuleClose(resourceList->syncModule);
+ resourceList->syncModule = NULL;
+ }
+
+ if (resourceList->signalEvents != NULL) { // eofFences array is owned by signalEvents, freed together
+ if (resourceList->signalEvents->eofFences != NULL) {
+ free(resourceList->signalEvents->eofFences);
+ resourceList->signalEvents->eofFences = NULL;
+ }
+ free(resourceList->signalEvents);
+ resourceList->signalEvents = NULL;
+ }
+
+ if (resourceList->devPtrs != NULL) {
+ free(resourceList->devPtrs);
+ resourceList->devPtrs = NULL;
+ }
+
+ resourceList->numInputTensors = 0; // zero counts so a second call's loops iterate zero times
+ resourceList->numOutputTensors = 0;
+ resourceList->numOutputTaskStatistics = 0;
+}
+
+cudlaStatus createAndSetAttrList(NvSciBufModule module,
+ uint64_t bufSize,
+ NvSciBufAttrList *attrList);
+
+
+cudlaStatus createAndSetAttrList(NvSciBufModule module, // Creates *attrList on 'module' and populates it to describe a 1-D Int8 tensor of bufSize bytes, CPU-accessible, read-write. Returns cudlaErrorNvSci on any NvSci failure; caller owns (and must free) *attrList on success.
+ uint64_t bufSize,
+ NvSciBufAttrList *attrList) {
+ cudlaStatus status = cudlaSuccess;
+ NvSciError sciStatus = NvSciError_Success;
+
+ sciStatus = NvSciBufAttrListCreate(module, attrList);
+ if (sciStatus != NvSciError_Success) {
+ status = cudlaErrorNvSci;
+ DPRINTF("Error in creating NvSciBuf attribute list\n");
+ return status;
+ }
+
+ // TODO: Refactor into multiple dimensions
+ bool needCpuAccess = true;
+ NvSciBufAttrValAccessPerm perm = NvSciBufAccessPerm_ReadWrite;
+ uint32_t dimcount = 1; // single dimension: the buffer is treated as a flat byte tensor
+ uint64_t sizes[] = {bufSize};
+ uint32_t alignment[] = {1}; // no per-dim alignment constraint
+ uint32_t dataType = NvSciDataType_Int8; // NOTE(review): stored as uint32_t rather than NvSciBufAttrValDataType — confirm the attr key accepts a 4-byte value
+ NvSciBufType type = NvSciBufType_Tensor;
+ uint64_t baseAddrAlign = 512; // 512-byte base-address alignment — presumably a DLA requirement; TODO confirm
+
+ NvSciBufAttrKeyValuePair setAttrs[] = {
+ {.key = NvSciBufGeneralAttrKey_Types,
+ .value = &type,
+ .len = sizeof(type)},
+ {.key = NvSciBufTensorAttrKey_DataType,
+ .value = &dataType,
+ .len = sizeof(dataType)},
+ {.key = NvSciBufTensorAttrKey_NumDims,
+ .value = &dimcount,
+ .len = sizeof(dimcount)},
+ {.key = NvSciBufTensorAttrKey_SizePerDim,
+ .value = &sizes,
+ .len = sizeof(sizes)},
+ {.key = NvSciBufTensorAttrKey_AlignmentPerDim,
+ .value = &alignment,
+ .len = sizeof(alignment)},
+ {.key = NvSciBufTensorAttrKey_BaseAddrAlign,
+ .value = &baseAddrAlign,
+ .len = sizeof(baseAddrAlign)},
+ {.key = NvSciBufGeneralAttrKey_RequiredPerm,
+ .value = &perm,
+ .len = sizeof(perm)},
+ {.key = NvSciBufGeneralAttrKey_NeedCpuAccess,
+ .value = &needCpuAccess,
+ .len = sizeof(needCpuAccess)}}; // CPU mapping needed so the app can memcpy tensor data in/out
+ size_t length = sizeof(setAttrs) / sizeof(NvSciBufAttrKeyValuePair);
+
+ sciStatus = NvSciBufAttrListSetAttrs(*attrList, setAttrs, length);
+ if (sciStatus != NvSciError_Success)
+ {
+ status = cudlaErrorNvSci;
+ DPRINTF("Error in setting NvSciBuf attribute list\n"); // NOTE(review): *attrList is not freed on this path; caller/cleanUp is expected to free it
+ return status;
+ }
+
+ return status;
+}
+
+NvSciError fillCpuWaiterAttrList(NvSciSyncAttrList list);
+
+NvSciError fillCpuWaiterAttrList(NvSciSyncAttrList list) { // Configure 'list' as a CPU waiter: CPU access enabled, wait-only permission. Returns NvSciSyncAttrListSetAttrs' status.
+ bool cpuWaiter = true;
+ NvSciSyncAttrKeyValuePair keyValue[2];
+ memset(keyValue, 0, sizeof(keyValue));
+ keyValue[0].attrKey = NvSciSyncAttrKey_NeedCpuAccess; // CPU will wait on the fence directly
+ keyValue[0].value = (void*) &cpuWaiter;
+ keyValue[0].len = sizeof(cpuWaiter);
+ NvSciSyncAccessPerm cpuPerm = NvSciSyncAccessPerm_WaitOnly; // waiter side only; the DLA is the signaler
+ keyValue[1].attrKey = NvSciSyncAttrKey_RequiredPerm;
+ keyValue[1].value = (void*) &cpuPerm;
+ keyValue[1].len = sizeof(cpuPerm);
+ return NvSciSyncAttrListSetAttrs(list, keyValue, 2);
+}
+
+int main(int argc, char** argv) {
+ cudlaDevHandle devHandle;
+ cudlaModule moduleHandle;
+ cudlaStatus err;
+ uint32_t statSupport = 0;
+ uint32_t dlaFreqInMHz = 0;
+ FILE* fp = NULL;
+ struct stat st;
+ size_t file_size;
+ size_t actually_read = 0;
+ unsigned char *loadableData = NULL;
+ char filename[MAX_FILENAME_LEN];
+ const char* suffix = ".csv";
+
+
+ ResourceList resourceList;
+
+ memset(&resourceList, 0x00, sizeof(ResourceList));
+
+ if ((argc != 4) && (argc != 5)) {
+ DPRINTF("Usage : ./test_cudla_layerwise_stats_L0_standalone_test1 <loadable> <freqMHZ> <statSupport> <filenamePrefix>\n");
+ return 1;
+ }
+
+ if (argc == 5) {
+ if((strlen(argv[4])) > (MAX_FILENAME_LEN - RESERVED_SUFFIX_LEN)) {
+ DPRINTF("Filename prefix length is too big, greater than maximum permissible prefix length of %u \n",(MAX_FILENAME_LEN - RESERVED_SUFFIX_LEN));
+ return 1;
+ }
+ }
+
+ // Read loadable into buffer.
+ fp = fopen(argv[1], "rb");
+ if (fp == NULL) {
+ DPRINTF("Cannot open file %s\n", argv[1]);
+ return 1;
+ }
+
+ if (stat(argv[1], &st) != 0) {
+ DPRINTF("Cannot stat file\n");
+ return 1;
+ }
+
+ file_size = st.st_size;
+ DPRINTF("The file size = %ld\n", file_size);
+
+ dlaFreqInMHz = atoi(argv[2]);
+ statSupport = atoi(argv[3]);
+
+ loadableData = (unsigned char *)malloc(file_size);
+ if (loadableData == NULL) {
+ DPRINTF("Cannot Allocate memory for loadable\n");
+ return 1;
+ }
+
+ actually_read = fread(loadableData, 1, file_size, fp);
+ if ( actually_read != file_size ) {
+ free(loadableData);
+ DPRINTF("Read wrong size\n");
+ return 1;
+ }
+ fclose(fp);
+
+ resourceList.loadableData = loadableData;
+
+ err = cudlaCreateDevice(0, &devHandle, CUDLA_STANDALONE);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in cuDLA create device = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ DPRINTF("Device created successfully\n");
+ resourceList.devHandle = devHandle;
+
+ err = cudlaModuleLoadFromMemory(devHandle, loadableData, file_size, &moduleHandle, 0);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in cudlaModuleLoadFromMemory = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ } else {
+ DPRINTF("Successfully loaded module\n");
+ }
+
+ resourceList.moduleHandle = moduleHandle;
+
+ // Get tensor attributes.
+ uint32_t numInputTensors = 0;
+ uint32_t numOutputTensors = 0;
+ uint32_t numOutputTaskStatistics = 0;
+
+ cudlaModuleAttribute attribute;
+
+ err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_INPUT_TENSORS, &attribute);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in getting numInputTensors = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ }
+ numInputTensors = attribute.numInputTensors;
+ DPRINTF("numInputTensors = %d\n", numInputTensors);
+
+ err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_OUTPUT_TENSORS, &attribute);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in getting numOutputTensors = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ }
+ numOutputTensors = attribute.numOutputTensors;
+ DPRINTF("numOutputTensors = %d\n", numOutputTensors);
+
+ // using the same attributes to get num_output_task_statistics_tensors
+ attribute.numOutputTensors = 0;
+
+ err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_OUTPUT_TASK_STATISTICS, &attribute);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in getting numOutputTaskStatistics = %d\n", err); // Fix: message said "numOutputTensors" (copy-paste from the previous query) while the attribute fetched here is CUDLA_NUM_OUTPUT_TASK_STATISTICS
+ cleanUp(&resourceList);
+ return 1;
+ }
+ numOutputTaskStatistics = attribute.numOutputTensors; // the count is returned through the shared numOutputTensors union field
+ DPRINTF("numOutputTaskStatistics = %d\n", numOutputTaskStatistics);
+
+ if(numOutputTaskStatistics == 0) {
+ DPRINTF("Layerwise stats is not supported for this Loadable \n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ resourceList.numInputTensors = numInputTensors;
+ resourceList.numOutputTensors = numOutputTensors;
+ resourceList.numOutputTaskStatistics = numOutputTaskStatistics;
+
+ cudlaModuleTensorDescriptor* inputTensorDesc =
+ (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor)*numInputTensors);
+ cudlaModuleTensorDescriptor* outputTensorDesc =
+ (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor)*numOutputTensors);
+
+ if ((inputTensorDesc == NULL) || (outputTensorDesc == NULL)) {
+ if (inputTensorDesc != NULL)
+ {
+ free(inputTensorDesc);
+ inputTensorDesc = NULL;
+ }
+
+ if (outputTensorDesc != NULL)
+ {
+ free(outputTensorDesc);
+ outputTensorDesc = NULL;
+ }
+
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ resourceList.inputTensorDesc = inputTensorDesc;
+ resourceList.outputTensorDesc = outputTensorDesc;
+
+ cudlaModuleTensorDescriptor* outputTaskStatisticsDesc =
+ (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor)*numOutputTaskStatistics);
+ if (outputTaskStatisticsDesc == NULL) {
+ free(outputTaskStatisticsDesc);
+ outputTaskStatisticsDesc = NULL;
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ resourceList.outputTaskStatisticsDesc = outputTaskStatisticsDesc;
+
+ attribute.inputTensorDesc = inputTensorDesc;
+ err = cudlaModuleGetAttributes(moduleHandle,
+ CUDLA_INPUT_TENSOR_DESCRIPTORS,
+ &attribute);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in getting input tensor descriptor = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ }
+ DPRINTF("Printing input tensor descriptor\n");
+ printTensorDesc(inputTensorDesc);
+
+ attribute.outputTensorDesc = outputTensorDesc;
+ err = cudlaModuleGetAttributes(moduleHandle,
+ CUDLA_OUTPUT_TENSOR_DESCRIPTORS,
+ &attribute);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in getting output tensor descriptor = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ }
+ DPRINTF("Printing output tensor descriptor\n");
+ printTensorDesc(outputTensorDesc);
+
+ attribute.outputTensorDesc = outputTaskStatisticsDesc;
+ err = cudlaModuleGetAttributes(moduleHandle,
+ CUDLA_OUTPUT_TASK_STATISTICS_DESCRIPTORS,
+ &attribute);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in getting task statistics descriptor = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ }
+ DPRINTF("Printing output task statistics descriptor size\n");
+ for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
+ DPRINTF("The size of %u descriptor is %lu\n", ii,outputTaskStatisticsDesc[ii].size);
+ }
+
+ // Setup the input and output buffers.
+ unsigned char** inputBuffer = (unsigned char **)malloc(sizeof(unsigned char *)*numInputTensors);
+ if (inputBuffer == NULL) {
+ DPRINTF("Error in allocating memory for input buffer array\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(inputBuffer, 0x00, sizeof(unsigned char *)*numInputTensors);
+ resourceList.inputBuffer = inputBuffer;
+
+ for (uint32_t ii = 0; ii < numInputTensors; ii++) {
+ inputBuffer[ii] = (unsigned char* )malloc(inputTensorDesc[ii].size);
+ if (inputBuffer[ii] == NULL) {
+ DPRINTF("Error in allocating input memory\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(inputBuffer[ii], 0x01, inputTensorDesc[ii].size);
+ }
+
+ unsigned char** outputBuffer = (unsigned char **)malloc(sizeof(unsigned char *)*numOutputTensors);
+ if (outputBuffer == NULL) {
+ DPRINTF("Error in allocating memory for output buffer array\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(outputBuffer, 0x00, sizeof(unsigned char *)*numOutputTensors);
+ resourceList.outputBuffer = outputBuffer;
+
+ for (uint32_t ii = 0; ii < numOutputTensors; ii++) {
+ outputBuffer[ii] = (unsigned char* )malloc(outputTensorDesc[ii].size);
+ if (outputBuffer[ii] == NULL) {
+ DPRINTF("Error in allocating output memory\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(outputBuffer[ii], 0x00, outputTensorDesc[ii].size);
+ }
+
+ unsigned char** statisticsOutputBuffer = (unsigned char **)malloc(sizeof(unsigned char *)*numOutputTaskStatistics);
+ if (statisticsOutputBuffer == NULL) {
+ DPRINTF("Error in allocating memory for output buffer array\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(statisticsOutputBuffer, 0x00, sizeof(unsigned char *)*numOutputTaskStatistics);
+ resourceList.statisticsOutputBuffer = statisticsOutputBuffer;
+
+ for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
+ statisticsOutputBuffer[ii] = (unsigned char* )malloc(outputTaskStatisticsDesc[ii].size);
+ if (statisticsOutputBuffer[ii] == NULL) {
+ DPRINTF("Error in allocating output memory\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(statisticsOutputBuffer[ii], 0x00, outputTaskStatisticsDesc[ii].size);
+ }
+
+ NvSciBufModule bufModule = NULL;
+ NvSciBufAttrList *inputAttrList = {NULL};
+ NvSciBufAttrList *outputAttrList = {NULL};
+ NvSciBufAttrList *statisticsOutputAttrList = {NULL};
+ NvSciBufAttrList *reconciledInputAttrList = {NULL};
+ NvSciBufAttrList *reconciledOutputAttrList = {NULL};
+ NvSciBufAttrList *reconciledStatisticsOutputAttrList = {NULL};
+ NvSciBufAttrList *inputConflictList = {NULL};
+ NvSciBufAttrList *outputConflictList = {NULL};
+ NvSciBufAttrList *statisticsOutputConflictList = {NULL};
+ NvSciError sciError = NvSciError_Success;
+
+ sciError = NvSciBufModuleOpen(&bufModule);
+ if (sciError != NvSciError_Success) {
+ DPRINTF("Error in initializing NvSciBufModule\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ resourceList.bufModule = bufModule;
+
+ // creating and setting input attribute list
+
+ inputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numInputTensors);
+ if (inputAttrList == NULL) {
+ DPRINTF("Error in allocating memory for input buffer array\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(inputAttrList, 0x00, sizeof(NvSciBufAttrList)*numInputTensors);
+ resourceList.inputAttrList = inputAttrList;
+
+ reconciledInputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numInputTensors);
+ if (reconciledInputAttrList == NULL) {
+ DPRINTF("Error in allocating memory for input buffer array\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(reconciledInputAttrList, 0x00, sizeof(NvSciBufAttrList)*numInputTensors);
+ resourceList.reconciledInputAttrList = reconciledInputAttrList;
+
+ inputConflictList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numInputTensors);
+ if (inputConflictList == NULL) {
+ DPRINTF("Error in allocating memory for input buffer array\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(inputConflictList, 0x00, sizeof(NvSciBufAttrList)*numInputTensors);
+ resourceList.inputConflictList = inputConflictList;
+
+
+ for (uint32_t ii = 0; ii < numInputTensors; ii++) {
+ err = createAndSetAttrList(bufModule,
+ inputTensorDesc[ii].size,
+ &inputAttrList[ii]);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in creating NvSciBuf attribute list for input attribute\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ sciError = NvSciBufAttrListReconcile(&inputAttrList[ii],
+ 1,
+ &reconciledInputAttrList[ii],
+ &inputConflictList[ii]);
+ if (sciError != NvSciError_Success) {
+ DPRINTF("Error in reconciling NvSciBuf attribute list for input attribute\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ }
+
+ outputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTensors);
+ if (outputAttrList == NULL) {
+ DPRINTF("Error in allocating memory for input buffer array\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(outputAttrList, 0x00, sizeof(NvSciBufAttrList)*numOutputTensors);
+ resourceList.outputAttrList = outputAttrList;
+
+ reconciledOutputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTensors);
+ if (reconciledOutputAttrList == NULL) {
+ DPRINTF("Error in allocating memory for input buffer array\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(reconciledOutputAttrList, 0x00, sizeof(NvSciBufAttrList)*numOutputTensors);
+ resourceList.reconciledOutputAttrList = reconciledOutputAttrList;
+
+ outputConflictList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTensors);
+ if (outputConflictList == NULL) {
+ DPRINTF("Error in allocating memory for input buffer array\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(outputConflictList, 0x00, sizeof(NvSciBufAttrList)*numOutputTensors);
+ resourceList.outputConflictList = outputConflictList;
+
+ // creating and setting output attribute list
+ for (uint32_t ii = 0; ii < numOutputTensors; ii++) {
+ err = createAndSetAttrList(bufModule,
+ outputTensorDesc[ii].size,
+ &outputAttrList[ii]);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in creating NvSciBuf attribute list for output attribute\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ sciError = NvSciBufAttrListReconcile(&outputAttrList[ii],
+ 1,
+ &reconciledOutputAttrList[ii],
+ &outputConflictList[ii]);
+ if (sciError != NvSciError_Success) {
+ DPRINTF("Error in reconciling NvSciBuf attribute list for output attribute\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ }
+
+ statisticsOutputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTaskStatistics);
+ if (statisticsOutputAttrList == NULL) {
+ DPRINTF("Error in allocating memory for input buffer array\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(statisticsOutputAttrList, 0x00, sizeof(NvSciBufAttrList)*numOutputTaskStatistics);
+ resourceList.statisticsOutputAttrList = statisticsOutputAttrList;
+
+ reconciledStatisticsOutputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTaskStatistics);
+ if (reconciledStatisticsOutputAttrList == NULL) {
+ DPRINTF("Error in allocating memory for input buffer array\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(reconciledStatisticsOutputAttrList, 0x00, sizeof(NvSciBufAttrList)*numOutputTaskStatistics);
+ resourceList.reconciledStatisticsOutputAttrList = reconciledStatisticsOutputAttrList;
+
+ statisticsOutputConflictList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTaskStatistics);
+ if (statisticsOutputConflictList == NULL) {
+ DPRINTF("Error in allocating memory for input buffer array\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(statisticsOutputConflictList, 0x00, sizeof(NvSciBufAttrList)*numOutputTaskStatistics);
+ resourceList.statisticsOutputConflictList = statisticsOutputConflictList;
+
+ // creating and setting statistics output attribute list
+ for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
+ err = createAndSetAttrList(bufModule,
+ outputTaskStatisticsDesc[ii].size,
+ &statisticsOutputAttrList[ii]);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in creating NvSciBuf attribute list\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ sciError = NvSciBufAttrListReconcile(&statisticsOutputAttrList[ii],
+ 1,
+ &reconciledStatisticsOutputAttrList[ii],
+ &statisticsOutputConflictList[ii]);
+ if (sciError != NvSciError_Success) {
+ DPRINTF("Error in reconciling NvSciBuf attribute list\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ }
+
+ NvSciBufObj *inputBufObj = (NvSciBufObj *)malloc(sizeof(NvSciBufObj)*numInputTensors);
+ NvSciBufObj *outputBufObj = (NvSciBufObj *)malloc(sizeof(NvSciBufObj)*numOutputTensors);
+ NvSciBufObj *statisticsBufObj = (NvSciBufObj *)malloc(sizeof(NvSciBufObj)*numOutputTaskStatistics);
+
+ resourceList.inputBufObj = inputBufObj;
+ resourceList.outputBufObj = outputBufObj;
+ resourceList.statisticsBufObj = statisticsBufObj;
+
+ for (uint32_t ii = 0; ii < numInputTensors; ii++) {
+ sciError = NvSciBufObjAlloc(reconciledInputAttrList[ii], &inputBufObj[ii]);
+ if (sciError != NvSciError_Success) {
+ DPRINTF("Error in allocating NvSciBuf object\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ }
+
+ for (uint32_t ii = 0; ii < numOutputTensors; ii++) {
+ sciError = NvSciBufObjAlloc(reconciledOutputAttrList[ii], &outputBufObj[ii]);
+ if (sciError != NvSciError_Success) {
+ DPRINTF("Error in allocating NvSciBuf object\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ }
+
+ for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
+ sciError = NvSciBufObjAlloc(reconciledStatisticsOutputAttrList[ii], &statisticsBufObj[ii]);
+ if (sciError != NvSciError_Success) {
+ DPRINTF("Error in allocating NvSciBuf object\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ }
+
+ uint64_t** inputBufObjRegPtr = (uint64_t **)malloc(sizeof(uint64_t*)*numInputTensors);
+ uint64_t** outputBufObjRegPtr = (uint64_t **)malloc(sizeof(uint64_t*)*numOutputTensors);
+ uint64_t** statisticsBufObjRegPtr = (uint64_t **)malloc(sizeof(uint64_t*)*numOutputTaskStatistics);
+
+ if ((inputBufObjRegPtr == NULL) || (outputBufObjRegPtr == NULL) || (statisticsBufObjRegPtr == NULL)) {
+ if (inputBufObjRegPtr != NULL) {
+ free(inputBufObjRegPtr);
+ inputBufObjRegPtr = NULL;
+ }
+
+ if (outputBufObjRegPtr != NULL) {
+ free(outputBufObjRegPtr);
+ outputBufObjRegPtr = NULL;
+ }
+
+ if (statisticsBufObjRegPtr != NULL) {
+ free(statisticsBufObjRegPtr);
+ statisticsBufObjRegPtr = NULL;
+ }
+
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ resourceList.inputBufObjRegPtr = inputBufObjRegPtr;
+ resourceList.outputBufObjRegPtr = outputBufObjRegPtr;
+ resourceList.statisticsBufObjRegPtr = statisticsBufObjRegPtr;
+
+ void **inputBufObjBuffer = (void **)malloc(sizeof(void*)*numInputTensors);
+ void **outputBufObjBuffer = (void **)malloc(sizeof(void*)*numOutputTensors);
+ void **statisticsBufObjBuffer = (void **)malloc(sizeof(void*)*numOutputTaskStatistics);
+
+ cudlaExternalMemoryHandleDesc memDesc = { 0 };
+ // importing external memory
+ for (uint32_t ii = 0; ii < numInputTensors; ii++) {
+ memset(&memDesc, 0, sizeof(memDesc));
+ memDesc.extBufObject = (void *)inputBufObj[ii];
+ memDesc.size = inputTensorDesc[ii].size;
+ err = cudlaImportExternalMemory(devHandle, &memDesc, &inputBufObjRegPtr[ii], 0);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in importing external memory = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ sciError = NvSciBufObjGetCpuPtr(inputBufObj[ii], &inputBufObjBuffer[ii]);
+ if (sciError != NvSciError_Success) {
+ DPRINTF("Error in getting NvSciBuf CPU pointer\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memcpy(inputBufObjBuffer[ii], inputBuffer[ii], inputTensorDesc[ii].size);
+ }
+
+ for (uint32_t ii = 0; ii < numOutputTensors; ii++) {
+ memset(&memDesc, 0, sizeof(memDesc));
+ memDesc.extBufObject = (void *)outputBufObj[ii];
+ memDesc.size = outputTensorDesc[ii].size;
+ err = cudlaImportExternalMemory(devHandle, &memDesc, &outputBufObjRegPtr[ii], 0);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in importing external memory = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ sciError = NvSciBufObjGetCpuPtr(outputBufObj[ii], &outputBufObjBuffer[ii]);
+ if (sciError != NvSciError_Success) {
+ DPRINTF("Error in getting NvSciBuf CPU pointer\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(outputBufObjBuffer[ii], 0, outputTensorDesc[ii].size);
+ }
+
+ for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
+ memset(&memDesc, 0, sizeof(memDesc));
+ memDesc.extBufObject = (void *)statisticsBufObj[ii];
+ memDesc.size = outputTaskStatisticsDesc[ii].size;
+ err = cudlaImportExternalMemory(devHandle, &memDesc, &statisticsBufObjRegPtr[ii], CUDLA_TASK_STATISTICS);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in importing external memory = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ sciError = NvSciBufObjGetCpuPtr(statisticsBufObj[ii], &statisticsBufObjBuffer[ii]);
+ if (sciError != NvSciError_Success) {
+ DPRINTF("Error in getting NvSciBuf CPU pointer\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(statisticsBufObjBuffer[ii], 0, outputTaskStatisticsDesc[ii].size);
+ }
+
+ uint64_t *outputStatisticsBufferRegisteredPtr[numOutputTensors + numOutputTaskStatistics] = {0} ;
+
+ uint32_t index = 0;
+ for (; index < numOutputTensors ; index++) {
+ outputStatisticsBufferRegisteredPtr[index] = ((outputBufObjRegPtr[index]));
+ }
+
+ for (uint32_t jj=0; jj < numOutputTaskStatistics ; jj++) {
+ outputStatisticsBufferRegisteredPtr[index++] = ((statisticsBufObjRegPtr[jj]));
+ }
+
+ NvSciSyncObj syncObj;
+ NvSciSyncModule syncModule;
+ NvSciSyncAttrList syncAttrListObj[2];
+ NvSciSyncCpuWaitContext nvSciCtx;
+ NvSciSyncAttrList waiterAttrListObj = NULL;
+ NvSciSyncAttrList signalerAttrListObj = NULL;
+ NvSciSyncAttrList nvSciSyncConflictListObj;
+ NvSciSyncAttrList nvSciSyncReconciledListObj;
+
+ sciError = NvSciSyncModuleOpen(&syncModule);
+ if (sciError != NvSciError_Success) {
+ DPRINTF("Error in initializing NvSciSyncModuleOpen\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ resourceList.syncModule = syncModule;
+
+ sciError = NvSciSyncCpuWaitContextAlloc(syncModule, &nvSciCtx);
+ if (sciError != NvSciError_Success) {
+ DPRINTF("Error in allocating cpu wait context NvSciSyncCpuWaitContextAlloc\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ resourceList.nvSciCtx = nvSciCtx;
+
+ sciError = NvSciSyncAttrListCreate(syncModule, &signalerAttrListObj);
+ if (sciError != NvSciError_Success) {
+ DPRINTF("Error in creating NvSciSync attribute list\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ resourceList.signalerAttrListObj = signalerAttrListObj;
+
+ sciError = NvSciSyncAttrListCreate(syncModule, &waiterAttrListObj);
+ if (sciError != NvSciError_Success) {
+ DPRINTF("Error in creating NvSciSync attribute list\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ resourceList.waiterAttrListObj = waiterAttrListObj;
+
+ err = cudlaGetNvSciSyncAttributes(reinterpret_cast<uint64_t*>(signalerAttrListObj),
+ CUDLA_NVSCISYNC_ATTR_SIGNAL);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in getting cuDLA's NvSciSync attributes\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ sciError = fillCpuWaiterAttrList(waiterAttrListObj);
+ if (sciError != NvSciError_Success) {
+ DPRINTF("Error in setting NvSciSync attribute list\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ syncAttrListObj[0] = signalerAttrListObj;
+ syncAttrListObj[1] = waiterAttrListObj;
+ sciError = NvSciSyncAttrListReconcile(syncAttrListObj,
+ 2,
+ &nvSciSyncReconciledListObj,
+ &nvSciSyncConflictListObj);
+ if (sciError != NvSciError_Success) {
+ DPRINTF("Error in reconciling NvSciSync's attribute lists\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ resourceList.nvSciSyncConflictListObj = nvSciSyncConflictListObj;
+ resourceList.nvSciSyncReconciledListObj = nvSciSyncReconciledListObj;
+
+ sciError = NvSciSyncObjAlloc(nvSciSyncReconciledListObj, &syncObj);
+ if (sciError != NvSciError_Success) {
+ DPRINTF("Error in allocating NvSciSync object\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ resourceList.syncObj = syncObj;
+
+ // importing external semaphore
+ uint64_t* nvSciSyncObjRegPtr = NULL;
+ cudlaExternalSemaphoreHandleDesc semaMemDesc = { 0 };
+ memset(&semaMemDesc, 0, sizeof(semaMemDesc));
+ semaMemDesc.extSyncObject = syncObj;
+ err = cudlaImportExternalSemaphore(devHandle,
+ &semaMemDesc,
+ &nvSciSyncObjRegPtr,
+ 0);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in importing external semaphore = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ }
+ DPRINTF("ALL MEMORY REGISTERED SUCCESSFULLY\n");
+
+ // Signal Events
+ cudlaSignalEvents* signalEvents;
+ signalEvents = (cudlaSignalEvents *)malloc(sizeof(cudlaSignalEvents));
+ if (signalEvents == NULL) {
+ DPRINTF("Error in allocating signal events\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ signalEvents->numEvents = 1;
+ uint64_t** devPtrs = (uint64_t **)malloc(signalEvents->numEvents *
+ sizeof(uint64_t *));
+ if (devPtrs == NULL) {
+ DPRINTF("Error in allocating output pointer's array of registered objects\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ devPtrs[0] = nvSciSyncObjRegPtr;
+ signalEvents->devPtrs = devPtrs;
+ resourceList.devPtrs = devPtrs;
+
+ signalEvents->eofFences = (CudlaFence *)malloc(signalEvents->numEvents *
+ sizeof(CudlaFence));
+ if (signalEvents->eofFences == NULL) {
+ DPRINTF("Error in allocating eofFence array\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ NvSciSyncFence eofFence = NvSciSyncFenceInitializer;
+ signalEvents->eofFences[0].fence = &eofFence;
+ signalEvents->eofFences[0].type = CUDLA_NVSCISYNC_FENCE;
+ resourceList.signalEvents = signalEvents;
+ resourceList.eofFence = eofFence;
+
+ // Enqueue a cuDLA task.
+ cudlaTask task;
+ task.moduleHandle = moduleHandle;
+ task.outputTensor = (uint64_t * const*)&outputStatisticsBufferRegisteredPtr;
+
+ if(statSupport == 1) {
+ task.numOutputTensors = (numOutputTensors + numOutputTaskStatistics);
+ DPRINTF("Layerwise profiling is requested \n");
+ } else {
+ task.numOutputTensors = numOutputTensors;
+ DPRINTF("Layerwise profiling is not requested \n");
+ }
+
+ task.numInputTensors = numInputTensors;
+ task.inputTensor = inputBufObjRegPtr;
+ task.waitEvents = NULL;
+ task.signalEvents = signalEvents;
+
+ err = cudlaSubmitTask(devHandle, &task, 1, NULL, 0);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in submitting task\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ DPRINTF("SUBMIT IS DONE !!!\n");
+
+ // Wait for operations to finish and bring output buffer to CPU.
+ sciError = NvSciSyncFenceWait(reinterpret_cast<NvSciSyncFence*>(signalEvents->eofFences[0].fence),
+ nvSciCtx, -1);
+ if (sciError != NvSciError_Success) {
+ DPRINTF("Error in waiting on NvSciSyncFence\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ // copy statistics data to cpu
+ for (uint32_t ii = 0; ii < numOutputTensors; ii++) {
+ memcpy(outputBuffer[ii], outputBufObjBuffer[ii], outputTensorDesc[ii].size);
+ }
+
+ if(statSupport == 1) {
+ for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
+ memcpy(statisticsOutputBuffer[ii], statisticsBufObjBuffer[ii], outputTaskStatisticsDesc[ii].size);
+ }
+
+ const cudlaExternalEtbl* etbl = NULL;
+ if (cudlaGetExternalExportTable(&etbl,0) != cudlaSuccess) {
+ DPRINTF("Error in getting export table\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ void** csv = (void **)malloc(sizeof(void *)*numOutputTaskStatistics);
+ if (csv == NULL) {
+ DPRINTF("Error in allocating memory for csv stream\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(csv, 0x00, sizeof(void *)*numOutputTaskStatistics);
+ resourceList.csv = csv;
+ for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
+ cudlaTranslateCsvAttribute csvAttribute;
+ uint64_t csvStreamLength = 0;
+
+ err = etbl->etiTranslateStats(devHandle,statisticsOutputBuffer[ii],dlaFreqInMHz,ii,CUDLA_GET_CSV_LENGTH,&csvAttribute);
+ csv[ii] = (void* )malloc(csvAttribute.csvStreamLength);
+ csvStreamLength = csvAttribute.csvStreamLength;
+ DPRINTF("size for statistics buffer %u is %lu \n",ii,csvStreamLength);
+
+ if (csv[ii] == NULL) {
+ DPRINTF("Error in allocating memory for csv stream\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+ memset(csv[ii], 0x00, csvAttribute.csvStreamLength);
+
+ csvAttribute.csvStreamStats = csv[ii];
+ err = etbl->etiTranslateStats(devHandle,statisticsOutputBuffer[ii],dlaFreqInMHz,ii,CUDLA_GET_CSV_STATS,&csvAttribute);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in translating stats\n");
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ if (argc == 5) {
+ sprintf(filename,"%s%u%s", argv[4],(ii+1),suffix);
+ fp = fopen(filename, "w+");
+ if (fp == NULL) {
+ DPRINTF("Cannot open file %s\n", filename);
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ uint32_t ret_val = fwrite(csv[ii],sizeof(char),csvStreamLength,fp);
+ if(ret_val != csvStreamLength) {
+ DPRINTF("number of elements written to file is %u \n", ret_val);
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ fclose(fp);
+ } else {
+ DPRINTF("%s \n",(char *)csv[ii]);
+ }
+ }
+ }
+
+ // unregister the CUDA-allocated buffers.
+ for (uint32_t ii = 0; ii < numInputTensors; ii++) {
+ err = cudlaMemUnregister(devHandle,
+ (inputBufObjRegPtr[ii]));
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in registering input memory = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ }
+ }
+
+ for (uint32_t ii = 0; ii < numOutputTensors; ii++) {
+ err = cudlaMemUnregister(devHandle,
+ (outputBufObjRegPtr[ii]));
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in registering output memory = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ }
+ }
+
+ for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
+ err = cudlaMemUnregister(devHandle,
+ (statisticsBufObjRegPtr[ii]));
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in registering output memory = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ }
+ }
+
+ err = cudlaMemUnregister(devHandle, nvSciSyncObjRegPtr);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in unregistering external semaphore = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ }
+
+ DPRINTF("ALL MEMORY UNREGISTERED SUCCESSFULLY\n");
+
+
+ err = cudlaModuleUnload(moduleHandle, 0);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in cudlaModuleUnload = %d\n", err);
+ cleanUp(&resourceList);
+ return 1;
+ } else {
+ DPRINTF("Successfully unloaded module\n");
+ }
+
+ resourceList.moduleHandle = NULL;
+
+ err = cudlaDestroyDevice(devHandle);
+ if (err != cudlaSuccess) {
+ DPRINTF("Error in cuDLA destroy device = %d\n", err);
+ return 1;
+ }
+ DPRINTF("Device destroyed successfully\n");
+
+ resourceList.devHandle = NULL;
+
+ cleanUp(&resourceList);
+
+ DPRINTF("cuDLALayerwiseStatsStandalone DONE !!!\n");
+
+ return 0;
+}
diff --git a/Samples/4_CUDA_Libraries/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver.cpp b/Samples/4_CUDA_Libraries/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver.cpp
index d42ac64c..9ee2f171 100644
--- a/Samples/4_CUDA_Libraries/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver.cpp
+++ b/Samples/4_CUDA_Libraries/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver.cpp
@@ -552,7 +552,6 @@ int main(int argc, char *argv[]) {
checkCudaErrors(cusparseScatter(cusparseHandle, vecz, vecx));
checkCudaErrors(cusparseDestroySpVec(vecz));
-
checkCudaErrors(cudaDeviceSynchronize());
stop = second();
diff --git a/Samples/5_Domain_Specific/simpleVulkan/VulkanBaseApp.cpp b/Samples/5_Domain_Specific/simpleVulkan/VulkanBaseApp.cpp
index e244bfe2..6e27672f 100644
--- a/Samples/5_Domain_Specific/simpleVulkan/VulkanBaseApp.cpp
+++ b/Samples/5_Domain_Specific/simpleVulkan/VulkanBaseApp.cpp
@@ -31,12 +31,13 @@
* https://vulkan-tutorial.com/
*/
-#include
-#include
-#include
#include
+#include
#include
+#include
+#include
#include
+#include
#include
#include "VulkanBaseApp.h"
diff --git a/Samples/5_Domain_Specific/simpleVulkanMMAP/VulkanBaseApp.cpp b/Samples/5_Domain_Specific/simpleVulkanMMAP/VulkanBaseApp.cpp
index cfb5c190..23151947 100644
--- a/Samples/5_Domain_Specific/simpleVulkanMMAP/VulkanBaseApp.cpp
+++ b/Samples/5_Domain_Specific/simpleVulkanMMAP/VulkanBaseApp.cpp
@@ -31,14 +31,16 @@
* https://vulkan-tutorial.com/
*/
-#include
-#include
-#include
#include
+#include
#include
+#include
+#include
#include
+#include
#include
+
#include "VulkanBaseApp.h"
#include "VulkanCudaInterop.h"