From 1fa9c59db4ee8b34c4efa5b6e8fbd2a8c72a93ce Mon Sep 17 00:00:00 2001 From: Rob Nertney Date: Wed, 3 Jan 2024 05:10:37 +0000 Subject: [PATCH] Fixing missing limits header on Vulkan app and missing cuDLA apps --- CHANGELOG.md | 2 +- .../.vscode/c_cpp_properties.json | 18 + .../.vscode/extensions.json | 7 + .../.vscode/launch.json | 10 + .../.vscode/tasks.json | 15 + .../cuDLALayerwiseStatsHybrid/Makefile | 400 +++++ .../NsightEclipse.xml | 71 + .../cuDLALayerwiseStatsHybrid/README.md | 60 + .../cuDLALayerwiseStatsHybrid/main.cu | 898 +++++++++++ .../.vscode/c_cpp_properties.json | 18 + .../.vscode/extensions.json | 7 + .../.vscode/launch.json | 10 + .../.vscode/tasks.json | 15 + .../cuDLALayerwiseStatsStandalone/Makefile | 403 +++++ .../NsightEclipse.xml | 65 + .../cuDLALayerwiseStatsStandalone/README.md | 61 + .../findnvsci.mk | 144 ++ .../cuDLALayerwiseStatsStandalone/main.cpp | 1348 +++++++++++++++++ .../cuSolverSp_LinearSolver.cpp | 1 - .../simpleVulkan/VulkanBaseApp.cpp | 7 +- .../simpleVulkanMMAP/VulkanBaseApp.cpp | 8 +- 21 files changed, 3560 insertions(+), 8 deletions(-) create mode 100644 Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/c_cpp_properties.json create mode 100644 Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/extensions.json create mode 100644 Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/launch.json create mode 100644 Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/tasks.json create mode 100644 Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/Makefile create mode 100644 Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/NsightEclipse.xml create mode 100644 Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/README.md create mode 100644 Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/main.cu create mode 100644 Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/c_cpp_properties.json create mode 100644 Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/extensions.json 
create mode 100644 Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/launch.json create mode 100644 Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/tasks.json create mode 100644 Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/Makefile create mode 100644 Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/NsightEclipse.xml create mode 100644 Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/README.md create mode 100644 Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/findnvsci.mk create mode 100644 Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/main.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 9375c1af..d99370f0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ ## Changelog -### CUDA 12.1 +### CUDA 12.2 * libNVVM samples received updates * Fixed jitLto Case issues * Enabled HOST_COMPILER flag to the makefiles for GCC which is untested but may still work. diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/c_cpp_properties.json b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/c_cpp_properties.json new file mode 100644 index 00000000..f0066b0f --- /dev/null +++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/c_cpp_properties.json @@ -0,0 +1,18 @@ +{ + "configurations": [ + { + "name": "Linux", + "includePath": [ + "${workspaceFolder}/**", + "${workspaceFolder}/../../../Common" + ], + "defines": [], + "compilerPath": "/usr/local/cuda/bin/nvcc", + "cStandard": "gnu17", + "cppStandard": "gnu++14", + "intelliSenseMode": "linux-gcc-x64", + "configurationProvider": "ms-vscode.makefile-tools" + } + ], + "version": 4 +} diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/extensions.json b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/extensions.json new file mode 100644 index 00000000..c7eb54dc --- /dev/null +++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/extensions.json @@ -0,0 +1,7 @@ +{ + "recommendations": 
[ + "nvidia.nsight-vscode-edition", + "ms-vscode.cpptools", + "ms-vscode.makefile-tools" + ] +} diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/launch.json b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/launch.json new file mode 100644 index 00000000..65a931b5 --- /dev/null +++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/launch.json @@ -0,0 +1,10 @@ +{ + "configurations": [ + { + "name": "CUDA C++: Launch", + "type": "cuda-gdb", + "request": "launch", + "program": "${workspaceFolder}/cuDLALayerwiseStatsHybrid" + } + ] +} diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/tasks.json b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/tasks.json new file mode 100644 index 00000000..4509aeb1 --- /dev/null +++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/.vscode/tasks.json @@ -0,0 +1,15 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "label": "sample", + "type": "shell", + "command": "make dbg=1", + "problemMatcher": ["$nvcc"], + "group": { + "kind": "build", + "isDefault": true + } + } + ] +} diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/Makefile b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/Makefile new file mode 100644 index 00000000..8bbd3e20 --- /dev/null +++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/Makefile @@ -0,0 +1,400 @@ +################################################################################ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) +endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. 
+ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifdef HOST_COMPILER + CUSTOM_HOST_COMPILER = 1 +endif + +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' 
-f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell 
expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + NVCCFLAGS += -D_QNX_SOURCE + NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -L/usr/lib/aarch64-qnx-gnu + CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu" + ifdef TARGET_OVERRIDE + LDFLAGS += -lslog2 + endif + + ifneq ($(TARGET_FS),) + LDFLAGS += -L$(TARGET_FS)/usr/lib + CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib" + LDFLAGS += -L$(TARGET_FS)/usr/libnvidia + CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia" + CCFLAGS += -I$(TARGET_FS)/../include + endif + endif +endif + +ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) +endif + +# Install directory of 
different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +# This sample is not supported on Linux x86_64 +ifeq ($(TARGET_OS),linux) + ifeq ($(TARGET_ARCH),x86_64) + $(info >>> WARNING - cuDLALayerwiseStatsHybrid is not supported on Linux x86_64 - waiving sample <<<) + SAMPLE_ENABLED := 0 + endif +endif + +# This sample is not supported on Mac OSX +ifeq ($(TARGET_OS),darwin) + $(info >>> WARNING - cuDLALayerwiseStatsHybrid is not supported on Mac OSX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on ARMv7 +ifeq ($(TARGET_ARCH),armv7l) + $(info >>> WARNING - cuDLALayerwiseStatsHybrid is not supported on ARMv7 - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on sbsa +ifeq ($(TARGET_ARCH),sbsa) + $(info >>> WARNING - cuDLALayerwiseStatsHybrid 
is not supported on sbsa - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../../Common +LIBRARIES := + +################################################################################ + +#Detect if installed version of GCC supports required C++11 +ifeq ($(TARGET_OS),linux) + empty := + space := $(empty) $(empty) + GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`) +#Create version number without "." + GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.) + GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.) + GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.) +# Make sure the version number has at least 3 decimals + GCCVERSION += 00 +# Remove spaces from the version number + GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION)) +#$(warning $(GCCVERSION)) + + IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 47000) + ifneq ($(CUSTOM_HOST_COMPILER), 1) + ifeq ($(IS_MIN_VERSION), 1) + $(info >>> GCC Version is greater or equal to 4.7.0 <<<) + else + $(info >>> Waiving build. Minimum GCC version required is 4.7.0<<<) + SAMPLE_ENABLED := 0 + endif + else + $(warning >>> Custom HOST_COMPILER set; skipping GCC version check. This may lead to unintended behavior. 
Please note the minimum equivalent GCC version is 4.7.0 <<<) + endif +endif + +# Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa)) +SMS ?= 53 61 70 72 75 80 86 87 90 +else +SMS ?= 50 52 60 61 70 75 80 86 89 90 +endif + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ALL_CCFLAGS += --std=c++11 --threads 0 + +LIBRARIES += -lcudla + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: cuDLALayerwiseStatsHybrid + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +main.o:main.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +cuDLALayerwiseStatsHybrid: main.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./cuDLALayerwiseStatsHybrid + +testrun: build + +clean: + rm -f cuDLALayerwiseStatsHybrid main.o + rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/cuDLALayerwiseStatsHybrid + +clobber: clean diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/NsightEclipse.xml 
b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/NsightEclipse.xml new file mode 100644 index 00000000..c00592db --- /dev/null +++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/NsightEclipse.xml @@ -0,0 +1,71 @@ + + + + cuDLALayerwiseStatsHybrid + + --std=c++11 + + + cudaStreamCreateWithFlags + cudaStreamDestroy + cudaFree + cudaGetErrorName + cudaSetDevice + cudaStreamSynchronize + cudaMalloc + cudaMemsetAsync + cudaMemcpyAsync + + + whole + + ./ + ../ + ../../../Common + + + cuDLA + Data Parallel Algorithms + Image Processing + + + CUDA + CPP11 + + + cudla + + + + true + main.cu + + 1:CUDA Advanced Topics + 1:cuDLA + + sm60 + sm61 + sm70 + sm72 + sm75 + sm80 + sm86 + sm87 + sm89 + sm90 + + + aarch64 + linux + + + aarch64 + qnx + + + + 6.0 + + cuDLA Layerwise statistics HybridMode + exe + diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/README.md b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/README.md new file mode 100644 index 00000000..d3df1858 --- /dev/null +++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/README.md @@ -0,0 +1,60 @@ +# cuDLALayerwiseStatsHybrid - cuDLA Layerwise statistics HybridMode + +## Description + +This sample is used to provide layerwise statistics to the application in the cuDLA hybrid mode wherein DLA is programmed using CUDA. 
+ +## Key Concepts + +cuDLA, Data Parallel Algorithms, Image Processing + +## Supported SM Architectures + +[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus) [SM 8.9 ](https://developer.nvidia.com/cuda-gpus) [SM 9.0 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, QNX + +## Supported CPU Architecture + +aarch64 + +## CUDA APIs involved + +### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) +cudaStreamCreateWithFlags, cudaStreamDestroy, cudaFree, cudaGetErrorName, cudaSetDevice, cudaStreamSynchronize, cudaMalloc, cudaMemsetAsync, cudaMemcpyAsync + +## Prerequisites + +Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. + +## Build and Run + +### Linux +The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=** - cross-compile targeting a specific architecture. Allowed architectures are aarch64. + By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=aarch64`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +## References (for more details) + diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/main.cu b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/main.cu new file mode 100644 index 00000000..81e575c6 --- /dev/null +++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/main.cu @@ -0,0 +1,898 @@ +/* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "cudla.h" +#include "cuda_runtime.h" +#include "cudlaExternalEtbl.hpp" + +#include +#include +#include +#include +#include +#include + +#define MAX_FILENAME_LEN 200 +#define RESERVED_SUFFIX_LEN 10 + +#define DPRINTF(...) 
printf(__VA_ARGS__) + +static void printTensorDesc(cudlaModuleTensorDescriptor* tensorDesc) { + DPRINTF("\tTENSOR NAME : %s\n", tensorDesc->name); + DPRINTF("\tsize: %lu\n", tensorDesc->size); + + DPRINTF("\tdims: [%lu, %lu, %lu, %lu]\n", tensorDesc->n, tensorDesc->c, + tensorDesc->h, tensorDesc->w); + + DPRINTF("\tdata fmt: %d\n", tensorDesc->dataFormat); + DPRINTF("\tdata type: %d\n", tensorDesc->dataType); + DPRINTF("\tdata category: %d\n", tensorDesc->dataCategory); + DPRINTF("\tpixel fmt: %d\n", tensorDesc->pixelFormat); + DPRINTF("\tpixel mapping: %d\n", tensorDesc->pixelMapping); + DPRINTF("\tstride[0]: %d\n", tensorDesc->stride[0]); + DPRINTF("\tstride[1]: %d\n", tensorDesc->stride[1]); + DPRINTF("\tstride[2]: %d\n", tensorDesc->stride[2]); + DPRINTF("\tstride[3]: %d\n", tensorDesc->stride[3]); +} + +typedef struct { + cudlaDevHandle devHandle; + cudlaModule moduleHandle; + unsigned char* loadableData; + cudaStream_t stream; + uint32_t numInputTensors; + uint32_t numOutputTensors; + uint32_t numOutputTaskStatistics; + unsigned char** inputBuffer; + unsigned char** outputBuffer; + unsigned char** statisticsOutputBuffer; + void** inputBufferGPU; + void** outputBufferGPU; + void** outputTaskStatisticsGPU; + void **csv; + cudlaModuleTensorDescriptor* inputTensorDesc; + cudlaModuleTensorDescriptor* outputTensorDesc; + cudlaModuleTensorDescriptor* outputTaskStatisticsDesc; + uint64_t** inputBufferRegisteredPtr; + uint64_t** outputBufferRegisteredPtr; + uint64_t** outputTaskStatisticsRegisteredPtr; + uint64_t** outputStatisticsBufferRegisteredPtr; +} ResourceList; + +void cleanUp(ResourceList* resourceList); + +void cleanUp(ResourceList* resourceList) { + uint32_t ii = 0; + if (resourceList->inputTensorDesc != NULL) { + free(resourceList->inputTensorDesc); + resourceList->inputTensorDesc = NULL; + } + + if (resourceList->outputTensorDesc != NULL) { + free(resourceList->outputTensorDesc); + resourceList->outputTensorDesc = NULL; + } + + if 
(resourceList->outputTaskStatisticsDesc != NULL) { + free(resourceList->outputTaskStatisticsDesc); + resourceList->outputTaskStatisticsDesc = NULL; + } + + if (resourceList->loadableData != NULL) { + free(resourceList->loadableData); + resourceList->loadableData = NULL; + } + + if (resourceList->moduleHandle != NULL) { + cudlaModuleUnload(resourceList->moduleHandle, 0); + resourceList->moduleHandle = NULL; + } + + if (resourceList->devHandle != NULL) { + cudlaDestroyDevice(resourceList->devHandle); + resourceList->devHandle = NULL; + } + + if (resourceList->inputBufferGPU != NULL) { + for (ii = 0; ii < resourceList->numInputTensors; ii++) { + if ((resourceList->inputBufferGPU)[ii] != NULL) { + cudaFree((resourceList->inputBufferGPU)[ii]); + (resourceList->inputBufferGPU)[ii] = NULL; + } + } + free(resourceList->inputBufferGPU); + resourceList->inputBufferGPU = NULL; + } + + if (resourceList->outputBufferGPU != NULL) { + for (ii = 0; ii < resourceList->numOutputTensors; ii++) { + if ((resourceList->outputBufferGPU)[ii] != NULL) { + cudaFree((resourceList->outputBufferGPU)[ii]); + (resourceList->outputBufferGPU)[ii] = NULL; + } + } + free(resourceList->outputBufferGPU); + resourceList->outputBufferGPU = NULL; + } + + if (resourceList->outputTaskStatisticsGPU != NULL) { + for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) { + if ((resourceList->outputTaskStatisticsGPU)[ii] != NULL) { + cudaFree((resourceList->outputTaskStatisticsGPU)[ii]); + (resourceList->outputTaskStatisticsGPU)[ii] = NULL; + } + } + free(resourceList->outputTaskStatisticsGPU); + resourceList->outputTaskStatisticsGPU = NULL; + } + + if (resourceList->csv != NULL) { + for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) { + if ((resourceList->csv)[ii] != NULL) + { + free((resourceList->csv)[ii]); + (resourceList->csv)[ii] = NULL; + } + } + free(resourceList->csv); + resourceList->csv = NULL; + } + + if (resourceList->inputBuffer != NULL) { + for (ii = 0; ii < 
resourceList->numInputTensors; ii++) { + if ((resourceList->inputBuffer)[ii] != NULL) { + free((resourceList->inputBuffer)[ii]); + (resourceList->inputBuffer)[ii] = NULL; + } + } + free(resourceList->inputBuffer); + resourceList->inputBuffer = NULL; + } + + if (resourceList->outputBuffer != NULL) { + for (ii = 0; ii < resourceList->numOutputTensors; ii++) { + if ((resourceList->outputBuffer)[ii] != NULL) + { + free((resourceList->outputBuffer)[ii]); + (resourceList->outputBuffer)[ii] = NULL; + } + } + free(resourceList->outputBuffer); + resourceList->outputBuffer = NULL; + } + + if (resourceList->statisticsOutputBuffer != NULL) { + for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) { + if ((resourceList->statisticsOutputBuffer)[ii] != NULL) { + free((resourceList->statisticsOutputBuffer)[ii]); + (resourceList->statisticsOutputBuffer)[ii] = NULL; + } + } + free(resourceList->statisticsOutputBuffer); + resourceList->statisticsOutputBuffer = NULL; + } + + if (resourceList->stream != NULL) { + cudaStreamDestroy(resourceList->stream); + resourceList->stream = NULL; + } + + if (resourceList->inputBufferRegisteredPtr != NULL) { + free(resourceList->inputBufferRegisteredPtr); + resourceList->inputBufferRegisteredPtr = NULL; + } + + if (resourceList->outputBufferRegisteredPtr != NULL) { + free(resourceList->outputBufferRegisteredPtr); + resourceList->outputBufferRegisteredPtr = NULL; + } + + if (resourceList->outputTaskStatisticsRegisteredPtr != NULL) { + free(resourceList->outputTaskStatisticsRegisteredPtr); + resourceList->outputTaskStatisticsRegisteredPtr = NULL; + } + + if (resourceList->outputStatisticsBufferRegisteredPtr != NULL) { + free(resourceList->outputStatisticsBufferRegisteredPtr); + resourceList->outputStatisticsBufferRegisteredPtr = NULL; + } + + resourceList->numInputTensors = 0; + resourceList->numOutputTensors = 0; + resourceList->numOutputTaskStatistics = 0; +} + +int main(int argc, char** argv) { + cudlaDevHandle devHandle; + cudlaModule 
moduleHandle; + cudlaStatus err; + uint32_t statSupport = 0; + uint32_t dlaFreqInMHz = 0; + FILE* fp = NULL; + struct stat st; + size_t file_size; + size_t actually_read = 0; + unsigned char *loadableData = NULL; + char filename[MAX_FILENAME_LEN]; + const char* suffix = ".csv"; + + cudaStream_t stream; + cudaError_t result; + const char* errPtr = NULL; + + ResourceList resourceList; + + memset(&resourceList, 0x00, sizeof(ResourceList)); + + if ((argc != 4) && (argc != 5)) { + DPRINTF("Usage : ./test_cudla_layerwise_stats_L0_hybrid_test1 \n"); + return 1; + } + + if (argc == 5) { + if((strlen(argv[4])) > (MAX_FILENAME_LEN - RESERVED_SUFFIX_LEN)) + { + DPRINTF("Filename prefix length is too big, greater than maximum permissible prefix length of %u \n",(MAX_FILENAME_LEN - RESERVED_SUFFIX_LEN)); + return 1; + } + } + + // Read loadable into buffer. + fp = fopen(argv[1], "rb"); + if (fp == NULL) { + DPRINTF("Cannot open file %s\n", argv[1]); + return 1; + } + + if (stat(argv[1], &st) != 0) { + DPRINTF("Cannot stat file\n"); + return 1; + } + + file_size = st.st_size; + DPRINTF("The file size = %ld\n", file_size); + + dlaFreqInMHz = atoi(argv[2]); + statSupport = atoi(argv[3]); + + loadableData = (unsigned char *)malloc(file_size); + if (loadableData == NULL) { + DPRINTF("Cannot Allocate memory for loadable\n"); + return 1; + } + + actually_read = fread(loadableData, 1, file_size, fp); + if ( actually_read != file_size ) { + free(loadableData); + DPRINTF("Read wrong size\n"); + return 1; + } + fclose(fp); + + resourceList.loadableData = loadableData; + + // Initialize CUDA. 
+ result = cudaFree(0); + if (result != cudaSuccess) { + errPtr = cudaGetErrorName(result); + DPRINTF("Error in creating cudaFree = %s\n", errPtr); + cleanUp(&resourceList); + return 1; + } + + result = cudaSetDevice(0); + if (result != cudaSuccess) { + errPtr = cudaGetErrorName(result); + DPRINTF("Error in creating cudaSetDevice = %s\n", errPtr); + cleanUp(&resourceList); + return 1; + } + + err = cudlaCreateDevice(0, &devHandle, CUDLA_CUDA_DLA); + if (err != cudlaSuccess) { + DPRINTF("Error in cuDLA create device = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + DPRINTF("Device created successfully\n"); + resourceList.devHandle = devHandle; + + err = cudlaModuleLoadFromMemory(devHandle, loadableData, file_size, &moduleHandle, 0); + if (err != cudlaSuccess) { + DPRINTF("Error in cudlaModuleLoadFromMemory = %d\n", err); + cleanUp(&resourceList); + return 1; + } else { + DPRINTF("Successfully loaded module\n"); + } + + resourceList.moduleHandle = moduleHandle; + + // Create CUDA stream. + result = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + + if (result != cudaSuccess) { + errPtr = cudaGetErrorName(result); + DPRINTF("Error in creating cuda stream = %s\n", errPtr); + cleanUp(&resourceList); + return 1; + } + + resourceList.stream = stream; + + // Get tensor attributes. 
+ uint32_t numInputTensors = 0; + uint32_t numOutputTensors = 0; + uint32_t numOutputTaskStatistics = 0; + + cudlaModuleAttribute attribute; + + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_INPUT_TENSORS, &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting numInputTensors = %d\n", err); + cleanUp(&resourceList); + return 1; + } + numInputTensors = attribute.numInputTensors; + DPRINTF("numInputTensors = %d\n", numInputTensors); + + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_OUTPUT_TENSORS, &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting numOutputTensors = %d\n", err); + cleanUp(&resourceList); + return 1; + } + numOutputTensors = attribute.numOutputTensors; + DPRINTF("numOutputTensors = %d\n", numOutputTensors); + + // using the same attributes to get num_output_task_statistics_tensors + attribute.numOutputTensors = 0; + + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_OUTPUT_TASK_STATISTICS, &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting numOutputTensors = %d\n", err); + cleanUp(&resourceList); + return 1; + } + numOutputTaskStatistics = attribute.numOutputTensors; + DPRINTF("numOutputTaskStatistics = %d\n", numOutputTaskStatistics); + + if(numOutputTaskStatistics == 0) { + DPRINTF("Layerwise stats is not supported for this Loadable \n"); + cleanUp(&resourceList); + return 1; + } + + resourceList.numInputTensors = numInputTensors; + resourceList.numOutputTensors = numOutputTensors; + resourceList.numOutputTaskStatistics = numOutputTaskStatistics; + + cudlaModuleTensorDescriptor* inputTensorDesc = + (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor)*numInputTensors); + cudlaModuleTensorDescriptor* outputTensorDesc = + (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor)*numOutputTensors); + + if ((inputTensorDesc == NULL) || (outputTensorDesc == NULL)) { + if (inputTensorDesc != NULL) { + free(inputTensorDesc); + inputTensorDesc = 
NULL; + } + + if (outputTensorDesc != NULL) { + free(outputTensorDesc); + outputTensorDesc = NULL; + } + + cleanUp(&resourceList); + return 1; + } + + resourceList.inputTensorDesc = inputTensorDesc; + resourceList.outputTensorDesc = outputTensorDesc; + + cudlaModuleTensorDescriptor* outputTaskStatisticsDesc = + (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor)*numOutputTaskStatistics); + if (outputTaskStatisticsDesc == NULL) { + free(outputTaskStatisticsDesc); + outputTaskStatisticsDesc = NULL; + cleanUp(&resourceList); + return 1; + } + + resourceList.outputTaskStatisticsDesc = outputTaskStatisticsDesc; + + attribute.inputTensorDesc = inputTensorDesc; + err = cudlaModuleGetAttributes(moduleHandle, + CUDLA_INPUT_TENSOR_DESCRIPTORS, + &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting input tensor descriptor = %d\n", err); + cleanUp(&resourceList); + return 1; + } + DPRINTF("Printing input tensor descriptor\n"); + printTensorDesc(inputTensorDesc); + + attribute.outputTensorDesc = outputTensorDesc; + err = cudlaModuleGetAttributes(moduleHandle, + CUDLA_OUTPUT_TENSOR_DESCRIPTORS, + &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting output tensor descriptor = %d\n", err); + cleanUp(&resourceList); + return 1; + } + DPRINTF("Printing output tensor descriptor\n"); + printTensorDesc(outputTensorDesc); + + attribute.outputTensorDesc = outputTaskStatisticsDesc; + err = cudlaModuleGetAttributes(moduleHandle, + CUDLA_OUTPUT_TASK_STATISTICS_DESCRIPTORS, + &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting task statistics descriptor = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + DPRINTF("Printing output task statistics descriptor size\n"); + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + DPRINTF("The size of %u descriptor is %lu\n", ii,outputTaskStatisticsDesc[ii].size); + } + + // Setup the input and output buffers which will be used as an input to CUDA. 
+ unsigned char** inputBuffer = (unsigned char **)malloc(sizeof(unsigned char *)*numInputTensors); + if (inputBuffer == NULL) { + DPRINTF("Error in allocating memory for input buffer array\n"); + cleanUp(&resourceList); + return 1; + } + memset(inputBuffer, 0x00, sizeof(unsigned char *)*numInputTensors); + resourceList.inputBuffer = inputBuffer; + for (uint32_t ii = 0; ii < numInputTensors; ii++) { + inputBuffer[ii] = (unsigned char* )malloc(inputTensorDesc[ii].size); + if (inputBuffer[ii] == NULL) { + DPRINTF("Error in allocating input memory\n"); + cleanUp(&resourceList); + return 1; + } + memset(inputBuffer[ii], 0x01, inputTensorDesc[ii].size); + } + + unsigned char** outputBuffer = (unsigned char **)malloc(sizeof(unsigned char *)*numOutputTensors); + if (outputBuffer == NULL) { + DPRINTF("Error in allocating memory for output buffer array\n"); + cleanUp(&resourceList); + return 1; + } + memset(outputBuffer, 0x00, sizeof(unsigned char *)*numOutputTensors); + resourceList.outputBuffer = outputBuffer; + + for (uint32_t ii = 0; ii < numOutputTensors; ii++) { + outputBuffer[ii] = (unsigned char* )malloc(outputTensorDesc[ii].size); + if (outputBuffer[ii] == NULL) { + DPRINTF("Error in allocating output memory\n"); + cleanUp(&resourceList); + return 1; + } + memset(outputBuffer[ii], 0x00, outputTensorDesc[ii].size); + } + + unsigned char** statisticsOutputBuffer = (unsigned char **)malloc(sizeof(unsigned char *)*numOutputTaskStatistics); + if (statisticsOutputBuffer == NULL) { + DPRINTF("Error in allocating memory for output buffer array\n"); + cleanUp(&resourceList); + return 1; + } + memset(statisticsOutputBuffer, 0x00, sizeof(unsigned char *)*numOutputTaskStatistics); + resourceList.statisticsOutputBuffer = statisticsOutputBuffer; + + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + statisticsOutputBuffer[ii] = (unsigned char* )malloc(outputTaskStatisticsDesc[ii].size); + if (outputBuffer[ii] == NULL) { + DPRINTF("Error in allocating output memory\n"); 
+ cleanUp(&resourceList); + return 1; + } + memset(statisticsOutputBuffer[ii], 0x00, outputTaskStatisticsDesc[ii].size); + } + + // Allocate memory on GPU. + void** inputBufferGPU = (void **)malloc(sizeof(void *)*numInputTensors); + if (inputBufferGPU == NULL) { + DPRINTF("Error in allocating memory for input buffer GPU array\n"); + cleanUp(&resourceList); + return 1; + } + memset(inputBufferGPU, 0x00, sizeof(void *)*numInputTensors); + resourceList.inputBufferGPU = inputBufferGPU; + + for (uint32_t ii = 0; ii < numInputTensors; ii++) { + result = cudaMalloc(&(inputBufferGPU[ii]), inputTensorDesc[ii].size); + if (result != cudaSuccess) + { + DPRINTF("Error in allocating input memory on GPU\n"); + cleanUp(&resourceList); + return 1; + } + } + + void** outputBufferGPU = (void **)malloc(sizeof(void *)*numOutputTensors); + if (outputBufferGPU == NULL) { + DPRINTF("Error in allocating memory for output buffer GPU array\n"); + cleanUp(&resourceList); + return 1; + } + memset(outputBufferGPU, 0x00, sizeof(void *)*numOutputTensors); + resourceList.outputBufferGPU = outputBufferGPU; + + for (uint32_t ii = 0; ii < numOutputTensors; ii++) { + result = cudaMalloc(&(outputBufferGPU[ii]), outputTensorDesc[ii].size); + if (result != cudaSuccess) { + DPRINTF("Error in allocating output memory on GPU\n"); + cleanUp(&resourceList); + return 1; + } + } + + void** outputTaskStatisticsGPU = (void **)malloc(sizeof(void *)*numOutputTaskStatistics); + if (outputTaskStatisticsGPU == NULL) { + DPRINTF("Error in allocating memory for output task statistics GPU array\n"); + cleanUp(&resourceList); + return 1; + } + memset(outputTaskStatisticsGPU, 0x00, sizeof(void *)*numOutputTaskStatistics); + resourceList.outputTaskStatisticsGPU = outputTaskStatisticsGPU; + + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + result = cudaMalloc(&(outputTaskStatisticsGPU[ii]), outputTaskStatisticsDesc[ii].size); + if (result != cudaSuccess) { + DPRINTF("Error in allocating task statistics memory 
on GPU\n"); + cleanUp(&resourceList); + return 1; + } + } + + uint64_t** inputBufferRegisteredPtr = (uint64_t **)malloc(sizeof(uint64_t*)*numInputTensors); + uint64_t** outputBufferRegisteredPtr = (uint64_t **)malloc(sizeof(uint64_t*)*numOutputTensors); + uint64_t** outputTaskStatisticsRegisteredPtr = (uint64_t **)malloc(sizeof(uint64_t*)*numOutputTaskStatistics); + + if ((inputBufferRegisteredPtr == NULL) || (outputBufferRegisteredPtr == NULL) || (outputTaskStatisticsRegisteredPtr == NULL)) { + if (inputBufferRegisteredPtr != NULL) { + free(inputBufferRegisteredPtr); + inputBufferRegisteredPtr = NULL; + } + + if (outputBufferRegisteredPtr != NULL) { + free(outputBufferRegisteredPtr); + outputBufferRegisteredPtr = NULL; + } + + if (outputTaskStatisticsRegisteredPtr != NULL) { + free(outputTaskStatisticsRegisteredPtr); + outputTaskStatisticsRegisteredPtr = NULL; + } + + cleanUp(&resourceList); + return 1; + } + + resourceList.inputBufferRegisteredPtr = inputBufferRegisteredPtr; + resourceList.outputBufferRegisteredPtr = outputBufferRegisteredPtr; + resourceList.outputTaskStatisticsRegisteredPtr = outputTaskStatisticsRegisteredPtr; + + // Register the CUDA-allocated buffers. 
+ for (uint32_t ii = 0; ii < numInputTensors; ii++) { + err = cudlaMemRegister(devHandle, + (uint64_t* )(inputBufferGPU[ii]), + inputTensorDesc[ii].size, + &(inputBufferRegisteredPtr[ii]), + 0); + if (err != cudlaSuccess) { + DPRINTF("Error in registering input memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + } + + for (uint32_t ii = 0; ii < numOutputTensors; ii++) { + err = cudlaMemRegister(devHandle, + (uint64_t* )(outputBufferGPU[ii]), + outputTensorDesc[ii].size, + &(outputBufferRegisteredPtr[ii]), + 0); + if (err != cudlaSuccess) { + DPRINTF("Error in registering output memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + } + + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + err = cudlaMemRegister(devHandle, + (uint64_t* )(outputTaskStatisticsGPU[ii]), + outputTaskStatisticsDesc[ii].size, + &(outputTaskStatisticsRegisteredPtr[ii]), + CUDLA_TASK_STATISTICS); + if (err != cudlaSuccess) { + DPRINTF("Error in registering statistics output memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + } + + DPRINTF("ALL MEMORY REGISTERED SUCCESSFULLY\n"); + + // Copy data from CPU buffers to GPU buffers. 
+ for (uint32_t ii = 0; ii < numInputTensors; ii++) { + result = cudaMemcpyAsync(inputBufferGPU[ii], inputBuffer[ii], inputTensorDesc[ii].size, cudaMemcpyHostToDevice, stream); + if (result != cudaSuccess) { + DPRINTF("Error in enqueueing memcpy for input\n"); + cleanUp(&resourceList); + return 1; + } + } + + for (uint32_t ii = 0; ii < numOutputTensors; ii++) { + result = cudaMemsetAsync(outputBufferGPU[ii], 0, outputTensorDesc[ii].size, stream); + if (result != cudaSuccess) { + DPRINTF("Error in enqueueing memset for output\n"); + cleanUp(&resourceList); + return 1; + } + } + + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + result = cudaMemsetAsync(outputTaskStatisticsGPU[ii], 0, outputTaskStatisticsDesc[ii].size, stream); + if (result != cudaSuccess) { + DPRINTF("Error in enqueueing memset for statistics output\n"); + cleanUp(&resourceList); + return 1; + } + } + + uint64_t *outputStatisticsBufferRegisteredPtr[numOutputTensors + numOutputTaskStatistics] = {0}; + uint32_t index = 0; + for (; index < numOutputTensors ; index++) { + outputStatisticsBufferRegisteredPtr[index] = ((outputBufferRegisteredPtr[index])); + } + + for (uint32_t jj=0; jj < numOutputTaskStatistics ; jj++) { + outputStatisticsBufferRegisteredPtr[index++] = ((outputTaskStatisticsRegisteredPtr[jj])); + } + + // Enqueue a cuDLA task. 
+ cudlaTask task; + task.moduleHandle = moduleHandle; + task.outputTensor = (uint64_t * const*)&outputStatisticsBufferRegisteredPtr; + + if(statSupport == 1) { + task.numOutputTensors = (numOutputTensors + numOutputTaskStatistics); + DPRINTF("Layerwise profiling is requested \n"); + } else { + task.numOutputTensors = numOutputTensors; + DPRINTF("Layerwise profiling is not requested \n"); + } + + task.numInputTensors = numInputTensors; + task.inputTensor = inputBufferRegisteredPtr; + task.waitEvents = NULL; + task.signalEvents = NULL; + + err = cudlaSubmitTask(devHandle, &task, 1, stream, 0); + if (err != cudlaSuccess) { + DPRINTF("no of output tensor %u \n",(task.numOutputTensors)); + DPRINTF("Error in submitting task\n"); + cleanUp(&resourceList); + return 1; + } + DPRINTF("SUBMIT IS DONE !!!\n"); + + result = cudaStreamSynchronize(stream); + if (result != cudaSuccess) { + DPRINTF("Error in synchronizing stream = %s\n", cudaGetErrorName(result)); + cleanUp(&resourceList); + return 1; + } + + // Wait for stream operations to finish and bring output buffer to CPU. 
+ for (uint32_t ii = 0; ii < numOutputTensors; ii++) { + result = cudaMemcpyAsync(outputBuffer[ii], outputBufferGPU[ii], + outputTensorDesc[ii].size, cudaMemcpyDeviceToHost, stream); + if (result != cudaSuccess) { + DPRINTF("Error in bringing result back to CPU\n"); + cleanUp(&resourceList); + return 1; + } + } + + result = cudaStreamSynchronize(stream); + if (result != cudaSuccess) { + DPRINTF("Error in synchronizing stream\n"); + cleanUp(&resourceList); + return 1; + } + + if(statSupport == 1) { + // copy statistics data to cpu + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + result = cudaMemcpyAsync(statisticsOutputBuffer[ii], outputTaskStatisticsGPU[ii], + outputTaskStatisticsDesc[ii].size, cudaMemcpyDeviceToHost, stream); + if (result != cudaSuccess) { + DPRINTF("Error in bringing result back to CPU\n"); + cleanUp(&resourceList); + return 1; + } + } + + result = cudaStreamSynchronize(stream); + if (result != cudaSuccess) { + DPRINTF("Error in synchronizing stream\n"); + cleanUp(&resourceList); + return 1; + } + + // To get the last index of the filename prefix in which statistics will be dumped + uint32_t index = 0; + if (argc == 5) { + while(argv[4][index]!='\0') { + index++; + } + } + + const cudlaExternalEtbl* etbl = NULL; + if (cudlaGetExternalExportTable(&etbl,0) != cudlaSuccess) { + DPRINTF("Error in getting export table\n"); + cleanUp(&resourceList); + return 1; + } + + void** csv = (void **)malloc(sizeof(void *)*numOutputTaskStatistics); + if (csv == NULL) { + DPRINTF("Error in allocating memory for csv stream\n"); + cleanUp(&resourceList); + return 1; + } + memset(csv, 0x00, sizeof(void *)*numOutputTaskStatistics); + resourceList.csv = csv; + + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + cudlaTranslateCsvAttribute csvAttribute; + uint64_t csvStreamLength = 0; + + err = etbl->etiTranslateStats(devHandle,statisticsOutputBuffer[ii],dlaFreqInMHz,ii,CUDLA_GET_CSV_LENGTH,&csvAttribute); + csv[ii] = (void* 
)malloc(csvAttribute.csvStreamLength); + csvStreamLength = csvAttribute.csvStreamLength; + DPRINTF("size for statistics buffer %u is %lu \n",ii,csvStreamLength); + + if (csv[ii] == NULL) { + DPRINTF("Error in allocating memory for csv stream\n"); + cleanUp(&resourceList); + return 1; + } + memset(csv[ii], 0x00, csvAttribute.csvStreamLength); + + csvAttribute.csvStreamStats = csv[ii]; + err = etbl->etiTranslateStats(devHandle,statisticsOutputBuffer[ii],dlaFreqInMHz,ii,CUDLA_GET_CSV_STATS,&csvAttribute); + if (err != cudlaSuccess) { + DPRINTF("Error in translating stats\n"); + cleanUp(&resourceList); + return 1; + } + + if (argc == 5) { + sprintf(filename,"%s%u%s", argv[4],(ii+1),suffix); + fp = fopen(filename, "w+"); + if (fp == NULL) { + DPRINTF("Cannot open file %s\n", filename); + cleanUp(&resourceList); + return 1; + } + + uint32_t ret_val = fwrite(csv[ii],sizeof(char),csvStreamLength,fp); + if(ret_val != csvStreamLength) { + DPRINTF("number of elements written to file is %u \n", ret_val); + cleanUp(&resourceList); + return 1; + } + fclose(fp); + } else { + DPRINTF("%s \n",(char *)csv[ii]); + } + } + } + + // unregister the CUDA-allocated buffers. 
+ for (uint32_t ii = 0; ii < numInputTensors; ii++) { + err = cudlaMemUnregister(devHandle, + (inputBufferRegisteredPtr[ii])); + if (err != cudlaSuccess) { + DPRINTF("Error in registering input memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + } + + for (uint32_t ii = 0; ii < numOutputTensors; ii++) { + err = cudlaMemUnregister(devHandle, + (outputBufferRegisteredPtr[ii])); + if (err != cudlaSuccess) { + DPRINTF("Error in registering output memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + } + + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + err = cudlaMemUnregister(devHandle, + (outputTaskStatisticsRegisteredPtr[ii])); + if (err != cudlaSuccess) { + DPRINTF("Error in registering output memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + } + + DPRINTF("ALL MEMORY UNREGISTERED SUCCESSFULLY\n"); + + result = cudaStreamDestroy(stream); + if (result != cudaSuccess) { + errPtr = cudaGetErrorName(result); + DPRINTF("Error in destroying cuda stream = %s\n", errPtr); + cleanUp(&resourceList); + return 1; + } + + resourceList.stream = NULL; + + err = cudlaModuleUnload(moduleHandle, 0); + if (err != cudlaSuccess) { + DPRINTF("Error in cudlaModuleUnload = %d\n", err); + cleanUp(&resourceList); + return 1; + } else { + DPRINTF("Successfully unloaded module\n"); + } + + resourceList.moduleHandle = NULL; + + err = cudlaDestroyDevice(devHandle); + if (err != cudlaSuccess) { + DPRINTF("Error in cuDLA destroy device = %d\n", err); + return 1; + } + DPRINTF("Device destroyed successfully\n"); + + resourceList.devHandle = NULL; + + cleanUp(&resourceList); + + DPRINTF("cuDLALayerwiseStatsHybrid DONE !!!\n"); + + return 0; +} diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/c_cpp_properties.json b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/c_cpp_properties.json new file mode 100644 index 00000000..f0066b0f --- /dev/null +++ 
b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/c_cpp_properties.json @@ -0,0 +1,18 @@ +{ + "configurations": [ + { + "name": "Linux", + "includePath": [ + "${workspaceFolder}/**", + "${workspaceFolder}/../../../Common" + ], + "defines": [], + "compilerPath": "/usr/local/cuda/bin/nvcc", + "cStandard": "gnu17", + "cppStandard": "gnu++14", + "intelliSenseMode": "linux-gcc-x64", + "configurationProvider": "ms-vscode.makefile-tools" + } + ], + "version": 4 +} diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/extensions.json b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/extensions.json new file mode 100644 index 00000000..c7eb54dc --- /dev/null +++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/extensions.json @@ -0,0 +1,7 @@ +{ + "recommendations": [ + "nvidia.nsight-vscode-edition", + "ms-vscode.cpptools", + "ms-vscode.makefile-tools" + ] +} diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/launch.json b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/launch.json new file mode 100644 index 00000000..a0aa79e8 --- /dev/null +++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/launch.json @@ -0,0 +1,10 @@ +{ + "configurations": [ + { + "name": "CUDA C++: Launch", + "type": "cuda-gdb", + "request": "launch", + "program": "${workspaceFolder}/cuDLALayerwiseStatsStandalone" + } + ] +} diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/tasks.json b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/tasks.json new file mode 100644 index 00000000..4509aeb1 --- /dev/null +++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/.vscode/tasks.json @@ -0,0 +1,15 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "label": "sample", + "type": "shell", + "command": "make dbg=1", + "problemMatcher": ["$nvcc"], + "group": { + "kind": "build", + "isDefault": true + } + } + ] +} diff --git 
a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/Makefile b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/Makefile new file mode 100644 index 00000000..aa97778f --- /dev/null +++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/Makefile @@ -0,0 +1,403 @@ +################################################################################ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) +endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. 
+ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifdef HOST_COMPILER + CUSTOM_HOST_COMPILER = 1 +endif + +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' 
-f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell 
expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + NVCCFLAGS += -D_QNX_SOURCE + NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -L/usr/lib/aarch64-qnx-gnu + CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu" + ifdef TARGET_OVERRIDE + LDFLAGS += -lslog2 + endif + + ifneq ($(TARGET_FS),) + LDFLAGS += -L$(TARGET_FS)/usr/lib + CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib" + LDFLAGS += -L$(TARGET_FS)/usr/libnvidia + CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia" + CCFLAGS += -I$(TARGET_FS)/../include + endif + endif +endif + +ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) +endif + +# Install directory of 
different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +# This sample is not supported on Linux x86_64 +ifeq ($(TARGET_OS),linux) + ifeq ($(TARGET_ARCH),x86_64) + $(info >>> WARNING - cuDLALayerwiseStatsStandalone is not supported on Linux x86_64 - waiving sample <<<) + SAMPLE_ENABLED := 0 + endif +endif + +# This sample is not supported on Mac OSX +ifeq ($(TARGET_OS),darwin) + $(info >>> WARNING - cuDLALayerwiseStatsStandalone is not supported on Mac OSX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on ARMv7 +ifeq ($(TARGET_ARCH),armv7l) + $(info >>> WARNING - cuDLALayerwiseStatsStandalone is not supported on ARMv7 - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on sbsa +ifeq ($(TARGET_ARCH),sbsa) + $(info >>> WARNING - 
cuDLALayerwiseStatsStandalone is not supported on sbsa - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../../Common +LIBRARIES := + +################################################################################ + +# Makefile include to help find NVSCI Libraries +include ./findnvsci.mk + +#Detect if installed version of GCC supports required C++11 +ifeq ($(TARGET_OS),linux) + empty := + space := $(empty) $(empty) + GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`) +#Create version number without "." + GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.) + GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.) + GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.) +# Make sure the version number has at least 3 decimals + GCCVERSION += 00 +# Remove spaces from the version number + GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION)) +#$(warning $(GCCVERSION)) + + IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 47000) + ifneq ($(CUSTOM_HOST_COMPILER), 1) + ifeq ($(IS_MIN_VERSION), 1) + $(info >>> GCC Version is greater or equal to 4.7.0 <<<) + else + $(info >>> Waiving build. Minimum GCC version required is 4.7.0<<<) + SAMPLE_ENABLED := 0 + endif + else + $(warning >>> Custom HOST_COMPILER set; skipping GCC version check. This may lead to unintended behavior. 
Please note the minimum equivalent GCC version is 4.7.0 <<<) + endif +endif + +# Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa)) +SMS ?= 53 61 70 72 75 80 86 87 90 +else +SMS ?= 50 52 60 61 70 75 80 86 89 90 +endif + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ALL_CCFLAGS += --std=c++11 --threads 0 + +LIBRARIES += -lcudla -lnvscibuf -lnvscisync + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: cuDLALayerwiseStatsStandalone + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +main.o:main.cpp + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +cuDLALayerwiseStatsStandalone: main.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./cuDLALayerwiseStatsStandalone + +testrun: build + +clean: + rm -f cuDLALayerwiseStatsStandalone main.o + rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/cuDLALayerwiseStatsStandalone + +clobber: clean diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/NsightEclipse.xml 
b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/NsightEclipse.xml new file mode 100644 index 00000000..341a6c76 --- /dev/null +++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/NsightEclipse.xml @@ -0,0 +1,65 @@ + + + + cuDLALayerwiseStatsStandalone + + --std=c++11 + + + whole + + ./ + ../ + ../../../Common + + + cuDLA + Data Parallel Algorithms + Image Processing + + + CUDA + CPP11 + + + cudla + nvscibuf + nvscisync + + + + true + main.cpp + + NVSCI + + + 1:CUDA Advanced Topics + 1:cuDLA + + sm60 + sm61 + sm70 + sm72 + sm75 + sm80 + sm86 + sm87 + sm89 + sm90 + + + aarch64 + linux + + + aarch64 + qnx + + + + 6.0 + + cuDLA Layerwise Statistics Standalone Mode + exe + diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/README.md b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/README.md new file mode 100644 index 00000000..ec12c923 --- /dev/null +++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/README.md @@ -0,0 +1,61 @@ +# cuDLALayerwiseStatsStandalone - cuDLA Layerwise Statistics Standalone Mode + +## Description + +This sample is used to provide layerwise statistics to the application in cuDLA standalone mode where DLA is programmed without using CUDA. 
+ +## Key Concepts + +cuDLA, Data Parallel Algorithms, Image Processing + +## Supported SM Architectures + +[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus) [SM 8.9 ](https://developer.nvidia.com/cuda-gpus) [SM 9.0 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, QNX + +## Supported CPU Architecture + +aarch64 + +## CUDA APIs involved + +## Dependencies needed to build/run +[NVSCI](../../../README.md#nvsci) + +## Prerequisites + +Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Make sure the dependencies mentioned in [Dependencies]() section above are installed. + +## Build and Run + +### Linux +The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=** - cross-compile targeting a specific architecture. Allowed architectures are aarch64. + By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=aarch64`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +## References (for more details) + diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/findnvsci.mk b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/findnvsci.mk new file mode 100644 index 00000000..0d6d157c --- /dev/null +++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/findnvsci.mk @@ -0,0 +1,144 @@ +################################################################################ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################# +# findnvsci.mk is used to find the NVSCI Libraries and headers +# +################################################################################ + +# Determine OS platform and unix distribution +ifeq ("$(TARGET_OS)","linux") + # first search lsb_release + DISTRO = $(shell lsb_release -i -s 2>/dev/null | tr "[:upper:]" "[:lower:]") + ifeq ("$(DISTRO)","") + # second search and parse /etc/issue + DISTRO = $(shell more /etc/issue | awk '{print $$1}' | sed '1!d' | sed -e "/^$$/d" 2>/dev/null | tr "[:upper:]" "[:lower:]") + # ensure data from /etc/issue is valid + ifeq (,$(filter $(DISTRO),ubuntu fedora red rhel centos suse)) + DISTRO = + endif + ifeq ("$(DISTRO)","") + # third, we can search in /etc/os-release or /etc/{distro}-release + DISTRO = $(shell awk '/ID/' /etc/*-release | sed 's/ID=//' | grep -v "VERSION" | grep -v "ID" | grep -v "DISTRIB") + endif + endif +endif + +ifeq ("$(TARGET_OS)","linux") + # $(info) >> findegl.mk -> LINUX path <<<) + # Each set of Linux Distros have different paths for where to find their OpenGL libraries reside + UBUNTU = $(shell echo $(DISTRO) | grep -i ubuntu >/dev/null 2>&1; echo 
$$?) + FEDORA = $(shell echo $(DISTRO) | grep -i fedora >/dev/null 2>&1; echo $$?) + RHEL = $(shell echo $(DISTRO) | grep -i 'red\|rhel' >/dev/null 2>&1; echo $$?) + CENTOS = $(shell echo $(DISTRO) | grep -i centos >/dev/null 2>&1; echo $$?) + SUSE = $(shell echo $(DISTRO) | grep -i 'suse\|sles' >/dev/null 2>&1; echo $$?) + KYLIN = $(shell echo $(DISTRO) | grep -i kylin >/dev/null 2>&1; echo $$?) + ifeq ("$(UBUNTU)","0") + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + GLPATH := /usr/arm-linux-gnueabihf/lib + GLLINK := -L/usr/arm-linux-gnueabihf/lib + ifneq ($(TARGET_FS),) + GLPATH += $(TARGET_FS)/usr/lib/arm-linux-gnueabihf + GLLINK += -L$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + else ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-aarch64) + GLPATH := /usr/aarch64-linux-gnu/lib + GLLINK := -L/usr/aarch64-linux-gnu/lib + ifneq ($(TARGET_FS),) + GLPATH += $(TARGET_FS)/usr/lib + GLPATH += $(TARGET_FS)/usr/lib/aarch64-linux-gnu + GLLINK += -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu + endif + else + UBUNTU_PKG_NAME = $(shell which dpkg >/dev/null 2>&1 && dpkg -l 'nvidia-*' | grep '^ii' | awk '{print $$2}' | head -1) + ifneq ("$(UBUNTU_PKG_NAME)","") + GLPATH ?= /usr/lib/$(UBUNTU_PKG_NAME) + GLLINK ?= -L/usr/lib/$(UBUNTU_PKG_NAME) + endif + DFLT_PATH ?= /usr/lib + endif + endif + ifeq ("$(SUSE)","0") + GLPATH ?= /usr/X11R6/lib64 + GLLINK ?= -L/usr/X11R6/lib64 + DFLT_PATH ?= /usr/lib64 + endif + ifeq ("$(FEDORA)","0") + GLPATH ?= /usr/lib64/nvidia + GLLINK ?= -L/usr/lib64/nvidia + DFLT_PATH ?= /usr/lib64 + endif + ifeq ("$(RHEL)","0") + GLPATH ?= /usr/lib64/nvidia + GLLINK ?= -L/usr/lib64/nvidia + DFLT_PATH ?= /usr/lib64 + endif + ifeq ("$(CENTOS)","0") + GLPATH ?= /usr/lib64/nvidia + GLLINK ?= -L/usr/lib64/nvidia + DFLT_PATH ?= /usr/lib64 + endif + ifeq ("$(KYLIN)","0") + GLPATH ?= /usr/lib64/nvidia + GLLINK ?= -L/usr/lib64/nvidia + DFLT_PATH ?= /usr/lib64 + endif + + NVSCIBUFLIB := $(shell find -L $(GLPATH) $(DFLT_PATH) -name libnvscibuf.so -print 2>/dev/null) + 
NVSCISYNCLIB := $(shell find -L $(GLPATH) $(DFLT_PATH) -name libnvscisync.so -print 2>/dev/null) + + ifeq ("$(NVSCIBUFLIB)","") + $(info >>> WARNING - libnvscibuf.so not found, Waiving the sample <<<) + SAMPLE_ENABLED := 0 + endif + + ifeq ("$(NVSCISYNCLIB)","") + $(info >>> WARNING - libnvscisync.so not found, Waiving the sample <<<) + SAMPLE_ENABLED := 0 + endif + + HEADER_SEARCH_PATH ?= $(TARGET_FS)/usr/include + ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + HEADER_SEARCH_PATH += /usr/arm-linux-gnueabihf/include + else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-aarch64-linux) + HEADER_SEARCH_PATH += /usr/aarch64-linux-gnu/include + endif + + NVSCIBUFHEADER := $(shell find -L $(HEADER_SEARCH_PATH) -name nvscibuf.h -print 2>/dev/null) + NVSCISYNCHEADER := $(shell find -L $(HEADER_SEARCH_PATH) -name nvscisync.h -print 2>/dev/null) + + ifeq ("$(NVSCIBUFHEADER)","") + $(info >>> WARNING - nvscibuf.h not found, Waiving the sample <<<) + SAMPLE_ENABLED := 0 + endif + ifeq ("$(NVSCISYNCHEADER)","") + $(info >>> WARNING - nvscisync.h not found, Waiving the sample <<<) + SAMPLE_ENABLED := 0 + endif +else +endif + diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/main.cpp b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/main.cpp new file mode 100644 index 00000000..639e7889 --- /dev/null +++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/main.cpp @@ -0,0 +1,1348 @@ +/* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "cudla.h" +#include "nvscierror.h" +#include "nvscibuf.h" +#include "nvscisync.h" +#include "cudlaExternalEtbl.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#define MAX_FILENAME_LEN 200 +#define RESERVED_SUFFIX_LEN 10 + +#define DPRINTF(...) 
printf(__VA_ARGS__) + +static void printTensorDesc(cudlaModuleTensorDescriptor* tensorDesc) { + DPRINTF("\tTENSOR NAME : %s\n", tensorDesc->name); + DPRINTF("\tsize: %lu\n", tensorDesc->size); + + DPRINTF("\tdims: [%lu, %lu, %lu, %lu]\n", + tensorDesc->n, + tensorDesc->c, + tensorDesc->h, + tensorDesc->w); + + DPRINTF("\tdata fmt: %d\n", tensorDesc->dataFormat); + DPRINTF("\tdata type: %d\n", tensorDesc->dataType); + DPRINTF("\tdata category: %d\n", tensorDesc->dataCategory); + DPRINTF("\tpixel fmt: %d\n", tensorDesc->pixelFormat); + DPRINTF("\tpixel mapping: %d\n", tensorDesc->pixelMapping); + DPRINTF("\tstride[0]: %d\n", tensorDesc->stride[0]); + DPRINTF("\tstride[1]: %d\n", tensorDesc->stride[1]); + DPRINTF("\tstride[2]: %d\n", tensorDesc->stride[2]); + DPRINTF("\tstride[3]: %d\n", tensorDesc->stride[3]); +} + +typedef struct { + cudlaDevHandle devHandle; + cudlaModule moduleHandle; + unsigned char* loadableData; + uint32_t numInputTensors; + uint32_t numOutputTensors; + uint32_t numOutputTaskStatistics; + unsigned char** inputBuffer; + unsigned char** outputBuffer; + unsigned char** statisticsOutputBuffer; + cudlaModuleTensorDescriptor* inputTensorDesc; + cudlaModuleTensorDescriptor* outputTensorDesc; + cudlaModuleTensorDescriptor* outputTaskStatisticsDesc; + NvSciBufObj* inputBufObj; + NvSciBufObj* outputBufObj; + NvSciBufObj* statisticsBufObj; + NvSciBufModule bufModule; + NvSciBufAttrList* inputAttrList; + NvSciBufAttrList* reconciledInputAttrList; + NvSciBufAttrList* inputConflictList; + NvSciBufAttrList* outputAttrList; + NvSciBufAttrList* reconciledOutputAttrList; + NvSciBufAttrList* outputConflictList; + NvSciSyncObj syncObj; + NvSciSyncModule syncModule; + NvSciSyncCpuWaitContext nvSciCtx; + NvSciSyncAttrList waiterAttrListObj; + NvSciSyncAttrList signalerAttrListObj; + NvSciSyncAttrList nvSciSyncConflictListObj; + NvSciSyncAttrList nvSciSyncReconciledListObj; + NvSciBufAttrList* statisticsOutputAttrList; + NvSciBufAttrList* 
reconciledStatisticsOutputAttrList; + NvSciBufAttrList* statisticsOutputConflictList; + uint64_t** inputBufObjRegPtr; + uint64_t** outputBufObjRegPtr; + uint64_t** statisticsBufObjRegPtr; + uint64_t** devPtrs; + cudlaSignalEvents* signalEvents; + NvSciSyncFence eofFence; + void **csv; +} ResourceList; + +void cleanUp(ResourceList* resourceList); + +void cleanUp(ResourceList* resourceList) { + uint32_t ii = 0; + + if (resourceList->inputTensorDesc != NULL) { + free(resourceList->inputTensorDesc); + resourceList->inputTensorDesc = NULL; + } + if (resourceList->outputTensorDesc != NULL) { + free(resourceList->outputTensorDesc); + resourceList->outputTensorDesc = NULL; + } + + if (resourceList->outputTaskStatisticsDesc != NULL) { + free(resourceList->outputTaskStatisticsDesc); + resourceList->outputTaskStatisticsDesc = NULL; + } + + if (resourceList->loadableData != NULL) { + free(resourceList->loadableData); + resourceList->loadableData = NULL; + } + + if (resourceList->moduleHandle != NULL) { + cudlaModuleUnload(resourceList->moduleHandle, 0); + resourceList->moduleHandle = NULL; + } + + if (resourceList->devHandle != NULL) { + cudlaDestroyDevice(resourceList->devHandle); + resourceList->devHandle = NULL; + } + + if (resourceList->inputBufObj != NULL) { + for (ii = 0; ii < resourceList->numInputTensors; ii++) { + if((resourceList->inputBufObj)[ii] != NULL) { + NvSciBufObjFree((resourceList->inputBufObj)[ii]); + (resourceList->inputBufObj)[ii] = NULL; + } + } + } + + if (resourceList->outputBufObj != NULL) { + for (ii = 0; ii < resourceList->numOutputTensors; ii++) { + if((resourceList->outputBufObj)[ii] != NULL) { + NvSciBufObjFree((resourceList->outputBufObj)[ii]); + (resourceList->outputBufObj)[ii] = NULL; + } + } + } + + if (resourceList->statisticsBufObj != NULL) { + for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) { + if((resourceList->statisticsBufObj)[ii] != NULL) { + NvSciBufObjFree((resourceList->statisticsBufObj)[ii]); + 
(resourceList->statisticsBufObj)[ii] = NULL; + } + } + } + + if (resourceList->inputBuffer != NULL) { + for (ii = 0; ii < resourceList->numInputTensors; ii++) { + if ((resourceList->inputBuffer)[ii] != NULL) { + free((resourceList->inputBuffer)[ii]); + (resourceList->inputBuffer)[ii] = NULL; + } + } + free(resourceList->inputBuffer); + resourceList->inputBuffer = NULL; + } + + if (resourceList->outputBuffer != NULL) { + for (ii = 0; ii < resourceList->numOutputTensors; ii++) { + if ((resourceList->outputBuffer)[ii] != NULL) { + free((resourceList->outputBuffer)[ii]); + (resourceList->outputBuffer)[ii] = NULL; + } + } + free(resourceList->outputBuffer); + resourceList->outputBuffer = NULL; + } + + if (resourceList->statisticsOutputBuffer != NULL) { + for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) { + if ((resourceList->statisticsOutputBuffer)[ii] != NULL) { + free((resourceList->statisticsOutputBuffer)[ii]); + (resourceList->statisticsOutputBuffer)[ii] = NULL; + } + } + free(resourceList->statisticsOutputBuffer); + resourceList->statisticsOutputBuffer = NULL; + } + + if (resourceList->csv != NULL) { + for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) { + if ((resourceList->csv)[ii] != NULL) { + free((resourceList->csv)[ii]); + (resourceList->csv)[ii] = NULL; + } + } + free(resourceList->csv); + resourceList->csv = NULL; + } + + if (resourceList->reconciledInputAttrList != NULL) { + for (ii = 0; ii < resourceList->numInputTensors; ii++) { + if((resourceList->reconciledInputAttrList)[ii] != NULL) { + NvSciBufAttrListFree((resourceList->reconciledInputAttrList)[ii]); + (resourceList->reconciledInputAttrList)[ii] = NULL; + } + } + free(resourceList->reconciledInputAttrList); + resourceList->reconciledInputAttrList = NULL; + } + + if (resourceList->inputConflictList != NULL) { + for (ii = 0; ii < resourceList->numInputTensors; ii++) { + if((resourceList->inputConflictList)[ii] != NULL) { + 
NvSciBufAttrListFree((resourceList->inputConflictList)[ii]); + (resourceList->inputConflictList)[ii] = NULL; + } + } + free(resourceList->inputConflictList); + resourceList->inputConflictList = NULL; + } + + if (resourceList->inputAttrList != NULL) { + for (ii = 0; ii < resourceList->numInputTensors; ii++) { + if((resourceList->inputAttrList)[ii] != NULL) { + NvSciBufAttrListFree((resourceList->inputAttrList)[ii]); + (resourceList->inputAttrList)[ii] = NULL; + } + } + free(resourceList->inputAttrList); + resourceList->inputAttrList = NULL; + } + + if (resourceList->reconciledOutputAttrList != NULL) { + for (ii = 0; ii < resourceList->numOutputTensors; ii++) { + if((resourceList->reconciledOutputAttrList)[ii] != NULL) { + NvSciBufAttrListFree((resourceList->reconciledOutputAttrList)[ii]); + (resourceList->reconciledOutputAttrList)[ii] = NULL; + } + } + free(resourceList->reconciledOutputAttrList); + resourceList->reconciledOutputAttrList = NULL; + } + + if (resourceList->outputConflictList != NULL) { + for (ii = 0; ii < resourceList->numOutputTensors; ii++) { + if((resourceList->outputConflictList)[ii] != NULL) { + NvSciBufAttrListFree((resourceList->outputConflictList)[ii]); + (resourceList->outputConflictList)[ii] = NULL; + } + } + free(resourceList->outputConflictList); + resourceList->outputConflictList = NULL; + } + + if (resourceList->outputAttrList != NULL) { + for (ii = 0; ii < resourceList->numOutputTensors; ii++) { + if((resourceList->outputAttrList)[ii] != NULL) { + NvSciBufAttrListFree((resourceList->outputAttrList)[ii]); + (resourceList->outputAttrList)[ii] = NULL; + } + } + free(resourceList->outputAttrList); + resourceList->outputAttrList = NULL; + } + + if (resourceList->reconciledStatisticsOutputAttrList != NULL) { + for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) { + if((resourceList->reconciledStatisticsOutputAttrList)[ii] != NULL) { + NvSciBufAttrListFree((resourceList->reconciledStatisticsOutputAttrList)[ii]); + 
(resourceList->reconciledStatisticsOutputAttrList)[ii] = NULL; + } + } + free(resourceList->reconciledStatisticsOutputAttrList); + resourceList->reconciledStatisticsOutputAttrList = NULL; + } + + if (resourceList->statisticsOutputConflictList != NULL) { + for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) { + if((resourceList->statisticsOutputConflictList)[ii] != NULL) { + NvSciBufAttrListFree((resourceList->statisticsOutputConflictList)[ii]); + (resourceList->statisticsOutputConflictList)[ii] = NULL; + } + } + free(resourceList->statisticsOutputConflictList); + resourceList->statisticsOutputConflictList = NULL; + } + + if (resourceList->statisticsOutputAttrList != NULL) { + for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) { + if((resourceList->statisticsOutputAttrList)[ii] != NULL) { + NvSciBufAttrListFree((resourceList->statisticsOutputAttrList)[ii]); + (resourceList->statisticsOutputAttrList)[ii] = NULL; + } + } + free(resourceList->statisticsOutputAttrList); + resourceList->statisticsOutputAttrList = NULL; + } + + if (resourceList->outputBufObjRegPtr != NULL) { + free(resourceList->outputBufObjRegPtr); + resourceList->outputBufObjRegPtr = NULL; + } + + if (resourceList->statisticsBufObjRegPtr != NULL) { + free(resourceList->statisticsBufObjRegPtr); + resourceList->statisticsBufObjRegPtr = NULL; + } + + if (resourceList->inputBufObjRegPtr != NULL) { + free(resourceList->inputBufObjRegPtr); + resourceList->inputBufObjRegPtr = NULL; + } + + if (resourceList->bufModule != NULL) { + NvSciBufModuleClose(resourceList->bufModule); + resourceList->bufModule = NULL; + } + + NvSciSyncFenceClear(&(resourceList->eofFence)); + if (resourceList->syncObj != NULL) { + NvSciSyncObjFree(resourceList->syncObj); + resourceList->syncObj = NULL; + } + + if (resourceList->nvSciSyncConflictListObj != NULL) { + NvSciSyncAttrListFree(resourceList->nvSciSyncConflictListObj); + resourceList->nvSciSyncConflictListObj = NULL; + } + + if 
(resourceList->nvSciSyncReconciledListObj != NULL) { + NvSciSyncAttrListFree(resourceList->nvSciSyncReconciledListObj); + resourceList->nvSciSyncReconciledListObj = NULL; + } + + if (resourceList->signalerAttrListObj != NULL) { + NvSciSyncAttrListFree(resourceList->signalerAttrListObj); + resourceList->signalerAttrListObj = NULL; + } + + if (resourceList->waiterAttrListObj != NULL) { + NvSciSyncAttrListFree(resourceList->waiterAttrListObj); + resourceList->waiterAttrListObj = NULL; + } + + if (resourceList->nvSciCtx != NULL) { + NvSciSyncCpuWaitContextFree(resourceList->nvSciCtx); + resourceList->nvSciCtx = NULL; + } + + if (resourceList->syncModule != NULL) { + NvSciSyncModuleClose(resourceList->syncModule); + resourceList->syncModule = NULL; + } + + if (resourceList->signalEvents != NULL) { + if (resourceList->signalEvents->eofFences != NULL) { + free(resourceList->signalEvents->eofFences); + resourceList->signalEvents->eofFences = NULL; + } + free(resourceList->signalEvents); + resourceList->signalEvents = NULL; + } + + if (resourceList->devPtrs != NULL) { + free(resourceList->devPtrs); + resourceList->devPtrs = NULL; + } + + resourceList->numInputTensors = 0; + resourceList->numOutputTensors = 0; + resourceList->numOutputTaskStatistics = 0; +} + +cudlaStatus createAndSetAttrList(NvSciBufModule module, + uint64_t bufSize, + NvSciBufAttrList *attrList); + + +cudlaStatus createAndSetAttrList(NvSciBufModule module, + uint64_t bufSize, + NvSciBufAttrList *attrList) { + cudlaStatus status = cudlaSuccess; + NvSciError sciStatus = NvSciError_Success; + + sciStatus = NvSciBufAttrListCreate(module, attrList); + if (sciStatus != NvSciError_Success) { + status = cudlaErrorNvSci; + DPRINTF("Error in creating NvSciBuf attribute list\n"); + return status; + } + + // TODO: Refactor into multiple dimensions + bool needCpuAccess = true; + NvSciBufAttrValAccessPerm perm = NvSciBufAccessPerm_ReadWrite; + uint32_t dimcount = 1; + uint64_t sizes[] = {bufSize}; + uint32_t alignment[] 
= {1}; + uint32_t dataType = NvSciDataType_Int8; + NvSciBufType type = NvSciBufType_Tensor; + uint64_t baseAddrAlign = 512; + + NvSciBufAttrKeyValuePair setAttrs[] = { + {.key = NvSciBufGeneralAttrKey_Types, + .value = &type, + .len = sizeof(type)}, + {.key = NvSciBufTensorAttrKey_DataType, + .value = &dataType, + .len = sizeof(dataType)}, + {.key = NvSciBufTensorAttrKey_NumDims, + .value = &dimcount, + .len = sizeof(dimcount)}, + {.key = NvSciBufTensorAttrKey_SizePerDim, + .value = &sizes, + .len = sizeof(sizes)}, + {.key = NvSciBufTensorAttrKey_AlignmentPerDim, + .value = &alignment, + .len = sizeof(alignment)}, + {.key = NvSciBufTensorAttrKey_BaseAddrAlign, + .value = &baseAddrAlign, + .len = sizeof(baseAddrAlign)}, + {.key = NvSciBufGeneralAttrKey_RequiredPerm, + .value = &perm, + .len = sizeof(perm)}, + {.key = NvSciBufGeneralAttrKey_NeedCpuAccess, + .value = &needCpuAccess, + .len = sizeof(needCpuAccess)}}; + size_t length = sizeof(setAttrs) / sizeof(NvSciBufAttrKeyValuePair); + + sciStatus = NvSciBufAttrListSetAttrs(*attrList, setAttrs, length); + if (sciStatus != NvSciError_Success) + { + status = cudlaErrorNvSci; + DPRINTF("Error in setting NvSciBuf attribute list\n"); + return status; + } + + return status; +} + +NvSciError fillCpuWaiterAttrList(NvSciSyncAttrList list); + +NvSciError fillCpuWaiterAttrList(NvSciSyncAttrList list) { + bool cpuWaiter = true; + NvSciSyncAttrKeyValuePair keyValue[2]; + memset(keyValue, 0, sizeof(keyValue)); + keyValue[0].attrKey = NvSciSyncAttrKey_NeedCpuAccess; + keyValue[0].value = (void*) &cpuWaiter; + keyValue[0].len = sizeof(cpuWaiter); + NvSciSyncAccessPerm cpuPerm = NvSciSyncAccessPerm_WaitOnly; + keyValue[1].attrKey = NvSciSyncAttrKey_RequiredPerm; + keyValue[1].value = (void*) &cpuPerm; + keyValue[1].len = sizeof(cpuPerm); + return NvSciSyncAttrListSetAttrs(list, keyValue, 2); +} + +int main(int argc, char** argv) { + cudlaDevHandle devHandle; + cudlaModule moduleHandle; + cudlaStatus err; + uint32_t statSupport = 0; 
+ uint32_t dlaFreqInMHz = 0; + FILE* fp = NULL; + struct stat st; + size_t file_size; + size_t actually_read = 0; + unsigned char *loadableData = NULL; + char filename[MAX_FILENAME_LEN]; + const char* suffix = ".csv"; + + + ResourceList resourceList; + + memset(&resourceList, 0x00, sizeof(ResourceList)); + + if ((argc != 4) && (argc != 5)) { + DPRINTF("Usage : ./test_cudla_layerwise_stats_L0_standalone_test1 \n"); + return 1; + } + + if (argc == 5) { + if((strlen(argv[4])) > (MAX_FILENAME_LEN - RESERVED_SUFFIX_LEN)) { + DPRINTF("Filename prefix length is too big, greater than maximum permissible prefix length of %u \n",(MAX_FILENAME_LEN - RESERVED_SUFFIX_LEN)); + return 1; + } + } + + // Read loadable into buffer. + fp = fopen(argv[1], "rb"); + if (fp == NULL) { + DPRINTF("Cannot open file %s\n", argv[1]); + return 1; + } + + if (stat(argv[1], &st) != 0) { + DPRINTF("Cannot stat file\n"); + return 1; + } + + file_size = st.st_size; + DPRINTF("The file size = %ld\n", file_size); + + dlaFreqInMHz = atoi(argv[2]); + statSupport = atoi(argv[3]); + + loadableData = (unsigned char *)malloc(file_size); + if (loadableData == NULL) { + DPRINTF("Cannot Allocate memory for loadable\n"); + return 1; + } + + actually_read = fread(loadableData, 1, file_size, fp); + if ( actually_read != file_size ) { + free(loadableData); + DPRINTF("Read wrong size\n"); + return 1; + } + fclose(fp); + + resourceList.loadableData = loadableData; + + err = cudlaCreateDevice(0, &devHandle, CUDLA_STANDALONE); + if (err != cudlaSuccess) { + DPRINTF("Error in cuDLA create device = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + DPRINTF("Device created successfully\n"); + resourceList.devHandle = devHandle; + + err = cudlaModuleLoadFromMemory(devHandle, loadableData, file_size, &moduleHandle, 0); + if (err != cudlaSuccess) { + DPRINTF("Error in cudlaModuleLoadFromMemory = %d\n", err); + cleanUp(&resourceList); + return 1; + } else { + DPRINTF("Successfully loaded module\n"); + } + + 
resourceList.moduleHandle = moduleHandle; + + // Get tensor attributes. + uint32_t numInputTensors = 0; + uint32_t numOutputTensors = 0; + uint32_t numOutputTaskStatistics = 0; + + cudlaModuleAttribute attribute; + + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_INPUT_TENSORS, &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting numInputTensors = %d\n", err); + cleanUp(&resourceList); + return 1; + } + numInputTensors = attribute.numInputTensors; + DPRINTF("numInputTensors = %d\n", numInputTensors); + + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_OUTPUT_TENSORS, &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting numOutputTensors = %d\n", err); + cleanUp(&resourceList); + return 1; + } + numOutputTensors = attribute.numOutputTensors; + DPRINTF("numOutputTensors = %d\n", numOutputTensors); + + // using the same attributes to get num_output_task_statistics_tensors + attribute.numOutputTensors = 0; + + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_OUTPUT_TASK_STATISTICS, &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting numOutputTensors = %d\n", err); + cleanUp(&resourceList); + return 1; + } + numOutputTaskStatistics = attribute.numOutputTensors; + DPRINTF("numOutputTaskStatistics = %d\n", numOutputTaskStatistics); + + if(numOutputTaskStatistics == 0) { + DPRINTF("Layerwise stats is not supported for this Loadable \n"); + cleanUp(&resourceList); + return 1; + } + + resourceList.numInputTensors = numInputTensors; + resourceList.numOutputTensors = numOutputTensors; + resourceList.numOutputTaskStatistics = numOutputTaskStatistics; + + cudlaModuleTensorDescriptor* inputTensorDesc = + (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor)*numInputTensors); + cudlaModuleTensorDescriptor* outputTensorDesc = + (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor)*numOutputTensors); + + if ((inputTensorDesc == NULL) || (outputTensorDesc == NULL)) { + if 
(inputTensorDesc != NULL) + { + free(inputTensorDesc); + inputTensorDesc = NULL; + } + + if (outputTensorDesc != NULL) + { + free(outputTensorDesc); + outputTensorDesc = NULL; + } + + cleanUp(&resourceList); + return 1; + } + + resourceList.inputTensorDesc = inputTensorDesc; + resourceList.outputTensorDesc = outputTensorDesc; + + cudlaModuleTensorDescriptor* outputTaskStatisticsDesc = + (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor)*numOutputTaskStatistics); + if (outputTaskStatisticsDesc == NULL) { + free(outputTaskStatisticsDesc); + outputTaskStatisticsDesc = NULL; + cleanUp(&resourceList); + return 1; + } + + resourceList.outputTaskStatisticsDesc = outputTaskStatisticsDesc; + + attribute.inputTensorDesc = inputTensorDesc; + err = cudlaModuleGetAttributes(moduleHandle, + CUDLA_INPUT_TENSOR_DESCRIPTORS, + &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting input tensor descriptor = %d\n", err); + cleanUp(&resourceList); + return 1; + } + DPRINTF("Printing input tensor descriptor\n"); + printTensorDesc(inputTensorDesc); + + attribute.outputTensorDesc = outputTensorDesc; + err = cudlaModuleGetAttributes(moduleHandle, + CUDLA_OUTPUT_TENSOR_DESCRIPTORS, + &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting output tensor descriptor = %d\n", err); + cleanUp(&resourceList); + return 1; + } + DPRINTF("Printing output tensor descriptor\n"); + printTensorDesc(outputTensorDesc); + + attribute.outputTensorDesc = outputTaskStatisticsDesc; + err = cudlaModuleGetAttributes(moduleHandle, + CUDLA_OUTPUT_TASK_STATISTICS_DESCRIPTORS, + &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting task statistics descriptor = %d\n", err); + cleanUp(&resourceList); + return 1; + } + DPRINTF("Printing output task statistics descriptor size\n"); + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + DPRINTF("The size of %u descriptor is %lu\n", ii,outputTaskStatisticsDesc[ii].size); + } + + // Setup the input 
and output buffers. + unsigned char** inputBuffer = (unsigned char **)malloc(sizeof(unsigned char *)*numInputTensors); + if (inputBuffer == NULL) { + DPRINTF("Error in allocating memory for input buffer array\n"); + cleanUp(&resourceList); + return 1; + } + memset(inputBuffer, 0x00, sizeof(unsigned char *)*numInputTensors); + resourceList.inputBuffer = inputBuffer; + + for (uint32_t ii = 0; ii < numInputTensors; ii++) { + inputBuffer[ii] = (unsigned char* )malloc(inputTensorDesc[ii].size); + if (inputBuffer[ii] == NULL) { + DPRINTF("Error in allocating input memory\n"); + cleanUp(&resourceList); + return 1; + } + memset(inputBuffer[ii], 0x01, inputTensorDesc[ii].size); + } + + unsigned char** outputBuffer = (unsigned char **)malloc(sizeof(unsigned char *)*numOutputTensors); + if (outputBuffer == NULL) { + DPRINTF("Error in allocating memory for output buffer array\n"); + cleanUp(&resourceList); + return 1; + } + memset(outputBuffer, 0x00, sizeof(unsigned char *)*numOutputTensors); + resourceList.outputBuffer = outputBuffer; + + for (uint32_t ii = 0; ii < numOutputTensors; ii++) { + outputBuffer[ii] = (unsigned char* )malloc(outputTensorDesc[ii].size); + if (outputBuffer[ii] == NULL) { + DPRINTF("Error in allocating output memory\n"); + cleanUp(&resourceList); + return 1; + } + memset(outputBuffer[ii], 0x00, outputTensorDesc[ii].size); + } + + unsigned char** statisticsOutputBuffer = (unsigned char **)malloc(sizeof(unsigned char *)*numOutputTaskStatistics); + if (statisticsOutputBuffer == NULL) { + DPRINTF("Error in allocating memory for output buffer array\n"); + cleanUp(&resourceList); + return 1; + } + memset(statisticsOutputBuffer, 0x00, sizeof(unsigned char *)*numOutputTaskStatistics); + resourceList.statisticsOutputBuffer = statisticsOutputBuffer; + + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + statisticsOutputBuffer[ii] = (unsigned char* )malloc(outputTaskStatisticsDesc[ii].size); + if (statisticsOutputBuffer[ii] == NULL) { + DPRINTF("Error in 
allocating output memory\n"); + cleanUp(&resourceList); + return 1; + } + memset(statisticsOutputBuffer[ii], 0x00, outputTaskStatisticsDesc[ii].size); + } + + NvSciBufModule bufModule = NULL; + NvSciBufAttrList *inputAttrList = {NULL}; + NvSciBufAttrList *outputAttrList = {NULL}; + NvSciBufAttrList *statisticsOutputAttrList = {NULL}; + NvSciBufAttrList *reconciledInputAttrList = {NULL}; + NvSciBufAttrList *reconciledOutputAttrList = {NULL}; + NvSciBufAttrList *reconciledStatisticsOutputAttrList = {NULL}; + NvSciBufAttrList *inputConflictList = {NULL}; + NvSciBufAttrList *outputConflictList = {NULL}; + NvSciBufAttrList *statisticsOutputConflictList = {NULL}; + NvSciError sciError = NvSciError_Success; + + sciError = NvSciBufModuleOpen(&bufModule); + if (sciError != NvSciError_Success) { + DPRINTF("Error in initializing NvSciBufModule\n"); + cleanUp(&resourceList); + return 1; + } + resourceList.bufModule = bufModule; + + // creating and setting input attribute list + + inputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numInputTensors); + if (inputAttrList == NULL) { + DPRINTF("Error in allocating memory for input buffer array\n"); + cleanUp(&resourceList); + return 1; + } + memset(inputAttrList, 0x00, sizeof(NvSciBufAttrList)*numInputTensors); + resourceList.inputAttrList = inputAttrList; + + reconciledInputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numInputTensors); + if (reconciledInputAttrList == NULL) { + DPRINTF("Error in allocating memory for input buffer array\n"); + cleanUp(&resourceList); + return 1; + } + memset(reconciledInputAttrList, 0x00, sizeof(NvSciBufAttrList)*numInputTensors); + resourceList.reconciledInputAttrList = reconciledInputAttrList; + + inputConflictList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numInputTensors); + if (inputConflictList == NULL) { + DPRINTF("Error in allocating memory for input buffer array\n"); + cleanUp(&resourceList); + return 1; + } + memset(inputConflictList, 0x00, 
sizeof(NvSciBufAttrList)*numInputTensors); + resourceList.inputConflictList = inputConflictList; + + + for (uint32_t ii = 0; ii < numInputTensors; ii++) { + err = createAndSetAttrList(bufModule, + inputTensorDesc[ii].size, + &inputAttrList[ii]); + if (err != cudlaSuccess) { + DPRINTF("Error in creating NvSciBuf attribute list for input attribute\n"); + cleanUp(&resourceList); + return 1; + } + + sciError = NvSciBufAttrListReconcile(&inputAttrList[ii], + 1, + &reconciledInputAttrList[ii], + &inputConflictList[ii]); + if (sciError != NvSciError_Success) { + DPRINTF("Error in reconciling NvSciBuf attribute list for input attribute\n"); + cleanUp(&resourceList); + return 1; + } + + } + + outputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTensors); + if (outputAttrList == NULL) { + DPRINTF("Error in allocating memory for input buffer array\n"); + cleanUp(&resourceList); + return 1; + } + memset(outputAttrList, 0x00, sizeof(NvSciBufAttrList)*numOutputTensors); + resourceList.outputAttrList = outputAttrList; + + reconciledOutputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTensors); + if (reconciledOutputAttrList == NULL) { + DPRINTF("Error in allocating memory for input buffer array\n"); + cleanUp(&resourceList); + return 1; + } + memset(reconciledOutputAttrList, 0x00, sizeof(NvSciBufAttrList)*numOutputTensors); + resourceList.reconciledOutputAttrList = reconciledOutputAttrList; + + outputConflictList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTensors); + if (outputConflictList == NULL) { + DPRINTF("Error in allocating memory for input buffer array\n"); + cleanUp(&resourceList); + return 1; + } + memset(outputConflictList, 0x00, sizeof(NvSciBufAttrList)*numOutputTensors); + resourceList.outputConflictList = outputConflictList; + + // creating and setting output attribute list + for (uint32_t ii = 0; ii < numOutputTensors; ii++) { + err = createAndSetAttrList(bufModule, + outputTensorDesc[ii].size, + 
&outputAttrList[ii]); + if (err != cudlaSuccess) { + DPRINTF("Error in creating NvSciBuf attribute list for output attibute\n"); + cleanUp(&resourceList); + return 1; + } + + sciError = NvSciBufAttrListReconcile(&outputAttrList[ii], + 1, + &reconciledOutputAttrList[ii], + &outputConflictList[ii]); + if (sciError != NvSciError_Success) { + DPRINTF("Error in reconciling NvSciBuf attribute list for output attribute\n"); + cleanUp(&resourceList); + return 1; + } + } + + statisticsOutputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTaskStatistics); + if (statisticsOutputAttrList == NULL) { + DPRINTF("Error in allocating memory for input buffer array\n"); + cleanUp(&resourceList); + return 1; + } + memset(statisticsOutputAttrList, 0x00, sizeof(NvSciBufAttrList)*numOutputTaskStatistics); + resourceList.statisticsOutputAttrList = statisticsOutputAttrList; + + reconciledStatisticsOutputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTaskStatistics); + if (reconciledStatisticsOutputAttrList == NULL) { + DPRINTF("Error in allocating memory for input buffer array\n"); + cleanUp(&resourceList); + return 1; + } + memset(reconciledStatisticsOutputAttrList, 0x00, sizeof(NvSciBufAttrList)*numOutputTaskStatistics); + resourceList.reconciledStatisticsOutputAttrList = reconciledStatisticsOutputAttrList; + + statisticsOutputConflictList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTaskStatistics); + if (statisticsOutputConflictList == NULL) { + DPRINTF("Error in allocating memory for input buffer array\n"); + cleanUp(&resourceList); + return 1; + } + memset(statisticsOutputConflictList, 0x00, sizeof(NvSciBufAttrList)*numOutputTaskStatistics); + resourceList.statisticsOutputConflictList = statisticsOutputConflictList; + + // creating and setting statistics output attribute list + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + err = createAndSetAttrList(bufModule, + outputTaskStatisticsDesc[ii].size, + 
&statisticsOutputAttrList[ii]); + if (err != cudlaSuccess) { + DPRINTF("Error in creating NvSciBuf attribute list\n"); + cleanUp(&resourceList); + return 1; + } + + sciError = NvSciBufAttrListReconcile(&statisticsOutputAttrList[ii], + 1, + &reconciledStatisticsOutputAttrList[ii], + &statisticsOutputConflictList[ii]); + if (sciError != NvSciError_Success) { + DPRINTF("Error in reconciling NvSciBuf attribute list\n"); + cleanUp(&resourceList); + return 1; + } + } + + NvSciBufObj *inputBufObj = (NvSciBufObj *)malloc(sizeof(NvSciBufObj)*numInputTensors); + NvSciBufObj *outputBufObj = (NvSciBufObj *)malloc(sizeof(NvSciBufObj)*numOutputTensors); + NvSciBufObj *statisticsBufObj = (NvSciBufObj *)malloc(sizeof(NvSciBufObj)*numOutputTaskStatistics); + + resourceList.inputBufObj = inputBufObj; + resourceList.outputBufObj = outputBufObj; + resourceList.statisticsBufObj = statisticsBufObj; + + for (uint32_t ii = 0; ii < numInputTensors; ii++) { + sciError = NvSciBufObjAlloc(reconciledInputAttrList[ii], &inputBufObj[ii]); + if (sciError != NvSciError_Success) { + DPRINTF("Error in allocating NvSciBuf object\n"); + cleanUp(&resourceList); + return 1; + } + } + + for (uint32_t ii = 0; ii < numOutputTensors; ii++) { + sciError = NvSciBufObjAlloc(reconciledOutputAttrList[ii], &outputBufObj[ii]); + if (sciError != NvSciError_Success) { + DPRINTF("Error in allocating NvSciBuf object\n"); + cleanUp(&resourceList); + return 1; + } + } + + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + sciError = NvSciBufObjAlloc(reconciledStatisticsOutputAttrList[ii], &statisticsBufObj[ii]); + if (sciError != NvSciError_Success) { + DPRINTF("Error in allocating NvSciBuf object\n"); + cleanUp(&resourceList); + return 1; + } + } + + uint64_t** inputBufObjRegPtr = (uint64_t **)malloc(sizeof(uint64_t*)*numInputTensors); + uint64_t** outputBufObjRegPtr = (uint64_t **)malloc(sizeof(uint64_t*)*numOutputTensors); + uint64_t** statisticsBufObjRegPtr = (uint64_t 
**)malloc(sizeof(uint64_t*)*numOutputTaskStatistics); + + if ((inputBufObjRegPtr == NULL) || (outputBufObjRegPtr == NULL) || (statisticsBufObjRegPtr == NULL)) { + if (inputBufObjRegPtr != NULL) { + free(inputBufObjRegPtr); + inputBufObjRegPtr = NULL; + } + + if (outputBufObjRegPtr != NULL) { + free(outputBufObjRegPtr); + outputBufObjRegPtr = NULL; + } + + if (statisticsBufObjRegPtr != NULL) { + free(statisticsBufObjRegPtr); + statisticsBufObjRegPtr = NULL; + } + + cleanUp(&resourceList); + return 1; + } + + resourceList.inputBufObjRegPtr = inputBufObjRegPtr; + resourceList.outputBufObjRegPtr = outputBufObjRegPtr; + resourceList.statisticsBufObjRegPtr = statisticsBufObjRegPtr; + + void **inputBufObjBuffer = (void **)malloc(sizeof(void*)*numInputTensors); + void **outputBufObjBuffer = (void **)malloc(sizeof(void*)*numOutputTensors); + void **statisticsBufObjBuffer = (void **)malloc(sizeof(void*)*numOutputTaskStatistics); + + cudlaExternalMemoryHandleDesc memDesc = { 0 }; + // importing external memory + for (uint32_t ii = 0; ii < numInputTensors; ii++) { + memset(&memDesc, 0, sizeof(memDesc)); + memDesc.extBufObject = (void *)inputBufObj[ii]; + memDesc.size = inputTensorDesc[ii].size; + err = cudlaImportExternalMemory(devHandle, &memDesc, &inputBufObjRegPtr[ii], 0); + if (err != cudlaSuccess) { + DPRINTF("Error in importing external memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + sciError = NvSciBufObjGetCpuPtr(inputBufObj[ii], &inputBufObjBuffer[ii]); + if (sciError != NvSciError_Success) { + DPRINTF("Error in getting NvSciBuf CPU pointer\n"); + cleanUp(&resourceList); + return 1; + } + memcpy(inputBufObjBuffer[ii], inputBuffer[ii], inputTensorDesc[ii].size); + } + + for (uint32_t ii = 0; ii < numOutputTensors; ii++) { + memset(&memDesc, 0, sizeof(memDesc)); + memDesc.extBufObject = (void *)outputBufObj[ii]; + memDesc.size = outputTensorDesc[ii].size; + err = cudlaImportExternalMemory(devHandle, &memDesc, &outputBufObjRegPtr[ii], 0); + if (err != 
cudlaSuccess) { + DPRINTF("Error in importing external memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + sciError = NvSciBufObjGetCpuPtr(outputBufObj[ii], &outputBufObjBuffer[ii]); + if (sciError != NvSciError_Success) { + DPRINTF("Error in getting NvSciBuf CPU pointer\n"); + cleanUp(&resourceList); + return 1; + } + memset(outputBufObjBuffer[ii], 0, outputTensorDesc[ii].size); + } + + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + memset(&memDesc, 0, sizeof(memDesc)); + memDesc.extBufObject = (void *)statisticsBufObj[ii]; + memDesc.size = outputTaskStatisticsDesc[ii].size; + err = cudlaImportExternalMemory(devHandle, &memDesc, &statisticsBufObjRegPtr[ii], CUDLA_TASK_STATISTICS); + if (err != cudlaSuccess) { + DPRINTF("Error in importing external memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + sciError = NvSciBufObjGetCpuPtr(statisticsBufObj[ii], &statisticsBufObjBuffer[ii]); + if (sciError != NvSciError_Success) { + DPRINTF("Error in getting NvSciBuf CPU pointer\n"); + cleanUp(&resourceList); + return 1; + } + memset(statisticsBufObjBuffer[ii], 0, outputTaskStatisticsDesc[ii].size); + } + + uint64_t *outputStatisticsBufferRegisteredPtr[numOutputTensors + numOutputTaskStatistics] = {0} ; + + uint32_t index = 0; + for (; index < numOutputTensors ; index++) { + outputStatisticsBufferRegisteredPtr[index] = ((outputBufObjRegPtr[index])); + } + + for (uint32_t jj=0; jj < numOutputTaskStatistics ; jj++) { + outputStatisticsBufferRegisteredPtr[index++] = ((statisticsBufObjRegPtr[jj])); + } + + NvSciSyncObj syncObj; + NvSciSyncModule syncModule; + NvSciSyncAttrList syncAttrListObj[2]; + NvSciSyncCpuWaitContext nvSciCtx; + NvSciSyncAttrList waiterAttrListObj = NULL; + NvSciSyncAttrList signalerAttrListObj = NULL; + NvSciSyncAttrList nvSciSyncConflictListObj; + NvSciSyncAttrList nvSciSyncReconciledListObj; + + sciError = NvSciSyncModuleOpen(&syncModule); + if (sciError != NvSciError_Success) { + DPRINTF("Error in 
initializing NvSciSyncModuleOpen\n"); + cleanUp(&resourceList); + return 1; + } + resourceList.syncModule = syncModule; + + sciError = NvSciSyncCpuWaitContextAlloc(syncModule, &nvSciCtx); + if (sciError != NvSciError_Success) { + DPRINTF("Error in allocating cpu wait context NvSciSyncCpuWaitContextAlloc\n"); + cleanUp(&resourceList); + return 1; + } + resourceList.nvSciCtx = nvSciCtx; + + sciError = NvSciSyncAttrListCreate(syncModule, &signalerAttrListObj); + if (sciError != NvSciError_Success) { + DPRINTF("Error in creating NvSciSync attribute list\n"); + cleanUp(&resourceList); + return 1; + } + resourceList.signalerAttrListObj = signalerAttrListObj; + + sciError = NvSciSyncAttrListCreate(syncModule, &waiterAttrListObj); + if (sciError != NvSciError_Success) { + DPRINTF("Error in creating NvSciSync attribute list\n"); + cleanUp(&resourceList); + return 1; + } + resourceList.waiterAttrListObj = waiterAttrListObj; + + err = cudlaGetNvSciSyncAttributes(reinterpret_cast<uint64_t*>(signalerAttrListObj), + CUDLA_NVSCISYNC_ATTR_SIGNAL); + if (err != cudlaSuccess) { + DPRINTF("Error in getting cuDLA's NvSciSync attributes\n"); + cleanUp(&resourceList); + return 1; + } + + sciError = fillCpuWaiterAttrList(waiterAttrListObj); + if (sciError != NvSciError_Success) { + DPRINTF("Error in setting NvSciSync attribute list\n"); + cleanUp(&resourceList); + return 1; + } + + syncAttrListObj[0] = signalerAttrListObj; + syncAttrListObj[1] = waiterAttrListObj; + sciError = NvSciSyncAttrListReconcile(syncAttrListObj, + 2, + &nvSciSyncReconciledListObj, + &nvSciSyncConflictListObj); + if (sciError != NvSciError_Success) { + DPRINTF("Error in reconciling NvSciSync's attribute lists\n"); + cleanUp(&resourceList); + return 1; + } + resourceList.nvSciSyncConflictListObj = nvSciSyncConflictListObj; + resourceList.nvSciSyncReconciledListObj = nvSciSyncReconciledListObj; + + sciError = NvSciSyncObjAlloc(nvSciSyncReconciledListObj, &syncObj); + if (sciError != NvSciError_Success) { + DPRINTF("Error in 
allocating NvSciSync object\n"); + cleanUp(&resourceList); + return 1; + } + resourceList.syncObj = syncObj; + + // importing external semaphore + uint64_t* nvSciSyncObjRegPtr = NULL; + cudlaExternalSemaphoreHandleDesc semaMemDesc = { 0 }; + memset(&semaMemDesc, 0, sizeof(semaMemDesc)); + semaMemDesc.extSyncObject = syncObj; + err = cudlaImportExternalSemaphore(devHandle, + &semaMemDesc, + &nvSciSyncObjRegPtr, + 0); + if (err != cudlaSuccess) { + DPRINTF("Error in importing external semaphore = %d\n", err); + cleanUp(&resourceList); + return 1; + } + DPRINTF("ALL MEMORY REGISTERED SUCCESSFULLY\n"); + + // Signal Events + cudlaSignalEvents* signalEvents; + signalEvents = (cudlaSignalEvents *)malloc(sizeof(cudlaSignalEvents)); + if (signalEvents == NULL) { + DPRINTF("Error in allocating signal events\n"); + cleanUp(&resourceList); + return 1; + } + + signalEvents->numEvents = 1; + uint64_t** devPtrs = (uint64_t **)malloc(signalEvents->numEvents * + sizeof(uint64_t *)); + if (devPtrs == NULL) { + DPRINTF("Error in allocating output pointer's array of registered objects\n"); + cleanUp(&resourceList); + return 1; + } + devPtrs[0] = nvSciSyncObjRegPtr; + signalEvents->devPtrs = devPtrs; + resourceList.devPtrs = devPtrs; + + signalEvents->eofFences = (CudlaFence *)malloc(signalEvents->numEvents * + sizeof(CudlaFence)); + if (signalEvents->eofFences == NULL) { + DPRINTF("Error in allocating eofFence array\n"); + cleanUp(&resourceList); + return 1; + } + + NvSciSyncFence eofFence = NvSciSyncFenceInitializer; + signalEvents->eofFences[0].fence = &eofFence; + signalEvents->eofFences[0].type = CUDLA_NVSCISYNC_FENCE; + resourceList.signalEvents = signalEvents; + resourceList.eofFence = eofFence; + + // Enqueue a cuDLA task. 
+ cudlaTask task; + task.moduleHandle = moduleHandle; + task.outputTensor = (uint64_t * const*)&outputStatisticsBufferRegisteredPtr; + + if(statSupport == 1) { + task.numOutputTensors = (numOutputTensors + numOutputTaskStatistics); + DPRINTF("Layerwise profiling is requested \n"); + } else { + task.numOutputTensors = numOutputTensors; + DPRINTF("Layerwise profiling is not requested \n"); + } + + task.numInputTensors = numInputTensors; + task.inputTensor = inputBufObjRegPtr; + task.waitEvents = NULL; + task.signalEvents = signalEvents; + + err = cudlaSubmitTask(devHandle, &task, 1, NULL, 0); + if (err != cudlaSuccess) { + DPRINTF("Error in submitting task\n"); + cleanUp(&resourceList); + return 1; + } + DPRINTF("SUBMIT IS DONE !!!\n"); + + // Wait for operations to finish and bring output buffer to CPU. + sciError = NvSciSyncFenceWait(reinterpret_cast<NvSciSyncFence*>(signalEvents->eofFences[0].fence), + nvSciCtx, -1); + if (sciError != NvSciError_Success) { + DPRINTF("Error in waiting on NvSciSyncFence\n"); + cleanUp(&resourceList); + return 1; + } + + // copy statistics data to cpu + for (uint32_t ii = 0; ii < numOutputTensors; ii++) { + memcpy(outputBuffer[ii], outputBufObjBuffer[ii], outputTensorDesc[ii].size); + } + + if(statSupport == 1) { + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + memcpy(statisticsOutputBuffer[ii], statisticsBufObjBuffer[ii], outputTaskStatisticsDesc[ii].size); + } + + const cudlaExternalEtbl* etbl = NULL; + if (cudlaGetExternalExportTable(&etbl,0) != cudlaSuccess) { + DPRINTF("Error in getting export table\n"); + cleanUp(&resourceList); + return 1; + } + + void** csv = (void **)malloc(sizeof(void *)*numOutputTaskStatistics); + if (csv == NULL) { + DPRINTF("Error in allocating memory for csv stream\n"); + cleanUp(&resourceList); + return 1; + } + memset(csv, 0x00, sizeof(void *)*numOutputTaskStatistics); + resourceList.csv = csv; + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + cudlaTranslateCsvAttribute csvAttribute; + 
uint64_t csvStreamLength = 0; + + err = etbl->etiTranslateStats(devHandle,statisticsOutputBuffer[ii],dlaFreqInMHz,ii,CUDLA_GET_CSV_LENGTH,&csvAttribute); + csv[ii] = (void* )malloc(csvAttribute.csvStreamLength); + csvStreamLength = csvAttribute.csvStreamLength; + DPRINTF("size for statistics buffer %u is %lu \n",ii,csvStreamLength); + + if (csv[ii] == NULL) { + DPRINTF("Error in allocating memory for csv stream\n"); + cleanUp(&resourceList); + return 1; + } + memset(csv[ii], 0x00, csvAttribute.csvStreamLength); + + csvAttribute.csvStreamStats = csv[ii]; + err = etbl->etiTranslateStats(devHandle,statisticsOutputBuffer[ii],dlaFreqInMHz,ii,CUDLA_GET_CSV_STATS,&csvAttribute); + if (err != cudlaSuccess) { + DPRINTF("Error in translating stats\n"); + cleanUp(&resourceList); + return 1; + } + + if (argc == 5) { + sprintf(filename,"%s%u%s", argv[4],(ii+1),suffix); + fp = fopen(filename, "w+"); + if (fp == NULL) { + DPRINTF("Cannot open file %s\n", filename); + cleanUp(&resourceList); + return 1; + } + + uint32_t ret_val = fwrite(csv[ii],sizeof(char),csvStreamLength,fp); + if(ret_val != csvStreamLength) { + DPRINTF("number of elements written to file is %u \n", ret_val); + cleanUp(&resourceList); + return 1; + } + + fclose(fp); + } else { + DPRINTF("%s \n",(char *)csv[ii]); + } + } + } + + // unregister the CUDA-allocated buffers. 
+ for (uint32_t ii = 0; ii < numInputTensors; ii++) { + err = cudlaMemUnregister(devHandle, + (inputBufObjRegPtr[ii])); + if (err != cudlaSuccess) { + DPRINTF("Error in registering input memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + } + + for (uint32_t ii = 0; ii < numOutputTensors; ii++) { + err = cudlaMemUnregister(devHandle, + (outputBufObjRegPtr[ii])); + if (err != cudlaSuccess) { + DPRINTF("Error in registering output memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + } + + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + err = cudlaMemUnregister(devHandle, + (statisticsBufObjRegPtr[ii])); + if (err != cudlaSuccess) { + DPRINTF("Error in registering output memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + } + + err = cudlaMemUnregister(devHandle, nvSciSyncObjRegPtr); + if (err != cudlaSuccess) { + DPRINTF("Error in unregistering external semaphore = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + DPRINTF("ALL MEMORY UNREGISTERED SUCCESSFULLY\n"); + + + err = cudlaModuleUnload(moduleHandle, 0); + if (err != cudlaSuccess) { + DPRINTF("Error in cudlaModuleUnload = %d\n", err); + cleanUp(&resourceList); + return 1; + } else { + DPRINTF("Successfully unloaded module\n"); + } + + resourceList.moduleHandle = NULL; + + err = cudlaDestroyDevice(devHandle); + if (err != cudlaSuccess) { + DPRINTF("Error in cuDLA destroy device = %d\n", err); + return 1; + } + DPRINTF("Device destroyed successfully\n"); + + resourceList.devHandle = NULL; + + cleanUp(&resourceList); + + DPRINTF("cuDLALayerwiseStatsStandalone DONE !!!\n"); + + return 0; +} diff --git a/Samples/4_CUDA_Libraries/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver.cpp b/Samples/4_CUDA_Libraries/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver.cpp index d42ac64c..9ee2f171 100644 --- a/Samples/4_CUDA_Libraries/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver.cpp +++ b/Samples/4_CUDA_Libraries/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver.cpp 
@@ -552,7 +552,6 @@ int main(int argc, char *argv[]) { checkCudaErrors(cusparseScatter(cusparseHandle, vecz, vecx)); checkCudaErrors(cusparseDestroySpVec(vecz)); - checkCudaErrors(cudaDeviceSynchronize()); stop = second(); diff --git a/Samples/5_Domain_Specific/simpleVulkan/VulkanBaseApp.cpp b/Samples/5_Domain_Specific/simpleVulkan/VulkanBaseApp.cpp index e244bfe2..6e27672f 100644 --- a/Samples/5_Domain_Specific/simpleVulkan/VulkanBaseApp.cpp +++ b/Samples/5_Domain_Specific/simpleVulkan/VulkanBaseApp.cpp @@ -31,12 +31,13 @@ * https://vulkan-tutorial.com/ */ -#include -#include -#include #include +#include #include +#include +#include #include +#include #include #include "VulkanBaseApp.h" diff --git a/Samples/5_Domain_Specific/simpleVulkanMMAP/VulkanBaseApp.cpp b/Samples/5_Domain_Specific/simpleVulkanMMAP/VulkanBaseApp.cpp index cfb5c190..23151947 100644 --- a/Samples/5_Domain_Specific/simpleVulkanMMAP/VulkanBaseApp.cpp +++ b/Samples/5_Domain_Specific/simpleVulkanMMAP/VulkanBaseApp.cpp @@ -31,14 +31,16 @@ * https://vulkan-tutorial.com/ */ -#include -#include -#include #include +#include #include +#include +#include #include +#include #include + #include "VulkanBaseApp.h" #include "VulkanCudaInterop.h"