commit 8bb8c5fac058dd66e061a0530afd0336ad4470f7 Author: Andy Dick Date: Fri Mar 2 16:07:37 2018 -0800 Initial public release for CUDA 9.2. diff --git a/Common/drvapi_error_string.h b/Common/drvapi_error_string.h new file mode 100644 index 00000000..c0e42fbb --- /dev/null +++ b/Common/drvapi_error_string.h @@ -0,0 +1,470 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef COMMON_DRVAPI_ERROR_STRING_H_ +#define COMMON_DRVAPI_ERROR_STRING_H_ + +#include +#include +#include + +#ifdef __cuda_cuda_h__ // check to see if CUDA_H is included above + +// Error Code string definitions here +typedef struct { + char const *error_string; + int error_id; +} s_CudaErrorStr; + +/** + * Error codes + */ +static s_CudaErrorStr sCudaDrvErrorString[] = { + /** + * The API call returned with no errors. In the case of query calls, this + * can also mean that the operation being queried is complete (see + * ::cuEventQuery() and ::cuStreamQuery()). + */ + {"CUDA_SUCCESS", 0}, + + /** + * This indicates that one or more of the parameters passed to the API call + * is not within an acceptable range of values. + */ + {"CUDA_ERROR_INVALID_VALUE", 1}, + + /** + * The API call failed because it was unable to allocate enough memory to + * perform the requested operation. + */ + {"CUDA_ERROR_OUT_OF_MEMORY", 2}, + + /** + * This indicates that the CUDA driver has not been initialized with + * ::cuInit() or that initialization has failed. + */ + {"CUDA_ERROR_NOT_INITIALIZED", 3}, + + /** + * This indicates that the CUDA driver is in the process of shutting down. + */ + {"CUDA_ERROR_DEINITIALIZED", 4}, + + /** + * This indicates profiling APIs are called while application is running + * in visual profiler mode. + */ + {"CUDA_ERROR_PROFILER_DISABLED", 5}, + /** + * This indicates profiling has not been initialized for this context. + * Call cuProfilerInitialize() to resolve this. + */ + {"CUDA_ERROR_PROFILER_NOT_INITIALIZED", 6}, + /** + * This indicates profiler has already been started and probably + * cuProfilerStart() is incorrectly called. + */ + {"CUDA_ERROR_PROFILER_ALREADY_STARTED", 7}, + /** + * This indicates profiler has already been stopped and probably + * cuProfilerStop() is incorrectly called. 
+ */ + {"CUDA_ERROR_PROFILER_ALREADY_STOPPED", 8}, + /** + * This indicates that no CUDA-capable devices were detected by the + * installed CUDA driver. + */ + {"CUDA_ERROR_NO_DEVICE (no CUDA-capable devices were detected)", 100}, + + /** + * This indicates that the device ordinal supplied by the user does not + * correspond to a valid CUDA device. + */ + {"CUDA_ERROR_INVALID_DEVICE (device specified is not a valid CUDA device)", + 101}, + + /** + * This indicates that the device kernel image is invalid. This can also + * indicate an invalid CUDA module. + */ + {"CUDA_ERROR_INVALID_IMAGE", 200}, + + /** + * This most frequently indicates that there is no context bound to the + * current thread. This can also be returned if the context passed to an + * API call is not a valid handle (such as a context that has had + * ::cuCtxDestroy() invoked on it). This can also be returned if a user + * mixes different API versions (i.e. 3010 context with 3020 API calls). + * See ::cuCtxGetApiVersion() for more details. + */ + {"CUDA_ERROR_INVALID_CONTEXT", 201}, + + /** + * This indicated that the context being supplied as a parameter to the + * API call was already the active context. + * \deprecated + * This error return is deprecated as of CUDA 3.2. It is no longer an + * error to attempt to push the active context via ::cuCtxPushCurrent(). + */ + {"CUDA_ERROR_CONTEXT_ALREADY_CURRENT", 202}, + + /** + * This indicates that a map or register operation has failed. + */ + {"CUDA_ERROR_MAP_FAILED", 205}, + + /** + * This indicates that an unmap or unregister operation has failed. + */ + {"CUDA_ERROR_UNMAP_FAILED", 206}, + + /** + * This indicates that the specified array is currently mapped and thus + * cannot be destroyed. + */ + {"CUDA_ERROR_ARRAY_IS_MAPPED", 207}, + + /** + * This indicates that the resource is already mapped. + */ + {"CUDA_ERROR_ALREADY_MAPPED", 208}, + + /** + * This indicates that there is no kernel image available that is suitable + * for the device. 
This can occur when a user specifies code generation + * options for a particular CUDA source file that do not include the + * corresponding device configuration. + */ + {"CUDA_ERROR_NO_BINARY_FOR_GPU", 209}, + + /** + * This indicates that a resource has already been acquired. + */ + {"CUDA_ERROR_ALREADY_ACQUIRED", 210}, + + /** + * This indicates that a resource is not mapped. + */ + {"CUDA_ERROR_NOT_MAPPED", 211}, + + /** + * This indicates that a mapped resource is not available for access as an + * array. + */ + {"CUDA_ERROR_NOT_MAPPED_AS_ARRAY", 212}, + + /** + * This indicates that a mapped resource is not available for access as a + * pointer. + */ + {"CUDA_ERROR_NOT_MAPPED_AS_POINTER", 213}, + + /** + * This indicates that an uncorrectable ECC error was detected during + * execution. + */ + {"CUDA_ERROR_ECC_UNCORRECTABLE", 214}, + + /** + * This indicates that the ::CUlimit passed to the API call is not + * supported by the active device. + */ + {"CUDA_ERROR_UNSUPPORTED_LIMIT", 215}, + + /** + * This indicates that the ::CUcontext passed to the API call can + * only be bound to a single CPU thread at a time but is already + * bound to a CPU thread. + */ + {"CUDA_ERROR_CONTEXT_ALREADY_IN_USE", 216}, + + /** + * This indicates that peer access is not supported across the given + * devices. + */ + {"CUDA_ERROR_PEER_ACCESS_UNSUPPORTED", 217}, + + /** + * This indicates that a PTX JIT compilation failed. + */ + {"CUDA_ERROR_INVALID_PTX", 218}, + + /** + * This indicates an error with OpenGL or DirectX context. + */ + {"CUDA_ERROR_INVALID_GRAPHICS_CONTEXT", 219}, + + /** + * This indicates that an uncorrectable NVLink error was detected during the + * execution. + */ + {"CUDA_ERROR_NVLINK_UNCORRECTABLE", 220}, + + /** + * This indicates that the PTX JIT compiler library was not found. + */ + {"CUDA_ERROR_JIT_COMPILER_NOT_FOUND", 221}, + + /** + * This indicates that the device kernel source is invalid. 
+ */ + {"CUDA_ERROR_INVALID_SOURCE", 300}, + + /** + * This indicates that the file specified was not found. + */ + {"CUDA_ERROR_FILE_NOT_FOUND", 301}, + + /** + * This indicates that a link to a shared object failed to resolve. + */ + {"CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND", 302}, + + /** + * This indicates that initialization of a shared object failed. + */ + {"CUDA_ERROR_SHARED_OBJECT_INIT_FAILED", 303}, + + /** + * This indicates that an OS call failed. + */ + {"CUDA_ERROR_OPERATING_SYSTEM", 304}, + + /** + * This indicates that a resource handle passed to the API call was not + * valid. Resource handles are opaque types like ::CUstream and ::CUevent. + */ + {"CUDA_ERROR_INVALID_HANDLE", 400}, + + /** + * This indicates that a named symbol was not found. Examples of symbols + * are global/constant variable names, texture names }, and surface names. + */ + {"CUDA_ERROR_NOT_FOUND", 500}, + + /** + * This indicates that asynchronous operations issued previously have not + * completed yet. This result is not actually an error, but must be + * indicated differently than ::CUDA_SUCCESS (which indicates completion). + * Calls that may return this value include ::cuEventQuery() and + * ::cuStreamQuery(). + */ + {"CUDA_ERROR_NOT_READY", 600}, + + /** + * While executing a kernel, the device encountered a + * load or store instruction on an invalid memory address. + * This leaves the process in an inconsistent state and any further CUDA + * work will return the same error. To continue using CUDA, the process must + * be terminated and relaunched. + */ + {"CUDA_ERROR_ILLEGAL_ADDRESS", 700}, + + /** + * This indicates that a launch did not occur because it did not have + * appropriate resources. This error usually indicates that the user has + * attempted to pass too many arguments to the device kernel, or the + * kernel launch specifies too many threads for the kernel's register + * count. Passing arguments of the wrong size (i.e. 
a 64-bit pointer + * when a 32-bit int is expected) is equivalent to passing too many + * arguments and can also result in this error. + */ + {"CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES", 701}, + + /** + * This indicates that the device kernel took too long to execute. This can + * only occur if timeouts are enabled - see the device attribute + * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The + * context cannot be used (and must be destroyed similar to + * ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from + * this context are invalid and must be reconstructed if the program is to + * continue using CUDA. + */ + {"CUDA_ERROR_LAUNCH_TIMEOUT", 702}, + + /** + * This error indicates a kernel launch that uses an incompatible texturing + * mode. + */ + {"CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING", 703}, + + /** + * This error indicates that a call to ::cuCtxEnablePeerAccess() is + * trying to re-enable peer access to a context which has already + * had peer access to it enabled. + */ + {"CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED", 704}, + + /** + * This error indicates that ::cuCtxDisablePeerAccess() is + * trying to disable peer access which has not been enabled yet + * via ::cuCtxEnablePeerAccess(). + */ + {"CUDA_ERROR_PEER_ACCESS_NOT_ENABLED", 705}, + + /** + * This error indicates that the primary context for the specified device + * has already been initialized. + */ + {"CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE", 708}, + + /** + * This error indicates that the context current to the calling thread + * has been destroyed using ::cuCtxDestroy }, or is a primary context which + * has not yet been initialized. + */ + {"CUDA_ERROR_CONTEXT_IS_DESTROYED", 709}, + + /** + * A device-side assert triggered during kernel execution. The context + * cannot be used anymore, and must be destroyed. All existing device + * memory allocations from this context are invalid and must be + * reconstructed if the program is to continue using CUDA. 
+ */ + {"CUDA_ERROR_ASSERT", 710}, + + /** + * This error indicates that the hardware resources required to enable + * peer access have been exhausted for one or more of the devices + * passed to ::cuCtxEnablePeerAccess(). + */ + {"CUDA_ERROR_TOO_MANY_PEERS", 711}, + + /** + * This error indicates that the memory range passed to + * ::cuMemHostRegister() has already been registered. + */ + {"CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED", 712}, + + /** + * This error indicates that the pointer passed to ::cuMemHostUnregister() + * does not correspond to any currently registered memory region. + */ + {"CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED", 713}, + + /** + * While executing a kernel, the device encountered a stack error. + * This can be due to stack corruption or exceeding the stack size limit. + * This leaves the process in an inconsistent state and any further CUDA + * work will return the same error. To continue using CUDA, the process must + * be terminated and relaunched. + */ + {"CUDA_ERROR_HARDWARE_STACK_ERROR", 714}, + + /** + * While executing a kernel, the device encountered an illegal instruction. + * This leaves the process in an inconsistent state and any further CUDA + * work will return the same error. To continue using CUDA, the process must + * be terminated and relaunched. + */ + {"CUDA_ERROR_ILLEGAL_INSTRUCTION", 715}, + + /** + * While executing a kernel, the device encountered a load or store + * instruction on a memory address which is not aligned. This leaves the + * process in an inconsistent state and any further CUDA work will return + * the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + {"CUDA_ERROR_MISALIGNED_ADDRESS", 716}, + + /** + * While executing a kernel, the device encountered an instruction + * which can only operate on memory locations in certain address spaces + * (global, shared, or local), but was supplied a memory address not + * belonging to an allowed address space. 
+ * This leaves the process in an inconsistent state and any further CUDA + * work will return the same error. To continue using CUDA, the process must + * be terminated and relaunched. + */ + {"CUDA_ERROR_INVALID_ADDRESS_SPACE", 717}, + + /** + * While executing a kernel, the device program counter wrapped its address + * space. This leaves the process in an inconsistent state and any further + * CUDA work will return the same error. To continue using CUDA, the process + * must be terminated and relaunched. + */ + {"CUDA_ERROR_INVALID_PC", 718}, + + /** + * An exception occurred on the device while executing a kernel. Common + * causes include dereferencing an invalid device pointer and accessing + * out of bounds shared memory. The context cannot be used }, so it must + * be destroyed (and a new one should be created). All existing device + * memory allocations from this context are invalid and must be + * reconstructed if the program is to continue using CUDA. + */ + {"CUDA_ERROR_LAUNCH_FAILED", 719}, + + /** + * This error indicates that the number of blocks launched per grid for a + * kernel that was launched via either ::cuLaunchCooperativeKernel or + * ::cuLaunchCooperativeKernelMultiDevice exceeds the maximum number of + * blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor or + * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number + * of multiprocessors as specified by the device attribute + * ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. + */ + {"CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE", 720}, + + /** + * This error indicates that the attempted operation is not permitted. + */ + {"CUDA_ERROR_NOT_PERMITTED", 800}, + + /** + * This error indicates that the attempted operation is not supported + * on the current system or device. + */ + {"CUDA_ERROR_NOT_SUPPORTED", 801}, + + /** + * This indicates that an unknown internal error has occurred. 
+ */ + {"CUDA_ERROR_UNKNOWN", 999}, + {NULL, -1}}; + +// This is just a linear search through the array, since the error_id's are not +// always ocurring consecutively +inline const char *getCudaDrvErrorString(CUresult error_id) { + int index = 0; + + while (sCudaDrvErrorString[index].error_id != error_id && + sCudaDrvErrorString[index].error_id != -1) { + index++; + } + + if (sCudaDrvErrorString[index].error_id == error_id) + return (const char *)sCudaDrvErrorString[index].error_string; + else + return (const char *)"CUDA_ERROR not found!"; +} + +#endif // __cuda_cuda_h__ + +#endif // COMMON_DRVAPI_ERROR_STRING_H_ diff --git a/Common/exception.h b/Common/exception.h new file mode 100644 index 00000000..6449c21f --- /dev/null +++ b/Common/exception.h @@ -0,0 +1,151 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* CUda UTility Library */ +#ifndef COMMON_EXCEPTION_H_ +#define COMMON_EXCEPTION_H_ + +// includes, system +#include +#include +#include +#include +#include + +//! Exception wrapper. +//! @param Std_Exception Exception out of namespace std for easy typing. +template +class Exception : public Std_Exception { + public: + //! @brief Static construction interface + //! @return Alwayss throws ( Located_Exception) + //! @param file file in which the Exception occurs + //! @param line line in which the Exception occurs + //! @param detailed details on the code fragment causing the Exception + static void throw_it(const char *file, const int line, + const char *detailed = "-"); + + //! Static construction interface + //! @return Alwayss throws ( Located_Exception) + //! @param file file in which the Exception occurs + //! @param line line in which the Exception occurs + //! @param detailed details on the code fragment causing the Exception + static void throw_it(const char *file, const int line, + const std::string &detailed); + + //! Destructor + virtual ~Exception() throw(); + + private: + //! Constructor, default (private) + Exception(); + + //! Constructor, standard + //! @param str string returned by what() + explicit Exception(const std::string &str); +}; + +//////////////////////////////////////////////////////////////////////////////// +//! Exception handler function for arbitrary exceptions +//! 
@param ex exception to handle +//////////////////////////////////////////////////////////////////////////////// +template +inline void handleException(const Exception_Typ &ex) { + std::cerr << ex.what() << std::endl; + + exit(EXIT_FAILURE); +} + +//! Convenience macros + +//! Exception caused by dynamic program behavior, e.g. file does not exist +#define RUNTIME_EXCEPTION(msg) \ + Exception::throw_it(__FILE__, __LINE__, msg) + +//! Logic exception in program, e.g. an assert failed +#define LOGIC_EXCEPTION(msg) \ + Exception::throw_it(__FILE__, __LINE__, msg) + +//! Out of range exception +#define RANGE_EXCEPTION(msg) \ + Exception::throw_it(__FILE__, __LINE__, msg) + +//////////////////////////////////////////////////////////////////////////////// +//! Implementation + +// includes, system +#include + +//////////////////////////////////////////////////////////////////////////////// +//! Static construction interface. +//! @param Exception causing code fragment (file and line) and detailed infos. +//////////////////////////////////////////////////////////////////////////////// +/*static*/ template +void Exception::throw_it(const char *file, const int line, + const char *detailed) { + std::stringstream s; + + // Quiet heavy-weight but exceptions are not for + // performance / release versions + s << "Exception in file '" << file << "' in line " << line << "\n" + << "Detailed description: " << detailed << "\n"; + + throw Exception(s.str()); +} + +//////////////////////////////////////////////////////////////////////////////// +//! Static construction interface. +//! @param Exception causing code fragment (file and line) and detailed infos. +//////////////////////////////////////////////////////////////////////////////// +/*static*/ template +void Exception::throw_it(const char *file, const int line, + const std::string &msg) { + throw_it(file, line, msg.c_str()); +} + +//////////////////////////////////////////////////////////////////////////////// +//! 
Constructor, default (private). +//////////////////////////////////////////////////////////////////////////////// +template +Exception::Exception() : Std_Exception("Unknown Exception.\n") {} + +//////////////////////////////////////////////////////////////////////////////// +//! Constructor, standard (private). +//! String returned by what(). +//////////////////////////////////////////////////////////////////////////////// +template +Exception::Exception(const std::string &s) : Std_Exception(s) {} + +//////////////////////////////////////////////////////////////////////////////// +//! Destructor +//////////////////////////////////////////////////////////////////////////////// +template +Exception::~Exception() throw() {} + + // functions, exported + +#endif // COMMON_EXCEPTION_H_ diff --git a/Common/helper_cuda.h b/Common/helper_cuda.h new file mode 100644 index 00000000..1e1c84f2 --- /dev/null +++ b/Common/helper_cuda.h @@ -0,0 +1,1341 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +//////////////////////////////////////////////////////////////////////////////// +// These are CUDA Helper functions for initialization and error checking + +#ifndef COMMON_HELPER_CUDA_H_ +#define COMMON_HELPER_CUDA_H_ + +#pragma once + +#include +#include +#include +#include + +#include + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// Note, it is required that your SDK sample to include the proper header +// files, please refer the CUDA examples for examples of the needed CUDA +// headers, which may change depending on which CUDA functions are used. 
+ +// CUDA Runtime error messages +#ifdef __DRIVER_TYPES_H__ +static const char *_cudaGetErrorEnum(cudaError_t error) { + switch (error) { + case cudaSuccess: + return "cudaSuccess"; + + case cudaErrorMissingConfiguration: + return "cudaErrorMissingConfiguration"; + + case cudaErrorMemoryAllocation: + return "cudaErrorMemoryAllocation"; + + case cudaErrorInitializationError: + return "cudaErrorInitializationError"; + + case cudaErrorLaunchFailure: + return "cudaErrorLaunchFailure"; + + case cudaErrorPriorLaunchFailure: + return "cudaErrorPriorLaunchFailure"; + + case cudaErrorLaunchTimeout: + return "cudaErrorLaunchTimeout"; + + case cudaErrorLaunchOutOfResources: + return "cudaErrorLaunchOutOfResources"; + + case cudaErrorInvalidDeviceFunction: + return "cudaErrorInvalidDeviceFunction"; + + case cudaErrorInvalidConfiguration: + return "cudaErrorInvalidConfiguration"; + + case cudaErrorInvalidDevice: + return "cudaErrorInvalidDevice"; + + case cudaErrorInvalidValue: + return "cudaErrorInvalidValue"; + + case cudaErrorInvalidPitchValue: + return "cudaErrorInvalidPitchValue"; + + case cudaErrorInvalidSymbol: + return "cudaErrorInvalidSymbol"; + + case cudaErrorMapBufferObjectFailed: + return "cudaErrorMapBufferObjectFailed"; + + case cudaErrorUnmapBufferObjectFailed: + return "cudaErrorUnmapBufferObjectFailed"; + + case cudaErrorInvalidHostPointer: + return "cudaErrorInvalidHostPointer"; + + case cudaErrorInvalidDevicePointer: + return "cudaErrorInvalidDevicePointer"; + + case cudaErrorInvalidTexture: + return "cudaErrorInvalidTexture"; + + case cudaErrorInvalidTextureBinding: + return "cudaErrorInvalidTextureBinding"; + + case cudaErrorInvalidChannelDescriptor: + return "cudaErrorInvalidChannelDescriptor"; + + case cudaErrorInvalidMemcpyDirection: + return "cudaErrorInvalidMemcpyDirection"; + + case cudaErrorAddressOfConstant: + return "cudaErrorAddressOfConstant"; + + case cudaErrorTextureFetchFailed: + return "cudaErrorTextureFetchFailed"; + + case 
cudaErrorTextureNotBound: + return "cudaErrorTextureNotBound"; + + case cudaErrorSynchronizationError: + return "cudaErrorSynchronizationError"; + + case cudaErrorInvalidFilterSetting: + return "cudaErrorInvalidFilterSetting"; + + case cudaErrorInvalidNormSetting: + return "cudaErrorInvalidNormSetting"; + + case cudaErrorMixedDeviceExecution: + return "cudaErrorMixedDeviceExecution"; + + case cudaErrorCudartUnloading: + return "cudaErrorCudartUnloading"; + + case cudaErrorUnknown: + return "cudaErrorUnknown"; + + case cudaErrorNotYetImplemented: + return "cudaErrorNotYetImplemented"; + + case cudaErrorMemoryValueTooLarge: + return "cudaErrorMemoryValueTooLarge"; + + case cudaErrorInvalidResourceHandle: + return "cudaErrorInvalidResourceHandle"; + + case cudaErrorNotReady: + return "cudaErrorNotReady"; + + case cudaErrorInsufficientDriver: + return "cudaErrorInsufficientDriver"; + + case cudaErrorSetOnActiveProcess: + return "cudaErrorSetOnActiveProcess"; + + case cudaErrorInvalidSurface: + return "cudaErrorInvalidSurface"; + + case cudaErrorNoDevice: + return "cudaErrorNoDevice"; + + case cudaErrorECCUncorrectable: + return "cudaErrorECCUncorrectable"; + + case cudaErrorSharedObjectSymbolNotFound: + return "cudaErrorSharedObjectSymbolNotFound"; + + case cudaErrorSharedObjectInitFailed: + return "cudaErrorSharedObjectInitFailed"; + + case cudaErrorUnsupportedLimit: + return "cudaErrorUnsupportedLimit"; + + case cudaErrorDuplicateVariableName: + return "cudaErrorDuplicateVariableName"; + + case cudaErrorDuplicateTextureName: + return "cudaErrorDuplicateTextureName"; + + case cudaErrorDuplicateSurfaceName: + return "cudaErrorDuplicateSurfaceName"; + + case cudaErrorDevicesUnavailable: + return "cudaErrorDevicesUnavailable"; + + case cudaErrorInvalidKernelImage: + return "cudaErrorInvalidKernelImage"; + + case cudaErrorNoKernelImageForDevice: + return "cudaErrorNoKernelImageForDevice"; + + case cudaErrorIncompatibleDriverContext: + return 
"cudaErrorIncompatibleDriverContext"; + + case cudaErrorPeerAccessAlreadyEnabled: + return "cudaErrorPeerAccessAlreadyEnabled"; + + case cudaErrorPeerAccessNotEnabled: + return "cudaErrorPeerAccessNotEnabled"; + + case cudaErrorDeviceAlreadyInUse: + return "cudaErrorDeviceAlreadyInUse"; + + case cudaErrorProfilerDisabled: + return "cudaErrorProfilerDisabled"; + + case cudaErrorProfilerNotInitialized: + return "cudaErrorProfilerNotInitialized"; + + case cudaErrorProfilerAlreadyStarted: + return "cudaErrorProfilerAlreadyStarted"; + + case cudaErrorProfilerAlreadyStopped: + return "cudaErrorProfilerAlreadyStopped"; + + /* Since CUDA 4.0*/ + case cudaErrorAssert: + return "cudaErrorAssert"; + + case cudaErrorTooManyPeers: + return "cudaErrorTooManyPeers"; + + case cudaErrorHostMemoryAlreadyRegistered: + return "cudaErrorHostMemoryAlreadyRegistered"; + + case cudaErrorHostMemoryNotRegistered: + return "cudaErrorHostMemoryNotRegistered"; + + /* Since CUDA 5.0 */ + case cudaErrorOperatingSystem: + return "cudaErrorOperatingSystem"; + + case cudaErrorPeerAccessUnsupported: + return "cudaErrorPeerAccessUnsupported"; + + case cudaErrorLaunchMaxDepthExceeded: + return "cudaErrorLaunchMaxDepthExceeded"; + + case cudaErrorLaunchFileScopedTex: + return "cudaErrorLaunchFileScopedTex"; + + case cudaErrorLaunchFileScopedSurf: + return "cudaErrorLaunchFileScopedSurf"; + + case cudaErrorSyncDepthExceeded: + return "cudaErrorSyncDepthExceeded"; + + case cudaErrorLaunchPendingCountExceeded: + return "cudaErrorLaunchPendingCountExceeded"; + + case cudaErrorNotPermitted: + return "cudaErrorNotPermitted"; + + case cudaErrorNotSupported: + return "cudaErrorNotSupported"; + + /* Since CUDA 6.0 */ + case cudaErrorHardwareStackError: + return "cudaErrorHardwareStackError"; + + case cudaErrorIllegalInstruction: + return "cudaErrorIllegalInstruction"; + + case cudaErrorMisalignedAddress: + return "cudaErrorMisalignedAddress"; + + case cudaErrorInvalidAddressSpace: + return 
"cudaErrorInvalidAddressSpace"; + + case cudaErrorInvalidPc: + return "cudaErrorInvalidPc"; + + case cudaErrorIllegalAddress: + return "cudaErrorIllegalAddress"; + + /* Since CUDA 6.5*/ + case cudaErrorInvalidPtx: + return "cudaErrorInvalidPtx"; + + case cudaErrorInvalidGraphicsContext: + return "cudaErrorInvalidGraphicsContext"; + + case cudaErrorStartupFailure: + return "cudaErrorStartupFailure"; + + case cudaErrorApiFailureBase: + return "cudaErrorApiFailureBase"; + + /* Since CUDA 8.0*/ + case cudaErrorNvlinkUncorrectable: + return "cudaErrorNvlinkUncorrectable"; + + /* Since CUDA 8.5*/ + case cudaErrorJitCompilerNotFound: + return "cudaErrorJitCompilerNotFound"; + + /* Since CUDA 9.0*/ + case cudaErrorCooperativeLaunchTooLarge: + return "cudaErrorCooperativeLaunchTooLarge"; + } + + return ""; +} +#endif + +#ifdef __cuda_cuda_h__ +// CUDA Driver API errors +static const char *_cudaGetErrorEnum(CUresult error) { + switch (error) { + case CUDA_SUCCESS: + return "CUDA_SUCCESS"; + + case CUDA_ERROR_INVALID_VALUE: + return "CUDA_ERROR_INVALID_VALUE"; + + case CUDA_ERROR_OUT_OF_MEMORY: + return "CUDA_ERROR_OUT_OF_MEMORY"; + + case CUDA_ERROR_NOT_INITIALIZED: + return "CUDA_ERROR_NOT_INITIALIZED"; + + case CUDA_ERROR_DEINITIALIZED: + return "CUDA_ERROR_DEINITIALIZED"; + + case CUDA_ERROR_PROFILER_DISABLED: + return "CUDA_ERROR_PROFILER_DISABLED"; + + case CUDA_ERROR_PROFILER_NOT_INITIALIZED: + return "CUDA_ERROR_PROFILER_NOT_INITIALIZED"; + + case CUDA_ERROR_PROFILER_ALREADY_STARTED: + return "CUDA_ERROR_PROFILER_ALREADY_STARTED"; + + case CUDA_ERROR_PROFILER_ALREADY_STOPPED: + return "CUDA_ERROR_PROFILER_ALREADY_STOPPED"; + + case CUDA_ERROR_NO_DEVICE: + return "CUDA_ERROR_NO_DEVICE"; + + case CUDA_ERROR_INVALID_DEVICE: + return "CUDA_ERROR_INVALID_DEVICE"; + + case CUDA_ERROR_INVALID_IMAGE: + return "CUDA_ERROR_INVALID_IMAGE"; + + case CUDA_ERROR_INVALID_CONTEXT: + return "CUDA_ERROR_INVALID_CONTEXT"; + + case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: + return 
"CUDA_ERROR_CONTEXT_ALREADY_CURRENT"; + + case CUDA_ERROR_MAP_FAILED: + return "CUDA_ERROR_MAP_FAILED"; + + case CUDA_ERROR_UNMAP_FAILED: + return "CUDA_ERROR_UNMAP_FAILED"; + + case CUDA_ERROR_ARRAY_IS_MAPPED: + return "CUDA_ERROR_ARRAY_IS_MAPPED"; + + case CUDA_ERROR_ALREADY_MAPPED: + return "CUDA_ERROR_ALREADY_MAPPED"; + + case CUDA_ERROR_NO_BINARY_FOR_GPU: + return "CUDA_ERROR_NO_BINARY_FOR_GPU"; + + case CUDA_ERROR_ALREADY_ACQUIRED: + return "CUDA_ERROR_ALREADY_ACQUIRED"; + + case CUDA_ERROR_NOT_MAPPED: + return "CUDA_ERROR_NOT_MAPPED"; + + case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: + return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY"; + + case CUDA_ERROR_NOT_MAPPED_AS_POINTER: + return "CUDA_ERROR_NOT_MAPPED_AS_POINTER"; + + case CUDA_ERROR_ECC_UNCORRECTABLE: + return "CUDA_ERROR_ECC_UNCORRECTABLE"; + + case CUDA_ERROR_UNSUPPORTED_LIMIT: + return "CUDA_ERROR_UNSUPPORTED_LIMIT"; + + case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: + return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE"; + + case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: + return "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED"; + + case CUDA_ERROR_INVALID_PTX: + return "CUDA_ERROR_INVALID_PTX"; + + case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: + return "CUDA_ERROR_INVALID_GRAPHICS_CONTEXT"; + + case CUDA_ERROR_NVLINK_UNCORRECTABLE: + return "CUDA_ERROR_NVLINK_UNCORRECTABLE"; + + case CUDA_ERROR_JIT_COMPILER_NOT_FOUND: + return "CUDA_ERROR_JIT_COMPILER_NOT_FOUND"; + + case CUDA_ERROR_INVALID_SOURCE: + return "CUDA_ERROR_INVALID_SOURCE"; + + case CUDA_ERROR_FILE_NOT_FOUND: + return "CUDA_ERROR_FILE_NOT_FOUND"; + + case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: + return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"; + + case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: + return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED"; + + case CUDA_ERROR_OPERATING_SYSTEM: + return "CUDA_ERROR_OPERATING_SYSTEM"; + + case CUDA_ERROR_INVALID_HANDLE: + return "CUDA_ERROR_INVALID_HANDLE"; + + case CUDA_ERROR_NOT_FOUND: + return "CUDA_ERROR_NOT_FOUND"; + + case CUDA_ERROR_NOT_READY: + 
return "CUDA_ERROR_NOT_READY"; + + case CUDA_ERROR_ILLEGAL_ADDRESS: + return "CUDA_ERROR_ILLEGAL_ADDRESS"; + + case CUDA_ERROR_LAUNCH_FAILED: + return "CUDA_ERROR_LAUNCH_FAILED"; + + case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: + return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"; + + case CUDA_ERROR_LAUNCH_TIMEOUT: + return "CUDA_ERROR_LAUNCH_TIMEOUT"; + + case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: + return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"; + + case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: + return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"; + + case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: + return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"; + + case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: + return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE"; + + case CUDA_ERROR_CONTEXT_IS_DESTROYED: + return "CUDA_ERROR_CONTEXT_IS_DESTROYED"; + + case CUDA_ERROR_ASSERT: + return "CUDA_ERROR_ASSERT"; + + case CUDA_ERROR_TOO_MANY_PEERS: + return "CUDA_ERROR_TOO_MANY_PEERS"; + + case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: + return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"; + + case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: + return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"; + + case CUDA_ERROR_HARDWARE_STACK_ERROR: + return "CUDA_ERROR_HARDWARE_STACK_ERROR"; + + case CUDA_ERROR_ILLEGAL_INSTRUCTION: + return "CUDA_ERROR_ILLEGAL_INSTRUCTION"; + + case CUDA_ERROR_MISALIGNED_ADDRESS: + return "CUDA_ERROR_MISALIGNED_ADDRESS"; + + case CUDA_ERROR_INVALID_ADDRESS_SPACE: + return "CUDA_ERROR_INVALID_ADDRESS_SPACE"; + + case CUDA_ERROR_INVALID_PC: + return "CUDA_ERROR_INVALID_PC"; + + case CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE: + return "CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE"; + + case CUDA_ERROR_NOT_PERMITTED: + return "CUDA_ERROR_NOT_PERMITTED"; + + case CUDA_ERROR_NOT_SUPPORTED: + return "CUDA_ERROR_NOT_SUPPORTED"; + + case CUDA_ERROR_UNKNOWN: + return "CUDA_ERROR_UNKNOWN"; + } + + return ""; +} +#endif + +#ifdef CUBLAS_API_H_ +// cuBLAS API errors +static const char *_cudaGetErrorEnum(cublasStatus_t 
error) { + switch (error) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; + + case CUBLAS_STATUS_LICENSE_ERROR: + return "CUBLAS_STATUS_LICENSE_ERROR"; + } + + return ""; +} +#endif + +#ifdef _CUFFT_H_ +// cuFFT API errors +static const char *_cudaGetErrorEnum(cufftResult error) { + switch (error) { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; + + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; + + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; + + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; + + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; + + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; + + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; + + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; + + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; + + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; + + case CUFFT_INCOMPLETE_PARAMETER_LIST: + return "CUFFT_INCOMPLETE_PARAMETER_LIST"; + + case CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; + + case CUFFT_PARSE_ERROR: + return "CUFFT_PARSE_ERROR"; + + case CUFFT_NO_WORKSPACE: + return "CUFFT_NO_WORKSPACE"; + + case CUFFT_NOT_IMPLEMENTED: + return "CUFFT_NOT_IMPLEMENTED"; + + case CUFFT_LICENSE_ERROR: + return "CUFFT_LICENSE_ERROR"; + + case CUFFT_NOT_SUPPORTED: + return 
"CUFFT_NOT_SUPPORTED"; + } + + return ""; +} +#endif + +#ifdef CUSPARSEAPI +// cuSPARSE API errors +static const char *_cudaGetErrorEnum(cusparseStatus_t error) { + switch (error) { + case CUSPARSE_STATUS_SUCCESS: + return "CUSPARSE_STATUS_SUCCESS"; + + case CUSPARSE_STATUS_NOT_INITIALIZED: + return "CUSPARSE_STATUS_NOT_INITIALIZED"; + + case CUSPARSE_STATUS_ALLOC_FAILED: + return "CUSPARSE_STATUS_ALLOC_FAILED"; + + case CUSPARSE_STATUS_INVALID_VALUE: + return "CUSPARSE_STATUS_INVALID_VALUE"; + + case CUSPARSE_STATUS_ARCH_MISMATCH: + return "CUSPARSE_STATUS_ARCH_MISMATCH"; + + case CUSPARSE_STATUS_MAPPING_ERROR: + return "CUSPARSE_STATUS_MAPPING_ERROR"; + + case CUSPARSE_STATUS_EXECUTION_FAILED: + return "CUSPARSE_STATUS_EXECUTION_FAILED"; + + case CUSPARSE_STATUS_INTERNAL_ERROR: + return "CUSPARSE_STATUS_INTERNAL_ERROR"; + + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + } + + return ""; +} +#endif + +#ifdef CUSOLVER_COMMON_H_ +// cuSOLVER API errors +static const char *_cudaGetErrorEnum(cusolverStatus_t error) { + switch (error) { + case CUSOLVER_STATUS_SUCCESS: + return "CUSOLVER_STATUS_SUCCESS"; + case CUSOLVER_STATUS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: + return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: + return "CUSOLVER_STATUS_INVALID_VALUE"; + case CUSOLVER_STATUS_ARCH_MISMATCH: + return "CUSOLVER_STATUS_ARCH_MISMATCH"; + case CUSOLVER_STATUS_MAPPING_ERROR: + return "CUSOLVER_STATUS_MAPPING_ERROR"; + case CUSOLVER_STATUS_EXECUTION_FAILED: + return "CUSOLVER_STATUS_EXECUTION_FAILED"; + case CUSOLVER_STATUS_INTERNAL_ERROR: + return "CUSOLVER_STATUS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSOLVER_STATUS_NOT_SUPPORTED: + return "CUSOLVER_STATUS_NOT_SUPPORTED "; + case CUSOLVER_STATUS_ZERO_PIVOT: + return 
"CUSOLVER_STATUS_ZERO_PIVOT"; + case CUSOLVER_STATUS_INVALID_LICENSE: + return "CUSOLVER_STATUS_INVALID_LICENSE"; + } + + return ""; +} +#endif + +#ifdef CURAND_H_ +// cuRAND API errors +static const char *_cudaGetErrorEnum(curandStatus_t error) { + switch (error) { + case CURAND_STATUS_SUCCESS: + return "CURAND_STATUS_SUCCESS"; + + case CURAND_STATUS_VERSION_MISMATCH: + return "CURAND_STATUS_VERSION_MISMATCH"; + + case CURAND_STATUS_NOT_INITIALIZED: + return "CURAND_STATUS_NOT_INITIALIZED"; + + case CURAND_STATUS_ALLOCATION_FAILED: + return "CURAND_STATUS_ALLOCATION_FAILED"; + + case CURAND_STATUS_TYPE_ERROR: + return "CURAND_STATUS_TYPE_ERROR"; + + case CURAND_STATUS_OUT_OF_RANGE: + return "CURAND_STATUS_OUT_OF_RANGE"; + + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: + return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; + + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + + case CURAND_STATUS_LAUNCH_FAILURE: + return "CURAND_STATUS_LAUNCH_FAILURE"; + + case CURAND_STATUS_PREEXISTING_FAILURE: + return "CURAND_STATUS_PREEXISTING_FAILURE"; + + case CURAND_STATUS_INITIALIZATION_FAILED: + return "CURAND_STATUS_INITIALIZATION_FAILED"; + + case CURAND_STATUS_ARCH_MISMATCH: + return "CURAND_STATUS_ARCH_MISMATCH"; + + case CURAND_STATUS_INTERNAL_ERROR: + return "CURAND_STATUS_INTERNAL_ERROR"; + } + + return ""; +} +#endif + +#ifdef NV_NPPIDEFS_H +// NPP API errors +static const char *_cudaGetErrorEnum(NppStatus error) { + switch (error) { + case NPP_NOT_SUPPORTED_MODE_ERROR: + return "NPP_NOT_SUPPORTED_MODE_ERROR"; + + case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR"; + + case NPP_RESIZE_NO_OPERATION_ERROR: + return "NPP_RESIZE_NO_OPERATION_ERROR"; + + case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY: + return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 + + case NPP_BAD_ARG_ERROR: + return "NPP_BAD_ARGUMENT_ERROR"; + + case 
NPP_COEFF_ERROR: + return "NPP_COEFFICIENT_ERROR"; + + case NPP_RECT_ERROR: + return "NPP_RECTANGLE_ERROR"; + + case NPP_QUAD_ERROR: + return "NPP_QUADRANGLE_ERROR"; + + case NPP_MEM_ALLOC_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; + + case NPP_HISTO_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + + case NPP_INVALID_INPUT: + return "NPP_INVALID_INPUT"; + + case NPP_POINTER_ERROR: + return "NPP_POINTER_ERROR"; + + case NPP_WARNING: + return "NPP_WARNING"; + + case NPP_ODD_ROI_WARNING: + return "NPP_ODD_ROI_WARNING"; +#else + + // These are for CUDA 5.5 or higher + case NPP_BAD_ARGUMENT_ERROR: + return "NPP_BAD_ARGUMENT_ERROR"; + + case NPP_COEFFICIENT_ERROR: + return "NPP_COEFFICIENT_ERROR"; + + case NPP_RECTANGLE_ERROR: + return "NPP_RECTANGLE_ERROR"; + + case NPP_QUADRANGLE_ERROR: + return "NPP_QUADRANGLE_ERROR"; + + case NPP_MEMORY_ALLOCATION_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; + + case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + + case NPP_INVALID_HOST_POINTER_ERROR: + return "NPP_INVALID_HOST_POINTER_ERROR"; + + case NPP_INVALID_DEVICE_POINTER_ERROR: + return "NPP_INVALID_DEVICE_POINTER_ERROR"; +#endif + + case NPP_LUT_NUMBER_OF_LEVELS_ERROR: + return "NPP_LUT_NUMBER_OF_LEVELS_ERROR"; + + case NPP_TEXTURE_BIND_ERROR: + return "NPP_TEXTURE_BIND_ERROR"; + + case NPP_WRONG_INTERSECTION_ROI_ERROR: + return "NPP_WRONG_INTERSECTION_ROI_ERROR"; + + case NPP_NOT_EVEN_STEP_ERROR: + return "NPP_NOT_EVEN_STEP_ERROR"; + + case NPP_INTERPOLATION_ERROR: + return "NPP_INTERPOLATION_ERROR"; + + case NPP_RESIZE_FACTOR_ERROR: + return "NPP_RESIZE_FACTOR_ERROR"; + + case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR: + return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 + + case NPP_MEMFREE_ERR: + return "NPP_MEMFREE_ERR"; + + case NPP_MEMSET_ERR: + return "NPP_MEMSET_ERR"; + + case NPP_MEMCPY_ERR: + return "NPP_MEMCPY_ERROR"; + + case 
NPP_MIRROR_FLIP_ERR: + return "NPP_MIRROR_FLIP_ERR"; +#else + + case NPP_MEMFREE_ERROR: + return "NPP_MEMFREE_ERROR"; + + case NPP_MEMSET_ERROR: + return "NPP_MEMSET_ERROR"; + + case NPP_MEMCPY_ERROR: + return "NPP_MEMCPY_ERROR"; + + case NPP_MIRROR_FLIP_ERROR: + return "NPP_MIRROR_FLIP_ERROR"; +#endif + + case NPP_ALIGNMENT_ERROR: + return "NPP_ALIGNMENT_ERROR"; + + case NPP_STEP_ERROR: + return "NPP_STEP_ERROR"; + + case NPP_SIZE_ERROR: + return "NPP_SIZE_ERROR"; + + case NPP_NULL_POINTER_ERROR: + return "NPP_NULL_POINTER_ERROR"; + + case NPP_CUDA_KERNEL_EXECUTION_ERROR: + return "NPP_CUDA_KERNEL_EXECUTION_ERROR"; + + case NPP_NOT_IMPLEMENTED_ERROR: + return "NPP_NOT_IMPLEMENTED_ERROR"; + + case NPP_ERROR: + return "NPP_ERROR"; + + case NPP_SUCCESS: + return "NPP_SUCCESS"; + + case NPP_WRONG_INTERSECTION_QUAD_WARNING: + return "NPP_WRONG_INTERSECTION_QUAD_WARNING"; + + case NPP_MISALIGNED_DST_ROI_WARNING: + return "NPP_MISALIGNED_DST_ROI_WARNING"; + + case NPP_AFFINE_QUAD_INCORRECT_WARNING: + return "NPP_AFFINE_QUAD_INCORRECT_WARNING"; + + case NPP_DOUBLE_SIZE_WARNING: + return "NPP_DOUBLE_SIZE_WARNING"; + + case NPP_WRONG_INTERSECTION_ROI_WARNING: + return "NPP_WRONG_INTERSECTION_ROI_WARNING"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000 + /* These are 6.0 or higher */ + case NPP_LUT_PALETTE_BITSIZE_ERROR: + return "NPP_LUT_PALETTE_BITSIZE_ERROR"; + + case NPP_ZC_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR"; + + case NPP_QUALITY_INDEX_ERROR: + return "NPP_QUALITY_INDEX_ERROR"; + + case NPP_CHANNEL_ORDER_ERROR: + return "NPP_CHANNEL_ORDER_ERROR"; + + case NPP_ZERO_MASK_VALUE_ERROR: + return "NPP_ZERO_MASK_VALUE_ERROR"; + + case NPP_NUMBER_OF_CHANNELS_ERROR: + return "NPP_NUMBER_OF_CHANNELS_ERROR"; + + case NPP_COI_ERROR: + return "NPP_COI_ERROR"; + + case NPP_DIVISOR_ERROR: + return "NPP_DIVISOR_ERROR"; + + case NPP_CHANNEL_ERROR: + return "NPP_CHANNEL_ERROR"; + + case NPP_STRIDE_ERROR: + return 
"NPP_STRIDE_ERROR"; + + case NPP_ANCHOR_ERROR: + return "NPP_ANCHOR_ERROR"; + + case NPP_MASK_SIZE_ERROR: + return "NPP_MASK_SIZE_ERROR"; + + case NPP_MOMENT_00_ZERO_ERROR: + return "NPP_MOMENT_00_ZERO_ERROR"; + + case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR: + return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR"; + + case NPP_THRESHOLD_ERROR: + return "NPP_THRESHOLD_ERROR"; + + case NPP_CONTEXT_MATCH_ERROR: + return "NPP_CONTEXT_MATCH_ERROR"; + + case NPP_FFT_FLAG_ERROR: + return "NPP_FFT_FLAG_ERROR"; + + case NPP_FFT_ORDER_ERROR: + return "NPP_FFT_ORDER_ERROR"; + + case NPP_SCALE_RANGE_ERROR: + return "NPP_SCALE_RANGE_ERROR"; + + case NPP_DATA_TYPE_ERROR: + return "NPP_DATA_TYPE_ERROR"; + + case NPP_OUT_OFF_RANGE_ERROR: + return "NPP_OUT_OFF_RANGE_ERROR"; + + case NPP_DIVIDE_BY_ZERO_ERROR: + return "NPP_DIVIDE_BY_ZERO_ERROR"; + + case NPP_RANGE_ERROR: + return "NPP_RANGE_ERROR"; + + case NPP_NO_MEMORY_ERROR: + return "NPP_NO_MEMORY_ERROR"; + + case NPP_ERROR_RESERVED: + return "NPP_ERROR_RESERVED"; + + case NPP_NO_OPERATION_WARNING: + return "NPP_NO_OPERATION_WARNING"; + + case NPP_DIVIDE_BY_ZERO_WARNING: + return "NPP_DIVIDE_BY_ZERO_WARNING"; +#endif + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000 + /* These are 7.0 or higher */ + case NPP_OVERFLOW_ERROR: + return "NPP_OVERFLOW_ERROR"; + + case NPP_CORRUPTED_DATA_ERROR: + return "NPP_CORRUPTED_DATA_ERROR"; +#endif + } + + return ""; +} +#endif + +#ifdef __DRIVER_TYPES_H__ +#ifndef DEVICE_RESET +#define DEVICE_RESET cudaDeviceReset(); +#endif +#else +#ifndef DEVICE_RESET +#define DEVICE_RESET +#endif +#endif + +template +void check(T result, char const *const func, const char *const file, + int const line) { + if (result) { + fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, + static_cast(result), _cudaGetErrorEnum(result), func); + DEVICE_RESET + // Make sure we call CUDA Device Reset before exiting + exit(EXIT_FAILURE); + } +} + +#ifdef __DRIVER_TYPES_H__ +// This will output 
the proper CUDA error strings in the event +// that a CUDA host call returns an error +#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) + +// This will output the proper error string when calling cudaGetLastError +#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__) + +inline void __getLastCudaError(const char *errorMessage, const char *file, + const int line) { + cudaError_t err = cudaGetLastError(); + + if (cudaSuccess != err) { + fprintf(stderr, + "%s(%i) : getLastCudaError() CUDA error :" + " %s : (%d) %s.\n", + file, line, errorMessage, static_cast(err), + cudaGetErrorString(err)); + DEVICE_RESET + exit(EXIT_FAILURE); + } +} + +// This will only print the proper error string when calling cudaGetLastError +// but not exit program incase error detected. +#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__) + +inline void __printLastCudaError(const char *errorMessage, const char *file, + const int line) { + cudaError_t err = cudaGetLastError(); + + if (cudaSuccess != err) { + fprintf(stderr, + "%s(%i) : getLastCudaError() CUDA error :" + " %s : (%d) %s.\n", + file, line, errorMessage, static_cast(err), + cudaGetErrorString(err)); + } +} +#endif + +#ifndef MAX +#define MAX(a, b) (a > b ? a : b) +#endif + +// Float To Int conversion +inline int ftoi(float value) { + return (value >= 0 ? 
static_cast(value + 0.5) + : static_cast(value - 0.5)); +} + +// Beginning of GPU Architecture definitions +inline int _ConvertSMVer2Cores(int major, int minor) { + // Defines for GPU Architecture types (using the SM version to determine + // the # of cores per SM + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, + // and m = SM minor version + int Cores; + } sSMtoCores; + + sSMtoCores nGpuArchCoresPerSM[] = { + {0x30, 192}, // Kepler Generation (SM 3.0) GK10x class + {0x32, 192}, // Kepler Generation (SM 3.2) GK10x class + {0x35, 192}, // Kepler Generation (SM 3.5) GK11x class + {0x37, 192}, // Kepler Generation (SM 3.7) GK21x class + {0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class + {0x52, 128}, // Maxwell Generation (SM 5.2) GM20x class + {0x53, 128}, // Maxwell Generation (SM 5.3) GM20x class + {0x60, 64}, // Pascal Generation (SM 6.0) GP100 class + {0x61, 128}, // Pascal Generation (SM 6.1) GP10x class + {0x62, 128}, // Pascal Generation (SM 6.2) GP10x class + {0x70, 64}, // Volta Generation (SM 7.0) GV100 class + {0x72, 64}, // Volta Generation (SM 7.2) GV11b class + {-1, -1}}; + + int index = 0; + + while (nGpuArchCoresPerSM[index].SM != -1) { + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { + return nGpuArchCoresPerSM[index].Cores; + } + + index++; + } + + // If we don't find the values, we default use the previous one + // to run properly + printf( + "MapSMtoCores for SM %d.%d is undefined." 
+ " Default to use %d Cores/SM\n", + major, minor, nGpuArchCoresPerSM[index - 1].Cores); + return nGpuArchCoresPerSM[index - 1].Cores; +} + // end of GPU Architecture definitions + +#ifdef __CUDA_RUNTIME_H__ +// General GPU Device CUDA Initialization +inline int gpuDeviceInit(int devID) { + int device_count; + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + if (device_count == 0) { + fprintf(stderr, + "gpuDeviceInit() CUDA error: " + "no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + if (devID < 0) { + devID = 0; + } + + if (devID > device_count - 1) { + fprintf(stderr, "\n"); + fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", + device_count); + fprintf(stderr, + ">> gpuDeviceInit (-device=%d) is not a valid" + " GPU device. <<\n", + devID); + fprintf(stderr, "\n"); + return -devID; + } + + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); + + if (deviceProp.computeMode == cudaComputeModeProhibited) { + fprintf(stderr, + "Error: device is running in , no threads can use cudaSetDevice().\n"); + return -1; + } + + if (deviceProp.major < 1) { + fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n"); + exit(EXIT_FAILURE); + } + + checkCudaErrors(cudaSetDevice(devID)); + printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name); + + return devID; +} + +// This function returns the best GPU (with maximum GFLOPS) +inline int gpuGetMaxGflopsDeviceId() { + int current_device = 0, sm_per_multiproc = 0; + int max_perf_device = 0; + int device_count = 0, best_SM_arch = 0; + int devices_prohibited = 0; + + uint64_t max_compute_perf = 0; + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + if (device_count == 0) { + fprintf(stderr, + "gpuGetMaxGflopsDeviceId() CUDA error:" + " no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the best major SM Architecture GPU device + while (current_device < device_count) { + 
cudaGetDeviceProperties(&deviceProp, current_device); + + // If this GPU is not running on Compute Mode prohibited, + // then we can add it to the list + if (deviceProp.computeMode != cudaComputeModeProhibited) { + if (deviceProp.major > 0 && deviceProp.major < 9999) { + best_SM_arch = MAX(best_SM_arch, deviceProp.major); + } + } else { + devices_prohibited++; + } + + current_device++; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, + "gpuGetMaxGflopsDeviceId() CUDA error:" + " all devices have compute mode prohibited.\n"); + exit(EXIT_FAILURE); + } + + // Find the best CUDA capable GPU device + current_device = 0; + + while (current_device < device_count) { + cudaGetDeviceProperties(&deviceProp, current_device); + + // If this GPU is not running on Compute Mode prohibited, + // then we can add it to the list + if (deviceProp.computeMode != cudaComputeModeProhibited) { + if (deviceProp.major == 9999 && deviceProp.minor == 9999) { + sm_per_multiproc = 1; + } else { + sm_per_multiproc = + _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor); + } + + uint64_t compute_perf = (uint64_t)deviceProp.multiProcessorCount * + sm_per_multiproc * deviceProp.clockRate; + + if (compute_perf > max_compute_perf) { + // If we find GPU with SM major > 2, search only these + if (best_SM_arch > 2) { + // If our device==dest_SM_arch, choose this, or else pass + if (deviceProp.major == best_SM_arch) { + max_compute_perf = compute_perf; + max_perf_device = current_device; + } + } else { + max_compute_perf = compute_perf; + max_perf_device = current_device; + } + } + } + + ++current_device; + } + + return max_perf_device; +} + +// Initialization code to find the best CUDA Device +inline int findCudaDevice(int argc, const char **argv) { + cudaDeviceProp deviceProp; + int devID = 0; + + // If the command-line has a device number specified, use it + if (checkCmdLineFlag(argc, argv, "device")) { + devID = getCmdLineArgumentInt(argc, argv, "device="); + + if (devID < 0) { 
+ printf("Invalid command line parameter\n "); + exit(EXIT_FAILURE); + } else { + devID = gpuDeviceInit(devID); + + if (devID < 0) { + printf("exiting...\n"); + exit(EXIT_FAILURE); + } + } + } else { + // Otherwise pick the device with highest Gflops/s + devID = gpuGetMaxGflopsDeviceId(); + checkCudaErrors(cudaSetDevice(devID)); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, + deviceProp.name, deviceProp.major, deviceProp.minor); + } + + return devID; +} + +inline int findIntegratedGPU() { + int current_device = 0; + int device_count = 0; + int devices_prohibited = 0; + + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + if (device_count == 0) { + fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the integrated GPU which is compute capable + while (current_device < device_count) { + cudaGetDeviceProperties(&deviceProp, current_device); + + // If GPU is integrated and is not running on Compute Mode prohibited, + // then cuda can map to GLES resource + if (deviceProp.integrated && + (deviceProp.computeMode != cudaComputeModeProhibited)) { + checkCudaErrors(cudaSetDevice(current_device)); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device)); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + current_device, deviceProp.name, deviceProp.major, + deviceProp.minor); + + return current_device; + } else { + devices_prohibited++; + } + + current_device++; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, + "CUDA error:" + " No GLES-CUDA Interop capable GPU found.\n"); + exit(EXIT_FAILURE); + } + + return -1; +} + +// General check for CUDA GPU SM Capabilities +inline bool checkCudaCapabilities(int major_version, int minor_version) { + cudaDeviceProp deviceProp; + deviceProp.major = 0; + deviceProp.minor = 0; + int dev; + + 
checkCudaErrors(cudaGetDevice(&dev)); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); + + if ((deviceProp.major > major_version) || + (deviceProp.major == major_version && + deviceProp.minor >= minor_version)) { + printf(" Device %d: <%16s >, Compute SM %d.%d detected\n", dev, + deviceProp.name, deviceProp.major, deviceProp.minor); + return true; + } else { + printf( + " No GPU device was found that can support " + "CUDA compute capability %d.%d.\n", + major_version, minor_version); + return false; + } +} +#endif + + // end of CUDA Helper Functions + +#endif // COMMON_HELPER_CUDA_H_ diff --git a/Common/helper_cuda_drvapi.h b/Common/helper_cuda_drvapi.h new file mode 100644 index 00000000..db43cff7 --- /dev/null +++ b/Common/helper_cuda_drvapi.h @@ -0,0 +1,419 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// Helper functions for CUDA Driver API error handling (make sure that CUDA_H is +// included in your projects) +#ifndef COMMON_HELPER_CUDA_DRVAPI_H_ +#define COMMON_HELPER_CUDA_DRVAPI_H_ + +#include +#include +#include + +#include +#include + +#ifndef MAX +#define MAX(a, b) (a > b ? a : b) +#endif + +#ifndef COMMON_HELPER_CUDA_H_ +inline int ftoi(float value) { + return (value >= 0 ? static_cast(value + 0.5) + : static_cast(value - 0.5)); +} +#endif + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +//////////////////////////////////////////////////////////////////////////////// +// These are CUDA Helper functions + +// add a level of protection to the CUDA SDK samples, let's force samples to +// explicitly include CUDA.H +#ifdef __cuda_cuda_h__ +// This will output the proper CUDA error strings in the event that a CUDA host +// call returns an error +#ifndef checkCudaErrors +#define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__) + +// These are the inline versions for all of the SDK helper functions +inline void __checkCudaErrors(CUresult err, const char *file, const int line) { + if (CUDA_SUCCESS != err) { + fprintf(stderr, + "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, " + "line %i.\n", + err, getCudaDrvErrorString(err), file, line); + exit(EXIT_FAILURE); + } +} +#endif + +#ifdef getLastCudaDrvErrorMsg +#undef getLastCudaDrvErrorMsg +#endif + +#define getLastCudaDrvErrorMsg(msg) 
\ + __getLastCudaDrvErrorMsg(msg, __FILE__, __LINE__) + +inline void __getLastCudaDrvErrorMsg(const char *msg, const char *file, + const int line) { + CUresult err = cuCtxSynchronize(); + + if (CUDA_SUCCESS != err) { + fprintf(stderr, "getLastCudaDrvErrorMsg -> %s", msg); + fprintf(stderr, + "getLastCudaDrvErrorMsg -> cuCtxSynchronize API error = %04d " + "\"%s\" in file <%s>, line %i.\n", + err, getCudaDrvErrorString(err), file, line); + exit(EXIT_FAILURE); + } +} + +// This function wraps the CUDA Driver API into a template function +template +inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, + int device) { + CUresult error_result = + cuDeviceGetAttribute(attribute, device_attribute, device); + + if (error_result != CUDA_SUCCESS) { + printf("cuDeviceGetAttribute returned %d\n-> %s\n", + static_cast(error_result), getCudaDrvErrorString(error_result)); + exit(EXIT_SUCCESS); + } +} +#endif + +// Beginning of GPU Architecture definitions +inline int _ConvertSMVer2CoresDRV(int major, int minor) { + // Defines for GPU Architecture types (using the SM version to determine the # + // of cores per SM + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM + // minor version + int Cores; + } sSMtoCores; + + sSMtoCores nGpuArchCoresPerSM[] = { + {0x30, 192}, // Kepler Generation (SM 3.0) GK10x class + {0x32, 192}, // Kepler Generation (SM 3.2) GK10x class + {0x35, 192}, // Kepler Generation (SM 3.5) GK11x class + {0x37, 192}, // Kepler Generation (SM 3.7) GK21x class + {0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class + {0x52, 128}, // Maxwell Generation (SM 5.2) GM20x class + {0x53, 128}, // Maxwell Generation (SM 5.3) GM20x class + {0x60, 64}, // Pascal Generation (SM 6.0) GP100 class + {0x61, 128}, // Pascal Generation (SM 6.1) GP10x class + {0x62, 128}, // Pascal Generation (SM 6.2) GP10x class + {0x70, 64}, // Volta Generation (SM 7.0) GV100 class + {0x72, 64}, // Volta Generation (SM 7.2) GV11b 
class + {-1, -1}}; + + int index = 0; + + while (nGpuArchCoresPerSM[index].SM != -1) { + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { + return nGpuArchCoresPerSM[index].Cores; + } + + index++; + } + + // If we don't find the values, we default use the previous one to run + // properly + printf( + "MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", + major, minor, nGpuArchCoresPerSM[index - 1].Cores); + return nGpuArchCoresPerSM[index - 1].Cores; +} +// end of GPU Architecture definitions + +#ifdef __cuda_cuda_h__ +// General GPU Device CUDA Initialization +inline int gpuDeviceInitDRV(int ARGC, const char **ARGV) { + int cuDevice = 0; + int deviceCount = 0; + CUresult err = cuInit(0); + + if (CUDA_SUCCESS == err) { + checkCudaErrors(cuDeviceGetCount(&deviceCount)); + } + + if (deviceCount == 0) { + fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n"); + exit(EXIT_FAILURE); + } + + int dev = 0; + dev = getCmdLineArgumentInt(ARGC, (const char **)ARGV, "device="); + + if (dev < 0) { + dev = 0; + } + + if (dev > deviceCount - 1) { + fprintf(stderr, "\n"); + fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", + deviceCount); + fprintf(stderr, + ">> cudaDeviceInit (-device=%d) is not a valid GPU device. 
<<\n", + dev); + fprintf(stderr, "\n"); + return -dev; + } + + checkCudaErrors(cuDeviceGet(&cuDevice, dev)); + char name[100]; + cuDeviceGetName(name, 100, cuDevice); + + int computeMode; + getCudaAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev); + + if (computeMode == CU_COMPUTEMODE_PROHIBITED) { + fprintf(stderr, + "Error: device is running in , no " + "threads can use this CUDA Device.\n"); + return -1; + } + + if (checkCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == false) { + printf("gpuDeviceInitDRV() Using CUDA Device [%d]: %s\n", dev, name); + } + + return dev; +} + +// This function returns the best GPU based on performance +inline int gpuGetMaxGflopsDeviceIdDRV() { + CUdevice current_device = 0; + CUdevice max_perf_device = 0; + int device_count = 0; + int sm_per_multiproc = 0; + unsigned long long max_compute_perf = 0; + int best_SM_arch = 0; + int major = 0; + int minor = 0; + int multiProcessorCount; + int clockRate; + int devices_prohibited = 0; + + cuInit(0); + checkCudaErrors(cuDeviceGetCount(&device_count)); + + if (device_count == 0) { + fprintf(stderr, + "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n"); + exit(EXIT_FAILURE); + } + + // Find the best major SM Architecture GPU device + while (current_device < device_count) { + checkCudaErrors(cuDeviceGetAttribute( + &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device)); + checkCudaErrors(cuDeviceGetAttribute( + &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device)); + if (major > 0 && major < 9999) { + best_SM_arch = MAX(best_SM_arch, major); + } + + current_device++; + } + + // Find the best CUDA capable GPU device + current_device = 0; + + while (current_device < device_count) { + checkCudaErrors(cuDeviceGetAttribute( + &multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, + current_device)); + checkCudaErrors(cuDeviceGetAttribute( + &clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device)); + 
checkCudaErrors(cuDeviceGetAttribute( + &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device)); + checkCudaErrors(cuDeviceGetAttribute( + &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device)); + + int computeMode; + getCudaAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, + current_device); + + if (computeMode != CU_COMPUTEMODE_PROHIBITED) { + if (major == 9999 && minor == 9999) { + sm_per_multiproc = 1; + } else { + sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor); + } + + unsigned long long compute_perf = + (unsigned long long)(multiProcessorCount * sm_per_multiproc * + clockRate); + + if (compute_perf > max_compute_perf) { + // If we find GPU with SM major > 2, search only these + if (best_SM_arch > 2) { + // If our device==dest_SM_arch, choose this, or else pass + if (major == best_SM_arch) { + max_compute_perf = compute_perf; + max_perf_device = current_device; + } + } else { + max_compute_perf = compute_perf; + max_perf_device = current_device; + } + } + } else { + devices_prohibited++; + } + + ++current_device; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, + "gpuGetMaxGflopsDeviceIdDRV error: all devices have compute mode " + "prohibited.\n"); + exit(EXIT_FAILURE); + } + + return max_perf_device; +} + +// General initialization call to pick the best CUDA Device +inline CUdevice findCudaDeviceDRV(int argc, const char **argv) { + CUdevice cuDevice; + int devID = 0; + + // If the command-line has a device number specified, use it + if (checkCmdLineFlag(argc, (const char **)argv, "device")) { + devID = gpuDeviceInitDRV(argc, argv); + + if (devID < 0) { + printf("exiting...\n"); + exit(EXIT_SUCCESS); + } + } else { + // Otherwise pick the device with highest Gflops/s + char name[100]; + devID = gpuGetMaxGflopsDeviceIdDRV(); + checkCudaErrors(cuDeviceGet(&cuDevice, devID)); + cuDeviceGetName(name, 100, cuDevice); + printf("> Using CUDA Device [%d]: %s\n", devID, name); + } + + 
cuDeviceGet(&cuDevice, devID); + + return cuDevice; +} + +inline CUdevice findIntegratedGPUDrv() { + CUdevice current_device = 0; + int device_count = 0; + int devices_prohibited = 0; + int isIntegrated; + + cuInit(0); + checkCudaErrors(cuDeviceGetCount(&device_count)); + + if (device_count == 0) { + fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the integrated GPU which is compute capable + while (current_device < device_count) { + int computeMode = -1; + checkCudaErrors(cuDeviceGetAttribute( + &isIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, current_device)); + checkCudaErrors(cuDeviceGetAttribute( + &computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device)); + + // If GPU is integrated and is not running on Compute Mode prohibited use + // that + if (isIntegrated && (computeMode != CU_COMPUTEMODE_PROHIBITED)) { + int major = 0, minor = 0; + char deviceName[256]; + checkCudaErrors(cuDeviceGetAttribute( + &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + current_device)); + checkCudaErrors(cuDeviceGetAttribute( + &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + current_device)); + checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device)); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + current_device, deviceName, major, minor); + + return current_device; + } else { + devices_prohibited++; + } + + current_device++; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, "CUDA error: No Integrated CUDA capable GPU found.\n"); + exit(EXIT_FAILURE); + } + + return -1; +} + +// General check for CUDA GPU SM Capabilities +inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, + int devID) { + CUdevice cuDevice; + char name[256]; + int major = 0, minor = 0; + + checkCudaErrors(cuDeviceGet(&cuDevice, devID)); + checkCudaErrors(cuDeviceGetName(name, 100, cuDevice)); + checkCudaErrors(cuDeviceGetAttribute( + &major, 
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)); + checkCudaErrors(cuDeviceGetAttribute( + &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)); + + if ((major > major_version) || + (major == major_version && minor >= minor_version)) { + printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name, + major, minor); + return true; + } else { + printf( + "No GPU device was found that can support CUDA compute capability " + "%d.%d.\n", + major_version, minor_version); + return false; + } +} +#endif + +// end of CUDA Helper Functions + +#endif // COMMON_HELPER_CUDA_DRVAPI_H_ diff --git a/Common/helper_functions.h b/Common/helper_functions.h new file mode 100644 index 00000000..ec389470 --- /dev/null +++ b/Common/helper_functions.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// These are helper functions for the SDK samples (string parsing, +// timers, image helpers, etc) +#ifndef COMMON_HELPER_FUNCTIONS_H_ +#define COMMON_HELPER_FUNCTIONS_H_ + +#ifdef WIN32 +#pragma warning(disable : 4996) +#endif + +// includes, project +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +// includes, timer, string parsing, image helpers +#include // helper functions for image compare, dump, data comparisons +#include // helper functions for string parsing +#include // helper functions for timers + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +#endif // COMMON_HELPER_FUNCTIONS_H_ diff --git a/Common/helper_image.h b/Common/helper_image.h new file mode 100644 index 00000000..08e33307 --- /dev/null +++ b/Common/helper_image.h @@ -0,0 +1,1001 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// These are helper functions for the SDK samples (image,bitmap) +#ifndef COMMON_HELPER_IMAGE_H_ +#define COMMON_HELPER_IMAGE_H_ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifndef MIN +#define MIN(a, b) ((a < b) ? a : b) +#endif +#ifndef MAX +#define MAX(a, b) ((a > b) ? a : b) +#endif + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +#include + +// namespace unnamed (internal) +namespace helper_image_internal { +//! size of PGM file header +const unsigned int PGMHeaderSize = 0x40; + +// types + +//! Data converter from unsigned char / unsigned byte to type T +template +struct ConverterFromUByte; + +//! Data converter from unsigned char / unsigned byte +template <> +struct ConverterFromUByte { + //! Conversion operator + //! @return converted value + //! @param val value to convert + float operator()(const unsigned char &val) { + return static_cast(val); + } +}; + +//! 
Data converter from unsigned char / unsigned byte to float +template <> +struct ConverterFromUByte { + //! Conversion operator + //! @return converted value + //! @param val value to convert + float operator()(const unsigned char &val) { + return static_cast(val) / 255.0f; + } +}; + +//! Data converter from unsigned char / unsigned byte to type T +template +struct ConverterToUByte; + +//! Data converter from unsigned char / unsigned byte to unsigned int +template <> +struct ConverterToUByte { + //! Conversion operator (essentially a passthru + //! @return converted value + //! @param val value to convert + unsigned char operator()(const unsigned char &val) { return val; } +}; + +//! Data converter from unsigned char / unsigned byte to unsigned int +template <> +struct ConverterToUByte { + //! Conversion operator + //! @return converted value + //! @param val value to convert + unsigned char operator()(const float &val) { + return static_cast(val * 255.0f); + } +}; +} // namespace helper_image_internal + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result != 0) +#endif +#ifndef SSCANF +#define SSCANF sscanf_s +#endif +#else +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode)) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result == NULL) +#endif +#ifndef SSCANF +#define SSCANF sscanf +#endif +#endif + +inline bool __loadPPM(const char *file, unsigned char **data, unsigned int *w, + unsigned int *h, unsigned int *channels) { + FILE *fp = NULL; + + if (FOPEN_FAIL(FOPEN(fp, file, "rb"))) { + std::cerr << "__LoadPPM() : Failed to open file: " << file << std::endl; + return false; + } + + // check header + char header[helper_image_internal::PGMHeaderSize]; + + if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) { + std::cerr << 
"__LoadPPM() : reading PGM header returned NULL" << std::endl; + return false; + } + + if (strncmp(header, "P5", 2) == 0) { + *channels = 1; + } else if (strncmp(header, "P6", 2) == 0) { + *channels = 3; + } else { + std::cerr << "__LoadPPM() : File is not a PPM or PGM image" << std::endl; + *channels = 0; + return false; + } + + // parse header, read maxval, width and height + unsigned int width = 0; + unsigned int height = 0; + unsigned int maxval = 0; + unsigned int i = 0; + + while (i < 3) { + if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) { + std::cerr << "__LoadPPM() : reading PGM header returned NULL" + << std::endl; + return false; + } + + if (header[0] == '#') { + continue; + } + + if (i == 0) { + i += SSCANF(header, "%u %u %u", &width, &height, &maxval); + } else if (i == 1) { + i += SSCANF(header, "%u %u", &height, &maxval); + } else if (i == 2) { + i += SSCANF(header, "%u", &maxval); + } + } + + // check if given handle for the data is initialized + if (NULL != *data) { + if (*w != width || *h != height) { + std::cerr << "__LoadPPM() : Invalid image dimensions." << std::endl; + } + } else { + *data = (unsigned char *)malloc(sizeof(unsigned char) * width * height * + *channels); + *w = width; + *h = height; + } + + // read and close file + if (fread(*data, sizeof(unsigned char), width * height * *channels, fp) == + 0) { + std::cerr << "__LoadPPM() read data returned error." 
<< std::endl; + } + + fclose(fp); + + return true; +} + +template +inline bool sdkLoadPGM(const char *file, T **data, unsigned int *w, + unsigned int *h) { + unsigned char *idata = NULL; + unsigned int channels; + + if (true != __loadPPM(file, &idata, w, h, &channels)) { + return false; + } + + unsigned int size = *w * *h * channels; + + // initialize mem if necessary + // the correct size is checked / set in loadPGMc() + if (NULL == *data) { + *data = reinterpret_cast(malloc(sizeof(T) * size)); + } + + // copy and cast data + std::transform(idata, idata + size, *data, + helper_image_internal::ConverterFromUByte()); + + free(idata); + + return true; +} + +template +inline bool sdkLoadPPM4(const char *file, T **data, unsigned int *w, + unsigned int *h) { + unsigned char *idata = 0; + unsigned int channels; + + if (__loadPPM(file, &idata, w, h, &channels)) { + // pad 4th component + int size = *w * *h; + // keep the original pointer + unsigned char *idata_orig = idata; + *data = reinterpret_cast(malloc(sizeof(T) * size * 4)); + unsigned char *ptr = *data; + + for (int i = 0; i < size; i++) { + *ptr++ = *idata++; + *ptr++ = *idata++; + *ptr++ = *idata++; + *ptr++ = 0; + } + + free(idata_orig); + return true; + } else { + free(idata); + return false; + } +} + +inline bool __savePPM(const char *file, unsigned char *data, unsigned int w, + unsigned int h, unsigned int channels) { + assert(NULL != data); + assert(w > 0); + assert(h > 0); + + std::fstream fh(file, std::fstream::out | std::fstream::binary); + + if (fh.bad()) { + std::cerr << "__savePPM() : Opening file failed." << std::endl; + return false; + } + + if (channels == 1) { + fh << "P5\n"; + } else if (channels == 3) { + fh << "P6\n"; + } else { + std::cerr << "__savePPM() : Invalid number of channels." 
<< std::endl; + return false; + } + + fh << w << "\n" << h << "\n" << 0xff << std::endl; + + for (unsigned int i = 0; (i < (w * h * channels)) && fh.good(); ++i) { + fh << data[i]; + } + + fh.flush(); + + if (fh.bad()) { + std::cerr << "__savePPM() : Writing data failed." << std::endl; + return false; + } + + fh.close(); + + return true; +} + +template +inline bool sdkSavePGM(const char *file, T *data, unsigned int w, + unsigned int h) { + unsigned int size = w * h; + unsigned char *idata = (unsigned char *)malloc(sizeof(unsigned char) * size); + + std::transform(data, data + size, idata, + helper_image_internal::ConverterToUByte()); + + // write file + bool result = __savePPM(file, idata, w, h, 1); + + // cleanup + free(idata); + + return result; +} + +inline bool sdkSavePPM4ub(const char *file, unsigned char *data, unsigned int w, + unsigned int h) { + // strip 4th component + int size = w * h; + unsigned char *ndata = + (unsigned char *)malloc(sizeof(unsigned char) * size * 3); + unsigned char *ptr = ndata; + + for (int i = 0; i < size; i++) { + *ptr++ = *data++; + *ptr++ = *data++; + *ptr++ = *data++; + data++; + } + + bool result = __savePPM(file, ndata, w, h, 3); + free(ndata); + return result; +} + +////////////////////////////////////////////////////////////////////////////// +//! Read file \filename and return the data +//! @return bool if reading the file succeeded, otherwise false +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! 
@param len number of data elements in data, -1 on error +////////////////////////////////////////////////////////////////////////////// +template +inline bool sdkReadFile(const char *filename, T **data, unsigned int *len, + bool verbose) { + // check input arguments + assert(NULL != filename); + assert(NULL != len); + + // intermediate storage for the data read + std::vector data_read; + + // open file for reading + FILE *fh = NULL; + + // check if filestream is valid + if (FOPEN_FAIL(FOPEN(fh, filename, "r"))) { + printf("Unable to open input file: %s\n", filename); + return false; + } + + // read all data elements + T token; + + while (!feof(fh)) { + fscanf(fh, "%f", &token); + data_read.push_back(token); + } + + // the last element is read twice + data_read.pop_back(); + fclose(fh); + + // check if the given handle is already initialized + if (NULL != *data) { + if (*len != data_read.size()) { + std::cerr << "sdkReadFile() : Initialized memory given but " + << "size mismatch with signal read " + << "(data read / data init = " << (unsigned int)data_read.size() + << " / " << *len << ")" << std::endl; + + return false; + } + } else { + // allocate storage for the data read + *data = reinterpret_cast(malloc(sizeof(T) * data_read.size())); + // store signal size + *len = static_cast(data_read.size()); + } + + // copy data + memcpy(*data, &data_read.front(), sizeof(T) * data_read.size()); + + return true; +} + +////////////////////////////////////////////////////////////////////////////// +//! Read file \filename and return the data +//! @return bool if reading the file succeeded, otherwise false +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! 
@param len number of data elements in data, -1 on error +////////////////////////////////////////////////////////////////////////////// +template +inline bool sdkReadFileBlocks(const char *filename, T **data, unsigned int *len, + unsigned int block_num, unsigned int block_size, + bool verbose) { + // check input arguments + assert(NULL != filename); + assert(NULL != len); + + // open file for reading + FILE *fh = fopen(filename, "rb"); + + if (fh == NULL && verbose) { + std::cerr << "sdkReadFile() : Opening file failed." << std::endl; + return false; + } + + // check if the given handle is already initialized + // allocate storage for the data read + data[block_num] = reinterpret_cast(malloc(block_size)); + + // read all data elements + fseek(fh, block_num * block_size, SEEK_SET); + *len = fread(data[block_num], sizeof(T), block_size / sizeof(T), fh); + + fclose(fh); + + return true; +} + +////////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename +//! @return true if writing the file succeeded, otherwise false +//! @param filename name of the source file +//! @param data data to write +//! @param len number of data elements in data, -1 on error +//! @param epsilon epsilon for comparison +////////////////////////////////////////////////////////////////////////////// +template +inline bool sdkWriteFile(const char *filename, const T *data, unsigned int len, + const S epsilon, bool verbose, bool append = false) { + assert(NULL != filename); + assert(NULL != data); + + // open file for writing + // if (append) { + std::fstream fh(filename, std::fstream::out | std::fstream::ate); + + if (verbose) { + std::cerr << "sdkWriteFile() : Open file " << filename + << " for write/append." << std::endl; + } + + /* } else { + std::fstream fh(filename, std::fstream::out); + if (verbose) { + std::cerr << "sdkWriteFile() : Open file " << filename << " for + write." 
<< std::endl; + } + } + */ + + // check if filestream is valid + if (!fh.good()) { + if (verbose) { + std::cerr << "sdkWriteFile() : Opening file failed." << std::endl; + } + + return false; + } + + // first write epsilon + fh << "# " << epsilon << "\n"; + + // write data + for (unsigned int i = 0; (i < len) && (fh.good()); ++i) { + fh << data[i] << ' '; + } + + // Check if writing succeeded + if (!fh.good()) { + if (verbose) { + std::cerr << "sdkWriteFile() : Writing file failed." << std::endl; + } + + return false; + } + + // file ends with nl + fh << std::endl; + + return true; +} + +////////////////////////////////////////////////////////////////////////////// +//! Compare two arrays of arbitrary type +//! @return true if \a reference and \a data are identical, otherwise false +//! @param reference timer_interface to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +////////////////////////////////////////////////////////////////////////////// +template +inline bool compareData(const T *reference, const T *data, + const unsigned int len, const S epsilon, + const float threshold) { + assert(epsilon >= 0); + + bool result = true; + unsigned int error_count = 0; + + for (unsigned int i = 0; i < len; ++i) { + float diff = static_cast(reference[i]) - static_cast(data[i]); + bool comp = (diff <= epsilon) && (diff >= -epsilon); + result &= comp; + + error_count += !comp; + +#if 0 + + if (!comp) { + std::cerr << "ERROR, i = " << i << ",\t " + << reference[i] << " / " + << data[i] + << " (reference / data)\n"; + } + +#endif + } + + if (threshold == 0.0f) { + return (result) ? true : false; + } else { + if (error_count) { + printf("%4.2f(%%) of bytes mismatched (count=%d)\n", + static_cast(error_count) * 100 / static_cast(len), + error_count); + } + + return (len * threshold > error_count) ? 
true : false; + } +} + +#ifndef __MIN_EPSILON_ERROR +#define __MIN_EPSILON_ERROR 1e-3f +#endif + +////////////////////////////////////////////////////////////////////////////// +//! Compare two arrays of arbitrary type +//! @return true if \a reference and \a data are identical, otherwise false +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +//! @param epsilon threshold % of (# of bytes) for pass/fail +////////////////////////////////////////////////////////////////////////////// +template +inline bool compareDataAsFloatThreshold(const T *reference, const T *data, + const unsigned int len, const S epsilon, + const float threshold) { + assert(epsilon >= 0); + + // If we set epsilon to be 0, let's set a minimum threshold + float max_error = MAX((float)epsilon, __MIN_EPSILON_ERROR); + int error_count = 0; + bool result = true; + + for (unsigned int i = 0; i < len; ++i) { + float diff = + fabs(static_cast(reference[i]) - static_cast(data[i])); + bool comp = (diff < max_error); + result &= comp; + + if (!comp) { + error_count++; + } + } + + if (threshold == 0.0f) { + if (error_count) { + printf("total # of errors = %d\n", error_count); + } + + return (error_count == 0) ? true : false; + } else { + if (error_count) { + printf("%4.2f(%%) of bytes mismatched (count=%d)\n", + static_cast(error_count) * 100 / static_cast(len), + error_count); + } + + return ((len * threshold > error_count) ? 
true : false); + } +} + +inline void sdkDumpBin(void *data, unsigned int bytes, const char *filename) { + printf("sdkDumpBin: <%s>\n", filename); + FILE *fp; + FOPEN(fp, filename, "wb"); + fwrite(data, bytes, 1, fp); + fflush(fp); + fclose(fp); +} + +inline bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file, + unsigned int nelements, const float epsilon, + const float threshold, char *exec_path) { + unsigned int *src_buffer, *ref_buffer; + FILE *src_fp = NULL, *ref_fp = NULL; + + uint64_t error_count = 0; + size_t fsize = 0; + + if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) { + printf("compareBin2Bin unable to open src_file: %s\n", + src_file); + error_count++; + } + + char *ref_file_path = sdkFindFilePath(ref_file, exec_path); + + if (ref_file_path == NULL) { + printf("compareBin2Bin unable to find <%s> in <%s>\n", + ref_file, exec_path); + printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", + ref_file); + printf("Aborting comparison!\n"); + printf(" FAILED\n"); + error_count++; + + if (src_fp) { + fclose(src_fp); + } + + if (ref_fp) { + fclose(ref_fp); + } + } else { + if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) { + printf( + "compareBin2Bin " + " unable to open ref_file: %s\n", + ref_file_path); + error_count++; + } + + if (src_fp && ref_fp) { + src_buffer = (unsigned int *)malloc(nelements * sizeof(unsigned int)); + ref_buffer = (unsigned int *)malloc(nelements * sizeof(unsigned int)); + + fsize = fread(src_buffer, nelements, sizeof(unsigned int), src_fp); + fsize = fread(ref_buffer, nelements, sizeof(unsigned int), ref_fp); + + printf( + "> compareBin2Bin nelements=%d," + " epsilon=%4.2f, threshold=%4.2f\n", + nelements, epsilon, threshold); + printf(" src_file <%s>, size=%d bytes\n", src_file, + static_cast(fsize)); + printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, + static_cast(fsize)); + + if (!compareData(ref_buffer, src_buffer, nelements, + epsilon, threshold)) { + error_count++; + } + + fclose(src_fp); 
+ fclose(ref_fp); + + free(src_buffer); + free(ref_buffer); + } else { + if (src_fp) { + fclose(src_fp); + } + + if (ref_fp) { + fclose(ref_fp); + } + } + } + + if (error_count == 0) { + printf(" OK\n"); + } else { + printf(" FAILURE: %d errors...\n", (unsigned int)error_count); + } + + return (error_count == 0); // returns true if all pixels pass +} + +inline bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file, + unsigned int nelements, const float epsilon, + const float threshold, char *exec_path) { + float *src_buffer = NULL, *ref_buffer = NULL; + FILE *src_fp = NULL, *ref_fp = NULL; + size_t fsize = 0; + + uint64_t error_count = 0; + + if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) { + printf("compareBin2Bin unable to open src_file: %s\n", src_file); + error_count = 1; + } + + char *ref_file_path = sdkFindFilePath(ref_file, exec_path); + + if (ref_file_path == NULL) { + printf("compareBin2Bin unable to find <%s> in <%s>\n", ref_file, + exec_path); + printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", + exec_path); + printf("Aborting comparison!\n"); + printf(" FAILED\n"); + error_count++; + + if (src_fp) { + fclose(src_fp); + } + + if (ref_fp) { + fclose(ref_fp); + } + } else { + if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) { + printf("compareBin2Bin unable to open ref_file: %s\n", + ref_file_path); + error_count = 1; + } + + if (src_fp && ref_fp) { + src_buffer = reinterpret_cast(malloc(nelements * sizeof(float))); + ref_buffer = reinterpret_cast(malloc(nelements * sizeof(float))); + + printf( + "> compareBin2Bin nelements=%d, epsilon=%4.2f," + " threshold=%4.2f\n", + nelements, epsilon, threshold); + fsize = fread(src_buffer, sizeof(float), nelements, src_fp); + printf(" src_file <%s>, size=%d bytes\n", src_file, + static_cast(fsize * sizeof(float))); + fsize = fread(ref_buffer, sizeof(float), nelements, ref_fp); + printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, + static_cast(fsize * sizeof(float))); + + if 
(!compareDataAsFloatThreshold( + ref_buffer, src_buffer, nelements, epsilon, threshold)) { + error_count++; + } + + fclose(src_fp); + fclose(ref_fp); + + free(src_buffer); + free(ref_buffer); + } else { + if (src_fp) { + fclose(src_fp); + } + + if (ref_fp) { + fclose(ref_fp); + } + } + } + + if (error_count == 0) { + printf(" OK\n"); + } else { + printf(" FAILURE: %d errors...\n", (unsigned int)error_count); + } + + return (error_count == 0); // returns true if all pixels pass +} + +inline bool sdkCompareL2fe(const float *reference, const float *data, + const unsigned int len, const float epsilon) { + assert(epsilon >= 0); + + float error = 0; + float ref = 0; + + for (unsigned int i = 0; i < len; ++i) { + float diff = reference[i] - data[i]; + error += diff * diff; + ref += reference[i] * reference[i]; + } + + float normRef = sqrtf(ref); + + if (fabs(ref) < 1e-7) { +#ifdef _DEBUG + std::cerr << "ERROR, reference l2-norm is 0\n"; +#endif + return false; + } + + float normError = sqrtf(error); + error = normError / normRef; + bool result = error < epsilon; +#ifdef _DEBUG + + if (!result) { + std::cerr << "ERROR, l2-norm error " << error << " is greater than epsilon " + << epsilon << "\n"; + } + +#endif + + return result; +} + +inline bool sdkLoadPPMub(const char *file, unsigned char **data, + unsigned int *w, unsigned int *h) { + unsigned int channels; + return __loadPPM(file, data, w, h, &channels); +} + +inline bool sdkLoadPPM4ub(const char *file, unsigned char **data, + unsigned int *w, unsigned int *h) { + unsigned char *idata = 0; + unsigned int channels; + + if (__loadPPM(file, &idata, w, h, &channels)) { + // pad 4th component + int size = *w * *h; + // keep the original pointer + unsigned char *idata_orig = idata; + *data = (unsigned char *)malloc(sizeof(unsigned char) * size * 4); + unsigned char *ptr = *data; + + for (int i = 0; i < size; i++) { + *ptr++ = *idata++; + *ptr++ = *idata++; + *ptr++ = *idata++; + *ptr++ = 0; + } + + free(idata_orig); + return 
true; + } else { + free(idata); + return false; + } +} + +inline bool sdkComparePPM(const char *src_file, const char *ref_file, + const float epsilon, const float threshold, + bool verboseErrors) { + unsigned char *src_data, *ref_data; + uint64_t error_count = 0; + unsigned int ref_width, ref_height; + unsigned int src_width, src_height; + + if (src_file == NULL || ref_file == NULL) { + if (verboseErrors) { + std::cerr << "PPMvsPPM: src_file or ref_file is NULL." + " Aborting comparison\n"; + } + + return false; + } + + if (verboseErrors) { + std::cerr << "> Compare (a)rendered: <" << src_file << ">\n"; + std::cerr << "> (b)reference: <" << ref_file << ">\n"; + } + + if (sdkLoadPPM4ub(ref_file, &ref_data, &ref_width, &ref_height) != true) { + if (verboseErrors) { + std::cerr << "PPMvsPPM: unable to load ref image file: " << ref_file + << "\n"; + } + + return false; + } + + if (sdkLoadPPM4ub(src_file, &src_data, &src_width, &src_height) != true) { + std::cerr << "PPMvsPPM: unable to load src image file: " << src_file + << "\n"; + return false; + } + + if (src_height != ref_height || src_width != ref_width) { + if (verboseErrors) { + std::cerr << "PPMvsPPM: source and ref size mismatch (" << src_width + << "," << src_height << ")vs(" << ref_width << "," << ref_height + << ")\n"; + } + } + + if (verboseErrors) { + std::cerr << "PPMvsPPM: comparing images size (" << src_width << "," + << src_height << ") epsilon(" << epsilon << "), threshold(" + << threshold * 100 << "%)\n"; + } + + if (compareData(ref_data, src_data, src_width * src_height * 4, epsilon, + threshold) == false) { + error_count = 1; + } + + if (error_count == 0) { + if (verboseErrors) { + std::cerr << " OK\n\n"; + } + } else { + if (verboseErrors) { + std::cerr << " FAILURE! " << error_count << " errors...\n\n"; + } + } + + // returns true if all pixels pass + return (error_count == 0) ? 
true : false; +} + +inline bool sdkComparePGM(const char *src_file, const char *ref_file, + const float epsilon, const float threshold, + bool verboseErrors) { + unsigned char *src_data = 0, *ref_data = 0; + uint64_t error_count = 0; + unsigned int ref_width, ref_height; + unsigned int src_width, src_height; + + if (src_file == NULL || ref_file == NULL) { + if (verboseErrors) { + std::cerr << "PGMvsPGM: src_file or ref_file is NULL." + " Aborting comparison\n"; + } + + return false; + } + + if (verboseErrors) { + std::cerr << "> Compare (a)rendered: <" << src_file << ">\n"; + std::cerr << "> (b)reference: <" << ref_file << ">\n"; + } + + if (sdkLoadPPMub(ref_file, &ref_data, &ref_width, &ref_height) != true) { + if (verboseErrors) { + std::cerr << "PGMvsPGM: unable to load ref image file: " << ref_file + << "\n"; + } + + return false; + } + + if (sdkLoadPPMub(src_file, &src_data, &src_width, &src_height) != true) { + std::cerr << "PGMvsPGM: unable to load src image file: " << src_file + << "\n"; + return false; + } + + if (src_height != ref_height || src_width != ref_width) { + if (verboseErrors) { + std::cerr << "PGMvsPGM: source and ref size mismatch (" << src_width + << "," << src_height << ")vs(" << ref_width << "," << ref_height + << ")\n"; + } + } + + if (verboseErrors) + std::cerr << "PGMvsPGM: comparing images size (" << src_width << "," + << src_height << ") epsilon(" << epsilon << "), threshold(" + << threshold * 100 << "%)\n"; + + if (compareData(ref_data, src_data, src_width * src_height, epsilon, + threshold) == false) { + error_count = 1; + } + + if (error_count == 0) { + if (verboseErrors) { + std::cerr << " OK\n\n"; + } + } else { + if (verboseErrors) { + std::cerr << " FAILURE! " << error_count << " errors...\n\n"; + } + } + + // returns true if all pixels pass + return (error_count == 0) ? 
true : false; +} + +#endif // COMMON_HELPER_IMAGE_H_ diff --git a/Common/helper_string.h b/Common/helper_string.h new file mode 100644 index 00000000..050302a8 --- /dev/null +++ b/Common/helper_string.h @@ -0,0 +1,699 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +// These are helper functions for the SDK samples (string parsing, timers, etc) +#ifndef COMMON_HELPER_STRING_H_ +#define COMMON_HELPER_STRING_H_ + +#include +#include +#include +#include + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +#ifndef _CRT_SECURE_NO_DEPRECATE +#define _CRT_SECURE_NO_DEPRECATE +#endif +#ifndef STRCASECMP +#define STRCASECMP _stricmp +#endif +#ifndef STRNCASECMP +#define STRNCASECMP _strnicmp +#endif +#ifndef STRCPY +#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath) +#endif + +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result != 0) +#endif +#ifndef SSCANF +#define SSCANF sscanf_s +#endif +#ifndef SPRINTF +#define SPRINTF sprintf_s +#endif +#else // Linux Includes +#include +#include + +#ifndef STRCASECMP +#define STRCASECMP strcasecmp +#endif +#ifndef STRNCASECMP +#define STRNCASECMP strncasecmp +#endif +#ifndef STRCPY +#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath) +#endif + +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode)) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result == NULL) +#endif +#ifndef SSCANF +#define SSCANF sscanf +#endif +#ifndef SPRINTF +#define SPRINTF sprintf +#endif +#endif + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// CUDA Utility Helper Functions +inline int stringRemoveDelimiter(char delimiter, const char *string) { + int string_start = 0; + + while (string[string_start] == delimiter) { + string_start++; + } + + if (string_start >= static_cast(strlen(string) - 1)) { + return 0; + } + + return string_start; +} + +inline int getFileExtension(char *filename, char **extension) { + int string_length = static_cast(strlen(filename)); + + while (filename[string_length--] != '.') { + if (string_length == 0) break; + } + + if (string_length > 0) string_length += 2; + + if 
(string_length == 0) + *extension = NULL; + else + *extension = &filename[string_length]; + + return string_length; +} + +inline bool checkCmdLineFlag(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + + const char *equal_pos = strchr(string_argv, '='); + int argv_length = static_cast( + equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv); + + int length = static_cast(strlen(string_ref)); + + if (length == argv_length && + !STRNCASECMP(string_argv, string_ref, length)) { + bFound = true; + continue; + } + } + } + + return bFound; +} + +// This function wraps the CUDA Driver API into a template function +template +inline bool getCmdLineArgumentValue(const int argc, const char **argv, + const char *string_ref, T *value) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + *value = (T)atoi(&string_argv[length + auto_inc]); + } + + bFound = true; + i = argc; + } + } + } + + return bFound; +} + +inline int getCmdLineArgumentInt(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + int value = -1; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 
1 : 0; + value = atoi(&string_argv[length + auto_inc]); + } else { + value = 0; + } + + bFound = true; + continue; + } + } + } + + if (bFound) { + return value; + } else { + return 0; + } +} + +inline float getCmdLineArgumentFloat(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + float value = -1; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + value = static_cast(atof(&string_argv[length + auto_inc])); + } else { + value = 0.f; + } + + bFound = true; + continue; + } + } + } + + if (bFound) { + return value; + } else { + return 0; + } +} + +inline bool getCmdLineArgumentString(const int argc, const char **argv, + const char *string_ref, + char **string_retval) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + char *string_argv = const_cast(&argv[i][string_start]); + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + *string_retval = &string_argv[length + 1]; + bFound = true; + continue; + } + } + } + + if (!bFound) { + *string_retval = NULL; + } + + return bFound; +} + +////////////////////////////////////////////////////////////////////////////// +//! Find the path for a file assuming that +//! files are found in the searchPath. +//! +//! @return the path if succeeded, otherwise 0 +//! @param filename name of the file +//! 
@param executable_path optional absolute path of the executable +////////////////////////////////////////////////////////////////////////////// +inline char *sdkFindFilePath(const char *filename, + const char *executable_path) { + // defines a variable that is replaced with the name of the + // executable + + // Typical relative search paths to locate needed companion files (e.g. sample + // input data, or JIT source files) The origin for the relative search may be + // the .exe file, a .bat file launching an .exe, a browser .exe launching the + // .exe or .bat, etc + const char *searchPath[] = { + "./", // same dir + "./_data_files/", + "./common/", // "/common/" subdir + "./common/data/", // "/common/data/" subdir + "./data/", // "/data/" subdir + "./src/", // "/src/" subdir + "./src//data/", // "/src//data/" subdir + "./inc/", // "/inc/" subdir + "./0_Simple/", // "/0_Simple/" subdir + "./1_Utilities/", // "/1_Utilities/" subdir + "./2_Graphics/", // "/2_Graphics/" subdir + "./3_Imaging/", // "/3_Imaging/" subdir + "./4_Finance/", // "/4_Finance/" subdir + "./5_Simulations/", // "/5_Simulations/" subdir + "./6_Advanced/", // "/6_Advanced/" subdir + "./7_CUDALibraries/", // "/7_CUDALibraries/" subdir + "./8_Android/", // "/8_Android/" subdir + "./samples/", // "/samples/" subdir + + "./0_Simple//data/", // "/0_Simple//data/" + // subdir + "./1_Utilities//data/", // "/1_Utilities//data/" + // subdir + "./2_Graphics//data/", // "/2_Graphics//data/" + // subdir + "./3_Imaging//data/", // "/3_Imaging//data/" + // subdir + "./4_Finance//data/", // "/4_Finance//data/" + // subdir + "./5_Simulations//data/", // "/5_Simulations//data/" + // subdir + "./6_Advanced//data/", // "/6_Advanced//data/" + // subdir + "./7_CUDALibraries//", // "/7_CUDALibraries//" + // subdir + "./7_CUDALibraries//data/", // "/7_CUDALibraries//data/" + // subdir + + "../", // up 1 in tree + "../common/", // up 1 in tree, "/common/" subdir + "../common/data/", // up 1 in tree, "/common/data/" 
subdir + "../data/", // up 1 in tree, "/data/" subdir + "../src/", // up 1 in tree, "/src/" subdir + "../inc/", // up 1 in tree, "/inc/" subdir + + "../0_Simple//data/", // up 1 in tree, + // "/0_Simple//" + // subdir + "../1_Utilities//data/", // up 1 in tree, + // "/1_Utilities//" + // subdir + "../2_Graphics//data/", // up 1 in tree, + // "/2_Graphics//" + // subdir + "../3_Imaging//data/", // up 1 in tree, + // "/3_Imaging//" + // subdir + "../4_Finance//data/", // up 1 in tree, + // "/4_Finance//" + // subdir + "../5_Simulations//data/", // up 1 in tree, + // "/5_Simulations//" + // subdir + "../6_Advanced//data/", // up 1 in tree, + // "/6_Advanced//" + // subdir + "../7_CUDALibraries//data/", // up 1 in tree, + // "/7_CUDALibraries//" + // subdir + "../8_Android//data/", // up 1 in tree, + // "/8_Android//" + // subdir + "../samples//data/", // up 1 in tree, + // "/samples//" + // subdir + "../../", // up 2 in tree + "../../common/", // up 2 in tree, "/common/" subdir + "../../common/data/", // up 2 in tree, "/common/data/" subdir + "../../data/", // up 2 in tree, "/data/" subdir + "../../src/", // up 2 in tree, "/src/" subdir + "../../inc/", // up 2 in tree, "/inc/" subdir + "../../sandbox//data/", // up 2 in tree, + // "/sandbox//" + // subdir + "../../0_Simple//data/", // up 2 in tree, + // "/0_Simple//" + // subdir + "../../1_Utilities//data/", // up 2 in tree, + // "/1_Utilities//" + // subdir + "../../2_Graphics//data/", // up 2 in tree, + // "/2_Graphics//" + // subdir + "../../3_Imaging//data/", // up 2 in tree, + // "/3_Imaging//" + // subdir + "../../4_Finance//data/", // up 2 in tree, + // "/4_Finance//" + // subdir + "../../5_Simulations//data/", // up 2 in tree, + // "/5_Simulations//" + // subdir + "../../6_Advanced//data/", // up 2 in tree, + // "/6_Advanced//" + // subdir + "../../7_CUDALibraries//data/", // up 2 in tree, + // "/7_CUDALibraries//" + // subdir + "../../8_Android//data/", // up 2 in tree, + // "/8_Android//" + // subdir + 
"../../samples//data/", // up 2 in tree, + // "/samples//" + // subdir + "../../../", // up 3 in tree + "../../../src//", // up 3 in tree, + // "/src//" subdir + "../../../src//data/", // up 3 in tree, + // "/src//data/" + // subdir + "../../../src//src/", // up 3 in tree, + // "/src//src/" + // subdir + "../../../src//inc/", // up 3 in tree, + // "/src//inc/" + // subdir + "../../../sandbox//", // up 3 in tree, + // "/sandbox//" + // subdir + "../../../sandbox//data/", // up 3 in tree, + // "/sandbox//data/" + // subdir + "../../../sandbox//src/", // up 3 in tree, + // "/sandbox//src/" + // subdir + "../../../sandbox//inc/", // up 3 in tree, + // "/sandbox//inc/" + // subdir + "../../../0_Simple//data/", // up 3 in tree, + // "/0_Simple//" + // subdir + "../../../1_Utilities//data/", // up 3 in tree, + // "/1_Utilities//" + // subdir + "../../../2_Graphics//data/", // up 3 in tree, + // "/2_Graphics//" + // subdir + "../../../3_Imaging//data/", // up 3 in tree, + // "/3_Imaging//" + // subdir + "../../../4_Finance//data/", // up 3 in tree, + // "/4_Finance//" + // subdir + "../../../5_Simulations//data/", // up 3 in tree, + // "/5_Simulations//" + // subdir + "../../../6_Advanced//data/", // up 3 in tree, + // "/6_Advanced//" + // subdir + "../../../7_CUDALibraries//data/", // up 3 in tree, + // "/7_CUDALibraries//" + // subdir + "../../../8_Android//data/", // up 3 in tree, + // "/8_Android//" + // subdir + "../../../0_Simple//", // up 3 in tree, + // "/0_Simple//" + // subdir + "../../../1_Utilities//", // up 3 in tree, + // "/1_Utilities//" + // subdir + "../../../2_Graphics//", // up 3 in tree, + // "/2_Graphics//" + // subdir + "../../../3_Imaging//", // up 3 in tree, + // "/3_Imaging//" + // subdir + "../../../4_Finance//", // up 3 in tree, + // "/4_Finance//" + // subdir + "../../../5_Simulations//", // up 3 in tree, + // "/5_Simulations//" + // subdir + "../../../6_Advanced//", // up 3 in tree, + // "/6_Advanced//" + // subdir + 
"../../../7_CUDALibraries//", // up 3 in tree, + // "/7_CUDALibraries//" + // subdir + "../../../8_Android//", // up 3 in tree, + // "/8_Android//" + // subdir + "../../../samples//data/", // up 3 in tree, + // "/samples//" + // subdir + "../../../common/", // up 3 in tree, "../../../common/" subdir + "../../../common/data/", // up 3 in tree, "../../../common/data/" subdir + "../../../data/", // up 3 in tree, "../../../data/" subdir + "../../../../", // up 4 in tree + "../../../../src//", // up 4 in tree, + // "/src//" subdir + "../../../../src//data/", // up 4 in tree, + // "/src//data/" + // subdir + "../../../../src//src/", // up 4 in tree, + // "/src//src/" + // subdir + "../../../../src//inc/", // up 4 in tree, + // "/src//inc/" + // subdir + "../../../../sandbox//", // up 4 in tree, + // "/sandbox//" + // subdir + "../../../../sandbox//data/", // up 4 in tree, + // "/sandbox//data/" + // subdir + "../../../../sandbox//src/", // up 4 in tree, + // "/sandbox//src/" + // subdir + "../../../../sandbox//inc/", // up 4 in tree, + // "/sandbox//inc/" + // subdir + "../../../../0_Simple//data/", // up 4 in tree, + // "/0_Simple//" + // subdir + "../../../../1_Utilities//data/", // up 4 in tree, + // "/1_Utilities//" + // subdir + "../../../../2_Graphics//data/", // up 4 in tree, + // "/2_Graphics//" + // subdir + "../../../../3_Imaging//data/", // up 4 in tree, + // "/3_Imaging//" + // subdir + "../../../../4_Finance//data/", // up 4 in tree, + // "/4_Finance//" + // subdir + "../../../../5_Simulations//data/", // up 4 in tree, + // "/5_Simulations//" + // subdir + "../../../../6_Advanced//data/", // up 4 in tree, + // "/6_Advanced//" + // subdir + "../../../../7_CUDALibraries//data/", // up 4 in tree, + // "/7_CUDALibraries//" + // subdir + "../../../../8_Android//data/", // up 4 in tree, + // "/8_Android//" + // subdir + "../../../../0_Simple//", // up 4 in tree, + // "/0_Simple//" + // subdir + "../../../../1_Utilities//", // up 4 in tree, + // "/1_Utilities//" + 
// subdir + "../../../../2_Graphics//", // up 4 in tree, + // "/2_Graphics//" + // subdir + "../../../../3_Imaging//", // up 4 in tree, + // "/3_Imaging//" + // subdir + "../../../../4_Finance//", // up 4 in tree, + // "/4_Finance//" + // subdir + "../../../../5_Simulations//", // up 4 in tree, + // "/5_Simulations//" + // subdir + "../../../../6_Advanced//", // up 4 in tree, + // "/6_Advanced//" + // subdir + "../../../../7_CUDALibraries//", // up 4 in tree, + // "/7_CUDALibraries//" + // subdir + "../../../../8_Android//", // up 4 in tree, + // "/8_Android//" + // subdir + "../../../../samples//data/", // up 4 in tree, + // "/samples//" + // subdir + "../../../../common/", // up 4 in tree, "../../../common/" subdir + "../../../../common/data/", // up 4 in tree, "../../../common/data/" + // subdir + "../../../../data/", // up 4 in tree, "../../../data/" subdir + "../../../../../", // up 5 in tree + "../../../../../src//", // up 5 in tree, + // "/src//" + // subdir + "../../../../../src//data/", // up 5 in tree, + // "/src//data/" + // subdir + "../../../../../src//src/", // up 5 in tree, + // "/src//src/" + // subdir + "../../../../../src//inc/", // up 5 in tree, + // "/src//inc/" + // subdir + "../../../../../sandbox//", // up 5 in tree, + // "/sandbox//" + // subdir + "../../../../../sandbox//data/", // up 5 in tree, + // "/sandbox//data/" + // subdir + "../../../../../sandbox//src/", // up 5 in tree, + // "/sandbox//src/" + // subdir + "../../../../../sandbox//inc/", // up 5 in tree, + // "/sandbox//inc/" + // subdir + "../../../../../0_Simple//data/", // up 5 in tree, + // "/0_Simple//" + // subdir + "../../../../../1_Utilities//data/", // up 5 in tree, + // "/1_Utilities//" + // subdir + "../../../../../2_Graphics//data/", // up 5 in tree, + // "/2_Graphics//" + // subdir + "../../../../../3_Imaging//data/", // up 5 in tree, + // "/3_Imaging//" + // subdir + "../../../../../4_Finance//data/", // up 5 in tree, + // "/4_Finance//" + // subdir + 
"../../../../../5_Simulations//data/", // up 5 in tree, + // "/5_Simulations//" + // subdir + "../../../../../6_Advanced//data/", // up 5 in tree, + // "/6_Advanced//" + // subdir + "../../../../../7_CUDALibraries//data/", // up 5 in + // tree, + // "/7_CUDALibraries//" + // subdir + "../../../../../8_Android//data/", // up 5 in tree, + // "/8_Android//" + // subdir + "../../../../../samples//data/", // up 5 in tree, + // "/samples//" + // subdir + "../../../../../common/", // up 5 in tree, "../../../common/" subdir + "../../../../../common/data/", // up 5 in tree, "../../../common/data/" + // subdir + }; + + // Extract the executable name + std::string executable_name; + + if (executable_path != 0) { + executable_name = std::string(executable_path); + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + // Windows path delimiter + size_t delimiter_pos = executable_name.find_last_of('\\'); + executable_name.erase(0, delimiter_pos + 1); + + if (executable_name.rfind(".exe") != std::string::npos) { + // we strip .exe, only if the .exe is found + executable_name.resize(executable_name.size() - 4); + } + +#else + // Linux & OSX path delimiter + size_t delimiter_pos = executable_name.find_last_of('/'); + executable_name.erase(0, delimiter_pos + 1); +#endif + } + + // Loop over all search paths and return the first hit + for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) { + std::string path(searchPath[i]); + size_t executable_name_pos = path.find(""); + + // If there is executable_name variable in the searchPath + // replace it with the value + if (executable_name_pos != std::string::npos) { + if (executable_path != 0) { + path.replace(executable_name_pos, strlen(""), + executable_name); + } else { + // Skip this path entry if no executable argument is given + continue; + } + } + +#ifdef _DEBUG + printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str()); +#endif + + // Test if the file exists + path.append(filename); + 
FILE *fp; + FOPEN(fp, path.c_str(), "rb"); + + if (fp != NULL) { + fclose(fp); + // File found + // returning an allocated array here for backwards compatibility reasons + char *file_path = reinterpret_cast(malloc(path.length() + 1)); + STRCPY(file_path, path.length() + 1, path.c_str()); + return file_path; + } + + if (fp) { + fclose(fp); + } + } + + // File not found + return 0; +} + +#endif // COMMON_HELPER_STRING_H_ diff --git a/Common/helper_timer.h b/Common/helper_timer.h new file mode 100644 index 00000000..2e21cdc4 --- /dev/null +++ b/Common/helper_timer.h @@ -0,0 +1,465 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// Helper Timing Functions +#ifndef COMMON_HELPER_TIMER_H_ +#define COMMON_HELPER_TIMER_H_ + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// includes, system +#include + +// includes, project +#include + +// Definition of the StopWatch Interface, this is used if we don't want to use +// the CUT functions But rather in a self contained class interface +class StopWatchInterface { + public: + StopWatchInterface() {} + virtual ~StopWatchInterface() {} + + public: + //! Start time measurement + virtual void start() = 0; + + //! Stop time measurement + virtual void stop() = 0; + + //! Reset time counters to zero + virtual void reset() = 0; + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + virtual float getTime() = 0; + + //! Mean time to date based on the number of times the stopwatch has been + //! 
_stopped_ (ie finished sessions) and the current total time + virtual float getAverageTime() = 0; +}; + +////////////////////////////////////////////////////////////////// +// Begin Stopwatch timer class definitions for all OS platforms // +////////////////////////////////////////////////////////////////// +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +// includes, system +#define WINDOWS_LEAN_AND_MEAN +#include +#undef min +#undef max + +//! Windows specific implementation of StopWatch +class StopWatchWin : public StopWatchInterface { + public: + //! Constructor, default + StopWatchWin() + : start_time(), + end_time(), + diff_time(0.0f), + total_time(0.0f), + running(false), + clock_sessions(0), + freq(0), + freq_set(false) { + if (!freq_set) { + // helper variable + LARGE_INTEGER temp; + + // get the tick frequency from the OS + QueryPerformanceFrequency(reinterpret_cast(&temp)); + + // convert to type in which it is needed + freq = (static_cast(temp.QuadPart)) / 1000.0; + + // rememeber query + freq_set = true; + } + } + + // Destructor + ~StopWatchWin() {} + + public: + //! Start time measurement + inline void start(); + + //! Stop time measurement + inline void stop(); + + //! Reset time counters to zero + inline void reset(); + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + inline float getTime(); + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + inline float getAverageTime(); + + private: + // member variables + + //! Start of measurement + LARGE_INTEGER start_time; + //! End of measurement + LARGE_INTEGER end_time; + + //! Time difference between the last start and stop + float diff_time; + + //! 
TOTAL time difference between starts and stops + float total_time; + + //! flag if the stop watch is running + bool running; + + //! Number of times clock has been started + //! and stopped to allow averaging + int clock_sessions; + + //! tick frequency + double freq; + + //! flag if the frequency has been set + bool freq_set; +}; + +// functions, inlined + +//////////////////////////////////////////////////////////////////////////////// +//! Start time measurement +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchWin::start() { + QueryPerformanceCounter(reinterpret_cast(&start_time)); + running = true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop time measurement and increment add to the current diff_time summation +//! variable. Also increment the number of times this clock has been run. +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchWin::stop() { + QueryPerformanceCounter(reinterpret_cast(&end_time)); + diff_time = static_cast(((static_cast(end_time.QuadPart) - + static_cast(start_time.QuadPart)) / + freq)); + + total_time += diff_time; + clock_sessions++; + running = false; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Reset the timer to 0. Does not change the timer running state but does +//! recapture this point in time as the current start time if it is running. +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchWin::reset() { + diff_time = 0; + total_time = 0; + clock_sessions = 0; + + if (running) { + QueryPerformanceCounter(reinterpret_cast(&start_time)); + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. after start. If the stop watch is still running (i.e. there +//! 
was no call to stop()) then the elapsed time is returned added to the +//! current diff_time sum, otherwise the current summed time difference alone +//! is returned. +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchWin::getTime() { + // Return the TOTAL time to date + float retval = total_time; + + if (running) { + LARGE_INTEGER temp; + QueryPerformanceCounter(reinterpret_cast(&temp)); + retval += static_cast(((static_cast(temp.QuadPart) - + static_cast(start_time.QuadPart)) / + freq)); + } + + return retval; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. for a single run based on the total number of COMPLETED runs +//! and the total time. +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchWin::getAverageTime() { + return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f; +} +#else +// Declarations for Stopwatch on Linux and Mac OSX +// includes, system +#include +#include + +//! Windows specific implementation of StopWatch +class StopWatchLinux : public StopWatchInterface { + public: + //! Constructor, default + StopWatchLinux() + : start_time(), + diff_time(0.0), + total_time(0.0), + running(false), + clock_sessions(0) {} + + // Destructor + virtual ~StopWatchLinux() {} + + public: + //! Start time measurement + inline void start(); + + //! Stop time measurement + inline void stop(); + + //! Reset time counters to zero + inline void reset(); + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + inline float getTime(); + + //! Mean time to date based on the number of times the stopwatch has been + //! 
_stopped_ (ie finished sessions) and the current total time + inline float getAverageTime(); + + private: + // helper functions + + //! Get difference between start time and current time + inline float getDiffTime(); + + private: + // member variables + + //! Start of measurement + struct timeval start_time; + + //! Time difference between the last start and stop + float diff_time; + + //! TOTAL time difference between starts and stops + float total_time; + + //! flag if the stop watch is running + bool running; + + //! Number of times clock has been started + //! and stopped to allow averaging + int clock_sessions; +}; + +// functions, inlined + +//////////////////////////////////////////////////////////////////////////////// +//! Start time measurement +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchLinux::start() { + gettimeofday(&start_time, 0); + running = true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop time measurement and increment add to the current diff_time summation +//! variable. Also increment the number of times this clock has been run. +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchLinux::stop() { + diff_time = getDiffTime(); + total_time += diff_time; + running = false; + clock_sessions++; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Reset the timer to 0. Does not change the timer running state but does +//! recapture this point in time as the current start time if it is running. +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchLinux::reset() { + diff_time = 0; + total_time = 0; + clock_sessions = 0; + + if (running) { + gettimeofday(&start_time, 0); + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. after start. 
If the stop watch is still running (i.e. there +//! was no call to stop()) then the elapsed time is returned added to the +//! current diff_time sum, otherwise the current summed time difference alone +//! is returned. +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchLinux::getTime() { + // Return the TOTAL time to date + float retval = total_time; + + if (running) { + retval += getDiffTime(); + } + + return retval; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. for a single run based on the total number of COMPLETED runs +//! and the total time. +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchLinux::getAverageTime() { + return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f; +} +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchLinux::getDiffTime() { + struct timeval t_time; + gettimeofday(&t_time, 0); + + // time difference in milli-seconds + return static_cast(1000.0 * (t_time.tv_sec - start_time.tv_sec) + + (0.001 * (t_time.tv_usec - start_time.tv_usec))); +} +#endif // WIN32 + +//////////////////////////////////////////////////////////////////////////////// +//! Timer functionality exported + +//////////////////////////////////////////////////////////////////////////////// +//! Create a new timer +//! @return true if a time has been created, otherwise false +//! 
@param name of the new timer, 0 if the creation failed +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkCreateTimer(StopWatchInterface **timer_interface) { +// printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface); +// Instantiate the stop watch implementation that matches the platform. +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + *timer_interface = reinterpret_cast(new StopWatchWin()); +#else + *timer_interface = + reinterpret_cast(new StopWatchLinux()); +#endif + return (*timer_interface != NULL) ? true : false; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Delete a timer +//! @return always true; a NULL *timer_interface is skipped +//! @param timer_interface address of the timer pointer; the timer is deleted +//! and *timer_interface is set to NULL +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkDeleteTimer(StopWatchInterface **timer_interface) { + // printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + delete *timer_interface; + *timer_interface = NULL; + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Start the timer referenced by \a timer_interface +//! @return always true; a NULL *timer_interface is skipped +//! @param timer_interface address of the pointer to the timer to start +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkStartTimer(StopWatchInterface **timer_interface) { + // printf("sdkStartTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + (*timer_interface)->start(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop the time with name \a name. Does not reset. +//! 
@param name name of the timer to stop +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkStopTimer(StopWatchInterface **timer_interface) { + // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + (*timer_interface)->stop(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Resets the timer's counter. +//! @return always true; a NULL *timer_interface is skipped +//! @param timer_interface address of the pointer to the timer to reset. +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkResetTimer(StopWatchInterface **timer_interface) { + // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + (*timer_interface)->reset(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Return the average time for timer execution as the total time +//! for the timer divided by the number of completed (stopped) runs the timer +//! has made. +//! Excludes the current running time if the timer is currently running. +//! @return the value of getAverageTime(), or 0.0f when *timer_interface is NULL +//! @param timer_interface address of the pointer to the timer to query +//////////////////////////////////////////////////////////////////////////////// +inline float sdkGetAverageTimerValue(StopWatchInterface **timer_interface) { + // printf("sdkGetAverageTimerValue called object %08x\n", (void + // *)*timer_interface); + if (*timer_interface) { + return (*timer_interface)->getAverageTime(); + } else { + return 0.0f; + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Total execution time for the timer over all runs since the last reset +//! or timer creation. +//! @param timer_interface address of the pointer to the timer to obtain the +//! value of; 0.0f is returned when *timer_interface is NULL. 
+//////////////////////////////////////////////////////////////////////////////// +inline float sdkGetTimerValue(StopWatchInterface **timer_interface) { + // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + return (*timer_interface)->getTime(); + } else { + return 0.0f; + } +} + +#endif // COMMON_HELPER_TIMER_H_ diff --git a/Common/nvrtc_helper.h b/Common/nvrtc_helper.h new file mode 100644 index 00000000..52e75abb --- /dev/null +++ b/Common/nvrtc_helper.h @@ -0,0 +1,170 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef COMMON_NVRTC_HELPER_H_ + +#define COMMON_NVRTC_HELPER_H_ 1 + +#include +#include +#include +#include +#include +#include +#include + +#define NVRTC_SAFE_CALL(Name, x) \ + do { \ + nvrtcResult result = x; \ + if (result != NVRTC_SUCCESS) { \ + std::cerr << "\nerror: " << Name << " failed with error " \ + << nvrtcGetErrorString(result); \ + exit(1); \ + } \ + } while (0) + +void compileFileToPTX(char *filename, int argc, char **argv, char **ptxResult, + size_t *ptxResultSize, int requiresCGheaders) { + std::ifstream inputFile(filename, + std::ios::in | std::ios::binary | std::ios::ate); + + if (!inputFile.is_open()) { + std::cerr << "\nerror: unable to open " << filename << " for reading!\n"; + exit(1); + } + + std::streampos pos = inputFile.tellg(); + size_t inputSize = (size_t)pos; + char *memBlock = new char[inputSize + 1]; + + inputFile.seekg(0, std::ios::beg); + inputFile.read(memBlock, inputSize); + inputFile.close(); + memBlock[inputSize] = '\x0'; + + int numCompileOptions = 0; + + char *compileParams[1]; + + if (requiresCGheaders) { + std::string compileOptions; + char HeaderNames[256]; +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + sprintf_s(HeaderNames, sizeof(HeaderNames), "%s", "cooperative_groups.h"); +#else + snprintf(HeaderNames, sizeof(HeaderNames), "%s", "cooperative_groups.h"); +#endif + + compileOptions = "--include-path="; + + std::string path = 
sdkFindFilePath(HeaderNames, argv[0]); + if (!path.empty()) { + std::size_t found = path.find(HeaderNames); + path.erase(found); + } else { + printf( + "\nCooperativeGroups headers not found, please install it in %s " + "sample directory..\n Exiting..\n", + argv[0]); + } + compileOptions += path.c_str(); + compileParams[0] = reinterpret_cast( + malloc(sizeof(char) * (compileOptions.length() + 1))); +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + sprintf_s(compileParams[0], sizeof(char) * (compileOptions.length() + 1), + "%s", compileOptions.c_str()); +#else + snprintf(compileParams[0], compileOptions.size(), "%s", + compileOptions.c_str()); +#endif + numCompileOptions++; + } + + // compile + nvrtcProgram prog; + NVRTC_SAFE_CALL("nvrtcCreateProgram", + nvrtcCreateProgram(&prog, memBlock, filename, 0, NULL, NULL)); + + nvrtcResult res = nvrtcCompileProgram(prog, numCompileOptions, compileParams); + + // dump log + size_t logSize; + NVRTC_SAFE_CALL("nvrtcGetProgramLogSize", + nvrtcGetProgramLogSize(prog, &logSize)); + char *log = reinterpret_cast(malloc(sizeof(char) * logSize + 1)); + NVRTC_SAFE_CALL("nvrtcGetProgramLog", nvrtcGetProgramLog(prog, log)); + log[logSize] = '\x0'; + + if (strlen(log) >= 2) { + std::cerr << "\n compilation log ---\n"; + std::cerr << log; + std::cerr << "\n end log ---\n"; + } + + free(log); + + NVRTC_SAFE_CALL("nvrtcCompileProgram", res); + // fetch PTX + size_t ptxSize; + NVRTC_SAFE_CALL("nvrtcGetPTXSize", nvrtcGetPTXSize(prog, &ptxSize)); + char *ptx = reinterpret_cast(malloc(sizeof(char) * ptxSize)); + NVRTC_SAFE_CALL("nvrtcGetPTX", nvrtcGetPTX(prog, ptx)); + NVRTC_SAFE_CALL("nvrtcDestroyProgram", nvrtcDestroyProgram(&prog)); + *ptxResult = ptx; + *ptxResultSize = ptxSize; + + if (requiresCGheaders) free(compileParams[0]); +} + +CUmodule loadPTX(char *ptx, int argc, char **argv) { + CUmodule module; + CUcontext context; + int major = 0, minor = 0; + char deviceName[256]; + + // Picks the best CUDA device 
available + CUdevice cuDevice = findCudaDeviceDRV(argc, (const char **)argv); + + // get compute capabilities and the devicename + checkCudaErrors(cuDeviceGetAttribute( + &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)); + checkCudaErrors(cuDeviceGetAttribute( + &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)); + checkCudaErrors(cuDeviceGetName(deviceName, 256, cuDevice)); + printf("> GPU Device has SM %d.%d compute capability\n", major, minor); + + // Create the context on the device selected above. Re-querying device 0 + // at this point would discard that selection and could put the context on + // a different GPU than the one whose capability was just reported. + checkCudaErrors(cuInit(0)); + checkCudaErrors(cuCtxCreate(&context, 0, cuDevice)); + + // Load the PTX image as a module, then release the caller's buffer. + checkCudaErrors(cuModuleLoadDataEx(&module, ptx, 0, 0, 0)); + free(ptx); + + return module; +} + +#endif // COMMON_NVRTC_HELPER_H_ diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..afba8ea0 --- /dev/null +++ b/LICENSE @@ -0,0 +1,25 @@ +Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of NVIDIA CORPORATION nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..4c8265e0 --- /dev/null +++ b/Makefile @@ -0,0 +1,64 @@ +############################################################################### +# +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +############################################################################### +# +# CUDA Samples +# +############################################################################### + +TARGET_ARCH ?= $(shell uname -m) + +# Project folders that contain CUDA samples +PROJECTS ?= $(shell find Samples -name Makefile) + +# Sample Makefiles listed here are excluded from every goal below +FILTER_OUT := + +PROJECTS := $(filter-out $(FILTER_OUT),$(PROJECTS)) + +# None of the top-level goals creates a file of the same name +.PHONY: all build tidy clean clobber + +# Per-sample pseudo targets: the stem ($*) is the path of a sample Makefile and +# the recipe runs make inside that Makefile's directory ($(dir $*)). +# .ph_build forwards the goals given on the command line. +%.ph_build : + +@$(MAKE) -C $(dir $*) $(MAKECMDGOALS) + +%.ph_clean : + +@$(MAKE) -C $(dir $*) clean $(USE_DEVICE) + +%.ph_clobber : + +@$(MAKE) -C $(dir $*) clobber $(USE_DEVICE) + +all: $(addsuffix .ph_build,$(PROJECTS)) + @echo "Finished building CUDA samples" + +build: $(addsuffix .ph_build,$(PROJECTS)) + +# Delete every path found below the current directory whose name contains +# '#' or '~' +tidy: + @find * | egrep "#" | xargs rm -f + @find * | egrep "\~" | xargs rm -f + +clean: tidy $(addsuffix .ph_clean,$(PROJECTS)) + +clobber: clean $(addsuffix .ph_clobber,$(PROJECTS)) diff --git a/README.md b/README.md new file mode 100644 index 00000000..f4173548 --- /dev/null +++ b/README.md @@ -0,0 +1,261 @@ +# CUDA Samples + +Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads). + +## Release Notes + +This section describes the release notes for the CUDA Samples on GitHub only. + +### CUDA 9.2 + +This is the first release of CUDA Samples on GitHub: +* Added `warpAggregatedAtomicsCG`. 
Demonstrates warp aggregated atomics using Cooperative Groups. +* Added `deviceQuery`. Enumerates the properties of the CUDA devices present in the system. +* Added `matrixMul`. Demonstrates a matrix multiplication using shared memory through tiled approach. +* Added `matrixMulDrv`. Demonstrates a matrix multiplication using shared memory through tiled approach, uses CUDA Driver API. +* Added `cudaTensorCoreGemm`. Demonstrates a GEMM computation using the Warp Matrix Multiply and Accumulate (WMMA) API introduced in CUDA 9, as well as the new Tensor Cores introduced in the Volta chip family. +* Added `simpleVoteIntrinsics` which uses *_sync equivalent of the vote intrinsics _any, _all added since CUDA 9.0. +* Added `shfl_scan` which uses *_sync equivalent of the shfl intrinsics added since CUDA 9.0. + +## Getting Started + +### Prerequisites + +Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +For system requirements and installation instructions of cuda toolkit, please refer to the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/), the [Windows Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html), and the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html). + +### Getting the CUDA Samples + +Using git clone the repository of CUDA Samples using the command below. +``` +git clone +``` + +Without using git the easiest way to use these samples is to download the zip file containing the current version by clicking the "Download ZIP" button on the repo page. You can then unzip the entire archive and use the samples. + +## Building CUDA Samples + +### Windows + +The Windows samples are built using the Visual Studio IDE. 
Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs.sln - for Visual Studio +``` +Complete samples solution files exist at parent directory of the repo: + +Each individual sample has its own set of solution files at: +`\Samples\\` + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. +> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check [DirectX Dependencies](#directx) section for details." + +### Linux +The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l, aarch64. + By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
`$ make TARGET_ARCH=aarch64`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details on cross platform compilation of cuda samples. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. + ``` + $ make HOST_COMPILER=g++ + ``` + +### Mac +The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: +``` +$ cd +$ make +``` + +The samples makefiles can take advantage of certain options: + +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` + +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". + ``` + $ make SMS="A B ..." + ``` + +* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. 
+ ``` + $ make HOST_COMPILER=clang + ``` + +## Samples list + +### Samples by OS + +#### Linux +**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | +---|---|---|---| +**[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[matrixMul](./Samples/matrixMul)** | **[deviceQuery](./Samples/deviceQuery)** | + +#### Windows +**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | +---|---|---|---| +**[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[matrixMul](./Samples/matrixMul)** | **[deviceQuery](./Samples/deviceQuery)** | + +#### Mac OSX +**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | +---|---|---|---| +**[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[matrixMul](./Samples/matrixMul)** | **[deviceQuery](./Samples/deviceQuery)** | + +## Dependencies + +Some CUDA Samples rely on third-party applications and/or libraries, or features provided by the CUDA Toolkit and Driver, to either build or execute. These dependencies are listed below. + +If a sample has a third-party dependency that is available on the system, but is not installed, the sample will waive itself at build time. + +Each sample's dependencies are listed in its README's Dependencies section. + +### Third-Party Dependencies + +These third-party dependencies are required by some CUDA samples. If available, these dependencies are either installed on your system automatically, or are installable via your system's package manager (Linux) or a third-party website. + +#### FreeImage + +FreeImage is an open source imaging library. 
FreeImage can usually be installed on Linux using your distribution's package manager system. FreeImage can also be downloaded from the [FreeImage website](http://freeimage.sourceforge.net/). FreeImage is also redistributed with the CUDA Samples. + +#### Message Passing Interface + +MPI (Message Passing Interface) is an API for communicating data between distributed processes. An MPI compiler can be installed using your Linux distribution's package manager system. It is also available on some online resources, such as [Open MPI](http://www.open-mpi.org/). On Windows, to build and run MPI-CUDA applications one can install [MS-MPI SDK](https://msdn.microsoft.com/en-us/library/bb524831(v=vs.85).aspx). + +#### Only 64-Bit + +Some samples can only be run on a 64-bit operating system. + +#### DirectX + +DirectX is a collection of APIs designed to allow development of multimedia applications on Microsoft platforms. For Microsoft platforms, NVIDIA's CUDA Driver supports DirectX. Several CUDA Samples for Windows demonstrate CUDA-DirectX interoperability. To build such samples, one needs to install the [Direct X SDK (June 2010 or newer)](http://www.microsoft.com/en-us/download/details.aspx?id=6812); this is required on Windows 7, Windows 10 and Windows Server 2008. Other Windows OSes do not need an explicit installation of the DirectX SDK. + +#### OpenGL + +OpenGL is a graphics library used for 2D and 3D rendering. On systems which support OpenGL, NVIDIA's OpenGL implementation is provided with the CUDA Driver. + +#### OpenGL ES + +OpenGL ES is an embedded systems graphics library used for 2D and 3D rendering. On systems which support OpenGL ES, NVIDIA's OpenGL ES implementation is provided with the CUDA Driver. + +#### OpenMP + +OpenMP is an API for multiprocessing programming. OpenMP can be installed using your Linux distribution's package manager system. It usually comes preinstalled with GCC. It can also be found at the [OpenMP website](http://openmp.org/). 
+ +#### Screen + +Screen is a windowing system found on the QNX operating system. Screen is usually found as part of the root filesystem. + +#### X11 + +X11 is a windowing system commonly found on *-nix style operating systems. X11 can be installed using your Linux distribution's package manager, and comes preinstalled on Mac OS X systems. + +#### EGL + +EGL is an interface between Khronos rendering APIs (such as OpenGL, OpenGL ES or OpenVG) and the underlying native platform windowing system. + +#### EGLOutput + +EGLOutput is a set of EGL extensions which allow EGL to render directly to the display. + +#### EGLSync + +EGLSync is a set of EGL extensions which provide sync objects: synchronization primitives representing events whose completion can be tested or waited upon. + +### CUDA Features + +These CUDA features are needed by some CUDA samples. They are provided by either the CUDA Toolkit or CUDA Driver. Some features may not be available on your system. + +#### CUFFT Callback Routines + +CUFFT Callback Routines are user-supplied kernel routines that CUFFT will call when loading or storing data. These callback routines are only available on Linux x86_64 and ppc64le systems. + +#### CUDA Dynamic Parallelism + +CDP (CUDA Dynamic Parallelism) allows kernels to be launched from threads running on the GPU. CDP is only available on GPUs with SM architecture of 3.5 or above. + +#### Multi-block Cooperative Groups + +Multi Block Cooperative Groups (MBCG) extends Cooperative Groups and the CUDA programming model to express inter-thread-block synchronization. MBCG is available on GPUs with Pascal and higher architecture on Linux systems. + +#### CUBLAS + +CUBLAS (CUDA Basic Linear Algebra Subroutines) is a GPU-accelerated version of the BLAS library. + +#### CUDA Interprocess Communication + +IPC (Interprocess Communication) allows processes to share device pointers. IPC is only available on Linux x86_64 and ppc64le systems. 
+ +#### CUFFT + +CUFFT (CUDA Fast Fourier Transform) is a GPU-accelerated FFT library. + +#### CURAND + +CURAND (CUDA Random Number Generation) is a GPU-accelerated RNG library. + +#### CUSPARSE + +CUSPARSE (CUDA Sparse Matrix) provides linear algebra subroutines used for sparse matrix calculations. + +#### CUSOLVER + +The CUSOLVER library is a high-level package based on the CUBLAS and CUSPARSE libraries. It combines three separate libraries under a single umbrella, each of which can be used independently or in concert with other toolkit libraries. The intent of CUSOLVER is to provide useful LAPACK-like features, such as common matrix factorization and triangular solve routines for dense matrices, a sparse least-squares solver and an eigenvalue solver. In addition, CUSOLVER provides a new refactorization library useful for solving sequences of matrices with a shared sparsity pattern. + +#### NPP + +NPP (NVIDIA Performance Primitives) provides GPU-accelerated image, video, and signal processing functions. + +#### NVGRAPH + +NVGRAPH is a GPU-accelerated graph analytics library. + +#### NVRTC + +NVRTC (CUDA RunTime Compilation) is a runtime compilation library for CUDA C++. + +#### NVCUVID + +NVCUVID (NVIDIA CUDA Video Decoder) provides GPU-accelerated video decoding capabilities. + +#### Stream Priorities + +Stream Priorities allows the creation of streams with specified priorities. Stream Priorities is only available on GPUs with SM architecture of 3.5 or above. + +#### Unified Virtual Memory + +UVM (Unified Virtual Memory) enables memory that can be accessed by both the CPU and GPU without explicit copying between the two. UVM is only available on Linux and Windows systems. + +#### 16-bit Floating Point + +FP16 is a 16-bit floating-point format. One bit is used for the sign, five bits for the exponent, and ten bits for the mantissa. + +#### C++11 CUDA + +NVCC support of [C++11 features](https://en.wikipedia.org/wiki/C++11). 
+ +## Contributors Guide + +We welcome your input on issues and suggestions for new samples. At this time we are not accepting contributions from the public, check back here as we evolve our contribution model. + +We use Google C++ Style Guide for all the sources https://google.github.io/styleguide/cppguide.html + +## Frequently Asked Questions + +Answers to frequently asked questions about CUDA can be found at http://developer.nvidia.com/cuda-faq and in the [CUDA Toolkit Release Notes](http://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html). + diff --git a/Samples/deviceQuery/Makefile b/Samples/deviceQuery/Makefile new file mode 100644 index 00000000..445c1cc5 --- /dev/null +++ b/Samples/deviceQuery/Makefile @@ -0,0 +1,287 @@ +################################################################################ +# +# Copyright 1993-2015 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO USER: +# +# This source code is subject to NVIDIA ownership rights under U.S. and +# international Copyright laws. +# +# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE +# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR +# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH +# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +# OR PERFORMANCE OF THIS SOURCE CODE. +# +# U.S. Government End Users. This source code is a "commercial item" as +# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +# "commercial computer software" and "commercial computer software +# documentation" as such terms are used in 48 C.F.R. 
12.212 (SEPT 1995) +# and is provided to the U.S. Government only as a commercial end item. +# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the +# source code with only those rights set forth herein. +# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + 
TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) +endif +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + 
export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= aarch64-linux-android-g++ + endif + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif +endif + +ifeq ($(TARGET_OS),qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM + LDFLAGS += -lsocket +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else 
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +# Gencode arguments +SMS ?= 30 35 37 50 52 60 61 70 + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: deviceQuery + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +deviceQuery.o:deviceQuery.cpp + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +deviceQuery: deviceQuery.o + $(EXEC) 
$(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./deviceQuery + +clean: + rm -f deviceQuery deviceQuery.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/deviceQuery + +clobber: clean diff --git a/Samples/deviceQuery/NsightEclipse.xml b/Samples/deviceQuery/NsightEclipse.xml new file mode 100644 index 00000000..6c26a3ed --- /dev/null +++ b/Samples/deviceQuery/NsightEclipse.xml @@ -0,0 +1,70 @@ + + + + deviceQuery + + cudaSetDevice + cudaGetDeviceCount + cudaGetDeviceProperties + cudaDriverGetVersion + cudaRuntimeGetVersion + + + whole + + ./ + ../ + ../../common/inc + + + CUDA Runtime API + Device Query + + + + + + + + true + deviceQuery.cpp + + 1:CUDA Basic Topics + + sm30 + sm35 + sm37 + sm50 + sm52 + sm60 + sm61 + sm70 + + + x86_64 + linux + + + windows7 + + + x86_64 + macosx + + + arm + + + aarch64 + + + ppc64le + linux + + + + all + + Device Query + exe + diff --git a/Samples/deviceQuery/README.md b/Samples/deviceQuery/README.md new file mode 100644 index 00000000..5b2910dd --- /dev/null +++ b/Samples/deviceQuery/README.md @@ -0,0 +1,94 @@ +# deviceQuery - Device Query + +## Description + +This sample enumerates the properties of the CUDA devices present in the system. 
+ +## Key Concepts + +CUDA Runtime API, Device Query + +## Supported SM Architectures + +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows, MacOSX + +## Supported CPU Architecture + +x86_64, ppc64le, armv7l, aarch64 + +## CUDA APIs involved + +### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) +cudaSetDevice, cudaGetDeviceCount, cudaGetDeviceProperties, cudaDriverGetVersion, cudaRuntimeGetVersion + +## Prerequisites + +Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs<version>.sln - for Visual Studio <version> +``` +Each individual sample has its own set of solution files in its directory. + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. +> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details. + +### Linux +The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd <sample_dir> +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. 
Allowed architectures are x86_64, ppc64le, armv7l, aarch64. + By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is equivalent to setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
`$ make TARGET_ARCH=aarch64`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +### Mac +The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: +``` +$ cd <sample_dir> +$ make +``` + +The samples makefiles can take advantage of certain options: + +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` + +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=<host_compiler>** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. + ``` + $ make HOST_COMPILER=clang + ``` + +## References (for more details) + + diff --git a/Samples/deviceQuery/deviceQuery.cpp b/Samples/deviceQuery/deviceQuery.cpp new file mode 100644 index 00000000..7c11cd86 --- /dev/null +++ b/Samples/deviceQuery/deviceQuery.cpp @@ -0,0 +1,338 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* This sample queries the properties of the CUDA devices present in the system + * via CUDA Runtime API. 
*/ + +// std::system includes + +#include +#include + +#include +#include +#include + +int *pArgc = NULL; +char **pArgv = NULL; + +#if CUDART_VERSION < 5000 + +// CUDA-C includes +#include + +// This function wraps the CUDA Driver API into a template function +template +inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, + int device) { + CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device); + + if (CUDA_SUCCESS != error) { + fprintf( + stderr, + "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n", + error, __FILE__, __LINE__); + + exit(EXIT_FAILURE); + } +} + +#endif /* CUDART_VERSION < 5000 */ + +//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) { + pArgc = &argc; + pArgv = argv; + + printf("%s Starting...\n\n", argv[0]); + printf( + " CUDA Device Query (Runtime API) version (CUDART static linking)\n\n"); + + int deviceCount = 0; + cudaError_t error_id = cudaGetDeviceCount(&deviceCount); + + if (error_id != cudaSuccess) { + printf("cudaGetDeviceCount returned %d\n-> %s\n", + static_cast(error_id), cudaGetErrorString(error_id)); + printf("Result = FAIL\n"); + exit(EXIT_FAILURE); + } + + // This function call returns 0 if there are no CUDA capable devices. 
+ if (deviceCount == 0) { + printf("There are no available device(s) that support CUDA\n"); + } else { + printf("Detected %d CUDA Capable device(s)\n", deviceCount); + } + + int dev, driverVersion = 0, runtimeVersion = 0; + + for (dev = 0; dev < deviceCount; ++dev) { + cudaSetDevice(dev); + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, dev); + + printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name); + + // Console log + cudaDriverGetVersion(&driverVersion); + cudaRuntimeGetVersion(&runtimeVersion); + printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", + driverVersion / 1000, (driverVersion % 100) / 10, + runtimeVersion / 1000, (runtimeVersion % 100) / 10); + printf(" CUDA Capability Major/Minor version number: %d.%d\n", + deviceProp.major, deviceProp.minor); + + char msg[256]; +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + sprintf_s(msg, sizeof(msg), + " Total amount of global memory: %.0f MBytes " + "(%llu bytes)\n", + static_cast(deviceProp.totalGlobalMem / 1048576.0f), + (unsigned long long)deviceProp.totalGlobalMem); +#else + snprintf(msg, sizeof(msg), + " Total amount of global memory: %.0f MBytes " + "(%llu bytes)\n", + static_cast(deviceProp.totalGlobalMem / 1048576.0f), + (unsigned long long)deviceProp.totalGlobalMem); +#endif + printf("%s", msg); + + printf(" (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n", + deviceProp.multiProcessorCount, + _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), + _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * + deviceProp.multiProcessorCount); + printf( + " GPU Max Clock rate: %.0f MHz (%0.2f " + "GHz)\n", + deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f); + +#if CUDART_VERSION >= 5000 + // This is supported in CUDA 5.0 (runtime API device properties) + printf(" Memory Clock rate: %.0f Mhz\n", + deviceProp.memoryClockRate * 1e-3f); + printf(" Memory Bus Width: %d-bit\n", + deviceProp.memoryBusWidth); + + if 
(deviceProp.l2CacheSize) { + printf(" L2 Cache Size: %d bytes\n", + deviceProp.l2CacheSize); + } + +#else + // This only available in CUDA 4.0-4.2 (but these were only exposed in the + // CUDA Driver API) + int memoryClock; + getCudaAttribute(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, + dev); + printf(" Memory Clock rate: %.0f Mhz\n", + memoryClock * 1e-3f); + int memBusWidth; + getCudaAttribute(&memBusWidth, + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev); + printf(" Memory Bus Width: %d-bit\n", + memBusWidth); + int L2CacheSize; + getCudaAttribute(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev); + + if (L2CacheSize) { + printf(" L2 Cache Size: %d bytes\n", + L2CacheSize); + } + +#endif + + printf( + " Maximum Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d, " + "%d), 3D=(%d, %d, %d)\n", + deviceProp.maxTexture1D, deviceProp.maxTexture2D[0], + deviceProp.maxTexture2D[1], deviceProp.maxTexture3D[0], + deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]); + printf( + " Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n", + deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]); + printf( + " Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d " + "layers\n", + deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], + deviceProp.maxTexture2DLayered[2]); + + printf(" Total amount of constant memory: %lu bytes\n", + deviceProp.totalConstMem); + printf(" Total amount of shared memory per block: %lu bytes\n", + deviceProp.sharedMemPerBlock); + printf(" Total number of registers available per block: %d\n", + deviceProp.regsPerBlock); + printf(" Warp size: %d\n", + deviceProp.warpSize); + printf(" Maximum number of threads per multiprocessor: %d\n", + deviceProp.maxThreadsPerMultiProcessor); + printf(" Maximum number of threads per block: %d\n", + deviceProp.maxThreadsPerBlock); + printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n", + deviceProp.maxThreadsDim[0], 
deviceProp.maxThreadsDim[1], + deviceProp.maxThreadsDim[2]); + printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n", + deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], + deviceProp.maxGridSize[2]); + printf(" Maximum memory pitch: %lu bytes\n", + deviceProp.memPitch); + printf(" Texture alignment: %lu bytes\n", + deviceProp.textureAlignment); + printf( + " Concurrent copy and kernel execution: %s with %d copy " + "engine(s)\n", + (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount); + printf(" Run time limit on kernels: %s\n", + deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No"); + printf(" Integrated GPU sharing Host Memory: %s\n", + deviceProp.integrated ? "Yes" : "No"); + printf(" Support host page-locked memory mapping: %s\n", + deviceProp.canMapHostMemory ? "Yes" : "No"); + printf(" Alignment requirement for Surfaces: %s\n", + deviceProp.surfaceAlignment ? "Yes" : "No"); + printf(" Device has ECC support: %s\n", + deviceProp.ECCEnabled ? "Enabled" : "Disabled"); +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n", + deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" + : "WDDM (Windows Display Driver Model)"); +#endif + printf(" Device supports Unified Addressing (UVA): %s\n", + deviceProp.unifiedAddressing ? "Yes" : "No"); + printf(" Device supports Compute Preemption: %s\n", + deviceProp.computePreemptionSupported ? "Yes" : "No"); + printf(" Supports Cooperative Kernel Launch: %s\n", + deviceProp.cooperativeLaunch ? "Yes" : "No"); + printf(" Supports MultiDevice Co-op Kernel Launch: %s\n", + deviceProp.cooperativeMultiDeviceLaunch ? 
"Yes" : "No"); + printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n", + deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID); + + const char *sComputeMode[] = { + "Default (multiple host threads can use ::cudaSetDevice() with device " + "simultaneously)", + "Exclusive (only one host thread in one process is able to use " + "::cudaSetDevice() with this device)", + "Prohibited (no host thread can use ::cudaSetDevice() with this " + "device)", + "Exclusive Process (many threads in one process is able to use " + "::cudaSetDevice() with this device)", + "Unknown", + NULL}; + printf(" Compute Mode:\n"); + printf(" < %s >\n", sComputeMode[deviceProp.computeMode]); + } + + // If there are 2 or more GPUs, query to determine whether RDMA is supported + if (deviceCount >= 2) { + cudaDeviceProp prop[64]; + int gpuid[64]; // we want to find the first two GPUs that can support P2P + int gpu_p2p_count = 0; + + for (int i = 0; i < deviceCount; i++) { + checkCudaErrors(cudaGetDeviceProperties(&prop[i], i)); + + // Only boards based on Fermi or later can support P2P + if ((prop[i].major >= 2) +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + // on Windows (64-bit), the Tesla Compute Cluster driver for windows + // must be enabled to support this + && prop[i].tccDriver +#endif + ) { + // This is an array of P2P capable GPUs + gpuid[gpu_p2p_count++] = i; + } + } + + // Show all the combinations of support P2P GPUs + int can_access_peer; + + if (gpu_p2p_count >= 2) { + for (int i = 0; i < gpu_p2p_count; i++) { + for (int j = 0; j < gpu_p2p_count; j++) { + if (gpuid[i] == gpuid[j]) { + continue; + } + checkCudaErrors( + cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j])); + printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", + prop[gpuid[i]].name, gpuid[i], prop[gpuid[j]].name, gpuid[j], + can_access_peer ? 
"Yes" : "No"); + } + } + } + } + + // csv masterlog info + // ***************************** + // exe and CUDA driver name + printf("\n"); + std::string sProfileString = "deviceQuery, CUDA Driver = CUDART"; + char cTemp[16]; + + // driver version + sProfileString += ", CUDA Driver Version = "; +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10); +#else + snprintf(cTemp, sizeof(cTemp), "%d.%d", driverVersion / 1000, + (driverVersion % 100) / 10); +#endif + sProfileString += cTemp; + + // Runtime version + sProfileString += ", CUDA Runtime Version = "; +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10); +#else + snprintf(cTemp, sizeof(cTemp), "%d.%d", runtimeVersion / 1000, + (runtimeVersion % 100) / 10); +#endif + sProfileString += cTemp; + + // Device count + sProfileString += ", NumDevs = "; +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + sprintf_s(cTemp, 10, "%d", deviceCount); +#else + snprintf(cTemp, sizeof(cTemp), "%d", deviceCount); +#endif + sProfileString += cTemp; + sProfileString += "\n"; + printf("%s", sProfileString.c_str()); + + printf("Result = PASS\n"); + + // finish + exit(EXIT_SUCCESS); +} diff --git a/Samples/deviceQuery/deviceQuery_vs2010.sln b/Samples/deviceQuery/deviceQuery_vs2010.sln new file mode 100644 index 00000000..7aa6dd37 --- /dev/null +++ b/Samples/deviceQuery/deviceQuery_vs2010.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 11.00 +# Visual Studio 2010 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "deviceQuery", "deviceQuery_vs2010.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + 
GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/deviceQuery/deviceQuery_vs2010.vcxproj b/Samples/deviceQuery/deviceQuery_vs2010.vcxproj new file mode 100644 index 00000000..06258954 --- /dev/null +++ b/Samples/deviceQuery/deviceQuery_vs2010.vcxproj @@ -0,0 +1,106 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + deviceQuery_vs2010 + deviceQuery + + + + + Application + MultiByte + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/deviceQuery.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/deviceQuery/deviceQuery_vs2012.sln b/Samples/deviceQuery/deviceQuery_vs2012.sln new file mode 100644 index 00000000..1737da5f --- /dev/null +++ 
b/Samples/deviceQuery/deviceQuery_vs2012.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2012 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "deviceQuery", "deviceQuery_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/deviceQuery/deviceQuery_vs2012.vcxproj b/Samples/deviceQuery/deviceQuery_vs2012.vcxproj new file mode 100644 index 00000000..c827a82f --- /dev/null +++ b/Samples/deviceQuery/deviceQuery_vs2012.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + deviceQuery_vs2012 + deviceQuery + + + + + Application + MultiByte + v110 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/deviceQuery.exe + + + 
compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/deviceQuery/deviceQuery_vs2013.sln b/Samples/deviceQuery/deviceQuery_vs2013.sln new file mode 100644 index 00000000..51c58f49 --- /dev/null +++ b/Samples/deviceQuery/deviceQuery_vs2013.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 13.00 +# Visual Studio 2013 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "deviceQuery", "deviceQuery_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/deviceQuery/deviceQuery_vs2013.vcxproj b/Samples/deviceQuery/deviceQuery_vs2013.vcxproj new file mode 100644 index 00000000..f01ab104 --- /dev/null +++ b/Samples/deviceQuery/deviceQuery_vs2013.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + deviceQuery_vs2013 + deviceQuery + + + + + Application + MultiByte + v120 + + + true + + + true + + + + + + + + + + + 
$(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/deviceQuery.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/deviceQuery/deviceQuery_vs2015.sln b/Samples/deviceQuery/deviceQuery_vs2015.sln new file mode 100644 index 00000000..46da5435 --- /dev/null +++ b/Samples/deviceQuery/deviceQuery_vs2015.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 14.00 +# Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "deviceQuery", "deviceQuery_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git 
a/Samples/deviceQuery/deviceQuery_vs2015.vcxproj b/Samples/deviceQuery/deviceQuery_vs2015.vcxproj new file mode 100644 index 00000000..3b5128bd --- /dev/null +++ b/Samples/deviceQuery/deviceQuery_vs2015.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + deviceQuery_vs2015 + deviceQuery + + + + + Application + MultiByte + v140 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/deviceQuery.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/deviceQuery/deviceQuery_vs2017.sln b/Samples/deviceQuery/deviceQuery_vs2017.sln new file mode 100644 index 00000000..da9ed199 --- /dev/null +++ b/Samples/deviceQuery/deviceQuery_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "deviceQuery", "deviceQuery_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = 
postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/deviceQuery/deviceQuery_vs2017.vcxproj b/Samples/deviceQuery/deviceQuery_vs2017.vcxproj new file mode 100644 index 00000000..fe6e0cc1 --- /dev/null +++ b/Samples/deviceQuery/deviceQuery_vs2017.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + deviceQuery_vs2017 + deviceQuery + + + + + Application + MultiByte + v141 + 10.0.15063.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/deviceQuery.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/matrixMul/Makefile b/Samples/matrixMul/Makefile new file mode 100644 index 00000000..e884aa52 --- /dev/null +++ b/Samples/matrixMul/Makefile @@ -0,0 +1,287 @@ 
+################################################################################ +# +# Copyright 1993-2015 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO USER: +# +# This source code is subject to NVIDIA ownership rights under U.S. and +# international Copyright laws. +# +# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE +# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR +# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH +# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +# OR PERFORMANCE OF THIS SOURCE CODE. +# +# U.S. Government End Users. This source code is a "commercial item" as +# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +# "commercial computer software" and "commercial computer software +# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) +# and is provided to the U.S. Government only as a commercial end item. +# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the +# source code with only those rights set forth herein. 
+# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) 
+endif +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= 
aarch64-linux-android-g++ + endif + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif +endif + +ifeq ($(TARGET_OS),qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM + LDFLAGS += -lsocket +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + 
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +# Gencode arguments +SMS ?= 30 35 37 50 52 60 61 70 + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: matrixMul + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +matrixMul.o:matrixMul.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +matrixMul: matrixMul.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) 
cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./matrixMul + +clean: + rm -f matrixMul matrixMul.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/matrixMul + +clobber: clean diff --git a/Samples/matrixMul/NsightEclipse.xml b/Samples/matrixMul/NsightEclipse.xml new file mode 100644 index 00000000..4132ceb3 --- /dev/null +++ b/Samples/matrixMul/NsightEclipse.xml @@ -0,0 +1,76 @@ + + + + matrixMul + + cudaEventCreate + cudaEventRecord + cudaEventQuery + cudaEventDestroy + cudaEventElapsedTime + cudaEventSynchronize + cudaMalloc + cudaFree + cudaMemcpy + + + whole + + ./ + ../ + ../../common/inc + + + CUDA Runtime API + Linear Algebra + + + CUDA + matrix multiply + + + + + + true + matrixMul.cu + + 1:CUDA Basic Topics + 3:Linear Algebra + + sm30 + sm35 + sm37 + sm50 + sm52 + sm60 + sm61 + sm70 + + + x86_64 + linux + + + windows7 + + + x86_64 + macosx + + + arm + + + aarch64 + + + ppc64le + linux + + + + all + + Matrix Multiplication (CUDA Runtime API Version) + diff --git a/Samples/matrixMul/README.md b/Samples/matrixMul/README.md new file mode 100644 index 00000000..c6180676 --- /dev/null +++ b/Samples/matrixMul/README.md @@ -0,0 +1,94 @@ +# matrixMul - Matrix Multiplication (CUDA Runtime API Version) + +## Description + +This sample implements matrix multiplication and is exactly the same as Chapter 6 of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication. To illustrate GPU performance for matrix multiply, this sample also shows how to use the new CUDA 4.0 interface for CUBLAS to demonstrate high-performance matrix multiplication. 
+ +## Key Concepts + +CUDA Runtime API, Linear Algebra + +## Supported SM Architectures + +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows, MacOSX + +## Supported CPU Architecture + +x86_64, ppc64le, armv7l, aarch64 + +## CUDA APIs involved + +### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) +cudaEventCreate, cudaEventRecord, cudaEventQuery, cudaEventDestroy, cudaEventElapsedTime, cudaEventSynchronize, cudaMalloc, cudaFree, cudaMemcpy + +## Prerequisites + +Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs<version>.sln - for Visual Studio <version> +``` +Each individual sample has its own set of solution files in its directory. + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. +> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details. + +### Linux +The Linux samples are built using makefiles. 
To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd <sample_dir> +$ make +``` +The sample makefiles can take advantage of certain options: +* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l, aarch64. + By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
`$ make TARGET_ARCH=aarch64`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +### Mac +The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: +``` +$ cd <sample_dir> +$ make +``` + +The sample makefiles can take advantage of certain options: + +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` + +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=<host_compiler>** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. + ``` + $ make HOST_COMPILER=clang + ``` + +## References (for more details) + diff --git a/Samples/matrixMul/matrixMul.cu b/Samples/matrixMul/matrixMul.cu new file mode 100644 index 00000000..d913246c --- /dev/null +++ b/Samples/matrixMul/matrixMul.cu @@ -0,0 +1,348 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * Matrix multiplication: C = A * B. + * Host code. + * + * This sample implements matrix multiplication which makes use of shared memory + * to ensure data reuse, the matrix multiplication is done using tiling approach. + * It has been written for clarity of exposition to illustrate various CUDA programming + * principles, not with the goal of providing the most performant generic kernel for matrix multiplication. + * See also: + * V. Volkov and J. 
Demmel, "Benchmarking GPUs to tune dense linear algebra,"
 *    in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08),
 *    Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11.
 */

// System includes
#include <stdio.h>
#include <assert.h>

// CUDA runtime
#include <cuda_runtime.h>

// Helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>

/**
 * Matrix multiplication (CUDA Kernel) on the device: C = A * B
 * wA is A's width and wB is B's width
 *
 * Each thread block produces one BLOCK_SIZE x BLOCK_SIZE tile of C and each
 * thread produces one element of that tile; tiles of A and B are staged
 * through shared memory so every loaded value is reused BLOCK_SIZE times.
 *
 * The kernel performs no bounds checks: it must be launched with
 * BLOCK_SIZE x BLOCK_SIZE threads per block, and wA, wB and the height of A
 * must all be multiples of BLOCK_SIZE (MatrixMultiply() enforces this).
 */
template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A,
                                                        float *B, int wA,
                                                        int wB) {
  // Block index
  int bx = blockIdx.x;
  int by = blockIdx.y;

  // Thread index
  int tx = threadIdx.x;
  int ty = threadIdx.y;

  // Index of the first sub-matrix of A processed by the block
  int aBegin = wA * BLOCK_SIZE * by;

  // Index of the last sub-matrix of A processed by the block
  int aEnd = aBegin + wA - 1;

  // Step size used to iterate through the sub-matrices of A
  int aStep = BLOCK_SIZE;

  // Index of the first sub-matrix of B processed by the block
  int bBegin = BLOCK_SIZE * bx;

  // Step size used to iterate through the sub-matrices of B
  int bStep = BLOCK_SIZE * wB;

  // Csub is used to store the element of the block sub-matrix
  // that is computed by the thread
  float Csub = 0;

  // Loop over all the sub-matrices of A and B
  // required to compute the block sub-matrix
  for (int a = aBegin, b = bBegin;
       a <= aEnd;
       a += aStep, b += bStep) {
    // Declaration of the shared memory array As used to
    // store the sub-matrix of A
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

    // Declaration of the shared memory array Bs used to
    // store the sub-matrix of B
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    // Load the matrices from device memory
    // to shared memory; each thread loads
    // one element of each matrix
    As[ty][tx] = A[a + wA * ty + tx];
    Bs[ty][tx] = B[b + wB * ty + tx];

    // Synchronize to make sure the matrices are loaded
    __syncthreads();

    // Multiply the two matrices together;
    // each thread computes one element
    // of the block sub-matrix
#pragma unroll

    for (int k = 0; k < BLOCK_SIZE; ++k) {
      Csub += As[ty][k] * Bs[k][tx];
    }

    // Synchronize to make sure that the preceding
    // computation is done before loading two new
    // sub-matrices of A and B in the next iteration
    __syncthreads();
  }

  // Write the block sub-matrix to device memory;
  // each thread writes one element
  int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
  C[c + wB * ty + tx] = Csub;
}

/**
 * Fill the first `size` elements of `data` with the constant `val`.
 */
void ConstantInit(float *data, int size, float val) {
  for (int i = 0; i < size; ++i) {
    data[i] = val;
  }
}

/**
 * Run a simple test of matrix multiplication using CUDA
 *
 * Multiplies a constant matrix A (all 1.0) by a constant matrix B (all 0.01)
 * on the device, times 300 kernel launches, prints the measured throughput
 * and verifies every element of the result against dimsA.x * 0.01.
 *
 * argc/argv are accepted for interface compatibility and are not used.
 * block_size must be 16 or 32 (the only kernel instantiations launched here);
 * dimsA.x must equal dimsB.y, and dimsA.x, dimsA.y and dimsB.x must be
 * non-zero multiples of block_size because the kernel has no bounds checks.
 *
 * Returns EXIT_SUCCESS when the result verifies, EXIT_FAILURE when it does
 * not or when the arguments are rejected. Allocation and CUDA failures
 * terminate the process.
 */
int MatrixMultiply(int argc, char **argv,
                   int block_size, const dim3 &dimsA,
                   const dim3 &dimsB) {
  (void)argc;
  (void)argv;

  // Only the <16> and <32> instantiations are launched below; any other
  // value would run the <32> kernel with a mismatched thread block.
  if (block_size != 16 && block_size != 32) {
    fprintf(stderr, "Error: unsupported block size %d (expected 16 or 32)\n",
            block_size);
    return EXIT_FAILURE;
  }

  const unsigned int tile = static_cast<unsigned int>(block_size);

  if (dimsA.x != dimsB.y) {
    fprintf(stderr,
            "Error: outer matrix dimensions must be equal. (%u != %u)\n",
            dimsA.x, dimsB.y);
    return EXIT_FAILURE;
  }

  // The grid is computed with truncating division and the kernel reads and
  // writes whole tiles, so partial tiles would be skipped or read out of
  // bounds.
  if (dimsA.x == 0 || dimsA.y == 0 || dimsB.x == 0 ||
      dimsA.x % tile != 0 || dimsA.y % tile != 0 || dimsB.x % tile != 0) {
    fprintf(stderr,
            "Error: matrix dimensions must be non-zero multiples of the "
            "block size (%d)\n", block_size);
    return EXIT_FAILURE;
  }

  // Allocate host memory for matrices A and B
  unsigned int size_A = dimsA.x * dimsA.y;
  unsigned int mem_size_A = sizeof(float) * size_A;
  float *h_A = reinterpret_cast<float *>(malloc(mem_size_A));

  if (h_A == NULL) {
    fprintf(stderr, "Failed to allocate host matrix A!\n");
    exit(EXIT_FAILURE);
  }

  unsigned int size_B = dimsB.x * dimsB.y;
  unsigned int mem_size_B = sizeof(float) * size_B;
  float *h_B = reinterpret_cast<float *>(malloc(mem_size_B));

  if (h_B == NULL) {
    fprintf(stderr, "Failed to allocate host matrix B!\n");
    exit(EXIT_FAILURE);
  }

  // Initialize host memory
  const float valB = 0.01f;
  ConstantInit(h_A, size_A, 1.0f);
  ConstantInit(h_B, size_B, valB);

  // Allocate device memory
  float *d_A, *d_B, *d_C;

  // Allocate host matrix C
  dim3 dimsC(dimsB.x, dimsA.y, 1);
  unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
  float *h_C = reinterpret_cast<float *>(malloc(mem_size_C));

  if (h_C == NULL) {
    fprintf(stderr, "Failed to allocate host matrix C!\n");
    exit(EXIT_FAILURE);
  }

  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A));

  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B));

  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C));

  // copy host memory to device
  checkCudaErrors(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice));

  checkCudaErrors(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice));

  // Setup execution parameters
  dim3 threads(block_size, block_size);
  dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);

  // Create and start timer
  printf("Computing result using CUDA Kernel...\n");

  // Performs warmup operation using matrixMul CUDA kernel
  if (block_size == 16) {
    MatrixMulCUDA<16> <<< grid, threads >>>(d_C, d_A, d_B,
                                            dimsA.x, dimsB.x);
  } else {
    MatrixMulCUDA<32> <<< grid, threads >>>(d_C, d_A, d_B,
                                            dimsA.x, dimsB.x);
  }

  printf("done\n");

  // Checking the result here surfaces launch/execution errors from the
  // warm-up run before any timing is reported.
  checkCudaErrors(cudaDeviceSynchronize());

  // Allocate CUDA events that we'll use for timing
  cudaEvent_t start;
  checkCudaErrors(cudaEventCreate(&start));

  cudaEvent_t stop;
  checkCudaErrors(cudaEventCreate(&stop));

  // Record the start event
  checkCudaErrors(cudaEventRecord(start, NULL));

  // Execute the kernel
  int nIter = 300;

  for (int j = 0; j < nIter; j++) {
    if (block_size == 16) {
      MatrixMulCUDA<16> <<< grid, threads >>>(d_C, d_A, d_B,
                                              dimsA.x, dimsB.x);
    } else {
      MatrixMulCUDA<32> <<< grid, threads >>>(d_C, d_A, d_B,
                                              dimsA.x, dimsB.x);
    }
  }

  // Record the stop event
  checkCudaErrors(cudaEventRecord(stop, NULL));

  // Wait for the stop event to complete
  checkCudaErrors(cudaEventSynchronize(stop));

  float msecTotal = 0.0f;
  checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));

  // The events are no longer needed once the elapsed time has been read
  checkCudaErrors(cudaEventDestroy(start));
  checkCudaErrors(cudaEventDestroy(stop));

  // Compute and print the performance
  float msecPerMatrixMul = msecTotal / nIter;
  double flopsPerMatrixMul = 2.0 * static_cast<double>(dimsA.x) *
                             static_cast<double>(dimsA.y) *
                             static_cast<double>(dimsB.x);
  double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) /
                     (msecPerMatrixMul / 1000.0f);
  printf(
      "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
      " WorkgroupSize= %u threads/block\n",
      gigaFlops,
      msecPerMatrixMul,
      flopsPerMatrixMul,
      threads.x * threads.y);

  // Copy result from device to host
  checkCudaErrors(cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost));

  printf("Checking computed result for correctness: ");
  bool correct = true;

  // test relative error by the formula
  //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
  double eps = 1.e-6;  // machine zero

  for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++) {
    double abs_err = fabs(h_C[i] - (dimsA.x * valB));
    double dot_length = dimsA.x;
    double abs_val = fabs(h_C[i]);
    double rel_err = abs_err / abs_val / dot_length;

    // Written as a negated <= so that a NaN result is reported as an error
    // instead of slipping through a plain > comparison.
    if (!(rel_err <= eps)) {
      printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n",
             i, h_C[i], dimsA.x * valB, eps);
      correct = false;
    }
  }

  printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

  // Clean up memory
  free(h_A);
  free(h_B);
  free(h_C);
  checkCudaErrors(cudaFree(d_A));
  checkCudaErrors(cudaFree(d_B));
  checkCudaErrors(cudaFree(d_C));

  printf("\nNOTE: The CUDA Samples are not meant for performance "
         "measurements. Results may vary when GPU Boost is enabled.\n");

  if (correct) {
    return EXIT_SUCCESS;
  } else {
    return EXIT_FAILURE;
  }
}


/**
 * Program main
 *
 * Parses the optional -device, -wA, -hA, -wB and -hB command-line flags,
 * runs MatrixMultiply() with a fixed block size of 32 and exits with its
 * status.
 */
int main(int argc, char **argv) {
  printf("[Matrix Multiply Using CUDA] - Starting...\n");

  if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
      checkCmdLineFlag(argc, (const char **)argv, "?")) {
    printf("Usage -device=n (n >= 0 for deviceID)\n");
    printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
    printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
    printf(" Note: Outer matrix dimensions of A & B matrices"
           " must be equal.\n");

    exit(EXIT_SUCCESS);
  }

  // This will pick the best possible CUDA capable device, otherwise
  // override the device ID based on input provided at the command line.
  // The returned device ID is not needed afterwards.
  findCudaDevice(argc, (const char **)argv);

  int block_size = 32;

  dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
  dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);

  // width of Matrix A
  if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
    dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
  }

  // height of Matrix A
  if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
    dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
  }

  // width of Matrix B
  if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
    dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
  }

  // height of Matrix B
  if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
    dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
  }

  if (dimsA.x != dimsB.y) {
    printf("Error: outer matrix dimensions must be equal. (%u != %u)\n",
           dimsA.x, dimsB.y);
    exit(EXIT_FAILURE);
  }

  // dim3 members are unsigned, hence %u
  printf("MatrixA(%u,%u), MatrixB(%u,%u)\n", dimsA.x, dimsA.y,
         dimsB.x, dimsB.y);

  int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB);

  exit(matrix_result);
}
 diff --git a/Samples/matrixMul/matrixMul_vs2010.sln b/Samples/matrixMul/matrixMul_vs2010.sln new file mode 100644 index 00000000..99cfcad9 --- /dev/null +++ b/Samples/matrixMul/matrixMul_vs2010.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 11.00 +# Visual Studio 2010 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "matrixMul", "matrixMul_vs2010.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/matrixMul/matrixMul_vs2010.vcxproj
b/Samples/matrixMul/matrixMul_vs2010.vcxproj new file mode 100644 index 00000000..dc6c0653 --- /dev/null +++ b/Samples/matrixMul/matrixMul_vs2010.vcxproj @@ -0,0 +1,106 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + matrixMul_vs2010 + matrixMul + + + + + Application + MultiByte + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/matrixMul.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/matrixMul/matrixMul_vs2012.sln b/Samples/matrixMul/matrixMul_vs2012.sln new file mode 100644 index 00000000..fbbf6070 --- /dev/null +++ b/Samples/matrixMul/matrixMul_vs2012.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2012 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "matrixMul", "matrixMul_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 
+ {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/matrixMul/matrixMul_vs2012.vcxproj b/Samples/matrixMul/matrixMul_vs2012.vcxproj new file mode 100644 index 00000000..660cc790 --- /dev/null +++ b/Samples/matrixMul/matrixMul_vs2012.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + matrixMul_vs2012 + matrixMul + + + + + Application + MultiByte + v110 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/matrixMul.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/matrixMul/matrixMul_vs2013.sln b/Samples/matrixMul/matrixMul_vs2013.sln new file mode 100644 index 00000000..1c5c1999 --- /dev/null +++ b/Samples/matrixMul/matrixMul_vs2013.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 13.00 +# Visual Studio 2013 
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "matrixMul", "matrixMul_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/matrixMul/matrixMul_vs2013.vcxproj b/Samples/matrixMul/matrixMul_vs2013.vcxproj new file mode 100644 index 00000000..fb59ec7e --- /dev/null +++ b/Samples/matrixMul/matrixMul_vs2013.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + matrixMul_vs2013 + matrixMul + + + + + Application + MultiByte + v120 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/matrixMul.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 
+ + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/matrixMul/matrixMul_vs2015.sln b/Samples/matrixMul/matrixMul_vs2015.sln new file mode 100644 index 00000000..78d50b32 --- /dev/null +++ b/Samples/matrixMul/matrixMul_vs2015.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 14.00 +# Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "matrixMul", "matrixMul_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/matrixMul/matrixMul_vs2015.vcxproj b/Samples/matrixMul/matrixMul_vs2015.vcxproj new file mode 100644 index 00000000..88350f3a --- /dev/null +++ b/Samples/matrixMul/matrixMul_vs2015.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + matrixMul_vs2015 + matrixMul + + + + + Application + MultiByte + v140 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + 
cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/matrixMul.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/matrixMul/matrixMul_vs2017.sln b/Samples/matrixMul/matrixMul_vs2017.sln new file mode 100644 index 00000000..f9a44def --- /dev/null +++ b/Samples/matrixMul/matrixMul_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "matrixMul", "matrixMul_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/matrixMul/matrixMul_vs2017.vcxproj b/Samples/matrixMul/matrixMul_vs2017.vcxproj new file mode 100644 index 00000000..7dcf0770 --- /dev/null +++ b/Samples/matrixMul/matrixMul_vs2017.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + 
x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + matrixMul_vs2017 + matrixMul + + + + + Application + MultiByte + v141 + 10.0.15063.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/matrixMul.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/matrixMulDrv/Makefile b/Samples/matrixMulDrv/Makefile new file mode 100644 index 00000000..e7ddd0db --- /dev/null +++ b/Samples/matrixMulDrv/Makefile @@ -0,0 +1,344 @@ +################################################################################ +# +# Copyright 1993-2015 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO USER: +# +# This source code is subject to NVIDIA ownership rights under U.S. and +# international Copyright laws. +# +# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE +# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR +# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH +# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +# OR PERFORMANCE OF THIS SOURCE CODE. +# +# U.S. Government End Users. This source code is a "commercial item" as +# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +# "commercial computer software" and "commercial computer software +# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) +# and is provided to the U.S. Government only as a commercial end item. +# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the +# source code with only those rights set forth herein. +# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC 
variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) +endif +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' 
-f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= aarch64-linux-android-g++ + endif + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + 
ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif +endif + +ifeq ($(TARGET_OS),qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM + LDFLAGS += -lsocket +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu) + +SAMPLE_ENABLED := 1 + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +PTX_FILE := matrixMul_kernel${TARGET_SIZE}.ptx + +# Gencode arguments 
+SMS ?= + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +ifeq ($(SMS),) +# Generate PTX code from SM 30 +GENCODE_FLAGS += -gencode arch=compute_30,code=compute_30 +endif + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ifeq ($(TARGET_OS),darwin) + ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA +else + ifeq ($(TARGET_ARCH),x86_64) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs + CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs + endif + + ifeq ($(TARGET_ARCH),ppc64le) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs + endif + + CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null) + ifeq ("$(CUDALIB)","") + $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed. Please re-install the driver. 
<<<) + SAMPLE_ENABLED := 0 + endif + + LIBRARIES += -lcuda +endif + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: matrixMulDrv $(PTX_FILE) + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +$(PTX_FILE): matrixMul_kernel.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -ptx $< + $(EXEC) mkdir -p data + $(EXEC) cp -f $@ ./data + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp -f $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +matrixMulDrv.o:matrixMulDrv.cpp + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +matrixMulDrv: matrixMulDrv.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./matrixMulDrv + +clean: + rm -f matrixMulDrv matrixMulDrv.o data/$(PTX_FILE) $(PTX_FILE) + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/matrixMulDrv + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/$(PTX_FILE) + +clobber: clean diff --git a/Samples/matrixMulDrv/README.md b/Samples/matrixMulDrv/README.md new file mode 100644 index 00000000..a510b31c --- /dev/null +++ b/Samples/matrixMulDrv/README.md @@ -0,0 +1,94 @@ +# matrixMulDrv - Matrix Multiplication (CUDA Driver API Version) + +## Description + +This sample implements matrix multiplication and uses the new CUDA 4.0 kernel launch Driver API. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication. 
CUBLAS provides high-performance matrix multiplication. +## Key Concepts + +CUDA Driver API, Matrix Multiply + +## Supported SM Architectures + +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows, MacOSX + +## Supported CPU Architecture + +x86_64, ppc64le, armv7l, aarch64 + +## CUDA APIs involved + +### [CUDA Driver API](http://docs.nvidia.com/cuda/cuda-driver-api/index.html) +cuModuleLoad, cuModuleLoadDataEx, cuModuleGetFunction, cuMemAlloc, cuMemFree, cuMemcpyHtoD, cuMemcpyDtoH, cuLaunchKernel + +## Prerequisites + +Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs<version>.sln - for Visual Studio <version> +``` +Each individual sample has its own set of solution files in its directory: + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. +> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details. + +### Linux +The Linux samples are built using makefiles. 
To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd <sample_dir> +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l, aarch64. + By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
`$ make TARGET_ARCH=aarch64`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +### Mac +The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: +``` +$ cd <sample_dir> +$ make +``` + +The samples makefiles can take advantage of certain options: + +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` + +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=<host_compiler>** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. + ``` + $ make HOST_COMPILER=clang + ``` + +## References (for more details) + diff --git a/Samples/matrixMulDrv/matrixMul.h b/Samples/matrixMulDrv/matrixMul.h new file mode 100644 index 00000000..9147b82f --- /dev/null +++ b/Samples/matrixMulDrv/matrixMul.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _MATRIXMUL_H_ +#define _MATRIXMUL_H_ + +// Matrix dimensions +// (chosen as multiples of the thread block size for simplicity) +#define WA (4 * block_size) // Matrix A width +#define HA (6 * block_size) // Matrix A height +#define WB (4 * block_size) // Matrix B width +#define HB WA // Matrix B height +#define WC WB // Matrix C width +#define HC HA // Matrix C height + +#endif // _MATRIXMUL_H_ diff --git a/Samples/matrixMulDrv/matrixMulDrv.cpp b/Samples/matrixMulDrv/matrixMulDrv.cpp new file mode 100644 index 00000000..b331d4f8 --- /dev/null +++ b/Samples/matrixMulDrv/matrixMulDrv.cpp @@ -0,0 +1,436 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* Matrix multiplication: C = A * B. + * Host code. + * + * This sample implements matrix multiplication using the CUDA driver API. + * It has been written for clarity of exposition to illustrate various CUDA + * programming principles, not with the goal of providing the most + * performant generic kernel for matrix multiplication. + * + * CUBLAS provides high-performance matrix multiplication. + * See also: + * V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra," + * in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08), + * Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11. + * + * Volkov, V. 2010. Better performance at lower occupancy, + * GPU Technology Conference 2010 (GTC 2010). 
+ * + */ + +// includes, system +#include +#include +#include +#include +#include +#include + +// includes, project +#include +#include +#include +#include + +#include +#include +#include +#include "matrixMul.h" + +// includes, CUDA +const bool use_64bit_memory_address = false; + +//////////////////////////////////////////////////////////////////////////////// +// declaration, forward +void runTest(int argc, char **argv); +void randomInit(float *, int); + +extern "C" void computeGold(float *, const float *, const float *, unsigned int, + unsigned int, unsigned int); + +static CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul); + +// define input ptx file for different platforms +#if defined(_WIN64) || defined(__LP64__) +#define PTX_FILE "matrixMul_kernel64.ptx" +#define CUBIN_FILE "matrixMul_kernel64.cubin" +#else +#define PTX_FILE "matrixMul_kernel32.ptx" +#define CUBIN_FILE "matrixMul_kernel32.cubin" +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Globals +//////////////////////////////////////////////////////////////////////////////// +CUdevice cuDevice; +CUcontext cuContext; +CUmodule cuModule; +size_t totalGlobalMem; + +const char *sSDKsample = "matrixMulDrv (Driver API)"; + +void constantInit(float *data, int size, float val) { + for (int i = 0; i < size; ++i) { + data[i] = val; + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) { + printf("[ %s ]\n", sSDKsample); + + runTest(argc, argv); +} + +//////////////////////////////////////////////////////////////////////////////// +//! 
Run a simple test for CUDA +//////////////////////////////////////////////////////////////////////////////// +void runTest(int argc, char **argv) { + // initialize CUDA + CUfunction matrixMul = NULL; + int block_size = 32; + + CUresult error_id = initCUDA(argc, argv, &matrixMul); + + if (error_id != CUDA_SUCCESS) { + printf("initCUDA() returned %d\n-> %s\n", error_id, + getCudaDrvErrorString(error_id)); + exit(EXIT_FAILURE); + } + + // set seed for rand() + srand(2006); + + // allocate host memory for matrices A and B + unsigned int size_A = WA * HA; + unsigned int mem_size_A = sizeof(float) * size_A; + float *h_A = reinterpret_cast(malloc(mem_size_A)); + unsigned int size_B = WB * HB; + unsigned int mem_size_B = sizeof(float) * size_B; + float *h_B = reinterpret_cast(malloc(mem_size_B)); + + // initialize host memory + const float valB = 0.01f; + constantInit(h_A, size_A, 1.0f); + constantInit(h_B, size_B, valB); + + // First reserve about 4GB of memory, so we ensure that all memory allocated + // afterwards is > 4GB + CUdeviceptr d_Mem[4]; + + if (use_64bit_memory_address) { + unsigned int mem_size = 1024 * 1024 * 1024; + checkCudaErrors(cuMemAlloc(&d_Mem[0], mem_size)); + checkCudaErrors(cuMemAlloc(&d_Mem[1], mem_size)); + checkCudaErrors(cuMemAlloc(&d_Mem[2], mem_size)); + checkCudaErrors(cuMemAlloc(&d_Mem[3], mem_size)); + } + + // allocate device memory + CUdeviceptr d_A; + checkCudaErrors(cuMemAlloc(&d_A, mem_size_A)); + CUdeviceptr d_B; + checkCudaErrors(cuMemAlloc(&d_B, mem_size_B)); + + // copy host memory to device + checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A)); + checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B)); + + // allocate device memory for result + size_t size_C = WC * HC; + size_t mem_size_C = sizeof(float) * size_C; + + CUdeviceptr d_C; + checkCudaErrors(cuMemAlloc(&d_C, mem_size_C)); + + // allocate mem for the result on host side + float *h_C = reinterpret_cast(malloc(mem_size_C)); + + // create and start timer + StopWatchInterface 
*timer = NULL; + sdkCreateTimer(&timer); + + // start the timer + sdkStartTimer(&timer); + + // There are two ways to launch CUDA kernels via the Driver API. + // In this CUDA Sample, we illustrate both ways to pass parameters + // and specify parameters. By default we use the simpler method. + dim3 block(block_size, block_size, 1); + dim3 grid(WC / block_size, HC / block_size, 1); + + if (1) { + // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel + // Launching (simplier method) + if (use_64bit_memory_address && + (totalGlobalMem > (uint64_t)4 * 1024 * 1024 * 1024L)) { + size_t Matrix_Width_A = (size_t)WA; + size_t Matrix_Width_B = (size_t)WB; + void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B}; + // new CUDA 4.0 Driver API Kernel launch call + checkCudaErrors(cuLaunchKernel( + matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z, + 2 * block_size * block_size * sizeof(float), NULL, args, NULL)); + + } else { + int Matrix_Width_A = WA; + int Matrix_Width_B = WB; + void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B}; + // new CUDA 4.0 Driver API Kernel launch call + checkCudaErrors(cuLaunchKernel( + matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z, + 2 * block_size * block_size * sizeof(float), NULL, args, NULL)); + } + + } else { + // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel + // Launching (advanced method) + int offset = 0; + char argBuffer[256]; + + // pass in launch parameters (not actually de-referencing CUdeviceptr). 
+ // CUdeviceptr is storing the value of the parameters + *(reinterpret_cast(&argBuffer[offset])) = d_C; + offset += sizeof(d_C); + *(reinterpret_cast(&argBuffer[offset])) = d_A; + offset += sizeof(d_A); + *(reinterpret_cast(&argBuffer[offset])) = d_B; + offset += sizeof(d_B); + + if (use_64bit_memory_address && + (totalGlobalMem > (uint64_t)4 * 1024 * 1024 * 1024L)) { + size_t Matrix_Width_A = (size_t)WA; + size_t Matrix_Width_B = (size_t)WB; + + *(reinterpret_cast(&argBuffer[offset])) = Matrix_Width_A; + offset += sizeof(Matrix_Width_A); + *(reinterpret_cast(&argBuffer[offset])) = Matrix_Width_B; + offset += sizeof(Matrix_Width_B); + } else { + int Matrix_Width_A = WA; + int Matrix_Width_B = WB; + + *(reinterpret_cast(&argBuffer[offset])) = Matrix_Width_A; + offset += sizeof(Matrix_Width_A); + *(reinterpret_cast(&argBuffer[offset])) = Matrix_Width_B; + offset += sizeof(Matrix_Width_B); + } + + void *kernel_launch_config[5] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, + CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, + CU_LAUNCH_PARAM_END}; + + // new CUDA 4.0 Driver API Kernel launch call + checkCudaErrors(cuLaunchKernel( + matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z, + 2 * block_size * block_size * sizeof(float), NULL, NULL, + reinterpret_cast(&kernel_launch_config))); + } + + // copy result from device to host + checkCudaErrors(cuMemcpyDtoH(reinterpret_cast(h_C), d_C, mem_size_C)); + + // stop and destroy timer + sdkStopTimer(&timer); + printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); + sdkDeleteTimer(&timer); + + printf("Checking computed result for correctness: "); + bool correct = true; + + for (int i = 0; i < static_cast(WC * HC); i++) { + if (fabs(h_C[i] - (WA * valB)) > 1e-5) { + printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, + h_C[i], WA * valB); + correct = false; + } + } + + printf("%s\n", correct ? 
"Result = PASS" : "Result = FAIL"); + + printf( + "\nNOTE: The CUDA Samples are not meant for performance measurements. " + "Results may vary when GPU Boost is enabled.\n"); + + // clean up memory + if (use_64bit_memory_address) { + cuMemFree(d_Mem[0]); + cuMemFree(d_Mem[1]); + cuMemFree(d_Mem[2]); + cuMemFree(d_Mem[3]); + } + + free(h_A); + free(h_B); + free(h_C); + checkCudaErrors(cuMemFree(d_A)); + checkCudaErrors(cuMemFree(d_B)); + checkCudaErrors(cuMemFree(d_C)); + checkCudaErrors(cuCtxDestroy(cuContext)); +} + +

// Fills data[0 .. size-1] with pseudo-random floats in [0, 1], drawn from
// rand(). The sequence depends on whatever seed the caller set with srand().
void randomInit(float *data, int size) {
  for (int i = 0; i < size; ++i) {
    data[i] = rand() / static_cast<float>(RAND_MAX);
  }
}

// Locates a module file (PTX or CUBIN) relative to the executable.
//
//   module_file  file name to search for
//   module_path  receives the resolved path when the file is found
//   argv         program arguments; argv[0] is handed to sdkFindFilePath
//   ptx_source   receives the file contents when the path contains ".ptx";
//                left untouched otherwise
//
// Returns true when the file was found and, for PTX, read completely.
// Returns false when it was not found or a PTX file could not be opened or
// read, so the caller can fall back to another module file.
bool inline findModulePath(const char *module_file, std::string &module_path,
                           char **argv, std::string &ptx_source) {
  // NOTE(review): sdkFindFilePath is defined outside this file; whether the
  // caller owns (and should free) the returned buffer cannot be told from
  // here, so the pointer is only copied, exactly as before.
  char *actual_path = sdkFindFilePath(module_file, argv[0]);

  if (actual_path == NULL) {
    printf("> findModulePath file not found: <%s> \n", module_file);
    return false;
  }

  module_path = actual_path;

  if (module_path.empty()) {
    printf("> findModulePath file not found: <%s> \n", module_file);
    return false;
  }

  printf("> findModulePath <%s>\n", module_path.c_str());

  if (module_path.rfind(".ptx") == std::string::npos) {
    // Not PTX: nothing to read here, the caller loads the module by path.
    return true;
  }

  // The original dereferenced the FILE* without checking it, which crashes
  // when the file exists but cannot be opened.
  FILE *fp = fopen(module_path.c_str(), "rb");

  if (fp == NULL) {
    printf("> findModulePath could not open: <%s>\n", module_path.c_str());
    return false;
  }

  bool loaded = false;

  if (fseek(fp, 0, SEEK_END) == 0) {
    long file_size = ftell(fp);

    if (file_size >= 0 && fseek(fp, 0, SEEK_SET) == 0) {
      std::string contents(static_cast<size_t>(file_size), '\0');
      size_t got = 0;

      if (!contents.empty()) {
        got = fread(&contents[0], sizeof(char), contents.size(), fp);
      }

      if (got == contents.size()) {
        // Same C-string semantics as the original `ptx_source = buf`.
        ptx_source = contents.c_str();
        loaded = true;
      }
    }
  }

  fclose(fp);

  if (!loaded) {
    printf("> findModulePath could not read: <%s>\n", module_path.c_str());
  }

  return loaded;
}

// Selects the device, creates the context, loads the matrixMul module (PTX
// through the JIT, anything else through cuModuleLoad) and looks up the
// kernel entry point.
//
//   argc, argv  command line; forwarded to findCudaDeviceDRV and
//               findModulePath
//   pMatrixMul  receives the kernel handle on success
//
// Writes the file-level cuDevice, totalGlobalMem, cuContext and cuModule.
// Returns CUDA_SUCCESS, or the failing driver status after destroying the
// context if one had been created. Device queries wrapped in
// checkCudaErrors are handled by that macro instead.
static CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul) {
  CUfunction cuFunction = 0;
  CUresult status;
  int major = 0, minor = 0;
  char deviceName[100];
  std::string module_path, ptx_source;
  bool context_created = false;

  cuDevice = findCudaDeviceDRV(argc, (const char **)argv);

  // get compute capabilities and the devicename
  checkCudaErrors(cuDeviceGetAttribute(
      &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
  checkCudaErrors(cuDeviceGetAttribute(
      &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
  // The length must describe the real buffer: the original passed 256 for
  // this 100-byte array.
  checkCudaErrors(cuDeviceGetName(
      deviceName, static_cast<int>(sizeof(deviceName)), cuDevice));
  printf("> GPU Device has SM %d.%d compute capability\n", major, minor);

  checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, cuDevice));
  printf("  Total amount of global memory:     %llu bytes\n",
         (long long unsigned int)totalGlobalMem);
  printf("  64-bit Memory Address:             %s\n",
         (totalGlobalMem > (uint64_t)4 * 1024 * 1024 * 1024L) ? "YES" : "NO");

  status = cuCtxCreate(&cuContext, 0, cuDevice);

  if (CUDA_SUCCESS != status) {
    goto Error;
  }

  context_created = true;

  // first search for the module path before we load the results
  if (!findModulePath(PTX_FILE, module_path, argv, ptx_source)) {
    if (!findModulePath(CUBIN_FILE, module_path, argv, ptx_source)) {
      printf(
          "> findModulePath could not find ptx or cubin\n");
      status = CUDA_ERROR_NOT_FOUND;
      goto Error;
    }
  } else {
    printf("> initCUDA loading module: <%s>\n", module_path.c_str());
  }

  // Same ".ptx" test findModulePath uses to decide whether it filled
  // ptx_source, so the two can no longer disagree (the original tested "ptx").
  if (module_path.rfind(".ptx") != std::string::npos) {
    // in this branch we use compilation with parameters
    // Stack storage replaces three new[] allocations that were never freed.
    const unsigned int jitNumOptions = 3;
    CUjit_option jitOptions[jitNumOptions];
    void *jitOptVals[jitNumOptions];

    // set up size of compilation log buffer
    const int jitLogBufferSize = 1024;
    jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
    jitOptVals[0] =
        reinterpret_cast<void *>(static_cast<size_t>(jitLogBufferSize));

    // set up pointer to the compilation log buffer
    char jitLogBuffer[jitLogBufferSize];
    jitLogBuffer[0] = '\0';  // defined contents even if nothing is logged
    jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
    jitOptVals[1] = jitLogBuffer;

    // set up pointer to set the Maximum # of registers for a particular kernel
    const int jitRegCount = 32;
    jitOptions[2] = CU_JIT_MAX_REGISTERS;
    jitOptVals[2] = reinterpret_cast<void *>(static_cast<size_t>(jitRegCount));

    status = cuModuleLoadDataEx(&cuModule, ptx_source.c_str(), jitNumOptions,
                                jitOptions, jitOptVals);

    jitLogBuffer[jitLogBufferSize - 1] = '\0';  // never print past the buffer
    printf("> PTX JIT log:\n%s\n", jitLogBuffer);
  } else {
    status = cuModuleLoad(&cuModule, module_path.c_str());
  }

  if (CUDA_SUCCESS != status) {
    goto Error;
  }

#if USE_64BIT_MEMORY_ADDRESS

  if (totalGlobalMem > (uint64_t)4 * 1024 * 1024 * 1024L) {
    status = cuModuleGetFunction(&cuFunction, cuModule, "matrixMul_bs32_64bit");
  } else
#endif
  {
    status = cuModuleGetFunction(&cuFunction, cuModule, "matrixMul_bs32_32bit");
  }

  if (CUDA_SUCCESS != status) {
    goto Error;
  }

  *pMatrixMul = cuFunction;

  return CUDA_SUCCESS;
Error:
  // Only tear down a context that cuCtxCreate actually produced.
  if (context_created) {
    cuCtxDestroy(cuContext);
  }

  return status;
}
diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2010.sln b/Samples/matrixMulDrv/matrixMulDrv_vs2010.sln new file mode 100644 index 00000000..0c4ee250 --- /dev/null +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2010.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 11.00 +# Visual Studio 2010 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "matrixMulDrv", "matrixMulDrv_vs2010.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2010.vcxproj b/Samples/matrixMulDrv/matrixMulDrv_vs2010.vcxproj new file mode 100644 index 00000000..96dfcbcc --- /dev/null +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2010.vcxproj @@ -0,0 +1,110 @@ + + + + 
$(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + matrixMulDrv_vs2010 + matrixMulDrv + + + + + Application + MultiByte + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/matrixMulDrv.exe + + + compute_30,compute_30; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + data/%(Filename)64.ptx + ptx + + + + + + + + diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2012.sln b/Samples/matrixMulDrv/matrixMulDrv_vs2012.sln new file mode 100644 index 00000000..92bf41ca --- /dev/null +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2012.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2012 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "matrixMulDrv", "matrixMulDrv_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + 
EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2012.vcxproj b/Samples/matrixMulDrv/matrixMulDrv_vs2012.vcxproj new file mode 100644 index 00000000..10c5d339 --- /dev/null +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2012.vcxproj @@ -0,0 +1,111 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + matrixMulDrv_vs2012 + matrixMulDrv + + + + + Application + MultiByte + v110 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/matrixMulDrv.exe + + + compute_30,compute_30; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + data/%(Filename)64.ptx + ptx + + + + + + + + diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2013.sln b/Samples/matrixMulDrv/matrixMulDrv_vs2013.sln new file mode 100644 index 00000000..211d2c9d --- /dev/null +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2013.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 13.00 +# Visual Studio 2013 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "matrixMulDrv", "matrixMulDrv_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + 
EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2013.vcxproj b/Samples/matrixMulDrv/matrixMulDrv_vs2013.vcxproj new file mode 100644 index 00000000..197edc7a --- /dev/null +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2013.vcxproj @@ -0,0 +1,111 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + matrixMulDrv_vs2013 + matrixMulDrv + + + + + Application + MultiByte + v120 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/matrixMulDrv.exe + + + compute_30,compute_30; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + data/%(Filename)64.ptx + ptx + + + + + + + + diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2015.sln b/Samples/matrixMulDrv/matrixMulDrv_vs2015.sln new file mode 100644 index 00000000..0a0b709f --- /dev/null +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2015.sln @@ 
-0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 14.00 +# Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "matrixMulDrv", "matrixMulDrv_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2015.vcxproj b/Samples/matrixMulDrv/matrixMulDrv_vs2015.vcxproj new file mode 100644 index 00000000..b4cc9ce1 --- /dev/null +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2015.vcxproj @@ -0,0 +1,111 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + matrixMulDrv_vs2015 + matrixMulDrv + + + + + Application + MultiByte + v140 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/matrixMulDrv.exe + + + compute_30,compute_30; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + 
MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + data/%(Filename)64.ptx + ptx + + + + + + + + diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2017.sln b/Samples/matrixMulDrv/matrixMulDrv_vs2017.sln new file mode 100644 index 00000000..69fdebae --- /dev/null +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "matrixMulDrv", "matrixMulDrv_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj b/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj new file mode 100644 index 00000000..14a248bb --- /dev/null +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj @@ -0,0 +1,112 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + matrixMulDrv_vs2017 + matrixMulDrv + + + + + Application + MultiByte + v141 + 10.0.15063.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + 
cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/matrixMulDrv.exe + + + compute_30,compute_30; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + data/%(Filename)64.ptx + ptx + + + + + + + + diff --git a/Samples/matrixMulDrv/matrixMul_kernel.cu b/Samples/matrixMulDrv/matrixMul_kernel.cu new file mode 100644 index 00000000..3b5253e5 --- /dev/null +++ b/Samples/matrixMulDrv/matrixMul_kernel.cu @@ -0,0 +1,139 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* Matrix multiplication: C = A * B. + * Device code. + */ + +#ifndef _MATRIXMUL_KERNEL_H_ +#define _MATRIXMUL_KERNEL_H_ + +#include + +#define CHECK_BANK_CONFLICTS 0 +#if CHECK_BANK_CONFLICTS +#define AS(i, j) \ + cutilBankChecker((reinterpret_cast(&As[0][0])), (block_size * i + j)) +#define BS(i, j) \ + cutilBankChecker((reinterpret_cast(&Bs[0][0])), (block_size * i + j)) +#else +#define AS(i, j) As[i][j] +#define BS(i, j) Bs[i][j] +#endif + +//////////////////////////////////////////////////////////////////////////////// +//! Matrix multiplication on the device: C = A * B +//! 
wA is A's width and wB is B's width
////////////////////////////////////////////////////////////////////////////////
// Tiled matrix multiply: every thread block computes one
// block_size x block_size tile of C, staging tiles of A and B in shared
// memory.
//
// Template parameters (the list was missing; restored from the uses below,
// e.g. matrixMul<16, int>):
//   block_size  tile edge length
//   size_type   integer type used for all index arithmetic (int or size_t)
//
// The code performs no bounds checks. NOTE(review): it therefore appears to
// require a block_size x block_size thread block and matrix dimensions that
// are multiples of block_size -- confirm against the host launch code.
template <int block_size, typename size_type>
__device__ void matrixMul(float *C, float *A, float *B, size_type wA,
                          size_type wB) {
  // Block index
  size_type bx = blockIdx.x;
  size_type by = blockIdx.y;

  // Thread index
  size_type tx = threadIdx.x;
  size_type ty = threadIdx.y;

  // Index of the first sub-matrix of A processed by the block
  size_type aBegin = wA * block_size * by;

  // Index of the last sub-matrix of A processed by the block
  size_type aEnd = aBegin + wA - 1;

  // Step size used to iterate through the sub-matrices of A
  size_type aStep = block_size;

  // Index of the first sub-matrix of B processed by the block
  size_type bBegin = block_size * bx;

  // Step size used to iterate through the sub-matrices of B
  size_type bStep = block_size * wB;

  // Csub is used to store the element of the block sub-matrix
  // that is computed by the thread
  float Csub = 0;

  // Loop over all the sub-matrices of A and B
  // required to compute the block sub-matrix
  for (size_type a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
    // Declaration of the shared memory array As used to
    // store the sub-matrix of A
    __shared__ float As[block_size][block_size];

    // Declaration of the shared memory array Bs used to
    // store the sub-matrix of B
    __shared__ float Bs[block_size][block_size];

    // Load the matrices from device memory
    // to shared memory; each thread loads
    // one element of each matrix
    AS(ty, tx) = A[a + wA * ty + tx];
    BS(ty, tx) = B[b + wB * ty + tx];

    // Synchronize to make sure the matrices are loaded
    __syncthreads();

    // Multiply the two matrices together;
    // each thread computes one element
    // of the block sub-matrix
#pragma unroll
    for (size_type k = 0; k < block_size; ++k) {
      Csub += AS(ty, k) * BS(k, tx);
    }

    // Synchronize to make sure that the preceding
    // computation is done before loading two new
    // sub-matrices of A and B in the next iteration
    __syncthreads();
  }

  // Write the block sub-matrix to device memory;
  // each thread writes one element
  size_type c = wB * block_size * by + block_size * bx;
  C[c + wB * ty + tx] = Csub;
}

// C wrappers around our template kernel: extern "C" keeps the names
// unmangled so the host can look them up by string with cuModuleGetFunction.
extern "C" __global__ void matrixMul_bs16_32bit(float *C, float *A, float *B,
                                                int wA, int wB) {
  matrixMul<16, int>(C, A, B, wA, wB);
}
extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B,
                                                size_t wA, size_t wB) {
  matrixMul<16, size_t>(C, A, B, wA, wB);
}
extern "C" __global__ void matrixMul_bs32_32bit(float *C, float *A, float *B,
                                                int wA, int wB) {
  matrixMul<32, int>(C, A, B, wA, wB);
}
extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B,
                                                size_t wA, size_t wB) {
  matrixMul<32, size_t>(C, A, B, wA, wB);
}

#endif  // #ifndef _MATRIXMUL_KERNEL_H_
diff --git a/Samples/simpleCUBLAS/Makefile b/Samples/simpleCUBLAS/Makefile new file mode 100644 index 00000000..28ec4d43 --- /dev/null +++ b/Samples/simpleCUBLAS/Makefile @@ -0,0 +1,276 @@ +################################################################################ +# +# Copyright 1993-2015 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO USER: +# +# This source code is subject to NVIDIA ownership rights under U.S. and +# international Copyright laws. +# +# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE +# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR +# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH +# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +# OR PERFORMANCE OF THIS SOURCE CODE. +# +# U.S. Government End Users. This source code is a "commercial item" as +# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +# "commercial computer software" and "commercial computer software +# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) +# and is provided to the U.S. Government only as a commercial end item. +# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the +# source code with only those rights set forth herein. +# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC 
variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) +endif +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' 
-f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= aarch64-linux-android-g++ + endif + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + 
ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif +endif + +ifeq ($(TARGET_OS),qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM + LDFLAGS += -lsocket +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +# Gencode arguments +SMS ?= + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += 
-gencode arch=compute_$(sm),code=sm_$(sm))) + +ifeq ($(SMS),) +# Generate PTX code from SM 30 +GENCODE_FLAGS += -gencode arch=compute_30,code=compute_30 +endif + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +LIBRARIES += -lcublas + +################################################################################ + +# Target rules +all: build + +build: simpleCUBLAS + +simpleCUBLAS.o:simpleCUBLAS.cpp + $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +simpleCUBLAS: simpleCUBLAS.o + $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + ./simpleCUBLAS + +clean: + rm -f simpleCUBLAS simpleCUBLAS.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/simpleCUBLAS + +clobber: clean diff --git a/Samples/simpleCUBLAS/NsightEclipse.xml b/Samples/simpleCUBLAS/NsightEclipse.xml new file mode 100644 index 00000000..30b0a9f9 --- /dev/null +++ b/Samples/simpleCUBLAS/NsightEclipse.xml @@ -0,0 +1,71 @@ + + + + simpleCUBLAS + + whole + true + + ./ + ../ + ../../common/inc + + + Image Processing + CUBLAS Library + + + CUDA + CUBLAS + Linear Algebra + + + cublas + + + + true + simpleCUBLAS.cpp + + CUBLAS + + + 1:CUDA Basic Topics + 3:Linear Algebra + + sm30 + sm35 + sm37 + sm50 + sm52 + sm60 + sm61 + sm70 + + + x86_64 + linux + + + windows7 + + + x86_64 + macosx + + + arm + + + aarch64 + + + ppc64le + linux + + + + all + + Simple CUBLAS + diff --git a/Samples/simpleCUBLAS/README.md b/Samples/simpleCUBLAS/README.md new file mode 100644 index 00000000..2cac0888 --- /dev/null +++ b/Samples/simpleCUBLAS/README.md @@ -0,0 +1,95 @@ +# simpleCUBLAS - Simple CUBLAS + +## Description + +Example of using CUBLAS 
using the new CUBLAS API interface available in CUDA 4.0. + +## Key Concepts + +Image Processing, CUBLAS Library + +## Supported SM Architectures + +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows, MacOSX + +## Supported CPU Architecture + +x86_64, ppc64le, armv7l, aarch64 + +## CUDA APIs involved + +## Dependencies needed to build/run +[CUBLAS](../../README.md#cublas) + +## Prerequisites + +Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs<version>.sln - for Visual Studio <version> +``` +Each individual sample has its own set of solution files in its directory. + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. +> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details. + +### Linux +The Linux samples are built using makefiles. 
To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd <sample_dir> +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=&lt;arch&gt;** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l, aarch64. + By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
`$ make TARGET_ARCH=aarch64`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=&lt;host_compiler&gt;** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +### Mac +The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: +``` +$ cd <sample_dir> +$ make +``` + +The samples makefiles can take advantage of certain options: + +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` + +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=&lt;host_compiler&gt;** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. + ``` + $ make HOST_COMPILER=clang + ``` + +## References (for more details) + + diff --git a/Samples/simpleCUBLAS/simpleCUBLAS.cpp b/Samples/simpleCUBLAS/simpleCUBLAS.cpp new file mode 100644 index 00000000..1976e6ed --- /dev/null +++ b/Samples/simpleCUBLAS/simpleCUBLAS.cpp @@ -0,0 +1,255 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* This example demonstrates how to use the CUBLAS library + * by multiplying two N x N single-precision matrices (SGEMM) on the device + * and comparing the result to the same operation performed + * on the host. 
+ */ + +/* Includes, system */ +#include +#include +#include + +/* Includes, cuda */ +#include +#include +#include + +/* Matrix size */ +#define N (275) + +/* Host implementation of a simple version of sgemm */ +static void simple_sgemm(int n, float alpha, const float *A, const float *B, + float beta, float *C) { + int i; + int j; + int k; + + for (i = 0; i < n; ++i) { + for (j = 0; j < n; ++j) { + float prod = 0; + + for (k = 0; k < n; ++k) { + prod += A[k * n + i] * B[j * n + k]; + } + + C[j * n + i] = alpha * prod + beta * C[j * n + i]; + } + } +} + +/* Main */ +int main(int argc, char **argv) { + cublasStatus_t status; + float *h_A; + float *h_B; + float *h_C; + float *h_C_ref; + float *d_A = 0; + float *d_B = 0; + float *d_C = 0; + float alpha = 1.0f; + float beta = 0.0f; + int n2 = N * N; + int i; + float error_norm; + float ref_norm; + float diff; + cublasHandle_t handle; + + int dev = findCudaDevice(argc, (const char **)argv); + + if (dev == -1) { + return EXIT_FAILURE; + } + + /* Initialize CUBLAS */ + printf("simpleCUBLAS test running..\n"); + + status = cublasCreate(&handle); + + if (status != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "!!!! CUBLAS initialization error\n"); + return EXIT_FAILURE; + } + + /* Allocate host memory for the matrices */ + h_A = reinterpret_cast(malloc(n2 * sizeof(h_A[0]))); + + if (h_A == 0) { + fprintf(stderr, "!!!! host memory allocation error (A)\n"); + return EXIT_FAILURE; + } + + h_B = reinterpret_cast(malloc(n2 * sizeof(h_B[0]))); + + if (h_B == 0) { + fprintf(stderr, "!!!! host memory allocation error (B)\n"); + return EXIT_FAILURE; + } + + h_C = reinterpret_cast(malloc(n2 * sizeof(h_C[0]))); + + if (h_C == 0) { + fprintf(stderr, "!!!! 
host memory allocation error (C)\n"); + return EXIT_FAILURE; + } + + /* Fill the matrices with test data */ + for (i = 0; i < n2; i++) { + h_A[i] = rand() / static_cast(RAND_MAX); + h_B[i] = rand() / static_cast(RAND_MAX); + h_C[i] = rand() / static_cast(RAND_MAX); + } + + /* Allocate device memory for the matrices */ + if (cudaMalloc(reinterpret_cast(&d_A), n2 * sizeof(d_A[0])) != + cudaSuccess) { + fprintf(stderr, "!!!! device memory allocation error (allocate A)\n"); + return EXIT_FAILURE; + } + + if (cudaMalloc(reinterpret_cast(&d_B), n2 * sizeof(d_B[0])) != + cudaSuccess) { + fprintf(stderr, "!!!! device memory allocation error (allocate B)\n"); + return EXIT_FAILURE; + } + + if (cudaMalloc(reinterpret_cast(&d_C), n2 * sizeof(d_C[0])) != + cudaSuccess) { + fprintf(stderr, "!!!! device memory allocation error (allocate C)\n"); + return EXIT_FAILURE; + } + + /* Initialize the device matrices with the host matrices */ + status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1); + + if (status != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "!!!! device access error (write A)\n"); + return EXIT_FAILURE; + } + + status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1); + + if (status != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "!!!! device access error (write B)\n"); + return EXIT_FAILURE; + } + + status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1); + + if (status != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "!!!! device access error (write C)\n"); + return EXIT_FAILURE; + } + + /* Performs operation using plain C code */ + simple_sgemm(N, alpha, h_A, h_B, beta, h_C); + h_C_ref = h_C; + + /* Performs operation using cublas */ + status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, + N, d_B, N, &beta, d_C, N); + + if (status != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "!!!! 
kernel execution error.\n"); + return EXIT_FAILURE; + } + + /* Allocate host memory for reading back the result from device memory */ + h_C = reinterpret_cast(malloc(n2 * sizeof(h_C[0]))); + + if (h_C == 0) { + fprintf(stderr, "!!!! host memory allocation error (C)\n"); + return EXIT_FAILURE; + } + + /* Read the result back */ + status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1); + + if (status != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "!!!! device access error (read C)\n"); + return EXIT_FAILURE; + } + + /* Check result against reference */ + error_norm = 0; + ref_norm = 0; + + for (i = 0; i < n2; ++i) { + diff = h_C_ref[i] - h_C[i]; + error_norm += diff * diff; + ref_norm += h_C_ref[i] * h_C_ref[i]; + } + + error_norm = static_cast(sqrt(static_cast(error_norm))); + ref_norm = static_cast(sqrt(static_cast(ref_norm))); + + if (fabs(ref_norm) < 1e-7) { + fprintf(stderr, "!!!! reference norm is 0\n"); + return EXIT_FAILURE; + } + + /* Memory clean up */ + free(h_A); + free(h_B); + free(h_C); + free(h_C_ref); + + if (cudaFree(d_A) != cudaSuccess) { + fprintf(stderr, "!!!! memory free error (A)\n"); + return EXIT_FAILURE; + } + + if (cudaFree(d_B) != cudaSuccess) { + fprintf(stderr, "!!!! memory free error (B)\n"); + return EXIT_FAILURE; + } + + if (cudaFree(d_C) != cudaSuccess) { + fprintf(stderr, "!!!! memory free error (C)\n"); + return EXIT_FAILURE; + } + + /* Shutdown */ + status = cublasDestroy(handle); + + if (status != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "!!!! 
shutdown error (A)\n"); + return EXIT_FAILURE; + } + + if (error_norm / ref_norm < 1e-6f) { + printf("simpleCUBLAS test passed.\n"); + exit(EXIT_SUCCESS); + } else { + printf("simpleCUBLAS test failed.\n"); + exit(EXIT_FAILURE); + } +} diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2010.sln b/Samples/simpleCUBLAS/simpleCUBLAS_vs2010.sln new file mode 100644 index 00000000..fcad3e9e --- /dev/null +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2010.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 11.00 +# Visual Studio 2010 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleCUBLAS", "simpleCUBLAS_vs2010.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2010.vcxproj b/Samples/simpleCUBLAS/simpleCUBLAS_vs2010.vcxproj new file mode 100644 index 00000000..79b747f9 --- /dev/null +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2010.vcxproj @@ -0,0 +1,106 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleCUBLAS_vs2010 + simpleCUBLAS + + + + + Application + MultiByte + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + 
./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); + + + Console + cublas.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/simpleCUBLAS.exe + + + compute_30,compute_30; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2012.sln b/Samples/simpleCUBLAS/simpleCUBLAS_vs2012.sln new file mode 100644 index 00000000..4ea5adb0 --- /dev/null +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2012.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2012 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleCUBLAS", "simpleCUBLAS_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2012.vcxproj b/Samples/simpleCUBLAS/simpleCUBLAS_vs2012.vcxproj new file mode 100644 index 00000000..d1c4aacc --- /dev/null +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2012.vcxproj @@ -0,0 +1,107 @@ + + + + 
$(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleCUBLAS_vs2012 + simpleCUBLAS + + + + + Application + MultiByte + v110 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); + + + Console + cublas.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/simpleCUBLAS.exe + + + compute_30,compute_30; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2013.sln b/Samples/simpleCUBLAS/simpleCUBLAS_vs2013.sln new file mode 100644 index 00000000..75b516b3 --- /dev/null +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2013.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 13.00 +# Visual Studio 2013 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleCUBLAS", "simpleCUBLAS_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = 
Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2013.vcxproj b/Samples/simpleCUBLAS/simpleCUBLAS_vs2013.vcxproj new file mode 100644 index 00000000..60d45caf --- /dev/null +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2013.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleCUBLAS_vs2013 + simpleCUBLAS + + + + + Application + MultiByte + v120 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); + + + Console + cublas.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/simpleCUBLAS.exe + + + compute_30,compute_30; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2015.sln b/Samples/simpleCUBLAS/simpleCUBLAS_vs2015.sln new file mode 100644 index 00000000..f98f95b0 --- /dev/null +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2015.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 14.00 +# Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleCUBLAS", "simpleCUBLAS_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + 
Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2015.vcxproj b/Samples/simpleCUBLAS/simpleCUBLAS_vs2015.vcxproj new file mode 100644 index 00000000..6804f433 --- /dev/null +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2015.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleCUBLAS_vs2015 + simpleCUBLAS + + + + + Application + MultiByte + v140 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); + + + Console + cublas.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/simpleCUBLAS.exe + + + compute_30,compute_30; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.sln b/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.sln new file mode 100644 index 00000000..80ee9c02 --- /dev/null +++ 
b/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleCUBLAS", "simpleCUBLAS_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj b/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj new file mode 100644 index 00000000..31a5f194 --- /dev/null +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleCUBLAS_vs2017 + simpleCUBLAS + + + + + Application + MultiByte + v141 + 10.0.15063.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); + + + Console + cublas.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/simpleCUBLAS.exe + + + compute_30,compute_30; + -Xcompiler "/wd 4819" 
%(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/simpleCUFFT/Makefile b/Samples/simpleCUFFT/Makefile new file mode 100644 index 00000000..bf38e0ca --- /dev/null +++ b/Samples/simpleCUFFT/Makefile @@ -0,0 +1,289 @@
+################################################################################
+#
+# Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
+#
+# NOTICE TO USER:
+#
+# This source code is subject to NVIDIA ownership rights under U.S. and
+# international Copyright laws.
+#
+# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
+# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+# OR PERFORMANCE OF THIS SOURCE CODE.
+#
+# U.S. Government End Users. This source code is a "commercial item" as
+# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
+# "commercial computer software" and "commercial computer software
+# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
+# and is provided to the U.S. Government only as a commercial end item.
+# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+# source code with only those rights set forth herein.
+#
+################################################################################
+#
+# Makefile project only supported on Mac OS X and Linux Platforms
+#
+################################################################################
+
+# Location of the CUDA Toolkit
+CUDA_PATH ?= /usr/local/cuda
+
+##############################
+# start deprecated interface #
+##############################
+ifeq ($(x86_64),1)
+    $(info WARNING - x86_64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
+    TARGET_ARCH ?= x86_64
+endif
+ifeq ($(ARMv7),1)
+    $(info WARNING - ARMv7 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=armv7l instead)
+    TARGET_ARCH ?= armv7l
+endif
+ifeq ($(aarch64),1)
+    $(info WARNING - aarch64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
+    TARGET_ARCH ?= aarch64
+endif
+ifeq ($(ppc64le),1)
+    $(info WARNING - ppc64le variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
+    TARGET_ARCH ?= ppc64le
+endif
+ifneq ($(GCC),)
+    $(info WARNING - GCC variable has been deprecated)
+    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
+    HOST_COMPILER ?= $(GCC)
+endif
+ifneq ($(abi),)
+    $(error ERROR - abi variable has been removed)
+endif
+############################
+# end deprecated interface #
+############################
+
+# architecture
+HOST_ARCH := $(shell uname -m)
+TARGET_ARCH ?= $(HOST_ARCH)
+ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
+    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
+            TARGET_SIZE := 64
+        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
+            TARGET_SIZE := 32
+        endif
+    else
+        TARGET_SIZE := $(shell getconf LONG_BIT)
+    endif
+else
+    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
+endif
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
+        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
+    endif
+endif
+
+# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
+ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
+    TARGET_ARCH = armv7l
+endif
+
+# operating system
+HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
+TARGET_OS ?= $(HOST_OS)
+ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
+    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
+endif
+
+# host compiler
+ifeq ($(TARGET_OS),darwin)
+    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
+        HOST_COMPILER ?= clang++
+    endif
+else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
+        ifeq ($(TARGET_OS),linux)
+            HOST_COMPILER ?= arm-linux-gnueabihf-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
+        else ifeq ($(TARGET_OS),android)
+            HOST_COMPILER ?= arm-linux-androideabi-g++
+        endif
+    else ifeq ($(TARGET_ARCH),aarch64)
+        ifeq ($(TARGET_OS), linux)
+            HOST_COMPILER ?= aarch64-linux-gnu-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
+        else ifeq ($(TARGET_OS), android)
+            HOST_COMPILER ?= aarch64-linux-android-g++
+        endif
+    else ifeq ($(TARGET_ARCH),ppc64le)
+        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
+    endif
+endif
+HOST_COMPILER ?= g++
+NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
+
+# internal flags
+NVCCFLAGS := -m${TARGET_SIZE}
+CCFLAGS :=
+LDFLAGS :=
+
+# build flags
+ifeq ($(TARGET_OS),darwin)
+    LDFLAGS += -rpath $(CUDA_PATH)/lib
+    CCFLAGS += -arch $(HOST_ARCH)
+else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
+    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
+    CCFLAGS += -mfloat-abi=hard
+else ifeq ($(TARGET_OS),android)
+    LDFLAGS += -pie
+    CCFLAGS += -fpie -fpic -fexceptions
+endif
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+        ifneq ($(TARGET_FS),)
+            # NOTE(review): expr only compares numerically when both operands
+            # are integers; "4.6" is not, so this is a string comparison and a
+            # -dumpversion of e.g. "10" sorts before "4.6". Confirm the
+            # intended behaviour before relying on this for newer compilers.
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
+        endif
+    endif
+endif
+
+ifeq ($(TARGET_OS),qnx)
+    CCFLAGS += -DWIN_INTERFACE_CUSTOM
+    LDFLAGS += -lsocket
+endif
+
+# Install directory of different arch
+CUDA_INSTALL_TARGET_DIR :=
+ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
+else ifeq ($(TARGET_ARCH),ppc64le)
+    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
+endif
+
+# Debug build flags
+ifeq ($(dbg),1)
+    NVCCFLAGS += -g -G
+    BUILD_TYPE := debug
+else
+    BUILD_TYPE := release
+endif
+
+ALL_CCFLAGS :=
+ALL_CCFLAGS += $(NVCCFLAGS)
+ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
+
+SAMPLE_ENABLED := 1
+
+ALL_LDFLAGS :=
+ALL_LDFLAGS += $(ALL_CCFLAGS)
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
+
+# Common includes and paths for CUDA
+INCLUDES := -I../../Common
+LIBRARIES :=
+
+################################################################################
+
+# Gencode arguments
+SMS ?= 30 35 37 50 52 60 61 70
+
+ifeq ($(SMS),)
+$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
+SAMPLE_ENABLED := 0
+endif
+
+ifeq ($(GENCODE_FLAGS),)
+# Generate SASS code for each SM architecture listed in $(SMS)
+$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
+
+# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
+HIGHEST_SM := $(lastword $(sort $(SMS)))
+ifneq ($(HIGHEST_SM),)
+GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
+endif
+endif
+
+LIBRARIES += -lcufft
+
+# When the sample is waived, every recipe line prefixed with $(EXEC) is echoed
+# instead of executed.
+ifeq ($(SAMPLE_ENABLED),0)
+EXEC ?= @echo "[@]"
+endif
+
+################################################################################
+
+# Target rules
+# These targets do not produce a file of the same name, so declare them phony;
+# otherwise a stray file called e.g. "clean" or "build" in this directory would
+# make the rule look up to date and it would be skipped.
+.PHONY: all build check.deps run clean clobber
+
+all: build
+
+build: simpleCUFFT
+
+check.deps:
+ifeq ($(SAMPLE_ENABLED),0)
+	@echo "Sample will be waived due to the above missing dependencies"
+else
+	@echo "Sample is ready - all dependencies have been met"
+endif
+
+simpleCUFFT.o:simpleCUFFT.cu
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+simpleCUFFT: simpleCUFFT.o
+	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
+	$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+	$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+
+run: build
+	$(EXEC) ./simpleCUFFT
+
+clean:
+	rm -f simpleCUFFT simpleCUFFT.o
+	rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/simpleCUFFT
+
+clobber: clean
diff --git a/Samples/simpleCUFFT/NsightEclipse.xml b/Samples/simpleCUFFT/NsightEclipse.xml new file mode 100644 index 00000000..cac090a0 --- /dev/null +++ b/Samples/simpleCUFFT/NsightEclipse.xml @@ -0,0 +1,70 @@ + + + + simpleCUFFT + + whole + + ./ + ../ + ../../common/inc + + + Image Processing + CUFFT Library + + + CUDA + CUFFT + + + cufft + + + + true + simpleCUFFT.cu + + CUFFT + + + 1:CUDA Basic Topics + 2:Image Processing + + sm30 + sm35 + sm37 + sm50 + sm52 + sm60 + sm61 + sm70 + + + x86_64 + linux + + + windows7 + + + x86_64 + macosx + + + arm + + + aarch64 + + + ppc64le + linux + + + + all + + Simple CUFFT + exe + diff --git a/Samples/simpleCUFFT/README.md b/Samples/simpleCUFFT/README.md new file mode 100644 index 00000000..f3de5071 --- /dev/null +++ b/Samples/simpleCUFFT/README.md @@ -0,0 +1,95 @@ +# simpleCUFFT - Simple CUFFT + +## Description + +Example of using CUFFT. In this example, CUFFT is used to compute the 1D-convolution of some signal with some filter by transforming both into frequency domain, multiplying them together, and transforming the signal back to time domain. cuFFT plans are created using simple and advanced API functions. 
+ +## Key Concepts + +Image Processing, CUFFT Library + +## Supported SM Architectures + +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows, MacOSX + +## Supported CPU Architecture + +x86_64, ppc64le, armv7l, aarch64 + +## CUDA APIs involved + +## Dependencies needed to build/run +[CUFFT](../../README.md#cufft) + +## Prerequisites + +Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs<version>.sln - for Visual Studio <version> +``` +Each individual sample has its own set of solution files in its directory: + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. +> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details. + +### Linux +The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd <sample_dir> +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=\<arch\>** - cross-compile targeting a specific architecture.
Allowed architectures are x86_64, ppc64le, armv7l, aarch64. + By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
`$ make TARGET_ARCH=aarch64`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=\<host_compiler\>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +### Mac +The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: +``` +$ cd <sample_dir> +$ make +``` + +The samples makefiles can take advantage of certain options: + +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` + +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=\<host_compiler\>** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. + ``` + $ make HOST_COMPILER=clang + ``` + +## References (for more details) + diff --git a/Samples/simpleCUFFT/simpleCUFFT.cu b/Samples/simpleCUFFT/simpleCUFFT.cu new file mode 100644 index 00000000..2fcacd47 --- /dev/null +++ b/Samples/simpleCUFFT/simpleCUFFT.cu @@ -0,0 +1,286 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* Example showing the use of CUFFT for fast 1D-convolution using FFT. 
*/
+
+// NOTE(review): the header names and the template arguments of the casts
+// below were missing from this copy of the patch (angle-bracket content
+// stripped). They are restored from the identifiers the code uses; <math.h>
+// is not visibly needed here -- confirm against the upstream sample.
+
+// includes, system
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// includes, project
+#include <cuda_runtime.h>
+#include <cufft.h>
+#include <cufftXt.h>
+#include <helper_cuda.h>
+#include <helper_functions.h>
+
+// Complex data type
+typedef float2 Complex;
+static __device__ __host__ inline Complex ComplexAdd(Complex, Complex);
+static __device__ __host__ inline Complex ComplexScale(Complex, float);
+static __device__ __host__ inline Complex ComplexMul(Complex, Complex);
+static __global__ void ComplexPointwiseMulAndScale(Complex *, const Complex *,
+                                                   int, float);
+
+// Filtering functions
+void Convolve(const Complex *, int, const Complex *, int, Complex *);
+
+// Padding functions
+int PadData(const Complex *, Complex **, int, const Complex *, Complex **, int);
+
+////////////////////////////////////////////////////////////////////////////////
+// declaration, forward
+void runTest(int argc, char **argv);
+
+// The filter size is assumed to be a number smaller than the signal size
+#define SIGNAL_SIZE 50
+#define FILTER_KERNEL_SIZE 11
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+//
+// runTest() terminates the process through exit(), so control does not come
+// back here on the normal path.
+////////////////////////////////////////////////////////////////////////////////
+int main(int argc, char **argv) { runTest(argc, argv); }
+
+////////////////////////////////////////////////////////////////////////////////
+//! Run a simple test for CUDA
+//!
+//! Builds a random real-valued signal and filter, pads both to a common
+//! length, transforms them on the device with cuFFT, multiplies the spectra
+//! pointwise (scaled by 1 / new_size), transforms the product back and
+//! compares the first SIGNAL_SIZE samples with a host-side Convolve().
+//! Exits the process with EXIT_SUCCESS or EXIT_FAILURE depending on the
+//! comparison result.
+//!
+//! @param argc  argument count, forwarded to findCudaDevice()
+//! @param argv  argument vector, forwarded to findCudaDevice()
+////////////////////////////////////////////////////////////////////////////////
+void runTest(int argc, char **argv) {
+  printf("[simpleCUFFT] is starting...\n");
+
+  findCudaDevice(argc, (const char **)argv);
+
+  // Allocate host memory for the signal
+  Complex *h_signal =
+      reinterpret_cast<Complex *>(malloc(sizeof(Complex) * SIGNAL_SIZE));
+
+  // Initialize the memory for the signal (real part random, imaginary part 0)
+  for (unsigned int i = 0; i < SIGNAL_SIZE; ++i) {
+    h_signal[i].x = rand() / static_cast<float>(RAND_MAX);
+    h_signal[i].y = 0;
+  }
+
+  // Allocate host memory for the filter
+  Complex *h_filter_kernel =
+      reinterpret_cast<Complex *>(malloc(sizeof(Complex) * FILTER_KERNEL_SIZE));
+
+  // Initialize the memory for the filter (real part random, imaginary part 0)
+  for (unsigned int i = 0; i < FILTER_KERNEL_SIZE; ++i) {
+    h_filter_kernel[i].x = rand() / static_cast<float>(RAND_MAX);
+    h_filter_kernel[i].y = 0;
+  }
+
+  // Pad signal and filter kernel; both padded buffers have new_size elements
+  // and are owned (and freed) by this function.
+  Complex *h_padded_signal;
+  Complex *h_padded_filter_kernel;
+  int new_size =
+      PadData(h_signal, &h_padded_signal, SIGNAL_SIZE, h_filter_kernel,
+              &h_padded_filter_kernel, FILTER_KERNEL_SIZE);
+  int mem_size = sizeof(Complex) * new_size;
+
+  // Allocate device memory for signal
+  Complex *d_signal;
+  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_signal), mem_size));
+  // Copy host memory to device
+  checkCudaErrors(
+      cudaMemcpy(d_signal, h_padded_signal, mem_size, cudaMemcpyHostToDevice));
+
+  // Allocate device memory for filter kernel
+  Complex *d_filter_kernel;
+  checkCudaErrors(
+      cudaMalloc(reinterpret_cast<void **>(&d_filter_kernel), mem_size));
+
+  // Copy host memory to device
+  checkCudaErrors(cudaMemcpy(d_filter_kernel, h_padded_filter_kernel, mem_size,
+                             cudaMemcpyHostToDevice));
+
+  // CUFFT plan simple API
+  cufftHandle plan;
+  checkCudaErrors(cufftPlan1d(&plan, new_size, CUFFT_C2C, 1));
+
+  // CUFFT plan advanced API
+  cufftHandle plan_adv;
+  size_t workSize;
+  long long int new_size_long = new_size;
+
+  checkCudaErrors(cufftCreate(&plan_adv));
+  checkCudaErrors(cufftXtMakePlanMany(plan_adv, 1, &new_size_long, NULL, 1, 1,
+                                      CUDA_C_32F, NULL, 1, 1, CUDA_C_32F, 1,
+                                      &workSize, CUDA_C_32F));
+  // workSize is a size_t; convert explicitly so the argument matches the
+  // conversion specifier on every platform (long is 32-bit on 64-bit Windows).
+  printf("Temporary buffer size %llu bytes\n",
+         static_cast<unsigned long long>(workSize));
+
+  // Transform signal and kernel in place: the signal goes through the
+  // simple-API plan, the filter through the advanced-API plan.
+  printf("Transforming signal cufftExecC2C\n");
+  checkCudaErrors(cufftExecC2C(plan, reinterpret_cast<cufftComplex *>(d_signal),
+                               reinterpret_cast<cufftComplex *>(d_signal),
+                               CUFFT_FORWARD));
+  checkCudaErrors(cufftExecC2C(
+      plan_adv, reinterpret_cast<cufftComplex *>(d_filter_kernel),
+      reinterpret_cast<cufftComplex *>(d_filter_kernel), CUFFT_FORWARD));
+
+  // Multiply the coefficients together and normalize the result
+  printf("Launching ComplexPointwiseMulAndScale<<< >>>\n");
+  ComplexPointwiseMulAndScale<<<32, 256>>>(d_signal, d_filter_kernel, new_size,
+                                           1.0f / new_size);
+
+  // Check if kernel execution generated an error
+  getLastCudaError("Kernel execution failed [ ComplexPointwiseMulAndScale ]");
+
+  // Transform signal back
+  printf("Transforming signal back cufftExecC2C\n");
+  checkCudaErrors(cufftExecC2C(plan, reinterpret_cast<cufftComplex *>(d_signal),
+                               reinterpret_cast<cufftComplex *>(d_signal),
+                               CUFFT_INVERSE));
+
+  // Copy device memory to host. h_convolved_signal aliases h_padded_signal,
+  // so the result overwrites the padded input and is released by the
+  // free(h_padded_signal) call below.
+  Complex *h_convolved_signal = h_padded_signal;
+  checkCudaErrors(cudaMemcpy(h_convolved_signal, d_signal, mem_size,
+                             cudaMemcpyDeviceToHost));
+
+  // Allocate host memory for the convolution result
+  Complex *h_convolved_signal_ref =
+      reinterpret_cast<Complex *>(malloc(sizeof(Complex) * SIGNAL_SIZE));
+
+  // Convolve on the host
+  Convolve(h_signal, SIGNAL_SIZE, h_filter_kernel, FILTER_KERNEL_SIZE,
+           h_convolved_signal_ref);
+
+  // check result: only the first SIGNAL_SIZE complex values are compared,
+  // viewed as 2 * SIGNAL_SIZE floats
+  bool bTestResult = sdkCompareL2fe(
+      reinterpret_cast<float *>(h_convolved_signal_ref),
+      reinterpret_cast<float *>(h_convolved_signal), 2 * SIGNAL_SIZE, 1e-5f);
+
+  // Destroy CUFFT context
+  checkCudaErrors(cufftDestroy(plan));
+  checkCudaErrors(cufftDestroy(plan_adv));
+
+  // cleanup memory
+  free(h_signal);
+  free(h_filter_kernel);
+  free(h_padded_signal);
+  free(h_padded_filter_kernel);
+  free(h_convolved_signal_ref);
+  checkCudaErrors(cudaFree(d_signal));
+  checkCudaErrors(cudaFree(d_filter_kernel));
+
+  exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+// Pad data
+//
+// Produces zero-padded copies of the signal and of the filter kernel, both
+// new_size = signal_size + (filter_kernel_size - filter_kernel_size / 2)
+// elements long.
+//
+// The padded signal is the input signal followed by zeros. The padded filter
+// is rearranged: the elements from index filter_kernel_size / 2 onward are
+// placed at the front, the first filter_kernel_size / 2 elements are placed
+// at the very end, and everything in between is zero.
+//
+// Both output buffers are allocated with malloc(); the caller must free()
+// them. Returns new_size.
+int PadData(const Complex *signal, Complex **padded_signal, int signal_size,
+            const Complex *filter_kernel, Complex **padded_filter_kernel,
+            int filter_kernel_size) {
+  int minRadius = filter_kernel_size / 2;
+  int maxRadius = filter_kernel_size - minRadius;
+  int new_size = signal_size + maxRadius;
+
+  // Pad signal
+  Complex *new_data =
+      reinterpret_cast<Complex *>(malloc(sizeof(Complex) * new_size));
+  memcpy(new_data + 0, signal, signal_size * sizeof(Complex));
+  memset(new_data + signal_size, 0, (new_size - signal_size) * sizeof(Complex));
+  *padded_signal = new_data;
+
+  // Pad filter
+  new_data = reinterpret_cast<Complex *>(malloc(sizeof(Complex) * new_size));
+  memcpy(new_data + 0, filter_kernel + minRadius, maxRadius * sizeof(Complex));
+  memset(new_data + maxRadius, 0,
+         (new_size - filter_kernel_size) * sizeof(Complex));
+  memcpy(new_data + new_size - minRadius, filter_kernel,
+         minRadius * sizeof(Complex));
+  *padded_filter_kernel = new_data;
+
+  return new_size;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Filtering operations
+////////////////////////////////////////////////////////////////////////////////
+
+// Computes convolution on the host
+//
+// For every output index i in [0, signal_size) accumulates
+// signal[i + j] * filter_kernel[minRadius - j] over
+// j in [-maxRadius + 1, minRadius], skipping terms whose signal index falls
+// outside [0, signal_size). filtered_signal must have room for signal_size
+// elements.
+void Convolve(const Complex *signal, int signal_size,
+              const Complex *filter_kernel, int filter_kernel_size,
+              Complex *filtered_signal) {
+  int minRadius = filter_kernel_size / 2;
+  int maxRadius = filter_kernel_size - minRadius;
+
+  // Loop over output element indices
+  for (int i = 0; i < signal_size; ++i) {
+    filtered_signal[i].x = filtered_signal[i].y = 0;
+
+    // Loop over convolution indices
+    for (int j = -maxRadius + 1; j <= minRadius; ++j) {
+      int k = i + j;
+
+      if (k >= 0 && k < signal_size) {
+        filtered_signal[i] =
+            ComplexAdd(filtered_signal[i],
+                       ComplexMul(signal[k], filter_kernel[minRadius - j]));
+      }
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Complex operations
+////////////////////////////////////////////////////////////////////////////////
+
+// Complex addition: component-wise sum of a and b.
+static __device__ __host__ inline Complex ComplexAdd(Complex a, Complex b) {
+  // a is a by-value copy, so it can serve as the accumulator.
+  a.x = a.x + b.x;
+  a.y = a.y + b.y;
+  return a;
+}
+
+// Complex scale: multiplies both components of a by the real factor s.
+static __device__ __host__ inline Complex ComplexScale(Complex a, float s) {
+  a.x = s * a.x;
+  a.y = s * a.y;
+  return a;
+}
+
+// Complex multiplication: (a.x + i*a.y) * (b.x + i*b.y).
+static __device__ __host__ inline Complex ComplexMul(Complex a, Complex b) {
+  Complex product;
+  product.x = a.x * b.x - a.y * b.y;
+  product.y = a.x * b.y + a.y * b.x;
+  return product;
+}
+
+// Complex pointwise multiplication
+//
+// Overwrites a[i] with (a[i] * b[i]) * scale for every i in [0, size). Each
+// thread starts at its global index and advances by the total number of
+// launched threads until it runs past size.
+static __global__ void ComplexPointwiseMulAndScale(Complex *a, const Complex *b,
+                                                   int size, float scale) {
+  const int step = blockDim.x * gridDim.x;
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  while (idx < size) {
+    a[idx] = ComplexScale(ComplexMul(a[idx], b[idx]), scale);
+    idx += step;
+  }
+}
+diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2010.sln b/Samples/simpleCUFFT/simpleCUFFT_vs2010.sln new file mode 100644 index 00000000..1bf9f623 --- /dev/null +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2010.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 11.00 +# Visual Studio 2010 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleCUFFT", "simpleCUFFT_vs2010.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + 
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2010.vcxproj b/Samples/simpleCUFFT/simpleCUFFT_vs2010.vcxproj new file mode 100644 index 00000000..97a37127 --- /dev/null +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2010.vcxproj @@ -0,0 +1,106 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleCUFFT_vs2010 + simpleCUFFT + + + + + Application + MultiByte + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); + + + Console + cufft.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/simpleCUFFT.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2012.sln b/Samples/simpleCUFFT/simpleCUFFT_vs2012.sln new file mode 100644 index 00000000..c2f8cfdc --- /dev/null +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2012.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2012 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleCUFFT", "simpleCUFFT_vs2012.vcxproj", 
"{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2012.vcxproj b/Samples/simpleCUFFT/simpleCUFFT_vs2012.vcxproj new file mode 100644 index 00000000..cf19ef62 --- /dev/null +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2012.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleCUFFT_vs2012 + simpleCUFFT + + + + + Application + MultiByte + v110 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); + + + Console + cufft.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/simpleCUFFT.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + 
false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2013.sln b/Samples/simpleCUFFT/simpleCUFFT_vs2013.sln new file mode 100644 index 00000000..e9782b2d --- /dev/null +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2013.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 13.00 +# Visual Studio 2013 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleCUFFT", "simpleCUFFT_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2013.vcxproj b/Samples/simpleCUFFT/simpleCUFFT_vs2013.vcxproj new file mode 100644 index 00000000..384b3aa6 --- /dev/null +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2013.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleCUFFT_vs2013 + simpleCUFFT + + + + + Application + MultiByte + v120 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); + + + Console + 
cufft.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/simpleCUFFT.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2015.sln b/Samples/simpleCUFFT/simpleCUFFT_vs2015.sln new file mode 100644 index 00000000..921dafdf --- /dev/null +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2015.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 14.00 +# Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleCUFFT", "simpleCUFFT_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2015.vcxproj b/Samples/simpleCUFFT/simpleCUFFT_vs2015.vcxproj new file mode 100644 index 00000000..6f5f651d --- /dev/null +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2015.vcxproj @@ -0,0 +1,107 @@ + + + + 
$(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleCUFFT_vs2015 + simpleCUFFT + + + + + Application + MultiByte + v140 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); + + + Console + cufft.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/simpleCUFFT.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2017.sln b/Samples/simpleCUFFT/simpleCUFFT_vs2017.sln new file mode 100644 index 00000000..5f6b9654 --- /dev/null +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleCUFFT", "simpleCUFFT_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + 
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2017.vcxproj b/Samples/simpleCUFFT/simpleCUFFT_vs2017.vcxproj new file mode 100644 index 00000000..bb012d34 --- /dev/null +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2017.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleCUFFT_vs2017 + simpleCUFFT + + + + + Application + MultiByte + v141 + 10.0.15063.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); + + + Console + cufft.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/simpleCUFFT.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/vectorAdd_nvrtc/Makefile b/Samples/vectorAdd_nvrtc/Makefile new file mode 100644 index 00000000..c39cfcd4 --- /dev/null +++ b/Samples/vectorAdd_nvrtc/Makefile @@ -0,0 +1,334 @@ +################################################################################ +# +# Copyright 1993-2015 NVIDIA 
Corporation. All rights reserved. +# +# NOTICE TO USER: +# +# This source code is subject to NVIDIA ownership rights under U.S. and +# international Copyright laws. +# +# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE +# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR +# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH +# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +# OR PERFORMANCE OF THIS SOURCE CODE. +# +# U.S. Government End Users. This source code is a "commercial item" as +# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +# "commercial computer software" and "commercial computer software +# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) +# and is provided to the U.S. Government only as a commercial end item. +# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the +# source code with only those rights set forth herein. 
+# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) 
+endif +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= 
aarch64-linux-android-g++ + endif + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif +endif + +ifeq ($(TARGET_OS),qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM + LDFLAGS += -lsocket +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + 
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + CCFLAGS += -g + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu) + +SAMPLE_ENABLED := 1 + +# This sample is not supported on ARMv7 +ifeq ($(TARGET_ARCH),armv7l) + $(info >>> WARNING - vectorAdd_nvrtc is not supported on ARMv7 - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +# libNVRTC specific libraries +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -L$(CUDA_PATH)/lib -framework CUDA +else ifeq ($(TARGET_ARCH),x86_64) + LDFLAGS += -L$(CUDA_PATH)/lib64/stubs -L$(CUDA_PATH)/lib64 +else ifeq ($(TARGET_ARCH),ppc64le) + LDFLAGS += -L$(CUDA_PATH)/targets/ppc64le-linux/lib/stubs -L$(CUDA_PATH)/targets/ppc64le-linux/lib +endif + +ifeq ($(TARGET_OS),darwin) + ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA +else + ifeq ($(TARGET_ARCH),x86_64) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs + CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs + endif + + ifeq 
($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs + endif + + ifeq ($(TARGET_ARCH),ppc64le) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs + endif + + CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null) + ifeq ("$(CUDALIB)","") + $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed. Please re-install the driver. <<<) + SAMPLE_ENABLED := 0 + endif + + LIBRARIES += -lcuda +endif + +INCLUDES += -I$(CUDA_PATH)/include + +LIBRARIES += -lnvrtc + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: vectorAdd_nvrtc + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +vectorAdd.o:vectorAdd.cpp + $(EXEC) $(HOST_COMPILER) $(INCLUDES) $(CCFLAGS) $(EXTRA_CCFLAGS) -o $@ -c $< + +vectorAdd_nvrtc: vectorAdd.o + $(EXEC) $(HOST_COMPILER) $(LDFLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./vectorAdd_nvrtc + +clean: + rm -f vectorAdd_nvrtc vectorAdd.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/vectorAdd_nvrtc + +clobber: clean diff --git a/Samples/vectorAdd_nvrtc/README.md b/Samples/vectorAdd_nvrtc/README.md new file mode 100644 index 00000000..93675b45 --- /dev/null +++ b/Samples/vectorAdd_nvrtc/README.md @@ -0,0 +1,98 @@ +# vectorAdd_nvrtc - Vector Addition with 
libNVRTC + +## Description + +This CUDA Driver API sample uses NVRTC for runtime compilation of a vector addition kernel. The vector addition kernel demonstrated is the same as the sample illustrating Chapter 3 of the programming guide. + +## Key Concepts + +CUDA Driver API, Vector Addition, Runtime Compilation + +## Supported SM Architectures + +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows, MacOSX + +## Supported CPU Architecture + +x86_64, ppc64le, aarch64 + +## CUDA APIs involved + +### [CUDA Driver API](http://docs.nvidia.com/cuda/cuda-driver-api/index.html) +cuMemAlloc, cuMemFree, cuMemcpyHtoD, cuMemcpyDtoH + +## Dependencies needed to build/run +[NVRTC](../../README.md#nvrtc) + +## Prerequisites + +Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs<version>.sln - for Visual Studio <version> +``` +Each individual sample has its own set of solution files in its directory. + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. +> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). 
Check the DirectX Dependencies section for details. + +### Linux +The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd <sample_dir> +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=`<arch>`** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, aarch64. + By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=aarch64`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=`<host_compiler>`** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +### Mac +The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: +``` +$ cd <sample_dir> +$ make +``` + +The samples makefiles can take advantage of certain options: + +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` + +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=`<host_compiler>`** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. + ``` + $ make HOST_COMPILER=clang + ``` + +## References (for more details) + diff --git a/Samples/vectorAdd_nvrtc/vectorAdd.cpp b/Samples/vectorAdd_nvrtc/vectorAdd.cpp new file mode 100644 index 00000000..14fa6e1f --- /dev/null +++ b/Samples/vectorAdd_nvrtc/vectorAdd.cpp @@ -0,0 +1,153 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * Vector addition: C = A + B. + * + * This sample is a very basic sample that implements element by element + * vector addition. It is the same as the sample illustrating Chapter 2 + * of the programming guide with some additions like error checking. 
+ */ + +#include +#include + +// For the CUDA runtime routines (prefixed with "cuda_") +#include +#include + +// helper functions and utilities to work with CUDA +#include + +#include + +/** + * Host main routine + */ +int main(int argc, char **argv) { + char *ptx, *kernel_file; + size_t ptxSize; + kernel_file = sdkFindFilePath("vectorAdd_kernel.cu", argv[0]); + compileFileToPTX(kernel_file, argc, argv, &ptx, &ptxSize, 0); + CUmodule module = loadPTX(ptx, argc, argv); + + CUfunction kernel_addr; + checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "vectorAdd")); + + // Print the vector length to be used, and compute its size + int numElements = 50000; + size_t size = numElements * sizeof(float); + printf("[Vector addition of %d elements]\n", numElements); + + // Allocate the host input vector A + float *h_A = reinterpret_cast(malloc(size)); + + // Allocate the host input vector B + float *h_B = reinterpret_cast(malloc(size)); + + // Allocate the host output vector C + float *h_C = reinterpret_cast(malloc(size)); + + // Verify that allocations succeeded + if (h_A == NULL || h_B == NULL || h_C == NULL) { + fprintf(stderr, "Failed to allocate host vectors!\n"); + exit(EXIT_FAILURE); + } + + // Initialize the host input vectors + for (int i = 0; i < numElements; ++i) { + h_A[i] = rand() / static_cast(RAND_MAX); + h_B[i] = rand() / static_cast(RAND_MAX); + } + + // Allocate the device input vector A + CUdeviceptr d_A; + checkCudaErrors(cuMemAlloc(&d_A, size)); + + // Allocate the device input vector B + CUdeviceptr d_B; + checkCudaErrors(cuMemAlloc(&d_B, size)); + + // Allocate the device output vector C + CUdeviceptr d_C; + checkCudaErrors(cuMemAlloc(&d_C, size)); + + // Copy the host input vectors A and B in host memory to the device input + // vectors in device memory + printf("Copy input data from the host memory to the CUDA device\n"); + checkCudaErrors(cuMemcpyHtoD(d_A, h_A, size)); + checkCudaErrors(cuMemcpyHtoD(d_B, h_B, size)); + + // Launch the Vector 
Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, + threadsPerBlock); + dim3 cudaBlockSize(threadsPerBlock, 1, 1); + dim3 cudaGridSize(blocksPerGrid, 1, 1); + + void *arr[] = {reinterpret_cast(&d_A), reinterpret_cast(&d_B), + reinterpret_cast(&d_C), + reinterpret_cast(&numElements)}; + checkCudaErrors(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y, + cudaGridSize.z, /* grid dim */ + cudaBlockSize.x, cudaBlockSize.y, + cudaBlockSize.z, /* block dim */ + 0, 0, /* shared mem, stream */ + &arr[0], /* arguments */ + 0)); + checkCudaErrors(cuCtxSynchronize()); + + // Copy the device result vector in device memory to the host result vector + // in host memory. + printf("Copy output data from the CUDA device to the host memory\n"); + checkCudaErrors(cuMemcpyDtoH(h_C, d_C, size)); + + // Verify that the result vector is correct + for (int i = 0; i < numElements; ++i) { + if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) { + fprintf(stderr, "Result verification failed at element %d!\n", i); + exit(EXIT_FAILURE); + } + } + + printf("Test PASSED\n"); + + // Free device global memory + checkCudaErrors(cuMemFree(d_A)); + checkCudaErrors(cuMemFree(d_B)); + checkCudaErrors(cuMemFree(d_C)); + + // Free host memory + free(h_A); + free(h_B); + free(h_C); + + printf("Done\n"); + + return 0; +} diff --git a/Samples/vectorAdd_nvrtc/vectorAdd_kernel.cu b/Samples/vectorAdd_nvrtc/vectorAdd_kernel.cu new file mode 100644 index 00000000..2e2c9981 --- /dev/null +++ b/Samples/vectorAdd_nvrtc/vectorAdd_kernel.cu @@ -0,0 +1,42 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * CUDA Kernel Device code + * + * Computes the vector addition of A and B into C. The 3 vectors have the same + * number of elements numElements. 
+ */ + +extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C, + int numElements) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) { + C[i] = A[i] + B[i]; + } +} diff --git a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2010.sln b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2010.sln new file mode 100644 index 00000000..5fe04847 --- /dev/null +++ b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2010.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 11.00 +# Visual Studio 2010 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "vectorAdd_nvrtc", "vectorAdd_nvrtc_vs2010.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2010.vcxproj b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2010.vcxproj new file mode 100644 index 00000000..4118fa81 --- /dev/null +++ b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2010.vcxproj @@ -0,0 +1,106 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + vectorAdd_nvrtc_vs2010 + vectorAdd_nvrtc + + + + + Application + MultiByte + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + 
WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);$(CUDA_PATH)/include; + + + Console + cuda.lib;nvrtc.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/vectorAdd_nvrtc.exe + + + + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2012.sln b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2012.sln new file mode 100644 index 00000000..b3bf2b60 --- /dev/null +++ b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2012.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2012 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "vectorAdd_nvrtc", "vectorAdd_nvrtc_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2012.vcxproj b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2012.vcxproj new file mode 100644 index 00000000..5386c8ac --- /dev/null +++ 
b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2012.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + vectorAdd_nvrtc_vs2012 + vectorAdd_nvrtc + + + + + Application + MultiByte + v110 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);$(CUDA_PATH)/include; + + + Console + cuda.lib;nvrtc.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/vectorAdd_nvrtc.exe + + + + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2013.sln b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2013.sln new file mode 100644 index 00000000..3647a494 --- /dev/null +++ b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2013.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 13.00 +# Visual Studio 2013 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "vectorAdd_nvrtc", "vectorAdd_nvrtc_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + 
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2013.vcxproj b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2013.vcxproj new file mode 100644 index 00000000..f4cba895 --- /dev/null +++ b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2013.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + vectorAdd_nvrtc_vs2013 + vectorAdd_nvrtc + + + + + Application + MultiByte + v120 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);$(CUDA_PATH)/include; + + + Console + cuda.lib;nvrtc.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/vectorAdd_nvrtc.exe + + + + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2015.sln b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2015.sln new file mode 100644 index 00000000..db348e33 --- /dev/null +++ b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2015.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 14.00 +# Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "vectorAdd_nvrtc", 
"vectorAdd_nvrtc_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2015.vcxproj b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2015.vcxproj new file mode 100644 index 00000000..a8d3b4b9 --- /dev/null +++ b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2015.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + vectorAdd_nvrtc_vs2015 + vectorAdd_nvrtc + + + + + Application + MultiByte + v140 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);$(CUDA_PATH)/include; + + + Console + cuda.lib;nvrtc.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/vectorAdd_nvrtc.exe + + + + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + 
+ + + + + + + + diff --git a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2017.sln b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2017.sln new file mode 100644 index 00000000..ca760d4d --- /dev/null +++ b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "vectorAdd_nvrtc", "vectorAdd_nvrtc_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2017.vcxproj b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2017.vcxproj new file mode 100644 index 00000000..e449811e --- /dev/null +++ b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2017.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + vectorAdd_nvrtc_vs2017 + vectorAdd_nvrtc + + + + + Application + MultiByte + v141 + 10.0.15063.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);$(CUDA_PATH)/include; + + + Console + 
cuda.lib;nvrtc.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/vectorAdd_nvrtc.exe + + + + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/warpAggregatedAtomicsCG/Makefile b/Samples/warpAggregatedAtomicsCG/Makefile new file mode 100644 index 00000000..6d51d36f --- /dev/null +++ b/Samples/warpAggregatedAtomicsCG/Makefile @@ -0,0 +1,287 @@ +################################################################################ +# +# Copyright 1993-2015 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO USER: +# +# This source code is subject to NVIDIA ownership rights under U.S. and +# international Copyright laws. +# +# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE +# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR +# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH +# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +# OR PERFORMANCE OF THIS SOURCE CODE. +# +# U.S. Government End Users. This source code is a "commercial item" as +# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +# "commercial computer software" and "commercial computer software +# documentation" as such terms are used in 48 C.F.R. 
12.212 (SEPT 1995) +# and is provided to the U.S. Government only as a commercial end item. +# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the +# source code with only those rights set forth herein. +# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + 
TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) +endif +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + 
export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= aarch64-linux-android-g++ + endif + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif +endif + +ifeq ($(TARGET_OS),qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM + LDFLAGS += -lsocket +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else 
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +# Gencode arguments +SMS ?= 30 35 37 50 52 60 61 70 + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: warpAggregatedAtomicsCG + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +warpAggregatedAtomicsCG.o:warpAggregatedAtomicsCG.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + 
+warpAggregatedAtomicsCG: warpAggregatedAtomicsCG.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./warpAggregatedAtomicsCG + +clean: + rm -f warpAggregatedAtomicsCG warpAggregatedAtomicsCG.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/warpAggregatedAtomicsCG + +clobber: clean diff --git a/Samples/warpAggregatedAtomicsCG/NsightEclipse.xml b/Samples/warpAggregatedAtomicsCG/NsightEclipse.xml new file mode 100644 index 00000000..d2c36a1d --- /dev/null +++ b/Samples/warpAggregatedAtomicsCG/NsightEclipse.xml @@ -0,0 +1,64 @@ + + + + warpAggregatedAtomicsCG + + + ./ + ../ + ../../common/inc + + + Cooperative Groups + Atomic Intrinsics + + + GPGPU + Cooperative Groups + Atomic + + + + + + true + warpAggregatedAtomicsCG.cu + + 1:CUDA Advanced Topics + + sm30 + sm35 + sm37 + sm50 + sm52 + sm60 + sm61 + sm70 + + + x86_64 + linux + + + ppc64le + linux + + + x86_64 + macosx + + + windows7 + + + arm + + + aarch64 + + + + 3.0 + + Warp Aggregated Atomics using Cooperative Groups + diff --git a/Samples/warpAggregatedAtomicsCG/README.md b/Samples/warpAggregatedAtomicsCG/README.md new file mode 100644 index 00000000..3939db26 --- /dev/null +++ b/Samples/warpAggregatedAtomicsCG/README.md @@ -0,0 +1,91 @@ +# warpAggregatedAtomicsCG - Warp Aggregated Atomics using Cooperative Groups + +## Description + +This sample demonstrates how to use Cooperative Groups (CG) to perform warp aggregated atomics, a useful technique to improve performance when many threads atomically add to a single counter. 
+ +## Key Concepts + +Cooperative Groups, Atomic Intrinsics + +## Supported SM Architectures + +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows, MacOSX + +## Supported CPU Architecture + +x86_64, ppc64le, armv7l, aarch64 + +## CUDA APIs involved + +## Prerequisites + +Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs<version>.sln - for Visual Studio <version> +``` +Each individual sample has its own set of solution files in its directory: + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. +> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details. + +### Linux +The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd <sample_dir> +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l, aarch64. + By default, TARGET_ARCH is set to HOST_ARCH. 
On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
`$ make TARGET_ARCH=aarch64`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +### Mac +The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: +``` +$ cd <sample_dir> +$ make +``` + +The samples makefiles can take advantage of certain options: + +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` + +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". + ``` + $ make SMS="A B ..." + ``` + +* **HOST_COMPILER=<host_compiler>** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. + ``` + $ make HOST_COMPILER=clang + ``` + +## References (for more details) + + diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG.cu b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG.cu new file mode 100644 index 00000000..6022e27c --- /dev/null +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG.cu @@ -0,0 +1,126 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +// includes, project +#include +#include + +#include + +#include + +namespace cg = cooperative_groups; + +#define NUM_ELEMS 10000000 +#define NUM_THREADS_PER_BLOCK 512 + +// warp-aggregated atomic increment +__device__ int atomicAggInc(int *counter) { + cg::coalesced_group active = cg::coalesced_threads(); + + int mask = active.ballot(1); + // select the leader + int leader = __ffs(mask) - 1; + + // leader does the update + int res = 0; + if (active.thread_rank() == leader) { + res = atomicAdd(counter, __popc(mask)); + } + + // broadcast result + res = active.shfl(res, leader); + + // each thread computes its own value + return res + __popc(mask & ((1 << active.thread_rank()) - 1)); +} + +__global__ void filter_arr(int *dst, int *nres, const int *src, int n) { + int id = threadIdx.x + blockIdx.x * blockDim.x; + + for (int i = id; i < n; i += gridDim.x * blockDim.x) { + if (src[i] > 0) dst[atomicAggInc(nres)] = src[i]; + } +} + +int main(int argc, char **argv) { + int *data_to_filter, *filtered_data, nres = 0; + int *d_data_to_filter, *d_filtered_data, *d_nres; + + data_to_filter = reinterpret_cast(malloc(sizeof(int) * NUM_ELEMS)); + + // Generate input data. 
+ for (int i = 0; i < NUM_ELEMS; i++) { + data_to_filter[i] = rand() % 20; + } + + findCudaDevice(argc, (const char **)argv); + + checkCudaErrors(cudaMalloc(&d_data_to_filter, sizeof(int) * NUM_ELEMS)); + checkCudaErrors(cudaMalloc(&d_filtered_data, sizeof(int) * NUM_ELEMS)); + checkCudaErrors(cudaMalloc(&d_nres, sizeof(int))); + + checkCudaErrors(cudaMemcpy(d_data_to_filter, data_to_filter, + sizeof(int) * NUM_ELEMS, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemset(d_nres, 0, sizeof(int))); + + dim3 dimBlock(NUM_THREADS_PER_BLOCK, 1, 1); + dim3 dimGrid((NUM_ELEMS / NUM_THREADS_PER_BLOCK) + 1, 1, 1); + + filter_arr<<>>(d_filtered_data, d_nres, d_data_to_filter, + NUM_ELEMS); + + checkCudaErrors( + cudaMemcpy(&nres, d_nres, sizeof(int), cudaMemcpyDeviceToHost)); + + filtered_data = reinterpret_cast(malloc(sizeof(int) * nres)); + + checkCudaErrors(cudaMemcpy(filtered_data, d_filtered_data, sizeof(int) * nres, + cudaMemcpyDeviceToHost)); + + int *host_filtered_data = + reinterpret_cast(malloc(sizeof(int) * NUM_ELEMS)); + + // Generate host output with host filtering code. + int host_flt_count = 0; + for (int i = 0; i < NUM_ELEMS; i++) { + if (data_to_filter[i] > 0) { + host_filtered_data[host_flt_count++] = data_to_filter[i]; + } + } + + printf("\nWarp Aggregated Atomics %s \n", + host_flt_count == nres ? 
"PASSED" : "FAILED"); + + checkCudaErrors(cudaFree(d_data_to_filter)); + checkCudaErrors(cudaFree(d_filtered_data)); + checkCudaErrors(cudaFree(d_nres)); + free(data_to_filter); + free(filtered_data); + free(host_filtered_data); +} diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2010.sln b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2010.sln new file mode 100644 index 00000000..8947cbf0 --- /dev/null +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2010.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 11.00 +# Visual Studio 2010 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "warpAggregatedAtomicsCG", "warpAggregatedAtomicsCG_vs2010.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2010.vcxproj b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2010.vcxproj new file mode 100644 index 00000000..8ed53622 --- /dev/null +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2010.vcxproj @@ -0,0 +1,106 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + warpAggregatedAtomicsCG_vs2010 + warpAggregatedAtomicsCG + + + + + Application + MultiByte + + + true + + + true + + + + + + + + + + + 
$(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/warpAggregatedAtomicsCG.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2012.sln b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2012.sln new file mode 100644 index 00000000..d02f9b6c --- /dev/null +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2012.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2012 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "warpAggregatedAtomicsCG", "warpAggregatedAtomicsCG_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = 
preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2012.vcxproj b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2012.vcxproj new file mode 100644 index 00000000..2e57ab7c --- /dev/null +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2012.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + warpAggregatedAtomicsCG_vs2012 + warpAggregatedAtomicsCG + + + + + Application + MultiByte + v110 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/warpAggregatedAtomicsCG.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2013.sln b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2013.sln new file mode 100644 index 00000000..d0f712f9 --- /dev/null +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2013.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 13.00 +# Visual Studio 2013 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "warpAggregatedAtomicsCG", 
"warpAggregatedAtomicsCG_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2013.vcxproj b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2013.vcxproj new file mode 100644 index 00000000..4ad896cb --- /dev/null +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2013.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + warpAggregatedAtomicsCG_vs2013 + warpAggregatedAtomicsCG + + + + + Application + MultiByte + v120 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/warpAggregatedAtomicsCG.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + 
+ + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2015.sln b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2015.sln new file mode 100644 index 00000000..9130f5a2 --- /dev/null +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2015.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 14.00 +# Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "warpAggregatedAtomicsCG", "warpAggregatedAtomicsCG_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2015.vcxproj b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2015.vcxproj new file mode 100644 index 00000000..6cc5401b --- /dev/null +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2015.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + warpAggregatedAtomicsCG_vs2015 + warpAggregatedAtomicsCG + + + + + Application + MultiByte + v140 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + 
$(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/warpAggregatedAtomicsCG.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.sln b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.sln new file mode 100644 index 00000000..edea4fc0 --- /dev/null +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "warpAggregatedAtomicsCG", "warpAggregatedAtomicsCG_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE 
+ EndGlobalSection +EndGlobal diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.vcxproj b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.vcxproj new file mode 100644 index 00000000..01ddb390 --- /dev/null +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + warpAggregatedAtomicsCG_vs2017 + warpAggregatedAtomicsCG + + + + + Application + MultiByte + v141 + 10.0.15063.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/warpAggregatedAtomicsCG.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + +