/*
 * Mirror of https://github.com/NVIDIA/cuda-samples.git
 * (synced 2024-12-01 09:19:16 +08:00; 1966 lines, 81 KiB, C).
 */
/*
 * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

#ifndef __cuda_drvapi_dynlink_cuda_h__
|
||
|
#define __cuda_drvapi_dynlink_cuda_h__
|
||
|
|
||
|
#include <stdlib.h>
|
||
|
|
||
|
|
||
|
/*
 * NOTE(review): presumably mirrors the include guard of the stock cuda.h so
 * that header is skipped when this dynlink header is used - confirm.
 */
#define __cuda_cuda_h__ 1

/**
 * CUDA API versioning support
 *
 * Fixed API level used by the version-conditional sections of this header
 * (the `#if __CUDA_API_VERSION ...` tests below).
 */
#define __CUDA_API_VERSION 5000

/**
 * \defgroup CUDA_DRIVER CUDA Driver API
 *
 * This section describes the low-level CUDA driver application programming
 * interface.
 *
 * @{
 */

/**
 * \defgroup CUDA_TYPES Data types used by CUDA driver
 * @{
 */

/**
 * CUDA API version number
 *
 * NOTE(review): this is 3020 (3.2) while __CUDA_API_VERSION in this header
 * is 5000 - confirm the mismatch is intentional.
 */
#define CUDA_VERSION 3020 /* 3.2 */

#ifdef __cplusplus
|
||
|
extern "C" {
|
||
|
#endif
|
||
|
|
||
|
/**
 * CUDA device pointer
 *
 * Only defined when __CUDA_API_VERSION >= 3020. The type is
 * `unsigned long long` when _WIN64 or __LP64__ is defined and
 * `unsigned int` otherwise.
 */
#if __CUDA_API_VERSION >= 3020

#if defined(_WIN64) || defined(__LP64__)
typedef unsigned long long CUdeviceptr;
#else
typedef unsigned int CUdeviceptr;
#endif

#endif /* __CUDA_API_VERSION >= 3020 */

/*
 * Handle types. CUdevice is a plain int; the struct-pointer handles refer to
 * structs that are only forward-declared here, so they are opaque to users
 * of this header. Texture and surface objects are 64-bit unsigned values.
 */
typedef int CUdevice;                                     /**< CUDA device */
typedef struct CUctx_st *CUcontext;                       /**< CUDA context */
typedef struct CUmod_st *CUmodule;                        /**< CUDA module */
typedef struct CUfunc_st *CUfunction;                     /**< CUDA function */
typedef struct CUarray_st *CUarray;                       /**< CUDA array */
typedef struct CUmipmappedArray_st *CUmipmappedArray;     /**< CUDA mipmapped array */
typedef struct CUtexref_st *CUtexref;                     /**< CUDA texture reference */
typedef struct CUsurfref_st *CUsurfref;                   /**< CUDA surface reference */
typedef struct CUevent_st *CUevent;                       /**< CUDA event */
typedef struct CUstream_st *CUstream;                     /**< CUDA stream */
typedef struct CUgraphicsResource_st *CUgraphicsResource; /**< CUDA graphics interop resource */
typedef unsigned long long CUtexObject;                   /**< CUDA texture object */
typedef unsigned long long CUsurfObject;                  /**< CUDA surface object */

/**
 * CUDA definition of UUID: 16 raw bytes.
 */
typedef struct CUuuid_st
{
    char bytes[16];
} CUuuid;

/**
 * Context creation flags
 *
 * The scheduling mask and the overall flags mask depend on
 * __CUDA_API_VERSION: from 4000 on, the scheduling mask also covers the
 * blocking-sync bit and ::CU_CTX_PRIMARY becomes available.
 */
typedef enum CUctx_flags_enum
{
    CU_CTX_SCHED_AUTO          = 0x00, /**< Automatic scheduling */
    CU_CTX_SCHED_SPIN          = 0x01, /**< Set spin as default scheduling */
    CU_CTX_SCHED_YIELD         = 0x02, /**< Set yield as default scheduling */
    CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */
    CU_CTX_BLOCKING_SYNC       = 0x04, /**< Set blocking synchronization as default scheduling \deprecated */
    CU_CTX_MAP_HOST            = 0x08, /**< Support mapped pinned allocations */
    CU_CTX_LMEM_RESIZE_TO_MAX  = 0x10, /**< Keep local memory allocation after launch */
#if __CUDA_API_VERSION < 4000
    CU_CTX_SCHED_MASK          = 0x03,
    CU_CTX_FLAGS_MASK          = 0x1f
#else
    CU_CTX_SCHED_MASK          = 0x07,
    CU_CTX_PRIMARY             = 0x20, /**< Initialize and return the primary context */
    CU_CTX_FLAGS_MASK          = 0x3f
#endif
} CUctx_flags;

/**
 * Event creation flags
 */
typedef enum CUevent_flags_enum
{
    CU_EVENT_DEFAULT        = 0, /**< Default event flag */
    CU_EVENT_BLOCKING_SYNC  = 1, /**< Event uses blocking synchronization */
    CU_EVENT_DISABLE_TIMING = 2  /**< Event will not record timing data */
} CUevent_flags;

/**
 * Array formats
 */
typedef enum CUarray_format_enum
{
    CU_AD_FORMAT_UNSIGNED_INT8  = 0x01, /**< Unsigned 8-bit integers */
    CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */
    CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */
    CU_AD_FORMAT_SIGNED_INT8    = 0x08, /**< Signed 8-bit integers */
    CU_AD_FORMAT_SIGNED_INT16   = 0x09, /**< Signed 16-bit integers */
    CU_AD_FORMAT_SIGNED_INT32   = 0x0a, /**< Signed 32-bit integers */
    CU_AD_FORMAT_HALF           = 0x10, /**< 16-bit floating point */
    CU_AD_FORMAT_FLOAT          = 0x20  /**< 32-bit floating point */
} CUarray_format;

/**
 * Texture reference addressing modes
 */
typedef enum CUaddress_mode_enum
{
    CU_TR_ADDRESS_MODE_WRAP   = 0, /**< Wrapping address mode */
    CU_TR_ADDRESS_MODE_CLAMP  = 1, /**< Clamp to edge address mode */
    CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */
    CU_TR_ADDRESS_MODE_BORDER = 3  /**< Border address mode */
} CUaddress_mode;

/**
 * Texture reference filtering modes
 */
typedef enum CUfilter_mode_enum
{
    CU_TR_FILTER_MODE_POINT  = 0, /**< Point filter mode */
    CU_TR_FILTER_MODE_LINEAR = 1  /**< Linear filter mode */
} CUfilter_mode;

/**
 * Device properties
 *
 * Enumerators are listed in numeric order. Attributes 36-43 exist only when
 * __CUDA_API_VERSION >= 4000; the compute-capability attributes (75, 76) are
 * always available.
 */
typedef enum CUdevice_attribute_enum
{
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,              /**< Maximum number of threads per block */
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,                    /**< Maximum block dimension X */
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,                    /**< Maximum block dimension Y */
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4,                    /**< Maximum block dimension Z */
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,                     /**< Maximum grid dimension X */
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,                     /**< Maximum grid dimension Y */
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,                     /**< Maximum grid dimension Z */
    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,        /**< Maximum shared memory available per block in bytes */
    CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,            /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */
    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,              /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
    CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,                         /**< Warp size in threads */
    CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,                         /**< Maximum pitch in bytes allowed by memory copies */
    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,           /**< Maximum number of 32-bit registers available per block */
    CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,               /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */
    CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,                        /**< Peak clock frequency in kilohertz */
    CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,                 /**< Alignment requirement for textures */
    CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,                       /**< Device can possibly copy memory and execute a kernel concurrently */
    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,              /**< Number of multiprocessors on device */
    CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17,               /**< Specifies whether there is a run time limit on kernels */
    CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,                        /**< Device is integrated with host memory */
    CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,               /**< Device can map host memory into CUDA address space */
    CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20,                      /**< Compute mode (See ::CUcomputemode for details) */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21,           /**< Maximum 1D texture width */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22,           /**< Maximum 2D texture width */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23,          /**< Maximum 2D texture height */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24,           /**< Maximum 3D texture width */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25,          /**< Maximum 3D texture height */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26,           /**< Maximum 3D texture depth */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27,     /**< Maximum texture array width */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28,    /**< Maximum texture array height */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Maximum slices in a texture array */
    CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30,                 /**< Alignment requirement for surfaces */
    CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,                /**< Device can possibly execute multiple kernels concurrently */
    CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32,                       /**< Device has ECC support enabled */
    CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33,                        /**< PCI bus ID of the device */
    CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34,                     /**< PCI device ID of the device */
    CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35,                        /**< Device is using TCC driver model */
#if __CUDA_API_VERSION >= 4000
    CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36,                 /**< Peak memory clock frequency in kilohertz */
    CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37,           /**< Global memory bus width in bits */
    CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38,                     /**< Size of L2 cache in bytes */
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,    /**< Maximum resident threads per multiprocessor */
    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,                /**< Number of asynchronous engines */
    CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41,                /**< Device shares a unified address space with the host */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42,   /**< Maximum 1D layered texture width */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43,  /**< Maximum layers in a 1D layered texture */
#endif
    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,          /**< Major compute capability version number */
    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76           /**< Minor compute capability version number */
} CUdevice_attribute;

/**
 * Legacy device properties
 *
 * Every member is a plain int; the block and grid limits are 3-element
 * arrays.
 */
typedef struct CUdevprop_st
{
    int maxThreadsPerBlock;  /**< Maximum number of threads per block */
    int maxThreadsDim[3];    /**< Maximum size of each dimension of a block */
    int maxGridSize[3];      /**< Maximum size of each dimension of a grid */
    int sharedMemPerBlock;   /**< Shared memory available per block in bytes */
    int totalConstantMemory; /**< Constant memory available on device in bytes */
    int SIMDWidth;           /**< Warp size in threads */
    int memPitch;            /**< Maximum pitch in bytes allowed by memory copies */
    int regsPerBlock;        /**< 32-bit registers available per block */
    int clockRate;           /**< Clock frequency in kilohertz */
    int textureAlign;        /**< Alignment requirement for textures */
} CUdevprop;

/**
 * Function properties
 */
typedef enum CUfunction_attribute_enum
{
    /**
     * The maximum number of threads per block, beyond which a launch of the
     * function would fail. This number depends on both the function and the
     * device on which the function is currently loaded.
     */
    CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,

    /**
     * The size in bytes of statically-allocated shared memory required by
     * this function. This does not include dynamically-allocated shared
     * memory requested by the user at runtime.
     */
    CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,

    /**
     * The size in bytes of user-allocated constant memory required by this
     * function.
     */
    CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2,

    /**
     * The size in bytes of local memory used by each thread of this function.
     */
    CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,

    /**
     * The number of registers used by each thread of this function.
     */
    CU_FUNC_ATTRIBUTE_NUM_REGS = 4,

    /**
     * The PTX virtual architecture version for which the function was
     * compiled. This value is the major PTX version * 10 + the minor PTX
     * version, so a PTX version 1.3 function would return the value 13.
     * Note that this may return the undefined value of 0 for cubins
     * compiled prior to CUDA 3.0.
     */
    CU_FUNC_ATTRIBUTE_PTX_VERSION = 5,

    /**
     * The binary architecture version for which the function was compiled.
     * This value is the major binary version * 10 + the minor binary version,
     * so a binary version 1.3 function would return the value 13. Note that
     * this will return a value of 10 for legacy cubins that do not have a
     * properly-encoded binary architecture version.
     */
    CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6,

    /* Left implicit on purpose: one past the last attribute above. */
    CU_FUNC_ATTRIBUTE_MAX
} CUfunction_attribute;

/**
 * Function cache configurations
 */
typedef enum CUfunc_cache_enum
{
    CU_FUNC_CACHE_PREFER_NONE   = 0x00, /**< no preference for shared memory or L1 (default) */
    CU_FUNC_CACHE_PREFER_SHARED = 0x01, /**< prefer larger shared memory and smaller L1 cache */
    CU_FUNC_CACHE_PREFER_L1     = 0x02  /**< prefer larger L1 cache and smaller shared memory */
} CUfunc_cache;

/**
 * Shared memory configurations
 */
typedef enum CUsharedconfig_enum
{
    CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE    = 0x00, /**< set default shared memory bank size */
    CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE  = 0x01, /**< set shared memory bank width to four bytes */
    CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02  /**< set shared memory bank width to eight bytes */
} CUsharedconfig;

/**
 * Memory types
 *
 * ::CU_MEMORYTYPE_UNIFIED exists only when __CUDA_API_VERSION >= 4000. The
 * leading comma inside the conditional keeps the enumerator list free of a
 * trailing comma in either configuration.
 */
typedef enum CUmemorytype_enum
{
    CU_MEMORYTYPE_HOST   = 0x01, /**< Host memory */
    CU_MEMORYTYPE_DEVICE = 0x02, /**< Device memory */
    CU_MEMORYTYPE_ARRAY  = 0x03  /**< Array memory */
#if __CUDA_API_VERSION >= 4000
    , CU_MEMORYTYPE_UNIFIED = 0x04 /**< Unified device or host memory */
#endif
} CUmemorytype;

/**
 * Compute Modes
 *
 * ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS exists only when
 * __CUDA_API_VERSION >= 4000. Value 1 is not assigned in this header.
 */
typedef enum CUcomputemode_enum
{
    CU_COMPUTEMODE_DEFAULT    = 0, /**< Default compute mode (Multiple contexts allowed per device) */
    CU_COMPUTEMODE_PROHIBITED = 2  /**< Compute-prohibited mode (No contexts can be created on this device at this time) */
#if __CUDA_API_VERSION >= 4000
    , CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 /**< Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */
#endif
} CUcomputemode;

/**
 * Online compiler options
 *
 * Values are consecutive starting at 0; they are spelled out explicitly so
 * that inserting or reordering an enumerator cannot silently renumber the
 * others.
 */
typedef enum CUjit_option_enum
{
    /**
     * Max number of registers that a thread may use.\n
     * Option type: unsigned int
     */
    CU_JIT_MAX_REGISTERS = 0,

    /**
     * IN: Specifies minimum number of threads per block to target compilation
     * for\n
     * OUT: Returns the number of threads the compiler actually targeted.
     * This restricts the resource utilization of the compiler (e.g. max
     * registers) such that a block with the given number of threads should be
     * able to launch based on register limitations. Note, this option does not
     * currently take into account any other resource limitations, such as
     * shared memory utilization.\n
     * Option type: unsigned int
     */
    CU_JIT_THREADS_PER_BLOCK = 1,

    /**
     * Returns a float value in the option of the wall clock time, in
     * milliseconds, spent creating the cubin\n
     * Option type: float
     */
    CU_JIT_WALL_TIME = 2,

    /**
     * Pointer to a buffer in which to print any log messages from PTXAS
     * that are informational in nature (the buffer size is specified via
     * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES) \n
     * Option type: char*
     */
    CU_JIT_INFO_LOG_BUFFER = 3,

    /**
     * IN: Log buffer size in bytes.  Log messages will be capped at this size
     * (including null terminator)\n
     * OUT: Amount of log buffer filled with messages\n
     * Option type: unsigned int
     */
    CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES = 4,

    /**
     * Pointer to a buffer in which to print any log messages from PTXAS that
     * reflect errors (the buffer size is specified via option
     * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n
     * Option type: char*
     */
    CU_JIT_ERROR_LOG_BUFFER = 5,

    /**
     * IN: Log buffer size in bytes.  Log messages will be capped at this size
     * (including null terminator)\n
     * OUT: Amount of log buffer filled with messages\n
     * Option type: unsigned int
     */
    CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES = 6,

    /**
     * Level of optimizations to apply to generated code (0 - 4), with 4
     * being the default and highest level of optimizations.\n
     * Option type: unsigned int
     */
    CU_JIT_OPTIMIZATION_LEVEL = 7,

    /**
     * No option value required. Determines the target based on the current
     * attached context (default)\n
     * Option type: No option value needed
     */
    CU_JIT_TARGET_FROM_CUCONTEXT = 8,

    /**
     * Target is chosen based on supplied ::CUjit_target_enum.\n
     * Option type: unsigned int for enumerated type ::CUjit_target_enum
     */
    CU_JIT_TARGET = 9,

    /**
     * Specifies choice of fallback strategy if matching cubin is not found.
     * Choice is based on supplied ::CUjit_fallback_enum.\n
     * Option type: unsigned int for enumerated type ::CUjit_fallback_enum
     */
    CU_JIT_FALLBACK_STRATEGY = 10

} CUjit_option;

/**
 * Online compilation targets
 *
 * Each enumerator's value is the compute capability written as
 * major * 10 + minor (e.g. 3.5 -> 35).
 */
typedef enum CUjit_target_enum
{
    CU_TARGET_COMPUTE_20 = 20, /**< Compute device class 2.0 */
    CU_TARGET_COMPUTE_21 = 21, /**< Compute device class 2.1 */
    CU_TARGET_COMPUTE_30 = 30, /**< Compute device class 3.0 */
    CU_TARGET_COMPUTE_32 = 32, /**< Compute device class 3.2 */
    CU_TARGET_COMPUTE_35 = 35, /**< Compute device class 3.5 */
    CU_TARGET_COMPUTE_37 = 37, /**< Compute device class 3.7 */
    CU_TARGET_COMPUTE_50 = 50, /**< Compute device class 5.0 */
    CU_TARGET_COMPUTE_52 = 52, /**< Compute device class 5.2 */
    CU_TARGET_COMPUTE_53 = 53, /**< Compute device class 5.3 */
    CU_TARGET_COMPUTE_60 = 60, /**< Compute device class 6.0 */
    CU_TARGET_COMPUTE_61 = 61, /**< Compute device class 6.1 */
    CU_TARGET_COMPUTE_62 = 62, /**< Compute device class 6.2 */
    CU_TARGET_COMPUTE_70 = 70  /**< Compute device class 7.0 */
} CUjit_target;

/**
 * Cubin matching fallback strategies
 */
typedef enum CUjit_fallback_enum
{
    CU_PREFER_PTX    = 0, /**< Prefer to compile ptx */
    CU_PREFER_BINARY = 1  /**< Prefer to fall back to compatible binary code */
} CUjit_fallback;

/**
 * Flags to register a graphics resource
 *
 * The non-zero values are distinct single bits.
 */
typedef enum CUgraphicsRegisterFlags_enum
{
    CU_GRAPHICS_REGISTER_FLAGS_NONE          = 0x00,
    CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY     = 0x01,
    CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02,
    CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST  = 0x04
} CUgraphicsRegisterFlags;

/**
 * Flags for mapping and unmapping interop resources
 */
typedef enum CUgraphicsMapResourceFlags_enum
{
    CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE          = 0x00,
    CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY     = 0x01,
    CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02
} CUgraphicsMapResourceFlags;

/**
 * Array indices for cube faces
 */
typedef enum CUarray_cubemap_face_enum
{
    CU_CUBEMAP_FACE_POSITIVE_X = 0x00, /**< Positive X face of cubemap */
    CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, /**< Negative X face of cubemap */
    CU_CUBEMAP_FACE_POSITIVE_Y = 0x02, /**< Positive Y face of cubemap */
    CU_CUBEMAP_FACE_NEGATIVE_Y = 0x03, /**< Negative Y face of cubemap */
    CU_CUBEMAP_FACE_POSITIVE_Z = 0x04, /**< Positive Z face of cubemap */
    CU_CUBEMAP_FACE_NEGATIVE_Z = 0x05  /**< Negative Z face of cubemap */
} CUarray_cubemap_face;

/**
 * Limits
 */
typedef enum CUlimit_enum
{
    CU_LIMIT_STACK_SIZE       = 0x00, /**< GPU thread stack size */
    CU_LIMIT_PRINTF_FIFO_SIZE = 0x01, /**< GPU printf FIFO size */
    CU_LIMIT_MALLOC_HEAP_SIZE = 0x02  /**< GPU malloc heap size */
} CUlimit;

/**
 * Resource types
 */
typedef enum CUresourcetype_enum
{
    CU_RESOURCE_TYPE_ARRAY           = 0x00, /**< Array resource */
    CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */
    CU_RESOURCE_TYPE_LINEAR          = 0x02, /**< Linear resource */
    CU_RESOURCE_TYPE_PITCH2D         = 0x03  /**< Pitch 2D resource */
} CUresourcetype;

/**
 * Error codes
 *
 * Codes are grouped by hundreds in this header: 0-8 general and profiler
 * status, 1xx device, 2xx image/context/mapping, 3xx source/file/shared
 * object, 400 handle, 500 symbol lookup, 600 not-ready, 7xx launch and peer
 * access, 999 unknown.
 */
typedef enum cudaError_enum
{
    /**
     * The API call returned with no errors. In the case of query calls, this
     * can also mean that the operation being queried is complete (see
     * ::cuEventQuery() and ::cuStreamQuery()).
     */
    CUDA_SUCCESS = 0,

    /**
     * This indicates that one or more of the parameters passed to the API call
     * is not within an acceptable range of values.
     */
    CUDA_ERROR_INVALID_VALUE = 1,

    /**
     * The API call failed because it was unable to allocate enough memory to
     * perform the requested operation.
     */
    CUDA_ERROR_OUT_OF_MEMORY = 2,

    /**
     * This indicates that the CUDA driver has not been initialized with
     * ::cuInit() or that initialization has failed.
     */
    CUDA_ERROR_NOT_INITIALIZED = 3,

    /**
     * This indicates that the CUDA driver is in the process of shutting down.
     */
    CUDA_ERROR_DEINITIALIZED = 4,

    /**
     * This indicates profiling APIs are called while application is running
     * in visual profiler mode.
     */
    CUDA_ERROR_PROFILER_DISABLED = 5,

    /**
     * This indicates profiling has not been initialized for this context.
     * Call cuProfilerInitialize() to resolve this.
     */
    CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6,

    /**
     * This indicates profiler has already been started and probably
     * cuProfilerStart() is incorrectly called.
     */
    CUDA_ERROR_PROFILER_ALREADY_STARTED = 7,

    /**
     * This indicates profiler has already been stopped and probably
     * cuProfilerStop() is incorrectly called.
     */
    CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8,

    /**
     * This indicates that no CUDA-capable devices were detected by the installed
     * CUDA driver.
     */
    CUDA_ERROR_NO_DEVICE = 100,

    /**
     * This indicates that the device ordinal supplied by the user does not
     * correspond to a valid CUDA device.
     */
    CUDA_ERROR_INVALID_DEVICE = 101,


    /**
     * This indicates that the device kernel image is invalid. This can also
     * indicate an invalid CUDA module.
     */
    CUDA_ERROR_INVALID_IMAGE = 200,

    /**
     * This most frequently indicates that there is no context bound to the
     * current thread. This can also be returned if the context passed to an
     * API call is not a valid handle (such as a context that has had
     * ::cuCtxDestroy() invoked on it). This can also be returned if a user
     * mixes different API versions (i.e. 3010 context with 3020 API calls).
     * See ::cuCtxGetApiVersion() for more details.
     */
    CUDA_ERROR_INVALID_CONTEXT = 201,

    /**
     * This indicates that the context being supplied as a parameter to the
     * API call was already the active context.
     * \deprecated
     * This error return is deprecated as of CUDA 3.2. It is no longer an
     * error to attempt to push the active context via ::cuCtxPushCurrent().
     */
    CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202,

    /**
     * This indicates that a map or register operation has failed.
     */
    CUDA_ERROR_MAP_FAILED = 205,

    /**
     * This indicates that an unmap or unregister operation has failed.
     */
    CUDA_ERROR_UNMAP_FAILED = 206,

    /**
     * This indicates that the specified array is currently mapped and thus
     * cannot be destroyed.
     */
    CUDA_ERROR_ARRAY_IS_MAPPED = 207,

    /**
     * This indicates that the resource is already mapped.
     */
    CUDA_ERROR_ALREADY_MAPPED = 208,

    /**
     * This indicates that there is no kernel image available that is suitable
     * for the device. This can occur when a user specifies code generation
     * options for a particular CUDA source file that do not include the
     * corresponding device configuration.
     */
    CUDA_ERROR_NO_BINARY_FOR_GPU = 209,

    /**
     * This indicates that a resource has already been acquired.
     */
    CUDA_ERROR_ALREADY_ACQUIRED = 210,

    /**
     * This indicates that a resource is not mapped.
     */
    CUDA_ERROR_NOT_MAPPED = 211,

    /**
     * This indicates that a mapped resource is not available for access as an
     * array.
     */
    CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212,

    /**
     * This indicates that a mapped resource is not available for access as a
     * pointer.
     */
    CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213,

    /**
     * This indicates that an uncorrectable ECC error was detected during
     * execution.
     */
    CUDA_ERROR_ECC_UNCORRECTABLE = 214,

    /**
     * This indicates that the ::CUlimit passed to the API call is not
     * supported by the active device.
     */
    CUDA_ERROR_UNSUPPORTED_LIMIT = 215,

    /**
     * This indicates that the ::CUcontext passed to the API call can
     * only be bound to a single CPU thread at a time but is already
     * bound to a CPU thread.
     */
    CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216,

    /**
     * This indicates that the device kernel source is invalid.
     */
    CUDA_ERROR_INVALID_SOURCE = 300,

    /**
     * This indicates that the file specified was not found.
     */
    CUDA_ERROR_FILE_NOT_FOUND = 301,

    /**
     * This indicates that a link to a shared object failed to resolve.
     */
    CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302,

    /**
     * This indicates that initialization of a shared object failed.
     */
    CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303,

    /**
     * This indicates that an OS call failed.
     */
    CUDA_ERROR_OPERATING_SYSTEM = 304,


    /**
     * This indicates that a resource handle passed to the API call was not
     * valid. Resource handles are opaque types like ::CUstream and ::CUevent.
     */
    CUDA_ERROR_INVALID_HANDLE = 400,


    /**
     * This indicates that a named symbol was not found. Examples of symbols
     * are global/constant variable names, texture names, and surface names.
     */
    CUDA_ERROR_NOT_FOUND = 500,


    /**
     * This indicates that asynchronous operations issued previously have not
     * completed yet. This result is not actually an error, but must be indicated
     * differently than ::CUDA_SUCCESS (which indicates completion). Calls that
     * may return this value include ::cuEventQuery() and ::cuStreamQuery().
     */
    CUDA_ERROR_NOT_READY = 600,


    /**
     * An exception occurred on the device while executing a kernel. Common
     * causes include dereferencing an invalid device pointer and accessing
     * out of bounds shared memory. The context cannot be used, so it must
     * be destroyed (and a new one should be created). All existing device
     * memory allocations from this context are invalid and must be
     * reconstructed if the program is to continue using CUDA.
     */
    CUDA_ERROR_LAUNCH_FAILED = 700,

    /**
     * This indicates that a launch did not occur because it did not have
     * appropriate resources. This error usually indicates that the user has
     * attempted to pass too many arguments to the device kernel, or the
     * kernel launch specifies too many threads for the kernel's register
     * count. Passing arguments of the wrong size (i.e. a 64-bit pointer
     * when a 32-bit int is expected) is equivalent to passing too many
     * arguments and can also result in this error.
     */
    CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701,

    /**
     * This indicates that the device kernel took too long to execute. This can
     * only occur if timeouts are enabled - see the device attribute
     * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The
     * context cannot be used (and must be destroyed similar to
     * ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from
     * this context are invalid and must be reconstructed if the program is to
     * continue using CUDA.
     */
    CUDA_ERROR_LAUNCH_TIMEOUT = 702,

    /**
     * This error indicates a kernel launch that uses an incompatible texturing
     * mode.
     */
    CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703,

    /**
     * This error indicates that a call to ::cuCtxEnablePeerAccess() is
     * trying to re-enable peer access to a context which has already
     * had peer access to it enabled.
     */
    CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704,

    /**
     * This error indicates that a call to ::cuMemPeerRegister is trying to
     * register memory from a context which has not had peer access
     * enabled yet via ::cuCtxEnablePeerAccess(), or that
     * ::cuCtxDisablePeerAccess() is trying to disable peer access
     * which has not been enabled yet.
     */
    CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705,

    /**
     * This error indicates that a call to ::cuMemPeerRegister is trying to
     * register already-registered memory.
     */
    CUDA_ERROR_PEER_MEMORY_ALREADY_REGISTERED = 706,

    /**
     * This error indicates that a call to ::cuMemPeerUnregister is trying to
     * unregister memory that has not been registered.
     */
    CUDA_ERROR_PEER_MEMORY_NOT_REGISTERED = 707,

    /**
     * This error indicates that ::cuCtxCreate was called with the flag
     * ::CU_CTX_PRIMARY on a device which already has initialized its
     * primary context.
     */
    CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708,

    /**
     * This error indicates that the context current to the calling thread
     * has been destroyed using ::cuCtxDestroy, or is a primary context which
     * has not yet been initialized.
     */
    CUDA_ERROR_CONTEXT_IS_DESTROYED = 709,

    /**
     * A device-side assert triggered during kernel execution. The context
     * cannot be used anymore, and must be destroyed. All existing device
     * memory allocations from this context are invalid and must be
     * reconstructed if the program is to continue using CUDA.
     */
    CUDA_ERROR_ASSERT = 710,

    /**
     * This error indicates that the hardware resources required to enable
     * peer access have been exhausted for one or more of the devices
     * passed to ::cuCtxEnablePeerAccess().
     */
    CUDA_ERROR_TOO_MANY_PEERS = 711,

    /**
     * This error indicates that the memory range passed to ::cuMemHostRegister()
     * has already been registered.
     */
    CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712,

    /**
     * This error indicates that the pointer passed to ::cuMemHostUnregister()
     * does not correspond to any currently registered memory region.
     */
    CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713,

    /**
     * This indicates that an unknown internal error has occurred.
     */
    CUDA_ERROR_UNKNOWN = 999
} CUresult;

/*
 * Calling-convention attribute applied to CUDA callback function pointers:
 * __stdcall when any of the Windows platform macros below is defined,
 * empty on every other platform.
 */
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#define CUDA_CB __stdcall
#else
#define CUDA_CB
#endif
|
||
|
|
||
|
/**
|
||
|
* CUDA stream callback
|
||
|
* \param hStream The stream the callback was added to, as passed to ::cuStreamAddCallback. May be NULL.
|
||
|
* \param status ::CUDA_SUCCESS or any persistent error on the stream.
|
||
|
* \param userData User parameter provided at registration.
|
||
|
*/
|
||
|
typedef void (CUDA_CB *CUstreamCallback)(CUstream hStream, CUresult status, void *userData);
|
||
|
|
||
|
#if __CUDA_API_VERSION >= 4000
|
||
|
/**
 * If set, host memory is portable between CUDA contexts.
 * Flag for ::cuMemHostAlloc()
 */
#define CU_MEMHOSTALLOC_PORTABLE        0x01

/**
 * If set, host memory is mapped into CUDA address space and
 * ::cuMemHostGetDevicePointer() may be called on the host pointer.
 * Flag for ::cuMemHostAlloc()
 */
#define CU_MEMHOSTALLOC_DEVICEMAP       0x02

/**
 * If set, host memory is allocated as write-combined - fast to write,
 * faster to DMA, slow to read except via SSE4 streaming load instruction
 * (MOVNTDQA).
 * Flag for ::cuMemHostAlloc()
 */
#define CU_MEMHOSTALLOC_WRITECOMBINED   0x04

/**
 * If set, host memory is portable between CUDA contexts.
 * Flag for ::cuMemHostRegister()
 */
#define CU_MEMHOSTREGISTER_PORTABLE     0x01

/**
 * If set, host memory is mapped into CUDA address space and
 * ::cuMemHostGetDevicePointer() may be called on the host pointer.
 * Flag for ::cuMemHostRegister()
 */
#define CU_MEMHOSTREGISTER_DEVICEMAP    0x02

/**
 * If set, peer memory is mapped into CUDA address space and
 * ::cuMemPeerGetDevicePointer() may be called on the host pointer.
 * Flag for ::cuMemPeerRegister()
 */
#define CU_MEMPEERREGISTER_DEVICEMAP    0x02
|
||
|
#endif
|
||
|
|
||
|
#if __CUDA_API_VERSION >= 3020
|
||
|
|
||
|
/**
|
||
|
* 2D memory copy parameters
|
||
|
*/
|
||
|
typedef struct CUDA_MEMCPY2D_st
|
||
|
{
|
||
|
size_t srcXInBytes; /**< Source X in bytes */
|
||
|
size_t srcY; /**< Source Y */
|
||
|
|
||
|
CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
|
||
|
const void *srcHost; /**< Source host pointer */
|
||
|
CUdeviceptr srcDevice; /**< Source device pointer */
|
||
|
CUarray srcArray; /**< Source array reference */
|
||
|
size_t srcPitch; /**< Source pitch (ignored when src is array) */
|
||
|
|
||
|
size_t dstXInBytes; /**< Destination X in bytes */
|
||
|
size_t dstY; /**< Destination Y */
|
||
|
|
||
|
CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
|
||
|
void *dstHost; /**< Destination host pointer */
|
||
|
CUdeviceptr dstDevice; /**< Destination device pointer */
|
||
|
CUarray dstArray; /**< Destination array reference */
|
||
|
size_t dstPitch; /**< Destination pitch (ignored when dst is array) */
|
||
|
|
||
|
size_t WidthInBytes; /**< Width of 2D memory copy in bytes */
|
||
|
size_t Height; /**< Height of 2D memory copy */
|
||
|
} CUDA_MEMCPY2D;
|
||
|
|
||
|
/**
|
||
|
* 3D memory copy parameters
|
||
|
*/
|
||
|
typedef struct CUDA_MEMCPY3D_st
|
||
|
{
|
||
|
size_t srcXInBytes; /**< Source X in bytes */
|
||
|
size_t srcY; /**< Source Y */
|
||
|
size_t srcZ; /**< Source Z */
|
||
|
size_t srcLOD; /**< Source LOD */
|
||
|
CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
|
||
|
const void *srcHost; /**< Source host pointer */
|
||
|
CUdeviceptr srcDevice; /**< Source device pointer */
|
||
|
CUarray srcArray; /**< Source array reference */
|
||
|
void *reserved0; /**< Must be NULL */
|
||
|
size_t srcPitch; /**< Source pitch (ignored when src is array) */
|
||
|
size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */
|
||
|
|
||
|
size_t dstXInBytes; /**< Destination X in bytes */
|
||
|
size_t dstY; /**< Destination Y */
|
||
|
size_t dstZ; /**< Destination Z */
|
||
|
size_t dstLOD; /**< Destination LOD */
|
||
|
CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
|
||
|
void *dstHost; /**< Destination host pointer */
|
||
|
CUdeviceptr dstDevice; /**< Destination device pointer */
|
||
|
CUarray dstArray; /**< Destination array reference */
|
||
|
void *reserved1; /**< Must be NULL */
|
||
|
size_t dstPitch; /**< Destination pitch (ignored when dst is array) */
|
||
|
size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
|
||
|
|
||
|
size_t WidthInBytes; /**< Width of 3D memory copy in bytes */
|
||
|
size_t Height; /**< Height of 3D memory copy */
|
||
|
size_t Depth; /**< Depth of 3D memory copy */
|
||
|
} CUDA_MEMCPY3D;
|
||
|
|
||
|
/**
|
||
|
* 3D memory cross-context copy parameters
|
||
|
*/
|
||
|
typedef struct CUDA_MEMCPY3D_PEER_st
|
||
|
{
|
||
|
size_t srcXInBytes; /**< Source X in bytes */
|
||
|
size_t srcY; /**< Source Y */
|
||
|
size_t srcZ; /**< Source Z */
|
||
|
size_t srcLOD; /**< Source LOD */
|
||
|
CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
|
||
|
const void *srcHost; /**< Source host pointer */
|
||
|
CUdeviceptr srcDevice; /**< Source device pointer */
|
||
|
CUarray srcArray; /**< Source array reference */
|
||
|
CUcontext srcContext; /**< Source context (ignored with srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */
|
||
|
size_t srcPitch; /**< Source pitch (ignored when src is array) */
|
||
|
size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */
|
||
|
|
||
|
size_t dstXInBytes; /**< Destination X in bytes */
|
||
|
size_t dstY; /**< Destination Y */
|
||
|
size_t dstZ; /**< Destination Z */
|
||
|
size_t dstLOD; /**< Destination LOD */
|
||
|
CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
|
||
|
void *dstHost; /**< Destination host pointer */
|
||
|
CUdeviceptr dstDevice; /**< Destination device pointer */
|
||
|
CUarray dstArray; /**< Destination array reference */
|
||
|
CUcontext dstContext; /**< Destination context (ignored with dstMemoryType is ::CU_MEMORYTYPE_ARRAY) */
|
||
|
size_t dstPitch; /**< Destination pitch (ignored when dst is array) */
|
||
|
size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
|
||
|
|
||
|
size_t WidthInBytes; /**< Width of 3D memory copy in bytes */
|
||
|
size_t Height; /**< Height of 3D memory copy */
|
||
|
size_t Depth; /**< Depth of 3D memory copy */
|
||
|
} CUDA_MEMCPY3D_PEER;
|
||
|
|
||
|
/**
|
||
|
* Array descriptor
|
||
|
*/
|
||
|
typedef struct CUDA_ARRAY_DESCRIPTOR_st
|
||
|
{
|
||
|
size_t Width; /**< Width of array */
|
||
|
size_t Height; /**< Height of array */
|
||
|
|
||
|
CUarray_format Format; /**< Array format */
|
||
|
unsigned int NumChannels; /**< Channels per array element */
|
||
|
} CUDA_ARRAY_DESCRIPTOR;
|
||
|
|
||
|
/**
|
||
|
* 3D array descriptor
|
||
|
*/
|
||
|
typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
|
||
|
{
|
||
|
size_t Width; /**< Width of 3D array */
|
||
|
size_t Height; /**< Height of 3D array */
|
||
|
size_t Depth; /**< Depth of 3D array */
|
||
|
|
||
|
CUarray_format Format; /**< Array format */
|
||
|
unsigned int NumChannels; /**< Channels per array element */
|
||
|
unsigned int Flags; /**< Flags */
|
||
|
} CUDA_ARRAY3D_DESCRIPTOR;
|
||
|
|
||
|
#endif /* __CUDA_API_VERSION >= 3020 */
|
||
|
|
||
|
#if __CUDA_API_VERSION >= 5000
|
||
|
/**
|
||
|
* CUDA Resource descriptor
|
||
|
*/
|
||
|
typedef struct CUDA_RESOURCE_DESC_st
|
||
|
{
|
||
|
CUresourcetype resType; /**< Resource type */
|
||
|
|
||
|
union
|
||
|
{
|
||
|
struct
|
||
|
{
|
||
|
CUarray hArray; /**< CUDA array */
|
||
|
} array;
|
||
|
struct
|
||
|
{
|
||
|
CUmipmappedArray hMipmappedArray; /**< CUDA mipmapped array */
|
||
|
} mipmap;
|
||
|
struct
|
||
|
{
|
||
|
CUdeviceptr devPtr; /**< Device pointer */
|
||
|
CUarray_format format; /**< Array format */
|
||
|
unsigned int numChannels; /**< Channels per array element */
|
||
|
size_t sizeInBytes; /**< Size in bytes */
|
||
|
} linear;
|
||
|
struct
|
||
|
{
|
||
|
CUdeviceptr devPtr; /**< Device pointer */
|
||
|
CUarray_format format; /**< Array format */
|
||
|
unsigned int numChannels; /**< Channels per array element */
|
||
|
size_t width; /**< Width of the array in elements */
|
||
|
size_t height; /**< Height of the array in elements */
|
||
|
size_t pitchInBytes; /**< Pitch between two rows in bytes */
|
||
|
} pitch2D;
|
||
|
struct
|
||
|
{
|
||
|
int reserved[32];
|
||
|
} __reserved;
|
||
|
} res;
|
||
|
|
||
|
unsigned int flags; /**< Flags (must be zero) */
|
||
|
} CUDA_RESOURCE_DESC;
|
||
|
|
||
|
/**
|
||
|
* Texture descriptor
|
||
|
*/
|
||
|
typedef struct CUDA_TEXTURE_DESC_st
|
||
|
{
|
||
|
CUaddress_mode addressMode[3]; /**< Address modes */
|
||
|
CUfilter_mode filterMode; /**< Filter mode */
|
||
|
unsigned int flags; /**< Flags */
|
||
|
unsigned int maxAnisotropy; /**< Maximum anistropy ratio */
|
||
|
CUfilter_mode mipmapFilterMode; /**< Mipmap filter mode */
|
||
|
float mipmapLevelBias; /**< Mipmap level bias */
|
||
|
float minMipmapLevelClamp; /**< Mipmap minimum level clamp */
|
||
|
float maxMipmapLevelClamp; /**< Mipmap maximum level clamp */
|
||
|
int _reserved[16];
|
||
|
} CUDA_TEXTURE_DESC;
|
||
|
|
||
|
/**
 * Resource view format
 *
 * Enumerators are numbered consecutively from 0x00 to 0x22.
 */
typedef enum CUresourceViewFormat_enum
{
    CU_RES_VIEW_FORMAT_NONE          = 0x00, /**< No resource view format (use underlying resource format) */

    /* 8-bit integer formats */
    CU_RES_VIEW_FORMAT_UINT_1X8      = 0x01, /**< 1 channel unsigned 8-bit integers */
    CU_RES_VIEW_FORMAT_UINT_2X8      = 0x02, /**< 2 channel unsigned 8-bit integers */
    CU_RES_VIEW_FORMAT_UINT_4X8      = 0x03, /**< 4 channel unsigned 8-bit integers */
    CU_RES_VIEW_FORMAT_SINT_1X8      = 0x04, /**< 1 channel signed 8-bit integers */
    CU_RES_VIEW_FORMAT_SINT_2X8      = 0x05, /**< 2 channel signed 8-bit integers */
    CU_RES_VIEW_FORMAT_SINT_4X8      = 0x06, /**< 4 channel signed 8-bit integers */

    /* 16-bit integer formats */
    CU_RES_VIEW_FORMAT_UINT_1X16     = 0x07, /**< 1 channel unsigned 16-bit integers */
    CU_RES_VIEW_FORMAT_UINT_2X16     = 0x08, /**< 2 channel unsigned 16-bit integers */
    CU_RES_VIEW_FORMAT_UINT_4X16     = 0x09, /**< 4 channel unsigned 16-bit integers */
    CU_RES_VIEW_FORMAT_SINT_1X16     = 0x0a, /**< 1 channel signed 16-bit integers */
    CU_RES_VIEW_FORMAT_SINT_2X16     = 0x0b, /**< 2 channel signed 16-bit integers */
    CU_RES_VIEW_FORMAT_SINT_4X16     = 0x0c, /**< 4 channel signed 16-bit integers */

    /* 32-bit integer formats */
    CU_RES_VIEW_FORMAT_UINT_1X32     = 0x0d, /**< 1 channel unsigned 32-bit integers */
    CU_RES_VIEW_FORMAT_UINT_2X32     = 0x0e, /**< 2 channel unsigned 32-bit integers */
    CU_RES_VIEW_FORMAT_UINT_4X32     = 0x0f, /**< 4 channel unsigned 32-bit integers */
    CU_RES_VIEW_FORMAT_SINT_1X32     = 0x10, /**< 1 channel signed 32-bit integers */
    CU_RES_VIEW_FORMAT_SINT_2X32     = 0x11, /**< 2 channel signed 32-bit integers */
    CU_RES_VIEW_FORMAT_SINT_4X32     = 0x12, /**< 4 channel signed 32-bit integers */

    /* floating point formats */
    CU_RES_VIEW_FORMAT_FLOAT_1X16    = 0x13, /**< 1 channel 16-bit floating point */
    CU_RES_VIEW_FORMAT_FLOAT_2X16    = 0x14, /**< 2 channel 16-bit floating point */
    CU_RES_VIEW_FORMAT_FLOAT_4X16    = 0x15, /**< 4 channel 16-bit floating point */
    CU_RES_VIEW_FORMAT_FLOAT_1X32    = 0x16, /**< 1 channel 32-bit floating point */
    CU_RES_VIEW_FORMAT_FLOAT_2X32    = 0x17, /**< 2 channel 32-bit floating point */
    CU_RES_VIEW_FORMAT_FLOAT_4X32    = 0x18, /**< 4 channel 32-bit floating point */

    /* block compressed formats */
    CU_RES_VIEW_FORMAT_UNSIGNED_BC1  = 0x19, /**< Block compressed 1 */
    CU_RES_VIEW_FORMAT_UNSIGNED_BC2  = 0x1a, /**< Block compressed 2 */
    CU_RES_VIEW_FORMAT_UNSIGNED_BC3  = 0x1b, /**< Block compressed 3 */
    CU_RES_VIEW_FORMAT_UNSIGNED_BC4  = 0x1c, /**< Block compressed 4 unsigned */
    CU_RES_VIEW_FORMAT_SIGNED_BC4    = 0x1d, /**< Block compressed 4 signed */
    CU_RES_VIEW_FORMAT_UNSIGNED_BC5  = 0x1e, /**< Block compressed 5 unsigned */
    CU_RES_VIEW_FORMAT_SIGNED_BC5    = 0x1f, /**< Block compressed 5 signed */
    CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */
    CU_RES_VIEW_FORMAT_SIGNED_BC6H   = 0x21, /**< Block compressed 6 signed half-float */
    CU_RES_VIEW_FORMAT_UNSIGNED_BC7  = 0x22  /**< Block compressed 7 */
} CUresourceViewFormat;
|
||
|
|
||
|
/**
|
||
|
* Resource view descriptor
|
||
|
*/
|
||
|
typedef struct CUDA_RESOURCE_VIEW_DESC_st
|
||
|
{
|
||
|
CUresourceViewFormat format; /**< Resource view format */
|
||
|
size_t width; /**< Width of the resource view */
|
||
|
size_t height; /**< Height of the resource view */
|
||
|
size_t depth; /**< Depth of the resource view */
|
||
|
unsigned int firstMipmapLevel; /**< First defined mipmap level */
|
||
|
unsigned int lastMipmapLevel; /**< Last defined mipmap level */
|
||
|
unsigned int firstLayer; /**< First layer index */
|
||
|
unsigned int lastLayer; /**< Last layer index */
|
||
|
unsigned int _reserved[16];
|
||
|
} CUDA_RESOURCE_VIEW_DESC;
|
||
|
|
||
|
/**
 * GPU Direct v3 tokens
 *
 * A 64-bit (unsigned long long) token followed by an unsigned int token.
 */
typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st
{
    unsigned long long p2pToken;   /* NOTE(review): meaning not shown in this header; presumably the peer-to-peer token */
    unsigned int vaSpaceToken;     /* NOTE(review): meaning not shown in this header; presumably the VA-space token */
} CUDA_POINTER_ATTRIBUTE_P2P_TOKENS;
|
||
|
#endif
|
||
|
|
||
|
|
||
|
|
||
|
/**
 * If set, the CUDA array is a collection of layers, where each layer is either a 1D
 * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number
 * of layers, not the depth of a 3D array.
 */
#define CUDA_ARRAY3D_LAYERED        0x01

/**
 * Deprecated, use CUDA_ARRAY3D_LAYERED
 */
#define CUDA_ARRAY3D_2DARRAY        0x01

/**
 * This flag must be set in order to bind a surface reference
 * to the CUDA array
 */
#define CUDA_ARRAY3D_SURFACE_LDST   0x02

/**
 * Override the texref format with a format inferred from the array.
 * Flag for ::cuTexRefSetArray()
 */
#define CU_TRSA_OVERRIDE_FORMAT 0x01

/**
 * Read the texture as integers rather than promoting the values to floats
 * in the range [0,1].
 * Flag for ::cuTexRefSetFlags()
 */
#define CU_TRSF_READ_AS_INTEGER         0x01

/**
 * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
 * Flag for ::cuTexRefSetFlags()
 */
#define CU_TRSF_NORMALIZED_COORDINATES  0x02

/**
 * Perform sRGB->linear conversion during texture read.
 * Flag for ::cuTexRefSetFlags()
 */
#define CU_TRSF_SRGB  0x10

/**
 * End of array terminator for the \p extra parameter to
 * ::cuLaunchKernel
 */
#define CU_LAUNCH_PARAM_END            ((void*)0x00)

/**
 * Indicator that the next value in the \p extra parameter to
 * ::cuLaunchKernel will be a pointer to a buffer containing all kernel
 * parameters used for launching kernel \p f.  This buffer needs to
 * honor all alignment/padding requirements of the individual parameters.
 * If ::CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the
 * \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no
 * effect.
 */
#define CU_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01)

/**
 * Indicator that the next value in the \p extra parameter to
 * ::cuLaunchKernel will be a pointer to a size_t which contains the
 * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER.
 * It is required that ::CU_LAUNCH_PARAM_BUFFER_POINTER also be specified
 * in the \p extra array if the value associated with
 * ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero.
 */
#define CU_LAUNCH_PARAM_BUFFER_SIZE    ((void*)0x02)

/**
 * For texture references loaded into the module, use default texunit from
 * texture reference.
 *
 * The replacement list is deliberately left as the bare tokens `-1`: this
 * macro is defined a second time further down in this header, and both
 * definitions must stay token-identical to remain a legal redefinition.
 */
#define CU_PARAM_TR_DEFAULT -1
|
||
|
|
||
|
/**
 * CUDA API made obsolete at API version 3020
 *
 * When __CUDA_API_VERSION_INTERNAL is defined, the names below are mapped
 * onto their _v1 counterparts, so the legacy declarations that follow in
 * this header are emitted under the _v1 names.
 */
#if defined(__CUDA_API_VERSION_INTERNAL)
#define CUdeviceptr                  CUdeviceptr_v1
#define CUDA_MEMCPY2D_st             CUDA_MEMCPY2D_v1_st
#define CUDA_MEMCPY2D                CUDA_MEMCPY2D_v1
#define CUDA_MEMCPY3D_st             CUDA_MEMCPY3D_v1_st
#define CUDA_MEMCPY3D                CUDA_MEMCPY3D_v1
#define CUDA_ARRAY_DESCRIPTOR_st     CUDA_ARRAY_DESCRIPTOR_v1_st
#define CUDA_ARRAY_DESCRIPTOR        CUDA_ARRAY_DESCRIPTOR_v1
#define CUDA_ARRAY3D_DESCRIPTOR_st   CUDA_ARRAY3D_DESCRIPTOR_v1_st
#define CUDA_ARRAY3D_DESCRIPTOR      CUDA_ARRAY3D_DESCRIPTOR_v1
#endif /* __CUDA_API_VERSION_INTERNAL */
|
||
|
|
||
|
#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 3020
|
||
|
|
||
|
/*
 * Legacy device pointer: a plain unsigned int.  Only declared when
 * __CUDA_API_VERSION_INTERNAL is defined or __CUDA_API_VERSION < 3020
 * (see the enclosing #if).
 */
typedef unsigned int CUdeviceptr;
|
||
|
|
||
|
typedef struct CUDA_MEMCPY2D_st
|
||
|
{
|
||
|
unsigned int srcXInBytes; /**< Source X in bytes */
|
||
|
unsigned int srcY; /**< Source Y */
|
||
|
CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
|
||
|
const void *srcHost; /**< Source host pointer */
|
||
|
CUdeviceptr srcDevice; /**< Source device pointer */
|
||
|
CUarray srcArray; /**< Source array reference */
|
||
|
unsigned int srcPitch; /**< Source pitch (ignored when src is array) */
|
||
|
|
||
|
unsigned int dstXInBytes; /**< Destination X in bytes */
|
||
|
unsigned int dstY; /**< Destination Y */
|
||
|
CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
|
||
|
void *dstHost; /**< Destination host pointer */
|
||
|
CUdeviceptr dstDevice; /**< Destination device pointer */
|
||
|
CUarray dstArray; /**< Destination array reference */
|
||
|
unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */
|
||
|
|
||
|
unsigned int WidthInBytes; /**< Width of 2D memory copy in bytes */
|
||
|
unsigned int Height; /**< Height of 2D memory copy */
|
||
|
} CUDA_MEMCPY2D;
|
||
|
|
||
|
typedef struct CUDA_MEMCPY3D_st
|
||
|
{
|
||
|
unsigned int srcXInBytes; /**< Source X in bytes */
|
||
|
unsigned int srcY; /**< Source Y */
|
||
|
unsigned int srcZ; /**< Source Z */
|
||
|
unsigned int srcLOD; /**< Source LOD */
|
||
|
CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
|
||
|
const void *srcHost; /**< Source host pointer */
|
||
|
CUdeviceptr srcDevice; /**< Source device pointer */
|
||
|
CUarray srcArray; /**< Source array reference */
|
||
|
void *reserved0; /**< Must be NULL */
|
||
|
unsigned int srcPitch; /**< Source pitch (ignored when src is array) */
|
||
|
unsigned int srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */
|
||
|
|
||
|
unsigned int dstXInBytes; /**< Destination X in bytes */
|
||
|
unsigned int dstY; /**< Destination Y */
|
||
|
unsigned int dstZ; /**< Destination Z */
|
||
|
unsigned int dstLOD; /**< Destination LOD */
|
||
|
CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
|
||
|
void *dstHost; /**< Destination host pointer */
|
||
|
CUdeviceptr dstDevice; /**< Destination device pointer */
|
||
|
CUarray dstArray; /**< Destination array reference */
|
||
|
void *reserved1; /**< Must be NULL */
|
||
|
unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */
|
||
|
unsigned int dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
|
||
|
|
||
|
unsigned int WidthInBytes; /**< Width of 3D memory copy in bytes */
|
||
|
unsigned int Height; /**< Height of 3D memory copy */
|
||
|
unsigned int Depth; /**< Depth of 3D memory copy */
|
||
|
} CUDA_MEMCPY3D;
|
||
|
|
||
|
typedef struct CUDA_ARRAY_DESCRIPTOR_st
|
||
|
{
|
||
|
unsigned int Width; /**< Width of array */
|
||
|
unsigned int Height; /**< Height of array */
|
||
|
|
||
|
CUarray_format Format; /**< Array format */
|
||
|
unsigned int NumChannels; /**< Channels per array element */
|
||
|
} CUDA_ARRAY_DESCRIPTOR;
|
||
|
|
||
|
typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
|
||
|
{
|
||
|
unsigned int Width; /**< Width of 3D array */
|
||
|
unsigned int Height; /**< Height of 3D array */
|
||
|
unsigned int Depth; /**< Depth of 3D array */
|
||
|
|
||
|
CUarray_format Format; /**< Array format */
|
||
|
unsigned int NumChannels; /**< Channels per array element */
|
||
|
unsigned int Flags; /**< Flags */
|
||
|
} CUDA_ARRAY3D_DESCRIPTOR;
|
||
|
|
||
|
#endif /* (__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 3020 */
|
||
|
|
||
|
/*
 * NOTE: every macro in this group is also defined earlier in this header
 * with the same replacement list.  The values here must stay token-identical
 * to those earlier definitions so the repetition remains a legal
 * redefinition.
 */

/**
 * If set, the CUDA array contains an array of 2D slices
 * and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies
 * the number of slices, not the depth of a 3D array.
 */
#define CUDA_ARRAY3D_2DARRAY        0x01

/**
 * This flag must be set in order to bind a surface reference
 * to the CUDA array
 */
#define CUDA_ARRAY3D_SURFACE_LDST   0x02

/**
 * Override the texref format with a format inferred from the array.
 * Flag for ::cuTexRefSetArray()
 */
#define CU_TRSA_OVERRIDE_FORMAT 0x01

/**
 * Read the texture as integers rather than promoting the values to floats
 * in the range [0,1].
 * Flag for ::cuTexRefSetFlags()
 */
#define CU_TRSF_READ_AS_INTEGER         0x01

/**
 * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
 * Flag for ::cuTexRefSetFlags()
 */
#define CU_TRSF_NORMALIZED_COORDINATES  0x02

/**
 * Perform sRGB->linear conversion during texture read.
 * Flag for ::cuTexRefSetFlags()
 */
#define CU_TRSF_SRGB  0x10

/**
 * For texture references loaded into the module, use default texunit from
 * texture reference.
 */
#define CU_PARAM_TR_DEFAULT -1
|
||
|
|
||
|
/** @} */ /* END CUDA_TYPES */
|
||
|
|
||
|
/*
 * Calling-convention attribute used by every driver API function type
 * declared below: __stdcall when any of the Windows platform macros is
 * defined, empty on every other platform.
 */
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#define CUDAAPI __stdcall
#else
#define CUDAAPI
#endif
|
||
|
|
||
|
/**
|
||
|
* \defgroup CUDA_INITIALIZE Initialization
|
||
|
*
|
||
|
* This section describes the initialization functions of the low-level CUDA
|
||
|
* driver application programming interface.
|
||
|
*
|
||
|
* @{
|
||
|
*/
|
||
|
|
||
|
/*********************************
|
||
|
** Initialization
|
||
|
*********************************/
|
||
|
typedef CUresult CUDAAPI tcuInit(unsigned int Flags);
|
||
|
|
||
|
/*********************************
|
||
|
** Driver Version Query
|
||
|
*********************************/
|
||
|
typedef CUresult CUDAAPI tcuDriverGetVersion(int *driverVersion);
|
||
|
|
||
|
/************************************
|
||
|
**
|
||
|
** Device management
|
||
|
**
|
||
|
***********************************/
|
||
|
|
||
|
typedef CUresult CUDAAPI tcuDeviceGet(CUdevice *device, int ordinal);
|
||
|
typedef CUresult CUDAAPI tcuDeviceGetCount(int *count);
|
||
|
typedef CUresult CUDAAPI tcuDeviceGetName(char *name, int len, CUdevice dev);
|
||
|
typedef CUresult CUDAAPI tcuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
|
||
|
#if __CUDA_API_VERSION >= 3020
|
||
|
typedef CUresult CUDAAPI tcuDeviceTotalMem(size_t *bytes, CUdevice dev);
|
||
|
#else
|
||
|
typedef CUresult CUDAAPI tcuDeviceTotalMem(unsigned int *bytes, CUdevice dev);
|
||
|
#endif
|
||
|
|
||
|
typedef CUresult CUDAAPI tcuDeviceGetProperties(CUdevprop *prop, CUdevice dev);
|
||
|
typedef CUresult CUDAAPI tcuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
|
||
|
typedef CUresult CUDAAPI tcuGetErrorString(CUresult error, const char **pStr);
|
||
|
|
||
|
/************************************
|
||
|
**
|
||
|
** Context management
|
||
|
**
|
||
|
***********************************/
|
||
|
|
||
|
typedef CUresult CUDAAPI tcuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
|
||
|
typedef CUresult CUDAAPI tcuCtxDestroy(CUcontext ctx);
|
||
|
typedef CUresult CUDAAPI tcuCtxAttach(CUcontext *pctx, unsigned int flags);
|
||
|
typedef CUresult CUDAAPI tcuCtxDetach(CUcontext ctx);
|
||
|
typedef CUresult CUDAAPI tcuCtxPushCurrent(CUcontext ctx);
|
||
|
typedef CUresult CUDAAPI tcuCtxPopCurrent(CUcontext *pctx);
|
||
|
|
||
|
typedef CUresult CUDAAPI tcuCtxSetCurrent(CUcontext ctx);
|
||
|
typedef CUresult CUDAAPI tcuCtxGetCurrent(CUcontext *pctx);
|
||
|
|
||
|
typedef CUresult CUDAAPI tcuCtxGetDevice(CUdevice *device);
|
||
|
typedef CUresult CUDAAPI tcuCtxSynchronize(void);
|
||
|
|
||
|
|
||
|
/************************************
|
||
|
**
|
||
|
** Module management
|
||
|
**
|
||
|
***********************************/
|
||
|
|
||
|
typedef CUresult CUDAAPI tcuModuleLoad(CUmodule *module, const char *fname);
|
||
|
typedef CUresult CUDAAPI tcuModuleLoadData(CUmodule *module, const void *image);
|
||
|
typedef CUresult CUDAAPI tcuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
|
||
|
typedef CUresult CUDAAPI tcuModuleLoadFatBinary(CUmodule *module, const void *fatCubin);
|
||
|
typedef CUresult CUDAAPI tcuModuleUnload(CUmodule hmod);
|
||
|
typedef CUresult CUDAAPI tcuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
|
||
|
|
||
|
#if __CUDA_API_VERSION >= 3020
|
||
|
typedef CUresult CUDAAPI tcuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name);
|
||
|
#else
|
||
|
typedef CUresult CUDAAPI tcuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
|
||
|
#endif
|
||
|
|
||
|
typedef CUresult CUDAAPI tcuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name);
|
||
|
typedef CUresult CUDAAPI tcuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name);
|
||
|
|
||
|
/************************************
|
||
|
**
|
||
|
** Memory management
|
||
|
**
|
||
|
***********************************/
|
||
|
#if __CUDA_API_VERSION >= 3020
|
||
|
typedef CUresult CUDAAPI tcuMemGetInfo(size_t *free, size_t *total);
|
||
|
typedef CUresult CUDAAPI tcuMemAlloc(CUdeviceptr *dptr, size_t bytesize);
|
||
|
typedef CUresult CUDAAPI tcuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr);
|
||
|
typedef CUresult CUDAAPI tcuMemAllocPitch(CUdeviceptr *dptr,
|
||
|
size_t *pPitch,
|
||
|
size_t WidthInBytes,
|
||
|
size_t Height,
|
||
|
// size of biggest r/w to be performed by kernels on this memory
|
||
|
// 4, 8 or 16 bytes
|
||
|
unsigned int ElementSizeBytes
|
||
|
);
|
||
|
#else
|
||
|
typedef CUresult CUDAAPI tcuMemGetInfo(unsigned int *free, unsigned int *total);
|
||
|
typedef CUresult CUDAAPI tcuMemAlloc(CUdeviceptr *dptr, unsigned int bytesize);
|
||
|
typedef CUresult CUDAAPI tcuMemGetAddressRange(CUdeviceptr *pbase, unsigned int *psize, CUdeviceptr dptr);
|
||
|
typedef CUresult CUDAAPI tcuMemAllocPitch(CUdeviceptr *dptr,
|
||
|
unsigned int *pPitch,
|
||
|
unsigned int WidthInBytes,
|
||
|
unsigned int Height,
|
||
|
// size of biggest r/w to be performed by kernels on this memory
|
||
|
// 4, 8 or 16 bytes
|
||
|
unsigned int ElementSizeBytes
|
||
|
);
|
||
|
#endif
|
||
|
|
||
|
typedef CUresult CUDAAPI tcuMemFree(CUdeviceptr dptr);
|
||
|
|
||
|
#if __CUDA_API_VERSION >= 3020
|
||
|
typedef CUresult CUDAAPI tcuMemAllocHost(void **pp, size_t bytesize);
|
||
|
typedef CUresult CUDAAPI tcuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags);
|
||
|
#else
|
||
|
typedef CUresult CUDAAPI tcuMemAllocHost(void **pp, unsigned int bytesize);
|
||
|
#endif
|
||
|
|
||
|
typedef CUresult CUDAAPI tcuMemFreeHost(void *p);
|
||
|
typedef CUresult CUDAAPI tcuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags);
|
||
|
|
||
|
typedef CUresult CUDAAPI tcuMemHostGetFlags(unsigned int *pFlags, void *p);
|
||
|
|
||
|
#if __CUDA_API_VERSION >= 4010
|
||
|
/**
 * Interprocess Handles
 */
#define CU_IPC_HANDLE_SIZE 64

/* Event handle for interprocess use: CU_IPC_HANDLE_SIZE reserved bytes. */
typedef struct CUipcEventHandle_st
{
    char reserved[CU_IPC_HANDLE_SIZE];
} CUipcEventHandle;

/* Memory handle for interprocess use: CU_IPC_HANDLE_SIZE reserved bytes. */
typedef struct CUipcMemHandle_st
{
    char reserved[CU_IPC_HANDLE_SIZE];
} CUipcMemHandle;

/* Flags for interprocess memory handles. */
typedef enum CUipcMem_flags_enum
{
    CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 /**< Automatically enable peer access between remote devices as needed */
} CUipcMem_flags;
|
||
|
|
||
|
typedef CUresult CUDAAPI tcuDeviceGetByPCIBusId(CUdevice *dev, char *pciBusId);
|
||
|
typedef CUresult CUDAAPI tcuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev);
|
||
|
typedef CUresult CUDAAPI tcuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event);
|
||
|
typedef CUresult CUDAAPI tcuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle);
|
||
|
typedef CUresult CUDAAPI tcuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr);
|
||
|
typedef CUresult CUDAAPI tcuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags);
|
||
|
typedef CUresult CUDAAPI tcuIpcCloseMemHandle(CUdeviceptr dptr);
|
||
|
#endif
|
||
|
|
||
|
typedef CUresult CUDAAPI tcuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
|
||
|
typedef CUresult CUDAAPI tcuMemHostUnregister(void *p);;
|
||
|
typedef CUresult CUDAAPI tcuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
|
||
|
typedef CUresult CUDAAPI tcuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount);
|
||
|
|
||
|
/************************************
|
||
|
**
|
||
|
** Synchronous Memcpy
|
||
|
**
|
||
|
** Intra-device memcpy's done with these functions may execute in parallel with the CPU,
|
||
|
** but if host memory is involved, they wait until the copy is done before returning.
|
||
|
**
|
||
|
***********************************/
|
||
|
|
||
|
// 1D functions
|
||
|
#if __CUDA_API_VERSION >= 3020
|
||
|
// system <-> device memory
|
||
|
typedef CUresult CUDAAPI tcuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
|
||
|
typedef CUresult CUDAAPI tcuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
|
||
|
|
||
|
// device <-> device memory
|
||
|
typedef CUresult CUDAAPI tcuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
|
||
|
|
||
|
// device <-> array memory
|
||
|
typedef CUresult CUDAAPI tcuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount);
|
||
|
typedef CUresult CUDAAPI tcuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
|
||
|
|
||
|
// system <-> array memory
|
||
|
typedef CUresult CUDAAPI tcuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
|
||
|
typedef CUresult CUDAAPI tcuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
|
||
|
|
||
|
// array <-> array memory
|
||
|
typedef CUresult CUDAAPI tcuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
|
||
|
#else
|
||
|
// system <-> device memory
|
||
|
typedef CUresult CUDAAPI tcuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount);
|
||
|
typedef CUresult CUDAAPI tcuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount);
|
||
|
|
||
|
// device <-> device memory
|
||
|
typedef CUresult CUDAAPI tcuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount);
|
||
|
|
||
|
// device <-> array memory
|
||
|
typedef CUresult CUDAAPI tcuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr srcDevice, unsigned int ByteCount);
|
||
|
typedef CUresult CUDAAPI tcuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
|
||
|
|
||
|
// system <-> array memory
|
||
|
typedef CUresult CUDAAPI tcuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
|
||
|
typedef CUresult CUDAAPI tcuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
|
||
|
|
||
|
// array <-> array memory
|
||
|
typedef CUresult CUDAAPI tcuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
|
||
|
#endif
|
||
|
|
||
|
// 2D memcpy
|
||
|
|
||
|
typedef CUresult CUDAAPI tcuMemcpy2D(const CUDA_MEMCPY2D *pCopy);
|
||
|
typedef CUresult CUDAAPI tcuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy);
|
||
|
|
||
|
// 3D memcpy
|
||
|
|
||
|
typedef CUresult CUDAAPI tcuMemcpy3D(const CUDA_MEMCPY3D *pCopy);
|
||
|
|
||
|
/************************************
|
||
|
**
|
||
|
** Asynchronous Memcpy
|
||
|
**
|
||
|
** Any host memory involved must be DMA'able (e.g., allocated with cuMemAllocHost).
|
||
|
** memcpy's done with these functions execute in parallel with the CPU and, if
|
||
|
** the hardware is available, may execute in parallel with the GPU.
|
||
|
** Asynchronous memcpy must be accompanied by appropriate stream synchronization.
|
||
|
**
|
||
|
***********************************/
|
||
|
|
||
|
// 1D functions
|
||
|
#if __CUDA_API_VERSION >= 3020
|
||
|
// system <-> device memory
|
||
|
typedef CUresult CUDAAPI tcuMemcpyHtoDAsync(CUdeviceptr dstDevice,
|
||
|
const void *srcHost, size_t ByteCount, CUstream hStream);
|
||
|
typedef CUresult CUDAAPI tcuMemcpyDtoHAsync(void *dstHost,
|
||
|
CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
|
||
|
|
||
|
// device <-> device memory
|
||
|
typedef CUresult CUDAAPI tcuMemcpyDtoDAsync(CUdeviceptr dstDevice,
|
||
|
CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
|
||
|
|
||
|
// system <-> array memory
|
||
|
typedef CUresult CUDAAPI tcuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset,
|
||
|
const void *srcHost, size_t ByteCount, CUstream hStream);
|
||
|
typedef CUresult CUDAAPI tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset,
|
||
|
size_t ByteCount, CUstream hStream);
|
||
|
|
||
|
#else
|
||
|
// system <-> device memory
|
||
|
typedef CUresult CUDAAPI tcuMemcpyHtoDAsync(CUdeviceptr dstDevice,
|
||
|
const void *srcHost, unsigned int ByteCount, CUstream hStream);
|
||
|
typedef CUresult CUDAAPI tcuMemcpyDtoHAsync(void *dstHost,
|
||
|
CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);
|
||
|
|
||
|
// device <-> device memory
|
||
|
typedef CUresult CUDAAPI tcuMemcpyDtoDAsync(CUdeviceptr dstDevice,
|
||
|
CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);
|
||
|
|
||
|
// system <-> array memory
|
||
|
typedef CUresult CUDAAPI tcuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset,
|
||
|
const void *srcHost, unsigned int ByteCount, CUstream hStream);
|
||
|
typedef CUresult CUDAAPI tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset,
|
||
|
unsigned int ByteCount, CUstream hStream);
|
||
|
#endif
|
||
|
|
||
|
// 2D memcpy
|
||
|
typedef CUresult CUDAAPI tcuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
|
||
|
|
||
|
// 3D memcpy
|
||
|
typedef CUresult CUDAAPI tcuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
|
||
|
|
||
|
/************************************
|
||
|
**
|
||
|
** Memset
|
||
|
**
|
||
|
***********************************/
|
||
|
typedef CUresult CUDAAPI tcuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, unsigned int N);
|
||
|
typedef CUresult CUDAAPI tcuMemsetD16(CUdeviceptr dstDevice, unsigned short us, unsigned int N);
|
||
|
typedef CUresult CUDAAPI tcuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, unsigned int N);
|
||
|
|
||
|
#if __CUDA_API_VERSION >= 3020
|
||
|
typedef CUresult CUDAAPI tcuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, size_t Width, size_t Height);
|
||
|
typedef CUresult CUDAAPI tcuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, size_t Width, size_t Height);
|
||
|
typedef CUresult CUDAAPI tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, size_t Width, size_t Height);
|
||
|
#else
|
||
|
typedef CUresult CUDAAPI tcuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
|
||
|
typedef CUresult CUDAAPI tcuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
|
||
|
typedef CUresult CUDAAPI tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
|
||
|
#endif
|
||
|
|
||
|
/************************************
|
||
|
**
|
||
|
** Function management
|
||
|
**
|
||
|
***********************************/
|
||
|
|
||
|
|
||
|
typedef CUresult CUDAAPI tcuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z);
|
||
|
typedef CUresult CUDAAPI tcuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes);
|
||
|
typedef CUresult CUDAAPI tcuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
|
||
|
typedef CUresult CUDAAPI tcuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
|
||
|
typedef CUresult CUDAAPI tcuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config);
|
||
|
|
||
|
typedef CUresult CUDAAPI tcuLaunchKernel(CUfunction f,
|
||
|
unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
|
||
|
unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
|
||
|
unsigned int sharedMemBytes,
|
||
|
CUstream hStream, void **kernelParams, void **extra);
|
||
|
|
||
|
/************************************
|
||
|
**
|
||
|
** Array management
|
||
|
**
|
||
|
***********************************/
|
||
|
|
||
|
typedef CUresult CUDAAPI tcuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray);
|
||
|
typedef CUresult CUDAAPI tcuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
|
||
|
typedef CUresult CUDAAPI tcuArrayDestroy(CUarray hArray);
|
||
|
|
||
|
typedef CUresult CUDAAPI tcuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray);
|
||
|
typedef CUresult CUDAAPI tcuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
|
||
|
|
||
|
#if __CUDA_API_VERSION >= 5000
|
||
|
typedef CUresult CUDAAPI tcuMipmappedArrayCreate(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc, unsigned int numMipmapLevels);
|
||
|
typedef CUresult CUDAAPI tcuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level);
|
||
|
typedef CUresult CUDAAPI tcuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray);
|
||
|
#endif
|
||
|
|
||
|
|
||
|
/************************************
|
||
|
**
|
||
|
** Texture reference management
|
||
|
**
|
||
|
***********************************/
|
||
|
typedef CUresult CUDAAPI tcuTexRefCreate(CUtexref *pTexRef);
|
||
|
typedef CUresult CUDAAPI tcuTexRefDestroy(CUtexref hTexRef);
|
||
|
|
||
|
typedef CUresult CUDAAPI tcuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags);
|
||
|
|
||
|
#if __CUDA_API_VERSION >= 3020
|
||
|
typedef CUresult CUDAAPI tcuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes);
|
||
|
typedef CUresult CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
|
||
|
#else
|
||
|
typedef CUresult CUDAAPI tcuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, unsigned int bytes);
|
||
|
typedef CUresult CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, unsigned int Pitch);
|
||
|
#endif
|
||
|
|
||
|
typedef CUresult CUDAAPI tcuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
|
||
|
typedef CUresult CUDAAPI tcuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am);
|
||
|
typedef CUresult CUDAAPI tcuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm);
|
||
|
typedef CUresult CUDAAPI tcuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags);
|
||
|
|
||
|
typedef CUresult CUDAAPI tcuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef);
|
||
|
typedef CUresult CUDAAPI tcuTexRefGetArray(CUarray *phArray, CUtexref hTexRef);
|
||
|
typedef CUresult CUDAAPI tcuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim);
|
||
|
typedef CUresult CUDAAPI tcuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
|
||
|
typedef CUresult CUDAAPI tcuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
|
||
|
typedef CUresult CUDAAPI tcuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef);
|
||
|
|
||
|
/************************************
|
||
|
**
|
||
|
** Surface reference management
|
||
|
**
|
||
|
***********************************/
|
||
|
|
||
|
typedef CUresult CUDAAPI tcuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
|
||
|
typedef CUresult CUDAAPI tcuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef);
|
||
|
|
||
|
/************************************
|
||
|
**
|
||
|
** Parameter management
|
||
|
**
|
||
|
***********************************/
|
||
|
|
||
|
typedef CUresult CUDAAPI tcuParamSetSize(CUfunction hfunc, unsigned int numbytes);
|
||
|
typedef CUresult CUDAAPI tcuParamSeti(CUfunction hfunc, int offset, unsigned int value);
|
||
|
typedef CUresult CUDAAPI tcuParamSetf(CUfunction hfunc, int offset, float value);
|
||
|
typedef CUresult CUDAAPI tcuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes);
|
||
|
typedef CUresult CUDAAPI tcuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef);
|
||
|
|
||
|
|
||
|
/************************************
|
||
|
**
|
||
|
** Launch functions
|
||
|
**
|
||
|
***********************************/
|
||
|
|
||
|
typedef CUresult CUDAAPI tcuLaunch(CUfunction f);
|
||
|
typedef CUresult CUDAAPI tcuLaunchGrid(CUfunction f, int grid_width, int grid_height);
|
||
|
typedef CUresult CUDAAPI tcuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream);
|
||
|
|
||
|
/************************************
|
||
|
**
|
||
|
** Events
|
||
|
**
|
||
|
***********************************/
|
||
|
typedef CUresult CUDAAPI tcuEventCreate(CUevent *phEvent, unsigned int Flags);
|
||
|
typedef CUresult CUDAAPI tcuEventRecord(CUevent hEvent, CUstream hStream);
|
||
|
typedef CUresult CUDAAPI tcuEventQuery(CUevent hEvent);
|
||
|
typedef CUresult CUDAAPI tcuEventSynchronize(CUevent hEvent);
|
||
|
typedef CUresult CUDAAPI tcuEventDestroy(CUevent hEvent);
|
||
|
typedef CUresult CUDAAPI tcuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);
|
||
|
|
||
|
/************************************
|
||
|
**
|
||
|
** Streams
|
||
|
**
|
||
|
***********************************/
|
||
|
typedef CUresult CUDAAPI tcuStreamCreate(CUstream *phStream, unsigned int Flags);
|
||
|
typedef CUresult CUDAAPI tcuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags);
|
||
|
typedef CUresult CUDAAPI tcuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
|
||
|
|
||
|
typedef CUresult CUDAAPI tcuStreamQuery(CUstream hStream);
|
||
|
typedef CUresult CUDAAPI tcuStreamSynchronize(CUstream hStream);
|
||
|
typedef CUresult CUDAAPI tcuStreamDestroy(CUstream hStream);
|
||
|
|
||
|
/************************************
|
||
|
**
|
||
|
** Graphics interop
|
||
|
**
|
||
|
***********************************/
|
||
|
typedef CUresult CUDAAPI tcuGraphicsUnregisterResource(CUgraphicsResource resource);
|
||
|
typedef CUresult CUDAAPI tcuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
|
||
|
|
||
|
#if __CUDA_API_VERSION >= 3020
|
||
|
typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource);
|
||
|
#else
|
||
|
typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
|
||
|
#endif
|
||
|
|
||
|
typedef CUresult CUDAAPI tcuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
|
||
|
typedef CUresult CUDAAPI tcuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
|
||
|
typedef CUresult CUDAAPI tcuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
|
||
|
|
||
|
/************************************
|
||
|
**
|
||
|
** Export tables
|
||
|
**
|
||
|
***********************************/
|
||
|
/* Export-table lookup signature: writes a const table pointer through
 * ppExportTable for the table identified by the given CUuuid. */
typedef CUresult CUDAAPI tcuGetExportTable(
    const void **ppExportTable,
    const CUuuid *pExportTableId);
|
||
|
|
||
|
/************************************
|
||
|
**
|
||
|
** Limits
|
||
|
**
|
||
|
***********************************/
|
||
|
|
||
|
typedef CUresult CUDAAPI tcuCtxSetLimit(CUlimit limit, size_t value);
|
||
|
typedef CUresult CUDAAPI tcuCtxGetLimit(size_t *pvalue, CUlimit limit);
|
||
|
typedef CUresult CUDAAPI tcuCtxGetCacheConfig(CUfunc_cache *pconfig);
|
||
|
typedef CUresult CUDAAPI tcuCtxSetCacheConfig(CUfunc_cache config);
|
||
|
typedef CUresult CUDAAPI tcuCtxGetSharedMemConfig(CUsharedconfig *pConfig);
|
||
|
typedef CUresult CUDAAPI tcuCtxSetSharedMemConfig(CUsharedconfig config);
|
||
|
typedef CUresult CUDAAPI tcuCtxGetApiVersion(CUcontext ctx, unsigned int *version);
|
||
|
|
||
|
/************************************
|
||
|
**
|
||
|
** Profiler
|
||
|
**
|
||
|
***********************************/
|
||
|
/* Profiler-stop signature: takes no arguments and returns a CUresult. */
typedef CUresult CUDAAPI tcuProfilerStop(
    void);
|
||
|
|
||
|
/************************************
|
||
|
************************************/
|
||
|
|
||
|
extern CUresult CUDAAPI cuInit(unsigned int, int cudaVersion);
|
||
|
|
||
|
extern tcuDriverGetVersion *cuDriverGetVersion;
|
||
|
extern tcuDeviceGet *cuDeviceGet;
|
||
|
extern tcuDeviceGetCount *cuDeviceGetCount;
|
||
|
extern tcuDeviceGetName *cuDeviceGetName;
|
||
|
extern tcuDeviceComputeCapability *cuDeviceComputeCapability;
|
||
|
extern tcuDeviceGetProperties *cuDeviceGetProperties;
|
||
|
extern tcuDeviceGetAttribute *cuDeviceGetAttribute;
|
||
|
extern tcuGetErrorString *cuGetErrorString;
|
||
|
extern tcuCtxDestroy *cuCtxDestroy;
|
||
|
extern tcuCtxAttach *cuCtxAttach;
|
||
|
extern tcuCtxDetach *cuCtxDetach;
|
||
|
extern tcuCtxPushCurrent *cuCtxPushCurrent;
|
||
|
extern tcuCtxPopCurrent *cuCtxPopCurrent;
|
||
|
|
||
|
extern tcuCtxSetCurrent *cuCtxSetCurrent;
|
||
|
extern tcuCtxGetCurrent *cuCtxGetCurrent;
|
||
|
|
||
|
extern tcuCtxGetDevice *cuCtxGetDevice;
|
||
|
extern tcuCtxSynchronize *cuCtxSynchronize;
|
||
|
extern tcuModuleLoad *cuModuleLoad;
|
||
|
extern tcuModuleLoadData *cuModuleLoadData;
|
||
|
extern tcuModuleLoadDataEx *cuModuleLoadDataEx;
|
||
|
extern tcuModuleLoadFatBinary *cuModuleLoadFatBinary;
|
||
|
extern tcuModuleUnload *cuModuleUnload;
|
||
|
extern tcuModuleGetFunction *cuModuleGetFunction;
|
||
|
extern tcuModuleGetTexRef *cuModuleGetTexRef;
|
||
|
extern tcuModuleGetSurfRef *cuModuleGetSurfRef;
|
||
|
extern tcuMemFreeHost *cuMemFreeHost;
|
||
|
extern tcuMemHostAlloc *cuMemHostAlloc;
|
||
|
extern tcuMemHostGetFlags *cuMemHostGetFlags;
|
||
|
|
||
|
extern tcuMemHostRegister *cuMemHostRegister;
|
||
|
extern tcuMemHostUnregister *cuMemHostUnregister;
|
||
|
extern tcuMemcpy *cuMemcpy;
|
||
|
extern tcuMemcpyPeer *cuMemcpyPeer;
|
||
|
|
||
|
extern tcuDeviceTotalMem *cuDeviceTotalMem;
|
||
|
extern tcuCtxCreate *cuCtxCreate;
|
||
|
extern tcuModuleGetGlobal *cuModuleGetGlobal;
|
||
|
extern tcuMemGetInfo *cuMemGetInfo;
|
||
|
extern tcuMemAlloc *cuMemAlloc;
|
||
|
extern tcuMemAllocPitch *cuMemAllocPitch;
|
||
|
extern tcuMemFree *cuMemFree;
|
||
|
extern tcuMemGetAddressRange *cuMemGetAddressRange;
|
||
|
extern tcuMemAllocHost *cuMemAllocHost;
|
||
|
extern tcuMemHostGetDevicePointer *cuMemHostGetDevicePointer;
|
||
|
extern tcuFuncSetBlockShape *cuFuncSetBlockShape;
|
||
|
extern tcuFuncSetSharedSize *cuFuncSetSharedSize;
|
||
|
extern tcuFuncGetAttribute *cuFuncGetAttribute;
|
||
|
extern tcuFuncSetCacheConfig *cuFuncSetCacheConfig;
|
||
|
extern tcuFuncSetSharedMemConfig *cuFuncSetSharedMemConfig;
|
||
|
extern tcuLaunchKernel *cuLaunchKernel;
|
||
|
extern tcuArrayDestroy *cuArrayDestroy;
|
||
|
extern tcuTexRefCreate *cuTexRefCreate;
|
||
|
extern tcuTexRefDestroy *cuTexRefDestroy;
|
||
|
extern tcuTexRefSetArray *cuTexRefSetArray;
|
||
|
extern tcuTexRefSetFormat *cuTexRefSetFormat;
|
||
|
extern tcuTexRefSetAddressMode *cuTexRefSetAddressMode;
|
||
|
extern tcuTexRefSetFilterMode *cuTexRefSetFilterMode;
|
||
|
extern tcuTexRefSetFlags *cuTexRefSetFlags;
|
||
|
extern tcuTexRefGetArray *cuTexRefGetArray;
|
||
|
extern tcuTexRefGetAddressMode *cuTexRefGetAddressMode;
|
||
|
extern tcuTexRefGetFilterMode *cuTexRefGetFilterMode;
|
||
|
extern tcuTexRefGetFormat *cuTexRefGetFormat;
|
||
|
extern tcuTexRefGetFlags *cuTexRefGetFlags;
|
||
|
extern tcuSurfRefSetArray *cuSurfRefSetArray;
|
||
|
extern tcuSurfRefGetArray *cuSurfRefGetArray;
|
||
|
extern tcuParamSetSize *cuParamSetSize;
|
||
|
extern tcuParamSeti *cuParamSeti;
|
||
|
extern tcuParamSetf *cuParamSetf;
|
||
|
extern tcuParamSetv *cuParamSetv;
|
||
|
extern tcuParamSetTexRef *cuParamSetTexRef;
|
||
|
extern tcuLaunch *cuLaunch;
|
||
|
extern tcuLaunchGrid *cuLaunchGrid;
|
||
|
extern tcuLaunchGridAsync *cuLaunchGridAsync;
|
||
|
extern tcuEventCreate *cuEventCreate;
|
||
|
extern tcuEventRecord *cuEventRecord;
|
||
|
extern tcuEventQuery *cuEventQuery;
|
||
|
extern tcuEventSynchronize *cuEventSynchronize;
|
||
|
extern tcuEventDestroy *cuEventDestroy;
|
||
|
extern tcuEventElapsedTime *cuEventElapsedTime;
|
||
|
extern tcuStreamCreate *cuStreamCreate;
|
||
|
extern tcuStreamQuery *cuStreamQuery;
|
||
|
extern tcuStreamWaitEvent *cuStreamWaitEvent;
|
||
|
extern tcuStreamAddCallback *cuStreamAddCallback;
|
||
|
extern tcuStreamSynchronize *cuStreamSynchronize;
|
||
|
extern tcuStreamDestroy *cuStreamDestroy;
|
||
|
extern tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource;
|
||
|
extern tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray;
|
||
|
extern tcuGraphicsResourceSetMapFlags *cuGraphicsResourceSetMapFlags;
|
||
|
extern tcuGraphicsMapResources *cuGraphicsMapResources;
|
||
|
extern tcuGraphicsUnmapResources *cuGraphicsUnmapResources;
|
||
|
extern tcuGetExportTable *cuGetExportTable;
|
||
|
extern tcuCtxSetLimit *cuCtxSetLimit;
|
||
|
extern tcuCtxGetLimit *cuCtxGetLimit;
|
||
|
|
||
|
// These functions could be using the CUDA 3.2 interface (_v2)
|
||
|
extern tcuMemcpyHtoD *cuMemcpyHtoD;
|
||
|
extern tcuMemcpyDtoH *cuMemcpyDtoH;
|
||
|
extern tcuMemcpyDtoD *cuMemcpyDtoD;
|
||
|
extern tcuMemcpyDtoA *cuMemcpyDtoA;
|
||
|
extern tcuMemcpyAtoD *cuMemcpyAtoD;
|
||
|
extern tcuMemcpyHtoA *cuMemcpyHtoA;
|
||
|
extern tcuMemcpyAtoH *cuMemcpyAtoH;
|
||
|
extern tcuMemcpyAtoA *cuMemcpyAtoA;
|
||
|
extern tcuMemcpy2D *cuMemcpy2D;
|
||
|
extern tcuMemcpy2DUnaligned *cuMemcpy2DUnaligned;
|
||
|
extern tcuMemcpy3D *cuMemcpy3D;
|
||
|
extern tcuMemcpyHtoDAsync *cuMemcpyHtoDAsync;
|
||
|
extern tcuMemcpyDtoHAsync *cuMemcpyDtoHAsync;
|
||
|
extern tcuMemcpyDtoDAsync *cuMemcpyDtoDAsync;
|
||
|
extern tcuMemcpyHtoAAsync *cuMemcpyHtoAAsync;
|
||
|
extern tcuMemcpyAtoHAsync *cuMemcpyAtoHAsync;
|
||
|
extern tcuMemcpy2DAsync *cuMemcpy2DAsync;
|
||
|
extern tcuMemcpy3DAsync *cuMemcpy3DAsync;
|
||
|
extern tcuMemsetD8 *cuMemsetD8;
|
||
|
extern tcuMemsetD16 *cuMemsetD16;
|
||
|
extern tcuMemsetD32 *cuMemsetD32;
|
||
|
extern tcuMemsetD2D8 *cuMemsetD2D8;
|
||
|
extern tcuMemsetD2D16 *cuMemsetD2D16;
|
||
|
extern tcuMemsetD2D32 *cuMemsetD2D32;
|
||
|
extern tcuArrayCreate *cuArrayCreate;
|
||
|
extern tcuArrayGetDescriptor *cuArrayGetDescriptor;
|
||
|
extern tcuArray3DCreate *cuArray3DCreate;
|
||
|
extern tcuArray3DGetDescriptor *cuArray3DGetDescriptor;
|
||
|
extern tcuTexRefSetAddress *cuTexRefSetAddress;
|
||
|
extern tcuTexRefSetAddress2D *cuTexRefSetAddress2D;
|
||
|
extern tcuTexRefGetAddress *cuTexRefGetAddress;
|
||
|
extern tcuGraphicsResourceGetMappedPointer *cuGraphicsResourceGetMappedPointer;
|
||
|
|
||
|
/*
 * Mipmapped-array entry points.
 *
 * The tcuMipmappedArray* function types are only declared when
 * __CUDA_API_VERSION >= 5000, so the pointers that use them carry the same
 * guard; without it this header fails to compile whenever the API version
 * macro is set below 5000.
 */
#if __CUDA_API_VERSION >= 5000
extern tcuMipmappedArrayCreate *cuMipmappedArrayCreate;
extern tcuMipmappedArrayGetLevel *cuMipmappedArrayGetLevel;
extern tcuMipmappedArrayDestroy *cuMipmappedArrayDestroy;
#endif /* __CUDA_API_VERSION >= 5000 */
|
||
|
|
||
|
/* Profiler entry point: pointer to the tcuProfilerStop function type. */
extern tcuProfilerStop
    *cuProfilerStop;
|
||
|
|
||
|
#ifdef __cplusplus
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
//#undef __CUDA_API_VERSION
|
||
|
|
||
|
#endif //__cuda_drvapi_dynlink_cuda_h__
|