/*
 * This copyright notice applies to this header file only:
 *
 * Copyright (c) 2016
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#ifndef __cuda_cuda_h__
#define __cuda_cuda_h__

#include <stddef.h>

#define CUDA_VERSION 7050

#if defined(_WIN32) || defined(__CYGWIN__)
#define CUDAAPI __stdcall
#else
#define CUDAAPI
#endif

typedef int CUdevice;                                     /**< CUDA device */
typedef struct CUarray_st *CUarray;                       /**< CUDA array */
typedef struct CUctx_st *CUcontext;                       /**< CUDA context */
#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
typedef unsigned long long CUdeviceptr;                   /**< CUDA device pointer (64-bit platforms) */
#else
typedef unsigned int CUdeviceptr;                         /**< CUDA device pointer (32-bit platforms) */
#endif
typedef struct CUstream_st *CUstream;                     /**< CUDA stream */
typedef struct CUgraphicsResource_st *CUgraphicsResource; /**< CUDA graphics interop resource */

/**
 * Context creation flags
 */
typedef enum CUctx_flags_enum
{
    CU_CTX_SCHED_AUTO          = 0x00, /**< Automatic scheduling */
    CU_CTX_SCHED_SPIN          = 0x01, /**< Set spin as default scheduling */
    CU_CTX_SCHED_YIELD         = 0x02, /**< Set yield as default scheduling */
    CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */
    CU_CTX_BLOCKING_SYNC       = 0x04, /**< Set blocking synchronization as default scheduling \deprecated */
    CU_CTX_MAP_HOST            = 0x08, /**< Support mapped pinned allocations */
    CU_CTX_LMEM_RESIZE_TO_MAX  = 0x10, /**< Keep local memory allocation after launch */
#if __CUDA_API_VERSION < 4000
    CU_CTX_SCHED_MASK          = 0x03,
    CU_CTX_FLAGS_MASK          = 0x1f
#else
    CU_CTX_SCHED_MASK          = 0x07,
    CU_CTX_PRIMARY             = 0x20, /**< Initialize and return the primary context */
    CU_CTX_FLAGS_MASK          = 0x3f
#endif
} CUctx_flags;
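
/*
 * Illustrative example (not part of this header): creating a context that
 * uses blocking synchronization. cuInit(), cuDeviceGet(), cuCtxCreate() and
 * cuCtxDestroy() belong to the full CUDA driver API and are not declared
 * here; this is a minimal sketch assuming they are available (e.g. through
 * the real cuda.h or dynamic loading).
 *
 *     CUdevice dev;
 *     CUcontext ctx = NULL;
 *     if (cuInit(0) == CUDA_SUCCESS &&
 *         cuDeviceGet(&dev, 0) == CUDA_SUCCESS)
 *         cuCtxCreate(&ctx, CU_CTX_SCHED_BLOCKING_SYNC, dev);
 *
 * A successfully created context is released with cuCtxDestroy(ctx).
 */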

/**
 * Stream creation flags
 */
typedef enum CUstream_flags_enum
{
    CU_STREAM_DEFAULT      = 0x0, /**< Default stream flag */
    CU_STREAM_NON_BLOCKING = 0x1  /**< Stream does not synchronize with stream 0 (the NULL stream) */
} CUstream_flags;
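
/*
 * Illustrative example (not part of this header): creating a stream that
 * does not synchronize with the NULL stream. cuStreamCreate() and
 * cuStreamDestroy() belong to the full CUDA driver API and are not declared
 * here; minimal sketch assuming a current context already exists.
 *
 *     CUstream stream = NULL;
 *     cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING);
 *     cuStreamDestroy(stream);
 */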

/**
 * Device properties
 */
typedef enum CUdevice_attribute_enum
{
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,              /**< Maximum number of threads per block */
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,                    /**< Maximum block dimension X */
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,                    /**< Maximum block dimension Y */
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4,                    /**< Maximum block dimension Z */
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,                     /**< Maximum grid dimension X */
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,                     /**< Maximum grid dimension Y */
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,                     /**< Maximum grid dimension Z */
    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,        /**< Maximum shared memory available per block in bytes */
    CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,            /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */
    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,              /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
    CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,                         /**< Warp size in threads */
    CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,                         /**< Maximum pitch in bytes allowed by memory copies */
    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,           /**< Maximum number of 32-bit registers available per block */
    CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,               /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */
    CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,                        /**< Peak clock frequency in kilohertz */
    CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,                 /**< Alignment requirement for textures */
    CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,                       /**< Device can possibly copy memory and execute a kernel concurrently */
    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,              /**< Number of multiprocessors on device */
    CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17,               /**< Specifies whether there is a run time limit on kernels */
    CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,                        /**< Device is integrated with host memory */
    CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,               /**< Device can map host memory into CUDA address space */
    CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20,                      /**< Compute mode (See ::CUcomputemode for details) */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21,           /**< Maximum 1D texture width */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22,           /**< Maximum 2D texture width */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23,          /**< Maximum 2D texture height */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24,           /**< Maximum 3D texture width */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25,          /**< Maximum 3D texture height */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26,           /**< Maximum 3D texture depth */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27,     /**< Maximum texture array width */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28,    /**< Maximum texture array height */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Maximum slices in a texture array */
    CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30,                 /**< Alignment requirement for surfaces */
    CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,                /**< Device can possibly execute multiple kernels concurrently */
    CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32,                       /**< Device has ECC support enabled */
    CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33,                        /**< PCI bus ID of the device */
    CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34,                     /**< PCI device ID of the device */
    CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35                         /**< Device is using TCC driver model */
#if __CUDA_API_VERSION >= 4000
  , CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36,                 /**< Peak memory clock frequency in kilohertz */
    CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37,           /**< Global memory bus width in bits */
    CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38,                     /**< Size of L2 cache in bytes */
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,    /**< Maximum resident threads per multiprocessor */
    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,                /**< Number of asynchronous engines */
    CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41,                /**< Device shares a unified address space with the host */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42,   /**< Maximum 1D layered texture width */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43   /**< Maximum layers in a 1D layered texture */
#endif
} CUdevice_attribute;
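
/*
 * Illustrative example (not part of this header): querying a device
 * attribute. cuDeviceGetAttribute() belongs to the full CUDA driver API and
 * is not declared here; minimal sketch assuming a CUdevice 'dev' obtained
 * via cuDeviceGet().
 *
 *     int max_threads = 0;
 *     if (cuDeviceGetAttribute(&max_threads,
 *                              CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
 *                              dev) == CUDA_SUCCESS)
 *         printf("max threads per block: %d\n", max_threads);
 */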


/**
 * Error codes
 */
typedef enum cudaError_enum
{
    /**
     * The API call returned with no errors. In the case of query calls, this
     * can also mean that the operation being queried is complete (see
     * ::cuEventQuery() and ::cuStreamQuery()).
     */
    CUDA_SUCCESS                              = 0,

    /**
     * This indicates that one or more of the parameters passed to the API call
     * is not within an acceptable range of values.
     */
    CUDA_ERROR_INVALID_VALUE                  = 1,

    /**
     * The API call failed because it was unable to allocate enough memory to
     * perform the requested operation.
     */
    CUDA_ERROR_OUT_OF_MEMORY                  = 2,

    /**
     * This indicates that the CUDA driver has not been initialized with
     * ::cuInit() or that initialization has failed.
     */
    CUDA_ERROR_NOT_INITIALIZED                = 3,

    /**
     * This indicates that the CUDA driver is in the process of shutting down.
     */
    CUDA_ERROR_DEINITIALIZED                  = 4,

    /**
     * This indicates that profiling APIs are being called while the
     * application is running in visual profiler mode.
     */
    CUDA_ERROR_PROFILER_DISABLED              = 5,

    /**
     * This indicates that profiling has not been initialized for this
     * context. Call cuProfilerInitialize() to resolve this.
     */
    CUDA_ERROR_PROFILER_NOT_INITIALIZED       = 6,

    /**
     * This indicates that the profiler has already been started and
     * cuProfilerStart() is probably being called incorrectly.
     */
    CUDA_ERROR_PROFILER_ALREADY_STARTED       = 7,

    /**
     * This indicates that the profiler has already been stopped and
     * cuProfilerStop() is probably being called incorrectly.
     */
    CUDA_ERROR_PROFILER_ALREADY_STOPPED       = 8,

    /**
     * This indicates that no CUDA-capable devices were detected by the installed
     * CUDA driver.
     */
    CUDA_ERROR_NO_DEVICE                      = 100,

    /**
     * This indicates that the device ordinal supplied by the user does not
     * correspond to a valid CUDA device.
     */
    CUDA_ERROR_INVALID_DEVICE                 = 101,


    /**
     * This indicates that the device kernel image is invalid. This can also
     * indicate an invalid CUDA module.
     */
    CUDA_ERROR_INVALID_IMAGE                  = 200,

    /**
     * This most frequently indicates that there is no context bound to the
     * current thread. This can also be returned if the context passed to an
     * API call is not a valid handle (such as a context that has had
     * ::cuCtxDestroy() invoked on it). This can also be returned if a user
     * mixes different API versions (i.e. 3010 context with 3020 API calls).
     * See ::cuCtxGetApiVersion() for more details.
     */
    CUDA_ERROR_INVALID_CONTEXT                = 201,

    /**
     * This indicated that the context being supplied as a parameter to the
     * API call was already the active context.
     * \deprecated
     * This error return is deprecated as of CUDA 3.2. It is no longer an
     * error to attempt to push the active context via ::cuCtxPushCurrent().
     */
    CUDA_ERROR_CONTEXT_ALREADY_CURRENT        = 202,

    /**
     * This indicates that a map or register operation has failed.
     */
    CUDA_ERROR_MAP_FAILED                     = 205,

    /**
     * This indicates that an unmap or unregister operation has failed.
     */
    CUDA_ERROR_UNMAP_FAILED                   = 206,

    /**
     * This indicates that the specified array is currently mapped and thus
     * cannot be destroyed.
     */
    CUDA_ERROR_ARRAY_IS_MAPPED                = 207,

    /**
     * This indicates that the resource is already mapped.
     */
    CUDA_ERROR_ALREADY_MAPPED                 = 208,

    /**
     * This indicates that there is no kernel image available that is suitable
     * for the device. This can occur when a user specifies code generation
     * options for a particular CUDA source file that do not include the
     * corresponding device configuration.
     */
    CUDA_ERROR_NO_BINARY_FOR_GPU              = 209,

    /**
     * This indicates that a resource has already been acquired.
     */
    CUDA_ERROR_ALREADY_ACQUIRED               = 210,

    /**
     * This indicates that a resource is not mapped.
     */
    CUDA_ERROR_NOT_MAPPED                     = 211,

    /**
     * This indicates that a mapped resource is not available for access as an
     * array.
     */
    CUDA_ERROR_NOT_MAPPED_AS_ARRAY            = 212,

    /**
     * This indicates that a mapped resource is not available for access as a
     * pointer.
     */
    CUDA_ERROR_NOT_MAPPED_AS_POINTER          = 213,

    /**
     * This indicates that an uncorrectable ECC error was detected during
     * execution.
     */
    CUDA_ERROR_ECC_UNCORRECTABLE              = 214,

    /**
     * This indicates that the ::CUlimit passed to the API call is not
     * supported by the active device.
     */
    CUDA_ERROR_UNSUPPORTED_LIMIT              = 215,

    /**
     * This indicates that the ::CUcontext passed to the API call can
     * only be bound to a single CPU thread at a time but is already
     * bound to a CPU thread.
     */
    CUDA_ERROR_CONTEXT_ALREADY_IN_USE         = 216,

    /**
     * This indicates that the device kernel source is invalid.
     */
    CUDA_ERROR_INVALID_SOURCE                 = 300,

    /**
     * This indicates that the file specified was not found.
     */
    CUDA_ERROR_FILE_NOT_FOUND                 = 301,

    /**
     * This indicates that a link to a shared object failed to resolve.
     */
    CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302,

    /**
     * This indicates that initialization of a shared object failed.
     */
    CUDA_ERROR_SHARED_OBJECT_INIT_FAILED      = 303,

    /**
     * This indicates that an OS call failed.
     */
    CUDA_ERROR_OPERATING_SYSTEM               = 304,


    /**
     * This indicates that a resource handle passed to the API call was not
     * valid. Resource handles are opaque types like ::CUstream and ::CUevent.
     */
    CUDA_ERROR_INVALID_HANDLE                 = 400,


    /**
     * This indicates that a named symbol was not found. Examples of symbols
     * are global/constant variable names, texture names, and surface names.
     */
    CUDA_ERROR_NOT_FOUND                      = 500,


    /**
     * This indicates that asynchronous operations issued previously have not
     * completed yet. This result is not actually an error, but must be indicated
     * differently than ::CUDA_SUCCESS (which indicates completion). Calls that
     * may return this value include ::cuEventQuery() and ::cuStreamQuery().
     */
    CUDA_ERROR_NOT_READY                      = 600,


    /**
     * An exception occurred on the device while executing a kernel. Common
     * causes include dereferencing an invalid device pointer and accessing
     * out of bounds shared memory. The context cannot be used, so it must
     * be destroyed (and a new one should be created). All existing device
     * memory allocations from this context are invalid and must be
     * reconstructed if the program is to continue using CUDA.
     */
    CUDA_ERROR_LAUNCH_FAILED                  = 700,

    /**
     * This indicates that a launch did not occur because it did not have
     * appropriate resources. This error usually indicates that the user has
     * attempted to pass too many arguments to the device kernel, or the
     * kernel launch specifies too many threads for the kernel's register
     * count. Passing arguments of the wrong size (i.e. a 64-bit pointer
     * when a 32-bit int is expected) is equivalent to passing too many
     * arguments and can also result in this error.
     */
    CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES        = 701,

    /**
     * This indicates that the device kernel took too long to execute. This can
     * only occur if timeouts are enabled - see the device attribute
     * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The
     * context cannot be used (and must be destroyed similar to
     * ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from
     * this context are invalid and must be reconstructed if the program is to
     * continue using CUDA.
     */
    CUDA_ERROR_LAUNCH_TIMEOUT                 = 702,

    /**
     * This error indicates a kernel launch that uses an incompatible texturing
     * mode.
     */
    CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING  = 703,

    /**
     * This error indicates that a call to ::cuCtxEnablePeerAccess() is
     * trying to re-enable peer access to a context which has already
     * had peer access to it enabled.
     */
    CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED    = 704,

    /**
     * This error indicates that a call to ::cuMemPeerRegister is trying to
     * register memory from a context which has not had peer access
     * enabled yet via ::cuCtxEnablePeerAccess(), or that
     * ::cuCtxDisablePeerAccess() is trying to disable peer access
     * which has not been enabled yet.
     */
    CUDA_ERROR_PEER_ACCESS_NOT_ENABLED        = 705,

    /**
     * This error indicates that a call to ::cuMemPeerRegister is trying to
     * register already-registered memory.
     */
    CUDA_ERROR_PEER_MEMORY_ALREADY_REGISTERED = 706,

    /**
     * This error indicates that a call to ::cuMemPeerUnregister is trying to
     * unregister memory that has not been registered.
     */
    CUDA_ERROR_PEER_MEMORY_NOT_REGISTERED     = 707,

    /**
     * This error indicates that ::cuCtxCreate was called with the flag
     * ::CU_CTX_PRIMARY on a device which already has initialized its
     * primary context.
     */
    CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE         = 708,

    /**
     * This error indicates that the context current to the calling thread
     * has been destroyed using ::cuCtxDestroy, or is a primary context which
     * has not yet been initialized.
     */
    CUDA_ERROR_CONTEXT_IS_DESTROYED           = 709,

    /**
     * This indicates that an unknown internal error has occurred.
     */
    CUDA_ERROR_UNKNOWN                        = 999
} CUresult;
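
/*
 * Illustrative example (not part of this header): turning a CUresult into a
 * readable message. cuGetErrorString() belongs to the full CUDA driver API
 * (CUDA 6.0 and later) and is not declared here; minimal sketch. Note that
 * CUDA_ERROR_NOT_READY returned by cuStreamQuery()/cuEventQuery() is a
 * status, not a failure.
 *
 *     CUresult err = cuInit(0);
 *     if (err != CUDA_SUCCESS) {
 *         const char *msg = NULL;
 *         cuGetErrorString(err, &msg);
 *         fprintf(stderr, "CUDA error %d: %s\n", (int)err, msg ? msg : "?");
 *     }
 */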

/**
 * Memory types
 */
typedef enum CUmemorytype_enum
{
    CU_MEMORYTYPE_HOST    = 0x01,    /**< Host memory */
    CU_MEMORYTYPE_DEVICE  = 0x02,    /**< Device memory */
    CU_MEMORYTYPE_ARRAY   = 0x03     /**< Array memory */
#if __CUDA_API_VERSION >= 4000
  , CU_MEMORYTYPE_UNIFIED = 0x04     /**< Unified device or host memory */
#endif
} CUmemorytype;

/**
 * Compute Modes
 */
typedef enum CUcomputemode_enum
{
    CU_COMPUTEMODE_DEFAULT           = 0, /**< Default compute mode (Multiple contexts allowed per device) */
    CU_COMPUTEMODE_EXCLUSIVE         = 1, /**< Compute-exclusive-thread mode (Only one context used by a single thread can be present on this device at a time) */
    CU_COMPUTEMODE_PROHIBITED        = 2  /**< Compute-prohibited mode (No contexts can be created on this device at this time) */
#if __CUDA_API_VERSION >= 4000
  , CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3  /**< Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */
#endif
} CUcomputemode;
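
/*
 * Illustrative example (not part of this header): the compute mode of a
 * device is reported through ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE. Minimal
 * sketch assuming cuDeviceGetAttribute() (full driver API) and a valid
 * CUdevice 'dev'.
 *
 *     int mode = CU_COMPUTEMODE_DEFAULT;
 *     cuDeviceGetAttribute(&mode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
 *     if (mode == CU_COMPUTEMODE_PROHIBITED)
 *         fprintf(stderr, "contexts cannot be created on this device\n");
 */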

/**
 * Flags to register a graphics resource
 */
typedef enum CUgraphicsRegisterFlags_enum
{
    CU_GRAPHICS_REGISTER_FLAGS_NONE          = 0x00,
    CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY     = 0x01,
    CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02,
    CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST  = 0x04
} CUgraphicsRegisterFlags;

/**
 * Flags for mapping and unmapping interop resources
 */
typedef enum CUgraphicsMapResourceFlags_enum
{
    CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE          = 0x00,
    CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY     = 0x01,
    CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02
} CUgraphicsMapResourceFlags;
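
/*
 * Illustrative example (not part of this header): mapping an already
 * registered graphics resource for read-only access. The resource 'res' is
 * hypothetical and must have been registered through an API-specific interop
 * call; cuGraphicsResourceSetMapFlags(), cuGraphicsMapResources(),
 * cuGraphicsSubResourceGetMappedArray() and cuGraphicsUnmapResources()
 * belong to the full driver API and are not declared here. Any access to the
 * mapped array must happen before the resource is unmapped.
 *
 *     CUarray array = NULL;
 *     cuGraphicsResourceSetMapFlags(res, CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY);
 *     if (cuGraphicsMapResources(1, &res, NULL) == CUDA_SUCCESS) {
 *         cuGraphicsSubResourceGetMappedArray(&array, res, 0, 0);
 *         cuGraphicsUnmapResources(1, &res, NULL);
 *     }
 */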

/**
 * 2D memory copy parameters
 */
typedef struct CUDA_MEMCPY2D_st {
    size_t srcXInBytes;         /**< Source X offset in bytes */
    size_t srcY;                /**< Source Y offset in rows */
    CUmemorytype srcMemoryType; /**< Source memory type */
    const void *srcHost;        /**< Source host pointer */
    CUdeviceptr srcDevice;      /**< Source device pointer */
    CUarray srcArray;           /**< Source array reference */
    size_t srcPitch;            /**< Source pitch in bytes between rows */

    size_t dstXInBytes;         /**< Destination X offset in bytes */
    size_t dstY;                /**< Destination Y offset in rows */
    CUmemorytype dstMemoryType; /**< Destination memory type */
    void *dstHost;              /**< Destination host pointer */
    CUdeviceptr dstDevice;      /**< Destination device pointer */
    CUarray dstArray;           /**< Destination array reference */
    size_t dstPitch;            /**< Destination pitch in bytes between rows */

    size_t WidthInBytes;        /**< Width of the 2D copy in bytes */
    size_t Height;              /**< Height of the 2D copy in rows */
} CUDA_MEMCPY2D;
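
/*
 * Illustrative example (not part of this header): copying a 'width' x
 * 'height' byte image from a host buffer 'src' into a pitched device
 * allocation. cuMemAllocPitch(), cuMemcpy2D() and cuMemFree() belong to the
 * full driver API and are not declared here; 'src', 'width' and 'height'
 * are hypothetical. Fields that do not apply to the chosen memory types are
 * left zero-initialized.
 *
 *     CUdeviceptr dst = 0;
 *     size_t dst_pitch = 0;
 *     CUDA_MEMCPY2D cpy = { 0 };
 *
 *     cuMemAllocPitch(&dst, &dst_pitch, width, height, 4);
 *     cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
 *     cpy.srcHost       = src;
 *     cpy.srcPitch      = width;
 *     cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
 *     cpy.dstDevice     = dst;
 *     cpy.dstPitch      = dst_pitch;
 *     cpy.WidthInBytes  = width;
 *     cpy.Height        = height;
 *     cuMemcpy2D(&cpy);
 *     cuMemFree(dst);
 */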

#endif /* __cuda_cuda_h__ */