1 /*
2  * Copyright 2011-2014 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License
15  */
16 
/* MSVC-only compatibility defines, kept ahead of the #include lines. */
#ifdef _MSC_VER
/* Opt out of the CRT "secure" deprecation warnings. */
#  define _CRT_SECURE_NO_WARNINGS
/* Route the POSIX pipe names to the underscore-prefixed CRT functions. */
#  define popen _popen
#  define pclose _pclose
/* Compilers with _MSC_VER below 1900 get snprintf mapped to _snprintf. */
#  if _MSC_VER < 1900
#    define snprintf _snprintf
#  endif
#endif
25 
26 #include <cuew.h>
27 #include <assert.h>
28 #include <stdio.h>
29 #include <string.h>
30 #include <sys/stat.h>
31 
/*
 * Platform abstraction for loading shared libraries at run time.
 *
 * Both branches provide the same four names:
 *   DynamicLibrary                    - type of a library handle
 *   dynamic_library_open(path)        - open a library, yields a handle
 *   dynamic_library_close(lib)        - close a handle
 *   dynamic_library_find(lib, symbol) - look a symbol up by name
 */
#ifdef _WIN32
#  define WIN32_LEAN_AND_MEAN
#  define VC_EXTRALEAN
#  include <windows.h>

/* Windows: handles are HMODULE values driven through the Win32 loader API. */
typedef HMODULE DynamicLibrary;

#  define dynamic_library_open(path)         LoadLibraryA(path)
#  define dynamic_library_close(lib)         FreeLibrary(lib)
#  define dynamic_library_find(lib, symbol)  GetProcAddress(lib, symbol)
#else
#  include <dlfcn.h>

/* Other platforms: handles are the void pointers used by the dl* API. */
typedef void *DynamicLibrary;

/* Every library is opened with the RTLD_NOW flag. */
#  define dynamic_library_open(path)         dlopen(path, RTLD_NOW)
#  define dynamic_library_close(lib)         dlclose(lib)
#  define dynamic_library_find(lib, symbol)  dlsym(lib, symbol)
#endif
53 
/*
 * Symbol lookup helpers.
 *
 * Each macro looks up the symbol called `name` in a loaded library and stores
 * the address in the global function pointer of the same name, cast to a
 * pointer to the matching `t<name>` function type. The *_CHECKED variants
 * additionally assert() that the lookup produced a non-NULL pointer.
 *
 * The expansions are wrapped in do { ... } while (0) so that each macro is
 * exactly one statement; the assert() can therefore never escape an unbraced
 * if/else that guards the macro call. Call sites keep their trailing ';'.
 */
#define _LIBRARY_FIND_CHECKED(lib, name) \
        do { \
          name = (t##name *)dynamic_library_find(lib, #name); \
          assert(name); \
        } while (0)

#define _LIBRARY_FIND(lib, name) \
        do { \
          name = (t##name *)dynamic_library_find(lib, #name); \
        } while (0)

/* Lookups in the CUDA driver library (cuda_lib). */
#define CUDA_LIBRARY_FIND_CHECKED(name) \
        _LIBRARY_FIND_CHECKED(cuda_lib, name)
#define CUDA_LIBRARY_FIND(name) _LIBRARY_FIND(cuda_lib, name)

/* Lookups in the NVRTC library (nvrtc_lib). */
#define NVRTC_LIBRARY_FIND_CHECKED(name) \
        _LIBRARY_FIND_CHECKED(nvrtc_lib, name)
#define NVRTC_LIBRARY_FIND(name) _LIBRARY_FIND(nvrtc_lib, name)
68 
69 static DynamicLibrary cuda_lib;
70 static DynamicLibrary nvrtc_lib;
71 
72 /* Function definitions. */
73 tcuGetErrorString *cuGetErrorString;
74 tcuGetErrorName *cuGetErrorName;
75 tcuInit *cuInit;
76 tcuDriverGetVersion *cuDriverGetVersion;
77 tcuDeviceGet *cuDeviceGet;
78 tcuDeviceGetCount *cuDeviceGetCount;
79 tcuDeviceGetName *cuDeviceGetName;
80 tcuDeviceGetUuid *cuDeviceGetUuid;
81 tcuDeviceTotalMem_v2 *cuDeviceTotalMem_v2;
82 tcuDeviceGetAttribute *cuDeviceGetAttribute;
83 tcuDeviceGetProperties *cuDeviceGetProperties;
84 tcuDeviceComputeCapability *cuDeviceComputeCapability;
85 tcuDevicePrimaryCtxRetain *cuDevicePrimaryCtxRetain;
86 tcuDevicePrimaryCtxRelease *cuDevicePrimaryCtxRelease;
87 tcuDevicePrimaryCtxSetFlags *cuDevicePrimaryCtxSetFlags;
88 tcuDevicePrimaryCtxGetState *cuDevicePrimaryCtxGetState;
89 tcuDevicePrimaryCtxReset *cuDevicePrimaryCtxReset;
90 tcuCtxCreate_v2 *cuCtxCreate_v2;
91 tcuCtxDestroy_v2 *cuCtxDestroy_v2;
92 tcuCtxPushCurrent_v2 *cuCtxPushCurrent_v2;
93 tcuCtxPopCurrent_v2 *cuCtxPopCurrent_v2;
94 tcuCtxSetCurrent *cuCtxSetCurrent;
95 tcuCtxGetCurrent *cuCtxGetCurrent;
96 tcuCtxGetDevice *cuCtxGetDevice;
97 tcuCtxGetFlags *cuCtxGetFlags;
98 tcuCtxSynchronize *cuCtxSynchronize;
99 tcuCtxSetLimit *cuCtxSetLimit;
100 tcuCtxGetLimit *cuCtxGetLimit;
101 tcuCtxGetCacheConfig *cuCtxGetCacheConfig;
102 tcuCtxSetCacheConfig *cuCtxSetCacheConfig;
103 tcuCtxGetSharedMemConfig *cuCtxGetSharedMemConfig;
104 tcuCtxSetSharedMemConfig *cuCtxSetSharedMemConfig;
105 tcuCtxGetApiVersion *cuCtxGetApiVersion;
106 tcuCtxGetStreamPriorityRange *cuCtxGetStreamPriorityRange;
107 tcuCtxAttach *cuCtxAttach;
108 tcuCtxDetach *cuCtxDetach;
109 tcuModuleLoad *cuModuleLoad;
110 tcuModuleLoadData *cuModuleLoadData;
111 tcuModuleLoadDataEx *cuModuleLoadDataEx;
112 tcuModuleLoadFatBinary *cuModuleLoadFatBinary;
113 tcuModuleUnload *cuModuleUnload;
114 tcuModuleGetFunction *cuModuleGetFunction;
115 tcuModuleGetGlobal_v2 *cuModuleGetGlobal_v2;
116 tcuModuleGetTexRef *cuModuleGetTexRef;
117 tcuModuleGetSurfRef *cuModuleGetSurfRef;
118 tcuLinkCreate_v2 *cuLinkCreate_v2;
119 tcuLinkAddData_v2 *cuLinkAddData_v2;
120 tcuLinkAddFile_v2 *cuLinkAddFile_v2;
121 tcuLinkComplete *cuLinkComplete;
122 tcuLinkDestroy *cuLinkDestroy;
123 tcuMemGetInfo_v2 *cuMemGetInfo_v2;
124 tcuMemAlloc_v2 *cuMemAlloc_v2;
125 tcuMemAllocPitch_v2 *cuMemAllocPitch_v2;
126 tcuMemFree_v2 *cuMemFree_v2;
127 tcuMemGetAddressRange_v2 *cuMemGetAddressRange_v2;
128 tcuMemAllocHost_v2 *cuMemAllocHost_v2;
129 tcuMemFreeHost *cuMemFreeHost;
130 tcuMemHostAlloc *cuMemHostAlloc;
131 tcuMemHostGetDevicePointer_v2 *cuMemHostGetDevicePointer_v2;
132 tcuMemHostGetFlags *cuMemHostGetFlags;
133 tcuMemAllocManaged *cuMemAllocManaged;
134 tcuDeviceGetByPCIBusId *cuDeviceGetByPCIBusId;
135 tcuDeviceGetPCIBusId *cuDeviceGetPCIBusId;
136 tcuIpcGetEventHandle *cuIpcGetEventHandle;
137 tcuIpcOpenEventHandle *cuIpcOpenEventHandle;
138 tcuIpcGetMemHandle *cuIpcGetMemHandle;
139 tcuIpcOpenMemHandle *cuIpcOpenMemHandle;
140 tcuIpcCloseMemHandle *cuIpcCloseMemHandle;
141 tcuMemHostRegister_v2 *cuMemHostRegister_v2;
142 tcuMemHostUnregister *cuMemHostUnregister;
143 tcuMemcpy *cuMemcpy;
144 tcuMemcpyPeer *cuMemcpyPeer;
145 tcuMemcpyHtoD_v2 *cuMemcpyHtoD_v2;
146 tcuMemcpyDtoH_v2 *cuMemcpyDtoH_v2;
147 tcuMemcpyDtoD_v2 *cuMemcpyDtoD_v2;
148 tcuMemcpyDtoA_v2 *cuMemcpyDtoA_v2;
149 tcuMemcpyAtoD_v2 *cuMemcpyAtoD_v2;
150 tcuMemcpyHtoA_v2 *cuMemcpyHtoA_v2;
151 tcuMemcpyAtoH_v2 *cuMemcpyAtoH_v2;
152 tcuMemcpyAtoA_v2 *cuMemcpyAtoA_v2;
153 tcuMemcpy2D_v2 *cuMemcpy2D_v2;
154 tcuMemcpy2DUnaligned_v2 *cuMemcpy2DUnaligned_v2;
155 tcuMemcpy3D_v2 *cuMemcpy3D_v2;
156 tcuMemcpy3DPeer *cuMemcpy3DPeer;
157 tcuMemcpyAsync *cuMemcpyAsync;
158 tcuMemcpyPeerAsync *cuMemcpyPeerAsync;
159 tcuMemcpyHtoDAsync_v2 *cuMemcpyHtoDAsync_v2;
160 tcuMemcpyDtoHAsync_v2 *cuMemcpyDtoHAsync_v2;
161 tcuMemcpyDtoDAsync_v2 *cuMemcpyDtoDAsync_v2;
162 tcuMemcpyHtoAAsync_v2 *cuMemcpyHtoAAsync_v2;
163 tcuMemcpyAtoHAsync_v2 *cuMemcpyAtoHAsync_v2;
164 tcuMemcpy2DAsync_v2 *cuMemcpy2DAsync_v2;
165 tcuMemcpy3DAsync_v2 *cuMemcpy3DAsync_v2;
166 tcuMemcpy3DPeerAsync *cuMemcpy3DPeerAsync;
167 tcuMemsetD8_v2 *cuMemsetD8_v2;
168 tcuMemsetD16_v2 *cuMemsetD16_v2;
169 tcuMemsetD32_v2 *cuMemsetD32_v2;
170 tcuMemsetD2D8_v2 *cuMemsetD2D8_v2;
171 tcuMemsetD2D16_v2 *cuMemsetD2D16_v2;
172 tcuMemsetD2D32_v2 *cuMemsetD2D32_v2;
173 tcuMemsetD8Async *cuMemsetD8Async;
174 tcuMemsetD16Async *cuMemsetD16Async;
175 tcuMemsetD32Async *cuMemsetD32Async;
176 tcuMemsetD2D8Async *cuMemsetD2D8Async;
177 tcuMemsetD2D16Async *cuMemsetD2D16Async;
178 tcuMemsetD2D32Async *cuMemsetD2D32Async;
179 tcuArrayCreate_v2 *cuArrayCreate_v2;
180 tcuArrayGetDescriptor_v2 *cuArrayGetDescriptor_v2;
181 tcuArrayDestroy *cuArrayDestroy;
182 tcuArray3DCreate_v2 *cuArray3DCreate_v2;
183 tcuArray3DGetDescriptor_v2 *cuArray3DGetDescriptor_v2;
184 tcuMipmappedArrayCreate *cuMipmappedArrayCreate;
185 tcuMipmappedArrayGetLevel *cuMipmappedArrayGetLevel;
186 tcuMipmappedArrayDestroy *cuMipmappedArrayDestroy;
187 tcuPointerGetAttribute *cuPointerGetAttribute;
188 tcuMemPrefetchAsync *cuMemPrefetchAsync;
189 tcuMemAdvise *cuMemAdvise;
190 tcuMemRangeGetAttribute *cuMemRangeGetAttribute;
191 tcuMemRangeGetAttributes *cuMemRangeGetAttributes;
192 tcuPointerSetAttribute *cuPointerSetAttribute;
193 tcuPointerGetAttributes *cuPointerGetAttributes;
194 tcuStreamCreate *cuStreamCreate;
195 tcuStreamCreateWithPriority *cuStreamCreateWithPriority;
196 tcuStreamGetPriority *cuStreamGetPriority;
197 tcuStreamGetFlags *cuStreamGetFlags;
198 tcuStreamGetCtx *cuStreamGetCtx;
199 tcuStreamWaitEvent *cuStreamWaitEvent;
200 tcuStreamAddCallback *cuStreamAddCallback;
201 tcuStreamAttachMemAsync *cuStreamAttachMemAsync;
202 tcuStreamQuery *cuStreamQuery;
203 tcuStreamSynchronize *cuStreamSynchronize;
204 tcuStreamDestroy_v2 *cuStreamDestroy_v2;
205 tcuEventCreate *cuEventCreate;
206 tcuEventRecord *cuEventRecord;
207 tcuEventQuery *cuEventQuery;
208 tcuEventSynchronize *cuEventSynchronize;
209 tcuEventDestroy_v2 *cuEventDestroy_v2;
210 tcuEventElapsedTime *cuEventElapsedTime;
211 tcuStreamWaitValue32 *cuStreamWaitValue32;
212 tcuStreamWaitValue64 *cuStreamWaitValue64;
213 tcuStreamWriteValue32 *cuStreamWriteValue32;
214 tcuStreamWriteValue64 *cuStreamWriteValue64;
215 tcuStreamBatchMemOp *cuStreamBatchMemOp;
216 tcuFuncGetAttribute *cuFuncGetAttribute;
217 tcuFuncSetAttribute *cuFuncSetAttribute;
218 tcuFuncSetCacheConfig *cuFuncSetCacheConfig;
219 tcuFuncSetSharedMemConfig *cuFuncSetSharedMemConfig;
220 tcuLaunchKernel *cuLaunchKernel;
221 tcuLaunchCooperativeKernel *cuLaunchCooperativeKernel;
222 tcuLaunchCooperativeKernelMultiDevice *cuLaunchCooperativeKernelMultiDevice;
223 tcuFuncSetBlockShape *cuFuncSetBlockShape;
224 tcuFuncSetSharedSize *cuFuncSetSharedSize;
225 tcuParamSetSize *cuParamSetSize;
226 tcuParamSeti *cuParamSeti;
227 tcuParamSetf *cuParamSetf;
228 tcuParamSetv *cuParamSetv;
229 tcuLaunch *cuLaunch;
230 tcuLaunchGrid *cuLaunchGrid;
231 tcuLaunchGridAsync *cuLaunchGridAsync;
232 tcuParamSetTexRef *cuParamSetTexRef;
233 tcuOccupancyMaxActiveBlocksPerMultiprocessor *cuOccupancyMaxActiveBlocksPerMultiprocessor;
234 tcuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags *cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags;
235 tcuOccupancyMaxPotentialBlockSize *cuOccupancyMaxPotentialBlockSize;
236 tcuOccupancyMaxPotentialBlockSizeWithFlags *cuOccupancyMaxPotentialBlockSizeWithFlags;
237 tcuTexRefSetArray *cuTexRefSetArray;
238 tcuTexRefSetMipmappedArray *cuTexRefSetMipmappedArray;
239 tcuTexRefSetAddress_v2 *cuTexRefSetAddress_v2;
240 tcuTexRefSetAddress2D_v3 *cuTexRefSetAddress2D_v3;
241 tcuTexRefSetFormat *cuTexRefSetFormat;
242 tcuTexRefSetAddressMode *cuTexRefSetAddressMode;
243 tcuTexRefSetFilterMode *cuTexRefSetFilterMode;
244 tcuTexRefSetMipmapFilterMode *cuTexRefSetMipmapFilterMode;
245 tcuTexRefSetMipmapLevelBias *cuTexRefSetMipmapLevelBias;
246 tcuTexRefSetMipmapLevelClamp *cuTexRefSetMipmapLevelClamp;
247 tcuTexRefSetMaxAnisotropy *cuTexRefSetMaxAnisotropy;
248 tcuTexRefSetBorderColor *cuTexRefSetBorderColor;
249 tcuTexRefSetFlags *cuTexRefSetFlags;
250 tcuTexRefGetAddress_v2 *cuTexRefGetAddress_v2;
251 tcuTexRefGetArray *cuTexRefGetArray;
252 tcuTexRefGetMipmappedArray *cuTexRefGetMipmappedArray;
253 tcuTexRefGetAddressMode *cuTexRefGetAddressMode;
254 tcuTexRefGetFilterMode *cuTexRefGetFilterMode;
255 tcuTexRefGetFormat *cuTexRefGetFormat;
256 tcuTexRefGetMipmapFilterMode *cuTexRefGetMipmapFilterMode;
257 tcuTexRefGetMipmapLevelBias *cuTexRefGetMipmapLevelBias;
258 tcuTexRefGetMipmapLevelClamp *cuTexRefGetMipmapLevelClamp;
259 tcuTexRefGetMaxAnisotropy *cuTexRefGetMaxAnisotropy;
260 tcuTexRefGetBorderColor *cuTexRefGetBorderColor;
261 tcuTexRefGetFlags *cuTexRefGetFlags;
262 tcuTexRefCreate *cuTexRefCreate;
263 tcuTexRefDestroy *cuTexRefDestroy;
264 tcuSurfRefSetArray *cuSurfRefSetArray;
265 tcuSurfRefGetArray *cuSurfRefGetArray;
266 tcuTexObjectCreate *cuTexObjectCreate;
267 tcuTexObjectDestroy *cuTexObjectDestroy;
268 tcuTexObjectGetResourceDesc *cuTexObjectGetResourceDesc;
269 tcuTexObjectGetTextureDesc *cuTexObjectGetTextureDesc;
270 tcuTexObjectGetResourceViewDesc *cuTexObjectGetResourceViewDesc;
271 tcuSurfObjectCreate *cuSurfObjectCreate;
272 tcuSurfObjectDestroy *cuSurfObjectDestroy;
273 tcuSurfObjectGetResourceDesc *cuSurfObjectGetResourceDesc;
274 tcuDeviceCanAccessPeer *cuDeviceCanAccessPeer;
275 tcuCtxEnablePeerAccess *cuCtxEnablePeerAccess;
276 tcuCtxDisablePeerAccess *cuCtxDisablePeerAccess;
277 tcuDeviceGetP2PAttribute *cuDeviceGetP2PAttribute;
278 tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource;
279 tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray;
280 tcuGraphicsResourceGetMappedMipmappedArray *cuGraphicsResourceGetMappedMipmappedArray;
281 tcuGraphicsResourceGetMappedPointer_v2 *cuGraphicsResourceGetMappedPointer_v2;
282 tcuGraphicsResourceSetMapFlags_v2 *cuGraphicsResourceSetMapFlags_v2;
283 tcuGraphicsMapResources *cuGraphicsMapResources;
284 tcuGraphicsUnmapResources *cuGraphicsUnmapResources;
285 tcuGetExportTable *cuGetExportTable;
286 
287 tcuGraphicsGLRegisterBuffer *cuGraphicsGLRegisterBuffer;
288 tcuGraphicsGLRegisterImage *cuGraphicsGLRegisterImage;
289 tcuGLGetDevices_v2 *cuGLGetDevices_v2;
290 tcuGLCtxCreate_v2 *cuGLCtxCreate_v2;
291 tcuGLInit *cuGLInit;
292 tcuGLRegisterBufferObject *cuGLRegisterBufferObject;
293 tcuGLMapBufferObject_v2 *cuGLMapBufferObject_v2;
294 tcuGLUnmapBufferObject *cuGLUnmapBufferObject;
295 tcuGLUnregisterBufferObject *cuGLUnregisterBufferObject;
296 tcuGLSetBufferObjectMapFlags *cuGLSetBufferObjectMapFlags;
297 tcuGLMapBufferObjectAsync_v2 *cuGLMapBufferObjectAsync_v2;
298 tcuGLUnmapBufferObjectAsync *cuGLUnmapBufferObjectAsync;
299 
300 tnvrtcGetErrorString *nvrtcGetErrorString;
301 tnvrtcVersion *nvrtcVersion;
302 tnvrtcCreateProgram *nvrtcCreateProgram;
303 tnvrtcDestroyProgram *nvrtcDestroyProgram;
304 tnvrtcCompileProgram *nvrtcCompileProgram;
305 tnvrtcGetPTXSize *nvrtcGetPTXSize;
306 tnvrtcGetPTX *nvrtcGetPTX;
307 tnvrtcGetProgramLogSize *nvrtcGetProgramLogSize;
308 tnvrtcGetProgramLog *nvrtcGetProgramLog;
309 tnvrtcAddNameExpression *nvrtcAddNameExpression;
310 tnvrtcGetLoweredName *nvrtcGetLoweredName;
311 
312 
dynamic_library_open_find(const char ** paths)313 static DynamicLibrary dynamic_library_open_find(const char **paths) {
314   int i = 0;
315   while (paths[i] != NULL) {
316       DynamicLibrary lib = dynamic_library_open(paths[i]);
317       if (lib != NULL) {
318         return lib;
319       }
320       ++i;
321   }
322   return NULL;
323 }
324 
325 /* Implementation function. */
cuewCudaExit(void)326 static void cuewCudaExit(void) {
327   if (cuda_lib != NULL) {
328     /*  Ignore errors. */
329     dynamic_library_close(cuda_lib);
330     cuda_lib = NULL;
331   }
332 }
333 
cuewCudaInit(void)334 static int cuewCudaInit(void) {
335   /* Library paths. */
336 #ifdef _WIN32
337   /* Expected in c:/windows/system or similar, no path needed. */
338   const char *cuda_paths[] = {"nvcuda.dll", NULL};
339 #elif defined(__APPLE__)
340   /* Default installation path. */
341   const char *cuda_paths[] = {"/usr/local/cuda/lib/libcuda.dylib", NULL};
342 #else
343   const char *cuda_paths[] = {"libcuda.so", "libcuda.so.1", NULL};
344 #endif
345   static int initialized = 0;
346   static int result = 0;
347   int error, driver_version;
348 
349   if (initialized) {
350     return result;
351   }
352 
353   initialized = 1;
354 
355   error = atexit(cuewCudaExit);
356   if (error) {
357     result = CUEW_ERROR_ATEXIT_FAILED;
358     return result;
359   }
360 
361   /* Load library. */
362   cuda_lib = dynamic_library_open_find(cuda_paths);
363 
364   if (cuda_lib == NULL) {
365     result = CUEW_ERROR_OPEN_FAILED;
366     return result;
367   }
368 
369   /* Detect driver version. */
370   driver_version = 1000;
371 
372   CUDA_LIBRARY_FIND_CHECKED(cuDriverGetVersion);
373   if (cuDriverGetVersion) {
374     cuDriverGetVersion(&driver_version);
375   }
376 
377   /* We require version 4.0. */
378   if (driver_version < 4000) {
379     result = CUEW_ERROR_OPEN_FAILED;
380     return result;
381   }
382   /* Fetch all function pointers. */
383   CUDA_LIBRARY_FIND(cuGetErrorString);
384   CUDA_LIBRARY_FIND(cuGetErrorName);
385   CUDA_LIBRARY_FIND(cuInit);
386   CUDA_LIBRARY_FIND(cuDriverGetVersion);
387   CUDA_LIBRARY_FIND(cuDeviceGet);
388   CUDA_LIBRARY_FIND(cuDeviceGetCount);
389   CUDA_LIBRARY_FIND(cuDeviceGetName);
390   CUDA_LIBRARY_FIND(cuDeviceGetUuid);
391   CUDA_LIBRARY_FIND(cuDeviceTotalMem_v2);
392   CUDA_LIBRARY_FIND(cuDeviceGetAttribute);
393   CUDA_LIBRARY_FIND(cuDeviceGetProperties);
394   CUDA_LIBRARY_FIND(cuDeviceComputeCapability);
395   CUDA_LIBRARY_FIND(cuDevicePrimaryCtxRetain);
396   CUDA_LIBRARY_FIND(cuDevicePrimaryCtxRelease);
397   CUDA_LIBRARY_FIND(cuDevicePrimaryCtxSetFlags);
398   CUDA_LIBRARY_FIND(cuDevicePrimaryCtxGetState);
399   CUDA_LIBRARY_FIND(cuDevicePrimaryCtxReset);
400   CUDA_LIBRARY_FIND(cuCtxCreate_v2);
401   CUDA_LIBRARY_FIND(cuCtxDestroy_v2);
402   CUDA_LIBRARY_FIND(cuCtxPushCurrent_v2);
403   CUDA_LIBRARY_FIND(cuCtxPopCurrent_v2);
404   CUDA_LIBRARY_FIND(cuCtxSetCurrent);
405   CUDA_LIBRARY_FIND(cuCtxGetCurrent);
406   CUDA_LIBRARY_FIND(cuCtxGetDevice);
407   CUDA_LIBRARY_FIND(cuCtxGetFlags);
408   CUDA_LIBRARY_FIND(cuCtxSynchronize);
409   CUDA_LIBRARY_FIND(cuCtxSetLimit);
410   CUDA_LIBRARY_FIND(cuCtxGetLimit);
411   CUDA_LIBRARY_FIND(cuCtxGetCacheConfig);
412   CUDA_LIBRARY_FIND(cuCtxSetCacheConfig);
413   CUDA_LIBRARY_FIND(cuCtxGetSharedMemConfig);
414   CUDA_LIBRARY_FIND(cuCtxSetSharedMemConfig);
415   CUDA_LIBRARY_FIND(cuCtxGetApiVersion);
416   CUDA_LIBRARY_FIND(cuCtxGetStreamPriorityRange);
417   CUDA_LIBRARY_FIND(cuCtxAttach);
418   CUDA_LIBRARY_FIND(cuCtxDetach);
419   CUDA_LIBRARY_FIND(cuModuleLoad);
420   CUDA_LIBRARY_FIND(cuModuleLoadData);
421   CUDA_LIBRARY_FIND(cuModuleLoadDataEx);
422   CUDA_LIBRARY_FIND(cuModuleLoadFatBinary);
423   CUDA_LIBRARY_FIND(cuModuleUnload);
424   CUDA_LIBRARY_FIND(cuModuleGetFunction);
425   CUDA_LIBRARY_FIND(cuModuleGetGlobal_v2);
426   CUDA_LIBRARY_FIND(cuModuleGetTexRef);
427   CUDA_LIBRARY_FIND(cuModuleGetSurfRef);
428   CUDA_LIBRARY_FIND(cuLinkCreate_v2);
429   CUDA_LIBRARY_FIND(cuLinkAddData_v2);
430   CUDA_LIBRARY_FIND(cuLinkAddFile_v2);
431   CUDA_LIBRARY_FIND(cuLinkComplete);
432   CUDA_LIBRARY_FIND(cuLinkDestroy);
433   CUDA_LIBRARY_FIND(cuMemGetInfo_v2);
434   CUDA_LIBRARY_FIND(cuMemAlloc_v2);
435   CUDA_LIBRARY_FIND(cuMemAllocPitch_v2);
436   CUDA_LIBRARY_FIND(cuMemFree_v2);
437   CUDA_LIBRARY_FIND(cuMemGetAddressRange_v2);
438   CUDA_LIBRARY_FIND(cuMemAllocHost_v2);
439   CUDA_LIBRARY_FIND(cuMemFreeHost);
440   CUDA_LIBRARY_FIND(cuMemHostAlloc);
441   CUDA_LIBRARY_FIND(cuMemHostGetDevicePointer_v2);
442   CUDA_LIBRARY_FIND(cuMemHostGetFlags);
443   CUDA_LIBRARY_FIND(cuMemAllocManaged);
444   CUDA_LIBRARY_FIND(cuDeviceGetByPCIBusId);
445   CUDA_LIBRARY_FIND(cuDeviceGetPCIBusId);
446   CUDA_LIBRARY_FIND(cuIpcGetEventHandle);
447   CUDA_LIBRARY_FIND(cuIpcOpenEventHandle);
448   CUDA_LIBRARY_FIND(cuIpcGetMemHandle);
449   CUDA_LIBRARY_FIND(cuIpcOpenMemHandle);
450   CUDA_LIBRARY_FIND(cuIpcCloseMemHandle);
451   CUDA_LIBRARY_FIND(cuMemHostRegister_v2);
452   CUDA_LIBRARY_FIND(cuMemHostUnregister);
453   CUDA_LIBRARY_FIND(cuMemcpy);
454   CUDA_LIBRARY_FIND(cuMemcpyPeer);
455   CUDA_LIBRARY_FIND(cuMemcpyHtoD_v2);
456   CUDA_LIBRARY_FIND(cuMemcpyDtoH_v2);
457   CUDA_LIBRARY_FIND(cuMemcpyDtoD_v2);
458   CUDA_LIBRARY_FIND(cuMemcpyDtoA_v2);
459   CUDA_LIBRARY_FIND(cuMemcpyAtoD_v2);
460   CUDA_LIBRARY_FIND(cuMemcpyHtoA_v2);
461   CUDA_LIBRARY_FIND(cuMemcpyAtoH_v2);
462   CUDA_LIBRARY_FIND(cuMemcpyAtoA_v2);
463   CUDA_LIBRARY_FIND(cuMemcpy2D_v2);
464   CUDA_LIBRARY_FIND(cuMemcpy2DUnaligned_v2);
465   CUDA_LIBRARY_FIND(cuMemcpy3D_v2);
466   CUDA_LIBRARY_FIND(cuMemcpy3DPeer);
467   CUDA_LIBRARY_FIND(cuMemcpyAsync);
468   CUDA_LIBRARY_FIND(cuMemcpyPeerAsync);
469   CUDA_LIBRARY_FIND(cuMemcpyHtoDAsync_v2);
470   CUDA_LIBRARY_FIND(cuMemcpyDtoHAsync_v2);
471   CUDA_LIBRARY_FIND(cuMemcpyDtoDAsync_v2);
472   CUDA_LIBRARY_FIND(cuMemcpyHtoAAsync_v2);
473   CUDA_LIBRARY_FIND(cuMemcpyAtoHAsync_v2);
474   CUDA_LIBRARY_FIND(cuMemcpy2DAsync_v2);
475   CUDA_LIBRARY_FIND(cuMemcpy3DAsync_v2);
476   CUDA_LIBRARY_FIND(cuMemcpy3DPeerAsync);
477   CUDA_LIBRARY_FIND(cuMemsetD8_v2);
478   CUDA_LIBRARY_FIND(cuMemsetD16_v2);
479   CUDA_LIBRARY_FIND(cuMemsetD32_v2);
480   CUDA_LIBRARY_FIND(cuMemsetD2D8_v2);
481   CUDA_LIBRARY_FIND(cuMemsetD2D16_v2);
482   CUDA_LIBRARY_FIND(cuMemsetD2D32_v2);
483   CUDA_LIBRARY_FIND(cuMemsetD8Async);
484   CUDA_LIBRARY_FIND(cuMemsetD16Async);
485   CUDA_LIBRARY_FIND(cuMemsetD32Async);
486   CUDA_LIBRARY_FIND(cuMemsetD2D8Async);
487   CUDA_LIBRARY_FIND(cuMemsetD2D16Async);
488   CUDA_LIBRARY_FIND(cuMemsetD2D32Async);
489   CUDA_LIBRARY_FIND(cuArrayCreate_v2);
490   CUDA_LIBRARY_FIND(cuArrayGetDescriptor_v2);
491   CUDA_LIBRARY_FIND(cuArrayDestroy);
492   CUDA_LIBRARY_FIND(cuArray3DCreate_v2);
493   CUDA_LIBRARY_FIND(cuArray3DGetDescriptor_v2);
494   CUDA_LIBRARY_FIND(cuMipmappedArrayCreate);
495   CUDA_LIBRARY_FIND(cuMipmappedArrayGetLevel);
496   CUDA_LIBRARY_FIND(cuMipmappedArrayDestroy);
497   CUDA_LIBRARY_FIND(cuPointerGetAttribute);
498   CUDA_LIBRARY_FIND(cuMemPrefetchAsync);
499   CUDA_LIBRARY_FIND(cuMemAdvise);
500   CUDA_LIBRARY_FIND(cuMemRangeGetAttribute);
501   CUDA_LIBRARY_FIND(cuMemRangeGetAttributes);
502   CUDA_LIBRARY_FIND(cuPointerSetAttribute);
503   CUDA_LIBRARY_FIND(cuPointerGetAttributes);
504   CUDA_LIBRARY_FIND(cuStreamCreate);
505   CUDA_LIBRARY_FIND(cuStreamCreateWithPriority);
506   CUDA_LIBRARY_FIND(cuStreamGetPriority);
507   CUDA_LIBRARY_FIND(cuStreamGetFlags);
508   CUDA_LIBRARY_FIND(cuStreamGetCtx);
509   CUDA_LIBRARY_FIND(cuStreamWaitEvent);
510   CUDA_LIBRARY_FIND(cuStreamAddCallback);
511   CUDA_LIBRARY_FIND(cuStreamAttachMemAsync);
512   CUDA_LIBRARY_FIND(cuStreamQuery);
513   CUDA_LIBRARY_FIND(cuStreamSynchronize);
514   CUDA_LIBRARY_FIND(cuStreamDestroy_v2);
515   CUDA_LIBRARY_FIND(cuEventCreate);
516   CUDA_LIBRARY_FIND(cuEventRecord);
517   CUDA_LIBRARY_FIND(cuEventQuery);
518   CUDA_LIBRARY_FIND(cuEventSynchronize);
519   CUDA_LIBRARY_FIND(cuEventDestroy_v2);
520   CUDA_LIBRARY_FIND(cuEventElapsedTime);
521   CUDA_LIBRARY_FIND(cuStreamWaitValue32);
522   CUDA_LIBRARY_FIND(cuStreamWaitValue64);
523   CUDA_LIBRARY_FIND(cuStreamWriteValue32);
524   CUDA_LIBRARY_FIND(cuStreamWriteValue64);
525   CUDA_LIBRARY_FIND(cuStreamBatchMemOp);
526   CUDA_LIBRARY_FIND(cuFuncGetAttribute);
527   CUDA_LIBRARY_FIND(cuFuncSetAttribute);
528   CUDA_LIBRARY_FIND(cuFuncSetCacheConfig);
529   CUDA_LIBRARY_FIND(cuFuncSetSharedMemConfig);
530   CUDA_LIBRARY_FIND(cuLaunchKernel);
531   CUDA_LIBRARY_FIND(cuLaunchCooperativeKernel);
532   CUDA_LIBRARY_FIND(cuLaunchCooperativeKernelMultiDevice);
533   CUDA_LIBRARY_FIND(cuFuncSetBlockShape);
534   CUDA_LIBRARY_FIND(cuFuncSetSharedSize);
535   CUDA_LIBRARY_FIND(cuParamSetSize);
536   CUDA_LIBRARY_FIND(cuParamSeti);
537   CUDA_LIBRARY_FIND(cuParamSetf);
538   CUDA_LIBRARY_FIND(cuParamSetv);
539   CUDA_LIBRARY_FIND(cuLaunch);
540   CUDA_LIBRARY_FIND(cuLaunchGrid);
541   CUDA_LIBRARY_FIND(cuLaunchGridAsync);
542   CUDA_LIBRARY_FIND(cuParamSetTexRef);
543   CUDA_LIBRARY_FIND(cuOccupancyMaxActiveBlocksPerMultiprocessor);
544   CUDA_LIBRARY_FIND(cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags);
545   CUDA_LIBRARY_FIND(cuOccupancyMaxPotentialBlockSize);
546   CUDA_LIBRARY_FIND(cuOccupancyMaxPotentialBlockSizeWithFlags);
547   CUDA_LIBRARY_FIND(cuTexRefSetArray);
548   CUDA_LIBRARY_FIND(cuTexRefSetMipmappedArray);
549   CUDA_LIBRARY_FIND(cuTexRefSetAddress_v2);
550   CUDA_LIBRARY_FIND(cuTexRefSetAddress2D_v3);
551   CUDA_LIBRARY_FIND(cuTexRefSetFormat);
552   CUDA_LIBRARY_FIND(cuTexRefSetAddressMode);
553   CUDA_LIBRARY_FIND(cuTexRefSetFilterMode);
554   CUDA_LIBRARY_FIND(cuTexRefSetMipmapFilterMode);
555   CUDA_LIBRARY_FIND(cuTexRefSetMipmapLevelBias);
556   CUDA_LIBRARY_FIND(cuTexRefSetMipmapLevelClamp);
557   CUDA_LIBRARY_FIND(cuTexRefSetMaxAnisotropy);
558   CUDA_LIBRARY_FIND(cuTexRefSetBorderColor);
559   CUDA_LIBRARY_FIND(cuTexRefSetFlags);
560   CUDA_LIBRARY_FIND(cuTexRefGetAddress_v2);
561   CUDA_LIBRARY_FIND(cuTexRefGetArray);
562   CUDA_LIBRARY_FIND(cuTexRefGetMipmappedArray);
563   CUDA_LIBRARY_FIND(cuTexRefGetAddressMode);
564   CUDA_LIBRARY_FIND(cuTexRefGetFilterMode);
565   CUDA_LIBRARY_FIND(cuTexRefGetFormat);
566   CUDA_LIBRARY_FIND(cuTexRefGetMipmapFilterMode);
567   CUDA_LIBRARY_FIND(cuTexRefGetMipmapLevelBias);
568   CUDA_LIBRARY_FIND(cuTexRefGetMipmapLevelClamp);
569   CUDA_LIBRARY_FIND(cuTexRefGetMaxAnisotropy);
570   CUDA_LIBRARY_FIND(cuTexRefGetBorderColor);
571   CUDA_LIBRARY_FIND(cuTexRefGetFlags);
572   CUDA_LIBRARY_FIND(cuTexRefCreate);
573   CUDA_LIBRARY_FIND(cuTexRefDestroy);
574   CUDA_LIBRARY_FIND(cuSurfRefSetArray);
575   CUDA_LIBRARY_FIND(cuSurfRefGetArray);
576   CUDA_LIBRARY_FIND(cuTexObjectCreate);
577   CUDA_LIBRARY_FIND(cuTexObjectDestroy);
578   CUDA_LIBRARY_FIND(cuTexObjectGetResourceDesc);
579   CUDA_LIBRARY_FIND(cuTexObjectGetTextureDesc);
580   CUDA_LIBRARY_FIND(cuTexObjectGetResourceViewDesc);
581   CUDA_LIBRARY_FIND(cuSurfObjectCreate);
582   CUDA_LIBRARY_FIND(cuSurfObjectDestroy);
583   CUDA_LIBRARY_FIND(cuSurfObjectGetResourceDesc);
584   CUDA_LIBRARY_FIND(cuDeviceCanAccessPeer);
585   CUDA_LIBRARY_FIND(cuCtxEnablePeerAccess);
586   CUDA_LIBRARY_FIND(cuCtxDisablePeerAccess);
587   CUDA_LIBRARY_FIND(cuDeviceGetP2PAttribute);
588   CUDA_LIBRARY_FIND(cuGraphicsUnregisterResource);
589   CUDA_LIBRARY_FIND(cuGraphicsSubResourceGetMappedArray);
590   CUDA_LIBRARY_FIND(cuGraphicsResourceGetMappedMipmappedArray);
591   CUDA_LIBRARY_FIND(cuGraphicsResourceGetMappedPointer_v2);
592   CUDA_LIBRARY_FIND(cuGraphicsResourceSetMapFlags_v2);
593   CUDA_LIBRARY_FIND(cuGraphicsMapResources);
594   CUDA_LIBRARY_FIND(cuGraphicsUnmapResources);
595   CUDA_LIBRARY_FIND(cuGetExportTable);
596 
597   CUDA_LIBRARY_FIND(cuGraphicsGLRegisterBuffer);
598   CUDA_LIBRARY_FIND(cuGraphicsGLRegisterImage);
599   CUDA_LIBRARY_FIND(cuGLGetDevices_v2);
600   CUDA_LIBRARY_FIND(cuGLCtxCreate_v2);
601   CUDA_LIBRARY_FIND(cuGLInit);
602   CUDA_LIBRARY_FIND(cuGLRegisterBufferObject);
603   CUDA_LIBRARY_FIND(cuGLMapBufferObject_v2);
604   CUDA_LIBRARY_FIND(cuGLUnmapBufferObject);
605   CUDA_LIBRARY_FIND(cuGLUnregisterBufferObject);
606   CUDA_LIBRARY_FIND(cuGLSetBufferObjectMapFlags);
607   CUDA_LIBRARY_FIND(cuGLMapBufferObjectAsync_v2);
608   CUDA_LIBRARY_FIND(cuGLUnmapBufferObjectAsync);
609 
610   result = CUEW_SUCCESS;
611   return result;
612 }
613 
cuewExitNvrtc(void)614 static void cuewExitNvrtc(void) {
615   if (nvrtc_lib != NULL) {
616     /*  Ignore errors. */
617     dynamic_library_close(nvrtc_lib);
618     nvrtc_lib = NULL;
619   }
620 }
621 
cuewNvrtcInit(void)622 static int cuewNvrtcInit(void) {
623   /* Library paths. */
624 #ifdef _WIN32
625   /* Expected in c:/windows/system or similar, no path needed. */
626   const char *nvrtc_paths[] = {"nvrtc64_101_0.dll",
627                                "nvrtc64_100_0.dll",
628                                "nvrtc64_91.dll",
629                                "nvrtc64_90.dll",
630                                "nvrtc64_80.dll",
631                                NULL};
632 #elif defined(__APPLE__)
633   /* Default installation path. */
634   const char *nvrtc_paths[] = {"/usr/local/cuda/lib/libnvrtc.dylib", NULL};
635 #else
636   const char *nvrtc_paths[] = {"libnvrtc.so",
637 #  if defined(__x86_64__) || defined(_M_X64)
638                                "/usr/local/cuda/lib64/libnvrtc.so",
639 #else
640                                "/usr/local/cuda/lib/libnvrtc.so",
641 #endif
642                                NULL};
643 #endif
644   static int initialized = 0;
645   static int result = 0;
646   int error;
647 
648   if (initialized) {
649     return result;
650   }
651 
652   initialized = 1;
653 
654   error = atexit(cuewExitNvrtc);
655   if (error) {
656     result = CUEW_ERROR_ATEXIT_FAILED;
657     return result;
658   }
659 
660   /* Load library. */
661   nvrtc_lib = dynamic_library_open_find(nvrtc_paths);
662 
663   if (nvrtc_lib == NULL) {
664     result = CUEW_ERROR_OPEN_FAILED;
665     return result;
666   }
667 
668   NVRTC_LIBRARY_FIND(nvrtcGetErrorString);
669   NVRTC_LIBRARY_FIND(nvrtcVersion);
670   NVRTC_LIBRARY_FIND(nvrtcCreateProgram);
671   NVRTC_LIBRARY_FIND(nvrtcDestroyProgram);
672   NVRTC_LIBRARY_FIND(nvrtcCompileProgram);
673   NVRTC_LIBRARY_FIND(nvrtcGetPTXSize);
674   NVRTC_LIBRARY_FIND(nvrtcGetPTX);
675   NVRTC_LIBRARY_FIND(nvrtcGetProgramLogSize);
676   NVRTC_LIBRARY_FIND(nvrtcGetProgramLog);
677   NVRTC_LIBRARY_FIND(nvrtcAddNameExpression);
678   NVRTC_LIBRARY_FIND(nvrtcGetLoweredName);
679 
680   result = CUEW_SUCCESS;
681   return result;
682 }
683 
684 
/*
 * Public entry point: initialize the libraries selected by `flags`.
 *
 * `flags` is a bit mask; CUEW_INIT_CUDA requests the CUDA driver library and
 * CUEW_INIT_NVRTC requests NVRTC. CUDA is handled first. The first failing
 * initialization stops the sequence and its error code is returned;
 * otherwise (including when no bit is set) the result is CUEW_SUCCESS.
 */
int cuewInit(cuuint32_t flags) {
  if ((flags & CUEW_INIT_CUDA) != 0) {
    const int cuda_status = cuewCudaInit();
    if (cuda_status != CUEW_SUCCESS) {
      return cuda_status;
    }
  }

  if ((flags & CUEW_INIT_NVRTC) != 0) {
    const int nvrtc_status = cuewNvrtcInit();
    if (nvrtc_status != CUEW_SUCCESS) {
      return nvrtc_status;
    }
  }

  return CUEW_SUCCESS;
}
704 
705 
/*
 * Translate a CUresult code into a short English description.
 *
 * The returned pointer refers to a string literal, so the caller must not
 * modify or free it. Codes that are not in the table yield
 * "Unknown CUDA error value".
 */
const char *cuewErrorString(CUresult result) {
  static const struct {
    CUresult code;
    const char *text;
  } messages[] = {
    {CUDA_SUCCESS, "No errors"},
    {CUDA_ERROR_INVALID_VALUE, "Invalid value"},
    {CUDA_ERROR_OUT_OF_MEMORY, "Out of memory"},
    {CUDA_ERROR_NOT_INITIALIZED, "Driver not initialized"},
    {CUDA_ERROR_DEINITIALIZED, "Driver deinitialized"},
    {CUDA_ERROR_PROFILER_DISABLED, "Profiler disabled"},
    {CUDA_ERROR_PROFILER_NOT_INITIALIZED, "Profiler not initialized"},
    {CUDA_ERROR_PROFILER_ALREADY_STARTED, "Profiler already started"},
    {CUDA_ERROR_PROFILER_ALREADY_STOPPED, "Profiler already stopped"},
    {CUDA_ERROR_NO_DEVICE, "No CUDA-capable device available"},
    {CUDA_ERROR_INVALID_DEVICE, "Invalid device"},
    {CUDA_ERROR_INVALID_IMAGE, "Invalid kernel image"},
    {CUDA_ERROR_INVALID_CONTEXT, "Invalid context"},
    {CUDA_ERROR_CONTEXT_ALREADY_CURRENT, "Context already current"},
    {CUDA_ERROR_MAP_FAILED, "Map failed"},
    {CUDA_ERROR_UNMAP_FAILED, "Unmap failed"},
    {CUDA_ERROR_ARRAY_IS_MAPPED, "Array is mapped"},
    {CUDA_ERROR_ALREADY_MAPPED, "Already mapped"},
    {CUDA_ERROR_NO_BINARY_FOR_GPU, "No binary for GPU"},
    {CUDA_ERROR_ALREADY_ACQUIRED, "Already acquired"},
    {CUDA_ERROR_NOT_MAPPED, "Not mapped"},
    {CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "Mapped resource not available for access as an array"},
    {CUDA_ERROR_NOT_MAPPED_AS_POINTER, "Mapped resource not available for access as a pointer"},
    {CUDA_ERROR_ECC_UNCORRECTABLE, "Uncorrectable ECC error detected"},
    {CUDA_ERROR_UNSUPPORTED_LIMIT, "CUlimit not supported by device"},
    {CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "Context already in use"},
    {CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, "Peer access unsupported"},
    {CUDA_ERROR_INVALID_PTX, "Invalid ptx"},
    {CUDA_ERROR_INVALID_GRAPHICS_CONTEXT, "Invalid graphics context"},
    {CUDA_ERROR_NVLINK_UNCORRECTABLE, "Nvlink uncorrectable"},
    {CUDA_ERROR_JIT_COMPILER_NOT_FOUND, "Jit compiler not found"},
    {CUDA_ERROR_INVALID_SOURCE, "Invalid source"},
    {CUDA_ERROR_FILE_NOT_FOUND, "File not found"},
    {CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, "Link to a shared object failed to resolve"},
    {CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "Shared object initialization failed"},
    {CUDA_ERROR_OPERATING_SYSTEM, "Operating system"},
    {CUDA_ERROR_INVALID_HANDLE, "Invalid handle"},
    {CUDA_ERROR_NOT_FOUND, "Not found"},
    {CUDA_ERROR_NOT_READY, "CUDA not ready"},
    {CUDA_ERROR_ILLEGAL_ADDRESS, "Illegal address"},
    {CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "Launch exceeded resources"},
    {CUDA_ERROR_LAUNCH_TIMEOUT, "Launch exceeded timeout"},
    {CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, "Launch with incompatible texturing"},
    {CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, "Peer access already enabled"},
    {CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, "Peer access not enabled"},
    {CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "Primary context active"},
    {CUDA_ERROR_CONTEXT_IS_DESTROYED, "Context is destroyed"},
    {CUDA_ERROR_ASSERT, "Assert"},
    {CUDA_ERROR_TOO_MANY_PEERS, "Too many peers"},
    {CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED, "Host memory already registered"},
    {CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, "Host memory not registered"},
    {CUDA_ERROR_HARDWARE_STACK_ERROR, "Hardware stack error"},
    {CUDA_ERROR_ILLEGAL_INSTRUCTION, "Illegal instruction"},
    {CUDA_ERROR_MISALIGNED_ADDRESS, "Misaligned address"},
    {CUDA_ERROR_INVALID_ADDRESS_SPACE, "Invalid address space"},
    {CUDA_ERROR_INVALID_PC, "Invalid pc"},
    {CUDA_ERROR_LAUNCH_FAILED, "Launch failed"},
    {CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, "Cooperative launch too large"},
    {CUDA_ERROR_NOT_PERMITTED, "Not permitted"},
    {CUDA_ERROR_NOT_SUPPORTED, "Not supported"},
    {CUDA_ERROR_UNKNOWN, "Unknown error"},
  };
  const size_t message_count = sizeof(messages) / sizeof(messages[0]);
  size_t i;

  for (i = 0; i < message_count; ++i) {
    if (messages[i].code == result) {
      return messages[i].text;
    }
  }

  return "Unknown CUDA error value";
}
772 
/* Join two path components with the platform's directory separator.
 *
 * Writes "<path1><separator><path2>" into `result`, which must provide room
 * for `maxlen` bytes. The separator is '\\' on Windows builds and '/'
 * everywhere else.
 *
 * When `maxlen` is positive the output is always NUL-terminated; a joined
 * path which does not fit is truncated to `maxlen - 1` characters. When
 * `maxlen` is zero or negative nothing is written at all.
 */
static void path_join(const char *path1,
                      const char *path2,
                      int maxlen,
                      char *result) {
#if defined(WIN32) || defined(_WIN32)
  const char separator = '\\';
#else
  const char separator = '/';
#endif
  int n;

  /* Without any room there is no valid index to terminate at:
   * result[maxlen - 1] would be an out-of-bounds write. */
  if (maxlen <= 0) {
    return;
  }

  n = snprintf(result, maxlen, "%s%c%s", path1, separator, path2);
  if (n < 0 || n >= maxlen) {
    /* Output was truncated, or a legacy _snprintf() reported failure without
     * writing a terminator. Force termination at the end of the buffer. */
    result[maxlen - 1] = '\0';
  }
}
790 
/* Check whether `path` can be stat()'ed.
 *
 * Returns 1 when stat() succeeds on the path and 0 when it fails for any
 * reason.
 */
static int path_exists(const char *path) {
  struct stat info;
  return (stat(path, &info) == 0) ? 1 : 0;
}
798 
/* Locate the nvcc compiler executable.
 *
 * Candidates are tried in this order:
 *   1. The directory named by the CUDA_BIN_PATH environment variable.
 *   2. A built-in list of well-known installation directories.
 *   3. The system search path, probed by running `where nvcc` (Windows) or
 *      `which nvcc` (elsewhere) and checking whether it printed anything.
 *
 * Returns a pointer to a static buffer holding the full path for cases 1 and
 * 2 (overwritten by the next call), the literal "nvcc" for case 3, or NULL
 * when no compiler could be found.
 */
const char *cuewCompilerPath(void) {
#ifdef _WIN32
  const char *executable = "nvcc.exe";
  const char *defaultpaths[] = {
    "C:/CUDA/bin",
    "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin",
    NULL};
#else
  const char *executable = "nvcc";
  const char *defaultpaths[] = {
    "/Developer/NVIDIA/CUDA-5.0/bin",
    "/usr/local/cuda-5.0/bin",
    "/usr/local/cuda/bin",
    "/Developer/NVIDIA/CUDA-6.0/bin",
    "/usr/local/cuda-6.0/bin",
    "/Developer/NVIDIA/CUDA-5.5/bin",
    "/usr/local/cuda-5.5/bin",
    NULL};
#endif
  const char *env_dir = getenv("CUDA_BIN_PATH");
  const char **dir;
  FILE *probe;

  static char nvcc[65536];

  /* An explicit user override takes precedence over everything else. */
  if (env_dir != NULL) {
    path_join(env_dir, executable, sizeof(nvcc), nvcc);
    if (path_exists(nvcc)) {
      return nvcc;
    }
  }

  /* Walk the NULL-terminated table of default install locations. */
  for (dir = defaultpaths; *dir != NULL; ++dir) {
    path_join(*dir, executable, sizeof(nvcc), nvcc);
    if (path_exists(nvcc)) {
      return nvcc;
    }
  }

  /* Last resort: ask the shell whether nvcc is reachable via the search
   * path. Any output at all is treated as "found". */
#ifdef _WIN32
  probe = popen("where nvcc", "r");
#else
  probe = popen("which nvcc", "r");
#endif
  if (probe != NULL) {
    char buffer[4096] = {0};
    int len = fread(buffer, 1, sizeof(buffer) - 1, probe);
    buffer[len] = '\0';
    pclose(probe);
    if (buffer[0] != '\0') {
      return "nvcc";
    }
  }

  return NULL;
}
857 
cuewNvrtcVersion(void)858 int cuewNvrtcVersion(void) {
859   int major, minor;
860   if (nvrtcVersion) {
861     nvrtcVersion(&major, &minor);
862     return 10 * major + minor;
863   }
864   return 0;
865 }
866 
/* Query the version of the nvcc compiler located by cuewCompilerPath().
 *
 * Runs `"<nvcc>" --version` through popen(), collects what the command
 * prints on its standard output and parses the "major.minor" number which
 * follows the "Cuda compilation tools, release " marker.
 *
 * Returns 10 * major + minor on success. Returns 0 when no compiler was
 * found, the command line does not fit into the command buffer, the compiler
 * could not be started, or the version number could not be located/parsed
 * (a diagnostic is printed to stderr in the latter three cases).
 */
int cuewCompilerVersion(void) {
  const char *path = cuewCompilerPath();
  const char *marker = "Cuda compilation tools, release ";
  FILE *pipe;
  int major, minor;
  int n;
  size_t used = 0;
  char *versionstr;
  char buf[128];
  char output[65536] = "\0";
  char command[65536] = "\0";

  if (path == NULL) {
    return 0;
  }

  /* get --version output */
  /* Build the quoted command in one bounded step. A negative result covers
   * legacy _snprintf() truncation, n >= size covers C99 truncation; either
   * way a cut-off command must not be executed. */
  n = snprintf(command, sizeof(command), "\"%s\" --version", path);
  if (n < 0 || (size_t)n >= sizeof(command)) {
    fprintf(stderr, "CUDA: compiler path is too long to query version\n");
    return 0;
  }

  pipe = popen(command, "r");
  if (!pipe) {
    fprintf(stderr, "CUDA: failed to run compiler to retrieve version");
    return 0;
  }

  /* Loop on fgets() itself so that both end-of-file and read errors end the
   * loop. Lines which no longer fit are still read (and dropped) so the
   * child process can finish writing. */
  while (fgets(buf, sizeof(buf), pipe) != NULL) {
    size_t chunk = strlen(buf);
    size_t room = sizeof(output) - 1 - used;
    if (chunk > room) {
      chunk = room;
    }
    memcpy(output + used, buf, chunk);
    used += chunk;
    output[used] = '\0';
  }

  pclose(pipe);

  /* parse version number */
  versionstr = strstr(output, marker);
  if (versionstr == NULL) {
    fprintf(stderr, "CUDA: failed to find version number in:\n\n%s\n", output);
    return 0;
  }
  versionstr += strlen(marker);

  if (sscanf(versionstr, "%d.%d", &major, &minor) < 2) {
    fprintf(stderr, "CUDA: failed to parse version number from:\n\n%s\n", output);
    return 0;
  }

  return 10 * major + minor;
}
914