1 /*
2  *  Copyright (c) 2004-2017, Bruno Levy
3  *  All rights reserved.
4  *
5  *  Redistribution and use in source and binary forms, with or without
6  *  modification, are permitted provided that the following conditions are met:
7  *
8  *  * Redistributions of source code must retain the above copyright notice,
9  *  this list of conditions and the following disclaimer.
10  *  * Redistributions in binary form must reproduce the above copyright notice,
11  *  this list of conditions and the following disclaimer in the documentation
12  *  and/or other materials provided with the distribution.
13  *  * Neither the name of the ALICE Project-Team nor the names of its
14  *  contributors may be used to endorse or promote products derived from this
15  *  software without specific prior written permission.
16  *
17  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18  *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  *  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
21  *  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22  *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23  *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24  *  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25  *  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26  *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27  *  POSSIBILITY OF SUCH DAMAGE.
28  *
29  *  If you modify this software, you should include a notice giving the
30  *  name of the person performing the modification, the date of modification,
31  *  and the reason for such modification.
32  *
33  *  Contact: Bruno Levy
34  *
35  *     levy@loria.fr
36  *
37  *     ALICE Project
38  *     LORIA, INRIA Lorraine,
39  *     Campus Scientifique, BP 239
40  *     54506 VANDOEUVRE LES NANCY CEDEX
41  *     FRANCE
42  *
43  */
44 
45 #include "nl_cuda.h"
46 #include "nl_context.h"
47 #include <stdint.h>
48 
49 /**
50  * \file nl_cuda.c
51  * \brief Weak-coupling adapter to call CUDA from OpenNL.
52  */
53 
54 /**********************************************************/
55 /*      CUDA structures and functions                     */
56 /* Repeated here so that one can compile OpenNL without   */
57 /* requiring CUDA to be installed in the system.          */
58 /**********************************************************/
59 
/*
 * Mirror of the CUDA runtime's cudaComputeMode enum.
 * Values must match the ones in cuda_runtime_api.h exactly,
 * since they are passed to / received from the dynamically
 * loaded CUDA runtime.
 */
enum cudaComputeMode {
    cudaComputeModeDefault          = 0,
    cudaComputeModeExclusive        = 1,
    cudaComputeModeProhibited       = 2,  /* device unusable: see getDeviceDoublePrecisionGFlops() */
    cudaComputeModeExclusiveProcess = 3
};
66 
/*
 * Mirror of the CUDA runtime's cudaMemcpyKind enum (direction of a
 * cudaMemcpy). Values must match the ones in cuda_runtime_api.h.
 */
enum cudaMemcpyKind {
    cudaMemcpyHostToHost          =   0,
    cudaMemcpyHostToDevice        =   1,
    cudaMemcpyDeviceToHost        =   2,
    cudaMemcpyDeviceToDevice      =   3,
    cudaMemcpyDefault             =   4   /* direction inferred from unified addressing */
};
74 
/*
 * Mirror of the CUDA driver API's CUdevice_attribute enum, used with
 * cudaDeviceGetAttribute(). Numeric values must match the CUDA headers.
 * Note: some names share the same value on purpose (8, 12, 27, 28, 29):
 * they are legacy aliases kept by NVIDIA for backward compatibility.
 */
enum cudaDeviceAttribute {
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4,
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,
    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,
    CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,     /* legacy alias */
    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,
    CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,
    CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,
    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,
    CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,        /* legacy alias */
    CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,
    CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,
    CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,
    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,
    CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17,
    CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,
    CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,
    CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27,     /* legacy alias */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28,    /* legacy alias */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /* legacy alias */
    CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30,
    CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,
    CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32,
    CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33,
    CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34,
    CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35,
    CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36,
    CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37,
    CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38,
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,
    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,
    CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43,
    CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49,
    CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50,
    CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74,
    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77,
    CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78,
    CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79,
    CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80,
    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81,
    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82,
    CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83,
    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84,
    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85,
    CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86,
    CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87,
    CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88,
    CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89,
    CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90,
    CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91,
    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92,
    CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93,
    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94,
    CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95,
    CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96,
    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97,
    CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98,
    CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99,
    CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100,
    CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101,
    CU_DEVICE_ATTRIBUTE_MAX
};
184 
185 /*
186  * We use cudaGetDeviceProperties() to get the name of the device.
187  * There should be cudaDeviceGetName() but it is not always there.
188  * We do not use the other device properties because the order of
189  * the fields change in the different CUDA versions.
190  */
191 struct cudaDeviceProp {
192     char name[256];
193     char buff[4096];
194 };
195 
/* CUDA runtime error code (0 means cudaSuccess). */
typedef int cudaError_t;

/*
 * Function-pointer types for the subset of the CUDA runtime API used
 * by OpenNL. They are resolved at runtime from the dynamically loaded
 * cudart library (see find_cuda_func()).
 */
typedef cudaError_t (*FUNPTR_cudaDriverGetVersion)(int* version);
typedef cudaError_t (*FUNPTR_cudaRuntimeGetVersion)(int* version);
typedef cudaError_t (*FUNPTR_cudaGetDeviceCount)(int* device_count);
typedef cudaError_t (*FUNPTR_cudaGetDeviceProperties)(
    struct cudaDeviceProp *props, int device
);
typedef cudaError_t (*FUNPTR_cudaDeviceGetAttribute)(
    int* attrib_value, enum cudaDeviceAttribute attrib, int device
);
typedef cudaError_t (*FUNPTR_cudaDeviceReset)(void);
typedef cudaError_t (*FUNPTR_cudaMalloc)(void **devPtr, size_t size);
typedef cudaError_t (*FUNPTR_cudaFree)(void* devPtr);
typedef cudaError_t (*FUNPTR_cudaMemcpy)(
    void *dst, const void *src, size_t count, enum cudaMemcpyKind kind
);
213 
214 /**
215  * \brief Finds and initializes a function pointer to
216  *  one of the functions in CUDA.
217  * \details Function pointers are stored into the
218  *  CUDAContext returned by the function CUDA().
219  *  If a symbol is not found, returns NL_FALSE from the
220  *  calling function.
221  */
222 #define find_cuda_func(name)                                  \
223     if(                                                       \
224         (                                                     \
225             CUDA()->name =                                    \
226             (FUNPTR_##name)nlFindFunction(                    \
227 		   CUDA()->DLL_cudart,#name                   \
228 	    )					              \
229         ) == NULL                                             \
230     ) {                                                       \
231         nlError("nlInitExtension_CUDA: function not found", #name); \
232         return NL_FALSE;                                      \
233     }
234 
235 /**********************************************************/
236 /*      CUBLAS structures and functions                   */
237 /**********************************************************/
238 
/* Opaque cuBLAS library handle. */
struct cublasContext;
typedef struct cublasContext *cublasHandle_t;
/* cuBLAS status code (0 means CUBLAS_STATUS_SUCCESS). */
typedef int cublasStatus_t;

/* Mirrors of the small cuBLAS enums used by the Ddgmm/Dgemv/Dtpsv
   wrappers below. Values must match cublas_api.h. */
typedef enum {
    CUBLAS_SIDE_LEFT =0,
    CUBLAS_SIDE_RIGHT=1
} cublasSideMode_t;

typedef enum {
    CUBLAS_FILL_MODE_LOWER=0,
    CUBLAS_FILL_MODE_UPPER=1
} cublasFillMode_t;

typedef enum {
    CUBLAS_OP_N=0,
    CUBLAS_OP_T=1,
    CUBLAS_OP_C=2
} cublasOperation_t;

typedef enum {
    CUBLAS_DIAG_NON_UNIT=0,
    CUBLAS_DIAG_UNIT=1
} cublasDiagType_t;

/* Function-pointer types for the cuBLAS entry points used by OpenNL,
   resolved at runtime with find_cublas_func() / find_cublas_func_v1(). */
typedef cublasStatus_t (*FUNPTR_cublasCreate)(cublasHandle_t* handle);
typedef cublasStatus_t (*FUNPTR_cublasDestroy)(cublasHandle_t handle);

typedef cublasStatus_t (*FUNPTR_cublasGetVersion)(
    cublasHandle_t handle, int* version
);

typedef cublasStatus_t (*FUNPTR_cublasDdot)(
    cublasHandle_t handle, int n,
    const double *x, int incx,
    const double *y, int incy,
    double *result
);

/* NOTE(review): in the official cuBLAS API the y argument of Dcopy and
   Daxpy is non-const (it is the output). The extra const here is
   harmless for callers (double* converts to const double*), but does
   not match the real prototype — verify against cublas_api.h. */
typedef cublasStatus_t (*FUNPTR_cublasDcopy)(
    cublasHandle_t handle, int n,
    const double *x, int incx,
    const double *y, int incy
);

typedef cublasStatus_t (*FUNPTR_cublasDaxpy)(
    cublasHandle_t handle, int n,
    const double* alpha,
    const double *x, int incx,
    const double *y, int incy
);

typedef cublasStatus_t (*FUNPTR_cublasDscal)(
    cublasHandle_t handle, int n,
    const double* alpha,
    const double *x, int incx
);

typedef cublasStatus_t (*FUNPTR_cublasDnrm2)(
    cublasHandle_t handle, int n,
    const double *x, int incx,
    double* result
);

typedef cublasStatus_t (*FUNPTR_cublasDdgmm)(
    cublasHandle_t handle, cublasSideMode_t mode,
    int m, int n,
    const double* A, int lda,
    const double* x, int incx,
    double* C, int ldc
);

typedef cublasStatus_t (*FUNPTR_cublasDgemv)(
    cublasHandle_t handle,
    cublasOperation_t trans,
    int m,
    int n,
    const double *alpha,
    const double *A,
    int lda,
    const double *x,
    int incx,
    const double *beta,
    double *y,
    int incy
);

typedef cublasStatus_t (*FUNPTR_cublasDtpsv)(
    cublasHandle_t handle, cublasFillMode_t uplo,
    cublasOperation_t trans, cublasDiagType_t diag,
    int n, const double *AP,
    double* x, int incx
);
332 
333 
334 /**
335  * \brief Finds and initializes a function pointer to
336  *  one of the functions in CUBLAS.
337  * \details Function pointers are stored into the
338  *  CUDAContext returned by the function CUDA().
339  *  If a symbol is not found, returns NL_FALSE from the
340  *  calling function. Here we use the functions suffixed
341  *  by "_v2".
342  */
343 #define find_cublas_func(name)	    		              \
344     if(                                                       \
345         (                                                     \
346             CUDA()->name =                                    \
347             (FUNPTR_##name)nlFindFunction(                    \
348 		   CUDA()->DLL_cublas,#name "_v2"             \
349 	    )					              \
350         ) == NULL                                             \
351     ) {                                                       \
352         nlError("nlInitExtension_CUDA: function not found", #name); \
353         return NL_FALSE;                                      \
354     }
355 
356 /**
357  * \brief Finds and initializes a function pointer to
358  *  one of the functions in CUBLAS.
359  * \details Function pointers are stored into the
360  *  CUDAContext returned by the function CUDA().
361  *  If a symbol is not found, returns NL_FALSE from the
362  *  calling function.
363  */
364 #define find_cublas_func_v1(name)                             \
365     if(                                                       \
366         (                                                     \
367             CUDA()->name =                                    \
368             (FUNPTR_##name)nlFindFunction(                    \
369 		   CUDA()->DLL_cublas,#name                   \
370 	    )					              \
371         ) == NULL                                             \
372     ) {                                                       \
373         nlError("nlInitExtension_CUDA: function not found", #name); \
374         return NL_FALSE;                                      \
375     }
376 
377 
378 
379 /**********************************************************/
380 /*      CUSPARSE structures and functions                 */
381 /**********************************************************/
382 
/* Opaque cuSPARSE library handle. */
struct cusparseContext;
typedef struct cusparseContext *cusparseHandle_t;
/* cuSPARSE status code (0 means CUSPARSE_STATUS_SUCCESS). */
typedef int cusparseStatus_t;
/* Opaque matrix descriptor (legacy cuSPARSE API). */
struct cusparseMatDescr;
typedef struct cusparseMatDescr *cusparseMatDescr_t;

/* Mirrors of the cuSPARSE enums used below; values must match
   cusparse.h. */
typedef enum {
    CUSPARSE_MATRIX_TYPE_GENERAL = 0,
    CUSPARSE_MATRIX_TYPE_SYMMETRIC = 1,
    CUSPARSE_MATRIX_TYPE_HERMITIAN = 2,
    CUSPARSE_MATRIX_TYPE_TRIANGULAR = 3
} cusparseMatrixType_t;

typedef enum {
    CUSPARSE_INDEX_BASE_ZERO = 0,
    CUSPARSE_INDEX_BASE_ONE = 1
} cusparseIndexBase_t;

typedef enum {
    CUSPARSE_OPERATION_NON_TRANSPOSE = 0,
    CUSPARSE_OPERATION_TRANSPOSE = 1,
    CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2
} cusparseOperation_t;

/* Function-pointer types for the legacy cuSPARSE entry points,
   resolved at runtime with find_cusparse_func(). */
typedef cusparseStatus_t (*FUNPTR_cusparseCreate)(cusparseHandle_t* handle);
typedef cusparseStatus_t (*FUNPTR_cusparseDestroy)(cusparseHandle_t handle);
typedef cusparseStatus_t (*FUNPTR_cusparseGetVersion)(
    cusparseHandle_t handle, int* version
);
typedef cusparseStatus_t (*FUNPTR_cusparseCreateMatDescr)(
    cusparseMatDescr_t* descr
);
typedef cusparseStatus_t (*FUNPTR_cusparseDestroyMatDescr)(
    cusparseMatDescr_t descr
);
typedef cusparseStatus_t (*FUNPTR_cusparseSetMatType)(
    cusparseMatDescr_t descr, cusparseMatrixType_t mtype
);
typedef cusparseStatus_t (*FUNPTR_cusparseSetMatIndexBase)(
    cusparseMatDescr_t descr, cusparseIndexBase_t ibase
);
/* Legacy CSR matrix-vector product y = alpha*op(A)*x + beta*y.
   NOTE(review): cusparseDcsrmv was removed from cuSPARSE in CUDA 11;
   newer toolkits only provide the generic SpMV API declared below. */
typedef cusparseStatus_t (*FUNPTR_cusparseDcsrmv)(
     cusparseHandle_t handle, cusparseOperation_t transA,
     int m, int n, int nnz,
     const double *alpha, const cusparseMatDescr_t descrA,
     const double *csrSortedValA, const int *csrSortedRowPtrA,
     const int *csrSortedColIndA, const double *x,
     const double *beta, double *y
);
432 
433 /**********************************************************/
434 /* CUSPARSE v10 structures and functions (generic matrix) */
435 /**********************************************************/
436 
/* Mirror of CUDA's library data-type enum (library_types.h);
   used to tag vector / matrix value types in the generic API. */
typedef enum cudaDataType_t {
    CUDA_R_16F= 2,  CUDA_C_16F= 6,
    CUDA_R_32F= 0,  CUDA_C_32F= 4,
    CUDA_R_64F= 1,  CUDA_C_64F= 5,
    CUDA_R_8I = 3,  CUDA_C_8I = 7,
    CUDA_R_8U = 8,  CUDA_C_8U = 9,
    CUDA_R_32I= 10, CUDA_C_32I= 11,
    CUDA_R_32U= 12, CUDA_C_32U= 13
} cudaDataType;

/* Opaque descriptors of the cuSPARSE generic (v10+) API:
   dense vector and sparse matrix. */
struct cusparseDnVecDescr;
struct cusparseSpMatDescr;
typedef struct cusparseDnVecDescr* cusparseDnVecDescr_t;
typedef struct cusparseSpMatDescr* cusparseSpMatDescr_t;

/* Integer type used for CSR row offsets / column indices. */
typedef enum {
    CUSPARSE_INDEX_16U = 1,
    CUSPARSE_INDEX_32I = 2,
    CUSPARSE_INDEX_64I = 3
} cusparseIndexType_t;

/* Function-pointer types for the generic SpMV path, resolved with
   find_cusparse_func_V10(). These are optional: when absent, OpenNL
   falls back to the legacy cusparseDcsrmv path. */
typedef cusparseStatus_t (*FUNPTR_cusparseCreateDnVec)(
    cusparseDnVecDescr_t* dnVecDescr,
    int64_t size, void* values, cudaDataType valueType
);

typedef cusparseStatus_t (*FUNPTR_cusparseDestroyDnVec)(
    cusparseDnVecDescr_t dnVecDescr
);

typedef cusparseStatus_t (*FUNPTR_cusparseDnVecSetValues)(
    cusparseDnVecDescr_t dnVecDescr, void* values
);

typedef cusparseStatus_t (*FUNPTR_cusparseCreateCsr)(
     cusparseSpMatDescr_t* spMatDescr,
     int64_t rows, int64_t cols, int64_t nnz,
     void* csrRowOffsets, void* csrColInd, void* csrValues,
     cusparseIndexType_t csrRowOffsetsType,
     cusparseIndexType_t csrColIndType,
     cusparseIndexBase_t idxBase,
     cudaDataType valueType
);

typedef cusparseStatus_t (*FUNPTR_cusparseDestroySpMat)(
     cusparseSpMatDescr_t spMatDescr
);

/* SpMV algorithm selector of the generic API. */
typedef enum {
    CUSPARSE_MV_ALG_DEFAULT = 0,
    CUSPARSE_COOMV_ALG      = 1,
    CUSPARSE_CSRMV_ALG1     = 2,
    CUSPARSE_CSRMV_ALG2     = 3
} cusparseSpMVAlg_t;

typedef cusparseStatus_t  (*FUNPTR_cusparseSpMV)(
   cusparseHandle_t handle, cusparseOperation_t opA,
   const void* alpha, const cusparseSpMatDescr_t matA,
   const cusparseDnVecDescr_t vecX, const void* beta,
   const cusparseDnVecDescr_t vecY, cudaDataType computeType,
   cusparseSpMVAlg_t alg, void* externalBuffer
);

/* Queries the size of the externalBuffer needed by cusparseSpMV(). */
typedef cusparseStatus_t (*FUNPTR_cusparseSpMV_bufferSize)(
    cusparseHandle_t handle, cusparseOperation_t opA,
    const void* alpha, const cusparseSpMatDescr_t matA,
    const cusparseDnVecDescr_t vecX, const void* beta,
    const cusparseDnVecDescr_t vecY, cudaDataType computeType,
    cusparseSpMVAlg_t alg, size_t* bufferSize
);
507 
508 /**
509  * \brief Finds and initializes a function pointer to
510  *  one of the functions in CUSPARSE.
511  * \details Function pointers are stored into the
512  *  CUDAContext returned by the function CUDA().
513  *  If a symbol is not found, returns NL_FALSE from the
514  *  calling function.
515  */
516 #define find_cusparse_func(name)                              \
517     if(                                                       \
518         (                                                     \
519             CUDA()->name =                                    \
520             (FUNPTR_##name)nlFindFunction(                    \
521 		   CUDA()->DLL_cusparse,#name                 \
522 	    )					              \
523         ) == NULL                                             \
524     ) {                                                       \
525         nlError("nlInitExtension_CUDA : function not found", #name);	\
526         return NL_FALSE;                                      \
527     }
528 
529 
530 /**
531  * \brief Finds and initializes a function pointer to
532  *  one of the functions in CUSPARSE.
533  * \details Function pointers are stored into the
534  *  CUDAContext returned by the function CUDA().
535  *  If a symbol is not found, returns NL_FALSE from the
536  *  calling function.
537  */
538 #define find_cusparse_func_V10(name)                          \
539     if(                                                       \
540         (                                                     \
541             CUDA()->name =                                    \
542             (FUNPTR_##name)nlFindFunction(                    \
543 		   CUDA()->DLL_cusparse,#name                 \
544 	    )					              \
545         ) == NULL                                             \
546     ) {                                                       \
547         nlWarning("nlInitExtension_CUDA : optional function not found", #name);	\
548 	CUDA()->has_cusparse_V10 = NL_FALSE;                  \
549     }
550 
551 
552 /**********************************************************/
553 
554 /**
555  * \brief The structure that stores the handle to
556  *  the CUDA shared object, the function pointers
557  *  and the detected version.
558  */
559 typedef struct {
560     NLdll DLL_cudart;
561 
562     FUNPTR_cudaDriverGetVersion cudaDriverGetVersion;
563     FUNPTR_cudaRuntimeGetVersion cudaRuntimeGetVersion;
564     FUNPTR_cudaGetDeviceCount cudaGetDeviceCount;
565     FUNPTR_cudaGetDeviceProperties cudaGetDeviceProperties;
566     FUNPTR_cudaDeviceGetAttribute cudaDeviceGetAttribute;
567     FUNPTR_cudaDeviceReset cudaDeviceReset;
568     FUNPTR_cudaMalloc cudaMalloc;
569     FUNPTR_cudaFree cudaFree;
570     FUNPTR_cudaMemcpy cudaMemcpy;
571 
572     NLdll DLL_cublas;
573     cublasHandle_t HNDL_cublas;
574     FUNPTR_cublasCreate cublasCreate;
575     FUNPTR_cublasDestroy cublasDestroy;
576     FUNPTR_cublasGetVersion cublasGetVersion;
577     FUNPTR_cublasDdot cublasDdot;
578     FUNPTR_cublasDcopy cublasDcopy;
579     FUNPTR_cublasDaxpy cublasDaxpy;
580     FUNPTR_cublasDscal cublasDscal;
581     FUNPTR_cublasDnrm2 cublasDnrm2;
582     FUNPTR_cublasDdgmm cublasDdgmm;
583     FUNPTR_cublasDgemv cublasDgemv;
584     FUNPTR_cublasDtpsv cublasDtpsv;
585 
586     NLdll DLL_cusparse;
587     cusparseHandle_t HNDL_cusparse;
588     FUNPTR_cusparseCreate cusparseCreate;
589     FUNPTR_cusparseDestroy cusparseDestroy;
590     FUNPTR_cusparseGetVersion cusparseGetVersion;
591     FUNPTR_cusparseCreateMatDescr cusparseCreateMatDescr;
592     FUNPTR_cusparseDestroyMatDescr cusparseDestroyMatDescr;
593     FUNPTR_cusparseSetMatType cusparseSetMatType;
594     FUNPTR_cusparseSetMatIndexBase cusparseSetMatIndexBase;
595     FUNPTR_cusparseDcsrmv cusparseDcsrmv;
596 
597     /* cusparse V10 API */
598     NLboolean has_cusparse_V10;
599     FUNPTR_cusparseCreateDnVec cusparseCreateDnVec;
600     FUNPTR_cusparseDestroyDnVec cusparseDestroyDnVec;
601     FUNPTR_cusparseDnVecSetValues cusparseDnVecSetValues;
602     FUNPTR_cusparseCreateCsr cusparseCreateCsr;
603     FUNPTR_cusparseDestroySpMat cusparseDestroySpMat;
604     FUNPTR_cusparseSpMV cusparseSpMV;
605     FUNPTR_cusparseSpMV_bufferSize cusparseSpMV_bufferSize;
606 
607     int devID;
608 } CUDAContext;
609 
610 /**
611  * \brief Gets the CUDA context.
612  * \return a pointer to the CUDA context
613  */
CUDA()614 static CUDAContext* CUDA() {
615     static CUDAContext context;
616     static NLboolean init = NL_FALSE;
617     if(!init) {
618         init = NL_TRUE;
619         memset(&context, 0, sizeof(context));
620     }
621     return &context;
622 }
623 
nlExtensionIsInitialized_CUDA()624 NLboolean nlExtensionIsInitialized_CUDA() {
625     if(
626 	CUDA()->DLL_cudart == NULL ||
627 	CUDA()->cudaDriverGetVersion == NULL ||
628 	CUDA()->cudaRuntimeGetVersion == NULL ||
629 	CUDA()->cudaGetDeviceCount == NULL ||
630 	CUDA()->cudaGetDeviceProperties == NULL ||
631 	CUDA()->cudaDeviceGetAttribute == NULL ||
632 	CUDA()->cudaDeviceReset == NULL ||
633 	CUDA()->cudaMalloc == NULL ||
634 	CUDA()->cudaFree == NULL ||
635 	CUDA()->cudaMemcpy == NULL ||
636 
637 	CUDA()->DLL_cublas == NULL ||
638 	CUDA()->HNDL_cublas == NULL ||
639 	CUDA()->cublasCreate == NULL ||
640 	CUDA()->cublasDestroy == NULL ||
641 	CUDA()->cublasGetVersion == NULL ||
642 	CUDA()->cublasDdot == NULL ||
643 	CUDA()->cublasDcopy == NULL ||
644 	CUDA()->cublasDaxpy == NULL ||
645 	CUDA()->cublasDscal == NULL ||
646 	CUDA()->cublasDnrm2 == NULL ||
647 	CUDA()->cublasDdgmm == NULL ||
648 
649 	CUDA()->DLL_cusparse == NULL ||
650 	CUDA()->HNDL_cusparse == NULL ||
651 	CUDA()->cusparseCreate == NULL ||
652 	CUDA()->cusparseDestroy == NULL ||
653 	CUDA()->cusparseGetVersion == NULL ||
654 	CUDA()->cusparseCreateMatDescr == NULL ||
655 	CUDA()->cusparseDestroyMatDescr == NULL ||
656 	CUDA()->cusparseSetMatType == NULL ||
657 	CUDA()->cusparseSetMatIndexBase == NULL ||
658 	CUDA()->cusparseDcsrmv == NULL
659     ) {
660         return NL_FALSE;
661     }
662     return NL_TRUE;
663 }
664 
/**
 * \brief Terminates the CUDA extension.
 * \details Destroys the cusparse and cublas handles, resets the
 *  device, unloads the three DLLs (in reverse order of their
 *  initialization) and zeroes the whole context, so that
 *  nlExtensionIsInitialized_CUDA() returns NL_FALSE afterwards.
 *  No-op if the extension was never (fully) initialized.
 */
static void nlTerminateExtension_CUDA(void) {
    if(!nlExtensionIsInitialized_CUDA()) {
	return;
    }

    CUDA()->cusparseDestroy(CUDA()->HNDL_cusparse);
    nlCloseDLL(CUDA()->DLL_cusparse);

    CUDA()->cublasDestroy(CUDA()->HNDL_cublas);
    nlCloseDLL(CUDA()->DLL_cublas);

    CUDA()->cudaDeviceReset();
    nlCloseDLL(CUDA()->DLL_cudart);

    /* Zero everything so a later re-initialization starts clean. */
    memset(CUDA(), 0, sizeof(CUDAContext));
}
681 
682 /**************************************************************************/
683 
684 /**
685  * \brief Finds the number of cores from the major and minor versions of the
686  *  shader model.
687  * \details Highly inspired by the helpers library in CUDA examples,
688  *  see https://github.com/NVIDIA/cuda-samples/blob/master/Common/helper_cuda.h
689  */
ConvertSMVer2Cores(int major,int minor)690 static int ConvertSMVer2Cores(int major, int minor) {
691     /* Defines for GPU Architecture types (using the SM version
692        to determine the # of cores per SM */
693     typedef struct {
694         int SM; /* 0xMm (hexadecimal notation),
695                     M = SM Major version,
696                     and m = SM minor version */
697         int Cores;
698     } sSMtoCores;
699 
700     sSMtoCores nGpuArchCoresPerSM[] = {
701         { 0x10,  8 }, /* Tesla Generation   (SM 1.0) G80 class    */
702         { 0x11,  8 }, /* Tesla Generation   (SM 1.1) G8x class    */
703         { 0x12,  8 }, /* Tesla Generation   (SM 1.2) G9x class    */
704         { 0x13,  8 }, /* Tesla Generation   (SM 1.3) GT200 class  */
705         { 0x20, 32 }, /* Fermi Generation   (SM 2.0) GF100 class  */
706         { 0x21, 48 }, /* Fermi Generation   (SM 2.1) GF10x class  */
707         { 0x30, 192}, /* Kepler Generation  (SM 3.0) GK10x class  */
708         { 0x35, 192}, /* Kepler Generation  (SM 3.5) GK11x class  */
709 	{ 0x50, 128}, /* Maxwell Generation (SM 5.0) GM10x class
710                              (yes, #cores smaller than with 3.x)  */
711 	{ 0x52, 128}, /* Maxwell Generation (SM 5.2) GM20x class  */
712 	{ 0x60, 64 }, /* Pascal Generation  (SM 6.0) GP100,GP102
713               (yes, 64, but GP100 has superfast double precision) */
714 	{ 0x61, 128}, /* Pascal Generation  (SM 6.1) GP104 class
715                                (but FP64 runs as 1/32 FP32 speed) */
716 	{ 0x70, 64 }, /* yes, nb cores decreased in SM 7.x        */
717 	{ 0x72, 64 },
718 	{ 0x75, 64 },
719         {   -1, -1 }
720     };
721     int index = 0;
722     if(major == 0 && minor == 0) {
723 	return 0;
724     }
725     while (nGpuArchCoresPerSM[index].SM != -1) {
726         if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
727             return nGpuArchCoresPerSM[index].Cores;
728         }
729         index++;
730     }
731     /* If we don't find the values, we use a default value (64) */
732     nl_printf(
733       "MapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n",
734        major, minor, 64
735     );
736     return 64;
737 }
738 
739 /**
740  * \brief Gets the double-precision performance for a device.
741  * \param[in] device the device
742  * \return the peak GFlops double-precision performance of the device,
743  *  or 0.0 if device's compute mode is prohibited.
744  */
getDeviceDoublePrecisionGFlops(int device)745 static double getDeviceDoublePrecisionGFlops(int device) {
746     int compute_mode;
747     int compute_capability_major;
748     int compute_capability_minor;
749     int cores_per_multiprocessor;
750     int multiprocessor_count;
751     int double_precision_perf_ratio;
752     int clock_rate;
753     double result = 0.0;
754 
755     CUDA()->cudaDeviceGetAttribute(
756 	&compute_mode,
757 	CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,
758 	device
759     );
760 
761     if(compute_mode == cudaComputeModeProhibited) {
762 	return 0.0;
763     }
764 
765     CUDA()->cudaDeviceGetAttribute(
766 	&compute_capability_major,
767 	CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
768 	device
769     );
770 
771     CUDA()->cudaDeviceGetAttribute(
772 	&compute_capability_minor,
773 	CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
774 	device
775     );
776 
777     CUDA()->cudaDeviceGetAttribute(
778 	&multiprocessor_count,
779 	CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
780 	device
781     );
782 
783     CUDA()->cudaDeviceGetAttribute(
784 	&double_precision_perf_ratio,
785 	CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO,
786 	device
787     );
788 
789     CUDA()->cudaDeviceGetAttribute(
790 	&clock_rate,
791 	CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
792 	device
793     );
794 
795     cores_per_multiprocessor = ConvertSMVer2Cores(
796 	compute_capability_major, compute_capability_minor
797     );
798 
799     /*
800      * I need this 2.0 factor to match the specs,
801      * does it mean a CUDA core does two FPs per cycle ?
802      * They probably count FMAs as 2 ops for the "peak perf"
803      * stat...
804      */
805     result = 2.0 *
806 	((double)(clock_rate) / (1024.0 * 1024.0)) *
807 	(double)(multiprocessor_count) *
808 	(double)(cores_per_multiprocessor) /
809 	(double)(double_precision_perf_ratio) ;
810 
811     return result;
812 }
813 
814 /**
815  * \brief Gets the device ID with the maximum double precision
816  *  performance.
817  * \return the ID of the fastest device or -1 is no device is
818  *  available.
819  */
getBestDeviceID()820 static int getBestDeviceID() {
821     int result = -1;
822     double fastest_GFlops = 0.0;
823     int device_count;
824     int retval = CUDA()->cudaGetDeviceCount(&device_count);
825     int device;
826     double device_GFlops;
827     int driver_ver;
828     int runtime_ver;
829     if(retval == 35) {
830 	nl_printf("Error: Driver/CUDA versions mismatch\n");
831 	retval = CUDA()->cudaDriverGetVersion(&driver_ver);
832 	nl_printf("cudaDriverGetVersion()   retval=%d\n",retval);
833 	retval = CUDA()->cudaRuntimeGetVersion(&runtime_ver);
834 	nl_printf("cudaRuntimeGetVersion()  retval=%d\n",retval);
835 
836 	nl_printf("  Driver  version=%d\n",driver_ver);
837 	nl_printf("  Runtime version=%d\n",driver_ver);
838 	return result;
839     }
840     for(device=0; device<device_count; ++device) {
841 	device_GFlops = getDeviceDoublePrecisionGFlops(device);
842 	if(device_GFlops > fastest_GFlops) {
843 	    fastest_GFlops = device_GFlops;
844 	    result = device;
845 	}
846     }
847     return result;
848 }
849 
850 /**************************************************************************/
851 
852 #ifdef NL_OS_UNIX
853 #  define LIBPREFIX "lib"
854 #  ifdef NL_OS_APPLE
855 #      define LIBEXTENSION ".dylib"
856 #  else
857 #      define LIBEXTENSION ".so"
858 #  endif
859 #else
860 #  define LIBPREFIX
861 #  define LIBEXTENSION ".dll"
862 #endif
863 
864 
/**
 * \brief Initializes the CUDA extension: dynamically loads cudart,
 *  cublas and cusparse, resolves the required entry points, selects
 *  the fastest device and creates the cublas/cusparse handles.
 * \return NL_TRUE on success, NL_FALSE otherwise.
 * \details The steps are strictly ordered: the cudart DLL must be
 *  loaded before any device query, and the device must be validated
 *  before cublas/cusparse are initialized.
 */
NLboolean nlInitExtension_CUDA(void) {
    int cublas_version;
    int cusparse_version;
    /* static: cudaDeviceProp is large; keep it off the stack. */
    static struct cudaDeviceProp deviceProp;
    int compute_capability_major;
    int compute_capability_minor;
    int multiprocessor_count;
    int max_shared_mem_per_block;
    int max_shared_mem_per_multiprocessor;
    int max_regs_per_block;
    int max_regs_per_multiprocessor;
    int warp_size;
    int double_precision_perf_ratio;

    NLenum flags = NL_LINK_LAZY | NL_LINK_GLOBAL;
    /* Suppress DLL-loading messages unless the context asks for verbosity. */
    if(nlCurrentContext == NULL || !nlCurrentContext->verbose) {
	flags |= NL_LINK_QUIET;
    }

    /* Idempotent: a second call is a no-op. */
    if(nlExtensionIsInitialized_CUDA()) {
	return NL_TRUE;
    }

    /* Step 1: load the CUDA runtime and resolve its entry points. */
    CUDA()->DLL_cudart = nlOpenDLL(
	LIBPREFIX "cudart" LIBEXTENSION, flags
    );

    find_cuda_func(cudaDriverGetVersion);
    find_cuda_func(cudaRuntimeGetVersion);
    find_cuda_func(cudaGetDeviceCount);
    find_cuda_func(cudaGetDeviceProperties);
    find_cuda_func(cudaDeviceGetAttribute);
    find_cuda_func(cudaDeviceReset);
    find_cuda_func(cudaMalloc);
    find_cuda_func(cudaFree);
    find_cuda_func(cudaMemcpy);

    /* Step 2: pick the device with the best double-precision perf. */
    CUDA()->devID = getBestDeviceID();

    /* cudaGetDeviceProperties() returns non-zero on error. */
    if(
       CUDA()->devID == -1 ||
       CUDA()->cudaGetDeviceProperties(&deviceProp, CUDA()->devID)
    ) {
	nl_fprintf(stderr,"OpenNL CUDA: could not find a CUDA device\n");
	return NL_FALSE;
    }

    nl_printf("OpenNL CUDA: Device ID = %d\n", CUDA()->devID);
    nl_printf("OpenNL CUDA: Device name=%s\n", deviceProp.name);

    /* Step 3: query and display device capabilities, and reject
       devices below compute capability 1.1. */
    {
	CUDA()->cudaDeviceGetAttribute(
	    &compute_capability_major,
	    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
	    CUDA()->devID
	);

	CUDA()->cudaDeviceGetAttribute(
	    &compute_capability_minor,
	    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
	    CUDA()->devID
	);

	CUDA()->cudaDeviceGetAttribute(
	    &multiprocessor_count,
	    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
	    CUDA()->devID
	);

	CUDA()->cudaDeviceGetAttribute(
	    &max_shared_mem_per_block,
	    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
	    CUDA()->devID
	);

	CUDA()->cudaDeviceGetAttribute(
	    &max_shared_mem_per_multiprocessor,
	    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
	    CUDA()->devID
	);

	CUDA()->cudaDeviceGetAttribute(
	    &max_regs_per_block,
	    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
	    CUDA()->devID
	);

	CUDA()->cudaDeviceGetAttribute(
	    &max_regs_per_multiprocessor,
	    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
	    CUDA()->devID
	);

	CUDA()->cudaDeviceGetAttribute(
	    &warp_size,
	    CU_DEVICE_ATTRIBUTE_WARP_SIZE,
	    CUDA()->devID
	);

	CUDA()->cudaDeviceGetAttribute(
	    &double_precision_perf_ratio,
	    CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO,
	    CUDA()->devID
	);

	nl_printf(
	    "OpenNL CUDA: Device has %d Multi-Processors, "
	    "%d cores per Multi-Processor, SM %d.%d compute capabilities\n",
	    multiprocessor_count,
	    ConvertSMVer2Cores(
		compute_capability_major, compute_capability_minor
	    ),
	    compute_capability_major, compute_capability_minor
	);

	nl_printf(
	    "OpenNL CUDA: %d kB shared mem. per block, %d per MP\n",
	    (int)(max_shared_mem_per_block / 1024),
	    (int)(max_shared_mem_per_multiprocessor / 1024)
	);

	nl_printf(
	    "OpenNL CUDA: %d regs. per block, %d per MP\n",
	    max_regs_per_block, max_regs_per_multiprocessor
	);

	nl_printf("OpenNL CUDA: warpsize = %d\n", warp_size);

	nl_printf(
	    "OpenNL CUDA: double precision perf ratio = %d\n",
	    double_precision_perf_ratio
	);
	nl_printf(
	    "OpenNL CUDA: theoretical peak double precision GFlops = %f\n",
	    getDeviceDoublePrecisionGFlops(CUDA()->devID)
	);
	/* Encode capability as major*16 + minor and compare to 1.1. */
	if (
	    (compute_capability_major * 0x10 + compute_capability_minor) < 0x11
	) {
	    nl_fprintf(
		stderr,
		"OpenNL CUDA requires a minimum CUDA compute 1.1 capability\n"
	    );
	    CUDA()->cudaDeviceReset();
	    return NL_FALSE;
	}
    }

    /* Step 4: load cublas and resolve its entry points. */
    CUDA()->DLL_cublas = nlOpenDLL(
	LIBPREFIX "cublas" LIBEXTENSION, flags
    );

    find_cublas_func(cublasCreate);
    find_cublas_func(cublasDestroy);
    find_cublas_func(cublasGetVersion);
    find_cublas_func(cublasDdot);
    find_cublas_func(cublasDaxpy);
    find_cublas_func(cublasDcopy);
    find_cublas_func(cublasDscal);
    find_cublas_func(cublasDnrm2);
    find_cublas_func(cublasDgemv);
    find_cublas_func(cublasDtpsv);
    find_cublas_func_v1(cublasDdgmm);


    if(CUDA()->cublasCreate(&CUDA()->HNDL_cublas)) {
	return NL_FALSE;
    }

    if(CUDA()->cublasGetVersion(CUDA()->HNDL_cublas, &cublas_version)) {
	return NL_FALSE;
    }
    nl_printf("OpenNL CUDA: cublas version = %d\n", cublas_version);

    /* Step 5: load cusparse and resolve its entry points. */
    CUDA()->DLL_cusparse = nlOpenDLL(
	LIBPREFIX "cusparse" LIBEXTENSION, flags
    );
    find_cusparse_func(cusparseCreate);
    find_cusparse_func(cusparseDestroy);
    find_cusparse_func(cusparseGetVersion);
    find_cusparse_func(cusparseCreateMatDescr);
    find_cusparse_func(cusparseDestroyMatDescr);
    find_cusparse_func(cusparseSetMatType);
    find_cusparse_func(cusparseSetMatIndexBase);
    find_cusparse_func(cusparseDcsrmv);

    /* V10 API (can be disabled through the NL_NO_CUSPARSE_V10 env. var.) */
    if(getenv("NL_NO_CUSPARSE_V10") == NULL) {
        nl_printf("OpenNL CUDA: Trying to initialize cusparse V10\n");
	CUDA()->has_cusparse_V10 = NL_TRUE;
	find_cusparse_func_V10(cusparseCreateDnVec);
	find_cusparse_func_V10(cusparseDestroyDnVec);
	find_cusparse_func_V10(cusparseDnVecSetValues);
	find_cusparse_func_V10(cusparseCreateCsr);
	find_cusparse_func_V10(cusparseDestroySpMat);
	find_cusparse_func_V10(cusparseSpMV);
	find_cusparse_func_V10(cusparseSpMV_bufferSize);
    } else {
	CUDA()->has_cusparse_V10 = NL_FALSE;
    }

    if(CUDA()->has_cusparse_V10) {
	nl_printf("OpenNL CUDA: using cusparse V10 API\n");
    } else {
	nl_printf("OpenNL CUDA: using cusparse V9 API\n");
    }

    if(CUDA()->cusparseCreate(&CUDA()->HNDL_cusparse)) {
	return NL_FALSE;
    }
    if(CUDA()->cusparseGetVersion(CUDA()->HNDL_cusparse, &cusparse_version)) {
	return NL_FALSE;
    }
    nl_printf("OpenNL CUDA: cusparse version = %d\n", cusparse_version);

    if(!nlExtensionIsInitialized_CUDA()) {
	return NL_FALSE;
    }

    /* Make sure GPU resources are released at program exit. */
    atexit(nlTerminateExtension_CUDA);
    return NL_TRUE;

}
1088 
nlCUDACheckImpl(int status,int line)1089 static void nlCUDACheckImpl(int status, int line) {
1090     if(status != 0) {
1091 	nl_fprintf(stderr,"nl_cuda.c:%d fatal error %d\n",line, status);
1092 	CUDA()->cudaDeviceReset();
1093 	exit(-1);
1094     }
1095 }
1096 
1097 #define nlCUDACheck(status) nlCUDACheckImpl(status, __LINE__)
1098 
1099 /**************************************************************************/
1100 
/**
 * Abstract matrix interface for a CRS matrix stored on the GPU.
 */
typedef struct {
    NLuint m;                         /* number of rows */
    NLuint n;                         /* number of columns */
    NLenum type;                      /* NL_MATRIX_OTHER for this impl. */
    NLDestroyMatrixFunc destroy_func; /* virtual destructor */
    NLMultMatrixVectorFunc mult_func; /* virtual y <- M x */

    /* cusparse V9 implementation */
    cusparseMatDescr_t descr;         /* legacy matrix descriptor */

    /* cusparse V10 implementation */
    cusparseSpMatDescr_t descr_V10;   /* generic sparse matrix descriptor */
    cusparseDnVecDescr_t X_V10;       /* dense-vector wrapper for x (lazy) */
    cusparseDnVecDescr_t Y_V10;       /* dense-vector wrapper for y (lazy) */
    NLboolean work_init_V10;          /* NL_TRUE once work buffer sized */
    void* work_V10;                   /* SpMV scratch buffer (device) */
    size_t work_size_V10;             /* size of work_V10 in bytes */

    NLuint_big nnz;                   /* number of nonzero entries */
    int* colind;                      /* device ptr: column indices */
    int* rowptr;                      /* device ptr: row start offsets */
    double* val;                      /* device ptr: nonzero values */
} NLCUDASparseMatrix;
1127 
1128 
1129 /**
1130  * \brief Deallocates just the CRS part of a CUDA matrix.
1131  */
nlCRSMatrixCUDADestroyCRS(NLCUDASparseMatrix * Mcuda)1132 static void nlCRSMatrixCUDADestroyCRS(NLCUDASparseMatrix* Mcuda) {
1133     if(!nlExtensionIsInitialized_CUDA()) {
1134 	return;
1135     }
1136     if(Mcuda->colind != NULL) {
1137 	nlCUDACheck(CUDA()->cudaFree(Mcuda->colind));
1138 	Mcuda->colind = NULL;
1139     }
1140     if(Mcuda->rowptr != NULL) {
1141 	nlCUDACheck(CUDA()->cudaFree(Mcuda->rowptr));
1142 	Mcuda->rowptr = NULL;
1143     }
1144     if(Mcuda->val != NULL) {
1145 	nlCUDACheck(CUDA()->cudaFree(Mcuda->val));
1146 	Mcuda->val = NULL;
1147     }
1148 }
1149 
/**
 * \brief Full destructor for a GPU CRS matrix: releases the CRS
 *  arrays, the cusparse descriptors (V9 or V10 flavor) and clears
 *  the structure.
 */
static void nlCRSMatrixCUDADestroy(NLCUDASparseMatrix* Mcuda) {
    if(!nlExtensionIsInitialized_CUDA()) {
	return;
    }
    nlCRSMatrixCUDADestroyCRS(Mcuda);
    if(!CUDA()->has_cusparse_V10) {
	/* V9 path: only the legacy descriptor to release. */
	nlCUDACheck(CUDA()->cusparseDestroyMatDescr(Mcuda->descr));
    } else {
	/* V10 path: descriptor plus lazily-created vectors and buffer. */
	nlCUDACheck(CUDA()->cusparseDestroySpMat(Mcuda->descr_V10));
	if(Mcuda->X_V10 != NULL) {
	    nlCUDACheck(CUDA()->cusparseDestroyDnVec(Mcuda->X_V10));
	}
	if(Mcuda->Y_V10 != NULL) {
	    nlCUDACheck(CUDA()->cusparseDestroyDnVec(Mcuda->Y_V10));
	}
	if(Mcuda->work_V10 != NULL) {
	    nlCUDACheck(CUDA()->cudaFree(Mcuda->work_V10));
	}
    }
    /* Scrub the struct so a stale pointer cannot be reused. */
    memset(Mcuda, 0, sizeof(*Mcuda));
}
1171 
/**
 * \brief Computes y <- M x on the GPU (matrix x vector product).
 * \param[in] Mcuda the GPU CRS matrix
 * \param[in] x device pointer, dimension Mcuda->n
 * \param[out] y device pointer, dimension Mcuda->m
 * \details Uses the cusparse V10 generic SpMV API when available,
 *  else the legacy Dcsrmv. The V10 dense-vector descriptors and the
 *  scratch buffer are created lazily on first call and reused after.
 */
static void nlCRSMatrixCUDAMult(
    NLCUDASparseMatrix* Mcuda, const double* x, double* y
) {
    const double one = 1.0;
    const double zero = 0.0;
    if(CUDA()->has_cusparse_V10) {
	/* Wrap (or re-point) x in a dense-vector descriptor. */
	if(Mcuda->X_V10 == NULL) {
	    nlCUDACheck(
		CUDA()->cusparseCreateDnVec(
		    &Mcuda->X_V10, Mcuda->n, (void*)x, CUDA_R_64F
		)
	    );
	} else {
	    nlCUDACheck(
		CUDA()->cusparseDnVecSetValues(Mcuda->X_V10, (void*)x)
	    );
	}
	/* Same for y. */
	if(Mcuda->Y_V10 == NULL) {
	    nlCUDACheck(
		CUDA()->cusparseCreateDnVec(
		    &Mcuda->Y_V10, Mcuda->m, y, CUDA_R_64F
		)
	    );
	} else {
	    nlCUDACheck(
		CUDA()->cusparseDnVecSetValues(Mcuda->Y_V10, y)
	    );
	}
	/* First call: size and allocate the SpMV scratch buffer. */
	if(!Mcuda->work_init_V10) {
	    nlCUDACheck(
		CUDA()->cusparseSpMV_bufferSize(
		    CUDA()->HNDL_cusparse,
		    CUSPARSE_OPERATION_NON_TRANSPOSE,
		    &one,
		    Mcuda->descr_V10,
		    Mcuda->X_V10,
		    &zero,
		    Mcuda->Y_V10,
		    CUDA_R_64F,
		    CUSPARSE_CSRMV_ALG2, /* CUSPARSE_MV_ALG_DEFAULT, HERE */
		    &Mcuda->work_size_V10
		)
	    );
	    /* work_size_V10 is a size_t: cast so it matches %d
	       (passing size_t through %d is undefined behavior). */
	    nl_printf("Buffer size = %d\n", (int)Mcuda->work_size_V10);
	    if(Mcuda->work_size_V10 != 0) {
		nlCUDACheck(
		   CUDA()->cudaMalloc(&Mcuda->work_V10,Mcuda->work_size_V10)
	        );
		/* Cast the pointer to an integer of the right width
		   for %lx (void* through %lx is undefined behavior). */
		nl_printf(
		    "work = 0x%lx\n", (unsigned long)(size_t)Mcuda->work_V10
		);
	    }
	    Mcuda->work_init_V10 = NL_TRUE;
	}
	nlCUDACheck(
	    CUDA()->cusparseSpMV(
		CUDA()->HNDL_cusparse,
		CUSPARSE_OPERATION_NON_TRANSPOSE,
		&one,
		Mcuda->descr_V10,
		Mcuda->X_V10,
		&zero,
		Mcuda->Y_V10,
		CUDA_R_64F,
		CUSPARSE_CSRMV_ALG2, /* CUSPARSE_MV_ALG_DEFAULT, HERE */
		Mcuda->work_V10
	    )
	);
    } else {
	/* Legacy (V9) path. */
	nlCUDACheck(
	    CUDA()->cusparseDcsrmv(
		CUDA()->HNDL_cusparse,
		CUSPARSE_OPERATION_NON_TRANSPOSE,
		(int)Mcuda->m,
		(int)Mcuda->n,
		(int)Mcuda->nnz,
		&one,
		Mcuda->descr,
		Mcuda->val,
		Mcuda->rowptr,
		Mcuda->colind,
		x,
		&zero,
		y
	    )
	);
    }
    /* One multiply-add per nonzero; widen before doubling so the
       product cannot overflow a 32-bit nnz. */
    nlCUDABlas()->flops += 2*(NLulong)(Mcuda->nnz);
}
1259 
/**
 * \brief Creates a GPU copy of a CRS matrix.
 * \param[in] M_in the input matrix, must be of type NL_MATRIX_CRS
 * \return a new NLMatrix with device-resident CRS arrays and the
 *  appropriate cusparse (V9 or V10) descriptor.
 */
NLMatrix nlCUDAMatrixNewFromCRSMatrix(NLMatrix M_in) {
    NLCUDASparseMatrix* Mcuda = NL_NEW(NLCUDASparseMatrix);
    NLCRSMatrix* M = (NLCRSMatrix*)(M_in);
    size_t colind_sz, rowptr_sz, val_sz;
    nl_assert(M_in->type == NL_MATRIX_CRS);
    Mcuda->m = M->m;
    Mcuda->n = M->n;
    /*
     * nnz is an NLuint_big: keep the full width. The previous
     * (NLuint) cast truncated matrices with more than 2^32 nonzeros
     * in GARGANTUA (64-bit indices) mode.
     */
    Mcuda->nnz = nlCRSMatrixNNZ(M);

    colind_sz = (size_t)Mcuda->nnz*sizeof(NLuint);
    rowptr_sz = (size_t)(Mcuda->m+1)*sizeof(NLuint_big);
    val_sz    = (size_t)Mcuda->nnz*sizeof(double);

    /* Allocate device arrays and upload the three CRS components. */
    nlCUDACheck(CUDA()->cudaMalloc((void**)&Mcuda->colind,colind_sz));
    nlCUDACheck(CUDA()->cudaMalloc((void**)&Mcuda->rowptr,rowptr_sz));
    nlCUDACheck(CUDA()->cudaMalloc((void**)&Mcuda->val,val_sz));
    nlCUDACheck(CUDA()->cudaMemcpy(
       Mcuda->colind, M->colind, colind_sz, cudaMemcpyHostToDevice)
    );
    nlCUDACheck(CUDA()->cudaMemcpy(
       Mcuda->rowptr, M->rowptr, rowptr_sz, cudaMemcpyHostToDevice)
    );
    nlCUDACheck(CUDA()->cudaMemcpy(
       Mcuda->val, M->val, val_sz, cudaMemcpyHostToDevice)
    );
    Mcuda->type=NL_MATRIX_OTHER;
    Mcuda->destroy_func=(NLDestroyMatrixFunc)nlCRSMatrixCUDADestroy;
    Mcuda->mult_func=(NLMultMatrixVectorFunc)nlCRSMatrixCUDAMult;

#ifdef GARGANTUA
    /* 64-bit row pointers require the cusparse generic (V10) API. */
    if(!CUDA()->has_cusparse_V10) {
	nlError(
	    "OpenNL CUDA",
	    "FATAL ERROR, GARGANTUA mode needs cusparse V10\n"
	);
	exit(-1);
    }
#  define ROWPTR_TYPE CUSPARSE_INDEX_64I
#else
#  define ROWPTR_TYPE CUSPARSE_INDEX_32I
#endif

    if(CUDA()->has_cusparse_V10) {
	nlCUDACheck(
	    CUDA()->cusparseCreateCsr(
		&Mcuda->descr_V10,
		Mcuda->m,
		Mcuda->n,
		(int64_t)Mcuda->nnz,
		Mcuda->rowptr,
		Mcuda->colind,
		Mcuda->val,
		ROWPTR_TYPE,
		CUSPARSE_INDEX_32I,
		CUSPARSE_INDEX_BASE_ZERO,
		CUDA_R_64F
	    )
	);
    } else {
	nlCUDACheck(CUDA()->cusparseCreateMatDescr(&Mcuda->descr));
	if(M->symmetric_storage) {
	    nlCUDACheck(CUDA()->cusparseSetMatType(
	       Mcuda->descr, CUSPARSE_MATRIX_TYPE_SYMMETRIC)
	    );
	} else {
	    nlCUDACheck(CUDA()->cusparseSetMatType(
	       Mcuda->descr, CUSPARSE_MATRIX_TYPE_GENERAL)
	    );
	}
	nlCUDACheck(CUDA()->cusparseSetMatIndexBase(
	   Mcuda->descr, CUSPARSE_INDEX_BASE_ZERO)
	);
    }

    return (NLMatrix)Mcuda;
}
1336 
1337 /**************************************************************************/
1338 
/**
 * Abstract matrix interface for a diagonal matrix stored on the GPU.
 */
typedef struct {
    NLuint m;                         /* number of rows (== n) */
    NLuint n;                         /* number of columns */
    NLenum type;                      /* NL_MATRIX_OTHER for this impl. */
    NLDestroyMatrixFunc destroy_func; /* virtual destructor */
    NLMultMatrixVectorFunc mult_func; /* virtual y <- diag(val) x */
    double* val;                      /* device ptr: diagonal coefficients */
} NLDiagonalMatrixCUDA;
1350 
/**
 * \brief Destructor for a GPU diagonal matrix: frees the device
 *  array of diagonal coefficients and clears the structure.
 */
static void nlDiagonalMatrixCUDADestroy(NLDiagonalMatrixCUDA* M) {
    if(!nlExtensionIsInitialized_CUDA()) {
	return;
    }
    nlCUDACheck(CUDA()->cudaFree(M->val));
    /* Scrub so dangling uses are caught early. */
    memset(M, 0, sizeof(*M));
}
1358 
/**
 * \brief Computes y <- diag(Mcuda->val) * x on the GPU.
 * \param[in] Mcuda the diagonal matrix (device-resident coefficients)
 * \param[in] x device pointer, dimension Mcuda->n
 * \param[out] y device pointer, dimension Mcuda->n
 */
static void nlDiagonalMatrixCUDAMult(
    NLDiagonalMatrixCUDA* Mcuda, const double* x, double* y
) {
    int N = (int)Mcuda->n;
    /*
     * vector x vector component-wise product implemented
     * using diagonal matrix x matrix function:
     * x is viewed as an N x 1 matrix, multiplied on the left
     * (CUBLAS_SIDE_LEFT) by diag(Mcuda->val), result in y.
     */
    nlCUDACheck(CUDA()->cublasDdgmm(
	CUDA()->HNDL_cublas, CUBLAS_SIDE_LEFT,
	N, 1,          /* dimensions of the "matrix" x            */
	x, N,          /* A and its leading dimension             */
	Mcuda->val, 1, /* diagonal vector and its stride          */
	y, N           /* C (output) and its leading dimension    */
    ));
    nlCUDABlas()->flops += (NLulong)N;
}
1376 
nlDiagonalMatrixCUDANew(const double * diag,NLuint n)1377 static NLMatrix nlDiagonalMatrixCUDANew(const double* diag, NLuint n) {
1378     NLDiagonalMatrixCUDA* Mcuda = NL_NEW(NLDiagonalMatrixCUDA);
1379     Mcuda->m = n;
1380     Mcuda->n = n;
1381     Mcuda->type = NL_MATRIX_OTHER;
1382     nlCUDACheck(CUDA()->cudaMalloc(
1383        (void**)&Mcuda->val, n*sizeof(double))
1384     );
1385     nlCUDACheck(CUDA()->cudaMemcpy(
1386        Mcuda->val, diag, n*sizeof(double), cudaMemcpyHostToDevice)
1387     );
1388     Mcuda->destroy_func=(NLDestroyMatrixFunc)nlDiagonalMatrixCUDADestroy;
1389     Mcuda->mult_func=(NLMultMatrixVectorFunc)nlDiagonalMatrixCUDAMult;
1390     return (NLMatrix)Mcuda;
1391 }
1392 
/**
 * \brief Builds a Jacobi (inverse-diagonal) preconditioner on the GPU
 *  from a host CRS matrix.
 * \param[in] M_in the matrix, must be of type NL_MATRIX_CRS
 * \return a GPU diagonal matrix holding 1/M[i][i]
 *  (1.0 where the diagonal entry is absent or zero).
 */
NLMatrix nlCUDAJacobiPreconditionerNewFromCRSMatrix(NLMatrix M_in) {
    NLCRSMatrix* M = (NLCRSMatrix*)(M_in);
    NLuint N = M_in->n;
    NLuint i;
    NLuint_big jj;
    double* inv_diag = NULL;
    NLMatrix result = NULL;
    nl_assert(M_in->type == NL_MATRIX_CRS);
    inv_diag = NL_NEW_ARRAY(double,N);
    /* Single pass: find each diagonal entry, then invert it. */
    for(i=0; i<N; ++i) {
	double d = 0.0;
	for(jj=M->rowptr[i]; jj<M->rowptr[i+1]; ++jj) {
	    if(M->colind[jj] == i) {
		d = M->val[jj];
	    }
	}
	inv_diag[i] = ((d == 0.0) ? 1.0 : 1.0 / d);
    }
    result = nlDiagonalMatrixCUDANew(inv_diag, N);
    NL_DELETE_ARRAY(inv_diag);
    return result;
}
1416 
1417 /**************************************************************************/
1418 
/**
 * \brief Allocates host or device memory and tracks RAM usage
 *  statistics in the BLAS abstraction.
 * \param[in] blas the BLAS abstraction
 * \param[in] type NL_HOST_MEMORY or NL_DEVICE_MEMORY
 * \param[in] size number of bytes to allocate
 * \return a pointer to the allocated memory
 */
static void* cuda_blas_malloc(
    NLBlas_t blas, NLmemoryType type, size_t size
) {
    void* result = NULL;
    /* Track current and peak usage per memory type. */
    blas->used_ram[type] += (NLulong)size;
    if(blas->used_ram[type] > blas->max_used_ram[type]) {
	blas->max_used_ram[type] = blas->used_ram[type];
    }
    if(type != NL_HOST_MEMORY) {
	nlCUDACheck(CUDA()->cudaMalloc(&result,size));
    } else {
	result = malloc(size);
    }
    return result;
}
1434 
/**
 * \brief Frees memory obtained from cuda_blas_malloc() and updates
 *  the usage statistics.
 * \param[in] blas the BLAS abstraction
 * \param[in] type NL_HOST_MEMORY or NL_DEVICE_MEMORY
 * \param[in] size the size that was passed at allocation time
 * \param[in] ptr the pointer to free
 */
static void cuda_blas_free(
    NLBlas_t blas, NLmemoryType type, size_t size, void* ptr
) {
    blas->used_ram[type] -= (NLulong)size;
    if(type != NL_HOST_MEMORY) {
	nlCUDACheck(CUDA()->cudaFree(ptr));
    } else {
	free(ptr);
    }
}
1445 
/**
 * \brief Copies a memory block between any combination of host and
 *  device memory.
 * \param[in] blas the BLAS abstraction (unused here)
 * \param[out] to destination pointer, in \p to_type memory
 * \param[in] to_type NL_HOST_MEMORY or NL_DEVICE_MEMORY
 * \param[in] from source pointer, in \p from_type memory
 * \param[in] from_type NL_HOST_MEMORY or NL_DEVICE_MEMORY
 * \param[in] size number of bytes to copy
 */
static void cuda_blas_memcpy(
    NLBlas_t blas,
    void* to, NLmemoryType to_type,
    void* from, NLmemoryType from_type,
    size_t size
) {
    enum cudaMemcpyKind kind;
    nl_arg_used(blas);
    /* Map the (from,to) memory-type pair onto the cudaMemcpy kind. */
    if(from_type == NL_HOST_MEMORY) {
	kind = (to_type == NL_HOST_MEMORY)
	    ? cudaMemcpyHostToHost
	    : cudaMemcpyHostToDevice;
    } else {
	kind = (to_type == NL_HOST_MEMORY)
	    ? cudaMemcpyDeviceToHost
	    : cudaMemcpyDeviceToDevice;
    }
    nlCUDACheck(CUDA()->cudaMemcpy(to, from, size, kind));
}
1469 
/**
 * \brief BLAS DCOPY on the GPU: y <- x.
 * \details Thin forwarding wrapper around cublasDcopy; both pointers
 *  are device pointers.
 */
static void cuda_blas_dcopy(
    NLBlas_t blas, int n, const double *x, int incx, double *y, int incy
) {
    nl_arg_used(blas);
    CUDA()->cublasDcopy(
	CUDA()->HNDL_cublas, n, x, incx, y, incy
    );
}
1476 
/**
 * \brief BLAS DDOT on the GPU: returns x . y
 * \details Result is brought back to the host by cublasDdot.
 *  Counts 2n flops (n multiplies + n adds).
 */
static double cuda_blas_ddot(
    NLBlas_t blas, int n, const double *x, int incx, const double *y, int incy
) {
    double dot = 0.0;
    CUDA()->cublasDdot(
	CUDA()->HNDL_cublas, n, x, incx, y, incy, &dot
    );
    blas->flops += (NLulong)(2*n);
    return dot;
}
1485 
/**
 * \brief BLAS DNRM2 on the GPU: returns ||x||_2.
 * \details Counts 2n flops, same accounting as the dot product.
 */
static double cuda_blas_dnrm2(
    NLBlas_t blas, int n, const double *x, int incx
) {
    double nrm = 0.0;
    CUDA()->cublasDnrm2(
	CUDA()->HNDL_cublas, n, x, incx, &nrm
    );
    blas->flops += (NLulong)(2*n);
    return nrm;
}
1494 
/**
 * \brief BLAS DAXPY on the GPU: y <- a*x + y.
 * \details The scalar is passed by address (host pointer mode).
 */
static void cuda_blas_daxpy(
    NLBlas_t blas, int n,
    double a, const double *x, int incx, double *y, int incy
) {
    CUDA()->cublasDaxpy(
	CUDA()->HNDL_cublas, n, &a, x, incx, y, incy
    );
    blas->flops += (NLulong)(2*n);
}
1502 
/**
 * \brief BLAS DSCAL on the GPU: x <- a*x.
 * \details One multiply per component, hence n flops.
 */
static void cuda_blas_dscal(
    NLBlas_t blas, int n, double a, double *x, int incx
) {
    CUDA()->cublasDscal(
	CUDA()->HNDL_cublas, n, &a, x, incx
    );
    blas->flops += (NLulong)n;
}
1509 
1510 
/**
 * \brief BLAS DGEMV on the GPU: y <- alpha*op(A)*x + beta*y.
 * \details The MatrixTranspose enum is cast directly to
 *  cublasOperation_t (values are assumed to match — this cast was
 *  already present in the original wrapper).
 */
static void cuda_blas_dgemv(
    NLBlas_t blas, MatrixTranspose trans, int m, int n, double alpha,
    const double *A, int ldA, const double *x, int incx,
    double beta, double *y, int incy
) {
    nl_arg_used(blas);
    /* TODO: update FLOPS */
    CUDA()->cublasDgemv(
	CUDA()->HNDL_cublas,
	(cublasOperation_t)trans,
	m, n,
	&alpha, A, ldA,
	x, incx,
	&beta, y, incy
    );
}
1523 
/**
 * \brief BLAS DTPSV on the GPU: solves op(A)*x = b for a packed
 *  triangular matrix, in place in x.
 * \details The OpenNL enums are cast directly to their cublas
 *  counterparts (values are assumed to match — these casts were
 *  already present in the original wrapper).
 */
static void cuda_blas_dtpsv(
    NLBlas_t blas, MatrixTriangle uplo, MatrixTranspose trans,
    MatrixUnitTriangular diag, int n, const double *AP,
    double *x, int incx
) {
    cublasFillMode_t cu_uplo = (cublasFillMode_t)uplo;
    cublasOperation_t cu_trans = (cublasOperation_t)trans;
    cublasDiagType_t cu_diag = (cublasDiagType_t)diag;
    nl_arg_used(blas);
    /* TODO: update FLOPS */
    CUDA()->cublasDtpsv(
	CUDA()->HNDL_cublas,
	cu_uplo, cu_trans, cu_diag,
	n, AP, x, incx
    );
}
1539 
1540 
nlCUDABlas()1541 NLBlas_t nlCUDABlas() {
1542     static NLboolean initialized = NL_FALSE;
1543     static struct NLBlas blas;
1544     if(!initialized) {
1545 	memset(&blas, 0, sizeof(blas));
1546 	blas.has_unified_memory = NL_FALSE;
1547 	blas.Malloc = cuda_blas_malloc;
1548 	blas.Free = cuda_blas_free;
1549 	blas.Memcpy = cuda_blas_memcpy;
1550 	blas.Dcopy = cuda_blas_dcopy;
1551 	blas.Ddot = cuda_blas_ddot;
1552 	blas.Dnrm2 = cuda_blas_dnrm2;
1553 	blas.Daxpy = cuda_blas_daxpy;
1554 	blas.Dscal = cuda_blas_dscal;
1555 	blas.Dgemv = cuda_blas_dgemv;
1556 	blas.Dtpsv = cuda_blas_dtpsv;
1557 	nlBlasResetStats(&blas);
1558 	initialized = NL_TRUE;
1559     }
1560     return &blas;
1561 }
1562 
1563 
1564 /**************************************************************************/
1565