/*
 * Copyright (c) 2004-2017, Bruno Levy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *  * Neither the name of the ALICE Project-Team nor the names of its
 *    contributors may be used to endorse or promote products derived from this
 *    software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * If you modify this software, you should include a notice giving the
 * name of the person performing the modification, the date of modification,
 * and the reason for such modification.
 *
 * Contact: Bruno Levy
 *
 *    levy@loria.fr
 *
 *    ALICE Project
 *    LORIA, INRIA Lorraine,
 *    Campus Scientifique, BP 239
 *    54506 VANDOEUVRE LES NANCY CEDEX
 *    FRANCE
 *
 */

#include "nl_cuda.h"
#include "nl_context.h"
#include <stdint.h>

/**
 * \file nl_cuda.c
 * \brief Weak-coupling adapter to call CUDA from OpenNL.
 */

/**********************************************************/
/* CUDA structures and functions                          */
/* Repeated here so that OpenNL can be compiled without   */
/* requiring CUDA to be installed on the system.          */
/**********************************************************/

enum cudaComputeMode {
    cudaComputeModeDefault = 0,
    cudaComputeModeExclusive = 1,
    cudaComputeModeProhibited = 2,
    cudaComputeModeExclusiveProcess = 3
};

enum cudaMemcpyKind {
    cudaMemcpyHostToHost = 0,
    cudaMemcpyHostToDevice = 1,
    cudaMemcpyDeviceToHost = 2,
    cudaMemcpyDeviceToDevice = 3,
    cudaMemcpyDefault = 4
};

enum cudaDeviceAttribute {
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4,
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,
    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,
    CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,
    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,
    CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,
    CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,
    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,
    CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,
    CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,
    CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,
    CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,
    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,
    CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17,
    CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,
    CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,
    CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29,
    CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30,
    CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,
    CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32,
    CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33,
    CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34,
    CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35,
    CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36,
    CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37,
    CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38,
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,
    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,
    CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43,
    CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49,
    CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50,
    CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74,
    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77,
    CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78,
    CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79,
    CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80,
    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81,
    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82,
    CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83,
    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84,
    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85,
    CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86,
    CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87,
    CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88,
    CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89,
    CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90,
    CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91,
    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92,
    CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93,
    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94,
    CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95,
    CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96,
    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97,
    CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98,
    CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99,
    CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100,
    CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101,
    CU_DEVICE_ATTRIBUTE_MAX
};
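
/*
 * Note: the values of these enumerants mirror the numeric values of the
 * corresponding constants in the CUDA headers (CUdevice_attribute in the
 * driver API, cudaDeviceAttr in the runtime API). They are ABI constants:
 * it is the value, not the name, that is passed to the dynamically-loaded
 * cudaDeviceGetAttribute(), so they must not be renumbered.
 */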

/*
 * We use cudaGetDeviceProperties() to get the name of the device.
 * A cudaDeviceGetName() function would be more convenient, but it is
 * not always available. We do not use the other device properties,
 * because the layout of the fields changes between CUDA versions;
 * the large "buff" member gives cudaGetDeviceProperties() enough room
 * to write the rest of the structure, whatever its actual size.
 */
struct cudaDeviceProp {
    char name[256];
    char buff[4096];
};

typedef int cudaError_t;

typedef cudaError_t (*FUNPTR_cudaDriverGetVersion)(int* version);
typedef cudaError_t (*FUNPTR_cudaRuntimeGetVersion)(int* version);
typedef cudaError_t (*FUNPTR_cudaGetDeviceCount)(int* device_count);
typedef cudaError_t (*FUNPTR_cudaGetDeviceProperties)(
    struct cudaDeviceProp *props, int device
);
typedef cudaError_t (*FUNPTR_cudaDeviceGetAttribute)(
    int* attrib_value, enum cudaDeviceAttribute attrib, int device
);
typedef cudaError_t (*FUNPTR_cudaDeviceReset)(void);
typedef cudaError_t (*FUNPTR_cudaMalloc)(void **devPtr, size_t size);
typedef cudaError_t (*FUNPTR_cudaFree)(void* devPtr);
typedef cudaError_t (*FUNPTR_cudaMemcpy)(
    void *dst, const void *src, size_t count, enum cudaMemcpyKind kind
);

/**
 * \brief Finds and initializes a function pointer to
 *  one of the functions in CUDA.
 * \details Function pointers are stored into the
 *  CUDAContext returned by the function CUDA().
 *  If a symbol is not found, NL_FALSE is returned from the
 *  calling function.
 */
#define find_cuda_func(name)                                   \
    if(                                                        \
        (                                                      \
            CUDA()->name =                                     \
            (FUNPTR_##name)nlFindFunction(                     \
                CUDA()->DLL_cudart,#name                       \
            )                                                  \
        ) == NULL                                              \
    ) {                                                        \
        nlError("nlInitExtension_CUDA: function not found", #name); \
        return NL_FALSE;                                       \
    }
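
/*
 * For instance, find_cuda_func(cudaMalloc) expands (modulo formatting) to:
 *
 *    if((CUDA()->cudaMalloc = (FUNPTR_cudaMalloc)nlFindFunction(
 *            CUDA()->DLL_cudart, "cudaMalloc"
 *    )) == NULL) {
 *        nlError("nlInitExtension_CUDA: function not found", "cudaMalloc");
 *        return NL_FALSE;
 *    }
 *
 * that is, it looks the symbol up in the dynamically-loaded library and
 * caches the resulting function pointer in the CUDAContext.
 */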

/**********************************************************/
/* CUBLAS structures and functions                        */
/**********************************************************/

struct cublasContext;
typedef struct cublasContext *cublasHandle_t;
typedef int cublasStatus_t;

typedef enum {
    CUBLAS_SIDE_LEFT =0,
    CUBLAS_SIDE_RIGHT=1
} cublasSideMode_t;

typedef enum {
    CUBLAS_FILL_MODE_LOWER=0,
    CUBLAS_FILL_MODE_UPPER=1
} cublasFillMode_t;

typedef enum {
    CUBLAS_OP_N=0,
    CUBLAS_OP_T=1,
    CUBLAS_OP_C=2
} cublasOperation_t;

typedef enum {
    CUBLAS_DIAG_NON_UNIT=0,
    CUBLAS_DIAG_UNIT=1
} cublasDiagType_t;

typedef cublasStatus_t (*FUNPTR_cublasCreate)(cublasHandle_t* handle);
typedef cublasStatus_t (*FUNPTR_cublasDestroy)(cublasHandle_t handle);

typedef cublasStatus_t (*FUNPTR_cublasGetVersion)(
    cublasHandle_t handle, int* version
);

typedef cublasStatus_t (*FUNPTR_cublasDdot)(
    cublasHandle_t handle, int n,
    const double *x, int incx,
    const double *y, int incy,
    double *result
);

/* Note: y is an output parameter, hence non-const (matches cuBLAS). */
typedef cublasStatus_t (*FUNPTR_cublasDcopy)(
    cublasHandle_t handle, int n,
    const double *x, int incx,
    double *y, int incy
);

typedef cublasStatus_t (*FUNPTR_cublasDaxpy)(
    cublasHandle_t handle, int n,
    const double* alpha,
    const double *x, int incx,
    double *y, int incy
);

/* Note: x is scaled in-place, hence non-const (matches cuBLAS). */
typedef cublasStatus_t (*FUNPTR_cublasDscal)(
    cublasHandle_t handle, int n,
    const double* alpha,
    double *x, int incx
);

typedef cublasStatus_t (*FUNPTR_cublasDnrm2)(
    cublasHandle_t handle, int n,
    const double *x, int incx,
    double* result
);

typedef cublasStatus_t (*FUNPTR_cublasDdgmm)(
    cublasHandle_t handle, cublasSideMode_t mode,
    int m, int n,
    const double* A, int lda,
    const double* x, int incx,
    double* C, int ldc
);

typedef cublasStatus_t (*FUNPTR_cublasDgemv)(
    cublasHandle_t handle,
    cublasOperation_t trans,
    int m,
    int n,
    const double *alpha,
    const double *A,
    int lda,
    const double *x,
    int incx,
    const double *beta,
    double *y,
    int incy
);

typedef cublasStatus_t (*FUNPTR_cublasDtpsv)(
    cublasHandle_t handle, cublasFillMode_t uplo,
    cublasOperation_t trans, cublasDiagType_t diag,
    int n, const double *AP,
    double* x, int incx
);


/**
 * \brief Finds and initializes a function pointer to
 *  one of the functions in CUBLAS.
 * \details Function pointers are stored into the
 *  CUDAContext returned by the function CUDA().
 *  If a symbol is not found, NL_FALSE is returned from the
 *  calling function. Here we use the functions suffixed
 *  by "_v2".
 */
#define find_cublas_func(name)                                 \
    if(                                                        \
        (                                                      \
            CUDA()->name =                                     \
            (FUNPTR_##name)nlFindFunction(                     \
                CUDA()->DLL_cublas,#name "_v2"                 \
            )                                                  \
        ) == NULL                                              \
    ) {                                                        \
        nlError("nlInitExtension_CUDA: function not found", #name); \
        return NL_FALSE;                                       \
    }

/**
 * \brief Finds and initializes a function pointer to
 *  one of the functions in CUBLAS.
 * \details Function pointers are stored into the
 *  CUDAContext returned by the function CUDA().
 *  If a symbol is not found, NL_FALSE is returned from the
 *  calling function. Unlike find_cublas_func(), this macro
 *  uses the un-suffixed function name, for the functions
 *  that have no "_v2" variant (such as cublasDdgmm).
 */
#define find_cublas_func_v1(name)                              \
    if(                                                        \
        (                                                      \
            CUDA()->name =                                     \
            (FUNPTR_##name)nlFindFunction(                     \
                CUDA()->DLL_cublas,#name                       \
            )                                                  \
        ) == NULL                                              \
    ) {                                                        \
        nlError("nlInitExtension_CUDA: function not found", #name); \
        return NL_FALSE;                                       \
    }


/**********************************************************/
/* CUSPARSE structures and functions                      */
/**********************************************************/

struct cusparseContext;
typedef struct cusparseContext *cusparseHandle_t;
typedef int cusparseStatus_t;
struct cusparseMatDescr;
typedef struct cusparseMatDescr *cusparseMatDescr_t;

typedef enum {
    CUSPARSE_MATRIX_TYPE_GENERAL = 0,
    CUSPARSE_MATRIX_TYPE_SYMMETRIC = 1,
    CUSPARSE_MATRIX_TYPE_HERMITIAN = 2,
    CUSPARSE_MATRIX_TYPE_TRIANGULAR = 3
} cusparseMatrixType_t;

typedef enum {
    CUSPARSE_INDEX_BASE_ZERO = 0,
    CUSPARSE_INDEX_BASE_ONE = 1
} cusparseIndexBase_t;

typedef enum {
    CUSPARSE_OPERATION_NON_TRANSPOSE = 0,
    CUSPARSE_OPERATION_TRANSPOSE = 1,
    CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2
} cusparseOperation_t;

typedef cusparseStatus_t (*FUNPTR_cusparseCreate)(cusparseHandle_t* handle);
typedef cusparseStatus_t (*FUNPTR_cusparseDestroy)(cusparseHandle_t handle);
typedef cusparseStatus_t (*FUNPTR_cusparseGetVersion)(
    cusparseHandle_t handle, int* version
);
typedef cusparseStatus_t (*FUNPTR_cusparseCreateMatDescr)(
    cusparseMatDescr_t* descr
);
typedef cusparseStatus_t (*FUNPTR_cusparseDestroyMatDescr)(
    cusparseMatDescr_t descr
);
typedef cusparseStatus_t (*FUNPTR_cusparseSetMatType)(
    cusparseMatDescr_t descr, cusparseMatrixType_t mtype
);
typedef cusparseStatus_t (*FUNPTR_cusparseSetMatIndexBase)(
    cusparseMatDescr_t descr, cusparseIndexBase_t ibase
);
typedef cusparseStatus_t (*FUNPTR_cusparseDcsrmv)(
    cusparseHandle_t handle, cusparseOperation_t transA,
    int m, int n, int nnz,
    const double *alpha, const cusparseMatDescr_t descrA,
    const double *csrSortedValA, const int *csrSortedRowPtrA,
    const int *csrSortedColIndA, const double *x,
    const double *beta, double *y
);

/**********************************************************/
/* CUSPARSE v10 structures and functions (generic matrix) */
/**********************************************************/

typedef enum cudaDataType_t {
    CUDA_R_16F= 2, CUDA_C_16F= 6,
    CUDA_R_32F= 0, CUDA_C_32F= 4,
    CUDA_R_64F= 1, CUDA_C_64F= 5,
    CUDA_R_8I = 3, CUDA_C_8I = 7,
    CUDA_R_8U = 8, CUDA_C_8U = 9,
    CUDA_R_32I= 10, CUDA_C_32I= 11,
    CUDA_R_32U= 12, CUDA_C_32U= 13
} cudaDataType;

struct cusparseDnVecDescr;
struct cusparseSpMatDescr;
typedef struct cusparseDnVecDescr* cusparseDnVecDescr_t;
typedef struct cusparseSpMatDescr* cusparseSpMatDescr_t;

typedef enum {
    CUSPARSE_INDEX_16U = 1,
    CUSPARSE_INDEX_32I = 2,
    CUSPARSE_INDEX_64I = 3
} cusparseIndexType_t;

typedef cusparseStatus_t (*FUNPTR_cusparseCreateDnVec)(
    cusparseDnVecDescr_t* dnVecDescr,
    int64_t size, void* values, cudaDataType valueType
);

typedef cusparseStatus_t (*FUNPTR_cusparseDestroyDnVec)(
    cusparseDnVecDescr_t dnVecDescr
);

typedef cusparseStatus_t (*FUNPTR_cusparseDnVecSetValues)(
    cusparseDnVecDescr_t dnVecDescr, void* values
);

typedef cusparseStatus_t (*FUNPTR_cusparseCreateCsr)(
    cusparseSpMatDescr_t* spMatDescr,
    int64_t rows, int64_t cols, int64_t nnz,
    void* csrRowOffsets, void* csrColInd, void* csrValues,
    cusparseIndexType_t csrRowOffsetsType,
    cusparseIndexType_t csrColIndType,
    cusparseIndexBase_t idxBase,
    cudaDataType valueType
);

typedef cusparseStatus_t (*FUNPTR_cusparseDestroySpMat)(
    cusparseSpMatDescr_t spMatDescr
);

typedef enum {
    CUSPARSE_MV_ALG_DEFAULT = 0,
    CUSPARSE_COOMV_ALG = 1,
    CUSPARSE_CSRMV_ALG1 = 2,
    CUSPARSE_CSRMV_ALG2 = 3
} cusparseSpMVAlg_t;

typedef cusparseStatus_t (*FUNPTR_cusparseSpMV)(
    cusparseHandle_t handle, cusparseOperation_t opA,
    const void* alpha, const cusparseSpMatDescr_t matA,
    const cusparseDnVecDescr_t vecX, const void* beta,
    const cusparseDnVecDescr_t vecY, cudaDataType computeType,
    cusparseSpMVAlg_t alg, void* externalBuffer
);

typedef cusparseStatus_t (*FUNPTR_cusparseSpMV_bufferSize)(
    cusparseHandle_t handle, cusparseOperation_t opA,
    const void* alpha, const cusparseSpMatDescr_t matA,
    const cusparseDnVecDescr_t vecX, const void* beta,
    const cusparseDnVecDescr_t vecY, cudaDataType computeType,
    cusparseSpMVAlg_t alg, size_t* bufferSize
);

/**
 * \brief Finds and initializes a function pointer to
 *  one of the functions in CUSPARSE.
 * \details Function pointers are stored into the
 *  CUDAContext returned by the function CUDA().
 *  If a symbol is not found, NL_FALSE is returned from the
 *  calling function.
 */
#define find_cusparse_func(name)                               \
    if(                                                        \
        (                                                      \
            CUDA()->name =                                     \
            (FUNPTR_##name)nlFindFunction(                     \
                CUDA()->DLL_cusparse,#name                     \
            )                                                  \
        ) == NULL                                              \
    ) {                                                        \
        nlError("nlInitExtension_CUDA: function not found", #name); \
        return NL_FALSE;                                       \
    }


/**
 * \brief Finds and initializes a function pointer to
 *  one of the optional cusparse V10 functions.
 * \details Function pointers are stored into the
 *  CUDAContext returned by the function CUDA().
 *  Unlike the macros above, a missing symbol is not fatal:
 *  it only emits a warning and clears has_cusparse_V10, so
 *  that OpenNL falls back to the V9 API.
 */
#define find_cusparse_func_V10(name)                           \
    if(                                                        \
        (                                                      \
            CUDA()->name =                                     \
            (FUNPTR_##name)nlFindFunction(                     \
                CUDA()->DLL_cusparse,#name                     \
            )                                                  \
        ) == NULL                                              \
    ) {                                                        \
        nlWarning("nlInitExtension_CUDA: optional function not found", #name); \
        CUDA()->has_cusparse_V10 = NL_FALSE;                   \
    }


/**********************************************************/

/**
 * \brief The structure that stores the handle to
 *  the CUDA shared object, the function pointers
 *  and the detected version.
 */
typedef struct {
    NLdll DLL_cudart;

    FUNPTR_cudaDriverGetVersion cudaDriverGetVersion;
    FUNPTR_cudaRuntimeGetVersion cudaRuntimeGetVersion;
    FUNPTR_cudaGetDeviceCount cudaGetDeviceCount;
    FUNPTR_cudaGetDeviceProperties cudaGetDeviceProperties;
    FUNPTR_cudaDeviceGetAttribute cudaDeviceGetAttribute;
    FUNPTR_cudaDeviceReset cudaDeviceReset;
    FUNPTR_cudaMalloc cudaMalloc;
    FUNPTR_cudaFree cudaFree;
    FUNPTR_cudaMemcpy cudaMemcpy;

    NLdll DLL_cublas;
    cublasHandle_t HNDL_cublas;
    FUNPTR_cublasCreate cublasCreate;
    FUNPTR_cublasDestroy cublasDestroy;
    FUNPTR_cublasGetVersion cublasGetVersion;
    FUNPTR_cublasDdot cublasDdot;
    FUNPTR_cublasDcopy cublasDcopy;
    FUNPTR_cublasDaxpy cublasDaxpy;
    FUNPTR_cublasDscal cublasDscal;
    FUNPTR_cublasDnrm2 cublasDnrm2;
    FUNPTR_cublasDdgmm cublasDdgmm;
    FUNPTR_cublasDgemv cublasDgemv;
    FUNPTR_cublasDtpsv cublasDtpsv;

    NLdll DLL_cusparse;
    cusparseHandle_t HNDL_cusparse;
    FUNPTR_cusparseCreate cusparseCreate;
    FUNPTR_cusparseDestroy cusparseDestroy;
    FUNPTR_cusparseGetVersion cusparseGetVersion;
    FUNPTR_cusparseCreateMatDescr cusparseCreateMatDescr;
    FUNPTR_cusparseDestroyMatDescr cusparseDestroyMatDescr;
    FUNPTR_cusparseSetMatType cusparseSetMatType;
    FUNPTR_cusparseSetMatIndexBase cusparseSetMatIndexBase;
    FUNPTR_cusparseDcsrmv cusparseDcsrmv;

    /* cusparse V10 API */
    NLboolean has_cusparse_V10;
    FUNPTR_cusparseCreateDnVec cusparseCreateDnVec;
    FUNPTR_cusparseDestroyDnVec cusparseDestroyDnVec;
    FUNPTR_cusparseDnVecSetValues cusparseDnVecSetValues;
    FUNPTR_cusparseCreateCsr cusparseCreateCsr;
    FUNPTR_cusparseDestroySpMat cusparseDestroySpMat;
    FUNPTR_cusparseSpMV cusparseSpMV;
    FUNPTR_cusparseSpMV_bufferSize cusparseSpMV_bufferSize;

    int devID;
} CUDAContext;

/**
 * \brief Gets the CUDA context.
 * \return a pointer to the CUDA context
 */
static CUDAContext* CUDA() {
    static CUDAContext context;
    static NLboolean init = NL_FALSE;
    if(!init) {
        init = NL_TRUE;
        memset(&context, 0, sizeof(context));
    }
    return &context;
}

NLboolean nlExtensionIsInitialized_CUDA() {
    if(
        CUDA()->DLL_cudart == NULL ||
        CUDA()->cudaDriverGetVersion == NULL ||
        CUDA()->cudaRuntimeGetVersion == NULL ||
        CUDA()->cudaGetDeviceCount == NULL ||
        CUDA()->cudaGetDeviceProperties == NULL ||
        CUDA()->cudaDeviceGetAttribute == NULL ||
        CUDA()->cudaDeviceReset == NULL ||
        CUDA()->cudaMalloc == NULL ||
        CUDA()->cudaFree == NULL ||
        CUDA()->cudaMemcpy == NULL ||

        CUDA()->DLL_cublas == NULL ||
        CUDA()->HNDL_cublas == NULL ||
        CUDA()->cublasCreate == NULL ||
        CUDA()->cublasDestroy == NULL ||
        CUDA()->cublasGetVersion == NULL ||
        CUDA()->cublasDdot == NULL ||
        CUDA()->cublasDcopy == NULL ||
        CUDA()->cublasDaxpy == NULL ||
        CUDA()->cublasDscal == NULL ||
        CUDA()->cublasDnrm2 == NULL ||
        CUDA()->cublasDdgmm == NULL ||

        CUDA()->DLL_cusparse == NULL ||
        CUDA()->HNDL_cusparse == NULL ||
        CUDA()->cusparseCreate == NULL ||
        CUDA()->cusparseDestroy == NULL ||
        CUDA()->cusparseGetVersion == NULL ||
        CUDA()->cusparseCreateMatDescr == NULL ||
        CUDA()->cusparseDestroyMatDescr == NULL ||
        CUDA()->cusparseSetMatType == NULL ||
        CUDA()->cusparseSetMatIndexBase == NULL ||
        CUDA()->cusparseDcsrmv == NULL
    ) {
        return NL_FALSE;
    }
    return NL_TRUE;
}

static void nlTerminateExtension_CUDA(void) {
    if(!nlExtensionIsInitialized_CUDA()) {
        return;
    }

    CUDA()->cusparseDestroy(CUDA()->HNDL_cusparse);
    nlCloseDLL(CUDA()->DLL_cusparse);

    CUDA()->cublasDestroy(CUDA()->HNDL_cublas);
    nlCloseDLL(CUDA()->DLL_cublas);

    CUDA()->cudaDeviceReset();
    nlCloseDLL(CUDA()->DLL_cudart);

    memset(CUDA(), 0, sizeof(CUDAContext));
}

/**************************************************************************/

/**
 * \brief Finds the number of cores from the major and minor versions of
 *  the SM architecture (compute capability).
 * \details Highly inspired by the helpers library in the CUDA examples,
 *  see https://github.com/NVIDIA/cuda-samples/blob/master/Common/helper_cuda.h
 */
static int ConvertSMVer2Cores(int major, int minor) {
    /* Defines for GPU architecture types (using the SM version
       to determine the number of cores per SM). */
    typedef struct {
        int SM; /* 0xMm (hexadecimal notation),
                   M = SM Major version,
                   and m = SM minor version */
        int Cores;
    } sSMtoCores;

    sSMtoCores nGpuArchCoresPerSM[] = {
        { 0x10, 8 },   /* Tesla Generation   (SM 1.0) G80 class    */
        { 0x11, 8 },   /* Tesla Generation   (SM 1.1) G8x class    */
        { 0x12, 8 },   /* Tesla Generation   (SM 1.2) G9x class    */
        { 0x13, 8 },   /* Tesla Generation   (SM 1.3) GT200 class  */
        { 0x20, 32 },  /* Fermi Generation   (SM 2.0) GF100 class  */
        { 0x21, 48 },  /* Fermi Generation   (SM 2.1) GF10x class  */
        { 0x30, 192},  /* Kepler Generation  (SM 3.0) GK10x class  */
        { 0x35, 192},  /* Kepler Generation  (SM 3.5) GK11x class  */
        { 0x50, 128},  /* Maxwell Generation (SM 5.0) GM10x class
                          (yes, #cores smaller than with 3.x)      */
        { 0x52, 128},  /* Maxwell Generation (SM 5.2) GM20x class  */
        { 0x60, 64 },  /* Pascal Generation  (SM 6.0) GP100,GP102
                          (yes, 64, but GP100 has superfast double precision) */
        { 0x61, 128},  /* Pascal Generation  (SM 6.1) GP104 class
                          (but FP64 runs at 1/32 FP32 speed)       */
        { 0x70, 64 },  /* yes, the number of cores per MP decreased in SM 7.x */
        { 0x72, 64 },
        { 0x75, 64 },
        { -1, -1 }
    };
    int index = 0;
    if(major == 0 && minor == 0) {
        return 0;
    }
    while (nGpuArchCoresPerSM[index].SM != -1) {
        if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
            return nGpuArchCoresPerSM[index].Cores;
        }
        index++;
    }
    /* If the SM version is not in the table, use a default value (64). */
    nl_printf(
        "MapSMtoCores for SM %d.%d is undefined, defaulting to %d cores/SM\n",
        major, minor, 64
    );
    return 64;
}

/**
 * \brief Gets the double-precision performance for a device.
 * \param[in] device the device
 * \return the peak GFlops double-precision performance of the device,
 *  or 0.0 if the device's compute mode is prohibited.
 */
static double getDeviceDoublePrecisionGFlops(int device) {
    int compute_mode;
    int compute_capability_major;
    int compute_capability_minor;
    int cores_per_multiprocessor;
    int multiprocessor_count;
    int double_precision_perf_ratio;
    int clock_rate;
    double result = 0.0;

    CUDA()->cudaDeviceGetAttribute(
        &compute_mode,
        CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,
        device
    );

    if(compute_mode == cudaComputeModeProhibited) {
        return 0.0;
    }

    CUDA()->cudaDeviceGetAttribute(
        &compute_capability_major,
        CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
        device
    );

    CUDA()->cudaDeviceGetAttribute(
        &compute_capability_minor,
        CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
        device
    );

    CUDA()->cudaDeviceGetAttribute(
        &multiprocessor_count,
        CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
        device
    );

    CUDA()->cudaDeviceGetAttribute(
        &double_precision_perf_ratio,
        CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO,
        device
    );

    CUDA()->cudaDeviceGetAttribute(
        &clock_rate,
        CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
        device
    );

    cores_per_multiprocessor = ConvertSMVer2Cores(
        compute_capability_major, compute_capability_minor
    );

    /*
     * The 2.0 factor is needed to match the peak figures published by
     * NVIDIA: an FMA (fused multiply-add) counts as two floating-point
     * operations per cycle. Note that CU_DEVICE_ATTRIBUTE_CLOCK_RATE is
     * expressed in kHz, so dividing by 1024*1024 yields (binary) GHz,
     * hence GFlops.
     */
    result = 2.0 *
        ((double)(clock_rate) / (1024.0 * 1024.0)) *
        (double)(multiprocessor_count) *
        (double)(cores_per_multiprocessor) /
        (double)(double_precision_perf_ratio) ;

    return result;
}
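
/*
 * Sanity check of the formula above, with illustrative (not measured)
 * numbers: clock_rate = 1480000 kHz, 56 multiprocessors, 64 cores per
 * multiprocessor and a single-to-double perf ratio of 2 give
 * 2.0 * (1480000/1048576) * 56 * 64 / 2, i.e. approximately 5059 GFlops,
 * consistent with the ~5.3 double-precision TFlops advertised for that
 * class of GPU (the binary GHz conversion slightly underestimates the
 * decimal value).
 */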

/**
 * \brief Gets the device ID with the maximum double precision
 *  performance.
 * \return the ID of the fastest device, or -1 if no device is
 *  available.
 */
static int getBestDeviceID() {
    int result = -1;
    double fastest_GFlops = 0.0;
    int device_count;
    int retval = CUDA()->cudaGetDeviceCount(&device_count);
    int device;
    double device_GFlops;
    int driver_ver;
    int runtime_ver;
    if(retval == 35) {
        /* 35 is cudaErrorInsufficientDriver */
        nl_printf("Error: Driver/CUDA versions mismatch\n");
        retval = CUDA()->cudaDriverGetVersion(&driver_ver);
        nl_printf("cudaDriverGetVersion() retval=%d\n",retval);
        retval = CUDA()->cudaRuntimeGetVersion(&runtime_ver);
        nl_printf("cudaRuntimeGetVersion() retval=%d\n",retval);

        nl_printf("  Driver  version=%d\n",driver_ver);
        nl_printf("  Runtime version=%d\n",runtime_ver);
        return result;
    }
    for(device=0; device<device_count; ++device) {
        device_GFlops = getDeviceDoublePrecisionGFlops(device);
        if(device_GFlops > fastest_GFlops) {
            fastest_GFlops = device_GFlops;
            result = device;
        }
    }
    return result;
}

/**************************************************************************/

#ifdef NL_OS_UNIX
#  define LIBPREFIX "lib"
#  ifdef NL_OS_APPLE
#    define LIBEXTENSION ".dylib"
#  else
#    define LIBEXTENSION ".so"
#  endif
#else
#  define LIBPREFIX
#  define LIBEXTENSION ".dll"
#endif
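
/*
 * With these definitions, the runtime is loaded as "libcudart.so" on
 * Linux, "libcudart.dylib" on MacOS/X and "cudart.dll" on Windows
 * (and similarly for cublas and cusparse below); nlOpenDLL() then
 * resolves the name through the standard library search path of the OS.
 */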


NLboolean nlInitExtension_CUDA(void) {
    int cublas_version;
    int cusparse_version;
    static struct cudaDeviceProp deviceProp;
    int compute_capability_major;
    int compute_capability_minor;
    int multiprocessor_count;
    int max_shared_mem_per_block;
    int max_shared_mem_per_multiprocessor;
    int max_regs_per_block;
    int max_regs_per_multiprocessor;
    int warp_size;
    int double_precision_perf_ratio;

    NLenum flags = NL_LINK_LAZY | NL_LINK_GLOBAL;
    if(nlCurrentContext == NULL || !nlCurrentContext->verbose) {
        flags |= NL_LINK_QUIET;
    }

    if(nlExtensionIsInitialized_CUDA()) {
        return NL_TRUE;
    }

    CUDA()->DLL_cudart = nlOpenDLL(
        LIBPREFIX "cudart" LIBEXTENSION, flags
    );

    find_cuda_func(cudaDriverGetVersion);
    find_cuda_func(cudaRuntimeGetVersion);
    find_cuda_func(cudaGetDeviceCount);
    find_cuda_func(cudaGetDeviceProperties);
    find_cuda_func(cudaDeviceGetAttribute);
    find_cuda_func(cudaDeviceReset);
    find_cuda_func(cudaMalloc);
    find_cuda_func(cudaFree);
    find_cuda_func(cudaMemcpy);

    CUDA()->devID = getBestDeviceID();

    if(
        CUDA()->devID == -1 ||
        CUDA()->cudaGetDeviceProperties(&deviceProp, CUDA()->devID)
    ) {
        nl_fprintf(stderr,"OpenNL CUDA: could not find a CUDA device\n");
        return NL_FALSE;
    }

    nl_printf("OpenNL CUDA: Device ID = %d\n", CUDA()->devID);
    nl_printf("OpenNL CUDA: Device name=%s\n", deviceProp.name);

    {
        CUDA()->cudaDeviceGetAttribute(
            &compute_capability_major,
            CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
            CUDA()->devID
        );

        CUDA()->cudaDeviceGetAttribute(
            &compute_capability_minor,
            CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
            CUDA()->devID
        );

        CUDA()->cudaDeviceGetAttribute(
            &multiprocessor_count,
            CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
            CUDA()->devID
        );

        CUDA()->cudaDeviceGetAttribute(
            &max_shared_mem_per_block,
            CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
            CUDA()->devID
        );

        CUDA()->cudaDeviceGetAttribute(
            &max_shared_mem_per_multiprocessor,
            CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
            CUDA()->devID
        );

        CUDA()->cudaDeviceGetAttribute(
            &max_regs_per_block,
            CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
            CUDA()->devID
        );

        CUDA()->cudaDeviceGetAttribute(
            &max_regs_per_multiprocessor,
            CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
            CUDA()->devID
        );

        CUDA()->cudaDeviceGetAttribute(
            &warp_size,
            CU_DEVICE_ATTRIBUTE_WARP_SIZE,
            CUDA()->devID
        );

        CUDA()->cudaDeviceGetAttribute(
            &double_precision_perf_ratio,
            CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO,
            CUDA()->devID
        );

        nl_printf(
            "OpenNL CUDA: Device has %d Multi-Processors, "
            "%d cores per Multi-Processor, SM %d.%d compute capabilities\n",
            multiprocessor_count,
            ConvertSMVer2Cores(
                compute_capability_major, compute_capability_minor
            ),
            compute_capability_major, compute_capability_minor
        );

        nl_printf(
            "OpenNL CUDA: %d kB shared mem. per block, %d per MP\n",
            (int)(max_shared_mem_per_block / 1024),
            (int)(max_shared_mem_per_multiprocessor / 1024)
        );

        nl_printf(
            "OpenNL CUDA: %d regs. per block, %d per MP\n",
            max_regs_per_block, max_regs_per_multiprocessor
        );

        nl_printf("OpenNL CUDA: warpsize = %d\n", warp_size);

        nl_printf(
            "OpenNL CUDA: double precision perf ratio = %d\n",
            double_precision_perf_ratio
        );
        nl_printf(
            "OpenNL CUDA: theoretical peak double precision GFlops = %f\n",
            getDeviceDoublePrecisionGFlops(CUDA()->devID)
        );
        if (
            (compute_capability_major * 0x10 + compute_capability_minor) < 0x11
        ) {
            nl_fprintf(
                stderr,
                "OpenNL CUDA requires a minimum compute capability of 1.1\n"
            );
            CUDA()->cudaDeviceReset();
            return NL_FALSE;
        }
    }

    CUDA()->DLL_cublas = nlOpenDLL(
        LIBPREFIX "cublas" LIBEXTENSION, flags
    );

    find_cublas_func(cublasCreate);
    find_cublas_func(cublasDestroy);
    find_cublas_func(cublasGetVersion);
    find_cublas_func(cublasDdot);
    find_cublas_func(cublasDaxpy);
    find_cublas_func(cublasDcopy);
    find_cublas_func(cublasDscal);
    find_cublas_func(cublasDnrm2);
    find_cublas_func(cublasDgemv);
    find_cublas_func(cublasDtpsv);
    find_cublas_func_v1(cublasDdgmm);

    if(CUDA()->cublasCreate(&CUDA()->HNDL_cublas)) {
        return NL_FALSE;
    }

    if(CUDA()->cublasGetVersion(CUDA()->HNDL_cublas, &cublas_version)) {
        return NL_FALSE;
    }
    nl_printf("OpenNL CUDA: cublas version = %d\n", cublas_version);

    CUDA()->DLL_cusparse = nlOpenDLL(
        LIBPREFIX "cusparse" LIBEXTENSION, flags
    );
    find_cusparse_func(cusparseCreate);
    find_cusparse_func(cusparseDestroy);
    find_cusparse_func(cusparseGetVersion);
    find_cusparse_func(cusparseCreateMatDescr);
    find_cusparse_func(cusparseDestroyMatDescr);
    find_cusparse_func(cusparseSetMatType);
    find_cusparse_func(cusparseSetMatIndexBase);
    find_cusparse_func(cusparseDcsrmv);

    /* V10 API */
    if(getenv("NL_NO_CUSPARSE_V10") == NULL) {
        nl_printf("OpenNL CUDA: Trying to initialize cusparse V10\n");
        CUDA()->has_cusparse_V10 = NL_TRUE;
        find_cusparse_func_V10(cusparseCreateDnVec);
        find_cusparse_func_V10(cusparseDestroyDnVec);
        find_cusparse_func_V10(cusparseDnVecSetValues);
        find_cusparse_func_V10(cusparseCreateCsr);
        find_cusparse_func_V10(cusparseDestroySpMat);
        find_cusparse_func_V10(cusparseSpMV);
        find_cusparse_func_V10(cusparseSpMV_bufferSize);
    } else {
        CUDA()->has_cusparse_V10 = NL_FALSE;
    }

    if(CUDA()->has_cusparse_V10) {
        nl_printf("OpenNL CUDA: using cusparse V10 API\n");
    } else {
        nl_printf("OpenNL CUDA: using cusparse V9 API\n");
    }

    if(CUDA()->cusparseCreate(&CUDA()->HNDL_cusparse)) {
        return NL_FALSE;
    }
    if(CUDA()->cusparseGetVersion(CUDA()->HNDL_cusparse, &cusparse_version)) {
        return NL_FALSE;
    }
    nl_printf("OpenNL CUDA: cusparse version = %d\n", cusparse_version);

    if(!nlExtensionIsInitialized_CUDA()) {
        return NL_FALSE;
    }

    atexit(nlTerminateExtension_CUDA);
    return NL_TRUE;
}

static void nlCUDACheckImpl(int status, int line) {
    if(status != 0) {
        nl_fprintf(stderr,"nl_cuda.c:%d fatal error %d\n",line, status);
        CUDA()->cudaDeviceReset();
        exit(-1);
    }
}

#define nlCUDACheck(status) nlCUDACheckImpl(status, __LINE__)

/**************************************************************************/

/**
 * Abstract matrix interface for a CRS matrix stored on the GPU.
 */
typedef struct {
    NLuint m;
    NLuint n;
    NLenum type;
    NLDestroyMatrixFunc destroy_func;
    NLMultMatrixVectorFunc mult_func;

    /* cusparse V9 implementation */
    cusparseMatDescr_t descr;

    /* cusparse V10 implementation */
    cusparseSpMatDescr_t descr_V10;
    cusparseDnVecDescr_t X_V10;
    cusparseDnVecDescr_t Y_V10;
    NLboolean work_init_V10;
    void* work_V10;
    size_t work_size_V10;

    NLuint_big nnz;
    int* colind;
    int* rowptr;
    double* val;
} NLCUDASparseMatrix;
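
/*
 * Reminder on the (standard) CRS layout used above: val[] stores the
 * nnz nonzero coefficients row by row, colind[] stores the column index
 * of each coefficient, and rowptr[i]..rowptr[i+1]-1 is the range of
 * indices in val[]/colind[] that belongs to row i (hence rowptr has
 * m+1 entries). All three arrays live in device memory.
 */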


/**
 * \brief Deallocates just the CRS part of a CUDA matrix.
 */
static void nlCRSMatrixCUDADestroyCRS(NLCUDASparseMatrix* Mcuda) {
    if(!nlExtensionIsInitialized_CUDA()) {
        return;
    }
    if(Mcuda->colind != NULL) {
        nlCUDACheck(CUDA()->cudaFree(Mcuda->colind));
        Mcuda->colind = NULL;
    }
    if(Mcuda->rowptr != NULL) {
        nlCUDACheck(CUDA()->cudaFree(Mcuda->rowptr));
        Mcuda->rowptr = NULL;
    }
    if(Mcuda->val != NULL) {
        nlCUDACheck(CUDA()->cudaFree(Mcuda->val));
        Mcuda->val = NULL;
    }
}

static void nlCRSMatrixCUDADestroy(NLCUDASparseMatrix* Mcuda) {
    if(!nlExtensionIsInitialized_CUDA()) {
        return;
    }
    nlCRSMatrixCUDADestroyCRS(Mcuda);
    if(CUDA()->has_cusparse_V10) {
        nlCUDACheck(CUDA()->cusparseDestroySpMat(Mcuda->descr_V10));
        if(Mcuda->X_V10 != NULL) {
            nlCUDACheck(CUDA()->cusparseDestroyDnVec(Mcuda->X_V10));
        }
        if(Mcuda->Y_V10 != NULL) {
            nlCUDACheck(CUDA()->cusparseDestroyDnVec(Mcuda->Y_V10));
        }
        if(Mcuda->work_V10 != NULL) {
            nlCUDACheck(CUDA()->cudaFree(Mcuda->work_V10));
        }
    } else {
        nlCUDACheck(CUDA()->cusparseDestroyMatDescr(Mcuda->descr));
    }
    memset(Mcuda, 0, sizeof(*Mcuda));
}

static void nlCRSMatrixCUDAMult(
    NLCUDASparseMatrix* Mcuda, const double* x, double* y
) {
    const double one = 1.0;
    const double zero = 0.0;
    if(CUDA()->has_cusparse_V10) {
        if(Mcuda->X_V10 == NULL) {
            nlCUDACheck(
                CUDA()->cusparseCreateDnVec(
                    &Mcuda->X_V10, Mcuda->n, (void*)x, CUDA_R_64F
                )
            );
        } else {
            nlCUDACheck(
                CUDA()->cusparseDnVecSetValues(Mcuda->X_V10, (void*)x)
            );
        }
        if(Mcuda->Y_V10 == NULL) {
            nlCUDACheck(
                CUDA()->cusparseCreateDnVec(
                    &Mcuda->Y_V10, Mcuda->m, y, CUDA_R_64F
                )
            );
        } else {
            nlCUDACheck(
                CUDA()->cusparseDnVecSetValues(Mcuda->Y_V10, y)
            );
        }
        if(!Mcuda->work_init_V10) {
            nlCUDACheck(
                CUDA()->cusparseSpMV_bufferSize(
                    CUDA()->HNDL_cusparse,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &one,
                    Mcuda->descr_V10,
                    Mcuda->X_V10,
                    &zero,
                    Mcuda->Y_V10,
                    CUDA_R_64F,
                    CUSPARSE_CSRMV_ALG2, /* rather than CUSPARSE_MV_ALG_DEFAULT */
                    &Mcuda->work_size_V10
                )
            );
            nl_printf(
                "Buffer size = %lu\n", (unsigned long)Mcuda->work_size_V10
            );
            if(Mcuda->work_size_V10 != 0) {
                nlCUDACheck(
                    CUDA()->cudaMalloc(&Mcuda->work_V10,Mcuda->work_size_V10)
                );
                nl_printf(
                    "work = 0x%lx\n", (unsigned long)(uintptr_t)Mcuda->work_V10
                );
            }
            Mcuda->work_init_V10 = NL_TRUE;
        }
        nlCUDACheck(
            CUDA()->cusparseSpMV(
                CUDA()->HNDL_cusparse,
                CUSPARSE_OPERATION_NON_TRANSPOSE,
                &one,
                Mcuda->descr_V10,
                Mcuda->X_V10,
                &zero,
                Mcuda->Y_V10,
                CUDA_R_64F,
                CUSPARSE_CSRMV_ALG2, /* rather than CUSPARSE_MV_ALG_DEFAULT */
                Mcuda->work_V10
            )
        );
    } else {
        nlCUDACheck(
            CUDA()->cusparseDcsrmv(
                CUDA()->HNDL_cusparse,
                CUSPARSE_OPERATION_NON_TRANSPOSE,
                (int)Mcuda->m,
                (int)Mcuda->n,
                (int)Mcuda->nnz,
                &one,
                Mcuda->descr,
                Mcuda->val,
                Mcuda->rowptr,
                Mcuda->colind,
                x,
                &zero,
                y
            )
        );
    }
    /* Each of the nnz nonzero entries contributes a multiply and an add. */
    nlCUDABlas()->flops += (NLulong)(2*Mcuda->nnz);
}

NLMatrix nlCUDAMatrixNewFromCRSMatrix(NLMatrix M_in) {
    NLCUDASparseMatrix* Mcuda = NL_NEW(NLCUDASparseMatrix);
    NLCRSMatrix* M = (NLCRSMatrix*)(M_in);
    size_t colind_sz, rowptr_sz, val_sz;
    nl_assert(M_in->type == NL_MATRIX_CRS);
    Mcuda->m = M->m;
    Mcuda->n = M->n;
    /* nnz is NLuint_big: do not truncate to NLuint (GARGANTUA mode). */
    Mcuda->nnz = nlCRSMatrixNNZ(M);

    colind_sz = (size_t)Mcuda->nnz*sizeof(NLuint);
    rowptr_sz = (size_t)(Mcuda->m+1)*sizeof(NLuint_big);
    val_sz    = (size_t)Mcuda->nnz*sizeof(double);

    nlCUDACheck(CUDA()->cudaMalloc((void**)&Mcuda->colind,colind_sz));
    nlCUDACheck(CUDA()->cudaMalloc((void**)&Mcuda->rowptr,rowptr_sz));
    nlCUDACheck(CUDA()->cudaMalloc((void**)&Mcuda->val,val_sz));
    nlCUDACheck(CUDA()->cudaMemcpy(
        Mcuda->colind, M->colind, colind_sz, cudaMemcpyHostToDevice)
    );
    nlCUDACheck(CUDA()->cudaMemcpy(
        Mcuda->rowptr, M->rowptr, rowptr_sz, cudaMemcpyHostToDevice)
    );
    nlCUDACheck(CUDA()->cudaMemcpy(
        Mcuda->val, M->val, val_sz, cudaMemcpyHostToDevice)
    );
    Mcuda->type = NL_MATRIX_OTHER;
    Mcuda->destroy_func = (NLDestroyMatrixFunc)nlCRSMatrixCUDADestroy;
    Mcuda->mult_func = (NLMultMatrixVectorFunc)nlCRSMatrixCUDAMult;

#ifdef GARGANTUA
    if(!CUDA()->has_cusparse_V10) {
        nlError(
            "OpenNL CUDA",
            "FATAL ERROR, GARGANTUA mode needs cusparse V10\n"
        );
        exit(-1);
    }
#  define ROWPTR_TYPE CUSPARSE_INDEX_64I
#else
#  define ROWPTR_TYPE CUSPARSE_INDEX_32I
#endif
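
/*
 * GARGANTUA mode uses 64-bit row pointers, for matrices with more
 * nonzero entries than a 32-bit index can address. Only the cusparse
 * V10 generic-matrix API can describe such a matrix (mixed 64-bit
 * rowptr / 32-bit colind), hence the hard error above when V10 is
 * not available.
 */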

    if(CUDA()->has_cusparse_V10) {
        nlCUDACheck(
            CUDA()->cusparseCreateCsr(
                &Mcuda->descr_V10,
                Mcuda->m,
                Mcuda->n,
                (int64_t)Mcuda->nnz,
                Mcuda->rowptr,
                Mcuda->colind,
                Mcuda->val,
                ROWPTR_TYPE,
                CUSPARSE_INDEX_32I,
                CUSPARSE_INDEX_BASE_ZERO,
                CUDA_R_64F
            )
        );
    } else {
        nlCUDACheck(CUDA()->cusparseCreateMatDescr(&Mcuda->descr));
        if(M->symmetric_storage) {
            nlCUDACheck(CUDA()->cusparseSetMatType(
                Mcuda->descr, CUSPARSE_MATRIX_TYPE_SYMMETRIC)
            );
        } else {
            nlCUDACheck(CUDA()->cusparseSetMatType(
                Mcuda->descr, CUSPARSE_MATRIX_TYPE_GENERAL)
            );
        }
        nlCUDACheck(CUDA()->cusparseSetMatIndexBase(
            Mcuda->descr, CUSPARSE_INDEX_BASE_ZERO)
        );
    }

    return (NLMatrix)Mcuda;
}

/**************************************************************************/

/**
 * Abstract matrix interface for a diagonal matrix stored on the GPU.
 */
typedef struct {
    NLuint m;
    NLuint n;
    NLenum type;
    NLDestroyMatrixFunc destroy_func;
    NLMultMatrixVectorFunc mult_func;
    double* val;
} NLDiagonalMatrixCUDA;

static void nlDiagonalMatrixCUDADestroy(NLDiagonalMatrixCUDA* Mcuda) {
    if(!nlExtensionIsInitialized_CUDA()) {
        return;
    }
    nlCUDACheck(CUDA()->cudaFree(Mcuda->val));
    memset(Mcuda, 0, sizeof(*Mcuda));
}

static void nlDiagonalMatrixCUDAMult(
    NLDiagonalMatrixCUDA* Mcuda, const double* x, double* y
) {
    int N = (int)Mcuda->n;
    /*
     * vector x vector component-wise product implemented
     * using the diagonal matrix x matrix function.
     */
    nlCUDACheck(CUDA()->cublasDdgmm(
        CUDA()->HNDL_cublas, CUBLAS_SIDE_LEFT,
        N, 1,
        x, N,
        Mcuda->val, 1,
        y, N
    ));
    nlCUDABlas()->flops += (NLulong)N;
}

static NLMatrix nlDiagonalMatrixCUDANew(const double* diag, NLuint n) {
    NLDiagonalMatrixCUDA* Mcuda = NL_NEW(NLDiagonalMatrixCUDA);
    Mcuda->m = n;
    Mcuda->n = n;
    Mcuda->type = NL_MATRIX_OTHER;
    nlCUDACheck(CUDA()->cudaMalloc(
        (void**)&Mcuda->val, n*sizeof(double))
    );
    nlCUDACheck(CUDA()->cudaMemcpy(
        Mcuda->val, diag, n*sizeof(double), cudaMemcpyHostToDevice)
    );
    Mcuda->destroy_func = (NLDestroyMatrixFunc)nlDiagonalMatrixCUDADestroy;
    Mcuda->mult_func = (NLMultMatrixVectorFunc)nlDiagonalMatrixCUDAMult;
    return (NLMatrix)Mcuda;
}
NLMatrix nlCUDAJacobiPreconditionerNewFromCRSMatrix(NLMatrix M_in) {
    NLuint N = M_in->n;
    NLuint i;
    NLuint_big jj;
    double* diag = NULL;
    NLMatrix result = NULL;
    NLCRSMatrix* M = (NLCRSMatrix*)(M_in);
    nl_assert(M_in->type == NL_MATRIX_CRS);
    diag = NL_NEW_ARRAY(double,N);
    for(i=0; i<N; ++i) {
        for(jj=M->rowptr[i]; jj<M->rowptr[i+1]; ++jj) {
            if(M->colind[jj] == i) {
                diag[i] = M->val[jj];
            }
        }
    }
    for(i=0; i<N; ++i) {
        diag[i] = ((diag[i] == 0.0) ? 1.0 : 1.0 / diag[i]);
    }
    result = nlDiagonalMatrixCUDANew(diag, N);
    NL_DELETE_ARRAY(diag);
    return result;
}

/**************************************************************************/

static void* cuda_blas_malloc(
    NLBlas_t blas, NLmemoryType type, size_t size
) {
    void* result = NULL;
    blas->used_ram[type] += (NLulong)size;
    blas->max_used_ram[type] = MAX(
        blas->max_used_ram[type],blas->used_ram[type]
    );
    if(type == NL_HOST_MEMORY) {
        result = malloc(size);
    } else {
        nlCUDACheck(CUDA()->cudaMalloc(&result,size));
    }
    return result;
}

static void cuda_blas_free(
    NLBlas_t blas, NLmemoryType type, size_t size, void* ptr
) {
    blas->used_ram[type] -= (NLulong)size;
    if(type == NL_HOST_MEMORY) {
        free(ptr);
    } else {
        nlCUDACheck(CUDA()->cudaFree(ptr));
    }
}

static void cuda_blas_memcpy(
    NLBlas_t blas,
    void* to, NLmemoryType to_type,
    void* from, NLmemoryType from_type,
    size_t size
) {
    enum cudaMemcpyKind kind = cudaMemcpyDefault;
    nl_arg_used(blas);
    if(from_type == NL_HOST_MEMORY) {
        if(to_type == NL_HOST_MEMORY) {
            kind = cudaMemcpyHostToHost;
        } else {
            kind = cudaMemcpyHostToDevice;
        }
    } else {
        if(to_type == NL_HOST_MEMORY) {
            kind = cudaMemcpyDeviceToHost;
        } else {
            kind = cudaMemcpyDeviceToDevice;
        }
    }
    nlCUDACheck(CUDA()->cudaMemcpy(to, from, size, kind));
}
static void cuda_blas_dcopy(
    NLBlas_t blas, int n, const double *x, int incx, double *y, int incy
) {
    nl_arg_used(blas);
    CUDA()->cublasDcopy(CUDA()->HNDL_cublas,n,x,incx,y,incy);
}

static double cuda_blas_ddot(
    NLBlas_t blas, int n, const double *x, int incx, const double *y, int incy
) {
    double result = 0.0;
    blas->flops += (NLulong)(2*n);
    CUDA()->cublasDdot(CUDA()->HNDL_cublas,n,x,incx,y,incy,&result);
    return result;
}

static double cuda_blas_dnrm2(
    NLBlas_t blas, int n, const double *x, int incx
) {
    double result = 0.0;
    blas->flops += (NLulong)(2*n);
    CUDA()->cublasDnrm2(CUDA()->HNDL_cublas,n,x,incx,&result);
    return result;
}

static void cuda_blas_daxpy(
    NLBlas_t blas, int n,
    double a, const double *x, int incx, double *y, int incy
) {
    blas->flops += (NLulong)(2*n);
    CUDA()->cublasDaxpy(CUDA()->HNDL_cublas,n,&a,x,incx,y,incy);
}

static void cuda_blas_dscal(
    NLBlas_t blas, int n, double a, double *x, int incx
) {
    blas->flops += (NLulong)n;
    CUDA()->cublasDscal(CUDA()->HNDL_cublas,n,&a,x,incx);
}

static void cuda_blas_dgemv(
    NLBlas_t blas, MatrixTranspose trans, int m, int n, double alpha,
    const double *A, int ldA, const double *x, int incx,
    double beta, double *y, int incy
) {
    nl_arg_used(blas);
    /* TODO: update FLOPS */
    CUDA()->cublasDgemv(
        CUDA()->HNDL_cublas, (cublasOperation_t)trans,
        m, n, &alpha, A, ldA, x, incx, &beta, y, incy
    );
}

static void cuda_blas_dtpsv(
    NLBlas_t blas, MatrixTriangle uplo, MatrixTranspose trans,
    MatrixUnitTriangular diag, int n, const double *AP,
    double *x, int incx
) {
    nl_arg_used(blas);
    /* TODO: update FLOPS */
    CUDA()->cublasDtpsv(
        CUDA()->HNDL_cublas,
        (cublasFillMode_t)uplo,
        (cublasOperation_t)trans,
        (cublasDiagType_t)diag, n,
        AP, x, incx
    );
}


NLBlas_t nlCUDABlas() {
    static NLboolean initialized = NL_FALSE;
    static struct NLBlas blas;
    if(!initialized) {
        memset(&blas, 0, sizeof(blas));
        blas.has_unified_memory = NL_FALSE;
        blas.Malloc = cuda_blas_malloc;
        blas.Free = cuda_blas_free;
        blas.Memcpy = cuda_blas_memcpy;
        blas.Dcopy = cuda_blas_dcopy;
        blas.Ddot = cuda_blas_ddot;
        blas.Dnrm2 = cuda_blas_dnrm2;
        blas.Daxpy = cuda_blas_daxpy;
        blas.Dscal = cuda_blas_dscal;
        blas.Dgemv = cuda_blas_dgemv;
        blas.Dtpsv = cuda_blas_dtpsv;
        nlBlasResetStats(&blas);
        initialized = NL_TRUE;
    }
    return &blas;
}
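
/*
 * Minimal usage sketch for the abstract BLAS returned above
 * (illustrative only; it assumes that nlInitExtension_CUDA() returned
 * NL_TRUE, and that NL_DEVICE_MEMORY is the non-host NLmemoryType
 * enumerant declared in nl_blas.h):
 *
 *    NLBlas_t blas = nlCUDABlas();
 *    double* x = (double*)blas->Malloc(
 *        blas, NL_DEVICE_MEMORY, n*sizeof(double)
 *    );
 *    blas->Memcpy(
 *        blas, x, NL_DEVICE_MEMORY, x_host, NL_HOST_MEMORY,
 *        n*sizeof(double)
 *    );
 *    double x_norm = blas->Dnrm2(blas, (int)n, x, 1);
 *    blas->Free(blas, NL_DEVICE_MEMORY, n*sizeof(double), x);
 */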


/**************************************************************************/
