1 /* -----------------------------------------------------------------
2 * Programmer(s): Slaven Peles, and Cody J. Balos @ LLNL
3 * -----------------------------------------------------------------
4 * SUNDIALS Copyright Start
5 * Copyright (c) 2002-2021, Lawrence Livermore National Security
6 * and Southern Methodist University.
7 * All rights reserved.
8 *
9 * See the top-level LICENSE and NOTICE files for details.
10 *
11 * SPDX-License-Identifier: BSD-3-Clause
12 * SUNDIALS Copyright End
13 * -----------------------------------------------------------------
14 * This is the implementation file for a CUDA implementation
15 * of the NVECTOR package.
16 * -----------------------------------------------------------------*/
17
18 #include <cstdio>
19 #include <cstdlib>
20 #include <cmath>
21 #include <limits>
22
23 #include <nvector/nvector_cuda.h>
24 #include "VectorArrayKernels.cuh"
25 #include "VectorKernels.cuh"
26
27 #include "sundials_cuda.h"
28 #include "sundials_debug.h"
29
30 #define ZERO RCONST(0.0)
31 #define HALF RCONST(0.5)
32
33 extern "C" {
34
35 using namespace sundials;
36 using namespace sundials::nvector_cuda;
37
38 /*
39 * Macro definitions
40 */
41
42 #define NVEC_CUDA_CONTENT(x) ((N_VectorContent_Cuda)(x->content))
43 #define NVEC_CUDA_PRIVATE(x) ((N_PrivateVectorContent_Cuda)(NVEC_CUDA_CONTENT(x)->priv))
44 #define NVEC_CUDA_MEMSIZE(x) (NVEC_CUDA_CONTENT(x)->length * sizeof(realtype))
45 #define NVEC_CUDA_MEMHELP(x) (NVEC_CUDA_CONTENT(x)->mem_helper)
46 #define NVEC_CUDA_HDATAp(x) ((realtype*) NVEC_CUDA_CONTENT(x)->host_data->ptr)
47 #define NVEC_CUDA_DDATAp(x) ((realtype*) NVEC_CUDA_CONTENT(x)->device_data->ptr)
48 #define NVEC_CUDA_HBUFFERp(x) ((realtype*) NVEC_CUDA_PRIVATE(x)->reduce_buffer_host->ptr)
49 #define NVEC_CUDA_DBUFFERp(x) ((realtype*) NVEC_CUDA_PRIVATE(x)->reduce_buffer_dev->ptr)
50 #define NVEC_CUDA_STREAM(x) (NVEC_CUDA_CONTENT(x)->stream_exec_policy->stream())
51
52
53 /*
54 * Private structure definition
55 */
56
/* File-private portion of the vector content (reached through the
   NVEC_CUDA_PRIVATE macro). Holds the reduction scratch buffers and the
   flag saying whether the vector data lives in managed (UVM) memory. */
struct _N_PrivateVectorContent_Cuda
{
  booleantype use_managed_mem; /* indicates if the data pointers and buffer pointers are managed memory */
  size_t reduce_buffer_allocated_bytes; /* current size of the reduction buffer */
  SUNMemory reduce_buffer_dev; /* device buffer used for reductions */
  SUNMemory reduce_buffer_host; /* host buffer used for reductions */
};

typedef struct _N_PrivateVectorContent_Cuda *N_PrivateVectorContent_Cuda;
66
67 /*
68 * Private function definitions
69 */
70
/* Allocates the vector's data arrays via the attached memory helper
   (definition appears later in this file; not visible in this chunk). */
static int AllocateData(N_Vector v);
/* Prepares the device reduction buffer, seeding it with `value`
   (definition appears later in this file). */
static int InitializeReductionBuffer(N_Vector v, const realtype value);
/* Releases the host and device reduction buffers. */
static void FreeReductionBuffer(N_Vector v);
/* Copies the first n reduction values from device to host buffer. */
static int CopyReductionBufferFromDevice(N_Vector v, size_t n = 1);
/* Computes the launch configuration (grid, block, shared memory, stream)
   from the vector's stream or reduction execution policy. */
static int GetKernelParameters(N_Vector v, booleantype reduction, size_t& grid, size_t& block,
                               size_t& shMemSize, cudaStream_t& stream, size_t n = 0);
/* Called after every kernel launch; presumably checks for launch errors --
   body not visible in this chunk, confirm before relying on it. */
static void PostKernelLaunch();
78
79 /*
80 * Private functions needed for N_VMakeWithManagedAllocator_Cuda
81 * backwards compatibility.
82 */
83
84 /* DEPRECATION NOTICE: The 4 functions below can be removed once
85 N_VMakeWithManagedAllocator_Cuda (deprecated) is removed in the
86 next major release. The UserAllocHelper struct can also be removed. */
87
88 /* Struct that we use to pack up the user
89 provided alloc and free functions. */
typedef struct _UserAllocHelper
{
  void* (*userallocfn)(size_t); /* user-supplied allocator; result is treated as UVM by UserAlloc */
  void (*userfreefn)(void*);    /* user-supplied deallocator matching userallocfn */
} UserAllocHelper;
95
/* SUNMemoryHelper alloc op that routes allocation through the
   user-supplied alloc function packed into helper->content. The result
   is always tagged as managed (UVM) memory owned by the SUNMemory
   object. Returns 0 on success, -1 on any allocation failure. */
static int UserAlloc(SUNMemoryHelper helper, SUNMemory* memptr,
                     size_t memsize, SUNMemoryType mem_type)
{
  UserAllocHelper* ua = (UserAllocHelper*) helper->content;
  SUNMemory mem = SUNMemoryNewEmpty();

  /* Fix: guard against a failed SUNMemory allocation before writing to it */
  if (mem == NULL)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in UserAlloc: SUNMemoryNewEmpty returned NULL\n");
    return(-1);
  }

  mem->type = SUNMEMTYPE_UVM;
  mem->ptr  = ua->userallocfn(memsize);
  mem->own  = SUNTRUE;
  if (mem->ptr == NULL)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in UserAlloc: user provided alloc failed\n");
    free(mem);
    return(-1);
  }

  *memptr = mem;
  return(0);
}
115
/* SUNMemoryHelper dealloc op: releases owned memory through the
   user-supplied free function, then frees the SUNMemory wrapper. */
static int UserDealloc(SUNMemoryHelper helper, SUNMemory mem)
{
  UserAllocHelper* funcs = (UserAllocHelper*) helper->content;

  /* only release data this SUNMemory object actually owns */
  if (mem->own)
  {
    funcs->userfreefn(mem->ptr);
    mem->ptr = NULL;
  }

  free(mem);
  return(0);
}
127
/* SUNMemoryHelper clone op: duplicates the ops table and the packed
   user alloc/free function pointers. Returns NULL on allocation failure. */
static SUNMemoryHelper HelperClone(SUNMemoryHelper helper)
{
  UserAllocHelper* uaclone;
  UserAllocHelper* ua = (UserAllocHelper*) helper->content;
  SUNMemoryHelper hclone = SUNMemoryHelper_NewEmpty();

  /* Fix: check the empty-helper allocation before using it */
  if (hclone == NULL) return(NULL);

  SUNMemoryHelper_CopyOps(helper, hclone);

  uaclone = (UserAllocHelper*) malloc(sizeof(UserAllocHelper));
  /* Fix: check the content allocation; clean up the partial clone on failure */
  if (uaclone == NULL)
  {
    SUNMemoryHelper_Destroy(hclone);
    return(NULL);
  }
  uaclone->userallocfn = ua->userallocfn;
  uaclone->userfreefn  = ua->userfreefn;

  hclone->content = uaclone;

  return(hclone);
}
144
/* SUNMemoryHelper destroy op: frees the packed user functions, the ops
   table, and finally the helper structure itself. Always succeeds. */
static int HelperDestroy(SUNMemoryHelper helper)
{
  free(helper->content);
  helper->content = NULL;

  free(helper->ops);
  free(helper);

  return(0);
}
153
/* ----------------------------------------------------------------------------
 * Create a new CUDA vector with the ops table attached and the content
 * structures allocated, but with no data arrays, memory helper, or
 * execution policies. Returns NULL if any allocation fails.
 */

N_Vector N_VNewEmpty_Cuda()
{
  N_Vector v;

  /* Create vector */
  v = NULL;
  v = N_VNewEmpty();
  if (v == NULL) return(NULL);

  /* Attach operations */

  /* constructors, destructors, and utility operations */
  v->ops->nvgetvectorid = N_VGetVectorID_Cuda;
  v->ops->nvclone = N_VClone_Cuda;
  v->ops->nvcloneempty = N_VCloneEmpty_Cuda;
  v->ops->nvdestroy = N_VDestroy_Cuda;
  v->ops->nvspace = N_VSpace_Cuda;
  v->ops->nvgetlength = N_VGetLength_Cuda;
  v->ops->nvgetarraypointer = N_VGetHostArrayPointer_Cuda;
  v->ops->nvgetdevicearraypointer = N_VGetDeviceArrayPointer_Cuda;
  v->ops->nvsetarraypointer = N_VSetHostArrayPointer_Cuda;

  /* standard vector operations */
  v->ops->nvlinearsum = N_VLinearSum_Cuda;
  v->ops->nvconst = N_VConst_Cuda;
  v->ops->nvprod = N_VProd_Cuda;
  v->ops->nvdiv = N_VDiv_Cuda;
  v->ops->nvscale = N_VScale_Cuda;
  v->ops->nvabs = N_VAbs_Cuda;
  v->ops->nvinv = N_VInv_Cuda;
  v->ops->nvaddconst = N_VAddConst_Cuda;
  v->ops->nvdotprod = N_VDotProd_Cuda;
  v->ops->nvmaxnorm = N_VMaxNorm_Cuda;
  v->ops->nvmin = N_VMin_Cuda;
  v->ops->nvl1norm = N_VL1Norm_Cuda;
  v->ops->nvinvtest = N_VInvTest_Cuda;
  v->ops->nvconstrmask = N_VConstrMask_Cuda;
  v->ops->nvminquotient = N_VMinQuotient_Cuda;
  v->ops->nvwrmsnormmask = N_VWrmsNormMask_Cuda;
  v->ops->nvwrmsnorm = N_VWrmsNorm_Cuda;
  v->ops->nvwl2norm = N_VWL2Norm_Cuda;
  v->ops->nvcompare = N_VCompare_Cuda;

  /* fused and vector array operations are disabled (NULL) by default */

  /* local reduction operations: this vector is not MPI-distributed, so
     the "local" reductions are the same as the global ones */
  v->ops->nvdotprodlocal = N_VDotProd_Cuda;
  v->ops->nvmaxnormlocal = N_VMaxNorm_Cuda;
  v->ops->nvminlocal = N_VMin_Cuda;
  v->ops->nvl1normlocal = N_VL1Norm_Cuda;
  v->ops->nvinvtestlocal = N_VInvTest_Cuda;
  v->ops->nvconstrmasklocal = N_VConstrMask_Cuda;
  v->ops->nvminquotientlocal = N_VMinQuotient_Cuda;
  v->ops->nvwsqrsumlocal = N_VWSqrSumLocal_Cuda;
  v->ops->nvwsqrsummasklocal = N_VWSqrSumMaskLocal_Cuda;

  /* XBraid interface operations */
  v->ops->nvbufsize = N_VBufSize_Cuda;
  v->ops->nvbufpack = N_VBufPack_Cuda;
  v->ops->nvbufunpack = N_VBufUnpack_Cuda;

  /* print operation for debugging */
  v->ops->nvprint = N_VPrint_Cuda;
  v->ops->nvprintfile = N_VPrintFile_Cuda;

  /* Create content */

  v->content = (N_VectorContent_Cuda) malloc(sizeof(_N_VectorContent_Cuda));
  if (v->content == NULL)
  {
    N_VDestroy(v);
    return(NULL);
  }

  /* private content holds the reduction buffers and managed-memory flag */
  NVEC_CUDA_CONTENT(v)->priv = malloc(sizeof(_N_PrivateVectorContent_Cuda));
  if (NVEC_CUDA_CONTENT(v)->priv == NULL)
  {
    N_VDestroy(v);
    return(NULL);
  }

  /* initialize all content fields to empty/default values; concrete
     constructors fill in data, helpers, and execution policies */
  NVEC_CUDA_CONTENT(v)->length = 0;
  NVEC_CUDA_CONTENT(v)->host_data = NULL;
  NVEC_CUDA_CONTENT(v)->device_data = NULL;
  NVEC_CUDA_CONTENT(v)->stream_exec_policy = NULL;
  NVEC_CUDA_CONTENT(v)->reduce_exec_policy = NULL;
  NVEC_CUDA_CONTENT(v)->mem_helper = NULL;
  NVEC_CUDA_CONTENT(v)->own_helper = SUNFALSE;
  NVEC_CUDA_CONTENT(v)->own_exec = SUNTRUE;
  NVEC_CUDA_PRIVATE(v)->use_managed_mem = SUNFALSE;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_dev = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_host = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_allocated_bytes = 0;

  return(v);
}
250
/* ----------------------------------------------------------------------------
 * Create a CUDA vector of the given length with separate host and device
 * data arrays allocated through the default CUDA memory helper. The vector
 * owns its helper and its (default, 256-thread) execution policies.
 * Returns NULL on failure.
 */

N_Vector N_VNew_Cuda(sunindextype length)
{
  N_Vector v;

  v = NULL;
  v = N_VNewEmpty_Cuda();
  if (v == NULL) return(NULL);

  NVEC_CUDA_CONTENT(v)->length = length;
  NVEC_CUDA_CONTENT(v)->host_data = NULL;
  NVEC_CUDA_CONTENT(v)->device_data = NULL;
  NVEC_CUDA_CONTENT(v)->mem_helper = SUNMemoryHelper_Cuda();
  NVEC_CUDA_CONTENT(v)->stream_exec_policy = new CudaThreadDirectExecPolicy(256);
  NVEC_CUDA_CONTENT(v)->reduce_exec_policy = new CudaBlockReduceExecPolicy(256);
  NVEC_CUDA_CONTENT(v)->own_helper = SUNTRUE;
  NVEC_CUDA_CONTENT(v)->own_exec = SUNTRUE;
  NVEC_CUDA_PRIVATE(v)->use_managed_mem = SUNFALSE;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_dev = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_host = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_allocated_bytes = 0;

  /* the default memory helper could not be created */
  if (NVEC_CUDA_MEMHELP(v) == NULL)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VNew_Cuda: memory helper is NULL\n");
    N_VDestroy(v);
    return(NULL);
  }

  /* allocate the host and device data arrays */
  if (AllocateData(v))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VNew_Cuda: AllocateData returned nonzero\n");
    N_VDestroy(v);
    return(NULL);
  }

  return(v);
}
288
/* ----------------------------------------------------------------------------
 * Create a CUDA vector of the given length that allocates its data through
 * a caller-supplied memory helper. The helper is borrowed (own_helper is
 * SUNFALSE), so the caller remains responsible for destroying it.
 * use_managed_mem indicates whether the helper provides managed memory.
 * Returns NULL on failure.
 */

N_Vector N_VNewWithMemHelp_Cuda(sunindextype length, booleantype use_managed_mem, SUNMemoryHelper helper)
{
  N_Vector v;

  if (helper == NULL)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VNewWithMemHelp_Cuda: helper is NULL\n");
    return(NULL);
  }

  /* the helper must provide alloc/dealloc/copy (and friends) for this
     vector to function */
  if (!SUNMemoryHelper_ImplementsRequiredOps(helper))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VNewWithMemHelp_Cuda: helper doesn't implement all required ops\n");
    return(NULL);
  }

  v = NULL;
  v = N_VNewEmpty_Cuda();
  if (v == NULL) return(NULL);

  NVEC_CUDA_CONTENT(v)->length = length;
  NVEC_CUDA_CONTENT(v)->host_data = NULL;
  NVEC_CUDA_CONTENT(v)->device_data = NULL;
  NVEC_CUDA_CONTENT(v)->mem_helper = helper;
  NVEC_CUDA_CONTENT(v)->stream_exec_policy = new CudaThreadDirectExecPolicy(256);
  NVEC_CUDA_CONTENT(v)->reduce_exec_policy = new CudaBlockReduceExecPolicy(256);
  NVEC_CUDA_CONTENT(v)->own_helper = SUNFALSE;
  NVEC_CUDA_CONTENT(v)->own_exec = SUNTRUE;
  NVEC_CUDA_PRIVATE(v)->use_managed_mem = use_managed_mem;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_dev = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_host = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_allocated_bytes = 0;

  if (AllocateData(v))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VNewWithMemHelp_Cuda: AllocateData returned nonzero\n");
    N_VDestroy(v);
    return(NULL);
  }

  return(v);
}
331
/* ----------------------------------------------------------------------------
 * Create a CUDA vector of the given length backed by managed (UVM) memory
 * allocated through the default CUDA memory helper, so host and device
 * share one data array. Returns NULL on failure.
 */

N_Vector N_VNewManaged_Cuda(sunindextype length)
{
  N_Vector v;

  v = NULL;
  v = N_VNewEmpty_Cuda();
  if (v == NULL) return(NULL);

  NVEC_CUDA_CONTENT(v)->length = length;
  NVEC_CUDA_CONTENT(v)->host_data = NULL;
  NVEC_CUDA_CONTENT(v)->device_data = NULL;
  NVEC_CUDA_CONTENT(v)->stream_exec_policy = new CudaThreadDirectExecPolicy(256);
  NVEC_CUDA_CONTENT(v)->reduce_exec_policy = new CudaBlockReduceExecPolicy(256);
  NVEC_CUDA_CONTENT(v)->mem_helper = SUNMemoryHelper_Cuda();
  NVEC_CUDA_CONTENT(v)->own_helper = SUNTRUE;
  NVEC_CUDA_CONTENT(v)->own_exec = SUNTRUE;
  NVEC_CUDA_PRIVATE(v)->use_managed_mem = SUNTRUE;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_dev = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_host = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_allocated_bytes = 0;

  if (NVEC_CUDA_MEMHELP(v) == NULL)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VNewManaged_Cuda: memory helper is NULL\n");
    N_VDestroy(v);
    return(NULL);
  }

  if (AllocateData(v))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VNewManaged_Cuda: AllocateData returned nonzero\n");
    N_VDestroy(v);
    return(NULL);
  }

  return(v);
}
369
/* ----------------------------------------------------------------------------
 * Create a CUDA vector that wraps existing, caller-owned host and device
 * data arrays (the wrapped pointers are not freed on destroy). Both
 * pointers must be non-NULL. Returns NULL on failure.
 */

N_Vector N_VMake_Cuda(sunindextype length, realtype *h_vdata, realtype *d_vdata)
{
  N_Vector v;

  if (h_vdata == NULL || d_vdata == NULL) return(NULL);

  v = NULL;
  v = N_VNewEmpty_Cuda();
  if (v == NULL) return(NULL);

  /* wrap (do not copy) the user's pointers as unowned SUNMemory objects */
  NVEC_CUDA_CONTENT(v)->length = length;
  NVEC_CUDA_CONTENT(v)->host_data = SUNMemoryHelper_Wrap(h_vdata, SUNMEMTYPE_HOST);
  NVEC_CUDA_CONTENT(v)->device_data = SUNMemoryHelper_Wrap(d_vdata, SUNMEMTYPE_DEVICE);
  NVEC_CUDA_CONTENT(v)->stream_exec_policy = new CudaThreadDirectExecPolicy(256);
  NVEC_CUDA_CONTENT(v)->reduce_exec_policy = new CudaBlockReduceExecPolicy(256);
  NVEC_CUDA_CONTENT(v)->mem_helper = SUNMemoryHelper_Cuda();
  NVEC_CUDA_CONTENT(v)->own_helper = SUNTRUE;
  NVEC_CUDA_CONTENT(v)->own_exec = SUNTRUE;
  NVEC_CUDA_PRIVATE(v)->use_managed_mem = SUNFALSE;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_dev = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_host = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_allocated_bytes = 0;

  if (NVEC_CUDA_MEMHELP(v) == NULL)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VMake_Cuda: memory helper is NULL\n");
    N_VDestroy(v);
    return(NULL);
  }

  if (NVEC_CUDA_CONTENT(v)->device_data == NULL ||
      NVEC_CUDA_CONTENT(v)->host_data == NULL)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VMake_Cuda: SUNMemoryHelper_Wrap returned NULL\n");
    N_VDestroy(v);
    return(NULL);
  }

  return(v);
}
410
/* ----------------------------------------------------------------------------
 * Create a CUDA vector that wraps an existing, caller-owned managed (UVM)
 * data array; the device view aliases the host view. The wrapped pointer
 * is not freed on destroy. Returns NULL on failure.
 */

N_Vector N_VMakeManaged_Cuda(sunindextype length, realtype *vdata)
{
  N_Vector v;

  if (vdata == NULL) return(NULL);

  v = NULL;
  v = N_VNewEmpty_Cuda();
  if (v == NULL) return(NULL);

  /* wrap the managed pointer once and alias it for the device view */
  NVEC_CUDA_CONTENT(v)->length = length;
  NVEC_CUDA_CONTENT(v)->host_data = SUNMemoryHelper_Wrap(vdata, SUNMEMTYPE_UVM);
  NVEC_CUDA_CONTENT(v)->device_data = SUNMemoryHelper_Alias(NVEC_CUDA_CONTENT(v)->host_data);
  NVEC_CUDA_CONTENT(v)->stream_exec_policy = new CudaThreadDirectExecPolicy(256);
  NVEC_CUDA_CONTENT(v)->reduce_exec_policy = new CudaBlockReduceExecPolicy(256);
  NVEC_CUDA_CONTENT(v)->mem_helper = SUNMemoryHelper_Cuda();
  NVEC_CUDA_CONTENT(v)->own_helper = SUNTRUE;
  NVEC_CUDA_CONTENT(v)->own_exec = SUNTRUE;
  NVEC_CUDA_PRIVATE(v)->use_managed_mem = SUNTRUE;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_dev = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_host = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_allocated_bytes = 0;

  if (NVEC_CUDA_MEMHELP(v) == NULL)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VMakeManaged_Cuda: memory helper is NULL\n");
    N_VDestroy(v);
    return(NULL);
  }

  if (NVEC_CUDA_CONTENT(v)->device_data == NULL ||
      NVEC_CUDA_CONTENT(v)->host_data == NULL)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VMakeManaged_Cuda: SUNMemoryHelper_Wrap returned NULL\n");
    N_VDestroy(v);
    return(NULL);
  }

  return(v);
}
451
/* ----------------------------------------------------------------------------
 * DEPRECATED (see notice above): create a managed-memory CUDA vector whose
 * data is allocated/freed through user-supplied functions. The default
 * CUDA memory helper is rewired to call the user's functions.
 * Returns NULL on failure.
 */

N_Vector N_VMakeWithManagedAllocator_Cuda(sunindextype length,
                                          void* (*allocfn)(size_t),
                                          void (*freefn)(void*))
{
  UserAllocHelper* ua;
  N_Vector v;

  v = NULL;
  v = N_VNewEmpty_Cuda();
  if (v == NULL) return(NULL);

  NVEC_CUDA_CONTENT(v)->length = length;
  NVEC_CUDA_CONTENT(v)->host_data = NULL;
  NVEC_CUDA_CONTENT(v)->device_data = NULL;
  NVEC_CUDA_CONTENT(v)->stream_exec_policy = new CudaThreadDirectExecPolicy(256);
  NVEC_CUDA_CONTENT(v)->reduce_exec_policy = new CudaBlockReduceExecPolicy(256);
  NVEC_CUDA_CONTENT(v)->mem_helper = SUNMemoryHelper_Cuda();
  NVEC_CUDA_CONTENT(v)->own_helper = SUNTRUE;
  NVEC_CUDA_CONTENT(v)->own_exec = SUNTRUE;
  NVEC_CUDA_PRIVATE(v)->use_managed_mem = SUNTRUE;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_dev = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_host = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_allocated_bytes = 0;

  if (NVEC_CUDA_MEMHELP(v) == NULL)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VMakeWithManagedAllocator_Cuda: memory helper is NULL\n");
    N_VDestroy(v);
    return(NULL);
  }

  ua = (UserAllocHelper*) malloc(sizeof(UserAllocHelper));
  /* Fix: check the allocation before wiring it into the helper */
  if (ua == NULL)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VMakeWithManagedAllocator_Cuda: malloc returned NULL\n");
    N_VDestroy(v);
    return(NULL);
  }
  ua->userallocfn = allocfn;
  ua->userfreefn  = freefn;

  /* replace the helper's ops so all data goes through the user functions */
  NVEC_CUDA_MEMHELP(v)->content      = (void*) ua;
  NVEC_CUDA_MEMHELP(v)->ops->alloc   = UserAlloc;
  NVEC_CUDA_MEMHELP(v)->ops->dealloc = UserDealloc;
  NVEC_CUDA_MEMHELP(v)->ops->clone   = HelperClone;
  NVEC_CUDA_MEMHELP(v)->ops->destroy = HelperDestroy;

  if (AllocateData(v))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VMakeWithManagedAllocator_Cuda: AllocateData returned nonzero\n");
    N_VDestroy(v);
    return(NULL);
  }

  return(v);
}
501
502 /* ----------------------------------------------------------------------------
503 * Set pointer to the raw host data. Does not free the existing pointer.
504 */
505
void N_VSetHostArrayPointer_Cuda(realtype* h_vdata, N_Vector v)
{
  void* data = (void*) h_vdata;

  if (N_VIsManagedMemory_Cuda(v))
  {
    /* managed memory: host and device views must track the same pointer */
    if (NVEC_CUDA_CONTENT(v)->host_data)
    {
      NVEC_CUDA_CONTENT(v)->host_data->ptr   = data;
      NVEC_CUDA_CONTENT(v)->device_data->ptr = data;
    }
    else
    {
      NVEC_CUDA_CONTENT(v)->host_data   = SUNMemoryHelper_Wrap(data, SUNMEMTYPE_UVM);
      NVEC_CUDA_CONTENT(v)->device_data = SUNMemoryHelper_Alias(NVEC_CUDA_CONTENT(v)->host_data);
    }
  }
  else if (NVEC_CUDA_CONTENT(v)->host_data)
  {
    /* unmanaged memory: only the host view changes */
    NVEC_CUDA_CONTENT(v)->host_data->ptr = data;
  }
  else
  {
    NVEC_CUDA_CONTENT(v)->host_data = SUNMemoryHelper_Wrap(data, SUNMEMTYPE_HOST);
  }
}
533
534 /* ----------------------------------------------------------------------------
535 * Set pointer to the raw device data
536 */
537
void N_VSetDeviceArrayPointer_Cuda(realtype* d_vdata, N_Vector v)
{
  void* data = (void*) d_vdata;

  if (N_VIsManagedMemory_Cuda(v))
  {
    /* managed memory: device and host views must track the same pointer */
    if (NVEC_CUDA_CONTENT(v)->device_data)
    {
      NVEC_CUDA_CONTENT(v)->device_data->ptr = data;
      NVEC_CUDA_CONTENT(v)->host_data->ptr   = data;
    }
    else
    {
      NVEC_CUDA_CONTENT(v)->device_data = SUNMemoryHelper_Wrap(data, SUNMEMTYPE_UVM);
      NVEC_CUDA_CONTENT(v)->host_data   = SUNMemoryHelper_Alias(NVEC_CUDA_CONTENT(v)->device_data);
    }
  }
  else if (NVEC_CUDA_CONTENT(v)->device_data)
  {
    /* unmanaged memory: only the device view changes */
    NVEC_CUDA_CONTENT(v)->device_data->ptr = data;
  }
  else
  {
    NVEC_CUDA_CONTENT(v)->device_data = SUNMemoryHelper_Wrap(data, SUNMEMTYPE_DEVICE);
  }
}
565
566 /* ----------------------------------------------------------------------------
567 * Return a flag indicating if the memory for the vector data is managed
568 */
booleantype N_VIsManagedMemory_Cuda(N_Vector x)
{
  /* the flag is stored in the file-private part of the content */
  return(NVEC_CUDA_PRIVATE(x)->use_managed_mem);
}
573
/* Install caller-owned execution policies for streaming and reduction
   kernels. Any policies the vector currently owns are deleted first;
   afterwards the vector does NOT own the new policies (the caller must
   keep them alive and free them). Returns 0 on success, -1 on NULL input. */
int N_VSetKernelExecPolicy_Cuda(N_Vector x,
                                SUNCudaExecPolicy* stream_exec_policy,
                                SUNCudaExecPolicy* reduce_exec_policy)
{
  if (!x || !stream_exec_policy || !reduce_exec_policy) return(-1);

  /* discard policies this vector allocated itself */
  if (NVEC_CUDA_CONTENT(x)->own_exec)
  {
    delete NVEC_CUDA_CONTENT(x)->stream_exec_policy;
    delete NVEC_CUDA_CONTENT(x)->reduce_exec_policy;
  }

  NVEC_CUDA_CONTENT(x)->stream_exec_policy = stream_exec_policy;
  NVEC_CUDA_CONTENT(x)->reduce_exec_policy = reduce_exec_policy;
  NVEC_CUDA_CONTENT(x)->own_exec           = SUNFALSE;

  return(0);
}
593
594 /*
595 * ----------------------------------------------------------------------------
596 * DEPRECATED: will be removed in SUNDIALS v6.
597 * Sets the cudaStream_t to use for execution of the CUDA kernels.
598 */
void N_VSetCudaStream_Cuda(N_Vector x, cudaStream_t *stream)
{
  const CudaExecPolicy* xs = NVEC_CUDA_CONTENT(x)->stream_exec_policy;
  const CudaExecPolicy* xr = NVEC_CUDA_CONTENT(x)->reduce_exec_policy;
  /* rebuild the current policies with the same block/grid shape but the
     new stream */
  CudaThreadDirectExecPolicy* s =
    new CudaThreadDirectExecPolicy(xs->blockSize(), *stream);
  CudaBlockReduceExecPolicy* r =
    new CudaBlockReduceExecPolicy(xr->blockSize(), xr->gridSize(), *stream);
  N_VSetKernelExecPolicy_Cuda(x, s, r);
  /* N_VSetKernelExecPolicy_Cuda marks the policies caller-owned; since
     they were allocated here, restore ownership so the vector deletes
     them on destroy */
  NVEC_CUDA_CONTENT(x)->own_exec = SUNTRUE;
}
610
611 /* ----------------------------------------------------------------------------
612 * Copy vector data to the device
613 */
614
void N_VCopyToDevice_Cuda(N_Vector x)
{
  /* asynchronous host -> device copy on this vector's stream */
  int copy_fail = SUNMemoryHelper_CopyAsync(NVEC_CUDA_MEMHELP(x),
                                            NVEC_CUDA_CONTENT(x)->device_data,
                                            NVEC_CUDA_CONTENT(x)->host_data,
                                            NVEC_CUDA_MEMSIZE(x),
                                            (void*) NVEC_CUDA_STREAM(x));

  if (copy_fail)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VCopyToDevice_Cuda: SUNMemoryHelper_CopyAsync returned nonzero\n");
  }

  /* we synchronize with respect to the host, but only in this stream */
  SUNDIALS_CUDA_VERIFY(cudaStreamSynchronize(*NVEC_CUDA_STREAM(x)));
}
633
634 /* ----------------------------------------------------------------------------
635 * Copy vector data from the device to the host
636 */
637
void N_VCopyFromDevice_Cuda(N_Vector x)
{
  /* asynchronous device -> host copy on this vector's stream */
  int copy_fail = SUNMemoryHelper_CopyAsync(NVEC_CUDA_MEMHELP(x),
                                            NVEC_CUDA_CONTENT(x)->host_data,
                                            NVEC_CUDA_CONTENT(x)->device_data,
                                            NVEC_CUDA_MEMSIZE(x),
                                            (void*) NVEC_CUDA_STREAM(x));

  if (copy_fail)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VCopyFromDevice_Cuda: SUNMemoryHelper_CopyAsync returned nonzero\n");
  }

  /* we synchronize with respect to the host, but only in this stream */
  SUNDIALS_CUDA_VERIFY(cudaStreamSynchronize(*NVEC_CUDA_STREAM(x)));
}
656
657 /* ----------------------------------------------------------------------------
 * Function to print a CUDA-based vector to stdout
659 */
660
void N_VPrint_Cuda(N_Vector x)
{
  /* delegate to the file-based printer, targeting stdout */
  N_VPrintFile_Cuda(x, stdout);
}
665
666 /* ----------------------------------------------------------------------------
 * Function to print a CUDA-based vector to outfile
668 */
669
void N_VPrintFile_Cuda(N_Vector x, FILE *outfile)
{
  const sunindextype len   = NVEC_CUDA_CONTENT(x)->length;
  const realtype*    hdata = NVEC_CUDA_HDATAp(x);

  /* NOTE(review): prints the HOST copy only; callers presumably invoke
     N_VCopyFromDevice_Cuda first if the device copy is newer */
  for (sunindextype i = 0; i < len; i++)
  {
#if defined(SUNDIALS_EXTENDED_PRECISION)
    fprintf(outfile, "%35.32Lg\n", hdata[i]);
#elif defined(SUNDIALS_DOUBLE_PRECISION)
    fprintf(outfile, "%19.16g\n", hdata[i]);
#else
    fprintf(outfile, "%11.8g\n", hdata[i]);
#endif
  }
  fprintf(outfile, "\n");
}
687
688
689 /*
690 * -----------------------------------------------------------------
691 * implementation of vector operations
692 * -----------------------------------------------------------------
693 */
694
/* Create a vector with the same ops, length, and managed-memory setting
 * as w, but without data arrays, memory helper, or execution policies.
 * Returns NULL on failure. */
N_Vector N_VCloneEmpty_Cuda(N_Vector w)
{
  N_Vector v;

  if (w == NULL) return(NULL);

  /* Create vector */
  v = NULL;
  v = N_VNewEmpty_Cuda();
  if (v == NULL) return(NULL);

  /* Attach operations */
  if (N_VCopyOps(w, v)) { N_VDestroy(v); return(NULL); }

  /* Set content */
  NVEC_CUDA_CONTENT(v)->length = NVEC_CUDA_CONTENT(w)->length;
  NVEC_CUDA_CONTENT(v)->host_data = NULL;
  NVEC_CUDA_CONTENT(v)->device_data = NULL;
  NVEC_CUDA_CONTENT(v)->mem_helper = NULL;
  NVEC_CUDA_CONTENT(v)->own_exec = SUNTRUE;
  NVEC_CUDA_PRIVATE(v)->use_managed_mem = NVEC_CUDA_PRIVATE(w)->use_managed_mem;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_dev = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_host = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_allocated_bytes = 0;

  return(v);
}
722
/* Create a full clone of w: same ops, length, and managed-memory setting,
 * with a cloned memory helper, cloned execution policies, and freshly
 * allocated (uninitialized) data arrays. Returns NULL on failure. */
N_Vector N_VClone_Cuda(N_Vector w)
{
  N_Vector v;

  v = NULL;
  v = N_VCloneEmpty_Cuda(w);
  if (v == NULL) return(NULL);

  /* the clone owns its own copies of the helper and exec policies */
  NVEC_CUDA_MEMHELP(v) = SUNMemoryHelper_Clone(NVEC_CUDA_MEMHELP(w));
  NVEC_CUDA_CONTENT(v)->own_helper = SUNTRUE;
  NVEC_CUDA_CONTENT(v)->stream_exec_policy = NVEC_CUDA_CONTENT(w)->stream_exec_policy->clone();
  NVEC_CUDA_CONTENT(v)->reduce_exec_policy = NVEC_CUDA_CONTENT(w)->reduce_exec_policy->clone();

  if (NVEC_CUDA_MEMHELP(v) == NULL)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VClone_Cuda: SUNMemoryHelper_Clone returned NULL\n");
    N_VDestroy(v);
    return(NULL);
  }

  if (AllocateData(v))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VClone_Cuda: AllocateData returned nonzero\n");
    N_VDestroy(v);
    return(NULL);
  }

  return(v);
}
752
/* Destroy a CUDA vector, tolerating partially-constructed vectors
 * (NULL content/priv/helper) so it can be used in constructor error
 * paths. Frees, in order: ops, private content (reduction buffers),
 * owned exec policies, data via the memory helper, the helper itself
 * (if owned), the content, and finally the vector. */
void N_VDestroy_Cuda(N_Vector v)
{
  N_VectorContent_Cuda vc;
  N_PrivateVectorContent_Cuda vcp;

  if (v == NULL) return;

  /* free ops structure */
  if (v->ops != NULL)
  {
    free(v->ops);
    v->ops = NULL;
  }

  /* extract content */
  vc = NVEC_CUDA_CONTENT(v);
  if (vc == NULL)
  {
    /* nothing else was allocated; free the shell and stop */
    free(v);
    v = NULL;
    return;
  }

  /* free private content */
  vcp = (N_PrivateVectorContent_Cuda) vc->priv;
  if (vcp != NULL)
  {
    /* free items in private content */
    FreeReductionBuffer(v);
    free(vcp);
    vc->priv = NULL;
  }

  /* free items in content: exec policies only if this vector owns them */
  if (vc->own_exec)
  {
    delete vc->stream_exec_policy;
    vc->stream_exec_policy = NULL;
    delete vc->reduce_exec_policy;
    vc->reduce_exec_policy = NULL;
  }

  /* data must be released through the helper before the helper itself
     is destroyed */
  if (NVEC_CUDA_MEMHELP(v))
  {
    SUNMemoryHelper_Dealloc(NVEC_CUDA_MEMHELP(v), vc->host_data);
    vc->host_data = NULL;
    SUNMemoryHelper_Dealloc(NVEC_CUDA_MEMHELP(v), vc->device_data);
    vc->device_data = NULL;
    if (vc->own_helper) SUNMemoryHelper_Destroy(vc->mem_helper);
    vc->mem_helper = NULL;
  }

  /* free content struct */
  free(vc);

  /* free vector */
  free(v);

  return;
}
813
/* Report workspace sizes: lrw realtype words (the vector length) and
   liw integer words. */
void N_VSpace_Cuda(N_Vector X, sunindextype *lrw, sunindextype *liw)
{
  *lrw = NVEC_CUDA_CONTENT(X)->length;
  *liw = 2;
}
819
/* z_i = a for all i */
void N_VConst_Cuda(realtype a, N_Vector X)
{
  size_t nblk, nthr, smem;
  cudaStream_t strm;

  GetKernelParameters(X, false, nblk, nthr, smem, strm);
  setConstKernel<<<nblk, nthr, smem, strm>>>
  (
    a, NVEC_CUDA_DDATAp(X), NVEC_CUDA_CONTENT(X)->length
  );
  PostKernelLaunch();
}
834
/* z_i = a*x_i + b*y_i */
void N_VLinearSum_Cuda(realtype a, N_Vector X, realtype b, N_Vector Y, N_Vector Z)
{
  size_t nblk, nthr, smem;
  cudaStream_t strm;

  GetKernelParameters(X, false, nblk, nthr, smem, strm);
  linearSumKernel<<<nblk, nthr, smem, strm>>>
  (
    a, NVEC_CUDA_DDATAp(X), b, NVEC_CUDA_DDATAp(Y), NVEC_CUDA_DDATAp(Z),
    NVEC_CUDA_CONTENT(X)->length
  );
  PostKernelLaunch();
}
852
/* z_i = x_i * y_i */
void N_VProd_Cuda(N_Vector X, N_Vector Y, N_Vector Z)
{
  size_t nblk, nthr, smem;
  cudaStream_t strm;

  GetKernelParameters(X, false, nblk, nthr, smem, strm);
  prodKernel<<<nblk, nthr, smem, strm>>>
  (
    NVEC_CUDA_DDATAp(X), NVEC_CUDA_DDATAp(Y), NVEC_CUDA_DDATAp(Z),
    NVEC_CUDA_CONTENT(X)->length
  );
  PostKernelLaunch();
}
868
/* z_i = x_i / y_i (no guard against zero denominators) */
void N_VDiv_Cuda(N_Vector X, N_Vector Y, N_Vector Z)
{
  size_t nblk, nthr, smem;
  cudaStream_t strm;

  GetKernelParameters(X, false, nblk, nthr, smem, strm);
  divKernel<<<nblk, nthr, smem, strm>>>
  (
    NVEC_CUDA_DDATAp(X), NVEC_CUDA_DDATAp(Y), NVEC_CUDA_DDATAp(Z),
    NVEC_CUDA_CONTENT(X)->length
  );
  PostKernelLaunch();
}
884
/* z_i = a * x_i */
void N_VScale_Cuda(realtype a, N_Vector X, N_Vector Z)
{
  size_t nblk, nthr, smem;
  cudaStream_t strm;

  GetKernelParameters(X, false, nblk, nthr, smem, strm);
  scaleKernel<<<nblk, nthr, smem, strm>>>
  (
    a, NVEC_CUDA_DDATAp(X), NVEC_CUDA_DDATAp(Z),
    NVEC_CUDA_CONTENT(X)->length
  );
  PostKernelLaunch();
}
900
/* z_i = |x_i| */
void N_VAbs_Cuda(N_Vector X, N_Vector Z)
{
  size_t nblk, nthr, smem;
  cudaStream_t strm;

  GetKernelParameters(X, false, nblk, nthr, smem, strm);
  absKernel<<<nblk, nthr, smem, strm>>>
  (
    NVEC_CUDA_DDATAp(X), NVEC_CUDA_DDATAp(Z),
    NVEC_CUDA_CONTENT(X)->length
  );
  PostKernelLaunch();
}
915
/* z_i = 1 / x_i (no guard against zero entries; see N_VInvTest_Cuda) */
void N_VInv_Cuda(N_Vector X, N_Vector Z)
{
  size_t nblk, nthr, smem;
  cudaStream_t strm;

  GetKernelParameters(X, false, nblk, nthr, smem, strm);
  invKernel<<<nblk, nthr, smem, strm>>>
  (
    NVEC_CUDA_DDATAp(X), NVEC_CUDA_DDATAp(Z),
    NVEC_CUDA_CONTENT(X)->length
  );
  PostKernelLaunch();
}
930
/* z_i = x_i + b */
void N_VAddConst_Cuda(N_Vector X, realtype b, N_Vector Z)
{
  size_t nblk, nthr, smem;
  cudaStream_t strm;

  GetKernelParameters(X, false, nblk, nthr, smem, strm);
  addConstKernel<<<nblk, nthr, smem, strm>>>
  (
    b, NVEC_CUDA_DDATAp(X), NVEC_CUDA_DDATAp(Z),
    NVEC_CUDA_CONTENT(X)->length
  );
  PostKernelLaunch();
}
946
/* Return sum_i x_i * y_i. Blocks the host until the result is available. */
realtype N_VDotProd_Cuda(N_Vector X, N_Vector Y)
{
  size_t nblk, nthr, smem;
  cudaStream_t strm;

  /* zero the device-side accumulator before the reduction launch */
  if (InitializeReductionBuffer(X, ZERO))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VDotProd_Cuda: InitializeReductionBuffer returned nonzero\n");
  }

  GetKernelParameters(X, true, nblk, nthr, smem, strm);
  dotProdKernel<realtype, sunindextype><<<nblk, nthr, smem, strm>>>
  (
    NVEC_CUDA_DDATAp(X), NVEC_CUDA_DDATAp(Y), NVEC_CUDA_DBUFFERp(X),
    NVEC_CUDA_CONTENT(X)->length
  );
  PostKernelLaunch();

  /* bring the scalar result back to the host buffer */
  CopyReductionBufferFromDevice(X);
  return NVEC_CUDA_HBUFFERp(X)[0];
}
973
/* Return max_i |x_i|. Blocks the host until the result is available. */
realtype N_VMaxNorm_Cuda(N_Vector X)
{
  size_t nblk, nthr, smem;
  cudaStream_t strm;

  /* zero the device-side accumulator before the reduction launch */
  if (InitializeReductionBuffer(X, ZERO))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VMaxNorm_Cuda: InitializeReductionBuffer returned nonzero\n");
  }

  GetKernelParameters(X, true, nblk, nthr, smem, strm);
  maxNormKernel<realtype, sunindextype><<<nblk, nthr, smem, strm>>>
  (
    NVEC_CUDA_DDATAp(X), NVEC_CUDA_DBUFFERp(X),
    NVEC_CUDA_CONTENT(X)->length
  );
  PostKernelLaunch();

  /* bring the scalar result back to the host buffer */
  CopyReductionBufferFromDevice(X);
  return NVEC_CUDA_HBUFFERp(X)[0];
}
999
/* Return sum_i (x_i * w_i)^2. Blocks the host until the result is
   available. */
realtype N_VWSqrSumLocal_Cuda(N_Vector X, N_Vector W)
{
  size_t nblk, nthr, smem;
  cudaStream_t strm;

  /* zero the device-side accumulator before the reduction launch */
  if (InitializeReductionBuffer(X, ZERO))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VWSqrSumLocal_Cuda: InitializeReductionBuffer returned nonzero\n");
  }

  GetKernelParameters(X, true, nblk, nthr, smem, strm);
  wL2NormSquareKernel<realtype, sunindextype><<<nblk, nthr, smem, strm>>>
  (
    NVEC_CUDA_DDATAp(X), NVEC_CUDA_DDATAp(W), NVEC_CUDA_DBUFFERp(X),
    NVEC_CUDA_CONTENT(X)->length
  );
  PostKernelLaunch();

  /* bring the scalar result back to the host buffer */
  CopyReductionBufferFromDevice(X);
  return NVEC_CUDA_HBUFFERp(X)[0];
}
1026
/* WRMS norm: sqrt of the weighted squared sum divided by the vector length. */
realtype N_VWrmsNorm_Cuda(N_Vector X, N_Vector W)
{
  const realtype sqr_sum = N_VWSqrSumLocal_Cuda(X, W);
  const sunindextype n   = NVEC_CUDA_CONTENT(X)->length;
  return std::sqrt(sqr_sum / n);
}
1032
/* Masked variant of the weighted squared sum: the mask vector Id is passed
 * through to wL2NormSquareMaskKernel along with X and W. */
realtype N_VWSqrSumMaskLocal_Cuda(N_Vector X, N_Vector W, N_Vector Id)
{
  /* Zero-initialize the reduction workspace before launching */
  if (InitializeReductionBuffer(X, ZERO))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VWSqrSumMaskLocal_Cuda: InitializeReductionBuffer returned nonzero\n");
  }

  size_t grid, block, shMemSize;
  cudaStream_t stream;
  GetKernelParameters(X, true, grid, block, shMemSize, stream);

  wL2NormSquareMaskKernel<realtype, sunindextype><<<grid, block, shMemSize, stream>>>
    (NVEC_CUDA_DDATAp(X), NVEC_CUDA_DDATAp(W), NVEC_CUDA_DDATAp(Id),
     NVEC_CUDA_DBUFFERp(X), NVEC_CUDA_CONTENT(X)->length);
  PostKernelLaunch();

  /* Transfer the reduced value back to the host and return it */
  CopyReductionBufferFromDevice(X);
  return NVEC_CUDA_HBUFFERp(X)[0];
}
1060
/* Masked WRMS norm: sqrt of the masked weighted squared sum over the length. */
realtype N_VWrmsNormMask_Cuda(N_Vector X, N_Vector W, N_Vector Id)
{
  const realtype sqr_sum = N_VWSqrSumMaskLocal_Cuda(X, W, Id);
  const sunindextype n   = NVEC_CUDA_CONTENT(X)->length;
  return std::sqrt(sqr_sum / n);
}
1066
/* Finds the minimum element of X with a device min-reduction. */
realtype N_VMin_Cuda(N_Vector X)
{
  /* Identity value for the min reduction */
  const realtype maxVal = std::numeric_limits<realtype>::max();

  if (InitializeReductionBuffer(X, maxVal))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VMin_Cuda: InitializeReductionBuffer returned nonzero\n");
  }

  size_t grid, block, shMemSize;
  cudaStream_t stream;
  GetKernelParameters(X, true, grid, block, shMemSize, stream);

  findMinKernel<realtype, sunindextype><<<grid, block, shMemSize, stream>>>
    (maxVal, NVEC_CUDA_DDATAp(X), NVEC_CUDA_DBUFFERp(X),
     NVEC_CUDA_CONTENT(X)->length);
  PostKernelLaunch();

  /* Transfer the reduced value back to the host and return it */
  CopyReductionBufferFromDevice(X);
  return NVEC_CUDA_HBUFFERp(X)[0];
}
1095
/* Weighted L2 norm: square root of the weighted squared sum. */
realtype N_VWL2Norm_Cuda(N_Vector X, N_Vector W)
{
  return std::sqrt(N_VWSqrSumLocal_Cuda(X, W));
}
1101
/* Computes the L1 norm of X via L1NormKernel and a device reduction. */
realtype N_VL1Norm_Cuda(N_Vector X)
{
  /* Zero-initialize the reduction workspace before launching */
  if (InitializeReductionBuffer(X, ZERO))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VL1Norm_Cuda: InitializeReductionBuffer returned nonzero\n");
  }

  size_t grid, block, shMemSize;
  cudaStream_t stream;
  GetKernelParameters(X, true, grid, block, shMemSize, stream);

  L1NormKernel<realtype, sunindextype><<<grid, block, shMemSize, stream>>>
    (NVEC_CUDA_DDATAp(X), NVEC_CUDA_DBUFFERp(X),
     NVEC_CUDA_CONTENT(X)->length);
  PostKernelLaunch();

  /* Transfer the reduced value back to the host and return it */
  CopyReductionBufferFromDevice(X);
  return NVEC_CUDA_HBUFFERp(X)[0];
}
1127
/* Elementwise comparison of X against the constant c; the result is written
 * into Z by compareKernel. */
void N_VCompare_Cuda(realtype c, N_Vector X, N_Vector Z)
{
  size_t grid, block, shMemSize;
  cudaStream_t stream;

  /* Streaming (non-reduction) launch configuration */
  GetKernelParameters(X, false, grid, block, shMemSize, stream);

  compareKernel<<<grid, block, shMemSize, stream>>>
    (c, NVEC_CUDA_DDATAp(X), NVEC_CUDA_DDATAp(Z),
     NVEC_CUDA_CONTENT(X)->length);
  PostKernelLaunch();
}
1143
/* Inversion test: invTestKernel writes into Z and accumulates a flag value in
 * the reduction buffer; a reduced value below HALF signals success. */
booleantype N_VInvTest_Cuda(N_Vector X, N_Vector Z)
{
  /* Zero-initialize the reduction workspace before launching */
  if (InitializeReductionBuffer(X, ZERO))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VInvTest_Cuda: InitializeReductionBuffer returned nonzero\n");
  }

  size_t grid, block, shMemSize;
  cudaStream_t stream;
  GetKernelParameters(X, true, grid, block, shMemSize, stream);

  invTestKernel<realtype, sunindextype><<<grid, block, shMemSize, stream>>>
    (NVEC_CUDA_DDATAp(X), NVEC_CUDA_DDATAp(Z), NVEC_CUDA_DBUFFERp(X),
     NVEC_CUDA_CONTENT(X)->length);
  PostKernelLaunch();

  /* Transfer the reduced flag back to the host */
  CopyReductionBufferFromDevice(X);
  return NVEC_CUDA_HBUFFERp(X)[0] < HALF;
}
1170
/* Constraint check: constrMaskKernel tests X against constraints C, writes the
 * mask into M, and accumulates a violation count in the reduction buffer; a
 * reduced value below HALF signals that all constraints were satisfied. */
booleantype N_VConstrMask_Cuda(N_Vector C, N_Vector X, N_Vector M)
{
  /* Zero-initialize the reduction workspace before launching */
  if (InitializeReductionBuffer(X, ZERO))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VConstrMask_Cuda: InitializeReductionBuffer returned nonzero\n");
  }

  size_t grid, block, shMemSize;
  cudaStream_t stream;
  GetKernelParameters(X, true, grid, block, shMemSize, stream);

  constrMaskKernel<realtype, sunindextype><<<grid, block, shMemSize, stream>>>
    (NVEC_CUDA_DDATAp(C), NVEC_CUDA_DDATAp(X), NVEC_CUDA_DDATAp(M),
     NVEC_CUDA_DBUFFERp(X), NVEC_CUDA_CONTENT(X)->length);
  PostKernelLaunch();

  /* Transfer the reduced flag back to the host */
  CopyReductionBufferFromDevice(X);
  return NVEC_CUDA_HBUFFERp(X)[0] < HALF;
}
1198
/* Minimum quotient of num over denom, computed with a device min-reduction. */
realtype N_VMinQuotient_Cuda(N_Vector num, N_Vector denom)
{
  // Starting value for min reduction
  const realtype maxVal = std::numeric_limits<realtype>::max();

  if (InitializeReductionBuffer(num, maxVal))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VMinQuotient_Cuda: InitializeReductionBuffer returned nonzero\n");
  }

  size_t grid, block, shMemSize;
  cudaStream_t stream;
  GetKernelParameters(num, true, grid, block, shMemSize, stream);

  minQuotientKernel<realtype, sunindextype><<<grid, block, shMemSize, stream>>>
    (maxVal, NVEC_CUDA_DDATAp(num), NVEC_CUDA_DDATAp(denom),
     NVEC_CUDA_DBUFFERp(num), NVEC_CUDA_CONTENT(num)->length);
  PostKernelLaunch();

  /* Transfer the reduced value back to the host and return it */
  CopyReductionBufferFromDevice(num);
  return NVEC_CUDA_HBUFFERp(num)[0];
}
1228
1229
1230 /*
1231 * -----------------------------------------------------------------
1232 * fused vector operations
1233 * -----------------------------------------------------------------
1234 */
1235
/* Computes Z = sum_{k=0}^{nvec-1} c[k] * X[k] via linearCombinationKernel.
 *
 * Returns 0 on success, -1 on a CUDA API or launch-configuration failure.
 *
 * Fix: the original early-returned on CUDA errors without releasing d_c,
 * h_Xd, or d_Xd, leaking host and device memory on every failure path.
 * All resources are now released before any return. */
int N_VLinearCombination_Cuda(int nvec, realtype* c, N_Vector* X, N_Vector Z)
{
  cudaError_t err;

  // Copy c array to device
  realtype* d_c = NULL;
  err = cudaMalloc((void**) &d_c, nvec*sizeof(realtype));
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaMemcpy(d_c, c, nvec*sizeof(realtype), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) { cudaFree(d_c); return(-1); }

  // Create array of device pointers on host
  realtype** h_Xd = new realtype*[nvec];
  for (int i=0; i<nvec; i++)
    h_Xd[i] = NVEC_CUDA_DDATAp(X[i]);

  // Copy array of device pointers to device from host
  realtype** d_Xd = NULL;
  err = cudaMalloc((void**) &d_Xd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err))
  { delete[] h_Xd; cudaFree(d_c); return(-1); }
  err = cudaMemcpy(d_Xd, h_Xd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err))
  { delete[] h_Xd; cudaFree(d_c); cudaFree(d_Xd); return(-1); }

  // Set kernel parameters and launch
  size_t grid, block, shMemSize;
  cudaStream_t stream;

  if (GetKernelParameters(X[0], false, grid, block, shMemSize, stream))
  { delete[] h_Xd; cudaFree(d_c); cudaFree(d_Xd); return(-1); }
  linearCombinationKernel<<<grid, block, shMemSize, stream>>>
  (
    nvec,
    d_c,
    d_Xd,
    NVEC_CUDA_DDATAp(Z),
    NVEC_CUDA_CONTENT(Z)->length
  );
  PostKernelLaunch();

  // Free host array
  delete[] h_Xd;

  // Free device arrays; report failure but attempt both frees
  int ret = 0;
  err = cudaFree(d_c);
  if (!SUNDIALS_CUDA_VERIFY(err)) ret = -1;
  err = cudaFree(d_Xd);
  if (!SUNDIALS_CUDA_VERIFY(err)) ret = -1;

  return(ret);
}
1285
/* Computes Z[k] = c[k]*X + Y[k] for k = 0..nvec-1 via scaleAddMultiKernel.
 * Returns 0 on success, -1 on a CUDA API or launch-configuration failure.
 *
 * NOTE(review): the early `return(-1)` paths below leak d_c, h_Yd, h_Zd,
 * d_Yd, and/or d_Zd — acceptable only if callers treat -1 as fatal; confirm. */
int N_VScaleAddMulti_Cuda(int nvec, realtype* c, N_Vector X, N_Vector* Y,
                          N_Vector* Z)
{
  cudaError_t err;

  // Copy c array to device
  realtype* d_c;
  err = cudaMalloc((void**) &d_c, nvec*sizeof(realtype));
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaMemcpy(d_c, c, nvec*sizeof(realtype), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  // Create array of device pointers on host
  realtype** h_Yd = new realtype*[nvec];
  for (int i=0; i<nvec; i++)
    h_Yd[i] = NVEC_CUDA_DDATAp(Y[i]);

  realtype** h_Zd = new realtype*[nvec];
  for (int i=0; i<nvec; i++)
    h_Zd[i] = NVEC_CUDA_DDATAp(Z[i]);

  // Copy array of device pointers to device from host
  realtype** d_Yd;
  err = cudaMalloc((void**) &d_Yd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaMemcpy(d_Yd, h_Yd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  realtype** d_Zd;
  err = cudaMalloc((void**) &d_Zd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaMemcpy(d_Zd, h_Zd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  // Set kernel parameters (streaming configuration) and launch
  size_t grid, block, shMemSize;
  cudaStream_t stream;

  if (GetKernelParameters(X, false, grid, block, shMemSize, stream)) return(-1);
  scaleAddMultiKernel<<<grid, block, shMemSize, stream>>>
  (
    nvec,
    d_c,
    NVEC_CUDA_DDATAp(X),
    d_Yd,
    d_Zd,
    NVEC_CUDA_CONTENT(X)->length
  );
  PostKernelLaunch();

  // Free host array
  delete[] h_Yd;
  delete[] h_Zd;

  // Free device arrays
  err = cudaFree(d_c);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaFree(d_Yd);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaFree(d_Zd);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  return(0);
}
1350
/* Computes dots[k] = dot(X, Y[k]) for k = 0..nvec-1 via dotProdMultiKernel,
 * launching one block per dot product (grid = nvec).
 *
 * Returns 0 on success, -1 on a CUDA API or launch-configuration failure.
 *
 * Fixes: (1) cudaMemsetAsync was issued without a stream argument, so the
 * buffer zeroing ran on the default stream while the kernel that uses d_buff
 * ran on `stream` — a potential race when `stream` is non-blocking or the
 * per-thread default stream is enabled; the memset now targets `stream`.
 * (2) Error paths now release previously allocated resources. */
int N_VDotProdMulti_Cuda(int nvec, N_Vector X, N_Vector* Y, realtype* dots)
{
  cudaError_t err;

  // Create array of device pointers on host
  realtype** h_Yd = new realtype*[nvec];
  for (int i=0; i<nvec; i++)
    h_Yd[i] = NVEC_CUDA_DDATAp(Y[i]);

  // Copy array of device pointers to device from host
  realtype** d_Yd = NULL;
  err = cudaMalloc((void**) &d_Yd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) { delete[] h_Yd; return(-1); }
  err = cudaMemcpy(d_Yd, h_Yd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err))
  { delete[] h_Yd; cudaFree(d_Yd); return(-1); }

  // Set kernel parameters
  size_t grid, block, shMemSize;
  cudaStream_t stream;

  if (GetKernelParameters(X, false, grid, block, shMemSize, stream))
  { delete[] h_Yd; cudaFree(d_Yd); return(-1); }
  grid = nvec; // one block per dot product

  // Allocate reduction buffer on device and zero it in the same stream as
  // the kernel so the memset is ordered before the kernel's accumulation
  realtype* d_buff = NULL;
  err = cudaMalloc((void**) &d_buff, grid*sizeof(realtype));
  if (!SUNDIALS_CUDA_VERIFY(err))
  { delete[] h_Yd; cudaFree(d_Yd); return(-1); }
  err = cudaMemsetAsync(d_buff, 0, grid*sizeof(realtype), stream);
  if (!SUNDIALS_CUDA_VERIFY(err))
  { delete[] h_Yd; cudaFree(d_Yd); cudaFree(d_buff); return(-1); }

  dotProdMultiKernel<realtype, sunindextype><<<grid, block, shMemSize, stream>>>
  (
    nvec,
    NVEC_CUDA_DDATAp(X),
    d_Yd,
    d_buff,
    NVEC_CUDA_CONTENT(X)->length
  );
  PostKernelLaunch();

  // Copy GPU result to the cpu (synchronous copy — blocks until complete)
  err = cudaMemcpy(dots, d_buff, grid*sizeof(realtype), cudaMemcpyDeviceToHost);
  if (!SUNDIALS_CUDA_VERIFY(err))
  { delete[] h_Yd; cudaFree(d_Yd); cudaFree(d_buff); return(-1); }

  // Free host array
  delete[] h_Yd;

  // Free device arrays; report failure but attempt both frees
  int ret = 0;
  err = cudaFree(d_Yd);
  if (!SUNDIALS_CUDA_VERIFY(err)) ret = -1;
  err = cudaFree(d_buff);
  if (!SUNDIALS_CUDA_VERIFY(err)) ret = -1;

  return(ret);
}
1406
1407
1408 /*
1409 * -----------------------------------------------------------------------------
1410 * vector array operations
1411 * -----------------------------------------------------------------------------
1412 */
1413
/* Computes Z[k] = a*X[k] + b*Y[k] for k = 0..nvec-1 via
 * linearSumVectorArrayKernel. Returns 0 on success, -1 on a CUDA API or
 * launch-configuration failure.
 *
 * NOTE(review): the early `return(-1)` paths leak the h_* host arrays and any
 * already-allocated d_* device arrays — confirm callers treat -1 as fatal. */
int N_VLinearSumVectorArray_Cuda(int nvec, realtype a, N_Vector* X, realtype b,
                                 N_Vector* Y, N_Vector* Z)
{
  cudaError_t err;

  // Create array of device pointers on host
  realtype** h_Xd = new realtype*[nvec];
  for (int i=0; i<nvec; i++)
    h_Xd[i] = NVEC_CUDA_DDATAp(X[i]);

  realtype** h_Yd = new realtype*[nvec];
  for (int i=0; i<nvec; i++)
    h_Yd[i] = NVEC_CUDA_DDATAp(Y[i]);

  realtype** h_Zd = new realtype*[nvec];
  for (int i=0; i<nvec; i++)
    h_Zd[i] = NVEC_CUDA_DDATAp(Z[i]);

  // Copy array of device pointers to device from host
  realtype** d_Xd;
  err = cudaMalloc((void**) &d_Xd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaMemcpy(d_Xd, h_Xd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  realtype** d_Yd;
  err = cudaMalloc((void**) &d_Yd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaMemcpy(d_Yd, h_Yd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  realtype** d_Zd;
  err = cudaMalloc((void**) &d_Zd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaMemcpy(d_Zd, h_Zd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  // Set kernel parameters (streaming configuration, sized from Z[0]) and launch
  size_t grid, block, shMemSize;
  cudaStream_t stream;

  if (GetKernelParameters(Z[0], false, grid, block, shMemSize, stream)) return(-1);
  linearSumVectorArrayKernel<<<grid, block, shMemSize, stream>>>
  (
    nvec,
    a,
    d_Xd,
    b,
    d_Yd,
    d_Zd,
    NVEC_CUDA_CONTENT(Z[0])->length
  );
  PostKernelLaunch();

  // Free host array
  delete[] h_Xd;
  delete[] h_Yd;
  delete[] h_Zd;

  // Free device arrays
  err = cudaFree(d_Xd);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaFree(d_Yd);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaFree(d_Zd);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  return(0);
}
1483
/* Computes Z[k] = c[k]*X[k] for k = 0..nvec-1 via scaleVectorArrayKernel.
 * Returns 0 on success, -1 on a CUDA API or launch-configuration failure.
 *
 * NOTE(review): the early `return(-1)` paths leak d_c, the h_* host arrays,
 * and any already-allocated d_* device arrays — confirm callers treat -1 as
 * fatal. */
int N_VScaleVectorArray_Cuda(int nvec, realtype* c, N_Vector* X, N_Vector* Z)
{
  cudaError_t err;

  // Copy c array to device
  realtype* d_c;
  err = cudaMalloc((void**) &d_c, nvec*sizeof(realtype));
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaMemcpy(d_c, c, nvec*sizeof(realtype), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  // Create array of device pointers on host
  realtype** h_Xd = new realtype*[nvec];
  for (int i=0; i<nvec; i++)
    h_Xd[i] = NVEC_CUDA_DDATAp(X[i]);

  realtype** h_Zd = new realtype*[nvec];
  for (int i=0; i<nvec; i++)
    h_Zd[i] = NVEC_CUDA_DDATAp(Z[i]);

  // Copy array of device pointers to device from host
  realtype** d_Xd;
  err = cudaMalloc((void**) &d_Xd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaMemcpy(d_Xd, h_Xd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  realtype** d_Zd;
  err = cudaMalloc((void**) &d_Zd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaMemcpy(d_Zd, h_Zd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  // Set kernel parameters (streaming configuration, sized from Z[0]) and launch
  size_t grid, block, shMemSize;
  cudaStream_t stream;

  if (GetKernelParameters(Z[0], false, grid, block, shMemSize, stream)) return(-1);
  scaleVectorArrayKernel<<<grid, block, shMemSize, stream>>>
  (
    nvec,
    d_c,
    d_Xd,
    d_Zd,
    NVEC_CUDA_CONTENT(Z[0])->length
  );
  PostKernelLaunch();

  // Free host array
  delete[] h_Xd;
  delete[] h_Zd;

  // Free device arrays
  err = cudaFree(d_c);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaFree(d_Xd);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaFree(d_Zd);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  return(0);
}
1546
/* Sets every element of each Z[k], k = 0..nvec-1, to the constant c via
 * constVectorArrayKernel.
 *
 * Returns 0 on success, -1 on a CUDA API or launch-configuration failure.
 *
 * Fix: the original early-returned on CUDA errors without releasing h_Zd or
 * d_Zd, leaking host and device memory on every failure path. */
int N_VConstVectorArray_Cuda(int nvec, realtype c, N_Vector* Z)
{
  cudaError_t err;

  // Create array of device pointers on host
  realtype** h_Zd = new realtype*[nvec];
  for (int i=0; i<nvec; i++)
    h_Zd[i] = NVEC_CUDA_DDATAp(Z[i]);

  // Copy array of device pointers to device from host
  realtype** d_Zd = NULL;
  err = cudaMalloc((void**) &d_Zd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) { delete[] h_Zd; return(-1); }
  err = cudaMemcpy(d_Zd, h_Zd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err))
  { delete[] h_Zd; cudaFree(d_Zd); return(-1); }

  // Set kernel parameters and launch
  size_t grid, block, shMemSize;
  cudaStream_t stream;

  if (GetKernelParameters(Z[0], false, grid, block, shMemSize, stream))
  { delete[] h_Zd; cudaFree(d_Zd); return(-1); }
  constVectorArrayKernel<<<grid, block, shMemSize, stream>>>
  (
    nvec,
    c,
    d_Zd,
    NVEC_CUDA_CONTENT(Z[0])->length
  );
  PostKernelLaunch();

  // Free host array
  delete[] h_Zd;

  // Free device array
  err = cudaFree(d_Zd);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  return(0);
}
1586
/* Computes norms[k] = WRMS norm of X[k] with weights W[k] for k = 0..nvec-1:
 * the kernel reduces the weighted squared sums (one block per vector,
 * grid = nvec) and the final sqrt(sum/length) is done on the host.
 *
 * Returns 0 on success, -1 on a CUDA API or launch-configuration failure.
 *
 * Fix: cudaMemsetAsync was issued without a stream argument, so the buffer
 * zeroing ran on the default stream while the kernel that accumulates into
 * d_buff ran on `stream` — a potential race when `stream` is non-blocking or
 * the per-thread default stream is enabled; the memset now targets `stream`.
 *
 * NOTE(review): the early `return(-1)` paths still leak the h_*/d_* arrays,
 * matching the sibling vector-array ops — confirm callers treat -1 as fatal. */
int N_VWrmsNormVectorArray_Cuda(int nvec, N_Vector* X, N_Vector* W,
                                realtype* norms)
{
  cudaError_t err;

  // Create array of device pointers on host
  realtype** h_Xd = new realtype*[nvec];
  for (int i=0; i<nvec; i++)
    h_Xd[i] = NVEC_CUDA_DDATAp(X[i]);
  realtype** h_Wd = new realtype*[nvec];
  for (int i=0; i<nvec; i++)
    h_Wd[i] = NVEC_CUDA_DDATAp(W[i]);

  // Copy array of device pointers to device from host
  realtype** d_Xd;
  err = cudaMalloc((void**) &d_Xd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaMemcpy(d_Xd, h_Xd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  realtype** d_Wd;
  err = cudaMalloc((void**) &d_Wd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaMemcpy(d_Wd, h_Wd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  // Set kernel parameters (reduction configuration)
  size_t grid, block, shMemSize;
  cudaStream_t stream;

  if (GetKernelParameters(X[0], true, grid, block, shMemSize, stream)) return(-1);
  grid = nvec; // one block per vector in the array

  // Allocate reduction buffer on device and zero it in the same stream as
  // the kernel so the memset is ordered before the kernel's accumulation
  realtype* d_buff;
  err = cudaMalloc((void**) &d_buff, grid*sizeof(realtype));
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaMemsetAsync(d_buff, 0, grid*sizeof(realtype), stream);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  wL2NormSquareVectorArrayKernel<realtype, sunindextype><<<grid, block, shMemSize, stream>>>
  (
    nvec,
    d_Xd,
    d_Wd,
    d_buff,
    NVEC_CUDA_CONTENT(X[0])->length
  );
  PostKernelLaunch();

  // Copy GPU result to the cpu (synchronous copy — blocks until complete)
  err = cudaMemcpy(norms, d_buff, grid*sizeof(realtype), cudaMemcpyDeviceToHost);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  // Finish computation: WRMS norm = sqrt(weighted squared sum / length)
  for (int k=0; k<nvec; ++k)
    norms[k] = std::sqrt(norms[k]/NVEC_CUDA_CONTENT(X[0])->length);

  // Free host array
  delete[] h_Xd;
  delete[] h_Wd;

  // Free device arrays
  err = cudaFree(d_Xd);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaFree(d_Wd);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaFree(d_buff);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  return(0);
}
1659
/* Masked variant of the vector-array WRMS norm: the single mask vector `id`
 * is applied to every X[k]/W[k] pair (one block per vector, grid = nvec);
 * the final sqrt(sum/length) is done on the host.
 *
 * Returns 0 on success, -1 on a CUDA API or launch-configuration failure.
 *
 * Fix: cudaMemsetAsync was issued without a stream argument, so the buffer
 * zeroing ran on the default stream while the kernel that accumulates into
 * d_buff ran on `stream` — a potential race when `stream` is non-blocking or
 * the per-thread default stream is enabled; the memset now targets `stream`.
 *
 * NOTE(review): the early `return(-1)` paths still leak the h_*/d_* arrays,
 * matching the sibling vector-array ops — confirm callers treat -1 as fatal. */
int N_VWrmsNormMaskVectorArray_Cuda(int nvec, N_Vector* X, N_Vector* W,
                                    N_Vector id, realtype* norms)
{
  cudaError_t err;

  // Create array of device pointers on host
  realtype** h_Xd = new realtype*[nvec];
  for (int i=0; i<nvec; i++)
    h_Xd[i] = NVEC_CUDA_DDATAp(X[i]);

  realtype** h_Wd = new realtype*[nvec];
  for (int i=0; i<nvec; i++)
    h_Wd[i] = NVEC_CUDA_DDATAp(W[i]);

  // Copy array of device pointers to device from host
  realtype** d_Xd;
  err = cudaMalloc((void**) &d_Xd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaMemcpy(d_Xd, h_Xd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  realtype** d_Wd;
  err = cudaMalloc((void**) &d_Wd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaMemcpy(d_Wd, h_Wd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  // Set kernel parameters (reduction configuration)
  size_t grid, block, shMemSize;
  cudaStream_t stream;

  if (GetKernelParameters(X[0], true, grid, block, shMemSize, stream)) return(-1);
  grid = nvec; // one block per vector in the array

  // Allocate reduction buffer on device and zero it in the same stream as
  // the kernel so the memset is ordered before the kernel's accumulation
  realtype* d_buff;
  err = cudaMalloc((void**) &d_buff, grid*sizeof(realtype));
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaMemsetAsync(d_buff, 0, grid*sizeof(realtype), stream);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  wL2NormSquareMaskVectorArrayKernel<realtype, sunindextype><<<grid, block, shMemSize, stream>>>
  (
    nvec,
    d_Xd,
    d_Wd,
    NVEC_CUDA_DDATAp(id),
    d_buff,
    NVEC_CUDA_CONTENT(X[0])->length
  );
  PostKernelLaunch();

  // Copy GPU result to the cpu (synchronous copy — blocks until complete)
  err = cudaMemcpy(norms, d_buff, grid*sizeof(realtype), cudaMemcpyDeviceToHost);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  // Finish computation: WRMS norm = sqrt(weighted squared sum / length)
  for (int k=0; k<nvec; ++k)
    norms[k] = std::sqrt(norms[k]/NVEC_CUDA_CONTENT(X[0])->length);

  // Free host array
  delete[] h_Xd;
  delete[] h_Wd;

  // Free device arrays
  err = cudaFree(d_Xd);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaFree(d_Wd);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaFree(d_buff);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  return(0);
}
1734
/* Computes Z[i][j] = c[i]*X[j] + Y[i][j] for i = 0..nsum-1, j = 0..nvec-1 via
 * scaleAddMultiVectorArrayKernel. The 2D pointer tables Y and Z are flattened
 * row-major by vector (index j*nsum+i). Returns 0 on success, -1 on a CUDA
 * API or launch-configuration failure.
 *
 * NOTE(review): the early `return(-1)` paths leak d_c, the h_* host arrays,
 * and any already-allocated d_* device arrays — confirm callers treat -1 as
 * fatal. */
int N_VScaleAddMultiVectorArray_Cuda(int nvec, int nsum, realtype* c,
                                     N_Vector* X, N_Vector** Y, N_Vector** Z)
{
  cudaError_t err;

  // Copy c array to device
  realtype* d_c;
  err = cudaMalloc((void**) &d_c, nsum*sizeof(realtype));
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaMemcpy(d_c, c, nsum*sizeof(realtype), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  // Create array of device pointers on host
  realtype** h_Xd = new realtype*[nvec];
  for (int i=0; i<nvec; i++)
    h_Xd[i] = NVEC_CUDA_DDATAp(X[i]);

  // Flatten Y: entry [j*nsum+i] is the device data of Y[i][j]
  realtype** h_Yd = new realtype*[nsum*nvec];
  for (int j=0; j<nvec; j++)
    for (int i=0; i<nsum; i++)
      h_Yd[j*nsum+i] = NVEC_CUDA_DDATAp(Y[i][j]);

  // Flatten Z with the same layout
  realtype** h_Zd = new realtype*[nsum*nvec];
  for (int j=0; j<nvec; j++)
    for (int i=0; i<nsum; i++)
      h_Zd[j*nsum+i] = NVEC_CUDA_DDATAp(Z[i][j]);

  // Copy array of device pointers to device from host
  realtype** d_Xd;
  err = cudaMalloc((void**) &d_Xd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaMemcpy(d_Xd, h_Xd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  realtype** d_Yd;
  err = cudaMalloc((void**) &d_Yd, nsum*nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaMemcpy(d_Yd, h_Yd, nsum*nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  realtype** d_Zd;
  err = cudaMalloc((void**) &d_Zd, nsum*nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaMemcpy(d_Zd, h_Zd, nsum*nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  // Set kernel parameters (streaming configuration, sized from Z[0][0]) and launch
  size_t grid, block, shMemSize;
  cudaStream_t stream;

  if (GetKernelParameters(Z[0][0], false, grid, block, shMemSize, stream)) return(-1);
  scaleAddMultiVectorArrayKernel<<<grid, block, shMemSize, stream>>>
  (
    nvec,
    nsum,
    d_c,
    d_Xd,
    d_Yd,
    d_Zd,
    NVEC_CUDA_CONTENT(Z[0][0])->length
  );
  PostKernelLaunch();

  // Free host array
  delete[] h_Xd;
  delete[] h_Yd;
  delete[] h_Zd;

  // Free device arrays
  err = cudaFree(d_c);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaFree(d_Xd);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaFree(d_Yd);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaFree(d_Zd);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  return(0);
}
1815
/* Computes Z[j] = sum_{i=0}^{nsum-1} c[i]*X[i][j] for j = 0..nvec-1 via
 * linearCombinationVectorArrayKernel. The 2D pointer table X is flattened
 * row-major by vector (index j*nsum+i).
 *
 * Returns 0 on success, -1 on a CUDA API or launch-configuration failure.
 *
 * Fix: the original returned `cudaGetLastError()` — a positive cudaError_t
 * code (and a call that clears the sticky error state) — while every sibling
 * operation uses the 0/-1 convention with errors already reported through
 * SUNDIALS_CUDA_VERIFY and PostKernelLaunch. Return 0 for consistency.
 *
 * NOTE(review): the early `return(-1)` paths leak d_c, h_Xd, h_Zd, and any
 * already-allocated d_* arrays — confirm callers treat -1 as fatal. */
int N_VLinearCombinationVectorArray_Cuda(int nvec, int nsum, realtype* c,
                                         N_Vector** X, N_Vector* Z)
{
  cudaError_t err;

  // Copy c array to device
  realtype* d_c;
  err = cudaMalloc((void**) &d_c, nsum*sizeof(realtype));
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaMemcpy(d_c, c, nsum*sizeof(realtype), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  // Create array of device pointers on host:
  // entry [j*nsum+i] is the device data of X[i][j]
  realtype** h_Xd = new realtype*[nsum*nvec];
  for (int j=0; j<nvec; j++)
    for (int i=0; i<nsum; i++)
      h_Xd[j*nsum+i] = NVEC_CUDA_DDATAp(X[i][j]);

  realtype** h_Zd = new realtype*[nvec];
  for (int i=0; i<nvec; i++)
    h_Zd[i] = NVEC_CUDA_DDATAp(Z[i]);

  // Copy array of device pointers to device from host
  realtype** d_Xd;
  err = cudaMalloc((void**) &d_Xd, nsum*nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaMemcpy(d_Xd, h_Xd, nsum*nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  realtype** d_Zd;
  err = cudaMalloc((void**) &d_Zd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaMemcpy(d_Zd, h_Zd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  // Set kernel parameters (streaming configuration, sized from Z[0]) and launch
  size_t grid, block, shMemSize;
  cudaStream_t stream;

  if (GetKernelParameters(Z[0], false, grid, block, shMemSize, stream)) return(-1);
  linearCombinationVectorArrayKernel<<<grid, block, shMemSize, stream>>>
  (
    nvec,
    nsum,
    d_c,
    d_Xd,
    d_Zd,
    NVEC_CUDA_CONTENT(Z[0])->length
  );
  PostKernelLaunch();

  // Free host array
  delete[] h_Xd;
  delete[] h_Zd;

  // Free device arrays
  err = cudaFree(d_c);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaFree(d_Xd);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);
  err = cudaFree(d_Zd);
  if (!SUNDIALS_CUDA_VERIFY(err)) return(-1);

  return(0);
}
1881
1882
1883 /*
1884 * -----------------------------------------------------------------
1885 * OPTIONAL XBraid interface operations
1886 * -----------------------------------------------------------------
1887 */
1888
1889
/* Reports the number of bytes needed to pack/unpack the vector x
 * (length * sizeof(realtype), via NVEC_CUDA_MEMSIZE).
 * Returns 0 on success, -1 if x is NULL. */
int N_VBufSize_Cuda(N_Vector x, sunindextype *size)
{
  if (x == NULL) return(-1);
  *size = (sunindextype)NVEC_CUDA_MEMSIZE(x);
  return(0);
}
1896
1897
/* Packs the vector's device data into the caller-supplied host buffer `buf`
 * (used e.g. by the XBraid interface). The raw buffer is wrapped as a
 * SUNMemory host object so the memory helper can perform the async
 * device-to-buffer copy on the vector's stream.
 * Returns 0 on success, -1 on NULL input, wrap failure, copy failure, or
 * stream-synchronization failure. */
int N_VBufPack_Cuda(N_Vector x, void *buf)
{
  int copy_fail = 0;
  cudaError_t cuerr;

  if (x == NULL || buf == NULL) return(-1);

  /* Wrap the raw host pointer so the memory helper can use it as a copy
     destination; the wrapper does not take ownership of buf */
  SUNMemory buf_mem = SUNMemoryHelper_Wrap(buf, SUNMEMTYPE_HOST);
  if (buf_mem == NULL) return(-1);

  /* Asynchronous copy: device data -> host buffer, on the vector's stream */
  copy_fail = SUNMemoryHelper_CopyAsync(NVEC_CUDA_MEMHELP(x),
                                        buf_mem,
                                        NVEC_CUDA_CONTENT(x)->device_data,
                                        NVEC_CUDA_MEMSIZE(x),
                                        (void*) NVEC_CUDA_STREAM(x));

  /* we synchronize with respect to the host, but only in this stream */
  cuerr = cudaStreamSynchronize(*NVEC_CUDA_STREAM(x));

  /* Release the wrapper object (not the caller's buffer) */
  SUNMemoryHelper_Dealloc(NVEC_CUDA_MEMHELP(x), buf_mem);

  return (!SUNDIALS_CUDA_VERIFY(cuerr) || copy_fail ? -1 : 0);
}
1921
1922
/* Unpacks the caller-supplied host buffer `buf` into the vector's device
 * data (the inverse of N_VBufPack_Cuda). The raw buffer is wrapped as a
 * SUNMemory host object so the memory helper can perform the async
 * buffer-to-device copy on the vector's stream.
 * Returns 0 on success, -1 on NULL input, wrap failure, copy failure, or
 * stream-synchronization failure. */
int N_VBufUnpack_Cuda(N_Vector x, void *buf)
{
  int copy_fail = 0;
  cudaError_t cuerr;

  if (x == NULL || buf == NULL) return(-1);

  /* Wrap the raw host pointer so the memory helper can use it as a copy
     source; the wrapper does not take ownership of buf */
  SUNMemory buf_mem = SUNMemoryHelper_Wrap(buf, SUNMEMTYPE_HOST);
  if (buf_mem == NULL) return(-1);

  /* Asynchronous copy: host buffer -> device data, on the vector's stream */
  copy_fail = SUNMemoryHelper_CopyAsync(NVEC_CUDA_MEMHELP(x),
                                        NVEC_CUDA_CONTENT(x)->device_data,
                                        buf_mem,
                                        NVEC_CUDA_MEMSIZE(x),
                                        (void*) NVEC_CUDA_STREAM(x));

  /* we synchronize with respect to the host, but only in this stream */
  cuerr = cudaStreamSynchronize(*NVEC_CUDA_STREAM(x));

  /* Release the wrapper object (not the caller's buffer) */
  SUNMemoryHelper_Dealloc(NVEC_CUDA_MEMHELP(x), buf_mem);

  return (!SUNDIALS_CUDA_VERIFY(cuerr) || copy_fail ? -1 : 0);
}
1946
1947
1948 /*
1949 * -----------------------------------------------------------------
1950 * Enable / Disable fused and vector array operations
1951 * -----------------------------------------------------------------
1952 */
1953
/* Enables (tf nonzero) or disables (tf zero) all fused and vector-array
 * operations on v by installing or clearing the corresponding ops entries.
 * Returns 0 on success, -1 if v or its ops table is NULL. */
int N_VEnableFusedOps_Cuda(N_Vector v, booleantype tf)
{
  /* require a valid vector with an ops table */
  if (v == NULL) return(-1);
  if (v->ops == NULL) return(-1);

  /* fused vector operations */
  v->ops->nvlinearcombination = tf ? N_VLinearCombination_Cuda : NULL;
  v->ops->nvscaleaddmulti     = tf ? N_VScaleAddMulti_Cuda     : NULL;
  v->ops->nvdotprodmulti      = tf ? N_VDotProdMulti_Cuda      : NULL;

  /* vector array operations */
  v->ops->nvlinearsumvectorarray         = tf ? N_VLinearSumVectorArray_Cuda         : NULL;
  v->ops->nvscalevectorarray             = tf ? N_VScaleVectorArray_Cuda             : NULL;
  v->ops->nvconstvectorarray             = tf ? N_VConstVectorArray_Cuda             : NULL;
  v->ops->nvwrmsnormvectorarray          = tf ? N_VWrmsNormVectorArray_Cuda          : NULL;
  v->ops->nvwrmsnormmaskvectorarray      = tf ? N_VWrmsNormMaskVectorArray_Cuda      : NULL;
  v->ops->nvscaleaddmultivectorarray     = tf ? N_VScaleAddMultiVectorArray_Cuda     : NULL;
  v->ops->nvlinearcombinationvectorarray = tf ? N_VLinearCombinationVectorArray_Cuda : NULL;

  /* return success */
  return(0);
}
1996
/* Enable (tf != 0) or disable (tf == 0) the fused linear combination
 * operation on v. Returns 0 on success, -1 if v or its ops table is NULL. */
int N_VEnableLinearCombination_Cuda(N_Vector v, booleantype tf)
{
  /* require a valid vector with an ops table */
  if (v == NULL) return(-1);
  if (v->ops == NULL) return(-1);

  /* attach or detach the operation */
  v->ops->nvlinearcombination = tf ? N_VLinearCombination_Cuda : NULL;

  return(0);
}
2014
/* Enable (tf != 0) or disable (tf == 0) the fused scale-add-multi
 * operation on v. Returns 0 on success, -1 if v or its ops table is NULL. */
int N_VEnableScaleAddMulti_Cuda(N_Vector v, booleantype tf)
{
  /* require a valid vector with an ops table */
  if (v == NULL) return(-1);
  if (v->ops == NULL) return(-1);

  /* attach or detach the operation */
  v->ops->nvscaleaddmulti = tf ? N_VScaleAddMulti_Cuda : NULL;

  return(0);
}
2032
/* Enable (tf != 0) or disable (tf == 0) the fused multi dot product
 * operation on v. Returns 0 on success, -1 if v or its ops table is NULL. */
int N_VEnableDotProdMulti_Cuda(N_Vector v, booleantype tf)
{
  /* require a valid vector with an ops table */
  if (v == NULL) return(-1);
  if (v->ops == NULL) return(-1);

  /* attach or detach the operation */
  v->ops->nvdotprodmulti = tf ? N_VDotProdMulti_Cuda : NULL;

  return(0);
}
2050
/* Enable (tf != 0) or disable (tf == 0) the linear sum vector array
 * operation on v. Returns 0 on success, -1 if v or its ops table is NULL. */
int N_VEnableLinearSumVectorArray_Cuda(N_Vector v, booleantype tf)
{
  /* require a valid vector with an ops table */
  if (v == NULL) return(-1);
  if (v->ops == NULL) return(-1);

  /* attach or detach the operation */
  v->ops->nvlinearsumvectorarray = tf ? N_VLinearSumVectorArray_Cuda : NULL;

  return(0);
}
2068
/* Enable (tf != 0) or disable (tf == 0) the scale vector array
 * operation on v. Returns 0 on success, -1 if v or its ops table is NULL. */
int N_VEnableScaleVectorArray_Cuda(N_Vector v, booleantype tf)
{
  /* require a valid vector with an ops table */
  if (v == NULL) return(-1);
  if (v->ops == NULL) return(-1);

  /* attach or detach the operation */
  v->ops->nvscalevectorarray = tf ? N_VScaleVectorArray_Cuda : NULL;

  return(0);
}
2086
/* Enable (tf != 0) or disable (tf == 0) the const vector array
 * operation on v. Returns 0 on success, -1 if v or its ops table is NULL. */
int N_VEnableConstVectorArray_Cuda(N_Vector v, booleantype tf)
{
  /* require a valid vector with an ops table */
  if (v == NULL) return(-1);
  if (v->ops == NULL) return(-1);

  /* attach or detach the operation */
  v->ops->nvconstvectorarray = tf ? N_VConstVectorArray_Cuda : NULL;

  return(0);
}
2104
/* Enable (tf != 0) or disable (tf == 0) the WRMS norm vector array
 * operation on v. Returns 0 on success, -1 if v or its ops table is NULL. */
int N_VEnableWrmsNormVectorArray_Cuda(N_Vector v, booleantype tf)
{
  /* require a valid vector with an ops table */
  if (v == NULL) return(-1);
  if (v->ops == NULL) return(-1);

  /* attach or detach the operation */
  v->ops->nvwrmsnormvectorarray = tf ? N_VWrmsNormVectorArray_Cuda : NULL;

  return(0);
}
2122
/* Enable (tf != 0) or disable (tf == 0) the masked WRMS norm vector array
 * operation on v. Returns 0 on success, -1 if v or its ops table is NULL. */
int N_VEnableWrmsNormMaskVectorArray_Cuda(N_Vector v, booleantype tf)
{
  /* require a valid vector with an ops table */
  if (v == NULL) return(-1);
  if (v->ops == NULL) return(-1);

  /* attach or detach the operation */
  v->ops->nvwrmsnormmaskvectorarray = tf ? N_VWrmsNormMaskVectorArray_Cuda : NULL;

  return(0);
}
2140
/* Enable (tf != 0) or disable (tf == 0) the scale-add-multi vector array
 * operation on v. Returns 0 on success, -1 if v or its ops table is NULL. */
int N_VEnableScaleAddMultiVectorArray_Cuda(N_Vector v, booleantype tf)
{
  /* require a valid vector with an ops table */
  if (v == NULL) return(-1);
  if (v->ops == NULL) return(-1);

  /* attach or detach the operation */
  v->ops->nvscaleaddmultivectorarray = tf ? N_VScaleAddMultiVectorArray_Cuda : NULL;

  return(0);
}
2158
/* Enable (tf != 0) or disable (tf == 0) the linear combination vector array
 * operation on v. Returns 0 on success, -1 if v or its ops table is NULL. */
int N_VEnableLinearCombinationVectorArray_Cuda(N_Vector v, booleantype tf)
{
  /* require a valid vector with an ops table */
  if (v == NULL) return(-1);
  if (v->ops == NULL) return(-1);

  /* attach or detach the operation */
  v->ops->nvlinearcombinationvectorarray = tf ? N_VLinearCombinationVectorArray_Cuda : NULL;

  return(0);
}
2176
2177 /*
2178 * Private helper functions.
2179 */
2180
/* Allocate the host and device data arrays for v using the vector's memory
 * helper. With managed (UVM) memory a single allocation is aliased for both
 * host and device views; otherwise separate host and device buffers are
 * allocated. A zero-length vector allocates nothing. Returns 0 on success,
 * -1 if any allocation failed.
 *
 * Fixes vs. previous version: in the unmanaged branch the device allocation
 * result no longer overwrites (and thus masks) a host allocation failure,
 * and in the managed branch we no longer alias a failed allocation. */
int AllocateData(N_Vector v)
{
  int alloc_fail = 0;
  N_VectorContent_Cuda vc = NVEC_CUDA_CONTENT(v);
  N_PrivateVectorContent_Cuda vcp = NVEC_CUDA_PRIVATE(v);

  /* nothing to allocate for an empty vector */
  if (N_VGetLength_Cuda(v) == 0) return(0);

  if (vcp->use_managed_mem)
  {
    alloc_fail = SUNMemoryHelper_Alloc(NVEC_CUDA_MEMHELP(v), &(vc->device_data),
                                       NVEC_CUDA_MEMSIZE(v), SUNMEMTYPE_UVM);
    if (alloc_fail)
    {
      SUNDIALS_DEBUG_PRINT("ERROR in AllocateData: SUNMemoryHelper_Alloc failed for SUNMEMTYPE_UVM\n");
    }
    else
    {
      /* host view is an alias of the same managed allocation */
      vc->host_data = SUNMemoryHelper_Alias(vc->device_data);
    }
  }
  else
  {
    int host_fail = SUNMemoryHelper_Alloc(NVEC_CUDA_MEMHELP(v), &(vc->host_data),
                                          NVEC_CUDA_MEMSIZE(v), SUNMEMTYPE_HOST);
    if (host_fail)
    {
      SUNDIALS_DEBUG_PRINT("ERROR in AllocateData: SUNMemoryHelper_Alloc failed to alloc SUNMEMTYPE_HOST\n");
    }

    int dev_fail = SUNMemoryHelper_Alloc(NVEC_CUDA_MEMHELP(v), &(vc->device_data),
                                         NVEC_CUDA_MEMSIZE(v), SUNMEMTYPE_DEVICE);
    if (dev_fail)
    {
      SUNDIALS_DEBUG_PRINT("ERROR in AllocateData: SUNMemoryHelper_Alloc failed to alloc SUNMEMTYPE_DEVICE\n");
    }

    /* report failure if EITHER allocation failed */
    alloc_fail = host_fail || dev_fail;
  }

  return(alloc_fail ? -1 : 0);
}
2218
2219 /*
2220 * Initializes the internal buffer used for reductions.
2221 * If the buffer is already allocated, it will only be reallocated
2222 * if it is no longer large enough. This may occur if the length
2223 * of the vector is increased. The buffer is initialized to the
2224 * value given.
2225 */
/*
 * Initializes the internal buffer used for reductions.
 * If the buffer is already allocated, it will only be reallocated
 * if it is no longer large enough. This may occur if the length
 * of the vector is increased. The buffer is initialized to the
 * value given. Returns 0 on success, -1 on an allocation or copy failure.
 */
int InitializeReductionBuffer(N_Vector v, const realtype value)
{
  int alloc_fail = 0, copy_fail = 0;
  size_t bytes = sizeof(realtype);
  N_PrivateVectorContent_Cuda vcp = NVEC_CUDA_PRIVATE(v);

  /* wrap the stack value so the memory helper can copy from it */
  SUNMemory value_mem = SUNMemoryHelper_Wrap((void*) &value, SUNMEMTYPE_HOST);

  /* we allocate if the existing reduction buffer is not large enough */
  if (vcp->reduce_buffer_allocated_bytes < bytes)
  {
    FreeReductionBuffer(v);

    /* prefer pinned host memory for faster async device<->host transfers */
    int host_fail = SUNMemoryHelper_Alloc(NVEC_CUDA_MEMHELP(v),
                                          &(vcp->reduce_buffer_host), bytes,
                                          SUNMEMTYPE_PINNED);
    if (host_fail)
    {
      SUNDIALS_DEBUG_PRINT("WARNING in InitializeReductionBuffer: SUNMemoryHelper_Alloc failed to alloc SUNMEMTYPE_PINNED, using SUNMEMTYPE_HOST instead\n");

      /* try to allocate just plain host memory instead */
      host_fail = SUNMemoryHelper_Alloc(NVEC_CUDA_MEMHELP(v),
                                        &(vcp->reduce_buffer_host), bytes,
                                        SUNMEMTYPE_HOST);
      if (host_fail)
      {
        SUNDIALS_DEBUG_PRINT("ERROR in InitializeReductionBuffer: SUNMemoryHelper_Alloc failed to alloc SUNMEMTYPE_HOST\n");
      }
    }

    int dev_fail = SUNMemoryHelper_Alloc(NVEC_CUDA_MEMHELP(v),
                                         &(vcp->reduce_buffer_dev), bytes,
                                         SUNMEMTYPE_DEVICE);
    if (dev_fail)
    {
      SUNDIALS_DEBUG_PRINT("ERROR in InitializeReductionBuffer: SUNMemoryHelper_Alloc failed to alloc SUNMEMTYPE_DEVICE\n");
    }

    /* fail if EITHER allocation failed (previously a successful device
       allocation could mask a failed host allocation) */
    alloc_fail = host_fail || dev_fail;
  }

  if (!alloc_fail)
  {
    /* store the size of the buffer */
    vcp->reduce_buffer_allocated_bytes = bytes;

    /* initialize the device buffer with the value */
    copy_fail = SUNMemoryHelper_CopyAsync(NVEC_CUDA_MEMHELP(v),
                                          vcp->reduce_buffer_dev, value_mem,
                                          bytes, (void*) NVEC_CUDA_STREAM(v));

    if (copy_fail)
    {
      SUNDIALS_DEBUG_PRINT("ERROR in InitializeReductionBuffer: SUNMemoryHelper_CopyAsync failed\n");
    }
  }

  SUNMemoryHelper_Dealloc(NVEC_CUDA_MEMHELP(v), value_mem);
  return((alloc_fail || copy_fail) ? -1 : 0);
}
2287
2288 /* Free the reduction buffer
2289 */
/* Free the reduction buffer (both the device and host sides) and reset the
 * recorded buffer size so a subsequent InitializeReductionBuffer reallocates
 * rather than reusing freed pointers. Safe to call when nothing is allocated.
 */
void FreeReductionBuffer(N_Vector v)
{
  N_PrivateVectorContent_Cuda vcp = NVEC_CUDA_PRIVATE(v);

  if (vcp == NULL) return;

  if (vcp->reduce_buffer_dev != NULL)
    SUNMemoryHelper_Dealloc(NVEC_CUDA_MEMHELP(v), vcp->reduce_buffer_dev);
  vcp->reduce_buffer_dev = NULL;

  if (vcp->reduce_buffer_host != NULL)
    SUNMemoryHelper_Dealloc(NVEC_CUDA_MEMHELP(v), vcp->reduce_buffer_host);
  vcp->reduce_buffer_host = NULL;

  /* previously the stale size could make InitializeReductionBuffer skip
     reallocation and copy into a NULL device buffer */
  vcp->reduce_buffer_allocated_bytes = 0;
}
2303
2304 /* Copy the reduction buffer from the device to the host.
2305 */
/* Copy the first n realtype entries of the reduction buffer from the device
 * to the host, then synchronize the host with the vector's stream. Returns
 * 0 on success, -1 on a copy or synchronization failure. */
int CopyReductionBufferFromDevice(N_Vector v, size_t n)
{
  N_PrivateVectorContent_Cuda vcp = NVEC_CUDA_PRIVATE(v);

  /* issue the device -> host copy on the vector's stream */
  int copy_fail = SUNMemoryHelper_CopyAsync(NVEC_CUDA_MEMHELP(v),
                                            vcp->reduce_buffer_host,
                                            vcp->reduce_buffer_dev,
                                            n*sizeof(realtype),
                                            (void*) NVEC_CUDA_STREAM(v));

  if (copy_fail)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in CopyReductionBufferFromDevice: SUNMemoryHelper_CopyAsync returned nonzero\n");
  }

  /* we synchronize with respect to the host, but only in this stream */
  cudaError_t cuerr = cudaStreamSynchronize(*NVEC_CUDA_STREAM(v));

  return (!SUNDIALS_CUDA_VERIFY(cuerr) || copy_fail ? -1 : 0);
}
2326
2327 /* Get the kernel launch parameters based on the kernel type (reduction or not),
2328 * using the appropriate kernel execution policy.
2329 */
/* Get the kernel launch parameters (grid size, block size, shared memory
 * size, and stream) for vector v, using the reduction execution policy when
 * reduction is true and the streaming policy otherwise. n selects the number
 * of elements the kernel will process; n == 0 means the full vector length.
 * Returns 0 on success, -1 on an invalid configuration (and, when
 * SUNDIALS_DEBUG is defined, throws std::runtime_error instead).
 *
 * Fix: the reduction-path error message previously read "must be a multiple
 * must be of CUDA warp size". */
static int GetKernelParameters(N_Vector v, booleantype reduction, size_t& grid,
                               size_t& block, size_t& shMemSize,
                               cudaStream_t& stream, size_t n)
{
  /* default to the full vector length when no element count was given */
  n = (n == 0) ? NVEC_CUDA_CONTENT(v)->length : n;
  if (reduction)
  {
    SUNCudaExecPolicy* reduce_exec_policy = NVEC_CUDA_CONTENT(v)->reduce_exec_policy;
    grid      = reduce_exec_policy->gridSize(n);
    block     = reduce_exec_policy->blockSize();
    shMemSize = 0;
    stream    = *(reduce_exec_policy->stream());
    /* the reduction kernels use warp-level primitives, so the block size
       must be warp aligned */
    if (block % CUDA_WARP_SIZE)
    {
#ifdef SUNDIALS_DEBUG
      throw std::runtime_error("the block size must be a multiple of the CUDA warp size");
#endif
      return(-1);
    }
  }
  else
  {
    SUNCudaExecPolicy* stream_exec_policy = NVEC_CUDA_CONTENT(v)->stream_exec_policy;
    grid      = stream_exec_policy->gridSize(n);
    block     = stream_exec_policy->blockSize();
    shMemSize = 0;
    stream    = *(stream_exec_policy->stream());
  }

  if (grid == 0)
  {
#ifdef SUNDIALS_DEBUG
    throw std::runtime_error("the grid size must be > 0");
#endif
    return(-1);
  }
  if (block == 0)
  {
#ifdef SUNDIALS_DEBUG
    throw std::runtime_error("the block size must be > 0");
#endif
    return(-1);
  }

  return(0);
}
2376
2377 /* Should be called after a kernel launch.
2378 * If SUNDIALS_DEBUG_CUDA_LASTERROR is not defined, then the function does nothing.
2379 * If it is defined, the function will synchronize and check the last CUDA error.
2380 */
void PostKernelLaunch()
{
#ifdef SUNDIALS_DEBUG_CUDA_LASTERROR
  /* Block the host until all prior device work completes so that any
     asynchronous kernel execution error becomes observable, then check
     (and clear) the last CUDA error. Compiled out in non-debug builds. */
  cudaDeviceSynchronize();
  SUNDIALS_CUDA_VERIFY(cudaGetLastError());
#endif
}
2388
2389
2390 } // extern "C"
2391