1 /* -----------------------------------------------------------------
2  * Programmer(s): Slaven Peles, and Cody J. Balos @ LLNL
3  * -----------------------------------------------------------------
4  * SUNDIALS Copyright Start
5  * Copyright (c) 2002-2021, Lawrence Livermore National Security
6  * and Southern Methodist University.
7  * All rights reserved.
8  *
9  * See the top-level LICENSE and NOTICE files for details.
10  *
11  * SPDX-License-Identifier: BSD-3-Clause
12  * SUNDIALS Copyright End
13  * -----------------------------------------------------------------
14  * This is the implementation file for a CUDA implementation
15  * of the NVECTOR package.
16  * -----------------------------------------------------------------*/
17 
18 #include <cstdio>
19 #include <cstdlib>
20 #include <cmath>
21 #include <limits>
22 
23 #include <nvector/nvector_cuda.h>
24 #include "VectorArrayKernels.cuh"
25 #include "VectorKernels.cuh"
26 
27 #include "sundials_cuda.h"
28 #include "sundials_debug.h"
29 
30 #define ZERO RCONST(0.0)
31 #define HALF RCONST(0.5)
32 
33 extern "C" {
34 
35 using namespace sundials;
36 using namespace sundials::nvector_cuda;
37 
38 /*
39  * Macro definitions
40  */
41 
42 #define NVEC_CUDA_CONTENT(x)  ((N_VectorContent_Cuda)(x->content))
43 #define NVEC_CUDA_PRIVATE(x)  ((N_PrivateVectorContent_Cuda)(NVEC_CUDA_CONTENT(x)->priv))
44 #define NVEC_CUDA_MEMSIZE(x)  (NVEC_CUDA_CONTENT(x)->length * sizeof(realtype))
45 #define NVEC_CUDA_MEMHELP(x)  (NVEC_CUDA_CONTENT(x)->mem_helper)
46 #define NVEC_CUDA_HDATAp(x)   ((realtype*) NVEC_CUDA_CONTENT(x)->host_data->ptr)
47 #define NVEC_CUDA_DDATAp(x)   ((realtype*) NVEC_CUDA_CONTENT(x)->device_data->ptr)
48 #define NVEC_CUDA_HBUFFERp(x) ((realtype*) NVEC_CUDA_PRIVATE(x)->reduce_buffer_host->ptr)
49 #define NVEC_CUDA_DBUFFERp(x) ((realtype*) NVEC_CUDA_PRIVATE(x)->reduce_buffer_dev->ptr)
50 #define NVEC_CUDA_STREAM(x)   (NVEC_CUDA_CONTENT(x)->stream_exec_policy->stream())
51 
52 
53 /*
54  * Private structure definition
55  */
56 
/* Private (implementation-only) extension of the public vector content,
   reached through NVEC_CUDA_PRIVATE(x).  Holds the scratch buffers used by
   reduction kernels and the managed-memory flag. */
struct _N_PrivateVectorContent_Cuda
{
  booleantype     use_managed_mem;               /* indicates if the data pointers and buffer pointers are managed memory */
  size_t          reduce_buffer_allocated_bytes; /* current size of the reduction buffer */
  SUNMemory       reduce_buffer_dev;             /* device buffer used for reductions */
  SUNMemory       reduce_buffer_host;            /* host buffer used for reductions */
};

typedef struct _N_PrivateVectorContent_Cuda *N_PrivateVectorContent_Cuda;
66 
67 /*
68  * Private function definitions
69  */
70 
71 static int AllocateData(N_Vector v);
72 static int InitializeReductionBuffer(N_Vector v, const realtype value);
73 static void FreeReductionBuffer(N_Vector v);
74 static int CopyReductionBufferFromDevice(N_Vector v, size_t n = 1);
75 static int GetKernelParameters(N_Vector v, booleantype reduction, size_t& grid, size_t& block,
76                                size_t& shMemSize, cudaStream_t& stream, size_t n = 0);
77 static void PostKernelLaunch();
78 
79 /*
80  * Private functions needed for N_VMakeWithManagedAllocator_Cuda
81  * backwards compatibility.
82  */
83 
84 /* DEPRECATION NOTICE: The 4 functions below can be removed once
85    N_VMakeWithManagedAllocator_Cuda (deprecated) is removed in the
86    next major release. The UserAllocHelper struct can also be removed. */
87 
88 /* Struct that we use to pack up the user
89    provided alloc and free functions. */
typedef struct _UserAllocHelper
{
  void*  (*userallocfn)(size_t); /* user-supplied allocator: bytes -> pointer (NULL on failure) */
  void   (*userfreefn)(void*);   /* user-supplied deallocator for pointers from userallocfn */
} UserAllocHelper;
95 
/* SUNMemoryHelper alloc op that defers to the user-supplied allocator packed
   in helper->content.  The result is tagged SUNMEMTYPE_UVM (the deprecated
   N_VMakeWithManagedAllocator_Cuda path expects managed memory from the user
   allocator).  The mem_type argument is ignored.  Returns 0 on success, -1 on
   failure. */
static int UserAlloc(SUNMemoryHelper helper, SUNMemory* memptr,
                     size_t memsize, SUNMemoryType mem_type)
{
  UserAllocHelper* ua = (UserAllocHelper*) helper->content;
  SUNMemory mem = SUNMemoryNewEmpty();

  /* guard against failure of the SUNMemory wrapper allocation itself
     (previously dereferenced unconditionally) */
  if (mem == NULL)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in UserAlloc: SUNMemoryNewEmpty returned NULL\n");
    return(-1);
  }

  mem->type = SUNMEMTYPE_UVM;
  mem->ptr  = ua->userallocfn(memsize);
  mem->own  = SUNTRUE;  /* this SUNMemory owns the buffer; UserDealloc frees it */
  if (mem->ptr == NULL)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in UserAlloc: user provided alloc failed\n");
    free(mem);
    return(-1);
  }

  *memptr = mem;
  return(0);
}
115 
/* SUNMemoryHelper dealloc op that defers to the user-supplied free function.
   Always releases the SUNMemory wrapper; releases the underlying buffer only
   when this SUNMemory owns it.  Returns 0. */
static int UserDealloc(SUNMemoryHelper helper, SUNMemory mem)
{
  UserAllocHelper* funcs = (UserAllocHelper*) helper->content;

  if (mem->own)
  {
    funcs->userfreefn(mem->ptr);
    mem->ptr = NULL;
  }

  free(mem);
  return(0);
}
127 
/* Clone op for the user-allocator memory helper: duplicates the ops table and
   the packed user alloc/free function pointers.  Returns the new helper, or
   NULL if any allocation fails (previously allocation results were used
   unchecked). */
static SUNMemoryHelper HelperClone(SUNMemoryHelper helper)
{
  UserAllocHelper* uaclone;
  UserAllocHelper* ua = (UserAllocHelper*) helper->content;
  SUNMemoryHelper hclone = SUNMemoryHelper_NewEmpty();

  if (hclone == NULL)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in HelperClone: SUNMemoryHelper_NewEmpty returned NULL\n");
    return(NULL);
  }

  SUNMemoryHelper_CopyOps(helper, hclone);

  uaclone = (UserAllocHelper*) malloc(sizeof(UserAllocHelper));
  if (uaclone == NULL)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in HelperClone: malloc of helper content failed\n");
    /* content is still NULL here, so HelperDestroy's free(content) is a no-op */
    SUNMemoryHelper_Destroy(hclone);
    return(NULL);
  }

  uaclone->userallocfn = ua->userallocfn;
  uaclone->userfreefn  = ua->userfreefn;

  hclone->content = uaclone;

  return(hclone);
}
144 
/* Destroy op for the user-allocator memory helper: frees the packed
   user alloc/free functions, the ops table, and the helper struct itself.
   Returns 0. */
static int HelperDestroy(SUNMemoryHelper helper)
{
  free(helper->ops);
  helper->ops = NULL;
  free(helper->content);
  helper->content = NULL;
  free(helper);
  return(0);
}
153 
/* ----------------------------------------------------------------------------
 * Create a new, empty CUDA vector: the full operations table is attached and
 * the content/private structs are allocated and zero-initialized, but no data
 * array, memory helper, or execution policies are created.  Returns NULL if
 * any allocation fails (partially-built vectors are released via N_VDestroy,
 * which tolerates NULL content/priv fields).
 */
N_Vector N_VNewEmpty_Cuda()
{
  N_Vector v;

  /* Create vector */
  v = NULL;
  v = N_VNewEmpty();
  if (v == NULL) return(NULL);

  /* Attach operations */

  /* constructors, destructors, and utility operations */
  v->ops->nvgetvectorid           = N_VGetVectorID_Cuda;
  v->ops->nvclone                 = N_VClone_Cuda;
  v->ops->nvcloneempty            = N_VCloneEmpty_Cuda;
  v->ops->nvdestroy               = N_VDestroy_Cuda;
  v->ops->nvspace                 = N_VSpace_Cuda;
  v->ops->nvgetlength             = N_VGetLength_Cuda;
  v->ops->nvgetarraypointer       = N_VGetHostArrayPointer_Cuda;
  v->ops->nvgetdevicearraypointer = N_VGetDeviceArrayPointer_Cuda;
  v->ops->nvsetarraypointer       = N_VSetHostArrayPointer_Cuda;

  /* standard vector operations */
  v->ops->nvlinearsum    = N_VLinearSum_Cuda;
  v->ops->nvconst        = N_VConst_Cuda;
  v->ops->nvprod         = N_VProd_Cuda;
  v->ops->nvdiv          = N_VDiv_Cuda;
  v->ops->nvscale        = N_VScale_Cuda;
  v->ops->nvabs          = N_VAbs_Cuda;
  v->ops->nvinv          = N_VInv_Cuda;
  v->ops->nvaddconst     = N_VAddConst_Cuda;
  v->ops->nvdotprod      = N_VDotProd_Cuda;
  v->ops->nvmaxnorm      = N_VMaxNorm_Cuda;
  v->ops->nvmin          = N_VMin_Cuda;
  v->ops->nvl1norm       = N_VL1Norm_Cuda;
  v->ops->nvinvtest      = N_VInvTest_Cuda;
  v->ops->nvconstrmask   = N_VConstrMask_Cuda;
  v->ops->nvminquotient  = N_VMinQuotient_Cuda;
  v->ops->nvwrmsnormmask = N_VWrmsNormMask_Cuda;
  v->ops->nvwrmsnorm     = N_VWrmsNorm_Cuda;
  v->ops->nvwl2norm      = N_VWL2Norm_Cuda;
  v->ops->nvcompare      = N_VCompare_Cuda;

  /* fused and vector array operations are disabled (NULL) by default */

  /* local reduction operations: this is a single-node vector, so the
     "local" reductions are identical to the global ones */
  v->ops->nvdotprodlocal     = N_VDotProd_Cuda;
  v->ops->nvmaxnormlocal     = N_VMaxNorm_Cuda;
  v->ops->nvminlocal         = N_VMin_Cuda;
  v->ops->nvl1normlocal      = N_VL1Norm_Cuda;
  v->ops->nvinvtestlocal     = N_VInvTest_Cuda;
  v->ops->nvconstrmasklocal  = N_VConstrMask_Cuda;
  v->ops->nvminquotientlocal = N_VMinQuotient_Cuda;
  v->ops->nvwsqrsumlocal     = N_VWSqrSumLocal_Cuda;
  v->ops->nvwsqrsummasklocal = N_VWSqrSumMaskLocal_Cuda;

  /* XBraid interface operations */
  v->ops->nvbufsize   = N_VBufSize_Cuda;
  v->ops->nvbufpack   = N_VBufPack_Cuda;
  v->ops->nvbufunpack = N_VBufUnpack_Cuda;

  /* print operation for debugging */
  v->ops->nvprint     = N_VPrint_Cuda;
  v->ops->nvprintfile = N_VPrintFile_Cuda;

  /* Create content */

  v->content = (N_VectorContent_Cuda) malloc(sizeof(_N_VectorContent_Cuda));
  if (v->content == NULL)
  {
    N_VDestroy(v);
    return(NULL);
  }

  /* private content hangs off the public content's priv pointer */
  NVEC_CUDA_CONTENT(v)->priv = malloc(sizeof(_N_PrivateVectorContent_Cuda));
  if (NVEC_CUDA_CONTENT(v)->priv == NULL)
  {
    N_VDestroy(v);
    return(NULL);
  }

  /* everything starts empty/unowned; concrete constructors fill these in */
  NVEC_CUDA_CONTENT(v)->length                        = 0;
  NVEC_CUDA_CONTENT(v)->host_data                     = NULL;
  NVEC_CUDA_CONTENT(v)->device_data                   = NULL;
  NVEC_CUDA_CONTENT(v)->stream_exec_policy            = NULL;
  NVEC_CUDA_CONTENT(v)->reduce_exec_policy            = NULL;
  NVEC_CUDA_CONTENT(v)->mem_helper                    = NULL;
  NVEC_CUDA_CONTENT(v)->own_helper                    = SUNFALSE;
  NVEC_CUDA_CONTENT(v)->own_exec                      = SUNTRUE;
  NVEC_CUDA_PRIVATE(v)->use_managed_mem               = SUNFALSE;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_dev             = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_host            = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_allocated_bytes = 0;

  return(v);
}
250 
/* ----------------------------------------------------------------------------
 * Create a CUDA vector of the given length with separate host and device
 * data arrays (unmanaged memory), using the default CUDA memory helper and
 * default 256-thread execution policies.  Returns NULL on allocation failure.
 */
N_Vector N_VNew_Cuda(sunindextype length)
{
  N_Vector v;

  v = NULL;
  v = N_VNewEmpty_Cuda();
  if (v == NULL) return(NULL);

  NVEC_CUDA_CONTENT(v)->length                        = length;
  NVEC_CUDA_CONTENT(v)->host_data                     = NULL;
  NVEC_CUDA_CONTENT(v)->device_data                   = NULL;
  NVEC_CUDA_CONTENT(v)->mem_helper                    = SUNMemoryHelper_Cuda();
  NVEC_CUDA_CONTENT(v)->stream_exec_policy            = new CudaThreadDirectExecPolicy(256);
  NVEC_CUDA_CONTENT(v)->reduce_exec_policy            = new CudaBlockReduceExecPolicy(256);
  /* this vector owns (and must destroy) both the helper and the policies */
  NVEC_CUDA_CONTENT(v)->own_helper                    = SUNTRUE;
  NVEC_CUDA_CONTENT(v)->own_exec                      = SUNTRUE;
  NVEC_CUDA_PRIVATE(v)->use_managed_mem               = SUNFALSE;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_dev             = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_host            = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_allocated_bytes = 0;

  if (NVEC_CUDA_MEMHELP(v) == NULL)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VNew_Cuda: memory helper is NULL\n");
    N_VDestroy(v);
    return(NULL);
  }

  /* allocate the host and device (or managed) data arrays */
  if (AllocateData(v))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VNew_Cuda: AllocateData returned nonzero\n");
    N_VDestroy(v);
    return(NULL);
  }

  return(v);
}
288 
/* ----------------------------------------------------------------------------
 * Create a CUDA vector of the given length using a caller-supplied
 * SUNMemoryHelper for all data allocations.  The helper is borrowed, not
 * owned (own_helper = SUNFALSE), so the caller must keep it alive for the
 * vector's lifetime and destroy it afterwards.  use_managed_mem tells the
 * vector whether the helper hands out managed (UVM) memory.  Returns NULL on
 * invalid input or allocation failure.
 */
N_Vector N_VNewWithMemHelp_Cuda(sunindextype length, booleantype use_managed_mem, SUNMemoryHelper helper)
{
  N_Vector v;

  if (helper == NULL)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VNewWithMemHelp_Cuda: helper is NULL\n");
    return(NULL);
  }

  if (!SUNMemoryHelper_ImplementsRequiredOps(helper))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VNewWithMemHelp_Cuda: helper doesn't implement all required ops\n");
    return(NULL);
  }

  v = NULL;
  v = N_VNewEmpty_Cuda();
  if (v == NULL) return(NULL);

  NVEC_CUDA_CONTENT(v)->length                        = length;
  NVEC_CUDA_CONTENT(v)->host_data                     = NULL;
  NVEC_CUDA_CONTENT(v)->device_data                   = NULL;
  NVEC_CUDA_CONTENT(v)->mem_helper                    = helper;
  NVEC_CUDA_CONTENT(v)->stream_exec_policy            = new CudaThreadDirectExecPolicy(256);
  NVEC_CUDA_CONTENT(v)->reduce_exec_policy            = new CudaBlockReduceExecPolicy(256);
  NVEC_CUDA_CONTENT(v)->own_helper                    = SUNFALSE;
  NVEC_CUDA_CONTENT(v)->own_exec                      = SUNTRUE;
  NVEC_CUDA_PRIVATE(v)->use_managed_mem               = use_managed_mem;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_dev             = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_host            = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_allocated_bytes = 0;

  if (AllocateData(v))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VNewWithMemHelp_Cuda: AllocateData returned nonzero\n");
    N_VDestroy(v);
    return(NULL);
  }

  return(v);
}
331 
/* ----------------------------------------------------------------------------
 * Create a CUDA vector of the given length backed by managed (UVM) memory,
 * using the default CUDA memory helper and default 256-thread execution
 * policies.  Returns NULL on allocation failure.
 */
N_Vector N_VNewManaged_Cuda(sunindextype length)
{
  N_Vector v;

  v = NULL;
  v = N_VNewEmpty_Cuda();
  if (v == NULL) return(NULL);

  NVEC_CUDA_CONTENT(v)->length                        = length;
  NVEC_CUDA_CONTENT(v)->host_data                     = NULL;
  NVEC_CUDA_CONTENT(v)->device_data                   = NULL;
  NVEC_CUDA_CONTENT(v)->stream_exec_policy            = new CudaThreadDirectExecPolicy(256);
  NVEC_CUDA_CONTENT(v)->reduce_exec_policy            = new CudaBlockReduceExecPolicy(256);
  NVEC_CUDA_CONTENT(v)->mem_helper                    = SUNMemoryHelper_Cuda();
  NVEC_CUDA_CONTENT(v)->own_helper                    = SUNTRUE;
  NVEC_CUDA_CONTENT(v)->own_exec                      = SUNTRUE;
  /* managed memory: AllocateData will create a single UVM allocation */
  NVEC_CUDA_PRIVATE(v)->use_managed_mem               = SUNTRUE;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_dev             = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_host            = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_allocated_bytes = 0;

  if (NVEC_CUDA_MEMHELP(v) == NULL)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VNewManaged_Cuda: memory helper is NULL\n");
    N_VDestroy(v);
    return(NULL);
  }

  if (AllocateData(v))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VNewManaged_Cuda: AllocateData returned nonzero\n");
    N_VDestroy(v);
    return(NULL);
  }

  return(v);
}
369 
/* ----------------------------------------------------------------------------
 * Create a CUDA vector that wraps caller-supplied host and device data
 * arrays (both must be non-NULL and at least `length` realtypes).  The
 * wrapped arrays are NOT owned by the vector; the caller remains responsible
 * for freeing them.  Returns NULL on invalid input or allocation failure.
 */
N_Vector N_VMake_Cuda(sunindextype length, realtype *h_vdata, realtype *d_vdata)
{
  N_Vector v;

  if (h_vdata == NULL || d_vdata == NULL) return(NULL);

  v = NULL;
  v = N_VNewEmpty_Cuda();
  if (v == NULL) return(NULL);

  NVEC_CUDA_CONTENT(v)->length                        = length;
  /* wrap (do not copy or take ownership of) the user arrays */
  NVEC_CUDA_CONTENT(v)->host_data                     = SUNMemoryHelper_Wrap(h_vdata, SUNMEMTYPE_HOST);
  NVEC_CUDA_CONTENT(v)->device_data                   = SUNMemoryHelper_Wrap(d_vdata, SUNMEMTYPE_DEVICE);
  NVEC_CUDA_CONTENT(v)->stream_exec_policy            = new CudaThreadDirectExecPolicy(256);
  NVEC_CUDA_CONTENT(v)->reduce_exec_policy            = new CudaBlockReduceExecPolicy(256);
  NVEC_CUDA_CONTENT(v)->mem_helper                    = SUNMemoryHelper_Cuda();
  NVEC_CUDA_CONTENT(v)->own_helper                    = SUNTRUE;
  NVEC_CUDA_CONTENT(v)->own_exec                      = SUNTRUE;
  NVEC_CUDA_PRIVATE(v)->use_managed_mem               = SUNFALSE;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_dev             = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_host            = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_allocated_bytes = 0;

  if (NVEC_CUDA_MEMHELP(v) == NULL)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VMake_Cuda: memory helper is NULL\n");
    N_VDestroy(v);
    return(NULL);
  }

  if (NVEC_CUDA_CONTENT(v)->device_data == NULL ||
      NVEC_CUDA_CONTENT(v)->host_data == NULL)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VMake_Cuda: SUNMemoryHelper_Wrap returned NULL\n");
    N_VDestroy(v);
    return(NULL);
  }

  return(v);
}
410 
/* ----------------------------------------------------------------------------
 * Create a CUDA vector that wraps a caller-supplied managed (UVM) data array
 * of at least `length` realtypes.  The host view wraps the array (unowned)
 * and the device view aliases the same allocation.  The caller remains
 * responsible for freeing the array.  Returns NULL on invalid input or
 * allocation failure.
 */
N_Vector N_VMakeManaged_Cuda(sunindextype length, realtype *vdata)
{
  N_Vector v;

  if (vdata == NULL) return(NULL);

  v = NULL;
  v = N_VNewEmpty_Cuda();
  if (v == NULL) return(NULL);

  NVEC_CUDA_CONTENT(v)->length                        = length;
  /* host and device views refer to the same managed allocation */
  NVEC_CUDA_CONTENT(v)->host_data                     = SUNMemoryHelper_Wrap(vdata, SUNMEMTYPE_UVM);
  NVEC_CUDA_CONTENT(v)->device_data                   = SUNMemoryHelper_Alias(NVEC_CUDA_CONTENT(v)->host_data);
  NVEC_CUDA_CONTENT(v)->stream_exec_policy            = new CudaThreadDirectExecPolicy(256);
  NVEC_CUDA_CONTENT(v)->reduce_exec_policy            = new CudaBlockReduceExecPolicy(256);
  NVEC_CUDA_CONTENT(v)->mem_helper                    = SUNMemoryHelper_Cuda();
  NVEC_CUDA_CONTENT(v)->own_helper                    = SUNTRUE;
  NVEC_CUDA_CONTENT(v)->own_exec                      = SUNTRUE;
  NVEC_CUDA_PRIVATE(v)->use_managed_mem               = SUNTRUE;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_dev             = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_host            = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_allocated_bytes = 0;

  if (NVEC_CUDA_MEMHELP(v) == NULL)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VMakeManaged_Cuda: memory helper is NULL\n");
    N_VDestroy(v);
    return(NULL);
  }

  if (NVEC_CUDA_CONTENT(v)->device_data == NULL ||
      NVEC_CUDA_CONTENT(v)->host_data == NULL)
  {
    /* device_data comes from SUNMemoryHelper_Alias, not _Wrap, so name both
       possible failure points (the old message blamed _Wrap only) */
    SUNDIALS_DEBUG_PRINT("ERROR in N_VMakeManaged_Cuda: SUNMemoryHelper_Wrap or SUNMemoryHelper_Alias returned NULL\n");
    N_VDestroy(v);
    return(NULL);
  }

  return(v);
}
451 
/* ----------------------------------------------------------------------------
 * DEPRECATED: create a managed-memory CUDA vector whose data is obtained from
 * user-supplied alloc/free functions.  The functions are packed into a
 * UserAllocHelper installed as the content of the default memory helper, and
 * the helper's alloc/dealloc/clone/destroy ops are redirected to the
 * UserAlloc/UserDealloc/HelperClone/HelperDestroy wrappers above.  Returns
 * NULL on allocation failure.
 */
N_Vector N_VMakeWithManagedAllocator_Cuda(sunindextype length,
                                          void* (*allocfn)(size_t),
                                          void (*freefn)(void*))
{
  UserAllocHelper* ua;
  N_Vector v;

  v = NULL;
  v = N_VNewEmpty_Cuda();
  if (v == NULL) return(NULL);

  NVEC_CUDA_CONTENT(v)->length                        = length;
  NVEC_CUDA_CONTENT(v)->host_data                     = NULL;
  NVEC_CUDA_CONTENT(v)->device_data                   = NULL;
  NVEC_CUDA_CONTENT(v)->stream_exec_policy            = new CudaThreadDirectExecPolicy(256);
  NVEC_CUDA_CONTENT(v)->reduce_exec_policy            = new CudaBlockReduceExecPolicy(256);
  NVEC_CUDA_CONTENT(v)->mem_helper                    = SUNMemoryHelper_Cuda();
  NVEC_CUDA_CONTENT(v)->own_helper                    = SUNTRUE;
  NVEC_CUDA_CONTENT(v)->own_exec                      = SUNTRUE;
  NVEC_CUDA_PRIVATE(v)->use_managed_mem               = SUNTRUE;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_dev             = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_host            = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_allocated_bytes = 0;

  if (NVEC_CUDA_MEMHELP(v) == NULL)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VMakeWithManagedAllocator_Cuda: memory helper is NULL\n");
    N_VDestroy(v);
    return(NULL);
  }

  ua = (UserAllocHelper*) malloc(sizeof(UserAllocHelper));
  /* guard against malloc failure (previously dereferenced unchecked) */
  if (ua == NULL)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VMakeWithManagedAllocator_Cuda: malloc returned NULL\n");
    N_VDestroy(v);
    return(NULL);
  }
  ua->userallocfn                    = allocfn;
  ua->userfreefn                     = freefn;
  NVEC_CUDA_MEMHELP(v)->content      = (void*) ua;
  NVEC_CUDA_MEMHELP(v)->ops->alloc   = UserAlloc;
  NVEC_CUDA_MEMHELP(v)->ops->dealloc = UserDealloc;
  NVEC_CUDA_MEMHELP(v)->ops->clone   = HelperClone;
  NVEC_CUDA_MEMHELP(v)->ops->destroy = HelperDestroy;

  if (AllocateData(v))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VMakeWithManagedAllocator_Cuda: AllocateData returned nonzero\n");
    N_VDestroy(v);
    return(NULL);
  }

  return(v);
}
501 
502 /* ----------------------------------------------------------------------------
503  * Set pointer to the raw host data. Does not free the existing pointer.
504  */
505 
/* ----------------------------------------------------------------------------
 * Set pointer to the raw host data. Does not free the existing pointer.
 * For managed-memory vectors the device view is updated to the same pointer.
 */
void N_VSetHostArrayPointer_Cuda(realtype* h_vdata, N_Vector v)
{
  N_VectorContent_Cuda vc = NVEC_CUDA_CONTENT(v);

  if (vc->host_data)
  {
    /* re-point the existing SUNMemory record(s) */
    vc->host_data->ptr = (void*) h_vdata;
    if (N_VIsManagedMemory_Cuda(v)) vc->device_data->ptr = (void*) h_vdata;
  }
  else if (N_VIsManagedMemory_Cuda(v))
  {
    /* first attachment: wrap as UVM and alias for the device view */
    vc->host_data   = SUNMemoryHelper_Wrap((void*) h_vdata, SUNMEMTYPE_UVM);
    vc->device_data = SUNMemoryHelper_Alias(vc->host_data);
  }
  else
  {
    /* first attachment of an unmanaged host array */
    vc->host_data = SUNMemoryHelper_Wrap((void*) h_vdata, SUNMEMTYPE_HOST);
  }
}
533 
534 /* ----------------------------------------------------------------------------
535  * Set pointer to the raw device data
536  */
537 
/* ----------------------------------------------------------------------------
 * Set pointer to the raw device data.  For managed-memory vectors the host
 * view is updated to the same pointer.
 */
void N_VSetDeviceArrayPointer_Cuda(realtype* d_vdata, N_Vector v)
{
  N_VectorContent_Cuda vc = NVEC_CUDA_CONTENT(v);

  if (vc->device_data)
  {
    /* re-point the existing SUNMemory record(s) */
    vc->device_data->ptr = (void*) d_vdata;
    if (N_VIsManagedMemory_Cuda(v)) vc->host_data->ptr = (void*) d_vdata;
  }
  else if (N_VIsManagedMemory_Cuda(v))
  {
    /* first attachment: wrap as UVM and alias for the host view */
    vc->device_data = SUNMemoryHelper_Wrap((void*) d_vdata, SUNMEMTYPE_UVM);
    vc->host_data   = SUNMemoryHelper_Alias(vc->device_data);
  }
  else
  {
    /* first attachment of an unmanaged device array */
    vc->device_data = SUNMemoryHelper_Wrap((void*) d_vdata, SUNMEMTYPE_DEVICE);
  }
}
565 
566 /* ----------------------------------------------------------------------------
567  * Return a flag indicating if the memory for the vector data is managed
568  */
/* Return SUNTRUE if the vector's data (and reduction buffers) live in
   managed (UVM) memory, SUNFALSE otherwise. */
booleantype N_VIsManagedMemory_Cuda(N_Vector x)
{
  const N_PrivateVectorContent_Cuda priv = NVEC_CUDA_PRIVATE(x);
  return(priv->use_managed_mem);
}
573 
/* Attach caller-owned execution policies for streaming and reduction
   kernels.  Any policies the vector currently owns are deleted first.
   The caller retains ownership of the new policies (own_exec is cleared)
   and must keep them alive for the vector's lifetime.  Returns 0 on
   success, -1 if any argument is NULL. */
int N_VSetKernelExecPolicy_Cuda(N_Vector x,
                                SUNCudaExecPolicy* stream_exec_policy,
                                SUNCudaExecPolicy* reduce_exec_policy)
{
  /* all three inputs are required */
  if (x == NULL) return(-1);
  if (stream_exec_policy == NULL || reduce_exec_policy == NULL) return(-1);

  N_VectorContent_Cuda vc = NVEC_CUDA_CONTENT(x);

  /* release any policies this vector owns before replacing them */
  if (vc->own_exec)
  {
    delete vc->stream_exec_policy;
    delete vc->reduce_exec_policy;
  }

  vc->stream_exec_policy = stream_exec_policy;
  vc->reduce_exec_policy = reduce_exec_policy;
  vc->own_exec           = SUNFALSE; /* caller keeps ownership */

  return(0);
}
593 
594 /*
595  * ----------------------------------------------------------------------------
596  * DEPRECATED: will be removed in SUNDIALS v6.
597  * Sets the cudaStream_t to use for execution of the CUDA kernels.
598  */
/* DEPRECATED: will be removed in SUNDIALS v6.
   Sets the cudaStream_t used by the vector's kernels by rebuilding both
   execution policies on the given stream (block sizes / grid size are
   carried over from the current policies). */
void N_VSetCudaStream_Cuda(N_Vector x, cudaStream_t *stream)
{
  const CudaExecPolicy* xs = NVEC_CUDA_CONTENT(x)->stream_exec_policy;
  const CudaExecPolicy* xr = NVEC_CUDA_CONTENT(x)->reduce_exec_policy;
  CudaThreadDirectExecPolicy* s =
    new CudaThreadDirectExecPolicy(xs->blockSize(), *stream);
  CudaBlockReduceExecPolicy* r =
    new CudaBlockReduceExecPolicy(xr->blockSize(), xr->gridSize(), *stream);
  N_VSetKernelExecPolicy_Cuda(x, s, r);
  /* N_VSetKernelExecPolicy_Cuda marks the policies caller-owned
     (own_exec = SUNFALSE); since we created s and r here, hand
     ownership back to the vector so N_VDestroy deletes them */
  NVEC_CUDA_CONTENT(x)->own_exec = SUNTRUE;
}
610 
611 /* ----------------------------------------------------------------------------
612  * Copy vector data to the device
613  */
614 
/* ----------------------------------------------------------------------------
 * Copy vector data host -> device on the vector's stream, then block the
 * host until that stream's copy completes.
 */
void N_VCopyToDevice_Cuda(N_Vector x)
{
  const int copy_fail =
    SUNMemoryHelper_CopyAsync(NVEC_CUDA_MEMHELP(x),
                              NVEC_CUDA_CONTENT(x)->device_data,
                              NVEC_CUDA_CONTENT(x)->host_data,
                              NVEC_CUDA_MEMSIZE(x),
                              (void*) NVEC_CUDA_STREAM(x));

  if (copy_fail)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VCopyToDevice_Cuda: SUNMemoryHelper_CopyAsync returned nonzero\n");
  }

  /* we synchronize with respect to the host, but only in this stream */
  SUNDIALS_CUDA_VERIFY(cudaStreamSynchronize(*NVEC_CUDA_STREAM(x)));
}
633 
634 /* ----------------------------------------------------------------------------
635  * Copy vector data from the device to the host
636  */
637 
/* ----------------------------------------------------------------------------
 * Copy vector data device -> host on the vector's stream, then block the
 * host until that stream's copy completes.
 */
void N_VCopyFromDevice_Cuda(N_Vector x)
{
  const int copy_fail =
    SUNMemoryHelper_CopyAsync(NVEC_CUDA_MEMHELP(x),
                              NVEC_CUDA_CONTENT(x)->host_data,
                              NVEC_CUDA_CONTENT(x)->device_data,
                              NVEC_CUDA_MEMSIZE(x),
                              (void*) NVEC_CUDA_STREAM(x));

  if (copy_fail)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VCopyFromDevice_Cuda: SUNMemoryHelper_CopyAsync returned nonzero\n");
  }

  /* we synchronize with respect to the host, but only in this stream */
  SUNDIALS_CUDA_VERIFY(cudaStreamSynchronize(*NVEC_CUDA_STREAM(x)));
}
656 
657 /* ----------------------------------------------------------------------------
658  * Function to print the a CUDA-based vector to stdout
659  */
660 
/* Print the vector to stdout (delegates to the FILE* variant). */
void N_VPrint_Cuda(N_Vector x)
{
  N_VPrintFile_Cuda(x, stdout);
}
665 
666 /* ----------------------------------------------------------------------------
667  * Function to print the a CUDA-based vector to outfile
668  */
669 
/* Print the vector's HOST data array to outfile, one element per line,
   with a precision-appropriate format.
   NOTE(review): only the host mirror (NVEC_CUDA_HDATAp) is read here;
   presumably the caller must run N_VCopyFromDevice_Cuda first to see
   current device values -- confirm against SUNDIALS usage. */
void N_VPrintFile_Cuda(N_Vector x, FILE *outfile)
{
  sunindextype i;

  for (i = 0; i < NVEC_CUDA_CONTENT(x)->length; i++) {
#if defined(SUNDIALS_EXTENDED_PRECISION)
    fprintf(outfile, "%35.32Lg\n", NVEC_CUDA_HDATAp(x)[i]);
#elif defined(SUNDIALS_DOUBLE_PRECISION)
    fprintf(outfile, "%19.16g\n", NVEC_CUDA_HDATAp(x)[i]);
#else
    fprintf(outfile, "%11.8g\n", NVEC_CUDA_HDATAp(x)[i]);
#endif
  }
  fprintf(outfile, "\n");

  return;
}
687 
688 
689 /*
690  * -----------------------------------------------------------------
691  * implementation of vector operations
692  * -----------------------------------------------------------------
693  */
694 
/* Create a clone of w with the same length, operations, and managed-memory
   setting, but with no data arrays, memory helper, or execution policies
   attached.  Returns NULL on failure. */
N_Vector N_VCloneEmpty_Cuda(N_Vector w)
{
  N_Vector v;

  if (w == NULL) return(NULL);

  /* Create vector */
  v = NULL;
  v = N_VNewEmpty_Cuda();
  if (v == NULL) return(NULL);

  /* Attach operations (copied from w so any user overrides carry over) */
  if (N_VCopyOps(w, v)) { N_VDestroy(v); return(NULL); }

  /* Set content */
  NVEC_CUDA_CONTENT(v)->length                        = NVEC_CUDA_CONTENT(w)->length;
  NVEC_CUDA_CONTENT(v)->host_data                     = NULL;
  NVEC_CUDA_CONTENT(v)->device_data                   = NULL;
  NVEC_CUDA_CONTENT(v)->mem_helper                    = NULL;
  NVEC_CUDA_CONTENT(v)->own_exec                      = SUNTRUE;
  NVEC_CUDA_PRIVATE(v)->use_managed_mem               = NVEC_CUDA_PRIVATE(w)->use_managed_mem;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_dev             = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_host            = NULL;
  NVEC_CUDA_PRIVATE(v)->reduce_buffer_allocated_bytes = 0;

  return(v);
}
722 
/* Create a full clone of w: clones w's memory helper and execution policies
   (the clone owns its copies) and allocates fresh data arrays of the same
   length.  Data values are NOT copied.  Returns NULL on failure. */
N_Vector N_VClone_Cuda(N_Vector w)
{
  N_Vector v;

  v = NULL;
  v = N_VCloneEmpty_Cuda(w);
  if (v == NULL) return(NULL);

  /* the clone owns its helper and policy copies, so N_VDestroy can free
     them even if a later step below fails */
  NVEC_CUDA_MEMHELP(v) = SUNMemoryHelper_Clone(NVEC_CUDA_MEMHELP(w));
  NVEC_CUDA_CONTENT(v)->own_helper = SUNTRUE;
  NVEC_CUDA_CONTENT(v)->stream_exec_policy = NVEC_CUDA_CONTENT(w)->stream_exec_policy->clone();
  NVEC_CUDA_CONTENT(v)->reduce_exec_policy = NVEC_CUDA_CONTENT(w)->reduce_exec_policy->clone();

  if (NVEC_CUDA_MEMHELP(v) == NULL)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VClone_Cuda: SUNMemoryHelper_Clone returned NULL\n");
    N_VDestroy(v);
    return(NULL);
  }

  if (AllocateData(v))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VClone_Cuda: AllocateData returned nonzero\n");
    N_VDestroy(v);
    return(NULL);
  }

  return(v);
}
752 
/* Destroy a CUDA vector and all resources it owns.  Safe to call on a
   partially-constructed vector (NULL ops/content/priv are tolerated).
   Teardown order matters: the reduction buffers (in priv) are released
   while the memory helper is still alive, since FreeReductionBuffer needs
   it; the helper itself is destroyed last. */
void N_VDestroy_Cuda(N_Vector v)
{
  N_VectorContent_Cuda vc;
  N_PrivateVectorContent_Cuda vcp;

  if (v == NULL) return;

  /* free ops structure */
  if (v->ops != NULL)
  {
    free(v->ops);
    v->ops = NULL;
  }

  /* extract content */
  vc = NVEC_CUDA_CONTENT(v);
  if (vc == NULL)
  {
    free(v);
    v = NULL;
    return;
  }

  /* free private content */
  vcp = (N_PrivateVectorContent_Cuda) vc->priv;
  if (vcp != NULL)
  {
    /* free items in private content */
    FreeReductionBuffer(v);
    free(vcp);
    vc->priv = NULL;
  }

  /* free items in content: exec policies only if this vector owns them */
  if (vc->own_exec)
  {
    delete vc->stream_exec_policy;
    vc->stream_exec_policy = NULL;
    delete vc->reduce_exec_policy;
    vc->reduce_exec_policy = NULL;
  }

  if (NVEC_CUDA_MEMHELP(v))
  {
    SUNMemoryHelper_Dealloc(NVEC_CUDA_MEMHELP(v), vc->host_data);
    vc->host_data = NULL;
    SUNMemoryHelper_Dealloc(NVEC_CUDA_MEMHELP(v), vc->device_data);
    vc->device_data = NULL;
    if (vc->own_helper) SUNMemoryHelper_Destroy(vc->mem_helper);
    vc->mem_helper = NULL;
  }

  /* free content struct */
  free(vc);

  /* free vector */
  free(v);

  return;
}
813 
/* Report the vector's storage requirements: lrw = realtype words (the
   vector length), liw = integer words (set to 2 here; presumably counts
   the integer bookkeeping fields -- see the N_Vector documentation). */
void N_VSpace_Cuda(N_Vector X, sunindextype *lrw, sunindextype *liw)
{
  *lrw = NVEC_CUDA_CONTENT(X)->length;
  *liw = 2;
}
819 
/* X[i] = a for all i (device-side streaming kernel). */
void N_VConst_Cuda(realtype a, N_Vector X)
{
  size_t nblk, nthd, smem;
  cudaStream_t cstream;

  /* launch configuration from X's streaming execution policy; the return
     flag is ignored, matching the other streaming wrappers in this file */
  GetKernelParameters(X, false, nblk, nthd, smem, cstream);
  setConstKernel<<<nblk, nthd, smem, cstream>>>
    (a, NVEC_CUDA_DDATAp(X), NVEC_CUDA_CONTENT(X)->length);
  PostKernelLaunch();
}
834 
/* Z = a*X + b*Y (device-side streaming kernel). */
void N_VLinearSum_Cuda(realtype a, N_Vector X, realtype b, N_Vector Y, N_Vector Z)
{
  size_t nblk, nthd, smem;
  cudaStream_t cstream;

  /* launch configuration comes from X's streaming execution policy */
  GetKernelParameters(X, false, nblk, nthd, smem, cstream);
  linearSumKernel<<<nblk, nthd, smem, cstream>>>
    (a, NVEC_CUDA_DDATAp(X), b, NVEC_CUDA_DDATAp(Y), NVEC_CUDA_DDATAp(Z),
     NVEC_CUDA_CONTENT(X)->length);
  PostKernelLaunch();
}
852 
/* Z[i] = X[i] * Y[i] (device-side streaming kernel). */
void N_VProd_Cuda(N_Vector X, N_Vector Y, N_Vector Z)
{
  size_t nblk, nthd, smem;
  cudaStream_t cstream;

  GetKernelParameters(X, false, nblk, nthd, smem, cstream);
  prodKernel<<<nblk, nthd, smem, cstream>>>
    (NVEC_CUDA_DDATAp(X), NVEC_CUDA_DDATAp(Y), NVEC_CUDA_DDATAp(Z),
     NVEC_CUDA_CONTENT(X)->length);
  PostKernelLaunch();
}
868 
/* Element-wise quotient: Z = X ./ Y (device-side kernel). */
void N_VDiv_Cuda(N_Vector X, N_Vector Y, N_Vector Z)
{
  size_t nblk, nthd, shmem;
  cudaStream_t cstream;

  GetKernelParameters(X, false, nblk, nthd, shmem, cstream);
  divKernel<<<nblk, nthd, shmem, cstream>>>
    (NVEC_CUDA_DDATAp(X), NVEC_CUDA_DDATAp(Y), NVEC_CUDA_DDATAp(Z),
     NVEC_CUDA_CONTENT(X)->length);
  PostKernelLaunch();
}
884 
/* Scales a vector: Z = a*X (device-side kernel). */
void N_VScale_Cuda(realtype a, N_Vector X, N_Vector Z)
{
  size_t nblk, nthd, shmem;
  cudaStream_t cstream;

  GetKernelParameters(X, false, nblk, nthd, shmem, cstream);
  scaleKernel<<<nblk, nthd, shmem, cstream>>>
    (a, NVEC_CUDA_DDATAp(X), NVEC_CUDA_DDATAp(Z),
     NVEC_CUDA_CONTENT(X)->length);
  PostKernelLaunch();
}
900 
/* Element-wise absolute value: Z = |X| (device-side kernel). */
void N_VAbs_Cuda(N_Vector X, N_Vector Z)
{
  size_t nblk, nthd, shmem;
  cudaStream_t cstream;

  GetKernelParameters(X, false, nblk, nthd, shmem, cstream);
  absKernel<<<nblk, nthd, shmem, cstream>>>
    (NVEC_CUDA_DDATAp(X), NVEC_CUDA_DDATAp(Z),
     NVEC_CUDA_CONTENT(X)->length);
  PostKernelLaunch();
}
915 
/* Element-wise inverse: Z = 1 ./ X (device-side kernel). */
void N_VInv_Cuda(N_Vector X, N_Vector Z)
{
  size_t nblk, nthd, shmem;
  cudaStream_t cstream;

  GetKernelParameters(X, false, nblk, nthd, shmem, cstream);
  invKernel<<<nblk, nthd, shmem, cstream>>>
    (NVEC_CUDA_DDATAp(X), NVEC_CUDA_DDATAp(Z),
     NVEC_CUDA_CONTENT(X)->length);
  PostKernelLaunch();
}
930 
/* Adds a scalar to each element: Z = X + b (device-side kernel). */
void N_VAddConst_Cuda(N_Vector X, realtype b, N_Vector Z)
{
  size_t nblk, nthd, shmem;
  cudaStream_t cstream;

  GetKernelParameters(X, false, nblk, nthd, shmem, cstream);
  addConstKernel<<<nblk, nthd, shmem, cstream>>>
    (b, NVEC_CUDA_DDATAp(X), NVEC_CUDA_DDATAp(Z),
     NVEC_CUDA_CONTENT(X)->length);
  PostKernelLaunch();
}
946 
/* Dot product of X and Y, reduced on the device and copied back to host. */
realtype N_VDotProd_Cuda(N_Vector X, N_Vector Y)
{
  size_t nblk, nthd, shmem;
  cudaStream_t cstream;

  // Zero the device reduction workspace before accumulating.
  if (InitializeReductionBuffer(X, ZERO))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VDotProd_Cuda: InitializeReductionBuffer returned nonzero\n");
  }

  GetKernelParameters(X, true, nblk, nthd, shmem, cstream);
  dotProdKernel<realtype, sunindextype><<<nblk, nthd, shmem, cstream>>>
    (NVEC_CUDA_DDATAp(X), NVEC_CUDA_DDATAp(Y), NVEC_CUDA_DBUFFERp(X),
     NVEC_CUDA_CONTENT(X)->length);
  PostKernelLaunch();

  // Bring the reduced value back to the host.
  CopyReductionBufferFromDevice(X);
  return NVEC_CUDA_HBUFFERp(X)[0];
}
973 
/* Max norm of X: device-side max reduction, result copied to host. */
realtype N_VMaxNorm_Cuda(N_Vector X)
{
  size_t nblk, nthd, shmem;
  cudaStream_t cstream;

  if (InitializeReductionBuffer(X, ZERO))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VMaxNorm_Cuda: InitializeReductionBuffer returned nonzero\n");
  }

  GetKernelParameters(X, true, nblk, nthd, shmem, cstream);
  maxNormKernel<realtype, sunindextype><<<nblk, nthd, shmem, cstream>>>
    (NVEC_CUDA_DDATAp(X), NVEC_CUDA_DBUFFERp(X),
     NVEC_CUDA_CONTENT(X)->length);
  PostKernelLaunch();

  // Retrieve the reduced value from the device buffer.
  CopyReductionBufferFromDevice(X);
  return NVEC_CUDA_HBUFFERp(X)[0];
}
999 
/* Weighted squared sum, sum_i (X[i]*W[i])^2, reduced on the device. */
realtype N_VWSqrSumLocal_Cuda(N_Vector X, N_Vector W)
{
  size_t nblk, nthd, shmem;
  cudaStream_t cstream;

  if (InitializeReductionBuffer(X, ZERO))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VWSqrSumLocal_Cuda: InitializeReductionBuffer returned nonzero\n");
  }

  GetKernelParameters(X, true, nblk, nthd, shmem, cstream);
  wL2NormSquareKernel<realtype, sunindextype><<<nblk, nthd, shmem, cstream>>>
    (NVEC_CUDA_DDATAp(X), NVEC_CUDA_DDATAp(W), NVEC_CUDA_DBUFFERp(X),
     NVEC_CUDA_CONTENT(X)->length);
  PostKernelLaunch();

  // Retrieve the reduced value from the device buffer.
  CopyReductionBufferFromDevice(X);
  return NVEC_CUDA_HBUFFERp(X)[0];
}
1026 
/* Weighted RMS norm: sqrt(N_VWSqrSumLocal(X,W) / length). */
realtype N_VWrmsNorm_Cuda(N_Vector X, N_Vector W)
{
  const realtype sqrsum = N_VWSqrSumLocal_Cuda(X, W);
  const sunindextype n  = NVEC_CUDA_CONTENT(X)->length;
  return std::sqrt(sqrsum/n);
}
1032 
/* Masked weighted squared sum: like N_VWSqrSumLocal_Cuda but only over
 * entries selected by the mask vector Id. */
realtype N_VWSqrSumMaskLocal_Cuda(N_Vector X, N_Vector W, N_Vector Id)
{
  size_t nblk, nthd, shmem;
  cudaStream_t cstream;

  if (InitializeReductionBuffer(X, ZERO))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VWSqrSumMaskLocal_Cuda: InitializeReductionBuffer returned nonzero\n");
  }

  GetKernelParameters(X, true, nblk, nthd, shmem, cstream);
  wL2NormSquareMaskKernel<realtype, sunindextype><<<nblk, nthd, shmem, cstream>>>
    (NVEC_CUDA_DDATAp(X), NVEC_CUDA_DDATAp(W), NVEC_CUDA_DDATAp(Id),
     NVEC_CUDA_DBUFFERp(X), NVEC_CUDA_CONTENT(X)->length);
  PostKernelLaunch();

  // Retrieve the reduced value from the device buffer.
  CopyReductionBufferFromDevice(X);
  return NVEC_CUDA_HBUFFERp(X)[0];
}
1060 
/* Masked weighted RMS norm: sqrt(N_VWSqrSumMaskLocal(X,W,Id) / length). */
realtype N_VWrmsNormMask_Cuda(N_Vector X, N_Vector W, N_Vector Id)
{
  const realtype sqrsum = N_VWSqrSumMaskLocal_Cuda(X, W, Id);
  const sunindextype n  = NVEC_CUDA_CONTENT(X)->length;
  return std::sqrt(sqrsum/n);
}
1066 
/* Minimum element of X, computed via a device min-reduction. */
realtype N_VMin_Cuda(N_Vector X)
{
  // Identity for a min reduction: the largest representable realtype.
  const realtype maxVal = std::numeric_limits<realtype>::max();

  size_t nblk, nthd, shmem;
  cudaStream_t cstream;

  if (InitializeReductionBuffer(X, maxVal))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VMin_Cuda: InitializeReductionBuffer returned nonzero\n");
  }

  GetKernelParameters(X, true, nblk, nthd, shmem, cstream);
  findMinKernel<realtype, sunindextype><<<nblk, nthd, shmem, cstream>>>
    (maxVal, NVEC_CUDA_DDATAp(X), NVEC_CUDA_DBUFFERp(X),
     NVEC_CUDA_CONTENT(X)->length);
  PostKernelLaunch();

  // Retrieve the reduced value from the device buffer.
  CopyReductionBufferFromDevice(X);
  return NVEC_CUDA_HBUFFERp(X)[0];
}
1095 
/* Weighted Euclidean norm: sqrt of the weighted squared sum. */
realtype N_VWL2Norm_Cuda(N_Vector X, N_Vector W)
{
  return std::sqrt(N_VWSqrSumLocal_Cuda(X, W));
}
1101 
/* L1 norm of X: device-side sum of absolute values. */
realtype N_VL1Norm_Cuda(N_Vector X)
{
  size_t nblk, nthd, shmem;
  cudaStream_t cstream;

  if (InitializeReductionBuffer(X, ZERO))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VL1Norm_Cuda: InitializeReductionBuffer returned nonzero\n");
  }

  GetKernelParameters(X, true, nblk, nthd, shmem, cstream);
  L1NormKernel<realtype, sunindextype><<<nblk, nthd, shmem, cstream>>>
    (NVEC_CUDA_DDATAp(X), NVEC_CUDA_DBUFFERp(X),
     NVEC_CUDA_CONTENT(X)->length);
  PostKernelLaunch();

  // Retrieve the reduced value from the device buffer.
  CopyReductionBufferFromDevice(X);
  return NVEC_CUDA_HBUFFERp(X)[0];
}
1127 
/* Element-wise comparison of |X| against the threshold c, writing the
 * 0/1 indicator into Z (device-side kernel). */
void N_VCompare_Cuda(realtype c, N_Vector X, N_Vector Z)
{
  size_t nblk, nthd, shmem;
  cudaStream_t cstream;

  GetKernelParameters(X, false, nblk, nthd, shmem, cstream);
  compareKernel<<<nblk, nthd, shmem, cstream>>>
    (c, NVEC_CUDA_DDATAp(X), NVEC_CUDA_DDATAp(Z),
     NVEC_CUDA_CONTENT(X)->length);
  PostKernelLaunch();
}
1143 
/* Inverts X into Z where possible; returns SUNTRUE when the device-side
 * failure flag stayed below HALF (i.e. no zero entries encountered). */
booleantype N_VInvTest_Cuda(N_Vector X, N_Vector Z)
{
  size_t nblk, nthd, shmem;
  cudaStream_t cstream;

  if (InitializeReductionBuffer(X, ZERO))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VInvTest_Cuda: InitializeReductionBuffer returned nonzero\n");
  }

  GetKernelParameters(X, true, nblk, nthd, shmem, cstream);
  invTestKernel<realtype, sunindextype><<<nblk, nthd, shmem, cstream>>>
    (NVEC_CUDA_DDATAp(X), NVEC_CUDA_DDATAp(Z), NVEC_CUDA_DBUFFERp(X),
     NVEC_CUDA_CONTENT(X)->length);
  PostKernelLaunch();

  // Retrieve the accumulated flag from the device buffer.
  CopyReductionBufferFromDevice(X);
  const realtype flag = NVEC_CUDA_HBUFFERp(X)[0];

  return (flag < HALF);
}
1170 
/* Constraint check of X against C, writing violations into the mask M;
 * returns SUNTRUE when the device-side violation count stays below HALF. */
booleantype N_VConstrMask_Cuda(N_Vector C, N_Vector X, N_Vector M)
{
  size_t nblk, nthd, shmem;
  cudaStream_t cstream;

  if (InitializeReductionBuffer(X, ZERO))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VConstrMask_Cuda: InitializeReductionBuffer returned nonzero\n");
  }

  GetKernelParameters(X, true, nblk, nthd, shmem, cstream);
  constrMaskKernel<realtype, sunindextype><<<nblk, nthd, shmem, cstream>>>
    (NVEC_CUDA_DDATAp(C), NVEC_CUDA_DDATAp(X), NVEC_CUDA_DDATAp(M),
     NVEC_CUDA_DBUFFERp(X), NVEC_CUDA_CONTENT(X)->length);
  PostKernelLaunch();

  // Retrieve the accumulated flag from the device buffer.
  CopyReductionBufferFromDevice(X);
  const realtype flag = NVEC_CUDA_HBUFFERp(X)[0];

  return (flag < HALF);
}
1198 
/* Minimum of num[i]/denom[i], computed with a device min-reduction. */
realtype N_VMinQuotient_Cuda(N_Vector num, N_Vector denom)
{
  // Identity for the min reduction: the largest representable realtype.
  const realtype maxVal = std::numeric_limits<realtype>::max();

  size_t nblk, nthd, shmem;
  cudaStream_t cstream;

  if (InitializeReductionBuffer(num, maxVal))
  {
    SUNDIALS_DEBUG_PRINT("ERROR in N_VMinQuotient_Cuda: InitializeReductionBuffer returned nonzero\n");
  }

  GetKernelParameters(num, true, nblk, nthd, shmem, cstream);
  minQuotientKernel<realtype, sunindextype><<<nblk, nthd, shmem, cstream>>>
    (maxVal, NVEC_CUDA_DDATAp(num), NVEC_CUDA_DDATAp(denom),
     NVEC_CUDA_DBUFFERp(num), NVEC_CUDA_CONTENT(num)->length);
  PostKernelLaunch();

  // Retrieve the reduced value from the device buffer.
  CopyReductionBufferFromDevice(num);
  return NVEC_CUDA_HBUFFERp(num)[0];
}
1228 
1229 
1230 /*
1231  * -----------------------------------------------------------------
1232  * fused vector operations
1233  * -----------------------------------------------------------------
1234  */
1235 
/*
 * Fused linear combination: Z = sum_{i<nvec} c[i] * X[i]
 * (computed by linearCombinationKernel).
 *
 * Returns 0 on success and -1 on any CUDA error.  All temporary host and
 * device allocations are released on every exit path; the previous
 * early-return style leaked d_c, d_Xd, and h_Xd on failure.
 */
int N_VLinearCombination_Cuda(int nvec, realtype* c, N_Vector* X, N_Vector Z)
{
  cudaError_t err;
  int retval = -1;

  /* Device allocations; NULL so cleanup is safe if we bail out early */
  realtype*  d_c  = NULL;
  realtype** d_Xd = NULL;

  /* Host-side array of device data pointers, one per input vector */
  realtype** h_Xd = new realtype*[nvec];
  for (int i=0; i<nvec; i++)
    h_Xd[i] = NVEC_CUDA_DDATAp(X[i]);

  size_t grid, block, shMemSize;
  cudaStream_t stream;

  /* Copy coefficient array to device */
  err = cudaMalloc((void**) &d_c, nvec*sizeof(realtype));
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;
  err = cudaMemcpy(d_c, c, nvec*sizeof(realtype), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  /* Copy array of device pointers to device from host */
  err = cudaMalloc((void**) &d_Xd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;
  err = cudaMemcpy(d_Xd, h_Xd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  /* Set kernel parameters and launch */
  if (GetKernelParameters(X[0], false, grid, block, shMemSize, stream)) goto cleanup;
  linearCombinationKernel<<<grid, block, shMemSize, stream>>>
    (nvec, d_c, d_Xd, NVEC_CUDA_DDATAp(Z), NVEC_CUDA_CONTENT(Z)->length);
  PostKernelLaunch();

  retval = 0;

cleanup:
  delete[] h_Xd;
  if (d_c  != NULL) { err = cudaFree(d_c);  if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1; }
  if (d_Xd != NULL) { err = cudaFree(d_Xd); if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1; }
  return retval;
}
1285 
/*
 * Fused scale-and-add: Z[i] = c[i]*X + Y[i] for i < nvec
 * (computed by scaleAddMultiKernel).
 *
 * Returns 0 on success and -1 on any CUDA error.  All temporary host and
 * device allocations are released on every exit path; the previous
 * early-return style leaked them on failure.
 */
int N_VScaleAddMulti_Cuda(int nvec, realtype* c, N_Vector X, N_Vector* Y,
                          N_Vector* Z)
{
  cudaError_t err;
  int retval = -1;

  /* Device allocations; NULL so cleanup is safe if we bail out early */
  realtype*  d_c  = NULL;
  realtype** d_Yd = NULL;
  realtype** d_Zd = NULL;

  /* Host-side arrays of device data pointers */
  realtype** h_Yd = new realtype*[nvec];
  realtype** h_Zd = new realtype*[nvec];
  for (int i=0; i<nvec; i++)
  {
    h_Yd[i] = NVEC_CUDA_DDATAp(Y[i]);
    h_Zd[i] = NVEC_CUDA_DDATAp(Z[i]);
  }

  size_t grid, block, shMemSize;
  cudaStream_t stream;

  /* Copy coefficient array to device */
  err = cudaMalloc((void**) &d_c, nvec*sizeof(realtype));
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;
  err = cudaMemcpy(d_c, c, nvec*sizeof(realtype), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  /* Copy arrays of device pointers to device from host */
  err = cudaMalloc((void**) &d_Yd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;
  err = cudaMemcpy(d_Yd, h_Yd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  err = cudaMalloc((void**) &d_Zd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;
  err = cudaMemcpy(d_Zd, h_Zd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  /* Set kernel parameters and launch */
  if (GetKernelParameters(X, false, grid, block, shMemSize, stream)) goto cleanup;
  scaleAddMultiKernel<<<grid, block, shMemSize, stream>>>
    (nvec, d_c, NVEC_CUDA_DDATAp(X), d_Yd, d_Zd,
     NVEC_CUDA_CONTENT(X)->length);
  PostKernelLaunch();

  retval = 0;

cleanup:
  delete[] h_Yd;
  delete[] h_Zd;
  if (d_c  != NULL) { err = cudaFree(d_c);  if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1; }
  if (d_Yd != NULL) { err = cudaFree(d_Yd); if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1; }
  if (d_Zd != NULL) { err = cudaFree(d_Zd); if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1; }
  return retval;
}
1350 
/*
 * Fused dot products: dots[i] = <X, Y[i]> for i < nvec, one kernel block
 * per product (grid is overridden to nvec).
 *
 * Returns 0 on success and -1 on any CUDA error.  Two fixes over the
 * early-return original: (1) temporary host/device allocations are now
 * released on every exit path, and (2) the reduction buffer is zeroed
 * with cudaMemsetAsync on the SAME stream as the kernel, so the ordering
 * also holds under per-thread default-stream builds.
 */
int N_VDotProdMulti_Cuda(int nvec, N_Vector X, N_Vector* Y, realtype* dots)
{
  cudaError_t err;
  int retval = -1;

  /* Device allocations; NULL so cleanup is safe if we bail out early */
  realtype** d_Yd   = NULL;
  realtype*  d_buff = NULL;

  /* Host-side array of device data pointers */
  realtype** h_Yd = new realtype*[nvec];
  for (int i=0; i<nvec; i++)
    h_Yd[i] = NVEC_CUDA_DDATAp(Y[i]);

  size_t grid, block, shMemSize;
  cudaStream_t stream;

  /* Copy array of device pointers to device from host */
  err = cudaMalloc((void**) &d_Yd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;
  err = cudaMemcpy(d_Yd, h_Yd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  /* Set kernel parameters; one block per dot product */
  if (GetKernelParameters(X, false, grid, block, shMemSize, stream)) goto cleanup;
  grid = nvec;

  /* Allocate and zero the reduction buffer on the kernel's stream */
  err = cudaMalloc((void**) &d_buff, grid*sizeof(realtype));
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;
  err = cudaMemsetAsync(d_buff, 0, grid*sizeof(realtype), stream);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  dotProdMultiKernel<realtype, sunindextype><<<grid, block, shMemSize, stream>>>
    (nvec, NVEC_CUDA_DDATAp(X), d_Yd, d_buff, NVEC_CUDA_CONTENT(X)->length);
  PostKernelLaunch();

  /* Copy GPU results to the CPU (synchronous copy) */
  err = cudaMemcpy(dots, d_buff, grid*sizeof(realtype), cudaMemcpyDeviceToHost);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  retval = 0;

cleanup:
  delete[] h_Yd;
  if (d_Yd   != NULL) { err = cudaFree(d_Yd);   if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1; }
  if (d_buff != NULL) { err = cudaFree(d_buff); if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1; }
  return retval;
}
1406 
1407 
1408 /*
1409  * -----------------------------------------------------------------------------
1410  * vector array operations
1411  * -----------------------------------------------------------------------------
1412  */
1413 
/*
 * Vector-array linear sum: Z[i] = a*X[i] + b*Y[i] for i < nvec
 * (computed by linearSumVectorArrayKernel).
 *
 * Returns 0 on success and -1 on any CUDA error.  All temporary host and
 * device allocations are released on every exit path; the previous
 * early-return style leaked them on failure.
 */
int N_VLinearSumVectorArray_Cuda(int nvec, realtype a, N_Vector* X, realtype b,
                                 N_Vector* Y, N_Vector* Z)
{
  cudaError_t err;
  int retval = -1;

  /* Device allocations; NULL so cleanup is safe if we bail out early */
  realtype** d_Xd = NULL;
  realtype** d_Yd = NULL;
  realtype** d_Zd = NULL;

  /* Host-side arrays of device data pointers */
  realtype** h_Xd = new realtype*[nvec];
  realtype** h_Yd = new realtype*[nvec];
  realtype** h_Zd = new realtype*[nvec];
  for (int i=0; i<nvec; i++)
  {
    h_Xd[i] = NVEC_CUDA_DDATAp(X[i]);
    h_Yd[i] = NVEC_CUDA_DDATAp(Y[i]);
    h_Zd[i] = NVEC_CUDA_DDATAp(Z[i]);
  }

  size_t grid, block, shMemSize;
  cudaStream_t stream;

  /* Copy arrays of device pointers to device from host */
  err = cudaMalloc((void**) &d_Xd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;
  err = cudaMemcpy(d_Xd, h_Xd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  err = cudaMalloc((void**) &d_Yd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;
  err = cudaMemcpy(d_Yd, h_Yd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  err = cudaMalloc((void**) &d_Zd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;
  err = cudaMemcpy(d_Zd, h_Zd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  /* Set kernel parameters and launch */
  if (GetKernelParameters(Z[0], false, grid, block, shMemSize, stream)) goto cleanup;
  linearSumVectorArrayKernel<<<grid, block, shMemSize, stream>>>
    (nvec, a, d_Xd, b, d_Yd, d_Zd, NVEC_CUDA_CONTENT(Z[0])->length);
  PostKernelLaunch();

  retval = 0;

cleanup:
  delete[] h_Xd;
  delete[] h_Yd;
  delete[] h_Zd;
  if (d_Xd != NULL) { err = cudaFree(d_Xd); if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1; }
  if (d_Yd != NULL) { err = cudaFree(d_Yd); if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1; }
  if (d_Zd != NULL) { err = cudaFree(d_Zd); if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1; }
  return retval;
}
1483 
/*
 * Vector-array scale: Z[i] = c[i]*X[i] for i < nvec
 * (computed by scaleVectorArrayKernel).
 *
 * Returns 0 on success and -1 on any CUDA error.  All temporary host and
 * device allocations are released on every exit path; the previous
 * early-return style leaked them on failure.
 */
int N_VScaleVectorArray_Cuda(int nvec, realtype* c, N_Vector* X, N_Vector* Z)
{
  cudaError_t err;
  int retval = -1;

  /* Device allocations; NULL so cleanup is safe if we bail out early */
  realtype*  d_c  = NULL;
  realtype** d_Xd = NULL;
  realtype** d_Zd = NULL;

  /* Host-side arrays of device data pointers */
  realtype** h_Xd = new realtype*[nvec];
  realtype** h_Zd = new realtype*[nvec];
  for (int i=0; i<nvec; i++)
  {
    h_Xd[i] = NVEC_CUDA_DDATAp(X[i]);
    h_Zd[i] = NVEC_CUDA_DDATAp(Z[i]);
  }

  size_t grid, block, shMemSize;
  cudaStream_t stream;

  /* Copy coefficient array to device */
  err = cudaMalloc((void**) &d_c, nvec*sizeof(realtype));
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;
  err = cudaMemcpy(d_c, c, nvec*sizeof(realtype), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  /* Copy arrays of device pointers to device from host */
  err = cudaMalloc((void**) &d_Xd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;
  err = cudaMemcpy(d_Xd, h_Xd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  err = cudaMalloc((void**) &d_Zd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;
  err = cudaMemcpy(d_Zd, h_Zd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  /* Set kernel parameters and launch */
  if (GetKernelParameters(Z[0], false, grid, block, shMemSize, stream)) goto cleanup;
  scaleVectorArrayKernel<<<grid, block, shMemSize, stream>>>
    (nvec, d_c, d_Xd, d_Zd, NVEC_CUDA_CONTENT(Z[0])->length);
  PostKernelLaunch();

  retval = 0;

cleanup:
  delete[] h_Xd;
  delete[] h_Zd;
  if (d_c  != NULL) { err = cudaFree(d_c);  if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1; }
  if (d_Xd != NULL) { err = cudaFree(d_Xd); if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1; }
  if (d_Zd != NULL) { err = cudaFree(d_Zd); if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1; }
  return retval;
}
1546 
/*
 * Vector-array constant fill: sets every element of each Z[i] to c
 * (computed by constVectorArrayKernel).
 *
 * Returns 0 on success and -1 on any CUDA error.  All temporary host and
 * device allocations are released on every exit path; the previous
 * early-return style leaked d_Zd and h_Zd on failure.
 */
int N_VConstVectorArray_Cuda(int nvec, realtype c, N_Vector* Z)
{
  cudaError_t err;
  int retval = -1;

  /* Device allocation; NULL so cleanup is safe if we bail out early */
  realtype** d_Zd = NULL;

  /* Host-side array of device data pointers */
  realtype** h_Zd = new realtype*[nvec];
  for (int i=0; i<nvec; i++)
    h_Zd[i] = NVEC_CUDA_DDATAp(Z[i]);

  size_t grid, block, shMemSize;
  cudaStream_t stream;

  /* Copy array of device pointers to device from host */
  err = cudaMalloc((void**) &d_Zd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;
  err = cudaMemcpy(d_Zd, h_Zd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  /* Set kernel parameters and launch */
  if (GetKernelParameters(Z[0], false, grid, block, shMemSize, stream)) goto cleanup;
  constVectorArrayKernel<<<grid, block, shMemSize, stream>>>
    (nvec, c, d_Zd, NVEC_CUDA_CONTENT(Z[0])->length);
  PostKernelLaunch();

  retval = 0;

cleanup:
  delete[] h_Zd;
  if (d_Zd != NULL) { err = cudaFree(d_Zd); if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1; }
  return retval;
}
1586 
/*
 * Vector-array weighted RMS norms: norms[i] = wrmsnorm(X[i], W[i]) for
 * i < nvec.  The device computes the weighted squared sums (one kernel
 * block per vector); the sqrt/length finish happens on the host.
 *
 * Returns 0 on success and -1 on any CUDA error.  Two fixes over the
 * early-return original: (1) temporary host/device allocations are now
 * released on every exit path, and (2) the reduction buffer is zeroed
 * with cudaMemsetAsync on the SAME stream as the kernel, so the ordering
 * also holds under per-thread default-stream builds.
 */
int N_VWrmsNormVectorArray_Cuda(int nvec, N_Vector* X, N_Vector* W,
                                realtype* norms)
{
  cudaError_t err;
  int retval = -1;

  /* Device allocations; NULL so cleanup is safe if we bail out early */
  realtype** d_Xd   = NULL;
  realtype** d_Wd   = NULL;
  realtype*  d_buff = NULL;

  /* Host-side arrays of device data pointers */
  realtype** h_Xd = new realtype*[nvec];
  realtype** h_Wd = new realtype*[nvec];
  for (int i=0; i<nvec; i++)
  {
    h_Xd[i] = NVEC_CUDA_DDATAp(X[i]);
    h_Wd[i] = NVEC_CUDA_DDATAp(W[i]);
  }

  size_t grid, block, shMemSize;
  cudaStream_t stream;

  /* Copy arrays of device pointers to device from host */
  err = cudaMalloc((void**) &d_Xd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;
  err = cudaMemcpy(d_Xd, h_Xd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  err = cudaMalloc((void**) &d_Wd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;
  err = cudaMemcpy(d_Wd, h_Wd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  /* Set kernel parameters; one block per vector */
  if (GetKernelParameters(X[0], true, grid, block, shMemSize, stream)) goto cleanup;
  grid = nvec;

  /* Allocate and zero the reduction buffer on the kernel's stream */
  err = cudaMalloc((void**) &d_buff, grid*sizeof(realtype));
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;
  err = cudaMemsetAsync(d_buff, 0, grid*sizeof(realtype), stream);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  wL2NormSquareVectorArrayKernel<realtype, sunindextype><<<grid, block, shMemSize, stream>>>
    (nvec, d_Xd, d_Wd, d_buff, NVEC_CUDA_CONTENT(X[0])->length);
  PostKernelLaunch();

  /* Copy GPU results to the CPU (synchronous copy) */
  err = cudaMemcpy(norms, d_buff, grid*sizeof(realtype), cudaMemcpyDeviceToHost);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  /* Finish computation on the host */
  for (int k=0; k<nvec; ++k)
    norms[k] = std::sqrt(norms[k]/NVEC_CUDA_CONTENT(X[0])->length);

  retval = 0;

cleanup:
  delete[] h_Xd;
  delete[] h_Wd;
  if (d_Xd   != NULL) { err = cudaFree(d_Xd);   if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1; }
  if (d_Wd   != NULL) { err = cudaFree(d_Wd);   if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1; }
  if (d_buff != NULL) { err = cudaFree(d_buff); if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1; }
  return retval;
}
1659 
/*
 * Vector-array masked weighted RMS norms: norms[i] = wrmsnorm of X[i]
 * weighted by W[i] over entries selected by the mask vector id.  The
 * device computes the masked weighted squared sums (one kernel block per
 * vector); the sqrt/length finish happens on the host.
 *
 * Returns 0 on success and -1 on any CUDA error.  Two fixes over the
 * early-return original: (1) temporary host/device allocations are now
 * released on every exit path, and (2) the reduction buffer is zeroed
 * with cudaMemsetAsync on the SAME stream as the kernel, so the ordering
 * also holds under per-thread default-stream builds.
 */
int N_VWrmsNormMaskVectorArray_Cuda(int nvec, N_Vector* X, N_Vector* W,
                                    N_Vector id, realtype* norms)
{
  cudaError_t err;
  int retval = -1;

  /* Device allocations; NULL so cleanup is safe if we bail out early */
  realtype** d_Xd   = NULL;
  realtype** d_Wd   = NULL;
  realtype*  d_buff = NULL;

  /* Host-side arrays of device data pointers */
  realtype** h_Xd = new realtype*[nvec];
  realtype** h_Wd = new realtype*[nvec];
  for (int i=0; i<nvec; i++)
  {
    h_Xd[i] = NVEC_CUDA_DDATAp(X[i]);
    h_Wd[i] = NVEC_CUDA_DDATAp(W[i]);
  }

  size_t grid, block, shMemSize;
  cudaStream_t stream;

  /* Copy arrays of device pointers to device from host */
  err = cudaMalloc((void**) &d_Xd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;
  err = cudaMemcpy(d_Xd, h_Xd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  err = cudaMalloc((void**) &d_Wd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;
  err = cudaMemcpy(d_Wd, h_Wd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  /* Set kernel parameters; one block per vector */
  if (GetKernelParameters(X[0], true, grid, block, shMemSize, stream)) goto cleanup;
  grid = nvec;

  /* Allocate and zero the reduction buffer on the kernel's stream */
  err = cudaMalloc((void**) &d_buff, grid*sizeof(realtype));
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;
  err = cudaMemsetAsync(d_buff, 0, grid*sizeof(realtype), stream);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  wL2NormSquareMaskVectorArrayKernel<realtype, sunindextype><<<grid, block, shMemSize, stream>>>
    (nvec, d_Xd, d_Wd, NVEC_CUDA_DDATAp(id), d_buff,
     NVEC_CUDA_CONTENT(X[0])->length);
  PostKernelLaunch();

  /* Copy GPU results to the CPU (synchronous copy) */
  err = cudaMemcpy(norms, d_buff, grid*sizeof(realtype), cudaMemcpyDeviceToHost);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  /* Finish computation on the host */
  for (int k=0; k<nvec; ++k)
    norms[k] = std::sqrt(norms[k]/NVEC_CUDA_CONTENT(X[0])->length);

  retval = 0;

cleanup:
  delete[] h_Xd;
  delete[] h_Wd;
  if (d_Xd   != NULL) { err = cudaFree(d_Xd);   if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1; }
  if (d_Wd   != NULL) { err = cudaFree(d_Wd);   if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1; }
  if (d_buff != NULL) { err = cudaFree(d_buff); if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1; }
  return retval;
}
1734 
/*
 * Fused scale-and-add over vector arrays: for each j < nvec and i < nsum,
 * Z[i][j] = c[i]*X[j] + Y[i][j] (computed by scaleAddMultiVectorArrayKernel).
 * The 2-D pointer arrays are flattened row-major as [j*nsum+i].
 *
 * Returns 0 on success and -1 on any CUDA error.  All temporary host and
 * device allocations are released on every exit path; the previous
 * early-return style leaked them on failure.
 */
int N_VScaleAddMultiVectorArray_Cuda(int nvec, int nsum, realtype* c,
                                     N_Vector* X, N_Vector** Y, N_Vector** Z)
{
  cudaError_t err;
  int retval = -1;

  /* Device allocations; NULL so cleanup is safe if we bail out early */
  realtype*  d_c  = NULL;
  realtype** d_Xd = NULL;
  realtype** d_Yd = NULL;
  realtype** d_Zd = NULL;

  /* Host-side arrays of device data pointers (Y/Z flattened nvec x nsum) */
  realtype** h_Xd = new realtype*[nvec];
  realtype** h_Yd = new realtype*[nsum*nvec];
  realtype** h_Zd = new realtype*[nsum*nvec];
  for (int i=0; i<nvec; i++)
    h_Xd[i] = NVEC_CUDA_DDATAp(X[i]);
  for (int j=0; j<nvec; j++)
    for (int i=0; i<nsum; i++)
    {
      h_Yd[j*nsum+i] = NVEC_CUDA_DDATAp(Y[i][j]);
      h_Zd[j*nsum+i] = NVEC_CUDA_DDATAp(Z[i][j]);
    }

  size_t grid, block, shMemSize;
  cudaStream_t stream;

  /* Copy coefficient array to device */
  err = cudaMalloc((void**) &d_c, nsum*sizeof(realtype));
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;
  err = cudaMemcpy(d_c, c, nsum*sizeof(realtype), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  /* Copy arrays of device pointers to device from host */
  err = cudaMalloc((void**) &d_Xd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;
  err = cudaMemcpy(d_Xd, h_Xd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  err = cudaMalloc((void**) &d_Yd, nsum*nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;
  err = cudaMemcpy(d_Yd, h_Yd, nsum*nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  err = cudaMalloc((void**) &d_Zd, nsum*nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;
  err = cudaMemcpy(d_Zd, h_Zd, nsum*nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  /* Set kernel parameters and launch */
  if (GetKernelParameters(Z[0][0], false, grid, block, shMemSize, stream)) goto cleanup;
  scaleAddMultiVectorArrayKernel<<<grid, block, shMemSize, stream>>>
    (nvec, nsum, d_c, d_Xd, d_Yd, d_Zd, NVEC_CUDA_CONTENT(Z[0][0])->length);
  PostKernelLaunch();

  retval = 0;

cleanup:
  delete[] h_Xd;
  delete[] h_Yd;
  delete[] h_Zd;
  if (d_c  != NULL) { err = cudaFree(d_c);  if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1; }
  if (d_Xd != NULL) { err = cudaFree(d_Xd); if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1; }
  if (d_Yd != NULL) { err = cudaFree(d_Yd); if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1; }
  if (d_Zd != NULL) { err = cudaFree(d_Zd); if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1; }
  return retval;
}
1815 
/*
 * Computes Z[j] = sum_{i=0}^{nsum-1} c[i] * X[i][j] for j = 0,...,nvec-1 on
 * the device. Returns 0 on success and -1 on failure.
 *
 * Fix: the original leaked the host pointer arrays (h_Xd, h_Zd) and any
 * already-made device allocations (d_c, d_Xd) whenever an intermediate CUDA
 * call failed; all temporaries are now released on every exit path. The
 * success return is also made consistent with the other vector-array ops
 * (return 0 rather than cudaGetLastError(), which is 0 on success anyway).
 */
int N_VLinearCombinationVectorArray_Cuda(int nvec, int nsum, realtype* c,
                                         N_Vector** X, N_Vector* Z)
{
  cudaError_t err;
  int retval = -1;

  /* Device-side temporaries; NULL so cleanup can test them unconditionally */
  realtype*  d_c  = NULL;
  realtype** d_Xd = NULL;
  realtype** d_Zd = NULL;

  /* Create arrays of device data pointers on the host.
   * h_Xd is vector-major: entry j*nsum+i holds the data pointer of X[i][j]. */
  realtype** h_Xd = new realtype*[nsum*nvec];
  for (int j=0; j<nvec; j++)
    for (int i=0; i<nsum; i++)
      h_Xd[j*nsum+i] = NVEC_CUDA_DDATAp(X[i][j]);

  realtype** h_Zd = new realtype*[nvec];
  for (int i=0; i<nvec; i++)
    h_Zd[i] = NVEC_CUDA_DDATAp(Z[i]);

  /* Kernel launch parameters (set below; declared here so the cleanup
   * gotos do not jump over initializations) */
  size_t grid, block, shMemSize;
  cudaStream_t stream;

  /* Copy the coefficient array to the device */
  err = cudaMalloc((void**) &d_c, nsum*sizeof(realtype));
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;
  err = cudaMemcpy(d_c, c, nsum*sizeof(realtype), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  /* Copy the arrays of device pointers to the device */
  err = cudaMalloc((void**) &d_Xd, nsum*nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;
  err = cudaMemcpy(d_Xd, h_Xd, nsum*nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  err = cudaMalloc((void**) &d_Zd, nvec*sizeof(realtype*));
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;
  err = cudaMemcpy(d_Zd, h_Zd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
  if (!SUNDIALS_CUDA_VERIFY(err)) goto cleanup;

  /* Launch the fused kernel using the streaming (non-reduction) policy */
  if (GetKernelParameters(Z[0], false, grid, block, shMemSize, stream)) goto cleanup;
  linearCombinationVectorArrayKernel<<<grid, block, shMemSize, stream>>>
  (
    nvec,
    nsum,
    d_c,
    d_Xd,
    d_Zd,
    NVEC_CUDA_CONTENT(Z[0])->length
  );
  PostKernelLaunch();

  retval = 0;

cleanup:
  /* Free host arrays */
  delete[] h_Xd;
  delete[] h_Zd;

  /* Free whichever device arrays were successfully allocated */
  if (d_c != NULL)
  {
    err = cudaFree(d_c);
    if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1;
  }
  if (d_Xd != NULL)
  {
    err = cudaFree(d_Xd);
    if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1;
  }
  if (d_Zd != NULL)
  {
    err = cudaFree(d_Zd);
    if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1;
  }

  return(retval);
}
1881 
1882 
1883 /*
1884  * -----------------------------------------------------------------
1885  * OPTIONAL XBraid interface operations
1886  * -----------------------------------------------------------------
1887  */
1888 
1889 
/* Reports the buffer size (in bytes) needed by N_VBufPack/N_VBufUnpack
 * for the vector x. Returns 0 on success, -1 if x is NULL. */
int N_VBufSize_Cuda(N_Vector x, sunindextype *size)
{
  if (x != NULL)
  {
    *size = (sunindextype)NVEC_CUDA_MEMSIZE(x);
    return(0);
  }
  return(-1);
}
1896 
1897 
/* Packs the vector's device data into the caller-supplied host buffer buf.
 * The copy is issued asynchronously on the vector's stream and then that
 * stream (only) is synchronized so buf is valid on return.
 * Returns 0 on success, -1 on failure. */
int N_VBufPack_Cuda(N_Vector x, void *buf)
{
  if (x == NULL || buf == NULL) return(-1);

  /* wrap the raw host buffer so the memory helper can copy into it */
  SUNMemory wrapped_buf = SUNMemoryHelper_Wrap(buf, SUNMEMTYPE_HOST);
  if (wrapped_buf == NULL) return(-1);

  /* asynchronous device-to-host copy on this vector's stream */
  int copy_status = SUNMemoryHelper_CopyAsync(NVEC_CUDA_MEMHELP(x),
                                              wrapped_buf,
                                              NVEC_CUDA_CONTENT(x)->device_data,
                                              NVEC_CUDA_MEMSIZE(x),
                                              (void*) NVEC_CUDA_STREAM(x));

  /* we synchronize with respect to the host, but only in this stream */
  cudaError_t sync_err = cudaStreamSynchronize(*NVEC_CUDA_STREAM(x));

  SUNMemoryHelper_Dealloc(NVEC_CUDA_MEMHELP(x), wrapped_buf);

  return (!SUNDIALS_CUDA_VERIFY(sync_err) || copy_status ? -1 : 0);
}
1921 
1922 
/* Unpacks the caller-supplied host buffer buf into the vector's device data.
 * The copy is issued asynchronously on the vector's stream and then that
 * stream (only) is synchronized before returning.
 * Returns 0 on success, -1 on failure. */
int N_VBufUnpack_Cuda(N_Vector x, void *buf)
{
  if (x == NULL || buf == NULL) return(-1);

  /* wrap the raw host buffer so the memory helper can copy from it */
  SUNMemory wrapped_buf = SUNMemoryHelper_Wrap(buf, SUNMEMTYPE_HOST);
  if (wrapped_buf == NULL) return(-1);

  /* asynchronous host-to-device copy on this vector's stream */
  int copy_status = SUNMemoryHelper_CopyAsync(NVEC_CUDA_MEMHELP(x),
                                              NVEC_CUDA_CONTENT(x)->device_data,
                                              wrapped_buf,
                                              NVEC_CUDA_MEMSIZE(x),
                                              (void*) NVEC_CUDA_STREAM(x));

  /* we synchronize with respect to the host, but only in this stream */
  cudaError_t sync_err = cudaStreamSynchronize(*NVEC_CUDA_STREAM(x));

  SUNMemoryHelper_Dealloc(NVEC_CUDA_MEMHELP(x), wrapped_buf);

  return (!SUNDIALS_CUDA_VERIFY(sync_err) || copy_status ? -1 : 0);
}
1946 
1947 
1948 /*
1949  * -----------------------------------------------------------------
1950  * Enable / Disable fused and vector array operations
1951  * -----------------------------------------------------------------
1952  */
1953 
/* Enables (tf true) or disables (tf false) every fused and vector-array
 * operation on v by pointing each op at its CUDA implementation or NULL.
 * Returns 0 on success, -1 if v or its ops table is NULL. */
int N_VEnableFusedOps_Cuda(N_Vector v, booleantype tf)
{
  /* vector and its ops structure must both exist */
  if (v == NULL || v->ops == NULL) return(-1);

  /* fused vector operations */
  v->ops->nvlinearcombination = tf ? N_VLinearCombination_Cuda : NULL;
  v->ops->nvscaleaddmulti     = tf ? N_VScaleAddMulti_Cuda     : NULL;
  v->ops->nvdotprodmulti      = tf ? N_VDotProdMulti_Cuda      : NULL;

  /* vector array operations */
  v->ops->nvlinearsumvectorarray         = tf ? N_VLinearSumVectorArray_Cuda         : NULL;
  v->ops->nvscalevectorarray             = tf ? N_VScaleVectorArray_Cuda             : NULL;
  v->ops->nvconstvectorarray             = tf ? N_VConstVectorArray_Cuda             : NULL;
  v->ops->nvwrmsnormvectorarray          = tf ? N_VWrmsNormVectorArray_Cuda          : NULL;
  v->ops->nvwrmsnormmaskvectorarray      = tf ? N_VWrmsNormMaskVectorArray_Cuda      : NULL;
  v->ops->nvscaleaddmultivectorarray     = tf ? N_VScaleAddMultiVectorArray_Cuda     : NULL;
  v->ops->nvlinearcombinationvectorarray = tf ? N_VLinearCombinationVectorArray_Cuda : NULL;

  return(0);
}
1996 
/* Enables/disables the fused linear combination operation on v.
 * Returns 0 on success, -1 if v or its ops table is NULL. */
int N_VEnableLinearCombination_Cuda(N_Vector v, booleantype tf)
{
  /* vector and its ops structure must both exist */
  if (v == NULL || v->ops == NULL) return(-1);

  /* point the op at the CUDA implementation, or detach it */
  v->ops->nvlinearcombination = tf ? N_VLinearCombination_Cuda : NULL;

  return(0);
}
2014 
/* Enables/disables the fused scale-add-multi operation on v.
 * Returns 0 on success, -1 if v or its ops table is NULL. */
int N_VEnableScaleAddMulti_Cuda(N_Vector v, booleantype tf)
{
  /* vector and its ops structure must both exist */
  if (v == NULL || v->ops == NULL) return(-1);

  /* point the op at the CUDA implementation, or detach it */
  v->ops->nvscaleaddmulti = tf ? N_VScaleAddMulti_Cuda : NULL;

  return(0);
}
2032 
/* Enables/disables the fused multi dot-product operation on v.
 * Returns 0 on success, -1 if v or its ops table is NULL. */
int N_VEnableDotProdMulti_Cuda(N_Vector v, booleantype tf)
{
  /* vector and its ops structure must both exist */
  if (v == NULL || v->ops == NULL) return(-1);

  /* point the op at the CUDA implementation, or detach it */
  v->ops->nvdotprodmulti = tf ? N_VDotProdMulti_Cuda : NULL;

  return(0);
}
2050 
/* Enables/disables the linear-sum vector-array operation on v.
 * Returns 0 on success, -1 if v or its ops table is NULL. */
int N_VEnableLinearSumVectorArray_Cuda(N_Vector v, booleantype tf)
{
  /* vector and its ops structure must both exist */
  if (v == NULL || v->ops == NULL) return(-1);

  /* point the op at the CUDA implementation, or detach it */
  v->ops->nvlinearsumvectorarray = tf ? N_VLinearSumVectorArray_Cuda : NULL;

  return(0);
}
2068 
/* Enables/disables the scale vector-array operation on v.
 * Returns 0 on success, -1 if v or its ops table is NULL. */
int N_VEnableScaleVectorArray_Cuda(N_Vector v, booleantype tf)
{
  /* vector and its ops structure must both exist */
  if (v == NULL || v->ops == NULL) return(-1);

  /* point the op at the CUDA implementation, or detach it */
  v->ops->nvscalevectorarray = tf ? N_VScaleVectorArray_Cuda : NULL;

  return(0);
}
2086 
/* Enables/disables the const vector-array operation on v.
 * Returns 0 on success, -1 if v or its ops table is NULL. */
int N_VEnableConstVectorArray_Cuda(N_Vector v, booleantype tf)
{
  /* vector and its ops structure must both exist */
  if (v == NULL || v->ops == NULL) return(-1);

  /* point the op at the CUDA implementation, or detach it */
  v->ops->nvconstvectorarray = tf ? N_VConstVectorArray_Cuda : NULL;

  return(0);
}
2104 
/* Enables/disables the WRMS-norm vector-array operation on v.
 * Returns 0 on success, -1 if v or its ops table is NULL. */
int N_VEnableWrmsNormVectorArray_Cuda(N_Vector v, booleantype tf)
{
  /* vector and its ops structure must both exist */
  if (v == NULL || v->ops == NULL) return(-1);

  /* point the op at the CUDA implementation, or detach it */
  v->ops->nvwrmsnormvectorarray = tf ? N_VWrmsNormVectorArray_Cuda : NULL;

  return(0);
}
2122 
/* Enables/disables the masked WRMS-norm vector-array operation on v.
 * Returns 0 on success, -1 if v or its ops table is NULL. */
int N_VEnableWrmsNormMaskVectorArray_Cuda(N_Vector v, booleantype tf)
{
  /* vector and its ops structure must both exist */
  if (v == NULL || v->ops == NULL) return(-1);

  /* point the op at the CUDA implementation, or detach it */
  v->ops->nvwrmsnormmaskvectorarray = tf ? N_VWrmsNormMaskVectorArray_Cuda : NULL;

  return(0);
}
2140 
/* Enables/disables the scale-add-multi vector-array operation on v.
 * Returns 0 on success, -1 if v or its ops table is NULL. */
int N_VEnableScaleAddMultiVectorArray_Cuda(N_Vector v, booleantype tf)
{
  /* vector and its ops structure must both exist */
  if (v == NULL || v->ops == NULL) return(-1);

  /* point the op at the CUDA implementation, or detach it */
  v->ops->nvscaleaddmultivectorarray = tf ? N_VScaleAddMultiVectorArray_Cuda : NULL;

  return(0);
}
2158 
/* Enables/disables the linear-combination vector-array operation on v.
 * Returns 0 on success, -1 if v or its ops table is NULL. */
int N_VEnableLinearCombinationVectorArray_Cuda(N_Vector v, booleantype tf)
{
  /* vector and its ops structure must both exist */
  if (v == NULL || v->ops == NULL) return(-1);

  /* point the op at the CUDA implementation, or detach it */
  v->ops->nvlinearcombinationvectorarray = tf ? N_VLinearCombinationVectorArray_Cuda : NULL;

  return(0);
}
2176 
2177 /*
2178  * Private helper functions.
2179  */
2180 
/*
 * Allocates the data arrays backing the vector v. With managed (UVM)
 * memory a single allocation serves both views and host_data is an alias
 * of device_data; otherwise separate host and device buffers are made.
 * Returns 0 on success (including the zero-length case) and -1 on failure.
 *
 * Fix: the original overwrote the host allocation status with the device
 * allocation status in the unmanaged branch, so a failed host allocation
 * followed by a successful device allocation was reported as success.
 */
int AllocateData(N_Vector v)
{
  int alloc_fail = 0;
  N_VectorContent_Cuda vc = NVEC_CUDA_CONTENT(v);
  N_PrivateVectorContent_Cuda vcp = NVEC_CUDA_PRIVATE(v);

  /* a zero-length vector needs no storage */
  if (N_VGetLength_Cuda(v) == 0) return(0);

  if (vcp->use_managed_mem)
  {
    alloc_fail = SUNMemoryHelper_Alloc(NVEC_CUDA_MEMHELP(v), &(vc->device_data),
                                       NVEC_CUDA_MEMSIZE(v), SUNMEMTYPE_UVM);
    if (alloc_fail)
    {
      SUNDIALS_DEBUG_PRINT("ERROR in AllocateData: SUNMemoryHelper_Alloc failed for SUNMEMTYPE_UVM\n");
    }
    /* host view is just an alias of the managed allocation */
    vc->host_data = SUNMemoryHelper_Alias(vc->device_data);
  }
  else
  {
    /* track the two allocations separately so neither failure is masked */
    int host_fail = SUNMemoryHelper_Alloc(NVEC_CUDA_MEMHELP(v), &(vc->host_data),
                                          NVEC_CUDA_MEMSIZE(v), SUNMEMTYPE_HOST);
    if (host_fail)
    {
      SUNDIALS_DEBUG_PRINT("ERROR in AllocateData: SUNMemoryHelper_Alloc failed to alloc SUNMEMTYPE_HOST\n");
    }

    int dev_fail = SUNMemoryHelper_Alloc(NVEC_CUDA_MEMHELP(v), &(vc->device_data),
                                         NVEC_CUDA_MEMSIZE(v), SUNMEMTYPE_DEVICE);
    if (dev_fail)
    {
      SUNDIALS_DEBUG_PRINT("ERROR in AllocateData: SUNMemoryHelper_Alloc failed to alloc SUNMEMTYPE_DEVICE\n");
    }

    alloc_fail = host_fail || dev_fail;
  }

  return(alloc_fail ? -1 : 0);
}
2218 
2219 /*
2220  * Initializes the internal buffer used for reductions.
2221  * If the buffer is already allocated, it will only be reallocated
2222  * if it is no longer large enough. This may occur if the length
2223  * of the vector is increased. The buffer is initialized to the
2224  * value given.
2225  */
int InitializeReductionBuffer(N_Vector v, const realtype value)
{
  int alloc_fail = 0, copy_fail = 0;
  size_t bytes = sizeof(realtype);
  booleantype need_to_allocate = SUNFALSE;
  N_PrivateVectorContent_Cuda vcp = NVEC_CUDA_PRIVATE(v);

  /* wrap the initial value so the memory helper can copy it to the device;
   * bail out early if the wrap itself fails (the original passed the NULL
   * wrapper on to SUNMemoryHelper_CopyAsync) */
  SUNMemory value_mem = SUNMemoryHelper_Wrap((void*) &value, SUNMEMTYPE_HOST);
  if (value_mem == NULL) return(-1);

  /* we allocate if the existing reduction buffer is not large enough */
  if (vcp->reduce_buffer_allocated_bytes < bytes)
  {
    FreeReductionBuffer(v);
    need_to_allocate = SUNTRUE;
  }

  if (need_to_allocate)
  {
    /* track host and device allocations separately so a host failure is
     * not masked by a successful device allocation (bug in the original) */
    int host_fail = SUNMemoryHelper_Alloc(NVEC_CUDA_MEMHELP(v),
                                          &(vcp->reduce_buffer_host), bytes,
                                          SUNMEMTYPE_PINNED);
    if (host_fail)
    {
      SUNDIALS_DEBUG_PRINT("WARNING in InitializeReductionBuffer: SUNMemoryHelper_Alloc failed to alloc SUNMEMTYPE_PINNED, using SUNMEMTYPE_HOST instead\n");

      /* try to allocate just plain host memory instead */
      host_fail = SUNMemoryHelper_Alloc(NVEC_CUDA_MEMHELP(v),
                                        &(vcp->reduce_buffer_host), bytes,
                                        SUNMEMTYPE_HOST);
      if (host_fail)
      {
        SUNDIALS_DEBUG_PRINT("ERROR in InitializeReductionBuffer: SUNMemoryHelper_Alloc failed to alloc SUNMEMTYPE_HOST\n");
      }
    }

    int dev_fail = SUNMemoryHelper_Alloc(NVEC_CUDA_MEMHELP(v),
                                         &(vcp->reduce_buffer_dev), bytes,
                                         SUNMEMTYPE_DEVICE);
    if (dev_fail)
    {
      SUNDIALS_DEBUG_PRINT("ERROR in InitializeReductionBuffer: SUNMemoryHelper_Alloc failed to alloc SUNMEMTYPE_DEVICE\n");
    }

    alloc_fail = host_fail || dev_fail;
  }

  if (!alloc_fail)
  {
    /* store the size of the buffer */
    vcp->reduce_buffer_allocated_bytes = bytes;

    /* initialize the device buffer with the given value */
    copy_fail = SUNMemoryHelper_CopyAsync(NVEC_CUDA_MEMHELP(v),
                                          vcp->reduce_buffer_dev, value_mem,
                                          bytes, (void*) NVEC_CUDA_STREAM(v));

    if (copy_fail)
    {
      SUNDIALS_DEBUG_PRINT("ERROR in InitializeReductionBuffer: SUNMemoryHelper_CopyAsync failed\n");
    }
  }

  SUNMemoryHelper_Dealloc(NVEC_CUDA_MEMHELP(v), value_mem);
  return((alloc_fail || copy_fail) ? -1 : 0);
}
2287 
2288 /* Free the reduction buffer
2289  */
/* Releases the host and device reduction buffers (when allocated) and
 * resets both pointers to NULL. Safe to call repeatedly or when nothing
 * was ever allocated. */
void FreeReductionBuffer(N_Vector v)
{
  N_PrivateVectorContent_Cuda priv = NVEC_CUDA_PRIVATE(v);

  if (priv == NULL) return;

  if (priv->reduce_buffer_dev != NULL)
  {
    SUNMemoryHelper_Dealloc(NVEC_CUDA_MEMHELP(v), priv->reduce_buffer_dev);
  }
  priv->reduce_buffer_dev = NULL;

  if (priv->reduce_buffer_host != NULL)
  {
    SUNMemoryHelper_Dealloc(NVEC_CUDA_MEMHELP(v), priv->reduce_buffer_host);
  }
  priv->reduce_buffer_host = NULL;
}
2303 
2304 /* Copy the reduction buffer from the device to the host.
2305  */
/* Copies n reduction results from the device buffer to the host buffer.
 * The copy is issued asynchronously on the vector's stream, which is then
 * synchronized so the host values are valid on return.
 * Returns 0 on success, -1 on failure. */
int CopyReductionBufferFromDevice(N_Vector v, size_t n)
{
  /* asynchronous device-to-host copy of the reduction results */
  int copy_status = SUNMemoryHelper_CopyAsync(NVEC_CUDA_MEMHELP(v),
                                              NVEC_CUDA_PRIVATE(v)->reduce_buffer_host,
                                              NVEC_CUDA_PRIVATE(v)->reduce_buffer_dev,
                                              n*sizeof(realtype),
                                              (void*) NVEC_CUDA_STREAM(v));
  if (copy_status)
  {
    SUNDIALS_DEBUG_PRINT("ERROR in CopyReductionBufferFromDevice: SUNMemoryHelper_CopyAsync returned nonzero\n");
  }

  /* we synchronize with respect to the host, but only in this stream */
  cudaError_t sync_err = cudaStreamSynchronize(*NVEC_CUDA_STREAM(v));
  return (!SUNDIALS_CUDA_VERIFY(sync_err) || copy_status ? -1 : 0);
}
2326 
2327 /* Get the kernel launch parameters based on the kernel type (reduction or not),
2328  * using the appropriate kernel execution policy.
2329  */
/*
 * Fills grid/block/shMemSize/stream from the vector's reduction or
 * streaming execution policy. n == 0 means "use the full vector length".
 * Returns 0 on success and -1 for an invalid configuration (zero grid or
 * block size, or a reduction block size that is not a warp multiple).
 *
 * Fix: the reduction-path exception message was garbled
 * ("must be a multiple must be of CUDA warp size").
 */
static int GetKernelParameters(N_Vector v, booleantype reduction, size_t& grid,
                               size_t& block, size_t& shMemSize,
                               cudaStream_t& stream, size_t n)
{
  /* default the work size to the vector length */
  n = (n == 0) ? NVEC_CUDA_CONTENT(v)->length : n;
  if (reduction)
  {
    SUNCudaExecPolicy* reduce_exec_policy = NVEC_CUDA_CONTENT(v)->reduce_exec_policy;
    grid      = reduce_exec_policy->gridSize(n);
    block     = reduce_exec_policy->blockSize();
    shMemSize = 0;
    stream    = *(reduce_exec_policy->stream());
    /* reduction kernels require whole warps */
    if (block % CUDA_WARP_SIZE)
    {
#ifdef SUNDIALS_DEBUG
      throw std::runtime_error("the block size must be a multiple of the CUDA warp size");
#endif
      return(-1);
    }
  }
  else
  {
    SUNCudaExecPolicy* stream_exec_policy = NVEC_CUDA_CONTENT(v)->stream_exec_policy;
    grid      = stream_exec_policy->gridSize(n);
    block     = stream_exec_policy->blockSize();
    shMemSize = 0;
    stream    = *(stream_exec_policy->stream());
  }

  if (grid == 0)
  {
#ifdef SUNDIALS_DEBUG
    throw std::runtime_error("the grid size must be > 0");
#endif
    return(-1);
  }
  if (block == 0)
  {
#ifdef SUNDIALS_DEBUG
    throw std::runtime_error("the block size must be > 0");
#endif
    return(-1);
  }

  return(0);
}
2376 
2377 /* Should be called after a kernel launch.
2378  * If SUNDIALS_DEBUG_CUDA_LASTERROR is not defined, then the function does nothing.
2379  * If it is defined, the function will synchronize and check the last CUDA error.
2380  */
void PostKernelLaunch()
{
#ifdef SUNDIALS_DEBUG_CUDA_LASTERROR
  /* block until all prior device work finishes so that asynchronous
   * kernel execution errors become observable here */
  cudaDeviceSynchronize();
  /* report (and clear) any launch or execution error */
  SUNDIALS_CUDA_VERIFY(cudaGetLastError());
#endif
}
2388 
2389 
2390 } // extern "C"
2391