1 /*
2    Implements the sequential Kokkos vectors.
3 */
4 #include "Kokkos_Core_fwd.hpp"
5 #include "Kokkos_Macros.hpp"
6 #include "Kokkos_Parallel.hpp"
7 #include "Kokkos_Parallel_Reduce.hpp"
8 #include <petsc/private/sfimpl.h>
9 #include <petsc/private/petscimpl.h>
10 #include <petscmath.h>
11 #include <petscviewer.h>
12 #include <KokkosBlas.hpp>
13 
14 #include <petscconf.h>
15 #include <petscvec.hpp>
16 #include <petscerror.h>
17 #include <../src/vec/vec/impls/dvecimpl.h> /* for VecCreate_Seq_Private */
18 #include <../src/vec/vec/impls/seq/kokkos/veckokkosimpl.hpp>
19 
20 #if defined(PETSC_USE_DEBUG)
21   #define VecErrorIfNotKokkos(v) \
22     do {                     \
23       PetscErrorCode   ierr; \
24       PetscBool        isKokkos = PETSC_FALSE; \
25       ierr = PetscObjectTypeCompareAny((PetscObject)(v),&isKokkos,VECSEQKOKKOS,VECMPIKOKKOS,VECKOKKOS,"");CHKERRQ(ierr); \
26       if (!isKokkos) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Calling VECKOKKOS methods on a non-VECKOKKOS object"); \
27     } while (0)
28 #else
29   #define VecErrorIfNotKokkos(v) do {(void)(v);} while (0)
30 #endif
31 
VecKokkosSyncHost(Vec v)32 PetscErrorCode VecKokkosSyncHost(Vec v)
33 {
34   Vec_Kokkos *veckok = static_cast<Vec_Kokkos*>(v->spptr);
35 
36   PetscFunctionBegin;
37   VecErrorIfNotKokkos(v);
38   veckok->dual_v.sync_host();
39   PetscFunctionReturn(0);
40 }
41 
VecKokkosModifyHost(Vec v)42 PetscErrorCode VecKokkosModifyHost(Vec v)
43 {
44   Vec_Kokkos *veckok = static_cast<Vec_Kokkos*>(v->spptr);
45 
46   PetscFunctionBegin;
47   VecErrorIfNotKokkos(v);
48   veckok->dual_v.modify_host();
49   PetscFunctionReturn(0);
50 }
51 
52 /* TODO: Is VecGetKokkosDeviceView() a better name, if we always use -vec_type kokkos */
VecKokkosGetDeviceView(Vec v,PetscScalarViewDevice_t * d_view)53 PetscErrorCode VecKokkosGetDeviceView(Vec v,PetscScalarViewDevice_t* d_view)
54 {
55   Vec_Kokkos *veckok = static_cast<Vec_Kokkos*>(v->spptr);
56 
57   PetscFunctionBegin;
58   VecErrorIfNotKokkos(v);
59   veckok->dual_v.sync_device(); /* User might read and write the device view */
60   *d_view = veckok->dual_v.view_device();
61   PetscFunctionReturn(0);
62 }
63 
VecKokkosRestoreDeviceView(Vec v,PetscScalarViewDevice_t * d_view)64 PetscErrorCode VecKokkosRestoreDeviceView(Vec v,PetscScalarViewDevice_t* d_view)
65 {
66   PetscErrorCode ierr;
67   Vec_Kokkos     *veckok = static_cast<Vec_Kokkos*>(v->spptr);
68 
69   PetscFunctionBegin;
70   VecErrorIfNotKokkos(v);
71   veckok->dual_v.modify_device();
72   ierr = PetscObjectStateIncrease((PetscObject)v);CHKERRQ(ierr);
73   PetscFunctionReturn(0);
74 }
75 
VecKokkosGetDeviceViewRead(Vec v,ConstPetscScalarViewDevice_t * d_view)76 PetscErrorCode VecKokkosGetDeviceViewRead(Vec v,ConstPetscScalarViewDevice_t* d_view)
77 {
78   Vec_Kokkos *veckok = static_cast<Vec_Kokkos*>(v->spptr);
79 
80   PetscFunctionBegin;
81   VecErrorIfNotKokkos(v);
82   veckok->dual_v.sync_device();
83   *d_view = veckok->dual_v.view_device();
84   PetscFunctionReturn(0);
85 }
86 
87 /* VecKokkosRestoreDeviceViewRead() is defined as a no-op in header */
88 
89 /* This one will overwrite the device, so no need to sync device.
90   But let's say the host is modified. In VecKokkosRestoreDeviceViewWrite(), we have to
91   clear_sync_state() to only mark the device as modified. Otherwise we will have a wrong
92   state saying both host and device are modified. If xin=yin, we have to support code sequence:
93 
94   VecKokkosGetDeviceViewWrite(xin,&xv);
95   VecKokkosGetDeviceViewRead(yin,&yv);
96   ...
97   VecKokkosRestoreDeviceViewWrite(xin,&xv);
98   VecKokkosRestoreDeviceViewRead(yin,&yv);
99  */
VecKokkosGetDeviceViewWrite(Vec v,PetscScalarViewDevice_t * d_view)100 PetscErrorCode VecKokkosGetDeviceViewWrite(Vec v,PetscScalarViewDevice_t* d_view)
101 {
102   Vec_Kokkos *veckok = static_cast<Vec_Kokkos*>(v->spptr);
103 
104   PetscFunctionBegin;
105   VecErrorIfNotKokkos(v);
106   *d_view = veckok->dual_v.view_device();
107   PetscFunctionReturn(0);
108 }
109 
VecKokkosRestoreDeviceViewWrite(Vec v,PetscScalarViewDevice_t * dv)110 PetscErrorCode VecKokkosRestoreDeviceViewWrite(Vec v,PetscScalarViewDevice_t* dv)
111 {
112   PetscErrorCode ierr;
113   Vec_Kokkos     *veckok = static_cast<Vec_Kokkos*>(v->spptr);
114 
115   PetscFunctionBegin;
116   VecErrorIfNotKokkos(v);
117   veckok->dual_v.clear_sync_state();
118   veckok->dual_v.modify_device();
119   ierr = PetscObjectStateIncrease((PetscObject)v);CHKERRQ(ierr);
120   PetscFunctionReturn(0);
121 }
122 
VecKokkosGetArrayInPlace(Vec v,PetscScalar ** array)123 PetscErrorCode VecKokkosGetArrayInPlace(Vec v,PetscScalar** array)
124 {
125   PetscErrorCode ierr;
126 
127   PetscFunctionBegin;
128   ierr = VecKokkosGetArrayInPlace_Internal(v,array,NULL);CHKERRQ(ierr);
129   PetscFunctionReturn(0);
130 }
131 
VecKokkosGetArrayInPlace_Internal(Vec v,PetscScalar ** array,PetscMemType * mtype)132 PetscErrorCode VecKokkosGetArrayInPlace_Internal(Vec v,PetscScalar** array,PetscMemType *mtype)
133 {
134   Vec_Kokkos  *veckok = static_cast<Vec_Kokkos*>(v->spptr);
135 
136   PetscFunctionBegin;
137   VecErrorIfNotKokkos(v);
138   if (veckok->dual_v.need_sync_device()) {
139    /* Host has newer data than device */
140     *array = veckok->dual_v.view_host().data();
141     if (mtype) *mtype = PETSC_MEMTYPE_HOST;
142   } else {
143     /* Device has newer or same data as host. We prefer returning devcie data*/
144     *array = veckok->dual_v.view_device().data();
145     if (mtype) *mtype = PETSC_MEMTYPE_DEVICE;
146   }
147   PetscFunctionReturn(0);
148 }
149 
VecKokkosRestoreArrayInPlace(Vec v,PetscScalar ** array)150 PetscErrorCode VecKokkosRestoreArrayInPlace(Vec v,PetscScalar** array)
151 {
152   PetscErrorCode ierr;
153   Vec_Kokkos     *veckok = static_cast<Vec_Kokkos*>(v->spptr);
154 
155   PetscFunctionBegin;
156   VecErrorIfNotKokkos(v);
157   if (veckok->dual_v.need_sync_device()) { /* Host has newer data than device */
158     veckok->dual_v.modify_host();
159   } else {
160     veckok->dual_v.modify_device();
161   }
162   ierr = PetscObjectStateIncrease((PetscObject)v);CHKERRQ(ierr);
163   PetscFunctionReturn(0);
164 }
165 
VecKokkosGetArrayReadInPlace(Vec v,const PetscScalar ** array)166 PetscErrorCode VecKokkosGetArrayReadInPlace(Vec v,const PetscScalar** array)
167 {
168   PetscErrorCode ierr;
169 
170   PetscFunctionBegin;
171   ierr = VecKokkosGetArrayReadInPlace_Internal(v,array,NULL);CHKERRQ(ierr);
172   PetscFunctionReturn(0);
173 }
174 
VecKokkosGetArrayReadInPlace_Internal(Vec v,const PetscScalar ** array,PetscMemType * mtype)175 PetscErrorCode VecKokkosGetArrayReadInPlace_Internal(Vec v,const PetscScalar** array,PetscMemType *mtype)
176 {
177   Vec_Kokkos  *veckok = static_cast<Vec_Kokkos*>(v->spptr);
178 
179   PetscFunctionBegin;
180   VecErrorIfNotKokkos(v);
181   if (veckok->dual_v.need_sync_device()) { /* Host has newer data than device */
182     *array = veckok->dual_v.view_host().data();
183     if (mtype) *mtype = PETSC_MEMTYPE_HOST;
184   } else {
185     *array = veckok->dual_v.view_device().data();
186     if (mtype) *mtype = PETSC_MEMTYPE_DEVICE;
187   }
188   PetscFunctionReturn(0);
189 }
190 
VecSetRandom_SeqKokkos(Vec xin,PetscRandom r)191 PetscErrorCode VecSetRandom_SeqKokkos(Vec xin,PetscRandom r)
192 {
193   PetscErrorCode ierr;
194   PetscInt       n = xin->map->n,i;
195   PetscScalar    *xx;
196 
197   PetscFunctionBegin;
198   ierr = VecGetArrayWrite(xin,&xx);CHKERRQ(ierr); /* TODO: generate randoms directly on device */
199   for (i=0; i<n; i++) { ierr = PetscRandomGetValue(r,&xx[i]);CHKERRQ(ierr); }
200   ierr = VecRestoreArrayWrite(xin,&xx);CHKERRQ(ierr);
201   PetscFunctionReturn(0);
202 }
203 
204 /* x = |x| */
VecAbs_SeqKokkos(Vec xin)205 PetscErrorCode VecAbs_SeqKokkos(Vec xin)
206 {
207   PetscErrorCode             ierr;
208   PetscScalarViewDevice_t    xv;
209 
210   PetscFunctionBegin;
211   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
212   ierr = VecKokkosGetDeviceView(xin,&xv);CHKERRQ(ierr);
213   KokkosBlas::abs(xv,xv);
214   ierr = WaitForKokkos();CHKERRQ(ierr);
215   ierr = VecKokkosRestoreDeviceView(xin,&xv);CHKERRQ(ierr);
216   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
217   PetscFunctionReturn(0);
218 }
219 
220 /* x = 1/x */
VecReciprocal_SeqKokkos(Vec xin)221 PetscErrorCode VecReciprocal_SeqKokkos(Vec xin)
222 {
223   PetscErrorCode             ierr;
224   PetscScalarViewDevice_t    xv;
225 
226   PetscFunctionBegin;
227   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
228   ierr = VecKokkosGetDeviceView(xin,&xv);CHKERRQ(ierr);
229   /* KokkosBlas::reciprocal(xv,xv); */
230   Kokkos::parallel_for(xv.extent(0),KOKKOS_LAMBDA(const int64_t i) {if (xv(i) != (PetscScalar)0.0) xv(i) = (PetscScalar)1.0/xv(i);});
231   ierr = WaitForKokkos();CHKERRQ(ierr);
232   ierr = VecKokkosRestoreDeviceView(xin,&xv);CHKERRQ(ierr);
233   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
234   PetscFunctionReturn(0);
235 }
236 
VecMin_SeqKokkos(Vec xin,PetscInt * p,PetscReal * val)237 PetscErrorCode VecMin_SeqKokkos(Vec xin,PetscInt *p,PetscReal *val)
238 {
239   typedef Kokkos::MinLoc<PetscReal,PetscInt>::value_type MinLocValue_t;
240 
241   PetscErrorCode                  ierr;
242   ConstPetscScalarViewDevice_t    xv;
243   MinLocValue_t                   minloc;
244 
245   PetscFunctionBegin;
246   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
247   ierr = VecKokkosGetDeviceViewRead(xin,&xv);CHKERRQ(ierr);
248   Kokkos::parallel_reduce("VecMin",xin->map->n,KOKKOS_LAMBDA(PetscInt i,MinLocValue_t& lminloc) {
249     if (xv(i) < lminloc.val) {
250       lminloc.val = xv(i);
251       lminloc.loc = i;
252     }
253   },Kokkos::MinLoc<PetscReal,PetscInt>(minloc)); /* Kokkos will set minloc properly even if xin is zero-lengthed */
254   *p   = minloc.loc;
255   *val = minloc.val;
256   ierr = VecKokkosRestoreDeviceViewRead(xin,&xv);CHKERRQ(ierr);
257   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
258   PetscFunctionReturn(0);
259 }
260 
VecMax_SeqKokkos(Vec xin,PetscInt * p,PetscReal * val)261 PetscErrorCode VecMax_SeqKokkos(Vec xin,PetscInt *p,PetscReal *val)
262 {
263   typedef Kokkos::MaxLoc<PetscReal,PetscInt>::value_type MaxLocValue_t;
264 
265   PetscErrorCode                  ierr;
266   ConstPetscScalarViewDevice_t    xv;
267   MaxLocValue_t                   maxloc;
268 
269   PetscFunctionBegin;
270   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
271   ierr = VecKokkosGetDeviceViewRead(xin,&xv);CHKERRQ(ierr);
272   Kokkos::parallel_reduce("VecMax",xin->map->n,KOKKOS_LAMBDA(PetscInt i,MaxLocValue_t& lmaxloc) {
273     if (xv(i) > lmaxloc.val) {
274       lmaxloc.val = xv(i);
275       lmaxloc.loc = i;
276     }
277   },Kokkos::MaxLoc<PetscReal,PetscInt>(maxloc));
278   *p   = maxloc.loc;
279   *val = maxloc.val;
280   ierr = VecKokkosRestoreDeviceViewRead(xin,&xv);CHKERRQ(ierr);
281   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
282   PetscFunctionReturn(0);
283 }
284 
VecShift_SeqKokkos(Vec xin,PetscScalar shift)285 PetscErrorCode VecShift_SeqKokkos(Vec xin,PetscScalar shift)
286 {
287   PetscErrorCode                  ierr;
288   PetscScalarViewDevice_t         xv;
289 
290   PetscFunctionBegin;
291   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
292   ierr = VecKokkosGetDeviceView(xin,&xv);CHKERRQ(ierr);
293   Kokkos::parallel_for("VecShift",xin->map->n,KOKKOS_LAMBDA(PetscInt i) {xv(i) += shift;});
294   ierr = VecKokkosRestoreDeviceView(xin,&xv);CHKERRQ(ierr);
295   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
296   PetscFunctionReturn(0);
297 }
298 
299 /* y = alpha x + y */
VecAXPY_SeqKokkos(Vec yin,PetscScalar alpha,Vec xin)300 PetscErrorCode VecAXPY_SeqKokkos(Vec yin,PetscScalar alpha,Vec xin)
301 {
302   PetscErrorCode               ierr;
303   PetscBool                    xiskok,yiskok;
304   PetscScalarViewDevice_t      yv;
305   ConstPetscScalarViewDevice_t xv;
306 
307   PetscFunctionBegin;
308   if (alpha == (PetscScalar)0.0) PetscFunctionReturn(0);
309   if (yin == xin) {
310     ierr = VecScale_SeqKokkos(yin,alpha+1);CHKERRQ(ierr);
311     PetscFunctionReturn(0);
312   }
313   ierr = PetscObjectTypeCompareAny((PetscObject)xin,&xiskok,VECSEQKOKKOS,VECMPIKOKKOS,"");CHKERRQ(ierr);
314   ierr = PetscObjectTypeCompareAny((PetscObject)yin,&yiskok,VECSEQKOKKOS,VECMPIKOKKOS,"");CHKERRQ(ierr);
315   if (xiskok && yiskok) {
316     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
317     ierr = VecKokkosGetDeviceViewRead(xin,&xv);CHKERRQ(ierr);
318     ierr = VecKokkosGetDeviceView(yin,&yv);CHKERRQ(ierr);
319     KokkosBlas::axpy(alpha,xv,yv);
320     ierr = VecKokkosRestoreDeviceViewRead(xin,&xv);CHKERRQ(ierr);
321     ierr = VecKokkosRestoreDeviceView(yin,&yv);CHKERRQ(ierr);
322     ierr = WaitForKokkos();CHKERRQ(ierr);
323     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
324     ierr = PetscLogGpuFlops(2.0*yin->map->n);CHKERRQ(ierr);
325   } else {
326     ierr = VecAXPY_Seq(yin,alpha,xin);CHKERRQ(ierr);
327   }
328   PetscFunctionReturn(0);
329 }
330 
331 /* y = x + beta y */
VecAYPX_SeqKokkos(Vec yin,PetscScalar beta,Vec xin)332 PetscErrorCode VecAYPX_SeqKokkos(Vec yin,PetscScalar beta,Vec xin)
333 {
334   PetscErrorCode   ierr;
335 
336   PetscFunctionBegin;
337   /* One needs to define KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 to have optimizations for cases alpha/beta = 0,+/-1 */
338   ierr = VecAXPBY_SeqKokkos(yin,1.0,beta,xin);CHKERRQ(ierr);
339   PetscFunctionReturn(0);
340 }
341 
342 /* z = y^H x */
VecDot_SeqKokkos(Vec xin,Vec yin,PetscScalar * z)343 PetscErrorCode VecDot_SeqKokkos(Vec xin,Vec yin,PetscScalar *z)
344 {
345   PetscErrorCode                  ierr;
346   ConstPetscScalarViewDevice_t    xv,yv;
347 
348   PetscFunctionBegin;
349   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
350   ierr = VecKokkosGetDeviceViewRead(xin,&xv);CHKERRQ(ierr);
351   ierr = VecKokkosGetDeviceViewRead(yin,&yv);CHKERRQ(ierr);
352   Kokkos::parallel_reduce("VecDot",xin->map->n,KOKKOS_LAMBDA(int64_t i, PetscScalar& update) {
353 #if defined(PETSC_USE_COMPLEX)
354     update += Kokkos::conj(yv(i))*xv(i);
355 #else
356     update += yv(i)*xv(i);
357 #endif
358   },*z); /* Kokkos always overwrites z, so no need to init it */
359   ierr = WaitForKokkos();CHKERRQ(ierr);
360   ierr = VecKokkosRestoreDeviceViewRead(yin,&yv);CHKERRQ(ierr);
361   ierr = VecKokkosRestoreDeviceViewRead(xin,&xv);CHKERRQ(ierr);
362   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
363   if (xin->map->n > 0) {ierr = PetscLogGpuFlops(2.0*xin->map->n);CHKERRQ(ierr);}
364   PetscFunctionReturn(0);
365 }
366 
367 struct MDotFunctor {
368   /* Note the C++ notation for an array typedef */
369   typedef PetscScalar value_type[];
370   typedef ConstPetscScalarViewDevice_t::size_type size_type;
371 
372   /* Tell Kokkos the result array's number of entries. This must be a public value in the functor */
373   const size_type              value_count;
374   ConstPetscScalarViewDevice_t xv,yv[8];
375 
MDotFunctorMDotFunctor376   MDotFunctor(ConstPetscScalarViewDevice_t& xv,
377               const PetscInt ny, /* Number of valid entries in yv[8]. 1 <= ny <= 8 */
378               ConstPetscScalarViewDevice_t& yv0, ConstPetscScalarViewDevice_t& yv1,
379               ConstPetscScalarViewDevice_t& yv2, ConstPetscScalarViewDevice_t& yv3,
380               ConstPetscScalarViewDevice_t& yv4, ConstPetscScalarViewDevice_t& yv5,
381               ConstPetscScalarViewDevice_t& yv6, ConstPetscScalarViewDevice_t& yv7)
382     : value_count(ny),xv(xv)
383   {
384     yv[0] = yv0; yv[1] = yv1;
385     yv[2] = yv2; yv[3] = yv3;
386     yv[4] = yv4; yv[5] = yv5;
387     yv[6] = yv6; yv[7] = yv7;
388   }
389 
operator ()MDotFunctor390   KOKKOS_INLINE_FUNCTION void operator() (const size_type i,value_type sum) const {
391     PetscScalar xval = xv(i);
392     for (size_type j = 0; j<value_count; ++j) {
393      #if defined(PETSC_USE_COMPLEX)
394       sum[j] += Kokkos::conj(yv[j](i))*xval;
395      #else
396       sum[j] += yv[j](i)*xval;
397      #endif
398     }
399   }
400 
joinMDotFunctor401   KOKKOS_INLINE_FUNCTION void join (volatile value_type dst,const volatile value_type src) const
402   {
403     for (size_type j = 0; j<value_count; j++) dst[j] += src[j];
404   }
405 
initMDotFunctor406   KOKKOS_INLINE_FUNCTION void init (value_type sum) const
407   {
408     for (size_type j = 0; j<value_count; j++) sum[j] = 0.0;
409   }
410 };
411 
412 /* z[i] = (x,y_i) = y_i^H x */
VecMDot_SeqKokkos(Vec xin,PetscInt nv,const Vec yin[],PetscScalar * z)413 PetscErrorCode VecMDot_SeqKokkos(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
414 {
415   PetscErrorCode                  ierr;
416   PetscInt                        i,j,cur=0,ngroup=nv/8,rem=nv%8;
417   ConstPetscScalarViewDevice_t    xv,yv[8];
418   PetscScalarViewHost_t           zv(z,nv);
419 
420   PetscFunctionBegin;
421   ierr = VecKokkosGetDeviceViewRead(xin,&xv);CHKERRQ(ierr);
422   for (i=0; i<ngroup; i++) { /* 8 y's per group */
423     for (j=0; j<8; j++) {ierr = VecKokkosGetDeviceViewRead(yin[cur+j],&yv[j]);CHKERRQ(ierr);}
424     MDotFunctor mdot(xv,8,yv[0],yv[1],yv[2],yv[3],yv[4],yv[5],yv[6],yv[7]); /* Hope Kokkos make it asynchronous */
425     Kokkos::parallel_reduce(xin->map->n,mdot,Kokkos::subview(zv,Kokkos::pair<PetscInt,PetscInt>(cur,cur+8)));
426     for (j=0; j<8; j++) {ierr = VecKokkosRestoreDeviceViewRead(yin[cur+j],&yv[j]);CHKERRQ(ierr);}
427     cur += 8;
428   }
429 
430   if (rem) { /* The remaining */
431     for (j=0; j<rem; j++) {ierr = VecKokkosGetDeviceViewRead(yin[cur+j],&yv[j]);CHKERRQ(ierr);}
432     MDotFunctor mdot(xv,rem,yv[0],yv[1],yv[2],yv[3],yv[4],yv[5],yv[6],yv[7]);
433     Kokkos::parallel_reduce(xin->map->n,mdot,Kokkos::subview(zv,Kokkos::pair<PetscInt,PetscInt>(cur,cur+rem)));
434     for (j=0; j<rem; j++) {ierr = VecKokkosRestoreDeviceViewRead(yin[cur+j],&yv[j]);CHKERRQ(ierr);}
435   }
436   ierr = VecKokkosRestoreDeviceViewRead(xin,&xv);CHKERRQ(ierr);
437   Kokkos::fence(); /* If reduce is async, then we need this fence to make sure z is ready for use on host */
438   PetscFunctionReturn(0);
439 }
440 
441 /* x[:] = alpha */
VecSet_SeqKokkos(Vec xin,PetscScalar alpha)442 PetscErrorCode VecSet_SeqKokkos(Vec xin,PetscScalar alpha)
443 {
444   PetscErrorCode            ierr;
445   PetscScalarViewDevice_t   xv;
446 
447   PetscFunctionBegin;
448   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
449   ierr = VecKokkosGetDeviceViewWrite(xin,&xv);CHKERRQ(ierr);
450   KokkosBlas::fill(xv,alpha);
451   ierr = WaitForKokkos();CHKERRQ(ierr);
452   ierr = VecKokkosRestoreDeviceViewWrite(xin,&xv);CHKERRQ(ierr);
453   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
454   PetscFunctionReturn(0);
455 }
456 
457 /* x = alpha x */
VecScale_SeqKokkos(Vec xin,PetscScalar alpha)458 PetscErrorCode VecScale_SeqKokkos(Vec xin,PetscScalar alpha)
459 {
460   PetscErrorCode            ierr;
461   PetscScalarViewDevice_t   xv;
462 
463   PetscFunctionBegin;
464   if (alpha == (PetscScalar)0.0) {
465     ierr = VecSet_SeqKokkos(xin,alpha);CHKERRQ(ierr);
466   } else if (alpha != (PetscScalar)1.0) {
467     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
468     ierr = VecKokkosGetDeviceView(xin,&xv);CHKERRQ(ierr);
469     KokkosBlas::scal(xv,alpha,xv);
470     ierr = VecKokkosRestoreDeviceView(xin,&xv);CHKERRQ(ierr);
471     ierr = WaitForKokkos();CHKERRQ(ierr);
472     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
473     ierr = PetscLogGpuFlops(xin->map->n);CHKERRQ(ierr);
474   }
475   PetscFunctionReturn(0);
476 }
477 
478 /* z = x^T y */
VecTDot_SeqKokkos(Vec xin,Vec yin,PetscScalar * z)479 PetscErrorCode VecTDot_SeqKokkos(Vec xin,Vec yin,PetscScalar *z)
480 {
481   PetscErrorCode               ierr;
482   ConstPetscScalarViewDevice_t xv,yv;
483 
484   PetscFunctionBegin;
485   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
486   ierr = VecKokkosGetDeviceViewRead(xin,&xv);CHKERRQ(ierr);
487   ierr = VecKokkosGetDeviceViewRead(yin,&yv);CHKERRQ(ierr);
488   *z = KokkosBlas::dot(xv,yv);
489   ierr = VecKokkosRestoreDeviceViewRead(xin,&xv);CHKERRQ(ierr);
490   ierr = VecKokkosRestoreDeviceViewRead(yin,&yv);CHKERRQ(ierr);
491   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
492   if (xin->map->n > 0) {ierr = PetscLogGpuFlops(2.0*xin->map->n-1);CHKERRQ(ierr);}
493   PetscFunctionReturn(0);
494 }
495 
496 /* y = x, where x is VECKOKKOS, but y may be not */
VecCopy_SeqKokkos(Vec xin,Vec yin)497 PetscErrorCode VecCopy_SeqKokkos(Vec xin,Vec yin)
498 {
499   PetscErrorCode    ierr;
500 
501   PetscFunctionBegin;
502   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
503   if (xin != yin) {
504     Vec_Kokkos *xkok = static_cast<Vec_Kokkos*>(xin->spptr);
505     if (yin->offloadmask == PETSC_OFFLOAD_VECKOKKOS) {
506       /* y is also a VecKokkos */
507       Vec_Kokkos *ykok = static_cast<Vec_Kokkos*>(yin->spptr);
508       /* Kokkos rule: if x's host has newer data, it will copy to y's host view; otherwise to y's device view
509         In case x's host ins newer, y's device is newer, it will error (though should not, I think). So we just
510         clear y's sync state.
511        */
512       ykok->dual_v.clear_sync_state();
513       Kokkos::deep_copy(ykok->dual_v,xkok->dual_v);
514     } else {
515       PetscScalar *yarray;
516       ierr = VecGetArrayWrite(yin,&yarray);CHKERRQ(ierr);
517       PetscScalarViewHost_t yv(yarray,yin->map->n);
518       if (xkok->dual_v.need_sync_host()) { /* x's device has newer data */
519         Kokkos::deep_copy(yv,xkok->dual_v.view_device());
520       } else {
521         Kokkos::deep_copy(yv,xkok->dual_v.view_host());
522       }
523       ierr = VecRestoreArrayWrite(yin,&yarray);CHKERRQ(ierr);
524     }
525   }
526   ierr = WaitForKokkos();CHKERRQ(ierr);
527   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
528   PetscFunctionReturn(0);
529 }
530 
531 /* y[i] <--> x[i] */
VecSwap_SeqKokkos(Vec xin,Vec yin)532 PetscErrorCode VecSwap_SeqKokkos(Vec xin,Vec yin)
533 {
534   PetscErrorCode                  ierr;
535   PetscScalarViewDevice_t         xv,yv;
536 
537   PetscFunctionBegin;
538   if (xin != yin) {
539     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
540     ierr = VecKokkosGetDeviceView(xin,&xv);CHKERRQ(ierr);
541     ierr = VecKokkosGetDeviceView(yin,&yv);CHKERRQ(ierr);
542     Kokkos::parallel_for(xin->map->n,KOKKOS_LAMBDA(const int64_t i) {
543       PetscScalar tmp = xv(i);
544       xv(i) = yv(i);
545       yv(i) = tmp;
546     });
547     ierr = WaitForKokkos();CHKERRQ(ierr);
548     ierr = VecKokkosRestoreDeviceView(xin,&xv);CHKERRQ(ierr);
549     ierr = VecKokkosRestoreDeviceView(yin,&yv);CHKERRQ(ierr);
550     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
551   }
552   PetscFunctionReturn(0);
553 }
554 
555 /*  w = alpha x + y */
VecWAXPY_SeqKokkos(Vec win,PetscScalar alpha,Vec xin,Vec yin)556 PetscErrorCode VecWAXPY_SeqKokkos(Vec win,PetscScalar alpha,Vec xin, Vec yin)
557 {
558   PetscErrorCode                  ierr;
559   ConstPetscScalarViewDevice_t    xv,yv;
560   PetscScalarViewDevice_t         wv;
561 
562   PetscFunctionBegin;
563   if (alpha == (PetscScalar)0.0) {
564     ierr = VecCopy_SeqKokkos(yin,win);CHKERRQ(ierr);
565   } else {
566     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
567     ierr = VecKokkosGetDeviceViewWrite(win,&wv);CHKERRQ(ierr);
568     ierr = VecKokkosGetDeviceViewRead(xin,&xv);CHKERRQ(ierr);
569     ierr = VecKokkosGetDeviceViewRead(yin,&yv);CHKERRQ(ierr);
570     Kokkos::parallel_for(win->map->n,KOKKOS_LAMBDA(const int64_t i) {wv(i) = alpha*xv(i) + yv(i);});
571     ierr = VecKokkosRestoreDeviceViewRead(xin,&xv);CHKERRQ(ierr);
572     ierr = VecKokkosRestoreDeviceViewRead(yin,&yv);CHKERRQ(ierr);
573     ierr = VecKokkosRestoreDeviceViewWrite(win,&wv);CHKERRQ(ierr);
574     ierr = WaitForKokkos();CHKERRQ(ierr);
575     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
576     ierr = PetscLogGpuFlops(2*win->map->n);CHKERRQ(ierr);
577   }
578   PetscFunctionReturn(0);
579 }
580 
581 struct MAXPYFunctor {
582   typedef ConstPetscScalarViewDevice_t::size_type size_type;
583 
584   PetscScalarViewDevice_t       yv;
585   PetscInt                      nx;   /* Significent entries in a[8] and xv[8] */
586   PetscScalar                   a[8];
587   ConstPetscScalarViewDevice_t  xv[8];
588 
MAXPYFunctorMAXPYFunctor589   MAXPYFunctor(PetscScalarViewDevice_t yv,
590                PetscInt nx,
591                PetscScalar a0,PetscScalar a1,PetscScalar a2,PetscScalar a3,
592                PetscScalar a4,PetscScalar a5,PetscScalar a6,PetscScalar a7,
593                ConstPetscScalarViewDevice_t xv0, ConstPetscScalarViewDevice_t xv1,
594                ConstPetscScalarViewDevice_t xv2, ConstPetscScalarViewDevice_t xv3,
595                ConstPetscScalarViewDevice_t xv4, ConstPetscScalarViewDevice_t xv5,
596                ConstPetscScalarViewDevice_t xv6, ConstPetscScalarViewDevice_t xv7)
597     : yv(yv),nx(nx)
598   {
599     a[0]  = a0;  a[1]  = a1;
600     a[2]  = a2;  a[3]  = a3;
601     a[4]  = a4;  a[5]  = a5;
602     a[6]  = a6;  a[7]  = a7;
603     xv[0] = xv0; xv[1] = xv1;
604     xv[2] = xv2; xv[3] = xv3;
605     xv[4] = xv4; xv[5] = xv5;
606     xv[6] = xv6; xv[7] = xv7;
607   }
608 
operator ()MAXPYFunctor609   KOKKOS_INLINE_FUNCTION void operator() (const size_type i) const {
610     for (PetscInt j = 0; j<nx; ++j) yv(i) += a[j]*xv[j](i);
611   }
612 };
613 
614 /*  y = y + sum alpha[i] x[i] */
VecMAXPY_SeqKokkos(Vec yin,PetscInt nv,const PetscScalar * alpha,Vec * xin)615 PetscErrorCode VecMAXPY_SeqKokkos(Vec yin, PetscInt nv,const PetscScalar *alpha,Vec *xin)
616 {
617   PetscErrorCode                  ierr;
618   PetscInt                        i,j,cur=0,ngroup=nv/8,rem=nv%8;
619   PetscScalarViewDevice_t         yv;
620   PetscScalar                     a[8];
621   ConstPetscScalarViewDevice_t    xv[8];
622 
623   PetscFunctionBegin;
624   ierr = VecKokkosGetDeviceView(yin,&yv);CHKERRQ(ierr);
625   for (i=0; i<ngroup; i++) { /* 8 x's per group */
626     for (j=0; j<8; j++) { /* Fill the parameters */
627       a[j] = alpha[cur+j];
628       ierr = VecKokkosGetDeviceViewRead(xin[cur+j],&xv[j]);CHKERRQ(ierr);
629     }
630     MAXPYFunctor maxpy(yv,8,a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7],xv[0],xv[1],xv[2],xv[3],xv[4],xv[5],xv[6],xv[7]);
631     Kokkos::parallel_for(yin->map->n,maxpy);
632     for (j=0; j<8; j++) {ierr = VecKokkosRestoreDeviceViewRead(xin[cur+j],&xv[j]);CHKERRQ(ierr);}
633     cur += 8;
634   }
635 
636   if (rem) { /* The remaining */
637     for (j=0; j<rem; j++) {
638       a[j] = alpha[cur+j];
639       ierr = VecKokkosGetDeviceViewRead(xin[cur+j],&xv[j]);CHKERRQ(ierr);
640     }
641     MAXPYFunctor maxpy(yv,rem,a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7],xv[0],xv[1],xv[2],xv[3],xv[4],xv[5],xv[6],xv[7]);
642     Kokkos::parallel_for(yin->map->n,maxpy);
643     for (j=0; j<rem; j++) {ierr = VecKokkosRestoreDeviceViewRead(xin[cur+j],&xv[j]);CHKERRQ(ierr);}
644   }
645   ierr = VecKokkosRestoreDeviceView(yin,&yv);CHKERRQ(ierr);
646   ierr = PetscLogGpuFlops(nv*2.0*yin->map->n);CHKERRQ(ierr);
647   ierr = WaitForKokkos();CHKERRQ(ierr);
648   PetscFunctionReturn(0);
649 }
650 
651 /* y = alpha x + beta y */
VecAXPBY_SeqKokkos(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)652 PetscErrorCode VecAXPBY_SeqKokkos(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
653 {
654   PetscErrorCode               ierr;
655   ConstPetscScalarViewDevice_t xv;
656   PetscScalarViewDevice_t      yv;
657   PetscBool                    xiskok,yiskok;
658 
659   PetscFunctionBegin;
660   ierr = PetscObjectTypeCompareAny((PetscObject)xin,&xiskok,VECSEQKOKKOS,VECMPIKOKKOS,"");CHKERRQ(ierr);
661   ierr = PetscObjectTypeCompareAny((PetscObject)yin,&yiskok,VECSEQKOKKOS,VECMPIKOKKOS,"");CHKERRQ(ierr);
662   if (xiskok && yiskok) {
663     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
664     ierr = VecKokkosGetDeviceViewRead(xin,&xv);CHKERRQ(ierr);
665     ierr = VecKokkosGetDeviceView(yin,&yv);CHKERRQ(ierr);
666     KokkosBlas::axpby(alpha,xv,beta,yv);
667     ierr = WaitForKokkos();CHKERRQ(ierr);
668     ierr = VecKokkosRestoreDeviceViewRead(xin,&xv);CHKERRQ(ierr);
669     ierr = VecKokkosRestoreDeviceView(yin,&yv);CHKERRQ(ierr);
670     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
671     if (alpha == (PetscScalar)0.0 || beta == (PetscScalar)0.0) {
672       ierr = PetscLogGpuFlops(xin->map->n);CHKERRQ(ierr);
673     } else if (beta == (PetscScalar)1.0 || alpha == (PetscScalar)1.0) {
674       ierr = PetscLogGpuFlops(2.0*xin->map->n);CHKERRQ(ierr);
675     } else {
676       ierr = PetscLogGpuFlops(3.0*xin->map->n);CHKERRQ(ierr);
677     }
678   } else {
679     ierr = VecAXPBY_Seq(yin,alpha,beta,xin);CHKERRQ(ierr);
680   }
681   PetscFunctionReturn(0);
682 }
683 
684 /* z = alpha x + beta y + gamma z */
VecAXPBYPCZ_SeqKokkos(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)685 PetscErrorCode VecAXPBYPCZ_SeqKokkos(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
686 {
687   PetscErrorCode ierr;
688   ConstPetscScalarViewDevice_t    xv,yv;
689   PetscScalarViewDevice_t         zv;
690 
691   PetscFunctionBegin;
692   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
693   ierr = VecKokkosGetDeviceView(zin,&zv);CHKERRQ(ierr);
694   ierr = VecKokkosGetDeviceViewRead(xin,&xv);CHKERRQ(ierr);
695   ierr = VecKokkosGetDeviceViewRead(yin,&yv);CHKERRQ(ierr);
696   KokkosBlas::update(alpha,xv,beta,yv,gamma,zv);
697   ierr = WaitForKokkos();CHKERRQ(ierr);
698   ierr = VecKokkosRestoreDeviceViewRead(xin,&xv);CHKERRQ(ierr);
699   ierr = VecKokkosRestoreDeviceViewRead(yin,&yv);CHKERRQ(ierr);
700   ierr = VecKokkosRestoreDeviceView(zin,&zv);CHKERRQ(ierr);
701   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
702   ierr = PetscLogGpuFlops(zin->map->n*5);CHKERRQ(ierr);
703   PetscFunctionReturn(0);
704 }
705 
706 /* w = x*y. Any subset of the x, y, and w may be the same vector. */
VecPointwiseMult_SeqKokkos(Vec win,Vec xin,Vec yin)707 PetscErrorCode VecPointwiseMult_SeqKokkos(Vec win,Vec xin,Vec yin)
708 {
709   PetscErrorCode                  ierr;
710   ConstPetscScalarViewDevice_t    xv,yv;
711   PetscScalarViewDevice_t         wv;
712 
713   PetscFunctionBegin;
714   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
715   ierr = VecKokkosGetDeviceViewWrite(win,&wv);CHKERRQ(ierr);
716   ierr = VecKokkosGetDeviceViewRead(xin,&xv);CHKERRQ(ierr);
717   ierr = VecKokkosGetDeviceViewRead(yin,&yv);CHKERRQ(ierr);
718   Kokkos::parallel_for(win->map->n,KOKKOS_LAMBDA(const int64_t i) {wv(i) = xv(i)*yv(i);});
719   ierr = WaitForKokkos();CHKERRQ(ierr);
720   ierr = VecKokkosRestoreDeviceViewRead(yin,&yv);CHKERRQ(ierr);
721   ierr = VecKokkosRestoreDeviceViewRead(xin,&xv);CHKERRQ(ierr);
722   ierr = VecKokkosRestoreDeviceViewWrite(win,&wv);CHKERRQ(ierr);
723   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
724   ierr = PetscLogGpuFlops(win->map->n);CHKERRQ(ierr);
725   PetscFunctionReturn(0);
726 }
727 
728 /* w = x/y */
VecPointwiseDivide_SeqKokkos(Vec win,Vec xin,Vec yin)729 PetscErrorCode VecPointwiseDivide_SeqKokkos(Vec win,Vec xin,Vec yin)
730 {
731   PetscErrorCode                  ierr;
732   ConstPetscScalarViewDevice_t    xv,yv;
733   PetscScalarViewDevice_t         wv;
734 
735   PetscFunctionBegin;
736   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
737   ierr = VecKokkosGetDeviceViewWrite(win,&wv);CHKERRQ(ierr);
738   ierr = VecKokkosGetDeviceViewRead(xin,&xv);CHKERRQ(ierr);
739   ierr = VecKokkosGetDeviceViewRead(yin,&yv);CHKERRQ(ierr);
740   Kokkos::parallel_for(win->map->n,KOKKOS_LAMBDA(const int64_t i) {
741     if (yv(i) != 0.0) wv(i) = xv(i)/yv(i);
742     else wv(i) = 0.0;
743   });
744   ierr = WaitForKokkos();CHKERRQ(ierr);
745   ierr = VecKokkosRestoreDeviceViewRead(yin,&yv);CHKERRQ(ierr);
746   ierr = VecKokkosRestoreDeviceViewRead(xin,&xv);CHKERRQ(ierr);
747   ierr = VecKokkosRestoreDeviceViewWrite(win,&wv);CHKERRQ(ierr);
748   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
749   ierr = PetscLogGpuFlops(win->map->n);CHKERRQ(ierr);
750   PetscFunctionReturn(0);
751 }
752 
VecNorm_SeqKokkos(Vec xin,NormType type,PetscReal * z)753 PetscErrorCode VecNorm_SeqKokkos(Vec xin,NormType type,PetscReal *z)
754 {
755   PetscErrorCode                ierr;
756   const PetscInt                n = xin->map->n;
757   ConstPetscScalarViewDevice_t  xv;
758 
759   PetscFunctionBegin;
760   if (type == NORM_1_AND_2) {
761     ierr = VecNorm_SeqKokkos(xin,NORM_1,z);CHKERRQ(ierr);
762     ierr = VecNorm_SeqKokkos(xin,NORM_2,z+1);CHKERRQ(ierr);
763   } else {
764     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
765     ierr = VecKokkosGetDeviceViewRead(xin,&xv);CHKERRQ(ierr);
766     if (type == NORM_2 || type == NORM_FROBENIUS) {
767       *z   = KokkosBlas::nrm2(xv);
768       ierr = PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));CHKERRQ(ierr);
769     } else if (type == NORM_1) {
770       *z   = KokkosBlas::nrm1(xv);
771       ierr = PetscLogGpuFlops(PetscMax(n-1.0,0.0));CHKERRQ(ierr);
772     } else if (type == NORM_INFINITY) {
773       *z = KokkosBlas::nrminf(xv);
774     }
775     ierr = VecKokkosRestoreDeviceViewRead(xin,&xv);CHKERRQ(ierr);
776     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
777   }
778   PetscFunctionReturn(0);
779 }
780 
781 /* A functor for DotNorm2 so that we can compute dp and nm in one kernel */
782 struct DotNorm2 {
783   typedef PetscScalar                                    value_type[];
784   typedef ConstPetscScalarViewDevice_t::size_type  size_type;
785 
786   size_type                    value_count;
787   ConstPetscScalarViewDevice_t xv_, yv_;
788 
DotNorm2DotNorm2789   DotNorm2(ConstPetscScalarViewDevice_t& xv,ConstPetscScalarViewDevice_t& yv) :
790     value_count(2), xv_(xv), yv_(yv) {}
791 
operator ()DotNorm2792   KOKKOS_INLINE_FUNCTION void operator() (const size_type i, value_type result) const {
793     #if defined(PETSC_USE_COMPLEX)
794       result[0] += Kokkos::conj(yv_(i))*xv_(i);
795       result[1] += Kokkos::conj(yv_(i))*xv_(i);
796     #else
797       result[0] += yv_(i)*xv_(i);
798       result[1] += yv_(i)*yv_(i);
799     #endif
800   }
801 
joinDotNorm2802   KOKKOS_INLINE_FUNCTION void join (volatile value_type dst, const volatile value_type src) const {
803     dst[0] += src[0];
804     dst[1] += src[1];
805   }
806 
initDotNorm2807   KOKKOS_INLINE_FUNCTION void init (value_type result) const {
808     result[0] = 0.0;
809     result[1] = 0.0;
810   }
811 };
812 
813 /* dp = y^H x, nm = y^H y */
VecDotNorm2_SeqKokkos(Vec xin,Vec yin,PetscScalar * dp,PetscScalar * nm)814 PetscErrorCode VecDotNorm2_SeqKokkos(Vec xin, Vec yin, PetscScalar *dp, PetscScalar *nm)
815 {
816   PetscErrorCode                  ierr;
817   ConstPetscScalarViewDevice_t    xv,yv;
818   PetscScalar                     result[2];
819 
820   PetscFunctionBegin;
821   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
822   ierr = VecKokkosGetDeviceViewRead(xin,&xv);CHKERRQ(ierr);
823   ierr = VecKokkosGetDeviceViewRead(yin,&yv);CHKERRQ(ierr);
824   DotNorm2 dn(xv,yv);
825   Kokkos::parallel_reduce(xin->map->n,dn,result);
826   ierr = VecKokkosRestoreDeviceViewRead(yin,&yv);CHKERRQ(ierr);
827   ierr = VecKokkosRestoreDeviceViewRead(xin,&xv);CHKERRQ(ierr);
828   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
829   ierr = PetscLogGpuFlops(4.0*xin->map->n);CHKERRQ(ierr);
830   PetscFunctionReturn(0);
831 }
832 
VecConjugate_SeqKokkos(Vec xin)833 PetscErrorCode VecConjugate_SeqKokkos(Vec xin)
834 {
835 #if defined(PETSC_USE_COMPLEX)
836   PetscErrorCode            ierr;
837   PetscScalarViewDevice_t   xv;
838 
839   PetscFunctionBegin;
840   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
841   ierr = VecKokkosGetDeviceView(xin,&xv);CHKERRQ(ierr);
842   Kokkos::parallel_for(xin->map->n,KOKKOS_LAMBDA(int64_t i) {xv(i) = Kokkos::conj(xv(i));});
843   ierr = VecKokkosRestoreDeviceView(xin,&xv);CHKERRQ(ierr);
844   ierr = WaitForKokkos();CHKERRQ(err);
845   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
846 #else
847   PetscFunctionBegin;
848 #endif
849   PetscFunctionReturn(0);
850 }
851 
VecKokkosUpdateAfterChangingHostArray_Private(Vec v)852 static PetscErrorCode VecKokkosUpdateAfterChangingHostArray_Private(Vec v)
853 {
854   Vec_Kokkos *veckok = static_cast<Vec_Kokkos*>(v->spptr);
855   Vec_Seq    *vecseq = static_cast<Vec_Seq*>(v->data);
856 
857   PetscFunctionBegin;
858   /* Rebuild h_v and dual_v with the new host array*/
859   veckok->h_v    = PetscScalarViewHost_t(vecseq->array,v->map->n);
860   veckok->dual_v = PetscScalarKokkosDualView_t(veckok->d_v,veckok->h_v);
861   veckok->dual_v.modify_host();
862   PetscFunctionReturn(0);
863 }
864 
865 /* Temporarily replace the array in vin with a[]. Return to the original array with a call to VecResetArray() */
VecPlaceArray_SeqKokkos(Vec vin,const PetscScalar * a)866 PetscErrorCode VecPlaceArray_SeqKokkos(Vec vin,const PetscScalar *a)
867 {
868   PetscErrorCode ierr;
869 
870   PetscFunctionBegin;
871   ierr = VecPlaceArray_Seq(vin,a);CHKERRQ(ierr);
872   ierr = VecKokkosUpdateAfterChangingHostArray_Private(vin);CHKERRQ(ierr);
873   PetscFunctionReturn(0);
874 }
875 
VecResetArray_SeqKokkos(Vec vin)876 PetscErrorCode VecResetArray_SeqKokkos(Vec vin)
877 {
878   PetscErrorCode ierr;
879 
880   PetscFunctionBegin;
881   ierr = VecKokkosSyncHost(vin);CHKERRQ(ierr); /* User wants to unhook the provided host array. Sync it so that user can get the latest */
882   ierr = VecResetArray_Seq(vin);CHKERRQ(ierr);
883   ierr = VecKokkosUpdateAfterChangingHostArray_Private(vin);CHKERRQ(ierr);
884   PetscFunctionReturn(0);
885 }
886 
887 /* Replace the array in vin with a[] that must be allocated by PetscMalloc. a[] is owned by vin afterwords. */
VecReplaceArray_SeqKokkos(Vec vin,const PetscScalar * a)888 PetscErrorCode VecReplaceArray_SeqKokkos(Vec vin,const PetscScalar *a)
889 {
890   PetscErrorCode ierr;
891   Vec_Seq        *vecseq = (Vec_Seq*)vin->data;
892 
893   PetscFunctionBegin;
894   /* Make sure the users array has the latest values */
895   if (vecseq->array != vecseq->array_allocated) {ierr = VecKokkosSyncHost(vin);CHKERRQ(ierr);}
896   ierr = VecReplaceArray_Seq(vin,a);CHKERRQ(ierr);
897   ierr = VecKokkosUpdateAfterChangingHostArray_Private(vin);CHKERRQ(ierr);
898   PetscFunctionReturn(0);
899 }
900 
901 /* Maps the local portion of vector v into vector w */
VecGetLocalVector_SeqKokkos(Vec v,Vec w)902 PetscErrorCode VecGetLocalVector_SeqKokkos(Vec v,Vec w)
903 {
904   PetscErrorCode   ierr;
905   Vec_Seq          *vecseq = static_cast<Vec_Seq*>(w->data);
906   Vec_Kokkos       *veckok = static_cast<Vec_Kokkos*>(w->spptr);
907 
908   PetscFunctionBegin;
909   PetscCheckTypeName(w,VECSEQKOKKOS);
910   /* Destroy w->data, w->spptr */
911   if (vecseq) {
912     ierr = PetscFree(vecseq->array_allocated);CHKERRQ(ierr);
913     ierr = PetscFree(w->data);CHKERRQ(ierr);
914   }
915   delete veckok;
916 
917   /* Replace with v's */
918   w->data  = v->data;
919   w->spptr = v->spptr;
920   ierr     = PetscObjectStateIncrease((PetscObject)w);CHKERRQ(ierr);
921   PetscFunctionReturn(0);
922 }
923 
VecRestoreLocalVector_SeqKokkos(Vec v,Vec w)924 PetscErrorCode VecRestoreLocalVector_SeqKokkos(Vec v,Vec w)
925 {
926   PetscErrorCode ierr;
927 
928   PetscFunctionBegin;
929   PetscCheckTypeName(w,VECSEQKOKKOS);
930   v->data  = w->data;
931   v->spptr = w->spptr;
932   ierr     = PetscObjectStateIncrease((PetscObject)v);CHKERRQ(ierr);
933   /* TODO: need to think if setting w->data/spptr to NULL is safe */
934   w->data  = NULL;
935   w->spptr = NULL;
936   PetscFunctionReturn(0);
937 }
938 
939 /* Get array on host to overwrite, so no need to sync host. In VecRestoreArrayWrite() we will mark host is modified. */
VecGetArrayWrite_SeqKokkos(Vec v,PetscScalar ** vv)940 PetscErrorCode VecGetArrayWrite_SeqKokkos(Vec v,PetscScalar **vv)
941 {
942   Vec_Kokkos       *veckok = static_cast<Vec_Kokkos*>(v->spptr);
943 
944   PetscFunctionBegin;
945   veckok->dual_v.clear_sync_state();
946   *vv = veckok->dual_v.view_host().data();
947   PetscFunctionReturn(0);
948 }
949 
VecSetOps_SeqKokkos(Vec v)950 static PetscErrorCode VecSetOps_SeqKokkos(Vec v)
951 {
952   PetscFunctionBegin;
953   v->ops->abs                    = VecAbs_SeqKokkos;
954   v->ops->reciprocal             = VecReciprocal_SeqKokkos;
955   v->ops->pointwisemult          = VecPointwiseMult_SeqKokkos;
956   v->ops->min                    = VecMin_SeqKokkos;
957   v->ops->max                    = VecMax_SeqKokkos;
958   v->ops->shift                  = VecShift_SeqKokkos;
959   v->ops->dot                    = VecDot_SeqKokkos;
960   v->ops->norm                   = VecNorm_SeqKokkos;
961   v->ops->tdot                   = VecTDot_SeqKokkos;
962   v->ops->scale                  = VecScale_SeqKokkos;
963   v->ops->copy                   = VecCopy_SeqKokkos;
964   v->ops->set                    = VecSet_SeqKokkos;
965   v->ops->swap                   = VecSwap_SeqKokkos;
966   v->ops->axpy                   = VecAXPY_SeqKokkos;
967   v->ops->axpby                  = VecAXPBY_SeqKokkos;
968   v->ops->axpbypcz               = VecAXPBYPCZ_SeqKokkos;
969   v->ops->pointwisedivide        = VecPointwiseDivide_SeqKokkos;
970   v->ops->setrandom              = VecSetRandom_SeqKokkos;
971   v->ops->dot_local              = VecDot_SeqKokkos;
972   v->ops->tdot_local             = VecTDot_SeqKokkos;
973   v->ops->norm_local             = VecNorm_SeqKokkos;
974   v->ops->mdot_local             = VecMDot_SeqKokkos;
975   v->ops->maxpy                  = VecMAXPY_SeqKokkos;
976   v->ops->mdot                   = VecMDot_SeqKokkos;
977   v->ops->aypx                   = VecAYPX_SeqKokkos;
978   v->ops->waxpy                  = VecWAXPY_SeqKokkos;
979   v->ops->dotnorm2               = VecDotNorm2_SeqKokkos;
980   v->ops->placearray             = VecPlaceArray_SeqKokkos;
981   v->ops->replacearray           = VecReplaceArray_SeqKokkos;
982   v->ops->resetarray             = VecResetArray_SeqKokkos;
983   v->ops->destroy                = VecDestroy_SeqKokkos;
984   v->ops->duplicate              = VecDuplicate_SeqKokkos;
985   v->ops->conjugate              = VecConjugate_SeqKokkos;
986   v->ops->getlocalvector         = VecGetLocalVector_SeqKokkos;
987   v->ops->restorelocalvector     = VecRestoreLocalVector_SeqKokkos;
988   v->ops->getlocalvectorread     = VecGetLocalVector_SeqKokkos;
989   v->ops->restorelocalvectorread = VecRestoreLocalVector_SeqKokkos;
990   v->ops->getarraywrite          = VecGetArrayWrite_SeqKokkos;
991   PetscFunctionReturn(0);
992 }
993 
994 /* Assuming the Vec_Seq struct of the vector is ready, build the Vec_Kokkos struct for it.
995    Also assume the vector is to be init'ed, so it is in a host-device sync'ed state after the call.
996  */
BuildVecKokkosFromVecSeq_Private(Vec v)997 static PetscErrorCode BuildVecKokkosFromVecSeq_Private(Vec v)
998 {
999   Vec_Seq        *vecseq = static_cast<Vec_Seq*>(v->data);
1000   Vec_Kokkos     *veckok = NULL;
1001   PetscScalar    *darray;
1002 
1003   PetscFunctionBegin;
1004   if (std::is_same<DeviceMemorySpace,HostMemorySpace>::value) {
1005     darray = vecseq->array;
1006   } else {
1007     darray = static_cast<PetscScalar*>(Kokkos::kokkos_malloc<DeviceMemorySpace>(sizeof(PetscScalar)*v->map->n));
1008   }
1009   if (v->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"v->spptr not NULL");
1010   veckok = new Vec_Kokkos(v->map->n,vecseq->array,darray,darray);
1011   Kokkos::deep_copy(veckok->dual_v.view_device(),0.0);
1012   v->spptr = static_cast<void*>(veckok);
1013   v->offloadmask = PETSC_OFFLOAD_VECKOKKOS;
1014   PetscFunctionReturn(0);
1015 }
1016 
VecCreate_SeqKokkos(Vec v)1017 PetscErrorCode VecCreate_SeqKokkos(Vec v)
1018 {
1019   PetscErrorCode ierr;
1020 
1021   PetscFunctionBegin;
1022   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
1023   ierr = PetscLayoutSetUp(v->map);CHKERRQ(ierr);
1024   ierr = VecCreate_Seq(v);CHKERRQ(ierr);  /* Build a sequential vector, allocate array */
1025   ierr = VecSet_Seq(v,0.0);CHKERRQ(ierr); /* Zero the host array */
1026   ierr = PetscObjectChangeTypeName((PetscObject)v,VECSEQKOKKOS);CHKERRQ(ierr);
1027   ierr = VecSetOps_SeqKokkos(v);CHKERRQ(ierr);
1028   ierr = BuildVecKokkosFromVecSeq_Private(v);CHKERRQ(ierr);
1029   PetscFunctionReturn(0);
1030 }
1031 
1032 /*@C
1033    VecCreateSeqKokkosWithArray - Creates a Kokkos sequential array-style vector,
1034    where the user provides the array space to store the vector values. The array
1035    provided must be a device array.
1036 
1037    Collective
1038 
1039    Input Parameter:
1040 +  comm - the communicator, should be PETSC_COMM_SELF
1041 .  bs - the block size
1042 .  n - the vector length
1043 -  array - device memory where the vector elements are to be stored.
1044 
1045    Output Parameter:
1046 .  V - the vector
1047 
1048    Notes:
1049    Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
1050    same type as an existing vector.
1051 
1052    If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
1053    at a later stage to SET the array for storing the vector values.
1054 
1055    PETSc does NOT free the array when the vector is destroyed via VecDestroy().
1056    The user should not free the array until the vector is destroyed.
1057 
1058    Level: intermediate
1059 
1060 .seealso: VecCreateMPICUDAWithArray(), VecCreate(), VecDuplicate(), VecDuplicateVecs(),
1061           VecCreateGhost(), VecCreateSeq(), VecCUDAPlaceArray(), VecCreateSeqWithArray(),
1062           VecCreateMPIWithArray()
1063 @*/
VecCreateSeqKokkosWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar darray[],Vec * v)1064 PetscErrorCode  VecCreateSeqKokkosWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar darray[],Vec *v)
1065 {
1066   PetscErrorCode ierr;
1067   PetscMPIInt    size;
1068   Vec            w;
1069   Vec_Kokkos     *veckok = NULL;
1070   PetscScalar    *harray;
1071 
1072   PetscFunctionBegin;
1073   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1074   if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQKOKKOS on more than one process");
1075 
1076   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
1077   ierr = VecCreate(comm,&w);CHKERRQ(ierr);
1078   ierr = VecSetSizes(w,n,n);CHKERRQ(ierr);
1079   ierr = VecSetBlockSize(w,bs);CHKERRQ(ierr);
1080 
1081   /* Given a device array, build the Vec_Seq struct */
1082   if (std::is_same<DeviceMemorySpace,HostMemorySpace>::value) {harray = const_cast<PetscScalar*>(darray);}
1083   else {ierr = PetscMalloc1(w->map->n,&harray);CHKERRQ(ierr);}
1084   ierr   = VecCreate_Seq_Private(w,harray);CHKERRQ(ierr); /* Build a sequential vector with harray */
1085 
1086   ierr   = PetscObjectChangeTypeName((PetscObject)w,VECSEQKOKKOS);CHKERRQ(ierr); /* Change it to Kokkos */
1087   ierr   = VecSetOps_SeqKokkos(w);CHKERRQ(ierr);
1088   veckok = new Vec_Kokkos(n,harray,const_cast<PetscScalar*>(darray),NULL);
1089   veckok->dual_v.modify_device(); /* Mark the device is modified */
1090   w->offloadmask = PETSC_OFFLOAD_VECKOKKOS;
1091   w->spptr = static_cast<void*>(veckok);
1092   *v       = w;
1093   PetscFunctionReturn(0);
1094 }
1095 
1096 /* TODO: ftn-auto generates veckok.kokkosf.c */
1097 /*@C
1098  VecCreateSeqKokkos - Creates a standard, sequential array-style vector.
1099 
1100  Collective
1101 
1102  Input Parameter:
1103  +  comm - the communicator, should be PETSC_COMM_SELF
1104  -  n - the vector length
1105 
1106  Output Parameter:
1107  .  v - the vector
1108 
1109  Notes:
1110  Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
1111  same type as an existing vector.
1112 
1113  Level: intermediate
1114 
1115  .seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
1116  @*/
VecCreateSeqKokkos(MPI_Comm comm,PetscInt n,Vec * v)1117 PetscErrorCode VecCreateSeqKokkos(MPI_Comm comm,PetscInt n,Vec *v)
1118 {
1119   PetscErrorCode ierr;
1120 
1121   PetscFunctionBegin;
1122   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
1123   ierr = VecCreate(comm,v);CHKERRQ(ierr);
1124   ierr = VecSetSizes(*v,n,n);CHKERRQ(ierr);
1125   ierr = VecSetType(*v,VECSEQKOKKOS);CHKERRQ(ierr); /* Calls VecCreate_SeqKokkos */
1126   PetscFunctionReturn(0);
1127 }
1128 
1129 /* Duplicate layout etc but not the values in the input vector */
VecDuplicate_SeqKokkos(Vec win,Vec * v)1130 PetscErrorCode VecDuplicate_SeqKokkos(Vec win,Vec *v)
1131 {
1132   PetscErrorCode ierr;
1133 
1134   PetscFunctionBegin;
1135   ierr = VecDuplicate_Seq(win,v);CHKERRQ(ierr); /* It also dups ops of win */
1136   PetscFunctionReturn(0);
1137 }
1138 
VecDestroy_SeqKokkos(Vec v)1139 PetscErrorCode VecDestroy_SeqKokkos(Vec v)
1140 {
1141   PetscErrorCode ierr;
1142   Vec_Kokkos     *veckok = static_cast<Vec_Kokkos*>(v->spptr);
1143   Vec_Seq        *vecseq = static_cast<Vec_Seq*>(v->data);
1144 
1145   PetscFunctionBegin;
1146   delete veckok;
1147   v->spptr = NULL;
1148   if (vecseq) {ierr = VecDestroy_Seq(v);CHKERRQ(ierr);}
1149   PetscFunctionReturn(0);
1150 }
1151