1 /*
2 Implements the sequential Kokkos vectors.
3 */
4 #include "Kokkos_Core_fwd.hpp"
5 #include "Kokkos_Macros.hpp"
6 #include "Kokkos_Parallel.hpp"
7 #include "Kokkos_Parallel_Reduce.hpp"
8 #include <petsc/private/sfimpl.h>
9 #include <petsc/private/petscimpl.h>
10 #include <petscmath.h>
11 #include <petscviewer.h>
12 #include <KokkosBlas.hpp>
13
14 #include <petscconf.h>
15 #include <petscvec.hpp>
16 #include <petscerror.h>
17 #include <../src/vec/vec/impls/dvecimpl.h> /* for VecCreate_Seq_Private */
18 #include <../src/vec/vec/impls/seq/kokkos/veckokkosimpl.hpp>
19
20 #if defined(PETSC_USE_DEBUG)
21 #define VecErrorIfNotKokkos(v) \
22 do { \
23 PetscErrorCode ierr; \
24 PetscBool isKokkos = PETSC_FALSE; \
25 ierr = PetscObjectTypeCompareAny((PetscObject)(v),&isKokkos,VECSEQKOKKOS,VECMPIKOKKOS,VECKOKKOS,"");CHKERRQ(ierr); \
26 if (!isKokkos) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Calling VECKOKKOS methods on a non-VECKOKKOS object"); \
27 } while (0)
28 #else
29 #define VecErrorIfNotKokkos(v) do {(void)(v);} while (0)
30 #endif
31
VecKokkosSyncHost(Vec v)32 PetscErrorCode VecKokkosSyncHost(Vec v)
33 {
34 Vec_Kokkos *veckok = static_cast<Vec_Kokkos*>(v->spptr);
35
36 PetscFunctionBegin;
37 VecErrorIfNotKokkos(v);
38 veckok->dual_v.sync_host();
39 PetscFunctionReturn(0);
40 }
41
VecKokkosModifyHost(Vec v)42 PetscErrorCode VecKokkosModifyHost(Vec v)
43 {
44 Vec_Kokkos *veckok = static_cast<Vec_Kokkos*>(v->spptr);
45
46 PetscFunctionBegin;
47 VecErrorIfNotKokkos(v);
48 veckok->dual_v.modify_host();
49 PetscFunctionReturn(0);
50 }
51
52 /* TODO: Is VecGetKokkosDeviceView() a better name, if we always use -vec_type kokkos */
VecKokkosGetDeviceView(Vec v,PetscScalarViewDevice_t * d_view)53 PetscErrorCode VecKokkosGetDeviceView(Vec v,PetscScalarViewDevice_t* d_view)
54 {
55 Vec_Kokkos *veckok = static_cast<Vec_Kokkos*>(v->spptr);
56
57 PetscFunctionBegin;
58 VecErrorIfNotKokkos(v);
59 veckok->dual_v.sync_device(); /* User might read and write the device view */
60 *d_view = veckok->dual_v.view_device();
61 PetscFunctionReturn(0);
62 }
63
VecKokkosRestoreDeviceView(Vec v,PetscScalarViewDevice_t * d_view)64 PetscErrorCode VecKokkosRestoreDeviceView(Vec v,PetscScalarViewDevice_t* d_view)
65 {
66 PetscErrorCode ierr;
67 Vec_Kokkos *veckok = static_cast<Vec_Kokkos*>(v->spptr);
68
69 PetscFunctionBegin;
70 VecErrorIfNotKokkos(v);
71 veckok->dual_v.modify_device();
72 ierr = PetscObjectStateIncrease((PetscObject)v);CHKERRQ(ierr);
73 PetscFunctionReturn(0);
74 }
75
VecKokkosGetDeviceViewRead(Vec v,ConstPetscScalarViewDevice_t * d_view)76 PetscErrorCode VecKokkosGetDeviceViewRead(Vec v,ConstPetscScalarViewDevice_t* d_view)
77 {
78 Vec_Kokkos *veckok = static_cast<Vec_Kokkos*>(v->spptr);
79
80 PetscFunctionBegin;
81 VecErrorIfNotKokkos(v);
82 veckok->dual_v.sync_device();
83 *d_view = veckok->dual_v.view_device();
84 PetscFunctionReturn(0);
85 }
86
87 /* VecKokkosRestoreDeviceViewRead() is defined as a no-op in header */
88
89 /* This one will overwrite the device, so no need to sync device.
90 But let's say the host is modified. In VecKokkosRestoreDeviceViewWrite(), we have to
91 clear_sync_state() to only mark the device as modified. Otherwise we will have a wrong
92 state saying both host and device are modified. If xin=yin, we have to support code sequence:
93
94 VecKokkosGetDeviceViewWrite(xin,&xv);
95 VecKokkosGetDeviceViewRead(yin,&yv);
96 ...
97 VecKokkosRestoreDeviceViewWrite(xin,&xv);
98 VecKokkosRestoreDeviceViewRead(yin,&yv);
99 */
VecKokkosGetDeviceViewWrite(Vec v,PetscScalarViewDevice_t * d_view)100 PetscErrorCode VecKokkosGetDeviceViewWrite(Vec v,PetscScalarViewDevice_t* d_view)
101 {
102 Vec_Kokkos *veckok = static_cast<Vec_Kokkos*>(v->spptr);
103
104 PetscFunctionBegin;
105 VecErrorIfNotKokkos(v);
106 *d_view = veckok->dual_v.view_device();
107 PetscFunctionReturn(0);
108 }
109
VecKokkosRestoreDeviceViewWrite(Vec v,PetscScalarViewDevice_t * dv)110 PetscErrorCode VecKokkosRestoreDeviceViewWrite(Vec v,PetscScalarViewDevice_t* dv)
111 {
112 PetscErrorCode ierr;
113 Vec_Kokkos *veckok = static_cast<Vec_Kokkos*>(v->spptr);
114
115 PetscFunctionBegin;
116 VecErrorIfNotKokkos(v);
117 veckok->dual_v.clear_sync_state();
118 veckok->dual_v.modify_device();
119 ierr = PetscObjectStateIncrease((PetscObject)v);CHKERRQ(ierr);
120 PetscFunctionReturn(0);
121 }
122
VecKokkosGetArrayInPlace(Vec v,PetscScalar ** array)123 PetscErrorCode VecKokkosGetArrayInPlace(Vec v,PetscScalar** array)
124 {
125 PetscErrorCode ierr;
126
127 PetscFunctionBegin;
128 ierr = VecKokkosGetArrayInPlace_Internal(v,array,NULL);CHKERRQ(ierr);
129 PetscFunctionReturn(0);
130 }
131
VecKokkosGetArrayInPlace_Internal(Vec v,PetscScalar ** array,PetscMemType * mtype)132 PetscErrorCode VecKokkosGetArrayInPlace_Internal(Vec v,PetscScalar** array,PetscMemType *mtype)
133 {
134 Vec_Kokkos *veckok = static_cast<Vec_Kokkos*>(v->spptr);
135
136 PetscFunctionBegin;
137 VecErrorIfNotKokkos(v);
138 if (veckok->dual_v.need_sync_device()) {
139 /* Host has newer data than device */
140 *array = veckok->dual_v.view_host().data();
141 if (mtype) *mtype = PETSC_MEMTYPE_HOST;
142 } else {
143 /* Device has newer or same data as host. We prefer returning devcie data*/
144 *array = veckok->dual_v.view_device().data();
145 if (mtype) *mtype = PETSC_MEMTYPE_DEVICE;
146 }
147 PetscFunctionReturn(0);
148 }
149
VecKokkosRestoreArrayInPlace(Vec v,PetscScalar ** array)150 PetscErrorCode VecKokkosRestoreArrayInPlace(Vec v,PetscScalar** array)
151 {
152 PetscErrorCode ierr;
153 Vec_Kokkos *veckok = static_cast<Vec_Kokkos*>(v->spptr);
154
155 PetscFunctionBegin;
156 VecErrorIfNotKokkos(v);
157 if (veckok->dual_v.need_sync_device()) { /* Host has newer data than device */
158 veckok->dual_v.modify_host();
159 } else {
160 veckok->dual_v.modify_device();
161 }
162 ierr = PetscObjectStateIncrease((PetscObject)v);CHKERRQ(ierr);
163 PetscFunctionReturn(0);
164 }
165
VecKokkosGetArrayReadInPlace(Vec v,const PetscScalar ** array)166 PetscErrorCode VecKokkosGetArrayReadInPlace(Vec v,const PetscScalar** array)
167 {
168 PetscErrorCode ierr;
169
170 PetscFunctionBegin;
171 ierr = VecKokkosGetArrayReadInPlace_Internal(v,array,NULL);CHKERRQ(ierr);
172 PetscFunctionReturn(0);
173 }
174
VecKokkosGetArrayReadInPlace_Internal(Vec v,const PetscScalar ** array,PetscMemType * mtype)175 PetscErrorCode VecKokkosGetArrayReadInPlace_Internal(Vec v,const PetscScalar** array,PetscMemType *mtype)
176 {
177 Vec_Kokkos *veckok = static_cast<Vec_Kokkos*>(v->spptr);
178
179 PetscFunctionBegin;
180 VecErrorIfNotKokkos(v);
181 if (veckok->dual_v.need_sync_device()) { /* Host has newer data than device */
182 *array = veckok->dual_v.view_host().data();
183 if (mtype) *mtype = PETSC_MEMTYPE_HOST;
184 } else {
185 *array = veckok->dual_v.view_device().data();
186 if (mtype) *mtype = PETSC_MEMTYPE_DEVICE;
187 }
188 PetscFunctionReturn(0);
189 }
190
VecSetRandom_SeqKokkos(Vec xin,PetscRandom r)191 PetscErrorCode VecSetRandom_SeqKokkos(Vec xin,PetscRandom r)
192 {
193 PetscErrorCode ierr;
194 PetscInt n = xin->map->n,i;
195 PetscScalar *xx;
196
197 PetscFunctionBegin;
198 ierr = VecGetArrayWrite(xin,&xx);CHKERRQ(ierr); /* TODO: generate randoms directly on device */
199 for (i=0; i<n; i++) { ierr = PetscRandomGetValue(r,&xx[i]);CHKERRQ(ierr); }
200 ierr = VecRestoreArrayWrite(xin,&xx);CHKERRQ(ierr);
201 PetscFunctionReturn(0);
202 }
203
204 /* x = |x| */
VecAbs_SeqKokkos(Vec xin)205 PetscErrorCode VecAbs_SeqKokkos(Vec xin)
206 {
207 PetscErrorCode ierr;
208 PetscScalarViewDevice_t xv;
209
210 PetscFunctionBegin;
211 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
212 ierr = VecKokkosGetDeviceView(xin,&xv);CHKERRQ(ierr);
213 KokkosBlas::abs(xv,xv);
214 ierr = WaitForKokkos();CHKERRQ(ierr);
215 ierr = VecKokkosRestoreDeviceView(xin,&xv);CHKERRQ(ierr);
216 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
217 PetscFunctionReturn(0);
218 }
219
220 /* x = 1/x */
VecReciprocal_SeqKokkos(Vec xin)221 PetscErrorCode VecReciprocal_SeqKokkos(Vec xin)
222 {
223 PetscErrorCode ierr;
224 PetscScalarViewDevice_t xv;
225
226 PetscFunctionBegin;
227 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
228 ierr = VecKokkosGetDeviceView(xin,&xv);CHKERRQ(ierr);
229 /* KokkosBlas::reciprocal(xv,xv); */
230 Kokkos::parallel_for(xv.extent(0),KOKKOS_LAMBDA(const int64_t i) {if (xv(i) != (PetscScalar)0.0) xv(i) = (PetscScalar)1.0/xv(i);});
231 ierr = WaitForKokkos();CHKERRQ(ierr);
232 ierr = VecKokkosRestoreDeviceView(xin,&xv);CHKERRQ(ierr);
233 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
234 PetscFunctionReturn(0);
235 }
236
VecMin_SeqKokkos(Vec xin,PetscInt * p,PetscReal * val)237 PetscErrorCode VecMin_SeqKokkos(Vec xin,PetscInt *p,PetscReal *val)
238 {
239 typedef Kokkos::MinLoc<PetscReal,PetscInt>::value_type MinLocValue_t;
240
241 PetscErrorCode ierr;
242 ConstPetscScalarViewDevice_t xv;
243 MinLocValue_t minloc;
244
245 PetscFunctionBegin;
246 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
247 ierr = VecKokkosGetDeviceViewRead(xin,&xv);CHKERRQ(ierr);
248 Kokkos::parallel_reduce("VecMin",xin->map->n,KOKKOS_LAMBDA(PetscInt i,MinLocValue_t& lminloc) {
249 if (xv(i) < lminloc.val) {
250 lminloc.val = xv(i);
251 lminloc.loc = i;
252 }
253 },Kokkos::MinLoc<PetscReal,PetscInt>(minloc)); /* Kokkos will set minloc properly even if xin is zero-lengthed */
254 *p = minloc.loc;
255 *val = minloc.val;
256 ierr = VecKokkosRestoreDeviceViewRead(xin,&xv);CHKERRQ(ierr);
257 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
258 PetscFunctionReturn(0);
259 }
260
VecMax_SeqKokkos(Vec xin,PetscInt * p,PetscReal * val)261 PetscErrorCode VecMax_SeqKokkos(Vec xin,PetscInt *p,PetscReal *val)
262 {
263 typedef Kokkos::MaxLoc<PetscReal,PetscInt>::value_type MaxLocValue_t;
264
265 PetscErrorCode ierr;
266 ConstPetscScalarViewDevice_t xv;
267 MaxLocValue_t maxloc;
268
269 PetscFunctionBegin;
270 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
271 ierr = VecKokkosGetDeviceViewRead(xin,&xv);CHKERRQ(ierr);
272 Kokkos::parallel_reduce("VecMax",xin->map->n,KOKKOS_LAMBDA(PetscInt i,MaxLocValue_t& lmaxloc) {
273 if (xv(i) > lmaxloc.val) {
274 lmaxloc.val = xv(i);
275 lmaxloc.loc = i;
276 }
277 },Kokkos::MaxLoc<PetscReal,PetscInt>(maxloc));
278 *p = maxloc.loc;
279 *val = maxloc.val;
280 ierr = VecKokkosRestoreDeviceViewRead(xin,&xv);CHKERRQ(ierr);
281 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
282 PetscFunctionReturn(0);
283 }
284
VecShift_SeqKokkos(Vec xin,PetscScalar shift)285 PetscErrorCode VecShift_SeqKokkos(Vec xin,PetscScalar shift)
286 {
287 PetscErrorCode ierr;
288 PetscScalarViewDevice_t xv;
289
290 PetscFunctionBegin;
291 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
292 ierr = VecKokkosGetDeviceView(xin,&xv);CHKERRQ(ierr);
293 Kokkos::parallel_for("VecShift",xin->map->n,KOKKOS_LAMBDA(PetscInt i) {xv(i) += shift;});
294 ierr = VecKokkosRestoreDeviceView(xin,&xv);CHKERRQ(ierr);
295 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
296 PetscFunctionReturn(0);
297 }
298
299 /* y = alpha x + y */
VecAXPY_SeqKokkos(Vec yin,PetscScalar alpha,Vec xin)300 PetscErrorCode VecAXPY_SeqKokkos(Vec yin,PetscScalar alpha,Vec xin)
301 {
302 PetscErrorCode ierr;
303 PetscBool xiskok,yiskok;
304 PetscScalarViewDevice_t yv;
305 ConstPetscScalarViewDevice_t xv;
306
307 PetscFunctionBegin;
308 if (alpha == (PetscScalar)0.0) PetscFunctionReturn(0);
309 if (yin == xin) {
310 ierr = VecScale_SeqKokkos(yin,alpha+1);CHKERRQ(ierr);
311 PetscFunctionReturn(0);
312 }
313 ierr = PetscObjectTypeCompareAny((PetscObject)xin,&xiskok,VECSEQKOKKOS,VECMPIKOKKOS,"");CHKERRQ(ierr);
314 ierr = PetscObjectTypeCompareAny((PetscObject)yin,&yiskok,VECSEQKOKKOS,VECMPIKOKKOS,"");CHKERRQ(ierr);
315 if (xiskok && yiskok) {
316 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
317 ierr = VecKokkosGetDeviceViewRead(xin,&xv);CHKERRQ(ierr);
318 ierr = VecKokkosGetDeviceView(yin,&yv);CHKERRQ(ierr);
319 KokkosBlas::axpy(alpha,xv,yv);
320 ierr = VecKokkosRestoreDeviceViewRead(xin,&xv);CHKERRQ(ierr);
321 ierr = VecKokkosRestoreDeviceView(yin,&yv);CHKERRQ(ierr);
322 ierr = WaitForKokkos();CHKERRQ(ierr);
323 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
324 ierr = PetscLogGpuFlops(2.0*yin->map->n);CHKERRQ(ierr);
325 } else {
326 ierr = VecAXPY_Seq(yin,alpha,xin);CHKERRQ(ierr);
327 }
328 PetscFunctionReturn(0);
329 }
330
331 /* y = x + beta y */
VecAYPX_SeqKokkos(Vec yin,PetscScalar beta,Vec xin)332 PetscErrorCode VecAYPX_SeqKokkos(Vec yin,PetscScalar beta,Vec xin)
333 {
334 PetscErrorCode ierr;
335
336 PetscFunctionBegin;
337 /* One needs to define KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 to have optimizations for cases alpha/beta = 0,+/-1 */
338 ierr = VecAXPBY_SeqKokkos(yin,1.0,beta,xin);CHKERRQ(ierr);
339 PetscFunctionReturn(0);
340 }
341
342 /* z = y^H x */
VecDot_SeqKokkos(Vec xin,Vec yin,PetscScalar * z)343 PetscErrorCode VecDot_SeqKokkos(Vec xin,Vec yin,PetscScalar *z)
344 {
345 PetscErrorCode ierr;
346 ConstPetscScalarViewDevice_t xv,yv;
347
348 PetscFunctionBegin;
349 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
350 ierr = VecKokkosGetDeviceViewRead(xin,&xv);CHKERRQ(ierr);
351 ierr = VecKokkosGetDeviceViewRead(yin,&yv);CHKERRQ(ierr);
352 Kokkos::parallel_reduce("VecDot",xin->map->n,KOKKOS_LAMBDA(int64_t i, PetscScalar& update) {
353 #if defined(PETSC_USE_COMPLEX)
354 update += Kokkos::conj(yv(i))*xv(i);
355 #else
356 update += yv(i)*xv(i);
357 #endif
358 },*z); /* Kokkos always overwrites z, so no need to init it */
359 ierr = WaitForKokkos();CHKERRQ(ierr);
360 ierr = VecKokkosRestoreDeviceViewRead(yin,&yv);CHKERRQ(ierr);
361 ierr = VecKokkosRestoreDeviceViewRead(xin,&xv);CHKERRQ(ierr);
362 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
363 if (xin->map->n > 0) {ierr = PetscLogGpuFlops(2.0*xin->map->n);CHKERRQ(ierr);}
364 PetscFunctionReturn(0);
365 }
366
367 struct MDotFunctor {
368 /* Note the C++ notation for an array typedef */
369 typedef PetscScalar value_type[];
370 typedef ConstPetscScalarViewDevice_t::size_type size_type;
371
372 /* Tell Kokkos the result array's number of entries. This must be a public value in the functor */
373 const size_type value_count;
374 ConstPetscScalarViewDevice_t xv,yv[8];
375
MDotFunctorMDotFunctor376 MDotFunctor(ConstPetscScalarViewDevice_t& xv,
377 const PetscInt ny, /* Number of valid entries in yv[8]. 1 <= ny <= 8 */
378 ConstPetscScalarViewDevice_t& yv0, ConstPetscScalarViewDevice_t& yv1,
379 ConstPetscScalarViewDevice_t& yv2, ConstPetscScalarViewDevice_t& yv3,
380 ConstPetscScalarViewDevice_t& yv4, ConstPetscScalarViewDevice_t& yv5,
381 ConstPetscScalarViewDevice_t& yv6, ConstPetscScalarViewDevice_t& yv7)
382 : value_count(ny),xv(xv)
383 {
384 yv[0] = yv0; yv[1] = yv1;
385 yv[2] = yv2; yv[3] = yv3;
386 yv[4] = yv4; yv[5] = yv5;
387 yv[6] = yv6; yv[7] = yv7;
388 }
389
operator ()MDotFunctor390 KOKKOS_INLINE_FUNCTION void operator() (const size_type i,value_type sum) const {
391 PetscScalar xval = xv(i);
392 for (size_type j = 0; j<value_count; ++j) {
393 #if defined(PETSC_USE_COMPLEX)
394 sum[j] += Kokkos::conj(yv[j](i))*xval;
395 #else
396 sum[j] += yv[j](i)*xval;
397 #endif
398 }
399 }
400
joinMDotFunctor401 KOKKOS_INLINE_FUNCTION void join (volatile value_type dst,const volatile value_type src) const
402 {
403 for (size_type j = 0; j<value_count; j++) dst[j] += src[j];
404 }
405
initMDotFunctor406 KOKKOS_INLINE_FUNCTION void init (value_type sum) const
407 {
408 for (size_type j = 0; j<value_count; j++) sum[j] = 0.0;
409 }
410 };
411
412 /* z[i] = (x,y_i) = y_i^H x */
VecMDot_SeqKokkos(Vec xin,PetscInt nv,const Vec yin[],PetscScalar * z)413 PetscErrorCode VecMDot_SeqKokkos(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
414 {
415 PetscErrorCode ierr;
416 PetscInt i,j,cur=0,ngroup=nv/8,rem=nv%8;
417 ConstPetscScalarViewDevice_t xv,yv[8];
418 PetscScalarViewHost_t zv(z,nv);
419
420 PetscFunctionBegin;
421 ierr = VecKokkosGetDeviceViewRead(xin,&xv);CHKERRQ(ierr);
422 for (i=0; i<ngroup; i++) { /* 8 y's per group */
423 for (j=0; j<8; j++) {ierr = VecKokkosGetDeviceViewRead(yin[cur+j],&yv[j]);CHKERRQ(ierr);}
424 MDotFunctor mdot(xv,8,yv[0],yv[1],yv[2],yv[3],yv[4],yv[5],yv[6],yv[7]); /* Hope Kokkos make it asynchronous */
425 Kokkos::parallel_reduce(xin->map->n,mdot,Kokkos::subview(zv,Kokkos::pair<PetscInt,PetscInt>(cur,cur+8)));
426 for (j=0; j<8; j++) {ierr = VecKokkosRestoreDeviceViewRead(yin[cur+j],&yv[j]);CHKERRQ(ierr);}
427 cur += 8;
428 }
429
430 if (rem) { /* The remaining */
431 for (j=0; j<rem; j++) {ierr = VecKokkosGetDeviceViewRead(yin[cur+j],&yv[j]);CHKERRQ(ierr);}
432 MDotFunctor mdot(xv,rem,yv[0],yv[1],yv[2],yv[3],yv[4],yv[5],yv[6],yv[7]);
433 Kokkos::parallel_reduce(xin->map->n,mdot,Kokkos::subview(zv,Kokkos::pair<PetscInt,PetscInt>(cur,cur+rem)));
434 for (j=0; j<rem; j++) {ierr = VecKokkosRestoreDeviceViewRead(yin[cur+j],&yv[j]);CHKERRQ(ierr);}
435 }
436 ierr = VecKokkosRestoreDeviceViewRead(xin,&xv);CHKERRQ(ierr);
437 Kokkos::fence(); /* If reduce is async, then we need this fence to make sure z is ready for use on host */
438 PetscFunctionReturn(0);
439 }
440
441 /* x[:] = alpha */
VecSet_SeqKokkos(Vec xin,PetscScalar alpha)442 PetscErrorCode VecSet_SeqKokkos(Vec xin,PetscScalar alpha)
443 {
444 PetscErrorCode ierr;
445 PetscScalarViewDevice_t xv;
446
447 PetscFunctionBegin;
448 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
449 ierr = VecKokkosGetDeviceViewWrite(xin,&xv);CHKERRQ(ierr);
450 KokkosBlas::fill(xv,alpha);
451 ierr = WaitForKokkos();CHKERRQ(ierr);
452 ierr = VecKokkosRestoreDeviceViewWrite(xin,&xv);CHKERRQ(ierr);
453 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
454 PetscFunctionReturn(0);
455 }
456
457 /* x = alpha x */
VecScale_SeqKokkos(Vec xin,PetscScalar alpha)458 PetscErrorCode VecScale_SeqKokkos(Vec xin,PetscScalar alpha)
459 {
460 PetscErrorCode ierr;
461 PetscScalarViewDevice_t xv;
462
463 PetscFunctionBegin;
464 if (alpha == (PetscScalar)0.0) {
465 ierr = VecSet_SeqKokkos(xin,alpha);CHKERRQ(ierr);
466 } else if (alpha != (PetscScalar)1.0) {
467 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
468 ierr = VecKokkosGetDeviceView(xin,&xv);CHKERRQ(ierr);
469 KokkosBlas::scal(xv,alpha,xv);
470 ierr = VecKokkosRestoreDeviceView(xin,&xv);CHKERRQ(ierr);
471 ierr = WaitForKokkos();CHKERRQ(ierr);
472 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
473 ierr = PetscLogGpuFlops(xin->map->n);CHKERRQ(ierr);
474 }
475 PetscFunctionReturn(0);
476 }
477
478 /* z = x^T y */
VecTDot_SeqKokkos(Vec xin,Vec yin,PetscScalar * z)479 PetscErrorCode VecTDot_SeqKokkos(Vec xin,Vec yin,PetscScalar *z)
480 {
481 PetscErrorCode ierr;
482 ConstPetscScalarViewDevice_t xv,yv;
483
484 PetscFunctionBegin;
485 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
486 ierr = VecKokkosGetDeviceViewRead(xin,&xv);CHKERRQ(ierr);
487 ierr = VecKokkosGetDeviceViewRead(yin,&yv);CHKERRQ(ierr);
488 *z = KokkosBlas::dot(xv,yv);
489 ierr = VecKokkosRestoreDeviceViewRead(xin,&xv);CHKERRQ(ierr);
490 ierr = VecKokkosRestoreDeviceViewRead(yin,&yv);CHKERRQ(ierr);
491 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
492 if (xin->map->n > 0) {ierr = PetscLogGpuFlops(2.0*xin->map->n-1);CHKERRQ(ierr);}
493 PetscFunctionReturn(0);
494 }
495
496 /* y = x, where x is VECKOKKOS, but y may be not */
VecCopy_SeqKokkos(Vec xin,Vec yin)497 PetscErrorCode VecCopy_SeqKokkos(Vec xin,Vec yin)
498 {
499 PetscErrorCode ierr;
500
501 PetscFunctionBegin;
502 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
503 if (xin != yin) {
504 Vec_Kokkos *xkok = static_cast<Vec_Kokkos*>(xin->spptr);
505 if (yin->offloadmask == PETSC_OFFLOAD_VECKOKKOS) {
506 /* y is also a VecKokkos */
507 Vec_Kokkos *ykok = static_cast<Vec_Kokkos*>(yin->spptr);
508 /* Kokkos rule: if x's host has newer data, it will copy to y's host view; otherwise to y's device view
509 In case x's host ins newer, y's device is newer, it will error (though should not, I think). So we just
510 clear y's sync state.
511 */
512 ykok->dual_v.clear_sync_state();
513 Kokkos::deep_copy(ykok->dual_v,xkok->dual_v);
514 } else {
515 PetscScalar *yarray;
516 ierr = VecGetArrayWrite(yin,&yarray);CHKERRQ(ierr);
517 PetscScalarViewHost_t yv(yarray,yin->map->n);
518 if (xkok->dual_v.need_sync_host()) { /* x's device has newer data */
519 Kokkos::deep_copy(yv,xkok->dual_v.view_device());
520 } else {
521 Kokkos::deep_copy(yv,xkok->dual_v.view_host());
522 }
523 ierr = VecRestoreArrayWrite(yin,&yarray);CHKERRQ(ierr);
524 }
525 }
526 ierr = WaitForKokkos();CHKERRQ(ierr);
527 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
528 PetscFunctionReturn(0);
529 }
530
531 /* y[i] <--> x[i] */
VecSwap_SeqKokkos(Vec xin,Vec yin)532 PetscErrorCode VecSwap_SeqKokkos(Vec xin,Vec yin)
533 {
534 PetscErrorCode ierr;
535 PetscScalarViewDevice_t xv,yv;
536
537 PetscFunctionBegin;
538 if (xin != yin) {
539 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
540 ierr = VecKokkosGetDeviceView(xin,&xv);CHKERRQ(ierr);
541 ierr = VecKokkosGetDeviceView(yin,&yv);CHKERRQ(ierr);
542 Kokkos::parallel_for(xin->map->n,KOKKOS_LAMBDA(const int64_t i) {
543 PetscScalar tmp = xv(i);
544 xv(i) = yv(i);
545 yv(i) = tmp;
546 });
547 ierr = WaitForKokkos();CHKERRQ(ierr);
548 ierr = VecKokkosRestoreDeviceView(xin,&xv);CHKERRQ(ierr);
549 ierr = VecKokkosRestoreDeviceView(yin,&yv);CHKERRQ(ierr);
550 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
551 }
552 PetscFunctionReturn(0);
553 }
554
555 /* w = alpha x + y */
VecWAXPY_SeqKokkos(Vec win,PetscScalar alpha,Vec xin,Vec yin)556 PetscErrorCode VecWAXPY_SeqKokkos(Vec win,PetscScalar alpha,Vec xin, Vec yin)
557 {
558 PetscErrorCode ierr;
559 ConstPetscScalarViewDevice_t xv,yv;
560 PetscScalarViewDevice_t wv;
561
562 PetscFunctionBegin;
563 if (alpha == (PetscScalar)0.0) {
564 ierr = VecCopy_SeqKokkos(yin,win);CHKERRQ(ierr);
565 } else {
566 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
567 ierr = VecKokkosGetDeviceViewWrite(win,&wv);CHKERRQ(ierr);
568 ierr = VecKokkosGetDeviceViewRead(xin,&xv);CHKERRQ(ierr);
569 ierr = VecKokkosGetDeviceViewRead(yin,&yv);CHKERRQ(ierr);
570 Kokkos::parallel_for(win->map->n,KOKKOS_LAMBDA(const int64_t i) {wv(i) = alpha*xv(i) + yv(i);});
571 ierr = VecKokkosRestoreDeviceViewRead(xin,&xv);CHKERRQ(ierr);
572 ierr = VecKokkosRestoreDeviceViewRead(yin,&yv);CHKERRQ(ierr);
573 ierr = VecKokkosRestoreDeviceViewWrite(win,&wv);CHKERRQ(ierr);
574 ierr = WaitForKokkos();CHKERRQ(ierr);
575 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
576 ierr = PetscLogGpuFlops(2*win->map->n);CHKERRQ(ierr);
577 }
578 PetscFunctionReturn(0);
579 }
580
581 struct MAXPYFunctor {
582 typedef ConstPetscScalarViewDevice_t::size_type size_type;
583
584 PetscScalarViewDevice_t yv;
585 PetscInt nx; /* Significent entries in a[8] and xv[8] */
586 PetscScalar a[8];
587 ConstPetscScalarViewDevice_t xv[8];
588
MAXPYFunctorMAXPYFunctor589 MAXPYFunctor(PetscScalarViewDevice_t yv,
590 PetscInt nx,
591 PetscScalar a0,PetscScalar a1,PetscScalar a2,PetscScalar a3,
592 PetscScalar a4,PetscScalar a5,PetscScalar a6,PetscScalar a7,
593 ConstPetscScalarViewDevice_t xv0, ConstPetscScalarViewDevice_t xv1,
594 ConstPetscScalarViewDevice_t xv2, ConstPetscScalarViewDevice_t xv3,
595 ConstPetscScalarViewDevice_t xv4, ConstPetscScalarViewDevice_t xv5,
596 ConstPetscScalarViewDevice_t xv6, ConstPetscScalarViewDevice_t xv7)
597 : yv(yv),nx(nx)
598 {
599 a[0] = a0; a[1] = a1;
600 a[2] = a2; a[3] = a3;
601 a[4] = a4; a[5] = a5;
602 a[6] = a6; a[7] = a7;
603 xv[0] = xv0; xv[1] = xv1;
604 xv[2] = xv2; xv[3] = xv3;
605 xv[4] = xv4; xv[5] = xv5;
606 xv[6] = xv6; xv[7] = xv7;
607 }
608
operator ()MAXPYFunctor609 KOKKOS_INLINE_FUNCTION void operator() (const size_type i) const {
610 for (PetscInt j = 0; j<nx; ++j) yv(i) += a[j]*xv[j](i);
611 }
612 };
613
614 /* y = y + sum alpha[i] x[i] */
VecMAXPY_SeqKokkos(Vec yin,PetscInt nv,const PetscScalar * alpha,Vec * xin)615 PetscErrorCode VecMAXPY_SeqKokkos(Vec yin, PetscInt nv,const PetscScalar *alpha,Vec *xin)
616 {
617 PetscErrorCode ierr;
618 PetscInt i,j,cur=0,ngroup=nv/8,rem=nv%8;
619 PetscScalarViewDevice_t yv;
620 PetscScalar a[8];
621 ConstPetscScalarViewDevice_t xv[8];
622
623 PetscFunctionBegin;
624 ierr = VecKokkosGetDeviceView(yin,&yv);CHKERRQ(ierr);
625 for (i=0; i<ngroup; i++) { /* 8 x's per group */
626 for (j=0; j<8; j++) { /* Fill the parameters */
627 a[j] = alpha[cur+j];
628 ierr = VecKokkosGetDeviceViewRead(xin[cur+j],&xv[j]);CHKERRQ(ierr);
629 }
630 MAXPYFunctor maxpy(yv,8,a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7],xv[0],xv[1],xv[2],xv[3],xv[4],xv[5],xv[6],xv[7]);
631 Kokkos::parallel_for(yin->map->n,maxpy);
632 for (j=0; j<8; j++) {ierr = VecKokkosRestoreDeviceViewRead(xin[cur+j],&xv[j]);CHKERRQ(ierr);}
633 cur += 8;
634 }
635
636 if (rem) { /* The remaining */
637 for (j=0; j<rem; j++) {
638 a[j] = alpha[cur+j];
639 ierr = VecKokkosGetDeviceViewRead(xin[cur+j],&xv[j]);CHKERRQ(ierr);
640 }
641 MAXPYFunctor maxpy(yv,rem,a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7],xv[0],xv[1],xv[2],xv[3],xv[4],xv[5],xv[6],xv[7]);
642 Kokkos::parallel_for(yin->map->n,maxpy);
643 for (j=0; j<rem; j++) {ierr = VecKokkosRestoreDeviceViewRead(xin[cur+j],&xv[j]);CHKERRQ(ierr);}
644 }
645 ierr = VecKokkosRestoreDeviceView(yin,&yv);CHKERRQ(ierr);
646 ierr = PetscLogGpuFlops(nv*2.0*yin->map->n);CHKERRQ(ierr);
647 ierr = WaitForKokkos();CHKERRQ(ierr);
648 PetscFunctionReturn(0);
649 }
650
651 /* y = alpha x + beta y */
VecAXPBY_SeqKokkos(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)652 PetscErrorCode VecAXPBY_SeqKokkos(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
653 {
654 PetscErrorCode ierr;
655 ConstPetscScalarViewDevice_t xv;
656 PetscScalarViewDevice_t yv;
657 PetscBool xiskok,yiskok;
658
659 PetscFunctionBegin;
660 ierr = PetscObjectTypeCompareAny((PetscObject)xin,&xiskok,VECSEQKOKKOS,VECMPIKOKKOS,"");CHKERRQ(ierr);
661 ierr = PetscObjectTypeCompareAny((PetscObject)yin,&yiskok,VECSEQKOKKOS,VECMPIKOKKOS,"");CHKERRQ(ierr);
662 if (xiskok && yiskok) {
663 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
664 ierr = VecKokkosGetDeviceViewRead(xin,&xv);CHKERRQ(ierr);
665 ierr = VecKokkosGetDeviceView(yin,&yv);CHKERRQ(ierr);
666 KokkosBlas::axpby(alpha,xv,beta,yv);
667 ierr = WaitForKokkos();CHKERRQ(ierr);
668 ierr = VecKokkosRestoreDeviceViewRead(xin,&xv);CHKERRQ(ierr);
669 ierr = VecKokkosRestoreDeviceView(yin,&yv);CHKERRQ(ierr);
670 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
671 if (alpha == (PetscScalar)0.0 || beta == (PetscScalar)0.0) {
672 ierr = PetscLogGpuFlops(xin->map->n);CHKERRQ(ierr);
673 } else if (beta == (PetscScalar)1.0 || alpha == (PetscScalar)1.0) {
674 ierr = PetscLogGpuFlops(2.0*xin->map->n);CHKERRQ(ierr);
675 } else {
676 ierr = PetscLogGpuFlops(3.0*xin->map->n);CHKERRQ(ierr);
677 }
678 } else {
679 ierr = VecAXPBY_Seq(yin,alpha,beta,xin);CHKERRQ(ierr);
680 }
681 PetscFunctionReturn(0);
682 }
683
684 /* z = alpha x + beta y + gamma z */
VecAXPBYPCZ_SeqKokkos(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)685 PetscErrorCode VecAXPBYPCZ_SeqKokkos(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
686 {
687 PetscErrorCode ierr;
688 ConstPetscScalarViewDevice_t xv,yv;
689 PetscScalarViewDevice_t zv;
690
691 PetscFunctionBegin;
692 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
693 ierr = VecKokkosGetDeviceView(zin,&zv);CHKERRQ(ierr);
694 ierr = VecKokkosGetDeviceViewRead(xin,&xv);CHKERRQ(ierr);
695 ierr = VecKokkosGetDeviceViewRead(yin,&yv);CHKERRQ(ierr);
696 KokkosBlas::update(alpha,xv,beta,yv,gamma,zv);
697 ierr = WaitForKokkos();CHKERRQ(ierr);
698 ierr = VecKokkosRestoreDeviceViewRead(xin,&xv);CHKERRQ(ierr);
699 ierr = VecKokkosRestoreDeviceViewRead(yin,&yv);CHKERRQ(ierr);
700 ierr = VecKokkosRestoreDeviceView(zin,&zv);CHKERRQ(ierr);
701 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
702 ierr = PetscLogGpuFlops(zin->map->n*5);CHKERRQ(ierr);
703 PetscFunctionReturn(0);
704 }
705
706 /* w = x*y. Any subset of the x, y, and w may be the same vector. */
VecPointwiseMult_SeqKokkos(Vec win,Vec xin,Vec yin)707 PetscErrorCode VecPointwiseMult_SeqKokkos(Vec win,Vec xin,Vec yin)
708 {
709 PetscErrorCode ierr;
710 ConstPetscScalarViewDevice_t xv,yv;
711 PetscScalarViewDevice_t wv;
712
713 PetscFunctionBegin;
714 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
715 ierr = VecKokkosGetDeviceViewWrite(win,&wv);CHKERRQ(ierr);
716 ierr = VecKokkosGetDeviceViewRead(xin,&xv);CHKERRQ(ierr);
717 ierr = VecKokkosGetDeviceViewRead(yin,&yv);CHKERRQ(ierr);
718 Kokkos::parallel_for(win->map->n,KOKKOS_LAMBDA(const int64_t i) {wv(i) = xv(i)*yv(i);});
719 ierr = WaitForKokkos();CHKERRQ(ierr);
720 ierr = VecKokkosRestoreDeviceViewRead(yin,&yv);CHKERRQ(ierr);
721 ierr = VecKokkosRestoreDeviceViewRead(xin,&xv);CHKERRQ(ierr);
722 ierr = VecKokkosRestoreDeviceViewWrite(win,&wv);CHKERRQ(ierr);
723 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
724 ierr = PetscLogGpuFlops(win->map->n);CHKERRQ(ierr);
725 PetscFunctionReturn(0);
726 }
727
728 /* w = x/y */
VecPointwiseDivide_SeqKokkos(Vec win,Vec xin,Vec yin)729 PetscErrorCode VecPointwiseDivide_SeqKokkos(Vec win,Vec xin,Vec yin)
730 {
731 PetscErrorCode ierr;
732 ConstPetscScalarViewDevice_t xv,yv;
733 PetscScalarViewDevice_t wv;
734
735 PetscFunctionBegin;
736 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
737 ierr = VecKokkosGetDeviceViewWrite(win,&wv);CHKERRQ(ierr);
738 ierr = VecKokkosGetDeviceViewRead(xin,&xv);CHKERRQ(ierr);
739 ierr = VecKokkosGetDeviceViewRead(yin,&yv);CHKERRQ(ierr);
740 Kokkos::parallel_for(win->map->n,KOKKOS_LAMBDA(const int64_t i) {
741 if (yv(i) != 0.0) wv(i) = xv(i)/yv(i);
742 else wv(i) = 0.0;
743 });
744 ierr = WaitForKokkos();CHKERRQ(ierr);
745 ierr = VecKokkosRestoreDeviceViewRead(yin,&yv);CHKERRQ(ierr);
746 ierr = VecKokkosRestoreDeviceViewRead(xin,&xv);CHKERRQ(ierr);
747 ierr = VecKokkosRestoreDeviceViewWrite(win,&wv);CHKERRQ(ierr);
748 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
749 ierr = PetscLogGpuFlops(win->map->n);CHKERRQ(ierr);
750 PetscFunctionReturn(0);
751 }
752
VecNorm_SeqKokkos(Vec xin,NormType type,PetscReal * z)753 PetscErrorCode VecNorm_SeqKokkos(Vec xin,NormType type,PetscReal *z)
754 {
755 PetscErrorCode ierr;
756 const PetscInt n = xin->map->n;
757 ConstPetscScalarViewDevice_t xv;
758
759 PetscFunctionBegin;
760 if (type == NORM_1_AND_2) {
761 ierr = VecNorm_SeqKokkos(xin,NORM_1,z);CHKERRQ(ierr);
762 ierr = VecNorm_SeqKokkos(xin,NORM_2,z+1);CHKERRQ(ierr);
763 } else {
764 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
765 ierr = VecKokkosGetDeviceViewRead(xin,&xv);CHKERRQ(ierr);
766 if (type == NORM_2 || type == NORM_FROBENIUS) {
767 *z = KokkosBlas::nrm2(xv);
768 ierr = PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));CHKERRQ(ierr);
769 } else if (type == NORM_1) {
770 *z = KokkosBlas::nrm1(xv);
771 ierr = PetscLogGpuFlops(PetscMax(n-1.0,0.0));CHKERRQ(ierr);
772 } else if (type == NORM_INFINITY) {
773 *z = KokkosBlas::nrminf(xv);
774 }
775 ierr = VecKokkosRestoreDeviceViewRead(xin,&xv);CHKERRQ(ierr);
776 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
777 }
778 PetscFunctionReturn(0);
779 }
780
781 /* A functor for DotNorm2 so that we can compute dp and nm in one kernel */
782 struct DotNorm2 {
783 typedef PetscScalar value_type[];
784 typedef ConstPetscScalarViewDevice_t::size_type size_type;
785
786 size_type value_count;
787 ConstPetscScalarViewDevice_t xv_, yv_;
788
DotNorm2DotNorm2789 DotNorm2(ConstPetscScalarViewDevice_t& xv,ConstPetscScalarViewDevice_t& yv) :
790 value_count(2), xv_(xv), yv_(yv) {}
791
operator ()DotNorm2792 KOKKOS_INLINE_FUNCTION void operator() (const size_type i, value_type result) const {
793 #if defined(PETSC_USE_COMPLEX)
794 result[0] += Kokkos::conj(yv_(i))*xv_(i);
795 result[1] += Kokkos::conj(yv_(i))*xv_(i);
796 #else
797 result[0] += yv_(i)*xv_(i);
798 result[1] += yv_(i)*yv_(i);
799 #endif
800 }
801
joinDotNorm2802 KOKKOS_INLINE_FUNCTION void join (volatile value_type dst, const volatile value_type src) const {
803 dst[0] += src[0];
804 dst[1] += src[1];
805 }
806
initDotNorm2807 KOKKOS_INLINE_FUNCTION void init (value_type result) const {
808 result[0] = 0.0;
809 result[1] = 0.0;
810 }
811 };
812
813 /* dp = y^H x, nm = y^H y */
VecDotNorm2_SeqKokkos(Vec xin,Vec yin,PetscScalar * dp,PetscScalar * nm)814 PetscErrorCode VecDotNorm2_SeqKokkos(Vec xin, Vec yin, PetscScalar *dp, PetscScalar *nm)
815 {
816 PetscErrorCode ierr;
817 ConstPetscScalarViewDevice_t xv,yv;
818 PetscScalar result[2];
819
820 PetscFunctionBegin;
821 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
822 ierr = VecKokkosGetDeviceViewRead(xin,&xv);CHKERRQ(ierr);
823 ierr = VecKokkosGetDeviceViewRead(yin,&yv);CHKERRQ(ierr);
824 DotNorm2 dn(xv,yv);
825 Kokkos::parallel_reduce(xin->map->n,dn,result);
826 ierr = VecKokkosRestoreDeviceViewRead(yin,&yv);CHKERRQ(ierr);
827 ierr = VecKokkosRestoreDeviceViewRead(xin,&xv);CHKERRQ(ierr);
828 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
829 ierr = PetscLogGpuFlops(4.0*xin->map->n);CHKERRQ(ierr);
830 PetscFunctionReturn(0);
831 }
832
VecConjugate_SeqKokkos(Vec xin)833 PetscErrorCode VecConjugate_SeqKokkos(Vec xin)
834 {
835 #if defined(PETSC_USE_COMPLEX)
836 PetscErrorCode ierr;
837 PetscScalarViewDevice_t xv;
838
839 PetscFunctionBegin;
840 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
841 ierr = VecKokkosGetDeviceView(xin,&xv);CHKERRQ(ierr);
842 Kokkos::parallel_for(xin->map->n,KOKKOS_LAMBDA(int64_t i) {xv(i) = Kokkos::conj(xv(i));});
843 ierr = VecKokkosRestoreDeviceView(xin,&xv);CHKERRQ(ierr);
844 ierr = WaitForKokkos();CHKERRQ(err);
845 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
846 #else
847 PetscFunctionBegin;
848 #endif
849 PetscFunctionReturn(0);
850 }
851
VecKokkosUpdateAfterChangingHostArray_Private(Vec v)852 static PetscErrorCode VecKokkosUpdateAfterChangingHostArray_Private(Vec v)
853 {
854 Vec_Kokkos *veckok = static_cast<Vec_Kokkos*>(v->spptr);
855 Vec_Seq *vecseq = static_cast<Vec_Seq*>(v->data);
856
857 PetscFunctionBegin;
858 /* Rebuild h_v and dual_v with the new host array*/
859 veckok->h_v = PetscScalarViewHost_t(vecseq->array,v->map->n);
860 veckok->dual_v = PetscScalarKokkosDualView_t(veckok->d_v,veckok->h_v);
861 veckok->dual_v.modify_host();
862 PetscFunctionReturn(0);
863 }
864
865 /* Temporarily replace the array in vin with a[]. Return to the original array with a call to VecResetArray() */
VecPlaceArray_SeqKokkos(Vec vin,const PetscScalar * a)866 PetscErrorCode VecPlaceArray_SeqKokkos(Vec vin,const PetscScalar *a)
867 {
868 PetscErrorCode ierr;
869
870 PetscFunctionBegin;
871 ierr = VecPlaceArray_Seq(vin,a);CHKERRQ(ierr);
872 ierr = VecKokkosUpdateAfterChangingHostArray_Private(vin);CHKERRQ(ierr);
873 PetscFunctionReturn(0);
874 }
875
VecResetArray_SeqKokkos(Vec vin)876 PetscErrorCode VecResetArray_SeqKokkos(Vec vin)
877 {
878 PetscErrorCode ierr;
879
880 PetscFunctionBegin;
881 ierr = VecKokkosSyncHost(vin);CHKERRQ(ierr); /* User wants to unhook the provided host array. Sync it so that user can get the latest */
882 ierr = VecResetArray_Seq(vin);CHKERRQ(ierr);
883 ierr = VecKokkosUpdateAfterChangingHostArray_Private(vin);CHKERRQ(ierr);
884 PetscFunctionReturn(0);
885 }
886
887 /* Replace the array in vin with a[] that must be allocated by PetscMalloc. a[] is owned by vin afterwords. */
VecReplaceArray_SeqKokkos(Vec vin,const PetscScalar * a)888 PetscErrorCode VecReplaceArray_SeqKokkos(Vec vin,const PetscScalar *a)
889 {
890 PetscErrorCode ierr;
891 Vec_Seq *vecseq = (Vec_Seq*)vin->data;
892
893 PetscFunctionBegin;
894 /* Make sure the users array has the latest values */
895 if (vecseq->array != vecseq->array_allocated) {ierr = VecKokkosSyncHost(vin);CHKERRQ(ierr);}
896 ierr = VecReplaceArray_Seq(vin,a);CHKERRQ(ierr);
897 ierr = VecKokkosUpdateAfterChangingHostArray_Private(vin);CHKERRQ(ierr);
898 PetscFunctionReturn(0);
899 }
900
901 /* Maps the local portion of vector v into vector w */
VecGetLocalVector_SeqKokkos(Vec v,Vec w)902 PetscErrorCode VecGetLocalVector_SeqKokkos(Vec v,Vec w)
903 {
904 PetscErrorCode ierr;
905 Vec_Seq *vecseq = static_cast<Vec_Seq*>(w->data);
906 Vec_Kokkos *veckok = static_cast<Vec_Kokkos*>(w->spptr);
907
908 PetscFunctionBegin;
909 PetscCheckTypeName(w,VECSEQKOKKOS);
910 /* Destroy w->data, w->spptr */
911 if (vecseq) {
912 ierr = PetscFree(vecseq->array_allocated);CHKERRQ(ierr);
913 ierr = PetscFree(w->data);CHKERRQ(ierr);
914 }
915 delete veckok;
916
917 /* Replace with v's */
918 w->data = v->data;
919 w->spptr = v->spptr;
920 ierr = PetscObjectStateIncrease((PetscObject)w);CHKERRQ(ierr);
921 PetscFunctionReturn(0);
922 }
923
VecRestoreLocalVector_SeqKokkos(Vec v,Vec w)924 PetscErrorCode VecRestoreLocalVector_SeqKokkos(Vec v,Vec w)
925 {
926 PetscErrorCode ierr;
927
928 PetscFunctionBegin;
929 PetscCheckTypeName(w,VECSEQKOKKOS);
930 v->data = w->data;
931 v->spptr = w->spptr;
932 ierr = PetscObjectStateIncrease((PetscObject)v);CHKERRQ(ierr);
933 /* TODO: need to think if setting w->data/spptr to NULL is safe */
934 w->data = NULL;
935 w->spptr = NULL;
936 PetscFunctionReturn(0);
937 }
938
939 /* Get array on host to overwrite, so no need to sync host. In VecRestoreArrayWrite() we will mark host is modified. */
VecGetArrayWrite_SeqKokkos(Vec v,PetscScalar ** vv)940 PetscErrorCode VecGetArrayWrite_SeqKokkos(Vec v,PetscScalar **vv)
941 {
942 Vec_Kokkos *veckok = static_cast<Vec_Kokkos*>(v->spptr);
943
944 PetscFunctionBegin;
945 veckok->dual_v.clear_sync_state();
946 *vv = veckok->dual_v.view_host().data();
947 PetscFunctionReturn(0);
948 }
949
VecSetOps_SeqKokkos(Vec v)950 static PetscErrorCode VecSetOps_SeqKokkos(Vec v)
951 {
952 PetscFunctionBegin;
953 v->ops->abs = VecAbs_SeqKokkos;
954 v->ops->reciprocal = VecReciprocal_SeqKokkos;
955 v->ops->pointwisemult = VecPointwiseMult_SeqKokkos;
956 v->ops->min = VecMin_SeqKokkos;
957 v->ops->max = VecMax_SeqKokkos;
958 v->ops->shift = VecShift_SeqKokkos;
959 v->ops->dot = VecDot_SeqKokkos;
960 v->ops->norm = VecNorm_SeqKokkos;
961 v->ops->tdot = VecTDot_SeqKokkos;
962 v->ops->scale = VecScale_SeqKokkos;
963 v->ops->copy = VecCopy_SeqKokkos;
964 v->ops->set = VecSet_SeqKokkos;
965 v->ops->swap = VecSwap_SeqKokkos;
966 v->ops->axpy = VecAXPY_SeqKokkos;
967 v->ops->axpby = VecAXPBY_SeqKokkos;
968 v->ops->axpbypcz = VecAXPBYPCZ_SeqKokkos;
969 v->ops->pointwisedivide = VecPointwiseDivide_SeqKokkos;
970 v->ops->setrandom = VecSetRandom_SeqKokkos;
971 v->ops->dot_local = VecDot_SeqKokkos;
972 v->ops->tdot_local = VecTDot_SeqKokkos;
973 v->ops->norm_local = VecNorm_SeqKokkos;
974 v->ops->mdot_local = VecMDot_SeqKokkos;
975 v->ops->maxpy = VecMAXPY_SeqKokkos;
976 v->ops->mdot = VecMDot_SeqKokkos;
977 v->ops->aypx = VecAYPX_SeqKokkos;
978 v->ops->waxpy = VecWAXPY_SeqKokkos;
979 v->ops->dotnorm2 = VecDotNorm2_SeqKokkos;
980 v->ops->placearray = VecPlaceArray_SeqKokkos;
981 v->ops->replacearray = VecReplaceArray_SeqKokkos;
982 v->ops->resetarray = VecResetArray_SeqKokkos;
983 v->ops->destroy = VecDestroy_SeqKokkos;
984 v->ops->duplicate = VecDuplicate_SeqKokkos;
985 v->ops->conjugate = VecConjugate_SeqKokkos;
986 v->ops->getlocalvector = VecGetLocalVector_SeqKokkos;
987 v->ops->restorelocalvector = VecRestoreLocalVector_SeqKokkos;
988 v->ops->getlocalvectorread = VecGetLocalVector_SeqKokkos;
989 v->ops->restorelocalvectorread = VecRestoreLocalVector_SeqKokkos;
990 v->ops->getarraywrite = VecGetArrayWrite_SeqKokkos;
991 PetscFunctionReturn(0);
992 }
993
994 /* Assuming the Vec_Seq struct of the vector is ready, build the Vec_Kokkos struct for it.
995 Also assume the vector is to be init'ed, so it is in a host-device sync'ed state after the call.
996 */
BuildVecKokkosFromVecSeq_Private(Vec v)997 static PetscErrorCode BuildVecKokkosFromVecSeq_Private(Vec v)
998 {
999 Vec_Seq *vecseq = static_cast<Vec_Seq*>(v->data);
1000 Vec_Kokkos *veckok = NULL;
1001 PetscScalar *darray;
1002
1003 PetscFunctionBegin;
1004 if (std::is_same<DeviceMemorySpace,HostMemorySpace>::value) {
1005 darray = vecseq->array;
1006 } else {
1007 darray = static_cast<PetscScalar*>(Kokkos::kokkos_malloc<DeviceMemorySpace>(sizeof(PetscScalar)*v->map->n));
1008 }
1009 if (v->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"v->spptr not NULL");
1010 veckok = new Vec_Kokkos(v->map->n,vecseq->array,darray,darray);
1011 Kokkos::deep_copy(veckok->dual_v.view_device(),0.0);
1012 v->spptr = static_cast<void*>(veckok);
1013 v->offloadmask = PETSC_OFFLOAD_VECKOKKOS;
1014 PetscFunctionReturn(0);
1015 }
1016
VecCreate_SeqKokkos(Vec v)1017 PetscErrorCode VecCreate_SeqKokkos(Vec v)
1018 {
1019 PetscErrorCode ierr;
1020
1021 PetscFunctionBegin;
1022 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
1023 ierr = PetscLayoutSetUp(v->map);CHKERRQ(ierr);
1024 ierr = VecCreate_Seq(v);CHKERRQ(ierr); /* Build a sequential vector, allocate array */
1025 ierr = VecSet_Seq(v,0.0);CHKERRQ(ierr); /* Zero the host array */
1026 ierr = PetscObjectChangeTypeName((PetscObject)v,VECSEQKOKKOS);CHKERRQ(ierr);
1027 ierr = VecSetOps_SeqKokkos(v);CHKERRQ(ierr);
1028 ierr = BuildVecKokkosFromVecSeq_Private(v);CHKERRQ(ierr);
1029 PetscFunctionReturn(0);
1030 }
1031
1032 /*@C
1033 VecCreateSeqKokkosWithArray - Creates a Kokkos sequential array-style vector,
1034 where the user provides the array space to store the vector values. The array
1035 provided must be a device array.
1036
1037 Collective
1038
1039 Input Parameter:
1040 + comm - the communicator, should be PETSC_COMM_SELF
1041 . bs - the block size
1042 . n - the vector length
1043 - array - device memory where the vector elements are to be stored.
1044
1045 Output Parameter:
1046 . V - the vector
1047
1048 Notes:
1049 Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
1050 same type as an existing vector.
1051
1052 If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
1053 at a later stage to SET the array for storing the vector values.
1054
1055 PETSc does NOT free the array when the vector is destroyed via VecDestroy().
1056 The user should not free the array until the vector is destroyed.
1057
1058 Level: intermediate
1059
1060 .seealso: VecCreateMPICUDAWithArray(), VecCreate(), VecDuplicate(), VecDuplicateVecs(),
1061 VecCreateGhost(), VecCreateSeq(), VecCUDAPlaceArray(), VecCreateSeqWithArray(),
1062 VecCreateMPIWithArray()
1063 @*/
VecCreateSeqKokkosWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar darray[],Vec * v)1064 PetscErrorCode VecCreateSeqKokkosWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar darray[],Vec *v)
1065 {
1066 PetscErrorCode ierr;
1067 PetscMPIInt size;
1068 Vec w;
1069 Vec_Kokkos *veckok = NULL;
1070 PetscScalar *harray;
1071
1072 PetscFunctionBegin;
1073 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1074 if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQKOKKOS on more than one process");
1075
1076 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
1077 ierr = VecCreate(comm,&w);CHKERRQ(ierr);
1078 ierr = VecSetSizes(w,n,n);CHKERRQ(ierr);
1079 ierr = VecSetBlockSize(w,bs);CHKERRQ(ierr);
1080
1081 /* Given a device array, build the Vec_Seq struct */
1082 if (std::is_same<DeviceMemorySpace,HostMemorySpace>::value) {harray = const_cast<PetscScalar*>(darray);}
1083 else {ierr = PetscMalloc1(w->map->n,&harray);CHKERRQ(ierr);}
1084 ierr = VecCreate_Seq_Private(w,harray);CHKERRQ(ierr); /* Build a sequential vector with harray */
1085
1086 ierr = PetscObjectChangeTypeName((PetscObject)w,VECSEQKOKKOS);CHKERRQ(ierr); /* Change it to Kokkos */
1087 ierr = VecSetOps_SeqKokkos(w);CHKERRQ(ierr);
1088 veckok = new Vec_Kokkos(n,harray,const_cast<PetscScalar*>(darray),NULL);
1089 veckok->dual_v.modify_device(); /* Mark the device is modified */
1090 w->offloadmask = PETSC_OFFLOAD_VECKOKKOS;
1091 w->spptr = static_cast<void*>(veckok);
1092 *v = w;
1093 PetscFunctionReturn(0);
1094 }
1095
1096 /* TODO: ftn-auto generates veckok.kokkosf.c */
1097 /*@C
1098 VecCreateSeqKokkos - Creates a standard, sequential array-style vector.
1099
1100 Collective
1101
1102 Input Parameter:
1103 + comm - the communicator, should be PETSC_COMM_SELF
1104 - n - the vector length
1105
1106 Output Parameter:
1107 . v - the vector
1108
1109 Notes:
1110 Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
1111 same type as an existing vector.
1112
1113 Level: intermediate
1114
1115 .seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
1116 @*/
VecCreateSeqKokkos(MPI_Comm comm,PetscInt n,Vec * v)1117 PetscErrorCode VecCreateSeqKokkos(MPI_Comm comm,PetscInt n,Vec *v)
1118 {
1119 PetscErrorCode ierr;
1120
1121 PetscFunctionBegin;
1122 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
1123 ierr = VecCreate(comm,v);CHKERRQ(ierr);
1124 ierr = VecSetSizes(*v,n,n);CHKERRQ(ierr);
1125 ierr = VecSetType(*v,VECSEQKOKKOS);CHKERRQ(ierr); /* Calls VecCreate_SeqKokkos */
1126 PetscFunctionReturn(0);
1127 }
1128
1129 /* Duplicate layout etc but not the values in the input vector */
VecDuplicate_SeqKokkos(Vec win,Vec * v)1130 PetscErrorCode VecDuplicate_SeqKokkos(Vec win,Vec *v)
1131 {
1132 PetscErrorCode ierr;
1133
1134 PetscFunctionBegin;
1135 ierr = VecDuplicate_Seq(win,v);CHKERRQ(ierr); /* It also dups ops of win */
1136 PetscFunctionReturn(0);
1137 }
1138
VecDestroy_SeqKokkos(Vec v)1139 PetscErrorCode VecDestroy_SeqKokkos(Vec v)
1140 {
1141 PetscErrorCode ierr;
1142 Vec_Kokkos *veckok = static_cast<Vec_Kokkos*>(v->spptr);
1143 Vec_Seq *vecseq = static_cast<Vec_Seq*>(v->data);
1144
1145 PetscFunctionBegin;
1146 delete veckok;
1147 v->spptr = NULL;
1148 if (vecseq) {ierr = VecDestroy_Seq(v);CHKERRQ(ierr);}
1149 PetscFunctionReturn(0);
1150 }
1151