/* -----------------------------------------------------------------
 * Programmer(s): Slaven Peles @ LLNL
 * -----------------------------------------------------------------
 * Acknowledgements: This example is based on the idaHeat2D_kry_p
 *                   example by Daniel R. Reynolds @ SMU and
 *                   Allan Taylor, Alan Hindmarsh and
 *                   Radu Serban @ LLNL
 * -----------------------------------------------------------------
 * Example problem for IDA: 2D heat equation, parallel, GMRES.
 *
 * This example solves a discretized 2D heat equation problem.
 * This version uses the Krylov solver SUNSPGMR.
 *
 * The DAE system solved is a spatial discretization of the PDE
 *          du/dt = d^2u/dx^2 + d^2u/dy^2
 * on the unit square. The boundary condition is u = 0 on all edges.
 * Initial conditions are given by u = 16 x (1 - x) y (1 - y).
 * The PDE is treated with central differences on a uniform MX x MY
 * grid. The values of u at the interior points satisfy ODEs, and
 * equations u = 0 at the boundaries are appended, to form a DAE
 * system of size N = MX * MY. Here MX = MY = 10.
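 *
 * Written out with mesh spacings dx and dy, the residual component at
 * an interior point (i,j) is the standard 5-point stencil
 *    res_{i,j} = u'_{i,j} - [ (u_{i-1,j} - 2 u_{i,j} + u_{i+1,j})/dx^2
 *                           + (u_{i,j-1} - 2 u_{i,j} + u_{i,j+1})/dy^2 ],
 * while at a boundary point it is simply res_{i,j} = u_{i,j}.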
 *
 * The system is actually implemented on submeshes, processor by
 * processor, with an MXSUB by MYSUB mesh on each of NPEX * NPEY
 * processors.
 *
 * The system is solved with IDA using the Krylov linear solver
 * SUNSPGMR. The preconditioner uses the diagonal elements of the
 * Jacobian only. Routines for preconditioning, required by
 * SUNSPGMR, are supplied here. The constraints u >= 0 are posed
 * for all components. Local error testing on the boundary values
 * is suppressed. Output is taken at t = 0, .01, .02, .04,
 * ..., 10.24.
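 *
 * A typical run uses NPEX * NPEY = 4 MPI ranks; for example, assuming
 * the built executable is named idaHeat2D_kry_p_tpetra:
 *    mpiexec -n 4 ./idaHeat2D_kry_p_tpetra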
 * -----------------------------------------------------------------
 * When running this example for unit testing, set the environment
 * variables OMP_PROC_BIND=false and OMP_NUM_THREADS=2 to run
 * without thread binding and with two OpenMP threads.
 * -----------------------------------------------------------------
 */

#include <stdio.h>
#include <stdlib.h>
#include <cmath>
#include <string.h>
#include <vector>

#include <Tpetra_Core.hpp>
#include <Tpetra_Vector.hpp>
#include <Tpetra_Version.hpp>

#include <ida/ida.h>
#include <nvector/trilinos/SundialsTpetraVectorInterface.hpp>
#include <nvector/nvector_trilinos.h>
#include <sunlinsol/sunlinsol_spgmr.h>
#include <sundials/sundials_types.h>
#include <sundials/sundials_mpi_types.h>

#define ZERO  RCONST(0.0)
#define ONE   RCONST(1.0)
#define TWO   RCONST(2.0)

#define NOUT  11    /* Number of output times */

#define NPEX  2     /* No. PEs in x direction of PE array */
#define NPEY  2     /* No. PEs in y direction of PE array */
                    /* Total no. PEs = NPEX*NPEY */
#define MXSUB 5     /* No. x points per subgrid */
#define MYSUB 5     /* No. y points per subgrid */

/* Global spatial mesh is MX x MY = (NPEX x MXSUB) x (NPEY x MYSUB) */

/* type definitions */
typedef Sundials::TpetraVectorInterface::vector_type vector_type;
typedef vector_type::scalar_type scalar_type;
typedef vector_type::mag_type mag_type;
typedef vector_type::global_ordinal_type global_ordinal_type;
typedef vector_type::local_ordinal_type local_ordinal_type;
typedef vector_type::node_type::memory_space memory_space;
typedef vector_type::node_type::execution_space execution_space;
typedef vector_type::map_type map_type;

/* User data structure */
struct UserData {
  /* user data destructor */
  ~UserData()
  {
    if (pp != NULL)
      N_VDestroy(pp);
  }

  Teuchos::RCP<const Teuchos::Comm<int> > comm;
  int thispe, npex, npey, ixsub, jysub;
  sunindextype mx, my, mxsub, mysub;
  realtype dx, dy, coeffx, coeffy, coeffxy;
  Kokkos::View<scalar_type*, memory_space> uext; /* device array */

  Kokkos::View<scalar_type*, memory_space> send_buff_top;
  Kokkos::View<scalar_type*, memory_space> send_buff_bottom;
  Kokkos::View<scalar_type*, memory_space> send_buff_left;
  Kokkos::View<scalar_type*, memory_space> send_buff_right;
  Kokkos::View<scalar_type*, memory_space> recv_buff_top;
  Kokkos::View<scalar_type*, memory_space> recv_buff_bottom;
  Kokkos::View<scalar_type*, memory_space> recv_buff_left;
  Kokkos::View<scalar_type*, memory_space> recv_buff_right;

  typename Kokkos::View<scalar_type*, memory_space>::HostMirror h_send_buff_top;
  typename Kokkos::View<scalar_type*, memory_space>::HostMirror h_send_buff_bottom;
  typename Kokkos::View<scalar_type*, memory_space>::HostMirror h_send_buff_left;
  typename Kokkos::View<scalar_type*, memory_space>::HostMirror h_send_buff_right;
  typename Kokkos::View<scalar_type*, memory_space>::HostMirror h_recv_buff_top;
  typename Kokkos::View<scalar_type*, memory_space>::HostMirror h_recv_buff_bottom;
  typename Kokkos::View<scalar_type*, memory_space>::HostMirror h_recv_buff_left;
  typename Kokkos::View<scalar_type*, memory_space>::HostMirror h_recv_buff_right;

  N_Vector pp; /* vector of diagonal preconditioner elements */
};


/* User-supplied residual and supporting functions */

int resHeat(realtype tt, N_Vector uu, N_Vector up,
            N_Vector rr, void *user_data);

static int rescomm(N_Vector uu, N_Vector up, void *user_data);

static int reslocal(realtype tt, N_Vector uu, N_Vector up,
                    N_Vector res, void *user_data);

static int BSend(N_Vector uu, void *user_data);

static int BRecvPost(MPI_Request request[], void *user_data);

static int BRecvWait(MPI_Request request[], void *user_data);


/* User-supplied preconditioner functions */

int PsolveHeat(realtype tt, N_Vector uu, N_Vector up, N_Vector rr,
               N_Vector rvec, N_Vector zvec, realtype c_j,
               realtype delta, void *user_data);

int PsetupHeat(realtype tt, N_Vector yy, N_Vector yp, N_Vector rr,
               realtype c_j, void *user_data);

/* Private functions to allocate memory, initialize the problem and print output */

static int InitUserData(int thispe, const Teuchos::RCP<const Teuchos::Comm<int> > comm,
                        UserData *data);

static int AllocUserData(int thispe, N_Vector uu, UserData *data);

static int SetInitialProfile(N_Vector uu, N_Vector up, N_Vector id,
                             N_Vector res, UserData *data);

static void PrintHeader(realtype rtol, realtype atol, UserData *data);

static void PrintOutput(int id, void *ida_mem, realtype t, N_Vector uu);

static void PrintFinalStats(void *ida_mem);

static int check_retval(void *flagvalue, const char *funcname, int opt, int id);

/*
 *--------------------------------------------------------------------
 * MAIN PROGRAM
 *--------------------------------------------------------------------
 */

int main(int argc, char *argv[])
{
  void *ida_mem = NULL;
  SUNLinearSolver LS = NULL;
  int iout, retval;
  realtype rtol, atol, t0, t1, tout, tret;
  N_Vector uu = NULL;
  N_Vector up = NULL;
  N_Vector constraints = NULL;
  N_Vector id = NULL;
  N_Vector res = NULL;

  /* Start an MPI session */
  Tpetra::ScopeGuard tpetraScope(&argc, &argv);

  /* Create Tpetra communicator */
  auto comm = Tpetra::getDefaultComm();
  const int thispe = comm->getRank();
  const int npes = comm->getSize();

  /* Allocate and initialize the data structure */
  UserData *data = new UserData();
  if(check_retval((void *)data, "new UserData", 2, thispe))
    return -1;

  /* Initialize user data */
  InitUserData(thispe, comm, data);

  /* Set the global solution vector length. */
  const sunindextype global_length = data->mx * data->my;

  /* Choose zero-based (C-style) indexing. */
  const sunindextype index_base = 0;

  /* Construct an MPI map */
  Teuchos::RCP<const map_type> testMap =
    Teuchos::rcp(new map_type(global_length, index_base, comm,
                              Tpetra::GloballyDistributed));
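  /* Note: this constructor creates a contiguous, uniformly distributed map,
     so each of the NPEX*NPEY processes owns mxsub*mysub consecutive global
     indices. The code below only ever addresses the local view of the
     vector, interpreting it as an mxsub by mysub subgrid. */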

  /* Check if the number of MPI processes matches the number of subgrids */
  if (npes != (data->npex * data->npey)) {
    if (thispe == 0)
      fprintf(stderr,
              "\nMPI_ERROR(0): npes = %d is not equal to NPEX*NPEY = %d\n",
              npes, data->npex * data->npey);
    delete data;
    return 1;
  }

  /* Construct a Tpetra vector and return a reference-counting pointer to it. */
  Teuchos::RCP<vector_type> rcpuu =
    Teuchos::rcp(new vector_type(testMap));

  /* Allocate and initialize N-vectors. */

  uu = N_VMake_Trilinos(rcpuu);
  if(check_retval((void *)uu, "N_VMake_Trilinos", 0, thispe))
    return -1;

  up = N_VClone(uu);
  if(check_retval((void *)up, "N_VClone", 0, thispe))
    return -1;

  res = N_VClone(uu);
  if(check_retval((void *)res, "N_VClone", 0, thispe))
    return -1;

  constraints = N_VClone(uu);
  if(check_retval((void *)constraints, "N_VClone", 0, thispe))
    return -1;

  id = N_VClone(uu);
  if(check_retval((void *)id, "N_VClone", 0, thispe))
    return -1;

  /* Allocate user data extended vector and MPI buffers */
  retval = AllocUserData(thispe, uu, data);
  if(check_retval(&retval, "AllocUserData", 1, thispe)) return -1;

  /* Initialize the uu, up, id, and res profiles. */
  SetInitialProfile(uu, up, id, res, data);

  /* Set constraints to all 1's for nonnegative solution values. */
  N_VConst(ONE, constraints);

  t0 = ZERO; t1 = RCONST(0.01);

  /* Scalar relative and absolute tolerances. */
  rtol = ZERO;
  atol = RCONST(1.0e-3);
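  /* rtol = 0 selects pure absolute error control: each solution component
     is integrated to an absolute tolerance of atol. */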

  /* Call IDACreate and IDAInit to initialize the solution. */

  ida_mem = IDACreate();
  if(check_retval((void *)ida_mem, "IDACreate", 0, thispe))
    return -1;

  retval = IDASetUserData(ida_mem, data);
  if(check_retval(&retval, "IDASetUserData", 1, thispe))
    return -1;

  retval = IDASetSuppressAlg(ida_mem, SUNTRUE);
  if(check_retval(&retval, "IDASetSuppressAlg", 1, thispe))
    return -1;

  retval = IDASetId(ida_mem, id);
  if(check_retval(&retval, "IDASetId", 1, thispe))
    return -1;

  retval = IDASetConstraints(ida_mem, constraints);
  if(check_retval(&retval, "IDASetConstraints", 1, thispe))
    return -1;
  N_VDestroy(constraints);
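  /* IDA keeps an internal copy of the constraints vector, so it is safe
     to destroy the original immediately after IDASetConstraints. */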

  retval = IDAInit(ida_mem, resHeat, t0, uu, up);
  if(check_retval(&retval, "IDAInit", 1, thispe))
    return -1;

  retval = IDASStolerances(ida_mem, rtol, atol);
  if(check_retval(&retval, "IDASStolerances", 1, thispe))
    return -1;

  /* Call SUNLinSol_SPGMR and IDASetLinearSolver to specify the linear solver. */

  LS = SUNLinSol_SPGMR(uu, PREC_LEFT, 0); /* use default maxl */
  if(check_retval((void *)LS, "SUNLinSol_SPGMR", 0, thispe))
    return -1;

  retval = IDASetLinearSolver(ida_mem, LS, NULL);
  if(check_retval(&retval, "IDASetLinearSolver", 1, thispe))
    return -1;

  retval = IDASetPreconditioner(ida_mem, PsetupHeat, PsolveHeat);
  if(check_retval(&retval, "IDASetPreconditioner", 1, thispe))
    return -1;

  /* Print output heading (on processor 0 only) and initial solution */

  if (thispe == 0) PrintHeader(rtol, atol, data);
  PrintOutput(thispe, ida_mem, t0, uu);

  /* Loop over tout, call IDASolve, print output. */
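  /* Output times double each iteration, tout = 0.01 * 2^(iout-1), so the
     NOUT = 11 outputs cover t = 0.01 through t = 10.24. */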

  for (tout = t1, iout = 1; iout <= NOUT; iout++, tout *= TWO) {

    retval = IDASolve(ida_mem, tout, &tret, uu, up, IDA_NORMAL);
    if(check_retval(&retval, "IDASolve", 1, thispe))
      return -1;

    PrintOutput(thispe, ida_mem, tret, uu);

  }

  /* Print remaining counters. */

  if (thispe == 0) PrintFinalStats(ida_mem);

  /* Free memory */

  IDAFree(&ida_mem);
  SUNLinSolFree(LS);

  N_VDestroy(id);
  N_VDestroy(res);
  N_VDestroy(up);
  N_VDestroy(uu);

  /* Delete template Tpetra vector */
  rcpuu = Teuchos::null;

  delete data;

  return(0);
}

/*
 *--------------------------------------------------------------------
 * FUNCTIONS CALLED BY IDA
 *--------------------------------------------------------------------
 */

/*
 * resHeat: heat equation system residual function.
 * This uses 5-point central differencing on the interior points, and
 * includes algebraic equations for the boundary values.
 * So for each interior point, the residual component has the form
 *    res_i = u'_i - (central difference)_i
 * while for each boundary point, it is res_i = u_i.
 *
 * This parallel implementation uses several supporting routines.
 * First a call is made to rescomm to do communication of subgrid boundary
 * data into the array uext. Then reslocal is called to compute the residual
 * on individual processors and their corresponding domains. The routines
 * BSend, BRecvPost, and BRecvWait handle interprocessor communication
 * of uu required to calculate the residual.
 */
int resHeat(realtype tt, N_Vector uu, N_Vector up, N_Vector rr,
            void *user_data)
{
  int retval = 0;

  /* Call rescomm to do inter-processor communication. */
  retval = rescomm(uu, up, user_data);

  /* Call reslocal to calculate res. */
  retval = reslocal(tt, uu, up, rr, user_data);

  return(retval);
}

/*
 * PsetupHeat: setup for the diagonal preconditioner for the 2D heat
 * equation.
 *
 * The optional user-supplied functions PsetupHeat and
 * PsolveHeat together must define the left preconditioner
 * matrix P approximating the system Jacobian matrix
 *                   J = dF/du + cj*dF/du'
 * (where the DAE system is F(t,u,u') = 0), and solve the linear
 * systems P z = r. This is done in this case by keeping only
 * the diagonal elements of the J matrix above, storing them as
 * inverses in a vector pp, when computed in PsetupHeat, for
 * subsequent use in PsolveHeat.
 *
 * In this instance, only cj and data (the user data structure, with
 * pp etc.) are used from the PsetupHeat argument list.
 */
int PsetupHeat(realtype tt, N_Vector yy, N_Vector yp, N_Vector rr,
               realtype c_j, void *user_data)
{
  /* Unwrap the user data */
  UserData* data = reinterpret_cast<UserData*>(user_data);
  const int ixsub = data->ixsub;
  const int jysub = data->jysub;
  const int npex = data->npex;
  const int npey = data->npey;
  const sunindextype mxsub = data->mxsub;
  const sunindextype mysub = data->mysub;

  sunindextype ibc, i0, jbc, j0;

  /* Initially set all pp elements on the device to one. */
  N_VConst(ONE, data->pp);

  /* Get access to local data */
  Teuchos::RCP<vector_type> pptp = N_VGetVector_Trilinos(data->pp);
  auto pp_2d = pptp->getLocalView<memory_space>();
  auto pp_1d = Kokkos::subview(pp_2d, Kokkos::ALL(), 0);

  /* Calculate the value of the inverse diagonal preconditioner elements */
  const realtype pelinv = ONE/(c_j + data->coeffxy);

  ibc = (ixsub == 0) || (ixsub == npex-1);
  i0  = (ixsub == 0);
  jbc = (jysub == 0) || (jysub == npey-1);
  j0  = (jysub == 0);
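  /* ibc (jbc) is 1 if this subgrid touches a physical boundary in the x (y)
     direction, and i0 (j0) is 1 if it touches the left (bottom) edge. The
     kernel below then visits interior points only; boundary components keep
     the value one set above, which is the exact inverse diagonal for the
     algebraic boundary equations res_i = u_i. */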

  /* Set the inverse of the preconditioner diagonal; pp_1d must be on the device */
  Kokkos::parallel_for (Kokkos::RangePolicy<execution_space>(0, (mxsub - ibc)*(mysub - jbc)),
    KOKKOS_LAMBDA (const local_ordinal_type &tid)
    {
      sunindextype j = tid / (mxsub - ibc) + j0;
      sunindextype i = tid % (mxsub - ibc) + i0;

      pp_1d(i + j*mxsub) = pelinv;
    }
  );

  return(0);
}

/*
 * PsolveHeat: solve preconditioner linear system.
 * This routine multiplies the input vector rvec by the vector pp
 * containing the inverse diagonal Jacobian elements (previously
 * computed in PsetupHeat), returning the result in zvec.
 */
int PsolveHeat(realtype tt, N_Vector uu, N_Vector up,
               N_Vector rr, N_Vector rvec, N_Vector zvec,
               realtype c_j, realtype delta, void *user_data)
{
  UserData* data = reinterpret_cast<UserData*>(user_data);

  N_VProd(data->pp, rvec, zvec);

  return(0);
}

/*
 *--------------------------------------------------------------------
 * SUPPORTING FUNCTIONS
 *--------------------------------------------------------------------
 */


/*
 * rescomm routine. This routine performs all inter-processor
 * communication of data in u needed to calculate G.
 */
static int rescomm(N_Vector uu, N_Vector up, void* user_data)
{
  /* Set array of raw MPI requests */
  MPI_Request request[4];
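  /* Post all receives before any sends so the exchange cannot deadlock,
     even if MPI_Send blocks until a matching receive has been posted. */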

  /* Start receiving boundary data from neighboring PEs. */
  BRecvPost(request, user_data);

  /* Send data from boundary of local grid to neighboring PEs. */
  BSend(uu, user_data);

  /* Finish receiving boundary data from neighboring PEs. */
  BRecvWait(request, user_data);

  return(0);
}

/*
 * reslocal routine. Compute res = F(t, uu, up). This routine assumes
 * that all inter-processor communication of data needed to calculate F
 * has already been done, and that this data is in the work array uext.
 */
static int reslocal(realtype tt, N_Vector uu, N_Vector up, N_Vector rr,
                    void *user_data)
{
  UserData* data = reinterpret_cast<UserData*>(user_data);

  /* Get subgrid indices, array sizes, and grid coefficients. */
  const int ixsub = data->ixsub;
  const int jysub = data->jysub;
  const int npex = data->npex;
  const int npey = data->npey;
  const sunindextype mxsub  = data->mxsub;
  const sunindextype mxsub2 = data->mxsub + 2;
  const sunindextype mysub  = data->mysub;
  const realtype coeffx  = data->coeffx;
  const realtype coeffy  = data->coeffy;
  const realtype coeffxy = data->coeffxy;

  sunindextype ibc, i0, jbc, j0;

  /* Initialize all elements of rr to uu. This sets the boundary
     elements simply without indexing hassles. */
  N_VScale(ONE, uu, rr);

  /* Get vector data arrays and the extended work array uext. */
  Teuchos::RCP<vector_type> uuv = N_VGetVector_Trilinos(uu);
  Teuchos::RCP<vector_type> upv = N_VGetVector_Trilinos(up);
  Teuchos::RCP<vector_type> rrv = N_VGetVector_Trilinos(rr);

  const auto uu_2d = uuv->getLocalView<memory_space>();
  const auto uu_1d = Kokkos::subview(uu_2d, Kokkos::ALL(), 0);
  const auto up_2d = upv->getLocalView<memory_space>();
  const auto up_1d = Kokkos::subview(up_2d, Kokkos::ALL(), 0);
  auto rr_2d = rrv->getLocalView<memory_space>();
  auto rr_1d = Kokkos::subview(rr_2d, Kokkos::ALL(), 0);
  Kokkos::View<realtype*, memory_space> uext_1d = data->uext;
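  /* uext is laid out as an (mxsub+2) x (mysub+2) array: the interior holds
     this PE's subgrid and the one-cell frame around it holds ghost values
     filled in by rescomm. */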

  /* Copy the local segment of the u vector into the working extended array
     uext. This completes uext prior to the computation of the rr vector.
     uext_1d and uu_1d must be on the device. */
  Kokkos::parallel_for (Kokkos::RangePolicy<execution_space>(0, mxsub*mysub),
    KOKKOS_LAMBDA (const local_ordinal_type &tid)
    {
      sunindextype j = tid/mxsub;
      sunindextype i = tid%mxsub;

      uext_1d((i+1) + (j+1)*mxsub2) = uu_1d(i + j*mxsub);
    }
  );

  /* Set loop limits for the interior of the local subgrid. */
  ibc = (ixsub == 0) || (ixsub == npex-1);
  i0  = (ixsub == 0);
  jbc = (jysub == 0) || (jysub == npey-1);
  j0  = (jysub == 0);

  /* Compute the local residual; uext_1d, up_1d, and rr_1d must be on the device */
  Kokkos::parallel_for (Kokkos::RangePolicy<execution_space>(0, (mxsub - ibc)*(mysub - jbc)),
    KOKKOS_LAMBDA (const local_ordinal_type &tid)
    {
      sunindextype j = tid/(mxsub - ibc) + j0;
      sunindextype i = tid%(mxsub - ibc) + i0;
      sunindextype locu  = i + j*mxsub;
      sunindextype locue = (i+1) + (j+1)*mxsub2;

      realtype termx   = coeffx * (uext_1d(locue-1)      + uext_1d(locue+1));
      realtype termy   = coeffy * (uext_1d(locue-mxsub2) + uext_1d(locue+mxsub2));
      realtype termctr = coeffxy * uext_1d(locue);
      rr_1d(locu) = up_1d(locu) - (termx + termy - termctr);
    }
  );

  return(0);
}


/*
 * Routine to send boundary data to neighboring PEs.
 */
static int BSend(N_Vector uu, void *user_data)
{
  UserData* data = reinterpret_cast<UserData*>(user_data);

  /* Get comm, thispe, subgrid indices, data sizes */
  auto comm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(data->comm);
  auto rawComm = comm->getRawMpiComm();

  const int thispe = data->thispe;
  const int ixsub = data->ixsub;
  const int jysub = data->jysub;
  const int npex = data->npex;
  const int npey = data->npey;

  /* Type conversion from sunindextype to int (MPI message sizes are ints) */
  const int mxsub = data->mxsub;
  const int mysub = data->mysub;

  /* Get the send buffers and their host mirrors. */
  Kokkos::View<realtype*, memory_space> bufleft   = data->send_buff_left;
  Kokkos::View<realtype*, memory_space> bufright  = data->send_buff_right;
  Kokkos::View<realtype*, memory_space> buftop    = data->send_buff_top;
  Kokkos::View<realtype*, memory_space> bufbottom = data->send_buff_bottom;
  typename Kokkos::View<realtype*>::HostMirror h_bufleft   = data->h_send_buff_left;
  typename Kokkos::View<realtype*>::HostMirror h_bufright  = data->h_send_buff_right;
  typename Kokkos::View<realtype*>::HostMirror h_buftop    = data->h_send_buff_top;
  typename Kokkos::View<realtype*>::HostMirror h_bufbottom = data->h_send_buff_bottom;

  /* Get solution vector data. */
  Teuchos::RCP<vector_type> uuv = N_VGetVector_Trilinos(uu);
  const auto uu_2d = uuv->getLocalView<memory_space>();
  const auto uu_1d = Kokkos::subview(uu_2d, Kokkos::ALL(), 0);
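  /* Each boundary line is packed into a device buffer, copied to its host
     mirror, and sent with plain MPI_Send; staging through the host avoids
     assuming a GPU-aware MPI implementation. */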

  /* If jysub > 0, send data from the bottom x-line of u (via bufbottom). */
  if (jysub != 0) {
    /* Device kernel to copy from the solution array to the device buffer */
    Kokkos::parallel_for (Kokkos::RangePolicy<execution_space>(0, mxsub),
      KOKKOS_LAMBDA (const local_ordinal_type &lx)
      {
        bufbottom(lx) = uu_1d(lx);
      }
    );
    /* Copy the device buffer to the corresponding host buffer */
    Kokkos::deep_copy(h_bufbottom, bufbottom);
    /* MPI send buffer */
    MPI_Send(h_bufbottom.data(), mxsub, MPI_SUNREALTYPE, thispe-npex, 0, *rawComm);
  }

  /* If jysub < NPEY-1, send data from the top x-line of u (via buftop). */
  if (jysub != npey-1) {
    /* Device kernel to copy from the solution array to the device buffer */
    Kokkos::parallel_for (Kokkos::RangePolicy<execution_space>(0, mxsub),
      KOKKOS_LAMBDA (const local_ordinal_type &lx)
      {
        buftop(lx) = uu_1d((mysub-1)*mxsub + lx);
      }
    );
    /* Copy the device buffer to the host */
    Kokkos::deep_copy(h_buftop, buftop);
    /* MPI send buffer */
    MPI_Send(h_buftop.data(), mxsub, MPI_SUNREALTYPE, thispe+npex, 0, *rawComm);
  }

  /* If ixsub > 0, send data from the left y-line of u (via bufleft). */
  if (ixsub != 0) {
    /* Device kernel to copy from the solution array to the device buffer */
    Kokkos::parallel_for (Kokkos::RangePolicy<execution_space>(0, mysub),
      KOKKOS_LAMBDA (const local_ordinal_type &ly)
      {
        bufleft(ly) = uu_1d(ly*mxsub);
      }
    );
    /* Copy the device buffer to the host */
    Kokkos::deep_copy(h_bufleft, bufleft);
    /* MPI send buffer */
    MPI_Send(h_bufleft.data(), mysub, MPI_SUNREALTYPE, thispe-1, 0, *rawComm);
  }

  /* If ixsub < NPEX-1, send data from the right y-line of u (via bufright). */
  if (ixsub != npex-1) {
    /* Device kernel to copy from the solution array to the device buffer */
    Kokkos::parallel_for (Kokkos::RangePolicy<execution_space>(0, mysub),
      KOKKOS_LAMBDA (const local_ordinal_type &ly)
      {
        bufright(ly) = uu_1d(ly*mxsub + (mxsub-1));
      }
    );
    /* Copy the device buffer to the host */
    Kokkos::deep_copy(h_bufright, bufright);
    /* MPI send buffer */
    MPI_Send(h_bufright.data(), mysub, MPI_SUNREALTYPE, thispe+1, 0, *rawComm);
  }

  return(0);
}


/*
 * Routine to start receiving boundary data from neighboring PEs.
 * Notes:
 * 1) The receive buffers (MXSUB entries for each of the top and bottom
 *    lines, MYSUB for each of the left and right lines, i.e. up to
 *    2*(MXSUB+MYSUB) realtype entries in total) live in the user data
 *    structure, are used by both BRecvPost and BRecvWait, and must not
 *    be manipulated between the two calls.
 * 2) request should have 4 entries, and should be passed in
 *    both calls also.
 */
static int BRecvPost(MPI_Request request[], void *user_data)
{
  UserData* data = reinterpret_cast<UserData*>(user_data);

  /* Get comm, thispe, subgrid indices, data sizes */
  auto comm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(data->comm);
  auto rawComm = comm->getRawMpiComm();

  const int thispe = data->thispe;
  const int ixsub = data->ixsub;
  const int jysub = data->jysub;
  const int npex = data->npex;
  const int npey = data->npey;
  const sunindextype mxsub = data->mxsub;
  const sunindextype mysub = data->mysub;

  /* Get the left, right, top and bottom host receive buffers. */
  typename Kokkos::View<realtype*>::HostMirror h_bufleft   = data->h_recv_buff_left;
  typename Kokkos::View<realtype*>::HostMirror h_bufright  = data->h_recv_buff_right;
  typename Kokkos::View<realtype*>::HostMirror h_buftop    = data->h_recv_buff_top;
  typename Kokkos::View<realtype*>::HostMirror h_bufbottom = data->h_recv_buff_bottom;

  /* If jysub > 0, receive data for the bottom x-line of uext. */
  if (jysub != 0) {
    MPI_Irecv(h_bufbottom.data(), mxsub, MPI_SUNREALTYPE,
              thispe-npex, 0, *rawComm, &request[0]);
  }

  /* If jysub < NPEY-1, receive data for the top x-line of uext. */
  if (jysub != npey-1) {
    MPI_Irecv(h_buftop.data(), mxsub, MPI_SUNREALTYPE,
              thispe+npex, 0, *rawComm, &request[1]);
  }

  /* If ixsub > 0, receive data for the left y-line of uext (via bufleft). */
  if (ixsub != 0) {
    MPI_Irecv(h_bufleft.data(), mysub, MPI_SUNREALTYPE,
              thispe-1, 0, *rawComm, &request[2]);
  }

  /* If ixsub < NPEX-1, receive data for the right y-line of uext (via bufright). */
  if (ixsub != npex-1) {
    MPI_Irecv(h_bufright.data(), mysub, MPI_SUNREALTYPE,
              thispe+1, 0, *rawComm, &request[3]);
  }

  return(0);
}

/*
 * Routine to finish receiving boundary data from neighboring PEs.
 * Notes:
 * 1) The same receive buffers must be passed to both the BRecvPost and
 *    BRecvWait functions, and must not be manipulated between the two
 *    calls.
 * 2) request should have four entries, and should be passed in both
 *    calls also.
 */
static int BRecvWait(MPI_Request request[], void *user_data)
{
  MPI_Status status;
  UserData* data = reinterpret_cast<UserData*>(user_data);

  /* Get subgrid indices and data sizes */
  const int ixsub = data->ixsub;
  const int jysub = data->jysub;
  const int npex = data->npex;
  const int npey = data->npey;
  const sunindextype mxsub = data->mxsub;
  const sunindextype mysub = data->mysub;

  /* Get the receive buffers, their host mirrors, and the extended
     solution vector data array uext. */
  typename Kokkos::View<realtype*>::HostMirror h_bufleft   = data->h_recv_buff_left;
  typename Kokkos::View<realtype*>::HostMirror h_bufright  = data->h_recv_buff_right;
  typename Kokkos::View<realtype*>::HostMirror h_buftop    = data->h_recv_buff_top;
  typename Kokkos::View<realtype*>::HostMirror h_bufbottom = data->h_recv_buff_bottom;

  Kokkos::View<realtype*, memory_space> bufleft   = data->recv_buff_left;
  Kokkos::View<realtype*, memory_space> bufright  = data->recv_buff_right;
  Kokkos::View<realtype*, memory_space> buftop    = data->recv_buff_top;
  Kokkos::View<realtype*, memory_space> bufbottom = data->recv_buff_bottom;

  Kokkos::View<realtype*, memory_space> uext_1d = data->uext;

  const sunindextype mxsub2 = mxsub + 2;
  const sunindextype mysub1 = mysub + 1;
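  /* Each wait below is followed by a host-to-device copy and a kernel that
     unpacks the received line into the ghost frame of uext: row 0 (bottom),
     row mysub+1 (top), column 0 (left), or column mxsub+1 (right). */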

  /* If jysub > 0, receive data for the bottom x-line of uext. */
  if (jysub != 0) {
    MPI_Wait(&request[0], &status);
    /* Copy the buffer from the host to the device */
    Kokkos::deep_copy(bufbottom, h_bufbottom);
    /* Copy the bottom receive buffer into uext. */
    Kokkos::parallel_for (Kokkos::RangePolicy<execution_space>(0, mxsub),
      KOKKOS_LAMBDA (const local_ordinal_type &lx)
      {
        uext_1d(1 + lx) = bufbottom(lx);
      }
    );
  }

  /* If jysub < NPEY-1, receive data for the top x-line of uext. */
  if (jysub != npey-1) {
    MPI_Wait(&request[1], &status);
    /* Copy the buffer from the host to the device */
    Kokkos::deep_copy(buftop, h_buftop);
    /* Copy the top receive buffer into uext. */
    Kokkos::parallel_for (Kokkos::RangePolicy<execution_space>(0, mxsub),
      KOKKOS_LAMBDA (const local_ordinal_type &lx)
      {
        uext_1d((1 + mysub1*mxsub2) + lx) = buftop(lx);
      }
    );
  }

  /* If ixsub > 0, receive data for the left y-line of uext (via bufleft). */
  if (ixsub != 0) {
    MPI_Wait(&request[2], &status);
    /* Copy the buffer from the host to the device */
    Kokkos::deep_copy(bufleft, h_bufleft);
    /* Copy the left receive buffer into uext. */
    Kokkos::parallel_for (Kokkos::RangePolicy<execution_space>(0, mysub),
      KOKKOS_LAMBDA (const local_ordinal_type &ly)
      {
        uext_1d((ly + 1)*mxsub2) = bufleft(ly);
      }
    );
  }

  /* If ixsub < NPEX-1, receive data for the right y-line of uext (via bufright). */
  if (ixsub != npex-1) {
    MPI_Wait(&request[3], &status);
    /* Copy the buffer from the host to the device */
    Kokkos::deep_copy(bufright, h_bufright);
    /* Copy the right receive buffer into uext. */
    Kokkos::parallel_for (Kokkos::RangePolicy<execution_space>(0, mysub),
      KOKKOS_LAMBDA (const local_ordinal_type &ly)
      {
        uext_1d((ly + 2)*mxsub2 - 1) = bufright(ly);
      }
    );
  }

  return(0);
}

/*
 *--------------------------------------------------------------------
 * PRIVATE FUNCTIONS
 *--------------------------------------------------------------------
 */

/*
 * InitUserData initializes the user data block.
 */
static int InitUserData(int thispe, const Teuchos::RCP<const Teuchos::Comm<int> > comm,
                        UserData *data)
{
  data->comm = comm;
  data->thispe = thispe;
  data->npex  = NPEX;  /* Number of subgrids in x-direction */
  data->npey  = NPEY;  /* Number of subgrids in y-direction */
  data->mxsub = MXSUB; /* Number of subgrid mesh points in x-direction */
  data->mysub = MYSUB; /* Number of subgrid mesh points in y-direction */
  data->jysub = thispe/data->npex;
  data->ixsub = thispe - (data->jysub * data->npex);
  data->mx = data->npex * data->mxsub; /* Mesh size in x-direction */
  data->my = data->npey * data->mysub; /* Mesh size in y-direction */
  data->dx = ONE/(data->mx-ONE); /* Assumes a [0,1] interval in x. */
  data->dy = ONE/(data->my-ONE); /* Assumes a [0,1] interval in y. */
  data->coeffx  = ONE/(data->dx * data->dx);
  data->coeffy  = ONE/(data->dy * data->dy);
  data->coeffxy = TWO/(data->dx * data->dx) + TWO/(data->dy * data->dy);
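  /* coeffxy = 2/dx^2 + 2/dy^2 is the magnitude of the diagonal term of the
     5-point Laplacian stencil; it is used by both the residual (reslocal)
     and the preconditioner setup (PsetupHeat). */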

  return(0);
}

/*
 * AllocUserData allocates memory for the extended vector uext
 * and MPI communication buffers.
 */
static int AllocUserData(int thispe, N_Vector uu, UserData *data)
{
  sunindextype mxsub = data->mxsub;
  sunindextype mysub = data->mysub;

  /* Allocate local extended vector (includes ghost nodes) */
  Kokkos::resize(data->uext, (mxsub + 2)*(mysub + 2));

  /* Allocate local send buffers */
  Kokkos::resize(data->send_buff_left,   mysub);
  Kokkos::resize(data->send_buff_right,  mysub);
  Kokkos::resize(data->send_buff_top,    mxsub);
  Kokkos::resize(data->send_buff_bottom, mxsub);

  data->h_send_buff_left   = create_mirror_view(data->send_buff_left);
  data->h_send_buff_right  = create_mirror_view(data->send_buff_right);
  data->h_send_buff_top    = create_mirror_view(data->send_buff_top);
  data->h_send_buff_bottom = create_mirror_view(data->send_buff_bottom);

  /* Allocate local receive buffers */
  Kokkos::resize(data->recv_buff_left,   mysub);
  Kokkos::resize(data->recv_buff_right,  mysub);
  Kokkos::resize(data->recv_buff_top,    mxsub);
  Kokkos::resize(data->recv_buff_bottom, mxsub);

  data->h_recv_buff_left   = create_mirror_view(data->recv_buff_left);
  data->h_recv_buff_right  = create_mirror_view(data->recv_buff_right);
  data->h_recv_buff_top    = create_mirror_view(data->recv_buff_top);
  data->h_recv_buff_bottom = create_mirror_view(data->recv_buff_bottom);

  /* An N-vector to hold the preconditioner diagonal. */
  data->pp = N_VClone(uu);
  if(data->pp == NULL) {
    return -1;
  }

  return 0;
}

/*
 * SetInitialProfile sets the initial values for the problem.
 */
static int SetInitialProfile(N_Vector uu, N_Vector up, N_Vector id,
                             N_Vector res, UserData *data)
{
  sunindextype i, iloc, j, jloc, loc;
  realtype xfact, yfact;

  /* Initialize uu. */

  /* Get Tpetra vectors */
  Teuchos::RCP<vector_type> tu = N_VGetVector_Trilinos(uu);
  Teuchos::RCP<vector_type> ti = N_VGetVector_Trilinos(id);

  /* Get access to local uu and id data; sync the host with the device */
  tu->sync<Kokkos::HostSpace>();
  auto u_2d = tu->getLocalView<Kokkos::HostSpace>();
  auto u_1d = Kokkos::subview(u_2d, Kokkos::ALL(), 0);
  tu->modify<Kokkos::HostSpace>();

  ti->sync<Kokkos::HostSpace>();
  auto id_2d = ti->getLocalView<Kokkos::HostSpace>();
  auto id_1d = Kokkos::subview(id_2d, Kokkos::ALL(), 0);
  ti->modify<Kokkos::HostSpace>();

  /* Set mesh spacings and subgrid indices for this PE. */
  const realtype dx = data->dx;
  const realtype dy = data->dy;
  const int ixsub = data->ixsub;
  const int jysub = data->jysub;

  /* Set beginning and ending locations in the global array corresponding
     to the portion of that array assigned to this processor. */
  const sunindextype mxsub   = data->mxsub;
  const sunindextype mysub   = data->mysub;
  const sunindextype ixbegin = mxsub*ixsub;
  const sunindextype ixend   = mxsub*(ixsub+1) - 1;
  const sunindextype jybegin = mysub*jysub;
  const sunindextype jyend   = mysub*(jysub+1) - 1;

  /* Loop over the local array, computing the initial profile value.
     The global indices are (i,j) and the local indices are (iloc,jloc).
     Also set the id vector to zero for boundary points, one otherwise. */

  for (j = jybegin, jloc = 0; j <= jyend; j++, jloc++) {
    yfact = dy*j;
    for (i = ixbegin, iloc = 0; i <= ixend; i++, iloc++) {
      xfact = dx*i;
      loc = iloc + jloc*mxsub;
      u_1d(loc) = RCONST(16.0) * xfact * (ONE - xfact) * yfact * (ONE - yfact);

      if (i == 0 || i == data->mx - 1 || j == 0 || j == data->my - 1)
        id_1d(loc) = ZERO;
      else
        id_1d(loc) = ONE;
    }
  }

  /* Sync the device with the host */
  tu->sync<memory_space>();
  ti->sync<memory_space>();

  /* Initialize up. */

  N_VConst(ZERO, up); /* Initially set up = 0. */

  /* resHeat sets res to the negative of the ODE RHS values at interior points. */
  resHeat(ZERO, uu, up, res, data);

  /* Copy -res into up to get the correct initial up values (on the device). */
  N_VScale(-ONE, res, up);

  return(0);
}


/*
 * Print first lines of output and table heading
 */
static void PrintHeader(realtype rtol, realtype atol, UserData *data)
{
  printf("\nidaHeat2D_kry_p: Heat equation, parallel example problem for IDA\n");
  printf("                 Discretized heat equation on 2D unit square.\n");
  printf("                 Zero boundary conditions,");
  printf(" polynomial initial conditions.\n");
  printf("                 Mesh dimensions: %d x %d", (int) data->mx, (int) data->my);
  printf("       Total system size: %ld\n\n", (long) data->mx * data->my);
  printf("Subgrid dimensions: %d x %d", (int) data->mxsub, (int) data->mysub);
  printf("        Processor array: %d x %d\n", (int) data->npex, (int) data->npey);
#if defined(SUNDIALS_EXTENDED_PRECISION)
  printf("Tolerance parameters:  rtol = %Lg   atol = %Lg\n", rtol, atol);
#elif defined(SUNDIALS_DOUBLE_PRECISION)
  printf("Tolerance parameters:  rtol = %g   atol = %g\n", rtol, atol);
#else
  printf("Tolerance parameters:  rtol = %g   atol = %g\n", rtol, atol);
#endif
  printf("Constraints set to force all solution components >= 0. \n");
  printf("SUPPRESSALG = SUNTRUE to suppress local error testing on ");
  printf("all boundary components. \n");
  printf("Linear solver: SUNSPGMR  ");
  printf("Preconditioner: diagonal elements only.\n");

  /* Print output table heading and initial line of table. */
  printf("\n   Output Summary (umax = max-norm of solution) \n\n");
  printf("  time     umax       k  nst  nni  nli   nre   nreLS    h      npe nps\n");
  printf("----------------------------------------------------------------------\n");
}

/*
 * PrintOutput: print max norm of solution and current solver statistics
 */
static void PrintOutput(int id, void *ida_mem, realtype t, N_Vector uu)
{
  realtype hused, umax;
  long int nst, nni, nje, nre, nreLS, nli, npe, nps;
  int kused, retval;

  umax = N_VMaxNorm(uu);

  if (id == 0) {

    retval = IDAGetLastOrder(ida_mem, &kused);
    check_retval(&retval, "IDAGetLastOrder", 1, id);
    retval = IDAGetNumSteps(ida_mem, &nst);
    check_retval(&retval, "IDAGetNumSteps", 1, id);
    retval = IDAGetNumNonlinSolvIters(ida_mem, &nni);
    check_retval(&retval, "IDAGetNumNonlinSolvIters", 1, id);
    retval = IDAGetNumResEvals(ida_mem, &nre);
    check_retval(&retval, "IDAGetNumResEvals", 1, id);
    retval = IDAGetLastStep(ida_mem, &hused);
    check_retval(&retval, "IDAGetLastStep", 1, id);
    retval = IDAGetNumJtimesEvals(ida_mem, &nje);
    check_retval(&retval, "IDAGetNumJtimesEvals", 1, id);
    retval = IDAGetNumLinIters(ida_mem, &nli);
    check_retval(&retval, "IDAGetNumLinIters", 1, id);
    retval = IDAGetNumLinResEvals(ida_mem, &nreLS);
    check_retval(&retval, "IDAGetNumLinResEvals", 1, id);
    retval = IDAGetNumPrecEvals(ida_mem, &npe);
    check_retval(&retval, "IDAGetNumPrecEvals", 1, id);
    retval = IDAGetNumPrecSolves(ida_mem, &nps);
    check_retval(&retval, "IDAGetNumPrecSolves", 1, id);

#if defined(SUNDIALS_EXTENDED_PRECISION)
    printf(" %5.2Lf %13.5Le  %d  %3ld  %3ld  %3ld  %4ld  %4ld  %9.2Le  %3ld %3ld\n",
           t, umax, kused, nst, nni, nje, nre, nreLS, hused, npe, nps);
#elif defined(SUNDIALS_DOUBLE_PRECISION)
    printf(" %5.2f %13.5e  %d  %3ld  %3ld  %3ld  %4ld  %4ld  %9.2e  %3ld %3ld\n",
           t, umax, kused, nst, nni, nje, nre, nreLS, hused, npe, nps);
#else
    printf(" %5.2f %13.5e  %d  %3ld  %3ld  %3ld  %4ld  %4ld  %9.2e  %3ld %3ld\n",
           t, umax, kused, nst, nni, nje, nre, nreLS, hused, npe, nps);
#endif

  }
}


/*
 * Print some final integrator statistics
 */
static void PrintFinalStats(void *ida_mem)
{
  long int netf, ncfn, ncfl;

  IDAGetNumErrTestFails(ida_mem, &netf);
  IDAGetNumNonlinSolvConvFails(ida_mem, &ncfn);
  IDAGetNumLinConvFails(ida_mem, &ncfl);

  printf("\nError test failures            = %ld\n", netf);
  printf("Nonlinear convergence failures = %ld\n", ncfn);
  printf("Linear convergence failures    = %ld\n", ncfl);
}

/*
 * Check function return value...
 *   opt == 0 means SUNDIALS function allocates memory so check if
 *            returned NULL pointer
 *   opt == 1 means SUNDIALS function returns a flag so check if
 *            flag >= 0
 *   opt == 2 means function allocates memory so check if returned
 *            NULL pointer
 */
static int check_retval(void *flagvalue, const char *funcname, int opt, int id)
{
  int *errflag;

  if (opt == 0 && flagvalue == NULL) {
    /* Check if SUNDIALS function returned NULL pointer - no memory allocated */
    fprintf(stderr,
            "\nSUNDIALS_ERROR(%d): %s() failed - returned NULL pointer\n\n",
            id, funcname);
    return(1);
  } else if (opt == 1) {
    /* Check if flag < 0 */
    errflag = (int *) flagvalue;
    if (*errflag < 0) {
      fprintf(stderr,
              "\nSUNDIALS_ERROR(%d): %s() failed with flag = %d\n\n",
              id, funcname, *errflag);
      return(1);
    }
  } else if (opt == 2 && flagvalue == NULL) {
    /* Check if function returned NULL pointer - no memory allocated */
    fprintf(stderr,
            "\nMEMORY_ERROR(%d): %s() failed - returned NULL pointer\n\n",
            id, funcname);
    return(1);
  }

  return(0);
}