/* ---------------------------------------------------------------------
 *
 * Copyright (C) 2019 - 2020 by the deal.II authors
 *
 * This file is part of the deal.II library.
 *
 * The deal.II library is free software; you can use it, redistribute
 * it, and/or modify it under the terms of the GNU Lesser General
 * Public License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * The full text of the license can be found in the file LICENSE.md at
 * the top level directory of deal.II.
 *
 * ---------------------------------------------------------------------
 *
 * Authors: Bruno Turcksin, Daniel Arndt, Oak Ridge National Laboratory, 2019
 */

// First include the necessary files from the deal.II library known from the
// previous tutorials.
#include <deal.II/base/conditional_ostream.h>
#include <deal.II/base/quadrature_lib.h>

#include <deal.II/dofs/dof_tools.h>

#include <deal.II/fe/fe_q.h>

#include <deal.II/grid/grid_generator.h>
#include <deal.II/grid/tria.h>

#include <deal.II/lac/affine_constraints.h>
#include <deal.II/lac/la_parallel_vector.h>
#include <deal.II/lac/precondition.h>
#include <deal.II/lac/solver_cg.h>

#include <deal.II/numerics/data_out.h>
#include <deal.II/numerics/vector_tools.h>

// The following ones include the data structures for the
// implementation of matrix-free methods on GPU:
#include <deal.II/base/cuda.h>

#include <deal.II/matrix_free/cuda_fe_evaluation.h>
#include <deal.II/matrix_free/cuda_matrix_free.h>
#include <deal.II/matrix_free/operators.h>

#include <fstream>


// As usual, we enclose everything into a namespace of its own:
namespace Step64
{
  using namespace dealii;


  // @sect3{Class <code>VaryingCoefficientFunctor</code>}

  // Next, we define a class that implements the varying coefficients
  // we want to use in the Helmholtz operator. Later, we want to pass
  // an object of this type to a CUDAWrappers::MatrixFree
  // object that expects the class to have an `operator()` that fills the
  // values provided in the constructor for a given cell. This operator
  // needs to run on the device, so it needs to be marked as `__device__`
  // for the compiler.
  template <int dim, int fe_degree>
  class VaryingCoefficientFunctor
  {
  public:
    VaryingCoefficientFunctor(double *coefficient)
      : coef(coefficient)
    {}

    __device__ void operator()(
      const unsigned int                                          cell,
      const typename CUDAWrappers::MatrixFree<dim, double>::Data *gpu_data);

    // Since CUDAWrappers::MatrixFree::Data doesn't know about the size of its
    // arrays, we need to store the number of quadrature points and the numbers
    // of degrees of freedom in this class to do necessary index conversions.
    static const unsigned int n_dofs_1d = fe_degree + 1;
    static const unsigned int n_local_dofs =
      dealii::Utilities::pow(n_dofs_1d, dim);
    static const unsigned int n_q_points =
      dealii::Utilities::pow(n_dofs_1d, dim);

  private:
    double *coef;
  };



  // The following function implements this coefficient. Recall from
  // the introduction that we have defined it as $a(\mathbf
  // x)=\frac{10}{0.05 + 2\|\mathbf x\|^2}$
  template <int dim, int fe_degree>
  __device__ void VaryingCoefficientFunctor<dim, fe_degree>::operator()(
    const unsigned int                                          cell,
    const typename CUDAWrappers::MatrixFree<dim, double>::Data *gpu_data)
  {
    const unsigned int pos = CUDAWrappers::local_q_point_id<dim, double>(
      cell, gpu_data, n_dofs_1d, n_q_points);
    const Point<dim> q_point =
      CUDAWrappers::get_quadrature_point<dim, double>(cell,
                                                      gpu_data,
                                                      n_dofs_1d);

    double p_square = 0.;
    for (unsigned int i = 0; i < dim; ++i)
      {
        const double coord = q_point[i];
        p_square += coord * coord;
      }
    coef[pos] = 10. / (0.05 + 2. * p_square);
  }


  // @sect3{Class <code>HelmholtzOperatorQuad</code>}

  // The class `HelmholtzOperatorQuad` implements the evaluation of
  // the Helmholtz operator at each quadrature point. It uses a
  // similar mechanism as the MatrixFree framework introduced in
  // step-37. In contrast to there, the actual quadrature point
  // index is treated implicitly by converting the current thread
  // index. As before, the functions of this class need to run on
  // the device, so need to be marked as `__device__` for the
  // compiler.
  template <int dim, int fe_degree>
  class HelmholtzOperatorQuad
  {
  public:
    __device__ HelmholtzOperatorQuad(double coef)
      : coef(coef)
    {}

    __device__ void
    operator()(CUDAWrappers::FEEvaluation<dim, fe_degree> *fe_eval) const;

  private:
    double coef;
  };


  // The Helmholtz problem we want to solve here reads in weak form as follows:
  // @f{eqnarray*}
  //   (\nabla v, \nabla u)+ (v, a(\mathbf x) u) &=&(v,1) \quad \forall v.
  // @f}
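  // For reference: together with the homogeneous Dirichlet boundary values we
  // impose in `setup_system()` below, this is the weak form of the strong
  // problem discussed in the introduction,
  // @f{eqnarray*}
  //   -\Delta u + a(\mathbf x) u &=& 1 \quad \text{in } \Omega,
  //   \\
  //   u &=& 0 \quad \text{on } \partial\Omega.
  // @f}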
  // If you have seen step-37, then it will be obvious that
  // the two terms on the left-hand side correspond to the two function calls
  // here:
  template <int dim, int fe_degree>
  __device__ void HelmholtzOperatorQuad<dim, fe_degree>::
                  operator()(CUDAWrappers::FEEvaluation<dim, fe_degree> *fe_eval) const
  {
    fe_eval->submit_value(coef * fe_eval->get_value());
    fe_eval->submit_gradient(fe_eval->get_gradient());
  }


  // @sect3{Class <code>LocalHelmholtzOperator</code>}

  // Finally, we need to define a class that implements the whole operator
  // evaluation that corresponds to a matrix-vector product in matrix-based
  // approaches.
  template <int dim, int fe_degree>
  class LocalHelmholtzOperator
  {
  public:
    LocalHelmholtzOperator(double *coefficient)
      : coef(coefficient)
    {}

    __device__ void operator()(
      const unsigned int                                          cell,
      const typename CUDAWrappers::MatrixFree<dim, double>::Data *gpu_data,
      CUDAWrappers::SharedData<dim, double> *                     shared_data,
      const double *                                              src,
      double *                                                    dst) const;

    // Again, the CUDAWrappers::MatrixFree object doesn't know about the number
    // of degrees of freedom and the number of quadrature points so we need
    // to store these for index calculations in the call operator.
    static const unsigned int n_dofs_1d    = fe_degree + 1;
    static const unsigned int n_local_dofs = Utilities::pow(fe_degree + 1, dim);
    static const unsigned int n_q_points   = Utilities::pow(fe_degree + 1, dim);

  private:
    double *coef;
  };


  // This is the call operator that performs the Helmholtz operator evaluation
  // on a given cell similar to the MatrixFree framework on the CPU.
  // In particular, we need access to both values and gradients of the source
  // vector and we write value and gradient information to the destination
  // vector.
  template <int dim, int fe_degree>
  __device__ void LocalHelmholtzOperator<dim, fe_degree>::operator()(
    const unsigned int                                          cell,
    const typename CUDAWrappers::MatrixFree<dim, double>::Data *gpu_data,
    CUDAWrappers::SharedData<dim, double> *                     shared_data,
    const double *                                              src,
    double *                                                    dst) const
  {
    const unsigned int pos = CUDAWrappers::local_q_point_id<dim, double>(
      cell, gpu_data, n_dofs_1d, n_q_points);

    CUDAWrappers::FEEvaluation<dim, fe_degree, fe_degree + 1, 1, double>
      fe_eval(cell, gpu_data, shared_data);
    fe_eval.read_dof_values(src);
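    // We evaluate both values and gradients of the source vector at the
    // quadrature points (hence the two `true` arguments to evaluate() below,
    // and likewise for integrate() further down), since the Helmholtz
    // operator needs both.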
    fe_eval.evaluate(true, true);
    fe_eval.apply_for_each_quad_point(
      HelmholtzOperatorQuad<dim, fe_degree>(coef[pos]));
    fe_eval.integrate(true, true);
    fe_eval.distribute_local_to_global(dst);
  }


  // @sect3{Class <code>HelmholtzOperator</code>}

  // The `HelmholtzOperator` class acts as a wrapper for
  // `LocalHelmholtzOperator`, defining an interface that can be used
  // with linear solvers like SolverCG. In particular, like every
  // class that implements the interface of a linear operator, it
  // needs to have a `vmult()` function that performs the action of
  // the linear operator on a source vector.
  template <int dim, int fe_degree>
  class HelmholtzOperator
  {
  public:
    HelmholtzOperator(const DoFHandler<dim> &          dof_handler,
                      const AffineConstraints<double> &constraints);

    void
    vmult(LinearAlgebra::distributed::Vector<double, MemorySpace::CUDA> &dst,
          const LinearAlgebra::distributed::Vector<double, MemorySpace::CUDA>
            &src) const;

    void initialize_dof_vector(
      LinearAlgebra::distributed::Vector<double, MemorySpace::CUDA> &vec) const;

  private:
    CUDAWrappers::MatrixFree<dim, double>       mf_data;
    LinearAlgebra::CUDAWrappers::Vector<double> coef;
  };



  // The following is the implementation of the constructor of this
  // class. In the first part, we initialize the `mf_data` member
  // variable that is going to provide us with the necessary
  // information when evaluating the operator.
  //
  // In the second half, we need to store the value of the coefficient
  // for each quadrature point in every active, locally owned cell.
  // We can ask the parallel triangulation for the number of active, locally
  // owned cells but only have a DoFHandler object at hand. Since
  // DoFHandler::get_triangulation() returns a Triangulation object, not a
  // parallel::TriangulationBase object, we have to downcast the return value.
  // This is safe to do here because we know that the triangulation is in fact
  // a parallel::distributed::Triangulation object.
  template <int dim, int fe_degree>
  HelmholtzOperator<dim, fe_degree>::HelmholtzOperator(
    const DoFHandler<dim> &          dof_handler,
    const AffineConstraints<double> &constraints)
  {
    MappingQGeneric<dim> mapping(fe_degree);
    typename CUDAWrappers::MatrixFree<dim, double>::AdditionalData
      additional_data;
    additional_data.mapping_update_flags = update_values | update_gradients |
                                           update_JxW_values |
                                           update_quadrature_points;
    const QGauss<1> quad(fe_degree + 1);
    mf_data.reinit(mapping, dof_handler, constraints, quad, additional_data);


    const unsigned int n_owned_cells =
      dynamic_cast<const parallel::TriangulationBase<dim> *>(
        &dof_handler.get_triangulation())
        ->n_locally_owned_active_cells();
    coef.reinit(Utilities::pow(fe_degree + 1, dim) * n_owned_cells);

    const VaryingCoefficientFunctor<dim, fe_degree> functor(coef.get_values());
    mf_data.evaluate_coefficients(functor);
  }


  // The key step then is to use all of the previous classes to loop over
  // all cells to perform the matrix-vector product. We implement this
  // in the next function.
  //
  // When applying the Helmholtz operator, we have to be careful to handle
  // boundary conditions correctly. Since the local operator doesn't know about
  // constraints, we have to copy the correct values from the source to the
  // destination vector afterwards.
  template <int dim, int fe_degree>
  void HelmholtzOperator<dim, fe_degree>::vmult(
    LinearAlgebra::distributed::Vector<double, MemorySpace::CUDA> &      dst,
    const LinearAlgebra::distributed::Vector<double, MemorySpace::CUDA> &src)
    const
  {
    dst = 0.;
    LocalHelmholtzOperator<dim, fe_degree> helmholtz_operator(
      coef.get_values());
    mf_data.cell_loop(helmholtz_operator, src, dst);
    mf_data.copy_constrained_values(src, dst);
  }



  template <int dim, int fe_degree>
  void HelmholtzOperator<dim, fe_degree>::initialize_dof_vector(
    LinearAlgebra::distributed::Vector<double, MemorySpace::CUDA> &vec) const
  {
    mf_data.initialize_dof_vector(vec);
  }


  // @sect3{Class <code>HelmholtzProblem</code>}

  // This is the main class of this program. It defines the usual
  // framework we use for tutorial programs. The only point worth
  // commenting on is the `solve()` function and the choice of vector
  // types.
  template <int dim, int fe_degree>
  class HelmholtzProblem
  {
  public:
    HelmholtzProblem();

    void run();

  private:
    void setup_system();

    void assemble_rhs();

    void solve();

    void output_results(const unsigned int cycle) const;

    MPI_Comm mpi_communicator;

    parallel::distributed::Triangulation<dim> triangulation;

    FE_Q<dim>       fe;
    DoFHandler<dim> dof_handler;

    IndexSet locally_owned_dofs;
    IndexSet locally_relevant_dofs;

    AffineConstraints<double>                          constraints;
    std::unique_ptr<HelmholtzOperator<dim, fe_degree>> system_matrix_dev;

    // Since all the operations in the `solve()` function are executed on the
    // graphics card, the vectors used in it need to store their values on the
    // GPU as well. LinearAlgebra::distributed::Vector can be told which
    // memory space to use. There is also LinearAlgebra::CUDAWrappers::Vector
    // that always uses GPU memory storage but doesn't work with MPI. It might
    // be worth noting that the communication between different MPI processes
    // can be improved if the MPI implementation is CUDA-aware and the configure
    // flag `DEAL_II_MPI_WITH_CUDA_SUPPORT` is enabled. (The value of this
    // flag needs to be set at the time you call `cmake` when installing
    // deal.II.)
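    //
    // (As an illustration only -- the exact invocation depends on your system
    // and installation paths -- such a configuration could look like
    // `cmake -DDEAL_II_WITH_CUDA=ON -DDEAL_II_MPI_WITH_CUDA_SUPPORT=ON <source dir>`.)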
    //
    // In addition, we also keep a solution vector with CPU storage such that we
    // can view and display the solution as usual.
    LinearAlgebra::distributed::Vector<double, MemorySpace::Host>
                                                                  ghost_solution_host;
    LinearAlgebra::distributed::Vector<double, MemorySpace::CUDA> solution_dev;
    LinearAlgebra::distributed::Vector<double, MemorySpace::CUDA>
      system_rhs_dev;

    ConditionalOStream pcout;
  };


  // The implementation of all the remaining functions of this class apart from
  // `HelmholtzProblem::solve()` doesn't contain anything new and we won't
  // comment much further on the overall approach.
  template <int dim, int fe_degree>
  HelmholtzProblem<dim, fe_degree>::HelmholtzProblem()
    : mpi_communicator(MPI_COMM_WORLD)
    , triangulation(mpi_communicator)
    , fe(fe_degree)
    , dof_handler(triangulation)
    , pcout(std::cout, Utilities::MPI::this_mpi_process(mpi_communicator) == 0)
  {}



  template <int dim, int fe_degree>
  void HelmholtzProblem<dim, fe_degree>::setup_system()
  {
    dof_handler.distribute_dofs(fe);

    locally_owned_dofs = dof_handler.locally_owned_dofs();
    DoFTools::extract_locally_relevant_dofs(dof_handler, locally_relevant_dofs);
    system_rhs_dev.reinit(locally_owned_dofs, mpi_communicator);

    constraints.clear();
    constraints.reinit(locally_relevant_dofs);
    DoFTools::make_hanging_node_constraints(dof_handler, constraints);
    VectorTools::interpolate_boundary_values(dof_handler,
                                             0,
                                             Functions::ZeroFunction<dim>(),
                                             constraints);
    constraints.close();

    system_matrix_dev.reset(
      new HelmholtzOperator<dim, fe_degree>(dof_handler, constraints));

    ghost_solution_host.reinit(locally_owned_dofs,
                               locally_relevant_dofs,
                               mpi_communicator);
    system_matrix_dev->initialize_dof_vector(solution_dev);
    system_rhs_dev.reinit(solution_dev);
  }



  // Unlike programs such as step-4 or step-6, we will not have to
  // assemble the whole linear system but only the right hand side
  // vector. This looks in essence like we did in step-4, for example,
  // but we have to pay attention to using the right constraints
  // object when copying local contributions into the global
  // vector. In particular, we need to make sure the entries that
  // correspond to boundary nodes are properly zeroed out. This is
  // necessary for CG to converge. (Another solution would be to
  // modify the `vmult()` function above in such a way that we pretend
  // the source vector has zero entries by just not taking them into
  // account in matrix-vector products. But the approach used here is
  // simpler.)
  //
  // At the end of the function, we can't directly copy the values
  // from the host to the device but need to use an intermediate
  // object of type LinearAlgebra::ReadWriteVector to construct the
  // correct communication pattern necessary.
  template <int dim, int fe_degree>
  void HelmholtzProblem<dim, fe_degree>::assemble_rhs()
  {
    LinearAlgebra::distributed::Vector<double, MemorySpace::Host>
                      system_rhs_host(locally_owned_dofs,
                                      locally_relevant_dofs,
                                      mpi_communicator);
    const QGauss<dim> quadrature_formula(fe_degree + 1);

    FEValues<dim> fe_values(fe,
                            quadrature_formula,
                            update_values | update_quadrature_points |
                              update_JxW_values);

    const unsigned int dofs_per_cell = fe.n_dofs_per_cell();
    const unsigned int n_q_points    = quadrature_formula.size();

    Vector<double> cell_rhs(dofs_per_cell);

    std::vector<types::global_dof_index> local_dof_indices(dofs_per_cell);

    for (const auto &cell : dof_handler.active_cell_iterators())
      if (cell->is_locally_owned())
        {
          cell_rhs = 0;

          fe_values.reinit(cell);

          for (unsigned int q_index = 0; q_index < n_q_points; ++q_index)
            {
              for (unsigned int i = 0; i < dofs_per_cell; ++i)
                cell_rhs(i) += (fe_values.shape_value(i, q_index) * 1.0 *
                                fe_values.JxW(q_index));
            }

          cell->get_dof_indices(local_dof_indices);
          constraints.distribute_local_to_global(cell_rhs,
                                                 local_dof_indices,
                                                 system_rhs_host);
        }
    system_rhs_host.compress(VectorOperation::add);

    LinearAlgebra::ReadWriteVector<double> rw_vector(locally_owned_dofs);
    rw_vector.import(system_rhs_host, VectorOperation::insert);
    system_rhs_dev.import(rw_vector, VectorOperation::insert);
  }



  // This solve() function finally contains the calls to the new classes
  // previously discussed. Here we don't use any preconditioner, i.e.,
  // precondition by the identity matrix, to focus just on the peculiarities of
  // the CUDAWrappers::MatrixFree framework. Of course, in a real application
  // the choice of a suitable preconditioner is crucial but we have at least the
  // same restrictions as in step-37 since matrix entries are computed on the
  // fly and not stored.
  //
  // After solving the linear system in the first part of the function, we
  // copy the solution from the device to the host to be able to view its
  // values and display it in `output_results()`. This transfer works the same
  // as at the end of the previous function.
  template <int dim, int fe_degree>
  void HelmholtzProblem<dim, fe_degree>::solve()
  {
    PreconditionIdentity preconditioner;

    SolverControl solver_control(system_rhs_dev.size(),
                                 1e-12 * system_rhs_dev.l2_norm());
    SolverCG<LinearAlgebra::distributed::Vector<double, MemorySpace::CUDA>> cg(
      solver_control);
    cg.solve(*system_matrix_dev, solution_dev, system_rhs_dev, preconditioner);

    pcout << "  Solved in " << solver_control.last_step() << " iterations."
          << std::endl;

    LinearAlgebra::ReadWriteVector<double> rw_vector(locally_owned_dofs);
    rw_vector.import(solution_dev, VectorOperation::insert);
    ghost_solution_host.import(rw_vector, VectorOperation::insert);

    constraints.distribute(ghost_solution_host);

    ghost_solution_host.update_ghost_values();
  }

  // The output results function is as usual since we have already copied the
  // values back from the GPU to the CPU.
  //
  // While we're already doing something with the function, we might
  // as well compute the $L_2$ norm of the solution. We do this by
  // calling VectorTools::integrate_difference(). That function is
  // meant to compute the error by evaluating the difference between
  // the numerical solution (given by a vector of values for the
  // degrees of freedom) and an object representing the exact
  // solution. But we can easily compute the $L_2$ norm of the
  // solution by passing in a zero function instead. That is, instead
  // of evaluating the error $\|u_h-u\|_{L_2(\Omega)}$, we are just
  // evaluating $\|u_h-0\|_{L_2(\Omega)}=\|u_h\|_{L_2(\Omega)}$
  // instead.
  template <int dim, int fe_degree>
  void HelmholtzProblem<dim, fe_degree>::output_results(
    const unsigned int cycle) const
  {
    DataOut<dim> data_out;

    data_out.attach_dof_handler(dof_handler);
    data_out.add_data_vector(ghost_solution_host, "solution");
    data_out.build_patches();

    DataOutBase::VtkFlags flags;
    flags.compression_level = DataOutBase::VtkFlags::best_speed;
    data_out.set_flags(flags);
    data_out.write_vtu_with_pvtu_record(
      "./", "solution", cycle, mpi_communicator, 2);

    Vector<float> cellwise_norm(triangulation.n_active_cells());
    VectorTools::integrate_difference(dof_handler,
                                      ghost_solution_host,
                                      Functions::ZeroFunction<dim>(),
                                      cellwise_norm,
                                      QGauss<dim>(fe.degree + 2),
                                      VectorTools::L2_norm);
    const double global_norm =
      VectorTools::compute_global_error(triangulation,
                                        cellwise_norm,
                                        VectorTools::L2_norm);
    pcout << "  solution norm: " << global_norm << std::endl;
  }


  // There is nothing surprising in the `run()` function either. We simply
  // compute the solution on a series of (globally) refined meshes.
  template <int dim, int fe_degree>
  void HelmholtzProblem<dim, fe_degree>::run()
  {
    for (unsigned int cycle = 0; cycle < 7 - dim; ++cycle)
      {
        pcout << "Cycle " << cycle << std::endl;

        if (cycle == 0)
          GridGenerator::hyper_cube(triangulation, 0., 1.);
        triangulation.refine_global(1);

        setup_system();

        pcout << "   Number of active cells:       "
              << triangulation.n_global_active_cells() << std::endl
              << "   Number of degrees of freedom: " << dof_handler.n_dofs()
              << std::endl;

        assemble_rhs();
        solve();
        output_results(cycle);
        pcout << std::endl;
      }
  }
} // namespace Step64


// @sect3{The <code>main()</code> function}

// Finally for the `main()` function.  By default, all the MPI ranks
// will try to access the device with number 0, which we assume to be
// the GPU device associated with the CPU on which a particular MPI
// rank runs. This works, but if we are running with MPI support it
// may be that multiple MPI processes are running on the same machine
// (for example, one per CPU core) and then they would all want to
// access the same GPU on that machine. If there is only one GPU in
// the machine, there is nothing we can do about it: All MPI ranks on
// that machine need to share it. But if there is more than one GPU,
// then it is better to address different graphics cards for different
// processes. The choice below assigns GPUs round robin to MPI ranks,
// based on the MPI process id. (To work correctly, this
// scheme assumes that the MPI ranks on one machine are
// consecutive. If that were not the case, then the rank-GPU
// association may just not be optimal.) To make this work, MPI needs
// to be initialized before using this function.
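//
// (As a usage sketch only, not something the program requires: with a typical
// MPI installation one would launch the executable with a command along the
// lines of `mpirun -np 2 ./step-64`, and each rank then selects its device
// via the scheme just described.)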
int main(int argc, char *argv[])
{
  try
    {
      using namespace Step64;

      Utilities::MPI::MPI_InitFinalize mpi_init(argc, argv, 1);
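      // (The last argument of MPI_InitFinalize above limits each MPI process
      // to a single thread.)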

      int         n_devices       = 0;
      cudaError_t cuda_error_code = cudaGetDeviceCount(&n_devices);
      AssertCuda(cuda_error_code);
      const unsigned int my_mpi_id =
        Utilities::MPI::this_mpi_process(MPI_COMM_WORLD);
      const int device_id = my_mpi_id % n_devices;
      cuda_error_code     = cudaSetDevice(device_id);
      AssertCuda(cuda_error_code);

      HelmholtzProblem<3, 3> helmholtz_problem;
      helmholtz_problem.run();
    }
  catch (std::exception &exc)
    {
      std::cerr << std::endl
                << std::endl
                << "----------------------------------------------------"
                << std::endl;
      std::cerr << "Exception on processing: " << std::endl
                << exc.what() << std::endl
                << "Aborting!" << std::endl
                << "----------------------------------------------------"
                << std::endl;
      return 1;
    }
  catch (...)
    {
      std::cerr << std::endl
                << std::endl
                << "----------------------------------------------------"
                << std::endl;
      std::cerr << "Unknown exception!" << std::endl
                << "Aborting!" << std::endl
                << "----------------------------------------------------"
                << std::endl;
      return 1;
    }

  return 0;
}