/* ---------------------------------------------------------------------
 *
 * Copyright (C) 2019 - 2020 by the deal.II authors
 *
 * This file is part of the deal.II library.
 *
 * The deal.II library is free software; you can use it, redistribute
 * it, and/or modify it under the terms of the GNU Lesser General
 * Public License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * The full text of the license can be found in the file LICENSE.md at
 * the top level directory of deal.II.
 *
 * ---------------------------------------------------------------------
 *
 * Authors: Bruno Turcksin, Daniel Arndt, Oak Ridge National Laboratory, 2019
 */

// First include the necessary files from the deal.II library known from the
// previous tutorials.
#include <deal.II/base/conditional_ostream.h>
#include <deal.II/base/quadrature_lib.h>

#include <deal.II/dofs/dof_tools.h>

#include <deal.II/fe/fe_q.h>

#include <deal.II/grid/grid_generator.h>
#include <deal.II/grid/tria.h>

#include <deal.II/lac/affine_constraints.h>
#include <deal.II/lac/la_parallel_vector.h>
#include <deal.II/lac/precondition.h>
#include <deal.II/lac/solver_cg.h>

#include <deal.II/numerics/data_out.h>
#include <deal.II/numerics/vector_tools.h>

// The following ones include the data structures for the
// implementation of matrix-free methods on GPU:
#include <deal.II/base/cuda.h>

#include <deal.II/matrix_free/cuda_fe_evaluation.h>
#include <deal.II/matrix_free/cuda_matrix_free.h>
#include <deal.II/matrix_free/operators.h>

#include <fstream>


// As usual, we enclose everything into a namespace of its own:
namespace Step64
{
  using namespace dealii;

  // @sect3{Class <code>VaryingCoefficientFunctor</code>}

  // Next, we define a class that implements the varying coefficients
  // we want to use in the Helmholtz operator. Later, we want to pass
  // an object of this type to a CUDAWrappers::MatrixFree
  // object that expects the class to have an `operator()` that fills the
  // values provided in the constructor for a given cell. This operator
  // needs to run on the device, so it needs to be marked as `__device__`
  // for the compiler.
  template <int dim, int fe_degree>
  class VaryingCoefficientFunctor
  {
  public:
    VaryingCoefficientFunctor(double *coefficient)
      : coef(coefficient)
    {}

    __device__ void operator()(
      const unsigned int                                          cell,
      const typename CUDAWrappers::MatrixFree<dim, double>::Data *gpu_data);

    // Since CUDAWrappers::MatrixFree::Data doesn't know about the size of its
    // arrays, we need to store the number of quadrature points and the numbers
    // of degrees of freedom in this class to do necessary index conversions.
    static const unsigned int n_dofs_1d = fe_degree + 1;
    static const unsigned int n_local_dofs =
      dealii::Utilities::pow(n_dofs_1d, dim);
    static const unsigned int n_q_points =
      dealii::Utilities::pow(n_dofs_1d, dim);

  private:
    double *coef;
  };



  // The following function implements this coefficient. Recall from
  // the introduction that we have defined it as $a(\mathbf
  // x)=\frac{10}{0.05 + 2\|\mathbf x\|^2}$
  template <int dim, int fe_degree>
  __device__ void VaryingCoefficientFunctor<dim, fe_degree>::operator()(
    const unsigned int                                          cell,
    const typename CUDAWrappers::MatrixFree<dim, double>::Data *gpu_data)
  {
    const unsigned int pos = CUDAWrappers::local_q_point_id<dim, double>(
      cell, gpu_data, n_dofs_1d, n_q_points);
    const Point<dim> q_point =
      CUDAWrappers::get_quadrature_point<dim, double>(cell,
                                                      gpu_data,
                                                      n_dofs_1d);

    double p_square = 0.;
    for (unsigned int i = 0; i < dim; ++i)
      {
        const double coord = q_point[i];
        p_square += coord * coord;
      }
    coef[pos] = 10. / (0.05 + 2. * p_square);
  }


  // @sect3{Class <code>HelmholtzOperatorQuad</code>}

  // The class `HelmholtzOperatorQuad` implements the evaluation of
  // the Helmholtz operator at each quadrature point. It uses a
  // mechanism similar to the one of the MatrixFree framework introduced
  // in step-37. In contrast to there, the actual quadrature point
  // index is treated implicitly by converting the current thread
  // index. As before, the functions of this class need to run on
  // the device, so they need to be marked as `__device__` for the
  // compiler.
  template <int dim, int fe_degree>
  class HelmholtzOperatorQuad
  {
  public:
    __device__ HelmholtzOperatorQuad(double coef)
      : coef(coef)
    {}

    __device__ void
    operator()(CUDAWrappers::FEEvaluation<dim, fe_degree> *fe_eval) const;

  private:
    double coef;
  };


  // The Helmholtz problem we want to solve here reads in weak form as follows:
  // @f{eqnarray*}
  //   (\nabla v, \nabla u) + (v, a(\mathbf x) u) &=& (v,1) \quad \forall v.
  // @f}
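  // (This is just the weak form of the strong-form problem
  // $-\Delta u + a(\mathbf x)\,u = 1$ in $\Omega$, together with the
  // homogeneous Dirichlet boundary conditions that we will impose below
  // through an AffineConstraints object.)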
  // If you have seen step-37, then it will be obvious that
  // the two terms on the left-hand side correspond to the two function calls
  // here:
  template <int dim, int fe_degree>
  __device__ void HelmholtzOperatorQuad<dim, fe_degree>::
  operator()(CUDAWrappers::FEEvaluation<dim, fe_degree> *fe_eval) const
  {
    fe_eval->submit_value(coef * fe_eval->get_value()); // the (v, a(x) u) term
    fe_eval->submit_gradient(fe_eval->get_gradient());  // the (grad v, grad u) term
  }


  // @sect3{Class <code>LocalHelmholtzOperator</code>}

  // Finally, we need to define a class that implements the whole operator
  // evaluation that corresponds to a matrix-vector product in matrix-based
  // approaches.
  template <int dim, int fe_degree>
  class LocalHelmholtzOperator
  {
  public:
    LocalHelmholtzOperator(double *coefficient)
      : coef(coefficient)
    {}

    __device__ void operator()(
      const unsigned int                                          cell,
      const typename CUDAWrappers::MatrixFree<dim, double>::Data *gpu_data,
      CUDAWrappers::SharedData<dim, double> *                     shared_data,
      const double *                                              src,
      double *                                                    dst) const;

    // Again, the CUDAWrappers::MatrixFree object doesn't know about the number
    // of degrees of freedom and the number of quadrature points so we need
    // to store these for index calculations in the call operator.
    static const unsigned int n_dofs_1d    = fe_degree + 1;
    static const unsigned int n_local_dofs = Utilities::pow(fe_degree + 1, dim);
    static const unsigned int n_q_points   = Utilities::pow(fe_degree + 1, dim);

  private:
    double *coef;
  };


  // This is the call operator that performs the Helmholtz operator evaluation
  // on a given cell similar to the MatrixFree framework on the CPU.
  // In particular, we need access to both values and gradients of the source
  // vector and we write value and gradient information to the destination
  // vector.
  template <int dim, int fe_degree>
  __device__ void LocalHelmholtzOperator<dim, fe_degree>::operator()(
    const unsigned int                                          cell,
    const typename CUDAWrappers::MatrixFree<dim, double>::Data *gpu_data,
    CUDAWrappers::SharedData<dim, double> *                     shared_data,
    const double *                                              src,
    double *                                                    dst) const
  {
    const unsigned int pos = CUDAWrappers::local_q_point_id<dim, double>(
      cell, gpu_data, n_dofs_1d, n_q_points);

    CUDAWrappers::FEEvaluation<dim, fe_degree, fe_degree + 1, 1, double>
      fe_eval(cell, gpu_data, shared_data);
    fe_eval.read_dof_values(src);
    fe_eval.evaluate(true, true);
    fe_eval.apply_for_each_quad_point(
      HelmholtzOperatorQuad<dim, fe_degree>(coef[pos]));
    fe_eval.integrate(true, true);
    fe_eval.distribute_local_to_global(dst);
  }
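
  // To relate this back to step-37: roughly speaking, and only as a sketch
  // rather than code that is part of this program, the corresponding cell
  // operation in the CPU MatrixFree framework would spell out the loops over
  // cells and quadrature points explicitly, along the lines of
  // @code
  //   FEEvaluation<dim, fe_degree, fe_degree + 1, 1, double> phi(data);
  //   for (unsigned int cell = cell_range.first; cell < cell_range.second; ++cell)
  //     {
  //       phi.reinit(cell);
  //       phi.read_dof_values(src);
  //       phi.evaluate(true, true);
  //       for (unsigned int q = 0; q < phi.n_q_points; ++q)
  //         {
  //           phi.submit_value(coefficient(cell, q) * phi.get_value(q), q);
  //           phi.submit_gradient(phi.get_gradient(q), q);
  //         }
  //       phi.integrate(true, true);
  //       phi.distribute_local_to_global(dst);
  //     }
  // @endcode
  // where `coefficient(cell, q)` stands in for however the coefficient is
  // stored there. On the GPU, the loops over cells and quadrature points are
  // instead distributed over threads, which is why the call operator above
  // only sees a single `cell` index and hands the per-quadrature-point work
  // to HelmholtzOperatorQuad.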


  // @sect3{Class <code>HelmholtzOperator</code>}

  // The `HelmholtzOperator` class acts as a wrapper for
  // `LocalHelmholtzOperator`, defining an interface that can be used
  // with linear solvers like SolverCG. In particular, like every
  // class that implements the interface of a linear operator, it
  // needs to have a `vmult()` function that performs the action of
  // the linear operator on a source vector.
  template <int dim, int fe_degree>
  class HelmholtzOperator
  {
  public:
    HelmholtzOperator(const DoFHandler<dim> &          dof_handler,
                      const AffineConstraints<double> &constraints);

    void
    vmult(LinearAlgebra::distributed::Vector<double, MemorySpace::CUDA> &dst,
          const LinearAlgebra::distributed::Vector<double, MemorySpace::CUDA>
            &src) const;

    void initialize_dof_vector(
      LinearAlgebra::distributed::Vector<double, MemorySpace::CUDA> &vec) const;

  private:
    CUDAWrappers::MatrixFree<dim, double>       mf_data;
    LinearAlgebra::CUDAWrappers::Vector<double> coef;
  };


  // The following is the implementation of the constructor of this
  // class. In the first part, we initialize the `mf_data` member
  // variable that is going to provide us with the necessary
  // information when evaluating the operator.
  //
  // In the second half, we need to store the value of the coefficient
  // for each quadrature point in every active, locally owned cell.
  // We can ask the parallel triangulation for the number of active, locally
  // owned cells but only have a DoFHandler object at hand. Since
  // DoFHandler::get_triangulation() returns a Triangulation object, not a
  // parallel::TriangulationBase object, we have to downcast the return value.
  // This is safe to do here because we know that the triangulation is in
  // fact a parallel::distributed::Triangulation object.
  template <int dim, int fe_degree>
  HelmholtzOperator<dim, fe_degree>::HelmholtzOperator(
    const DoFHandler<dim> &          dof_handler,
    const AffineConstraints<double> &constraints)
  {
    MappingQGeneric<dim> mapping(fe_degree);
    typename CUDAWrappers::MatrixFree<dim, double>::AdditionalData
      additional_data;
    additional_data.mapping_update_flags = update_values | update_gradients |
                                           update_JxW_values |
                                           update_quadrature_points;
    const QGauss<1> quad(fe_degree + 1);
    mf_data.reinit(mapping, dof_handler, constraints, quad, additional_data);


    const unsigned int n_owned_cells =
      dynamic_cast<const parallel::TriangulationBase<dim> *>(
        &dof_handler.get_triangulation())
        ->n_locally_owned_active_cells();
    coef.reinit(Utilities::pow(fe_degree + 1, dim) * n_owned_cells);

    const VaryingCoefficientFunctor<dim, fe_degree> functor(coef.get_values());
    mf_data.evaluate_coefficients(functor);
  }
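
  // As a side note: if one wanted to guard against accidentally using this
  // class with a non-parallel Triangulation, the downcast above could be
  // checked explicitly. This is only a sketch of one way to do it, not part
  // of the code path of this program:
  // @code
  //   const auto *parallel_tria =
  //     dynamic_cast<const parallel::TriangulationBase<dim> *>(
  //       &dof_handler.get_triangulation());
  //   Assert(parallel_tria != nullptr,
  //          ExcMessage("This operator requires a parallel triangulation."));
  //   const unsigned int n_owned_cells =
  //     parallel_tria->n_locally_owned_active_cells();
  // @endcode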


  // The key step then is to use all of the previous classes to loop over
  // all cells to perform the matrix-vector product. We implement this
  // in the next function.
  //
  // When applying the Helmholtz operator, we have to be careful to handle
  // boundary conditions correctly. Since the local operator doesn't know about
  // constraints, we have to copy the correct values from the source to the
  // destination vector afterwards.
  template <int dim, int fe_degree>
  void HelmholtzOperator<dim, fe_degree>::vmult(
    LinearAlgebra::distributed::Vector<double, MemorySpace::CUDA> &      dst,
    const LinearAlgebra::distributed::Vector<double, MemorySpace::CUDA> &src)
    const
  {
    dst = 0.;
    LocalHelmholtzOperator<dim, fe_degree> helmholtz_operator(
      coef.get_values());
    mf_data.cell_loop(helmholtz_operator, src, dst);
    mf_data.copy_constrained_values(src, dst);
  }
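
  // With this, a single operator application looks exactly like a
  // matrix-vector product with an assembled matrix would. As a sketch of how
  // the class is meant to be used (the names `A`, `x`, and `y` are made up
  // purely for illustration):
  // @code
  //   HelmholtzOperator<dim, fe_degree> A(dof_handler, constraints);
  //   LinearAlgebra::distributed::Vector<double, MemorySpace::CUDA> x, y;
  //   A.initialize_dof_vector(x);
  //   A.initialize_dof_vector(y);
  //   // ...fill x...
  //   A.vmult(y, x); // y = A x
  // @endcode
  // This is the interface SolverCG relies on in the solve() function below.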



  template <int dim, int fe_degree>
  void HelmholtzOperator<dim, fe_degree>::initialize_dof_vector(
    LinearAlgebra::distributed::Vector<double, MemorySpace::CUDA> &vec) const
  {
    mf_data.initialize_dof_vector(vec);
  }


  // @sect3{Class <code>HelmholtzProblem</code>}

  // This is the main class of this program. It defines the usual
  // framework we use for tutorial programs. The only point worth
  // commenting on is the `solve()` function and the choice of vector
  // types.
  template <int dim, int fe_degree>
  class HelmholtzProblem
  {
  public:
    HelmholtzProblem();

    void run();

  private:
    void setup_system();

    void assemble_rhs();

    void solve();

    void output_results(const unsigned int cycle) const;

    MPI_Comm mpi_communicator;

    parallel::distributed::Triangulation<dim> triangulation;

    FE_Q<dim>       fe;
    DoFHandler<dim> dof_handler;

    IndexSet locally_owned_dofs;
    IndexSet locally_relevant_dofs;

    AffineConstraints<double>                           constraints;
    std::unique_ptr<HelmholtzOperator<dim, fe_degree>>  system_matrix_dev;

    // Since all the operations in the `solve()` function are executed on the
    // graphics card, the vectors used there need to store their values in GPU
    // memory as well. LinearAlgebra::distributed::Vector can be told which
    // memory space to use. There is also LinearAlgebra::CUDAWrappers::Vector
    // that always uses GPU memory storage but doesn't work with MPI. It is
    // worth noting that the communication between different MPI processes
    // can be improved if the MPI implementation is CUDA-aware and the
    // configure flag `DEAL_II_MPI_WITH_CUDA_SUPPORT` is enabled. (The value
    // of this flag needs to be set at the time you call `cmake` when
    // installing deal.II.)
    //
    // In addition, we also keep a solution vector with CPU storage such that
    // we can view and display the solution as usual.
    LinearAlgebra::distributed::Vector<double, MemorySpace::Host>
      ghost_solution_host;
    LinearAlgebra::distributed::Vector<double, MemorySpace::CUDA> solution_dev;
    LinearAlgebra::distributed::Vector<double, MemorySpace::CUDA>
      system_rhs_dev;

    ConditionalOStream pcout;
  };


  // The implementation of all the remaining functions of this class apart from
  // `HelmholtzProblem::solve()` doesn't contain anything new and we won't
  // comment much further on the overall approach.
  template <int dim, int fe_degree>
  HelmholtzProblem<dim, fe_degree>::HelmholtzProblem()
    : mpi_communicator(MPI_COMM_WORLD)
    , triangulation(mpi_communicator)
    , fe(fe_degree)
    , dof_handler(triangulation)
    , pcout(std::cout, Utilities::MPI::this_mpi_process(mpi_communicator) == 0)
  {}



  template <int dim, int fe_degree>
  void HelmholtzProblem<dim, fe_degree>::setup_system()
  {
    dof_handler.distribute_dofs(fe);

    locally_owned_dofs = dof_handler.locally_owned_dofs();
    DoFTools::extract_locally_relevant_dofs(dof_handler, locally_relevant_dofs);
    system_rhs_dev.reinit(locally_owned_dofs, mpi_communicator);

    constraints.clear();
    constraints.reinit(locally_relevant_dofs);
    DoFTools::make_hanging_node_constraints(dof_handler, constraints);
    VectorTools::interpolate_boundary_values(dof_handler,
                                             0,
                                             Functions::ZeroFunction<dim>(),
                                             constraints);
    constraints.close();

    system_matrix_dev.reset(
      new HelmholtzOperator<dim, fe_degree>(dof_handler, constraints));

    ghost_solution_host.reinit(locally_owned_dofs,
                               locally_relevant_dofs,
                               mpi_communicator);
    system_matrix_dev->initialize_dof_vector(solution_dev);
    system_rhs_dev.reinit(solution_dev);
  }



  // Unlike programs such as step-4 or step-6, we will not have to
  // assemble the whole linear system but only the right hand side
  // vector. This looks in essence like we did in step-4, for example,
  // but we have to pay attention to using the right constraints
  // object when copying local contributions into the global
  // vector. In particular, we need to make sure the entries that
  // correspond to boundary nodes are properly zeroed out. This is
  // necessary for CG to converge. (Another solution would be to
  // modify the `vmult()` function above in such a way that we pretend
  // the source vector has zero entries by just not taking them into
  // account in matrix-vector products. But the approach used here is
  // simpler.)
  //
  // At the end of the function, we can't directly copy the values
  // from the host to the device but need to use an intermediate
  // object of type LinearAlgebra::ReadWriteVector to construct the
  // correct communication pattern necessary.
  template <int dim, int fe_degree>
  void HelmholtzProblem<dim, fe_degree>::assemble_rhs()
  {
    LinearAlgebra::distributed::Vector<double, MemorySpace::Host>
      system_rhs_host(locally_owned_dofs,
                      locally_relevant_dofs,
                      mpi_communicator);
    const QGauss<dim> quadrature_formula(fe_degree + 1);

    FEValues<dim> fe_values(fe,
                            quadrature_formula,
                            update_values | update_quadrature_points |
                              update_JxW_values);

    const unsigned int dofs_per_cell = fe.n_dofs_per_cell();
    const unsigned int n_q_points    = quadrature_formula.size();

    Vector<double> cell_rhs(dofs_per_cell);

    std::vector<types::global_dof_index> local_dof_indices(dofs_per_cell);

    for (const auto &cell : dof_handler.active_cell_iterators())
      if (cell->is_locally_owned())
        {
          cell_rhs = 0;

          fe_values.reinit(cell);

          for (unsigned int q_index = 0; q_index < n_q_points; ++q_index)
            {
              for (unsigned int i = 0; i < dofs_per_cell; ++i)
                cell_rhs(i) += (fe_values.shape_value(i, q_index) * 1.0 *
                                fe_values.JxW(q_index));
            }

          cell->get_dof_indices(local_dof_indices);
          constraints.distribute_local_to_global(cell_rhs,
                                                 local_dof_indices,
                                                 system_rhs_host);
        }
    system_rhs_host.compress(VectorOperation::add);

    LinearAlgebra::ReadWriteVector<double> rw_vector(locally_owned_dofs);
    rw_vector.import(system_rhs_host, VectorOperation::insert);
    system_rhs_dev.import(rw_vector, VectorOperation::insert);
  }



  // This solve() function finally contains the calls to the new classes
  // previously discussed. Here we don't use any preconditioner, i.e.,
  // precondition by the identity matrix, to focus just on the peculiarities of
  // the CUDAWrappers::MatrixFree framework. Of course, in a real application
  // the choice of a suitable preconditioner is crucial but we have at least
  // the same restrictions as in step-37 since matrix entries are computed on
  // the fly and not stored.
  //
  // After solving the linear system in the first part of the function, we
  // copy the solution from the device to the host to be able to view its
  // values and display it in `output_results()`. This transfer works the same
  // as at the end of the previous function.
  template <int dim, int fe_degree>
  void HelmholtzProblem<dim, fe_degree>::solve()
  {
    PreconditionIdentity preconditioner;

    SolverControl solver_control(system_rhs_dev.size(),
                                 1e-12 * system_rhs_dev.l2_norm());
    SolverCG<LinearAlgebra::distributed::Vector<double, MemorySpace::CUDA>> cg(
      solver_control);
    cg.solve(*system_matrix_dev, solution_dev, system_rhs_dev, preconditioner);

    pcout << " Solved in " << solver_control.last_step() << " iterations."
          << std::endl;

    LinearAlgebra::ReadWriteVector<double> rw_vector(locally_owned_dofs);
    rw_vector.import(solution_dev, VectorOperation::insert);
    ghost_solution_host.import(rw_vector, VectorOperation::insert);

    constraints.distribute(ghost_solution_host);

    ghost_solution_host.update_ghost_values();
  }
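
  // If one did want to experiment with preconditioning, note that SolverCG
  // only requires the preconditioner to provide a `vmult(dst, src)` operation
  // on the same vector type, so any such object could be swapped in for
  // PreconditionIdentity above. Purely as an illustrative sketch (the class
  // name here is made up):
  // @code
  //   MyDevicePreconditioner preconditioner(*system_matrix_dev);
  //   cg.solve(*system_matrix_dev, solution_dev, system_rhs_dev,
  //            preconditioner);
  // @endcode
  // Building a good matrix-free preconditioner is, of course, the hard part
  // and beyond the scope of this program.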

  // The output results function is as usual since we have already copied the
  // values back from the GPU to the CPU.
  //
  // While we're already doing something with the function, we might
  // as well compute the $L_2$ norm of the solution. We do this by
  // calling VectorTools::integrate_difference(). That function is
  // meant to compute the error by evaluating the difference between
  // the numerical solution (given by a vector of values for the
  // degrees of freedom) and an object representing the exact
  // solution. But we can easily compute the $L_2$ norm of the
  // solution by passing in a zero function instead. That is, instead
  // of evaluating the error $\|u_h-u\|_{L_2(\Omega)}$, we are just
  // evaluating $\|u_h-0\|_{L_2(\Omega)}=\|u_h\|_{L_2(\Omega)}$
  // instead.
  template <int dim, int fe_degree>
  void HelmholtzProblem<dim, fe_degree>::output_results(
    const unsigned int cycle) const
  {
    DataOut<dim> data_out;

    data_out.attach_dof_handler(dof_handler);
    data_out.add_data_vector(ghost_solution_host, "solution");
    data_out.build_patches();

    DataOutBase::VtkFlags flags;
    flags.compression_level = DataOutBase::VtkFlags::best_speed;
    data_out.set_flags(flags);
    data_out.write_vtu_with_pvtu_record(
      "./", "solution", cycle, mpi_communicator, 2);

    Vector<float> cellwise_norm(triangulation.n_active_cells());
    VectorTools::integrate_difference(dof_handler,
                                      ghost_solution_host,
                                      Functions::ZeroFunction<dim>(),
                                      cellwise_norm,
                                      QGauss<dim>(fe.degree + 2),
                                      VectorTools::L2_norm);
    const double global_norm =
      VectorTools::compute_global_error(triangulation,
                                        cellwise_norm,
                                        VectorTools::L2_norm);
    pcout << " solution norm: " << global_norm << std::endl;
  }


  // There is nothing surprising in the `run()` function either. We simply
  // compute the solution on a series of (globally) refined meshes.
  template <int dim, int fe_degree>
  void HelmholtzProblem<dim, fe_degree>::run()
  {
    for (unsigned int cycle = 0; cycle < 7 - dim; ++cycle)
      {
        pcout << "Cycle " << cycle << std::endl;

        if (cycle == 0)
          GridGenerator::hyper_cube(triangulation, 0., 1.);
        triangulation.refine_global(1);

        setup_system();

        pcout << " Number of active cells: "
              << triangulation.n_global_active_cells() << std::endl
              << " Number of degrees of freedom: " << dof_handler.n_dofs()
              << std::endl;

        assemble_rhs();
        solve();
        output_results(cycle);
        pcout << std::endl;
      }
  }
} // namespace Step64


// @sect3{The <code>main()</code> function}

// Finally for the `main()` function. By default, all the MPI ranks
// will try to access the device with number 0, which we assume to be
// the GPU device associated with the CPU on which a particular MPI
// rank runs. This works, but if we are running with MPI support it
// may be that multiple MPI processes are running on the same machine
// (for example, one per CPU core) and then they would all want to
// access the same GPU on that machine. If there is only one GPU in
// the machine, there is nothing we can do about it: All MPI ranks on
// that machine need to share it. But if there is more than one GPU,
// then it is better to address different graphics cards for different
// processes. The choice below is based on the MPI process id, by
// assigning GPUs round robin to MPI ranks. (To work correctly, this
// scheme assumes that the MPI ranks on one machine are
// consecutive. If that were not the case, then the rank-GPU
// association may just not be optimal.) To make this work, MPI needs
// to be initialized before using this function.
int main(int argc, char *argv[])
{
  try
    {
      using namespace Step64;

      Utilities::MPI::MPI_InitFinalize mpi_init(argc, argv, 1);

      int         n_devices       = 0;
      cudaError_t cuda_error_code = cudaGetDeviceCount(&n_devices);
      AssertCuda(cuda_error_code);
      const unsigned int my_mpi_id =
        Utilities::MPI::this_mpi_process(MPI_COMM_WORLD);
      const int device_id = my_mpi_id % n_devices;
      cuda_error_code     = cudaSetDevice(device_id);
      AssertCuda(cuda_error_code);

      HelmholtzProblem<3, 3> helmholtz_problem;
      helmholtz_problem.run();
    }
  catch (std::exception &exc)
    {
      std::cerr << std::endl
                << std::endl
                << "----------------------------------------------------"
                << std::endl;
      std::cerr << "Exception on processing: " << std::endl
                << exc.what() << std::endl
                << "Aborting!" << std::endl
                << "----------------------------------------------------"
                << std::endl;
      return 1;
    }
  catch (...)
    {
      std::cerr << std::endl
                << std::endl
                << "----------------------------------------------------"
                << std::endl;
      std::cerr << "Unknown exception!" << std::endl
                << "Aborting!" << std::endl
                << "----------------------------------------------------"
                << std::endl;
      return 1;
    }

  return 0;
}