// ---------------------------------------------------------------------
//
// Copyright (C) 1999 - 2019 by the deal.II authors
//
// This file is part of the deal.II library.
//
// The deal.II library is free software; you can use it, redistribute
// it, and/or modify it under the terms of the GNU Lesser General
// Public License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// The full text of the license can be found in the file LICENSE.md at
// the top level directory of deal.II.
//
// ---------------------------------------------------------------------

#ifndef dealii_sparse_vanka_h
#define dealii_sparse_vanka_h



#include <deal.II/base/config.h>

#include <deal.II/base/multithread_info.h>
#include <deal.II/base/smartpointer.h>

#include <map>
#include <vector>

DEAL_II_NAMESPACE_OPEN

// Forward declarations
#ifndef DOXYGEN
template <typename number>
class FullMatrix;
template <typename number>
class SparseMatrix;
template <typename number>
class Vector;

template <typename number>
class SparseVanka;
template <typename number>
class SparseBlockVanka;
#endif

/*! @addtogroup Preconditioners
 *@{
 */

/**
 * Point-wise Vanka preconditioning. This class does Vanka preconditioning on
 * a point-wise base. Vanka preconditioners are used for saddle point problems
 * like Stokes' problem or problems arising in optimization where Lagrange
 * multipliers occur and the Newton method matrix has a zero block. With these
 * matrices the application of Jacobi or Gauss-Seidel methods is impossible,
 * because some diagonal elements are zero in the rows of the Lagrange
 * multiplier. The approach of Vanka is to solve a small (usually indefinite)
 * system of equations for each Lagrange multiplier variable (we will also
 * call the pressure in Stokes' equation a Lagrange multiplier since it can
 * be interpreted as such).
 *
 * Objects of this class are constructed by passing a vector of indices of the
 * degrees of freedom of the Lagrange multiplier. In the actual
 * preconditioning method, these rows are traversed in the order in which they
 * appear in the matrix. Since this is a Gauß-Seidel like procedure, remember
 * to have a good ordering in advance (for transport dominated problems,
 * Cuthill-McKee algorithms are a good means for this, if points on the inflow
 * boundary are chosen as starting points for the renumbering).
 *
 * For each selected degree of freedom, a local system of equations is built
 * from the degree of freedom itself and all other values coupling immediately,
 * i.e. the set of degrees of freedom considered for the local system of
 * equations for degree of freedom @p i is @p i itself and all @p j such that
 * the element <tt>(i,j)</tt> is a nonzero entry in the sparse matrix under
 * consideration. The elements <tt>(j,i)</tt> are not considered. We now pick
 * all matrix entries whose rows and columns belong to the set of degrees of
 * freedom just described, copy them out of the global matrix, and put them
 * into a local matrix, which is subsequently inverted. This system may be of
 * different size for each degree of freedom, depending for example on the
 * local neighborhood of the respective node on a computational grid.
 *
 * The right hand side is built up in the same way, i.e. by copying all
 * entries that couple with the one under present consideration, but it is
 * augmented by all degrees of freedom coupling with the degrees from the set
 * described above (i.e. the DoFs coupling second order to the present one).
 * The reason for this is that the local problems to be solved shall have
 * Dirichlet boundary conditions on the second order coupling DoFs, so we have
 * to take them into account but eliminate them before actually solving; this
 * elimination is done by the modification of the right hand side, and in the
 * end these degrees of freedom do not occur in the matrix and solution vector
 * any more at all.
 *
 * This local system is solved and the values are updated into the destination
 * vector.
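 *
 * The following sketch illustrates the basic idea in code. It is only an
 * illustration and not the actual implementation of this class: in
 * particular, it ignores the special treatment of the second order
 * couplings described above and simply solves one local system per selected
 * degree of freedom; the function <tt>naive_vanka_sweep</tt> and all names
 * in it are made up for this example.
 * @code
 *   template <typename number>
 *   void naive_vanka_sweep(const SparseMatrix<number> &matrix,
 *                          const std::vector<bool> &   selected,
 *                          const Vector<number> &      src,
 *                          Vector<number> &            dst)
 *   {
 *     for (types::global_dof_index row = 0; row < matrix.m(); ++row)
 *       if (selected[row])
 *         {
 *           // assign local numbers to all DoFs coupling with 'row'
 *           std::map<types::global_dof_index, unsigned int> local_index;
 *           for (auto entry = matrix.begin(row); entry != matrix.end(row);
 *                ++entry)
 *             local_index.emplace(entry->column(), local_index.size());
 *
 *           // copy the corresponding entries of the global matrix and of
 *           // the residual into a small dense system
 *           const std::size_t  n_local = local_index.size();
 *           FullMatrix<number> local_matrix(n_local, n_local);
 *           Vector<number>     local_rhs(n_local), local_solution(n_local);
 *           for (const auto &p : local_index)
 *             {
 *               local_rhs(p.second) = src(p.first);
 *               for (auto entry = matrix.begin(p.first);
 *                    entry != matrix.end(p.first);
 *                    ++entry)
 *                 {
 *                   const auto q = local_index.find(entry->column());
 *                   if (q != local_index.end())
 *                     local_matrix(p.second, q->second) = entry->value();
 *                 }
 *             }
 *
 *           // invert the local matrix and scatter the update back
 *           local_matrix.gauss_jordan();
 *           local_matrix.vmult(local_solution, local_rhs);
 *           for (const auto &p : local_index)
 *             dst(p.first) = local_solution(p.second);
 *         }
 *   }
 * @endcode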
 *
 * Remark: the Vanka method is a non-symmetric preconditioning method.
 *
 *
 * <h3>Example of Use</h3> This little example is taken from a program doing
 * parameter optimization. The Lagrange multiplier is the third component of
 * the finite element used. The system is solved by the GMRES method.
 * @code
 *    // tag the Lagrange multiplier variable
 *    std::vector<bool> signature(3);
 *    signature[0] = signature[1] = false;
 *    signature[2] = true;
 *
 *    // tag all dofs belonging to the Lagrange multiplier
 *    std::vector<bool> selected_dofs (dof.n_dofs(), false);
 *    DoFTools::extract_dofs(dof, signature, selected_dofs);
 *    // create the Vanka object
 *    SparseVanka<double> vanka (global_matrix, selected_dofs);
 *
 *    // create the solver
 *    SolverGMRES<> gmres(control, memory, SolverGMRES<>::AdditionalData(504));
 *
 *    // solve
 *    gmres.solve (global_matrix, solution, right_hand_side,
 *                 vanka);
 * @endcode
 *
 *
 * <h4>Implementor's remark</h4> At present, the local matrices are built up
 * such that the degree of freedom associated with the local Lagrange
 * multiplier is the first one. Thus, usually the upper left entry in the
 * local matrix is zero. It is not clear to me (W.B.) whether this might pose
 * some problems in the inversion of the local matrices. Maybe someone would
 * like to check this.
 *
 * @note Instantiations for this template are provided for <tt>@<float@> and
 * @<double@></tt>; others can be generated in application programs (see the
 * section on
 * @ref Instantiations
 * in the manual).
 */
template <typename number>
class SparseVanka
{
public:
  /**
   * Declare type for container size.
   */
  using size_type = types::global_dof_index;

  /**
   * Constructor. Does nothing.
   *
   * Call the initialize() function before using this object as preconditioner
   * (vmult()).
   */
  SparseVanka();

  /**
   * Constructor which also takes two deprecated inputs.
   *
   * @deprecated The use of the last two parameters is deprecated. They are
   * currently ignored.
   */
  DEAL_II_DEPRECATED
  SparseVanka(const SparseMatrix<number> &M,
              const std::vector<bool> &   selected,
              const bool                  conserve_memory,
              const unsigned int n_threads = MultithreadInfo::n_threads());

  /**
   * Constructor. Gets the matrix for preconditioning and a bit vector with
   * entries @p true for all rows to be updated. A reference to this vector
   * will be stored, so it must persist longer than the Vanka object. The same
   * is true for the matrix.
   *
   * The matrix @p M which is passed here may or may not be the same matrix
   * for which this object shall act as preconditioner. In particular, it is
   * conceivable that the preconditioner is built up for one matrix once, but
   * is used for subsequent steps in a nonlinear process as well, where the
   * matrix changes in each step slightly.
   */
  SparseVanka(const SparseMatrix<number> &M, const std::vector<bool> &selected);

  /**
   * Destructor. Delete all allocated matrices.
   */
  ~SparseVanka();

  /**
   * Parameters for SparseVanka.
   */
  class AdditionalData
  {
  public:
    /**
     * Constructor. For the parameters' description, see below.
     */
    explicit AdditionalData(const std::vector<bool> &selected);

    /**
     * Constructor. For the parameters' description, see below.
     *
     * @deprecated The use of this constructor is deprecated - the second and
     * third parameters are ignored.
     */
    DEAL_II_DEPRECATED
    AdditionalData(const std::vector<bool> &selected,
                   const bool               conserve_memory,
                   const unsigned int n_threads = MultithreadInfo::n_threads());

    /**
     * Indices of those degrees of freedom that we shall work on.
     */
    const std::vector<bool> &selected;
  };


  /**
   * If the default constructor is used then this function needs to be called
   * before an object of this class is used as preconditioner.
   *
   * For more detail about possible parameters, see the class documentation
   * and the documentation of the SparseVanka::AdditionalData class.
   *
   * After this function is called the preconditioner is ready to be used
   * (using the <code>vmult</code> function of derived classes).
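   *
   * A minimal usage sketch (assuming a matrix @p global_matrix and a
   * selection mask @p selected_dofs as in the example in the class
   * documentation) might look as follows:
   * @code
   *   SparseVanka<double> vanka;
   *   vanka.initialize(global_matrix,
   *                    SparseVanka<double>::AdditionalData(selected_dofs));
   *   // 'vanka' can now be used as a preconditioner, e.g. via vanka.vmult()
   * @endcode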
   */
  void
  initialize(const SparseMatrix<number> &M,
             const AdditionalData &      additional_data);

  /**
   * Do the preconditioning. This function takes the residual in @p src and
   * returns the resulting update vector in @p dst.
   */
  template <typename number2>
  void
  vmult(Vector<number2> &dst, const Vector<number2> &src) const;

  /**
   * Apply transpose preconditioner. This function takes the residual in
   * @p src and returns the resulting update vector in @p dst.
   */
  template <typename number2>
  void
  Tvmult(Vector<number2> &dst, const Vector<number2> &src) const;

  /**
   * Return the dimension of the codomain (or range) space. Note that the
   * matrix is of dimension $m \times n$.
   *
   * @note This function should only be called if the preconditioner has been
   * initialized.
   */
  size_type
  m() const;

  /**
   * Return the dimension of the domain space. Note that the matrix is of
   * dimension $m \times n$.
   *
   * @note This function should only be called if the preconditioner has been
   * initialized.
   */
  size_type
  n() const;

protected:
  /**
   * Apply the inverses corresponding to those degrees of freedom that have a
   * @p true value in @p dof_mask, to the @p src vector and move the result
   * into @p dst. Actually, only values for allowed indices are written to @p
   * dst, so the application of this function only does what is announced in
   * the general documentation if the given mask is set to @p true for all
   * degrees of freedom.
   *
   * The reason for providing the mask anyway is that in derived classes we
   * may want to apply the preconditioner to parts of the matrix only, in
   * order to parallelize the application. Then, it is important to only write
   * to some slices of @p dst, in order to eliminate the dependencies of the
   * threads on each other.
   *
   * If a null pointer is passed instead of a pointer to the @p dof_mask (as
   * is the default value), then it is assumed that we shall work on all
   * degrees of freedom. This is then equivalent to calling the function with
   * a <tt>vector<bool>(n_dofs,true)</tt>.
   *
   * The @p vmult of this class of course calls this function with a null
   * pointer.
   */
  template <typename number2>
  void
  apply_preconditioner(Vector<number2> &              dst,
                       const Vector<number2> &        src,
                       const std::vector<bool> *const dof_mask = nullptr) const;

  /**
   * Determine an estimate for the memory consumption (in bytes) of this
   * object.
   */
  std::size_t
  memory_consumption() const;
private:
  /**
   * Pointer to the matrix.
   */
  SmartPointer<const SparseMatrix<number>, SparseVanka<number>> matrix;

  /**
   * Indices of those degrees of freedom that we shall work on.
   */
  const std::vector<bool> *selected;

  /**
   * Array of inverse matrices, one for each degree of freedom. Only those
   * elements will be used that are tagged in @p selected.
   */
  mutable std::vector<SmartPointer<FullMatrix<float>, SparseVanka<number>>>
    inverses;

  /**
   * The dimension of the range space.
   */
  size_type _m;

  /**
   * The dimension of the domain space.
   */
  size_type _n;

  /**
   * Compute the inverses of all selected diagonal elements.
   */
  void
  compute_inverses();

  /**
   * Compute the inverses at positions in the range <tt>[begin,end)</tt>. In
   * non-multithreaded mode, <tt>compute_inverses()</tt> calls this function
   * with the whole range, but in multithreaded mode, several copies of this
   * function are spawned.
   */
  void
  compute_inverses(const size_type begin, const size_type end);

  /**
   * Compute the inverse of the block located at position @p row. Since the
   * vector is used quite often, it is generated only once in the caller of
   * this function and passed to this function which first clears it. Reusing
   * the vector makes the process significantly faster than in the case where
   * this function re-creates it each time.
   */
  void
  compute_inverse(const size_type row, std::vector<size_type> &local_indices);

  // Make the derived class a friend. This seems silly, but is actually
  // necessary, since derived classes can only access non-public members
  // through their @p this pointer, but not access these members as member
  // functions of other objects of the type of this base class (i.e. like
  // <tt>x.f()</tt>, where @p x is an object of the base class, and @p f one
  // of its non-public member functions).
  //
  // Now, in the case of the @p SparseBlockVanka class, we would like to take
  // the address of a function of the base class in order to call it through
  // the multithreading framework, so the derived class has to be a friend.
  template <typename T>
  friend class SparseBlockVanka;
};



/**
 * Block version of the sparse Vanka preconditioner. This class divides the
 * matrix into blocks and works on the diagonal blocks only, which of course
 * reduces the efficiency as preconditioner, but is perfectly parallelizable.
 * The constructor takes a parameter specifying into how many blocks the
 * matrix shall be subdivided and then lets the underlying class do the work.
 * Division of the matrix is done in several ways which are described in
 * detail below.
 *
 * This class is probably useless if you don't have a multiprocessor system,
 * since then the amount of work per preconditioning step is the same as for
 * the @p SparseVanka class, but preconditioning properties are worse. On the
 * other hand, if you have a multiprocessor system, the worse preconditioning
 * quality (leading to more iterations of the linear solver) usually is well
 * balanced by the increased speed of application due to the parallelization,
 * leading to an overall decrease in elapsed wall-time for solving your linear
 * system. It should be noted that the quality as preconditioner reduces with
 * growing number of blocks, so there may be an optimal value (in terms of
 * wall-time per linear solve) for the number of blocks.
 *
 * To facilitate writing portable code, if the number of blocks into which the
 * matrix is to be subdivided is set to one, then this class acts just like
 * the @p SparseVanka class. You may therefore want to set the number of
 * blocks equal to the number of processors you have.
 *
 * Note that the parallelization is done if <tt>deal.II</tt> was configured
 * for multithread use and that the number of threads which is spawned equals
 * the number of blocks. This is reasonable since you will not want to set the
 * number of blocks unnecessarily large, since, as mentioned, this reduces the
 * preconditioning properties.
 *
 *
 * <h3>Splitting the matrix into blocks</h3>
 *
 * Splitting the matrix into blocks is always done in a way such that the
 * blocks are not necessarily of equal size, but such that the number of
 * selected degrees of freedom for which a local system is to be solved is
 * equal between blocks. The reason for this subdivision strategy is load
 * balancing for multithreading. There are several possibilities to actually
 * split the matrix into blocks, which are selected by the flag
 * @p blocking_strategy that is passed to the constructor. By a block, we will
 * in the sequel denote a list of indices of degrees of freedom; the algorithm
 * will work on each block separately, i.e. the solutions of the local systems
 * corresponding to a degree of freedom of one block will only be used to
 * update the degrees of freedom belonging to the same block, but never to
 * update degrees of freedom of other blocks. A block can be a consecutive
 * list of indices, as in the first alternative below, or a nonconsecutive
 * list of indices. Of course, we assume that the intersection of each two
 * blocks is empty and that the union of all blocks equals the interval
 * <tt>[0,N)</tt>, where @p N is the number of degrees of freedom of the
 * system of equations.
 *
 * <ul>
 * <li> @p index_intervals: Here, we choose the blocks to be intervals
 * <tt>[a_i,a_{i+1})</tt>, i.e. consecutive degrees of freedom are usually
 * also within the same block. This is a reasonable strategy if the degrees
 * of freedom have, for example, been re-numbered using the Cuthill-McKee
 * algorithm, in which spatially neighboring degrees of freedom have
 * neighboring indices. In that case, coupling in the matrix is usually
 * restricted to the vicinity of the diagonal as well, and we can simply cut
 * the matrix into blocks.
 *
 * The bounds of the intervals, i.e. the @p a_i above, are chosen such that
 * the number of degrees of freedom on which we shall work (i.e. usually the
 * degrees of freedom corresponding to Lagrange multipliers) is about the same
 * in each block; this does not mean, however, that the sizes of the blocks
 * are equal, since the blocks also comprise the other degrees of freedom for
 * which no local system is solved. In the extreme case, consider that all
 * Lagrange multipliers are sorted to the end of the range of DoF indices,
 * then the first block would be very large, since it comprises all other DoFs
 * and some Lagrange multipliers, while all other blocks are rather small and
 * comprise only Lagrange multipliers. This strategy therefore does not only
 * depend on the order in which the Lagrange DoFs are sorted, but also on the
 * order in which the other DoFs are sorted. It is therefore necessary to note
 * that this almost renders the capability as preconditioner useless if the
 * degrees of freedom are numbered by component, i.e. all Lagrange multipliers
 * en bloc.
 *
 * <li> @p adaptive: This strategy is a bit more clever in cases where the
 * Lagrange DoFs are clustered, as in the example above. It works as follows:
 * it first groups the Lagrange DoFs into blocks, using the same strategy as
 * above. However, instead of grouping the other DoFs into the blocks of
 * Lagrange DoFs with nearest DoF index, it decides for each non-Lagrange DoF
 * to put it into the block of Lagrange DoFs which write to this non-Lagrange
 * DoF most often. This makes it possible to even sort the Lagrange DoFs to
 * the end and still associate spatially neighboring non-Lagrange DoFs to the
 * same blocks where the respective Lagrange DoFs are, since they couple to
 * each other while spatially distant DoFs don't couple.
 *
 * The additional computational effort of sorting the non-Lagrange DoFs is not
 * very large compared with the inversion of the local systems and applying
 * the preconditioner, so this strategy might be reasonable if you want to
 * sort your degrees of freedom by component. If the degrees of freedom are
 * not sorted by component, the results of the two strategies outlined above
 * do not differ much. However, unlike the first strategy, the performance of
 * the second strategy does not deteriorate if the DoFs are renumbered by
 * component. An example of selecting a strategy when constructing the
 * preconditioner is shown below this list.
 * </ul>
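 *
 * As an illustration only (a sketch, not taken from an actual program; it
 * reuses the names @p global_matrix and @p selected_dofs as well as the GMRES
 * solver from the example in the documentation of the SparseVanka class, and
 * the choice of four blocks is arbitrary), a block version with the adaptive
 * blocking strategy could be set up like this:
 * @code
 *    SparseBlockVanka<double> vanka(global_matrix,
 *                                   selected_dofs,
 *                                   4,
 *                                   SparseBlockVanka<double>::adaptive);
 *
 *    gmres.solve(global_matrix, solution, right_hand_side, vanka);
 * @endcode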
 *
 *
 * <h3>Typical results</h3>
 *
 * As a prototypical test case, we use a nonlinear problem from optimization,
 * which leads to a series of saddle point problems, each of which is solved
 * using GMRES with Vanka as preconditioner. The equation had approx. 850
 * degrees of freedom. With the non-blocked version @p SparseVanka (or @p
 * SparseBlockVanka with <tt>n_blocks==1</tt>), the following numbers of
 * iterations are needed to solve the linear system in each nonlinear step:
 * @verbatim
 *   101 68 64 53 35 21
 * @endverbatim
 *
 * With four blocks, we need the following numbers of iterations:
 * @verbatim
 *   124 88 83 66 44 28
 * @endverbatim
 * As can be seen, more iterations are needed. However, in terms of computing
 * time, the first version needs 72 seconds wall time (and 79 seconds CPU
 * time, which is more than wall time since some other parts of the program
 * were parallelized as well), while the second version needed 53 seconds wall
 * time (and 110 seconds CPU time) on a four processor machine. The total time
 * is in both cases dominated by the linear solvers. In this case, it is
 * therefore worthwhile using the blocked version of the preconditioner if
 * wall time is more important than CPU time.
 *
 * The results with the block version above were obtained with the first
 * blocking strategy and the degrees of freedom were not numbered by
 * component. Using the second strategy does not change the numbers of
 * iterations much (at most by one in each step) and they also do not change
 * when the degrees of freedom are sorted by component, while the first
 * strategy deteriorates significantly.
 */
template <typename number>
class SparseBlockVanka : public SparseVanka<number>
{
public:
  /**
   * Declare type for container size.
   */
  using size_type = types::global_dof_index;

  /**
   * Enumeration of the different methods by which the DoFs are distributed to
   * the blocks on which we are to work.
   */
  enum BlockingStrategy
  {
    /**
     * Block by index intervals.
     */
    index_intervals,
    /**
     * Block with an adaptive strategy.
     */
    adaptive
  };

  /**
   * Constructor. Pass all arguments except for @p n_blocks to the base class.
   *
   * @deprecated This constructor is deprecated. The values passed to the last
   * two arguments are ignored.
   */
  DEAL_II_DEPRECATED
  SparseBlockVanka(const SparseMatrix<number> &M,
                   const std::vector<bool> &   selected,
                   const unsigned int          n_blocks,
                   const BlockingStrategy      blocking_strategy,
                   const bool                  conserve_memory,
                   const unsigned int n_threads = MultithreadInfo::n_threads());

  /**
   * Constructor. Pass all arguments except for @p n_blocks to the base class.
   */
  SparseBlockVanka(const SparseMatrix<number> &M,
                   const std::vector<bool> &   selected,
                   const unsigned int          n_blocks,
                   const BlockingStrategy      blocking_strategy);

  /**
   * Apply the preconditioner.
   */
  template <typename number2>
  void
  vmult(Vector<number2> &dst, const Vector<number2> &src) const;

  /**
   * Determine an estimate for the memory consumption (in bytes) of this
   * object.
   */
  std::size_t
  memory_consumption() const;

private:
  /**
   * Store the number of blocks.
   */
  const unsigned int n_blocks;

  /**
   * In this field, we precompute for each block which degrees of freedom
   * belong to it. Thus, if <tt>dof_masks[i][j]==true</tt>, then DoF @p j
   * belongs to block @p i. Of course, no other <tt>dof_masks[l][j]</tt> may
   * be @p true for <tt>l!=i</tt>. This computation is done in the
   * constructor, to avoid recomputing each time the preconditioner is called.
   */
  std::vector<std::vector<bool>> dof_masks;

  /**
   * Compute the contents of the field @p dof_masks. This function is called
   * from the constructor.
   */
  void
  compute_dof_masks(const SparseMatrix<number> &M,
                    const std::vector<bool> &   selected,
                    const BlockingStrategy      blocking_strategy);
};

/*@}*/
/* ---------------------------------- Inline functions ------------------- */

#ifndef DOXYGEN

template <typename number>
inline typename SparseVanka<number>::size_type
SparseVanka<number>::m() const
{
  Assert(_m != 0, ExcNotInitialized());
  return _m;
}

template <typename number>
inline typename SparseVanka<number>::size_type
SparseVanka<number>::n() const
{
  Assert(_n != 0, ExcNotInitialized());
  return _n;
}

template <typename number>
template <typename number2>
inline void
SparseVanka<number>::Tvmult(Vector<number2> & /*dst*/,
                            const Vector<number2> & /*src*/) const
{
  AssertThrow(false, ExcNotImplemented());
}

#endif // DOXYGEN

DEAL_II_NAMESPACE_CLOSE

#endif