1 // ---------------------------------------------------------------------
2 //
3 // Copyright (C) 2018 - 2020 by the deal.II authors
4 //
5 // This file is part of the deal.II library.
6 //
7 // The deal.II library is free software; you can use it, redistribute
8 // it, and/or modify it under the terms of the GNU Lesser General
9 // Public License as published by the Free Software Foundation; either
10 // version 2.1 of the License, or (at your option) any later version.
11 // The full text of the license can be found in the file LICENSE.md at
12 // the top level directory of deal.II.
13 //
14 // ---------------------------------------------------------------------
15 
16 
17 #include <deal.II/base/conditional_ostream.h>
18 #include <deal.II/base/memory_consumption.h>
19 #include <deal.II/base/mpi.h>
20 #include <deal.II/base/multithread_info.h>
21 #include <deal.II/base/parallel.h>
22 #include <deal.II/base/utilities.h>
23 
24 #include <deal.II/matrix_free/task_info.h>
25 
26 
27 #ifdef DEAL_II_WITH_TBB
28 #  include <tbb/blocked_range.h>
29 #  include <tbb/parallel_for.h>
30 #  include <tbb/task.h>
31 #  include <tbb/task_scheduler_init.h>
32 #endif
33 
34 #include <iostream>
35 #include <set>
36 
37 DEAL_II_NAMESPACE_OPEN
38 
39 
40 
41 /*-------------------- Implementation of the matrix-free loop --------------*/
42 namespace internal
43 {
44   namespace MatrixFreeFunctions
45   {
46 #ifdef DEAL_II_WITH_TBB
47 
48     // This defines the TBB data structures that are needed to schedule the
49     // partition-partition variant
50 
51     namespace partition
52     {
53       class ActualCellWork
54       {
55       public:
ActualCellWork(MFWorkerInterface ** worker_pointer,const unsigned int partition,const TaskInfo & task_info)56         ActualCellWork(MFWorkerInterface **worker_pointer,
57                        const unsigned int  partition,
58                        const TaskInfo &    task_info)
59           : worker(nullptr)
60           , worker_pointer(worker_pointer)
61           , partition(partition)
62           , task_info(task_info)
63         {}
64 
ActualCellWork(MFWorkerInterface & worker,const unsigned int partition,const TaskInfo & task_info)65         ActualCellWork(MFWorkerInterface &worker,
66                        const unsigned int partition,
67                        const TaskInfo &   task_info)
68           : worker(&worker)
69           , worker_pointer(nullptr)
70           , partition(partition)
71           , task_info(task_info)
72         {}
73 
74         void
operator ()() const75         operator()() const
76         {
77           MFWorkerInterface *used_worker =
78             worker != nullptr ? worker : *worker_pointer;
79           Assert(used_worker != nullptr, ExcInternalError());
80           used_worker->cell(
81             std::make_pair(task_info.cell_partition_data[partition],
82                            task_info.cell_partition_data[partition + 1]));
83 
84           if (task_info.face_partition_data.empty() == false)
85             {
86               used_worker->face(
87                 std::make_pair(task_info.face_partition_data[partition],
88                                task_info.face_partition_data[partition + 1]));
89 
90               used_worker->boundary(std::make_pair(
91                 task_info.boundary_partition_data[partition],
92                 task_info.boundary_partition_data[partition + 1]));
93             }
94         }
95 
96       private:
97         MFWorkerInterface * worker;
98         MFWorkerInterface **worker_pointer;
99         const unsigned int  partition;
100         const TaskInfo &    task_info;
101       };
102 
103       class CellWork : public tbb::task
104       {
105       public:
CellWork(MFWorkerInterface & worker,const unsigned int partition,const TaskInfo & task_info,const bool is_blocked)106         CellWork(MFWorkerInterface &worker,
107                  const unsigned int partition,
108                  const TaskInfo &   task_info,
109                  const bool         is_blocked)
110           : dummy(nullptr)
111           , work(worker, partition, task_info)
112           , is_blocked(is_blocked)
113         {}
114 
115         tbb::task *
execute()116         execute() override
117         {
118           work();
119 
120           if (is_blocked == true)
121             tbb::empty_task::spawn(*dummy);
122           return nullptr;
123         }
124 
125         tbb::empty_task *dummy;
126 
127       private:
128         ActualCellWork work;
129         const bool     is_blocked;
130       };
131 
132 
133 
134       class PartitionWork : public tbb::task
135       {
136       public:
PartitionWork(MFWorkerInterface & function_in,const unsigned int partition_in,const TaskInfo & task_info_in,const bool is_blocked_in=false)137         PartitionWork(MFWorkerInterface &function_in,
138                       const unsigned int partition_in,
139                       const TaskInfo &   task_info_in,
140                       const bool         is_blocked_in = false)
141           : dummy(nullptr)
142           , function(function_in)
143           , partition(partition_in)
144           , task_info(task_info_in)
145           , is_blocked(is_blocked_in)
146         {}
147 
148         tbb::task *
execute()149         execute() override
150         {
151           tbb::empty_task *root =
152             new (tbb::task::allocate_root()) tbb::empty_task;
153           const unsigned int evens = task_info.partition_evens[partition];
154           const unsigned int odds  = task_info.partition_odds[partition];
155           const unsigned int n_blocked_workers =
156             task_info.partition_n_blocked_workers[partition];
157           const unsigned int n_workers =
158             task_info.partition_n_workers[partition];
159           std::vector<CellWork *> worker(n_workers);
160           std::vector<CellWork *> blocked_worker(n_blocked_workers);
161 
162           root->set_ref_count(evens + 1);
163           for (unsigned int j = 0; j < evens; j++)
164             {
165               worker[j] = new (root->allocate_child())
166                 CellWork(function,
167                          task_info.partition_row_index[partition] + 2 * j,
168                          task_info,
169                          false);
170               if (j > 0)
171                 {
172                   worker[j]->set_ref_count(2);
173                   blocked_worker[j - 1]->dummy =
174                     new (worker[j]->allocate_child()) tbb::empty_task;
175                   tbb::task::spawn(*blocked_worker[j - 1]);
176                 }
177               else
178                 worker[j]->set_ref_count(1);
179               if (j < evens - 1)
180                 {
181                   blocked_worker[j] = new (worker[j]->allocate_child())
182                     CellWork(function,
183                              task_info.partition_row_index[partition] + 2 * j +
184                                1,
185                              task_info,
186                              true);
187                 }
188               else
189                 {
190                   if (odds == evens)
191                     {
192                       worker[evens] = new (worker[j]->allocate_child())
193                         CellWork(function,
194                                  task_info.partition_row_index[partition] +
195                                    2 * j + 1,
196                                  task_info,
197                                  false);
198                       tbb::task::spawn(*worker[evens]);
199                     }
200                   else
201                     {
202                       tbb::empty_task *child =
203                         new (worker[j]->allocate_child()) tbb::empty_task();
204                       tbb::task::spawn(*child);
205                     }
206                 }
207             }
208 
209           root->wait_for_all();
210           root->destroy(*root);
211           if (is_blocked == true)
212             tbb::empty_task::spawn(*dummy);
213           return nullptr;
214         }
215 
216         tbb::empty_task *dummy;
217 
218       private:
219         MFWorkerInterface &function;
220         const unsigned int partition;
221         const TaskInfo &   task_info;
222         const bool         is_blocked;
223       };
224 
225     } // end of namespace partition
226 
227 
228 
229     namespace color
230     {
231       class CellWork
232       {
233       public:
CellWork(MFWorkerInterface & worker_in,const TaskInfo & task_info_in,const unsigned int partition_in)234         CellWork(MFWorkerInterface &worker_in,
235                  const TaskInfo &   task_info_in,
236                  const unsigned int partition_in)
237           : worker(worker_in)
238           , task_info(task_info_in)
239           , partition(partition_in)
240         {}
241 
242         void
operator ()(const tbb::blocked_range<unsigned int> & r) const243         operator()(const tbb::blocked_range<unsigned int> &r) const
244         {
245           const unsigned int start_index =
246             task_info.cell_partition_data[partition] +
247             task_info.block_size * r.begin();
248           const unsigned int end_index =
249             std::min(start_index + task_info.block_size * (r.end() - r.begin()),
250                      task_info.cell_partition_data[partition + 1]);
251           worker.cell(std::make_pair(start_index, end_index));
252 
253           if (task_info.face_partition_data.empty() == false)
254             {
255               AssertThrow(false, ExcNotImplemented());
256             }
257         }
258 
259       private:
260         MFWorkerInterface &worker;
261         const TaskInfo &   task_info;
262         const unsigned int partition;
263       };
264 
265 
266 
267       class PartitionWork : public tbb::task
268       {
269       public:
PartitionWork(MFWorkerInterface & worker_in,const unsigned int partition_in,const TaskInfo & task_info_in,const bool is_blocked_in)270         PartitionWork(MFWorkerInterface &worker_in,
271                       const unsigned int partition_in,
272                       const TaskInfo &   task_info_in,
273                       const bool         is_blocked_in)
274           : dummy(nullptr)
275           , worker(worker_in)
276           , partition(partition_in)
277           , task_info(task_info_in)
278           , is_blocked(is_blocked_in)
279         {}
280 
281         tbb::task *
execute()282         execute() override
283         {
284           const unsigned int n_chunks =
285             (task_info.cell_partition_data[partition + 1] -
286              task_info.cell_partition_data[partition] + task_info.block_size -
287              1) /
288             task_info.block_size;
289           parallel_for(tbb::blocked_range<unsigned int>(0, n_chunks, 1),
290                        CellWork(worker, task_info, partition));
291           if (is_blocked == true)
292             tbb::empty_task::spawn(*dummy);
293           return nullptr;
294         }
295 
296         tbb::empty_task *dummy;
297 
298       private:
299         MFWorkerInterface &worker;
300         const unsigned int partition;
301         const TaskInfo &   task_info;
302         const bool         is_blocked;
303       };
304 
305     } // end of namespace color
306 
307 
308 
309     class MPICommunication : public tbb::task
310     {
311     public:
MPICommunication(MFWorkerInterface & worker_in,const bool do_compress)312       MPICommunication(MFWorkerInterface &worker_in, const bool do_compress)
313         : worker(worker_in)
314         , do_compress(do_compress)
315       {}
316 
317       tbb::task *
execute()318       execute() override
319       {
320         if (do_compress == false)
321           worker.vector_update_ghosts_finish();
322         else
323           worker.vector_compress_start();
324         return nullptr;
325       }
326 
327     private:
328       MFWorkerInterface &worker;
329       const bool         do_compress;
330     };
331 
332 #endif // DEAL_II_WITH_TBB
333 
334 
335 
336     void
loop(MFWorkerInterface & funct) const337     TaskInfo::loop(MFWorkerInterface &funct) const
338     {
339       // If we use thread parallelism, we do not currently support to schedule
340       // pieces of updates within the loop, so this index will collect all
341       // calls in that case and work like a single complete loop over all
342       // cells
343       if (scheme != none)
344         funct.cell_loop_pre_range(numbers::invalid_unsigned_int);
345       else
346         funct.cell_loop_pre_range(
347           partition_row_index[partition_row_index.size() - 2]);
348 
349       funct.vector_update_ghosts_start();
350 
351 #ifdef DEAL_II_WITH_TBB
352 
353       if (scheme != none)
354         {
355           funct.zero_dst_vector_range(numbers::invalid_unsigned_int);
356           if (scheme == partition_partition)
357             {
358               tbb::empty_task *root =
359                 new (tbb::task::allocate_root()) tbb::empty_task;
360               root->set_ref_count(evens + 1);
361               std::vector<partition::PartitionWork *> worker(n_workers);
362               std::vector<partition::PartitionWork *> blocked_worker(
363                 n_blocked_workers);
364               MPICommunication *worker_compr =
365                 new (root->allocate_child()) MPICommunication(funct, true);
366               worker_compr->set_ref_count(1);
367               for (unsigned int j = 0; j < evens; j++)
368                 {
369                   if (j > 0)
370                     {
371                       worker[j] = new (root->allocate_child())
372                         partition::PartitionWork(funct, 2 * j, *this, false);
373                       worker[j]->set_ref_count(2);
374                       blocked_worker[j - 1]->dummy =
375                         new (worker[j]->allocate_child()) tbb::empty_task;
376                       tbb::task::spawn(*blocked_worker[j - 1]);
377                     }
378                   else
379                     {
380                       worker[j] = new (worker_compr->allocate_child())
381                         partition::PartitionWork(funct, 2 * j, *this, false);
382                       worker[j]->set_ref_count(2);
383                       MPICommunication *worker_dist =
384                         new (worker[j]->allocate_child())
385                           MPICommunication(funct, false);
386                       tbb::task::spawn(*worker_dist);
387                     }
388                   if (j < evens - 1)
389                     {
390                       blocked_worker[j] = new (worker[j]->allocate_child())
391                         partition::PartitionWork(funct, 2 * j + 1, *this, true);
392                     }
393                   else
394                     {
395                       if (odds == evens)
396                         {
397                           worker[evens] = new (worker[j]->allocate_child())
398                             partition::PartitionWork(funct,
399                                                      2 * j + 1,
400                                                      *this,
401                                                      false);
402                           tbb::task::spawn(*worker[evens]);
403                         }
404                       else
405                         {
406                           tbb::empty_task *child =
407                             new (worker[j]->allocate_child()) tbb::empty_task();
408                           tbb::task::spawn(*child);
409                         }
410                     }
411                 }
412 
413               root->wait_for_all();
414               root->destroy(*root);
415             }
416           else // end of partition-partition, start of partition-color
417             {
418               // check whether there is only one partition. if not, build up the
419               // tree of partitions
420               if (odds > 0)
421                 {
422                   tbb::empty_task *root =
423                     new (tbb::task::allocate_root()) tbb::empty_task;
424                   root->set_ref_count(evens + 1);
425                   const unsigned int n_blocked_workers =
426                     odds - (odds + evens + 1) % 2;
427                   const unsigned int n_workers =
428                     cell_partition_data.size() - 1 - n_blocked_workers;
429                   std::vector<color::PartitionWork *> worker(n_workers);
430                   std::vector<color::PartitionWork *> blocked_worker(
431                     n_blocked_workers);
432                   unsigned int      worker_index = 0, slice_index = 0;
433                   int               spawn_index_child = -2;
434                   MPICommunication *worker_compr =
435                     new (root->allocate_child()) MPICommunication(funct, true);
436                   worker_compr->set_ref_count(1);
437                   for (unsigned int part = 0;
438                        part < partition_row_index.size() - 1;
439                        part++)
440                     {
441                       if (part == 0)
442                         worker[worker_index] =
443                           new (worker_compr->allocate_child())
444                             color::PartitionWork(funct,
445                                                  slice_index,
446                                                  *this,
447                                                  false);
448                       else
449                         worker[worker_index] = new (root->allocate_child())
450                           color::PartitionWork(funct,
451                                                slice_index,
452                                                *this,
453                                                false);
454                       slice_index++;
455                       for (; slice_index < partition_row_index[part + 1];
456                            slice_index++)
457                         {
458                           worker[worker_index]->set_ref_count(1);
459                           worker_index++;
460                           worker[worker_index] =
461                             new (worker[worker_index - 1]->allocate_child())
462                               color::PartitionWork(funct,
463                                                    slice_index,
464                                                    *this,
465                                                    false);
466                         }
467                       worker[worker_index]->set_ref_count(2);
468                       if (part > 0)
469                         {
470                           blocked_worker[(part - 1) / 2]->dummy =
471                             new (worker[worker_index]->allocate_child())
472                               tbb::empty_task;
473                           worker_index++;
474                           if (spawn_index_child == -1)
475                             tbb::task::spawn(*blocked_worker[(part - 1) / 2]);
476                           else
477                             {
478                               Assert(spawn_index_child >= 0,
479                                      ExcInternalError());
480                               tbb::task::spawn(*worker[spawn_index_child]);
481                             }
482                           spawn_index_child = -2;
483                         }
484                       else
485                         {
486                           MPICommunication *worker_dist =
487                             new (worker[worker_index]->allocate_child())
488                               MPICommunication(funct, false);
489                           tbb::task::spawn(*worker_dist);
490                           worker_index++;
491                         }
492                       part += 1;
493                       if (part < partition_row_index.size() - 1)
494                         {
495                           if (part < partition_row_index.size() - 2)
496                             {
497                               blocked_worker[part / 2] =
498                                 new (worker[worker_index - 1]->allocate_child())
499                                   color::PartitionWork(funct,
500                                                        slice_index,
501                                                        *this,
502                                                        true);
503                               slice_index++;
504                               if (slice_index < partition_row_index[part + 1])
505                                 {
506                                   blocked_worker[part / 2]->set_ref_count(1);
507                                   worker[worker_index] = new (
508                                     blocked_worker[part / 2]->allocate_child())
509                                     color::PartitionWork(funct,
510                                                          slice_index,
511                                                          *this,
512                                                          false);
513                                   slice_index++;
514                                 }
515                               else
516                                 {
517                                   spawn_index_child = -1;
518                                   continue;
519                                 }
520                             }
521                           for (; slice_index < partition_row_index[part + 1];
522                                slice_index++)
523                             {
524                               if (slice_index > partition_row_index[part])
525                                 {
526                                   worker[worker_index]->set_ref_count(1);
527                                   worker_index++;
528                                 }
529                               worker[worker_index] =
530                                 new (worker[worker_index - 1]->allocate_child())
531                                   color::PartitionWork(funct,
532                                                        slice_index,
533                                                        *this,
534                                                        false);
535                             }
536                           spawn_index_child = worker_index;
537                           worker_index++;
538                         }
539                       else
540                         {
541                           tbb::empty_task *final =
542                             new (worker[worker_index - 1]->allocate_child())
543                               tbb::empty_task;
544                           tbb::task::spawn(*final);
545                           spawn_index_child = worker_index - 1;
546                         }
547                     }
548                   if (evens == odds)
549                     {
550                       Assert(spawn_index_child >= 0, ExcInternalError());
551                       tbb::task::spawn(*worker[spawn_index_child]);
552                     }
553                   root->wait_for_all();
554                   root->destroy(*root);
555                 }
556               // case when we only have one partition: this is the usual
557               // coloring scheme, and we just schedule a parallel for loop for
558               // each color
559               else
560                 {
561                   Assert(evens <= 1, ExcInternalError());
562                   funct.vector_update_ghosts_finish();
563 
564                   for (unsigned int color = 0; color < partition_row_index[1];
565                        ++color)
566                     {
567                       tbb::empty_task *root =
568                         new (tbb::task::allocate_root()) tbb::empty_task;
569                       root->set_ref_count(2);
570                       color::PartitionWork *worker =
571                         new (root->allocate_child())
572                           color::PartitionWork(funct, color, *this, false);
573                       tbb::empty_task::spawn(*worker);
574                       root->wait_for_all();
575                       root->destroy(*root);
576                     }
577 
578                   funct.vector_compress_start();
579                 }
580             }
581         }
582       else
583 #endif
584         // serial loop, go through up to three times and do the MPI transfer at
585         // the beginning/end of the second part
586         {
587           for (unsigned int part = 0; part < partition_row_index.size() - 2;
588                ++part)
589             {
590               if (part == 1)
591                 funct.vector_update_ghosts_finish();
592 
593               for (unsigned int i = partition_row_index[part];
594                    i < partition_row_index[part + 1];
595                    ++i)
596                 {
597                   funct.cell_loop_pre_range(i);
598                   funct.zero_dst_vector_range(i);
599                   AssertIndexRange(i + 1, cell_partition_data.size());
600                   if (cell_partition_data[i + 1] > cell_partition_data[i])
601                     {
602                       funct.cell(std::make_pair(cell_partition_data[i],
603                                                 cell_partition_data[i + 1]));
604                     }
605 
606                   if (face_partition_data.empty() == false)
607                     {
608                       if (face_partition_data[i + 1] > face_partition_data[i])
609                         funct.face(std::make_pair(face_partition_data[i],
610                                                   face_partition_data[i + 1]));
611                       if (boundary_partition_data[i + 1] >
612                           boundary_partition_data[i])
613                         funct.boundary(
614                           std::make_pair(boundary_partition_data[i],
615                                          boundary_partition_data[i + 1]));
616                     }
617                   funct.cell_loop_post_range(i);
618                 }
619 
620               if (part == 1)
621                 funct.vector_compress_start();
622             }
623         }
624       funct.vector_compress_finish();
625 
626       if (scheme != none)
627         funct.cell_loop_post_range(numbers::invalid_unsigned_int);
628       else
629         funct.cell_loop_post_range(
630           partition_row_index[partition_row_index.size() - 2]);
631     }
632 
633 
634 
TaskInfo()635     TaskInfo::TaskInfo()
636     {
637       clear();
638     }
639 
640 
641 
642     void
clear()643     TaskInfo::clear()
644     {
645       n_active_cells       = 0;
646       n_ghost_cells        = 0;
647       vectorization_length = 1;
648       block_size           = 0;
649       n_blocks             = 0;
650       scheme               = none;
651       partition_row_index.clear();
652       partition_row_index.resize(2);
653       cell_partition_data.clear();
654       face_partition_data.clear();
655       boundary_partition_data.clear();
656       evens             = 0;
657       odds              = 0;
658       n_blocked_workers = 0;
659       n_workers         = 0;
660       partition_evens.clear();
661       partition_odds.clear();
662       partition_n_blocked_workers.clear();
663       partition_n_workers.clear();
664       communicator = MPI_COMM_SELF;
665       my_pid       = 0;
666       n_procs      = 1;
667     }
668 
669 
670 
671     template <typename StreamType>
672     void
print_memory_statistics(StreamType & out,const std::size_t data_length) const673     TaskInfo::print_memory_statistics(StreamType &      out,
674                                       const std::size_t data_length) const
675     {
676       Utilities::MPI::MinMaxAvg memory_c =
677         Utilities::MPI::min_max_avg(1e-6 * data_length, communicator);
678       if (n_procs < 2)
679         out << memory_c.min;
680       else
681         out << memory_c.min << "/" << memory_c.avg << "/" << memory_c.max;
682       out << " MB" << std::endl;
683     }
684 
685 
686 
687     std::size_t
memory_consumption() const688     TaskInfo::memory_consumption() const
689     {
690       return (
691         sizeof(*this) +
692         MemoryConsumption::memory_consumption(partition_row_index) +
693         MemoryConsumption::memory_consumption(cell_partition_data) +
694         MemoryConsumption::memory_consumption(face_partition_data) +
695         MemoryConsumption::memory_consumption(boundary_partition_data) +
696         MemoryConsumption::memory_consumption(partition_evens) +
697         MemoryConsumption::memory_consumption(partition_odds) +
698         MemoryConsumption::memory_consumption(partition_n_blocked_workers) +
699         MemoryConsumption::memory_consumption(partition_n_workers));
700     }
701 
702 
703 
704     void
make_boundary_cells_divisible(std::vector<unsigned int> & boundary_cells)705     TaskInfo::make_boundary_cells_divisible(
706       std::vector<unsigned int> &boundary_cells)
707     {
708       // try to make the number of boundary cells divisible by the number of
709       // vectors in vectorization
710       unsigned int fillup_needed =
711         (vectorization_length - boundary_cells.size() % vectorization_length) %
712         vectorization_length;
713       if (fillup_needed > 0 && boundary_cells.size() < n_active_cells)
714         {
715           // fill additional cells into the list of boundary cells to get a
716           // balanced number. Go through the indices successively until we
717           // found enough indices
718           std::vector<unsigned int> new_boundary_cells;
719           new_boundary_cells.reserve(boundary_cells.size());
720 
721           unsigned int next_free_slot = 0, bound_index = 0;
722           while (fillup_needed > 0 && bound_index < boundary_cells.size())
723             {
724               if (next_free_slot < boundary_cells[bound_index])
725                 {
726                   // check if there are enough cells to fill with in the
727                   // current slot
728                   if (next_free_slot + fillup_needed <=
729                       boundary_cells[bound_index])
730                     {
731                       for (unsigned int j =
732                              boundary_cells[bound_index] - fillup_needed;
733                            j < boundary_cells[bound_index];
734                            ++j)
735                         new_boundary_cells.push_back(j);
736                       fillup_needed = 0;
737                     }
738                   // ok, not enough indices, so just take them all up to the
739                   // next boundary cell
740                   else
741                     {
742                       for (unsigned int j = next_free_slot;
743                            j < boundary_cells[bound_index];
744                            ++j)
745                         new_boundary_cells.push_back(j);
746                       fillup_needed -=
747                         boundary_cells[bound_index] - next_free_slot;
748                     }
749                 }
750               new_boundary_cells.push_back(boundary_cells[bound_index]);
751               next_free_slot = boundary_cells[bound_index] + 1;
752               ++bound_index;
753             }
754           while (fillup_needed > 0 &&
755                  (new_boundary_cells.size() == 0 ||
756                   new_boundary_cells.back() < n_active_cells - 1))
757             new_boundary_cells.push_back(new_boundary_cells.back() + 1);
758           while (bound_index < boundary_cells.size())
759             new_boundary_cells.push_back(boundary_cells[bound_index++]);
760 
761           boundary_cells.swap(new_boundary_cells);
762         }
763 
764       // set the number of cells
765       std::sort(boundary_cells.begin(), boundary_cells.end());
766 
767       // check that number of boundary cells is divisible by
768       // vectorization_length or that it contains all cells
769       Assert(boundary_cells.size() % vectorization_length == 0 ||
770                boundary_cells.size() == n_active_cells,
771              ExcInternalError());
772     }
773 
774 
775 
776     void
create_blocks_serial(const std::vector<unsigned int> & cells_with_comm,const unsigned int dofs_per_cell,const bool categories_are_hp,const std::vector<unsigned int> & cell_vectorization_categories,const bool cell_vectorization_categories_strict,const std::vector<unsigned int> & parent_relation,std::vector<unsigned int> & renumbering,std::vector<unsigned char> & incompletely_filled_vectorization)777     TaskInfo::create_blocks_serial(
778       const std::vector<unsigned int> &cells_with_comm,
779       const unsigned int               dofs_per_cell,
780       const bool                       categories_are_hp,
781       const std::vector<unsigned int> &cell_vectorization_categories,
782       const bool                       cell_vectorization_categories_strict,
783       const std::vector<unsigned int> &parent_relation,
784       std::vector<unsigned int> &      renumbering,
785       std::vector<unsigned char> &     incompletely_filled_vectorization)
786     {
787       // This function is decomposed into several steps to determine a good
788       // ordering that satisfies the following constraints:
789       // a. Only cells belonging to the same category (or next higher if the
790       // cell_vectorization_categories_strict is false) can be grouped into
791       // the same SIMD batch
792       // b. hp adaptive computations must form contiguous ranges for the same
793       // degree (category) in cell_partition_data
794       // c. We want to group the cells with the same parent in the same SIMD
795       // lane if possible
796       // d. The cell order should be similar to the initial one
797       // e. Form sets without MPI communication and those with to overlap
798       // communication with computation
799       //
800       // These constraints are satisfied by first grouping by the categories
801       // and, within the groups, to distinguish between cells with a parent
802       // and those without. All of this is set up with batches of cells (with
803       // padding if the size does not match). Then we define a vector of
804       // arrays where we define sorting criteria for the cell batches to
805       // satisfy the items b and d together, split by different parts to
806       // satisfy item e.
807 
808       // Give the compiler a chance to detect that vectorization_length is a
809       // power of two, which allows it to replace integer divisions by shifts
810       unsigned int vectorization_length_bits = 0;
811       unsigned int my_length                 = vectorization_length;
812       while (my_length >>= 1)
813         ++vectorization_length_bits;
814       const unsigned int n_lanes = 1 << vectorization_length_bits;
815 
816       // Step 1: find tight map of categories for not taking exceeding amounts
817       // of memory below. Sort the new categories by the numbers in the
818       // old one to ensure we respect the given rules
819       unsigned int              n_categories = 1;
820       std::vector<unsigned int> tight_category_map(n_active_cells, 0);
821       if (cell_vectorization_categories.empty() == false)
822         {
823           AssertDimension(cell_vectorization_categories.size(),
824                           n_active_cells + n_ghost_cells);
825 
826           std::set<unsigned int> used_categories;
827           for (unsigned int i = 0; i < n_active_cells; ++i)
828             used_categories.insert(cell_vectorization_categories[i]);
829           std::vector<unsigned int> used_categories_vector(
830             used_categories.size());
831           n_categories = 0;
832           for (const auto &it : used_categories)
833             used_categories_vector[n_categories++] = it;
834           for (unsigned int i = 0; i < n_active_cells; ++i)
835             {
836               const unsigned int index =
837                 std::lower_bound(used_categories_vector.begin(),
838                                  used_categories_vector.end(),
839                                  cell_vectorization_categories[i]) -
840                 used_categories_vector.begin();
841               AssertIndexRange(index, used_categories_vector.size());
842               tight_category_map[i] = index;
843             }
844         }
845 
846       // Step 2: Sort the cells by the category. If we want to fill up the
847       // ranges in vectorization, promote some of the cells to a higher
848       // category
849       std::vector<std::vector<unsigned int>> renumbering_category(n_categories);
850       for (unsigned int i = 0; i < n_active_cells; ++i)
851         renumbering_category[tight_category_map[i]].push_back(i);
852 
853       if (cell_vectorization_categories_strict == false && n_categories > 1)
854         for (unsigned int j = n_categories - 1; j > 0; --j)
855           {
856             unsigned int lower_index = j - 1;
857             while (renumbering_category[j].size() % n_lanes)
858               {
859                 while (renumbering_category[j].size() % n_lanes &&
860                        !renumbering_category[lower_index].empty())
861                   {
862                     renumbering_category[j].push_back(
863                       renumbering_category[lower_index].back());
864                     renumbering_category[lower_index].pop_back();
865                   }
866                 if (lower_index == 0)
867                   break;
868                 else
869                   --lower_index;
870               }
871           }
872 
873       // Step 3: Use the parent relation to find a good grouping of cells. To
874       // do this, we first put cells of each category defined above into two
875       // bins, those which we know can be grouped together by the given parent
876       // relation and those which cannot
877       std::vector<unsigned int> temporary_numbering;
878       temporary_numbering.reserve(n_active_cells +
879                                   (n_lanes - 1) * n_categories);
880       const unsigned int n_cells_per_parent =
881         std::count(parent_relation.begin(), parent_relation.end(), 0);
882       std::vector<unsigned int> category_size;
883       for (unsigned int j = 0; j < n_categories; ++j)
884         {
885           std::vector<std::pair<unsigned int, unsigned int>> grouped_cells;
886           std::vector<unsigned int>                          other_cells;
887           for (const unsigned int cell : renumbering_category[j])
888             if (parent_relation.empty() ||
889                 parent_relation[cell] == numbers::invalid_unsigned_int)
890               other_cells.push_back(cell);
891             else
892               grouped_cells.emplace_back(parent_relation[cell], cell);
893 
894           // Compute the number of cells per group
895           std::sort(grouped_cells.begin(), grouped_cells.end());
896           std::vector<unsigned int> n_cells_per_group;
897           unsigned int              length = 0;
898           for (unsigned int i = 0; i < grouped_cells.size(); ++i, ++length)
899             if (i > 0 && grouped_cells[i].first != grouped_cells[i - 1].first)
900               {
901                 n_cells_per_group.push_back(length);
902                 length = 0;
903               }
904           if (length > 0)
905             n_cells_per_group.push_back(length);
906 
907           // Move groups that do not have the complete size (due to
908           // categories) to the 'other_cells'. The cells with correct group
909           // size are immediately appended to the temporary cell numbering
910           auto group_it = grouped_cells.begin();
911           for (unsigned int length : n_cells_per_group)
912             if (length < n_cells_per_parent)
913               for (unsigned int j = 0; j < length; ++j)
914                 other_cells.push_back((group_it++)->second);
915             else
916               {
917                 // we should not have more cells in a group than in the first
918                 // check we did above
919                 AssertDimension(length, n_cells_per_parent);
920                 for (unsigned int j = 0; j < length; ++j)
921                   temporary_numbering.push_back((group_it++)->second);
922               }
923 
924           // Sort the remaining cells and append them as well
925           std::sort(other_cells.begin(), other_cells.end());
926           temporary_numbering.insert(temporary_numbering.end(),
927                                      other_cells.begin(),
928                                      other_cells.end());
929 
930           while (temporary_numbering.size() % n_lanes != 0)
931             temporary_numbering.push_back(numbers::invalid_unsigned_int);
932 
933           category_size.push_back(temporary_numbering.size());
934         }
935 
936       // Step 4: Identify the batches with cells marked as "comm"
937       std::vector<bool> batch_with_comm(temporary_numbering.size() / n_lanes,
938                                         false);
939       std::vector<unsigned int> temporary_numbering_inverse(n_active_cells);
940       for (unsigned int i = 0; i < temporary_numbering.size(); ++i)
941         if (temporary_numbering[i] != numbers::invalid_unsigned_int)
942           temporary_numbering_inverse[temporary_numbering[i]] = i;
943       for (const unsigned int cell : cells_with_comm)
944         batch_with_comm[temporary_numbering_inverse[cell] / n_lanes] = true;
945 
946       // Step 5: Sort the batches of cells by their last cell index to get
947       // good locality, assuming that the initial cell order is of good
948       // locality. In case we have hp calculations with categories, we need to
949       // sort also by the category.
950       std::vector<std::array<unsigned int, 3>> batch_order;
951       std::vector<std::array<unsigned int, 3>> batch_order_comm;
952       for (unsigned int i = 0; i < temporary_numbering.size(); i += n_lanes)
953         {
954           unsigned int max_index = 0;
955           for (unsigned int j = 0; j < n_lanes; ++j)
956             if (temporary_numbering[i + j] < numbers::invalid_unsigned_int)
957               max_index = std::max(temporary_numbering[i + j], max_index);
958           const unsigned int category_hp =
959             categories_are_hp ?
960               std::upper_bound(category_size.begin(), category_size.end(), i) -
961                 category_size.begin() :
962               0;
963           const std::array<unsigned int, 3> next{{category_hp, max_index, i}};
964           if (batch_with_comm[i / n_lanes])
965             batch_order_comm.emplace_back(next);
966           else
967             batch_order.emplace_back(next);
968         }
969 
970       std::sort(batch_order.begin(), batch_order.end());
971       std::sort(batch_order_comm.begin(), batch_order_comm.end());
972 
973       // Step 6: Put the cells with communication in the middle of the cell
974       // range. For the MPI case, we need three groups to enable overlap for
975       // communication and computation (part before comm, part with comm, part
976       // after comm), whereas we need one for the other case. And in each
977       // case, we allow for a slot of "ghosted" cells.
978       std::vector<unsigned int> blocks;
979       if (n_procs == 1)
980         {
981           if (batch_order.empty())
982             std::swap(batch_order_comm, batch_order);
983           Assert(batch_order_comm.empty(), ExcInternalError());
984           partition_row_index.resize(3);
985           blocks = {0, static_cast<unsigned int>(batch_order.size())};
986         }
987       else
988         {
989           partition_row_index.resize(5);
990           const unsigned int comm_begin = batch_order.size() / 2;
991           batch_order.insert(batch_order.begin() + comm_begin,
992                              batch_order_comm.begin(),
993                              batch_order_comm.end());
994           const unsigned int comm_end = comm_begin + batch_order_comm.size();
995           const unsigned int end      = batch_order.size();
996           blocks                      = {0, comm_begin, comm_end, end};
997         }
998 
999       // Step 7: Fill in the data by batches for the locally owned cells.
1000       const unsigned int n_cell_batches = batch_order.size();
1001       const unsigned int n_ghost_batches =
1002         (n_ghost_cells + n_lanes - 1) / n_lanes;
1003       incompletely_filled_vectorization.resize(n_cell_batches +
1004                                                n_ghost_batches);
1005 
1006       cell_partition_data.clear();
1007       cell_partition_data.resize(1, 0);
1008 
1009       renumbering.clear();
1010       renumbering.resize(n_active_cells + n_ghost_cells,
1011                          numbers::invalid_unsigned_int);
1012 
1013       unsigned int counter = 0;
1014       for (unsigned int block = 0; block < blocks.size() - 1; ++block)
1015         {
1016           const unsigned int grain_size =
1017             std::max((2048U / dofs_per_cell) / 8 * 4, 2U);
1018           for (unsigned int k = blocks[block]; k < blocks[block + 1];
1019                k += grain_size)
1020             cell_partition_data.push_back(
1021               std::min(k + grain_size, blocks[block + 1]));
1022           partition_row_index[block + 1] = cell_partition_data.size() - 1;
1023 
1024           // Set the numbering according to the reordered temporary one
1025           for (unsigned int k = blocks[block]; k < blocks[block + 1]; ++k)
1026             {
1027               const unsigned int pos = batch_order[k][2];
1028               unsigned int       j   = 0;
1029               for (; j < n_lanes && temporary_numbering[pos + j] !=
1030                                       numbers::invalid_unsigned_int;
1031                    ++j)
1032                 renumbering[counter++] = temporary_numbering[pos + j];
1033               if (j < n_lanes)
1034                 incompletely_filled_vectorization[k] = j;
1035             }
1036         }
1037       AssertDimension(counter, n_active_cells);
1038 
1039       // Step 8: Treat the ghost cells
1040       for (unsigned int cell = n_active_cells;
1041            cell < n_active_cells + n_ghost_cells;
1042            ++cell)
1043         {
1044           if (!cell_vectorization_categories.empty())
1045             AssertDimension(cell_vectorization_categories[cell],
1046                             cell_vectorization_categories[n_active_cells]);
1047           renumbering[cell] = cell;
1048         }
1049       if (n_ghost_cells % n_lanes)
1050         incompletely_filled_vectorization.back() = n_ghost_cells % n_lanes;
1051       cell_partition_data.push_back(n_cell_batches + n_ghost_batches);
1052       partition_row_index.back() = cell_partition_data.size() - 1;
1053 
1054 #ifdef DEBUG
1055       std::vector<unsigned int> renumber_cpy(renumbering);
1056       std::sort(renumber_cpy.begin(), renumber_cpy.end());
1057       for (unsigned int i = 0; i < renumber_cpy.size(); ++i)
1058         AssertDimension(i, renumber_cpy[i]);
1059 #endif
1060     }
1061 
1062 
1063 
1064     void
initial_setup_blocks_tasks(const std::vector<unsigned int> & boundary_cells,std::vector<unsigned int> & renumbering,std::vector<unsigned char> & incompletely_filled_vectorization)1065     TaskInfo::initial_setup_blocks_tasks(
1066       const std::vector<unsigned int> &boundary_cells,
1067       std::vector<unsigned int> &      renumbering,
1068       std::vector<unsigned char> &     incompletely_filled_vectorization)
1069     {
1070       const unsigned int n_macro_cells =
1071         (n_active_cells + vectorization_length - 1) / vectorization_length;
1072       const unsigned int n_ghost_slots =
1073         (n_ghost_cells + vectorization_length - 1) / vectorization_length;
1074       incompletely_filled_vectorization.resize(n_macro_cells + n_ghost_slots);
1075       if (n_macro_cells * vectorization_length > n_active_cells)
1076         incompletely_filled_vectorization[n_macro_cells - 1] =
1077           vectorization_length -
1078           (n_macro_cells * vectorization_length - n_active_cells);
1079       if (n_ghost_slots * vectorization_length > n_ghost_cells)
1080         incompletely_filled_vectorization[n_macro_cells + n_ghost_slots - 1] =
1081           vectorization_length -
1082           (n_ghost_slots * vectorization_length - n_ghost_cells);
1083 
1084       std::vector<unsigned int> reverse_numbering(
1085         n_active_cells, numbers::invalid_unsigned_int);
1086       for (unsigned int j = 0; j < boundary_cells.size(); ++j)
1087         reverse_numbering[boundary_cells[j]] = j;
1088       unsigned int counter = boundary_cells.size();
1089       for (unsigned int j = 0; j < n_active_cells; ++j)
1090         if (reverse_numbering[j] == numbers::invalid_unsigned_int)
1091           reverse_numbering[j] = counter++;
1092 
1093       AssertDimension(counter, n_active_cells);
1094       renumbering = Utilities::invert_permutation(reverse_numbering);
1095 
1096       for (unsigned int j = n_active_cells; j < n_active_cells + n_ghost_cells;
1097            ++j)
1098         renumbering.push_back(j);
1099 
1100       // TODO: might be able to simplify this code by not relying on the cell
1101       // partition data while computing the thread graph
1102       cell_partition_data.clear();
1103       cell_partition_data.push_back(0);
1104       if (n_procs > 1)
1105         {
1106           const unsigned int n_macro_boundary_cells =
1107             (boundary_cells.size() + vectorization_length - 1) /
1108             vectorization_length;
1109           cell_partition_data.push_back(
1110             (n_macro_cells - n_macro_boundary_cells) / 2);
1111           cell_partition_data.push_back(cell_partition_data[1] +
1112                                         n_macro_boundary_cells);
1113         }
1114       else
1115         AssertDimension(boundary_cells.size(), 0);
1116       cell_partition_data.push_back(n_macro_cells);
1117       cell_partition_data.push_back(cell_partition_data.back() + n_ghost_slots);
1118       partition_row_index.resize(n_procs > 1 ? 4 : 2);
1119       partition_row_index[0] = 0;
1120       partition_row_index[1] = 1;
1121       if (n_procs > 1)
1122         {
1123           partition_row_index[2] = 2;
1124           partition_row_index[3] = 3;
1125         }
1126     }
1127 
1128 
1129 
1130     void
guess_block_size(const unsigned int dofs_per_cell)1131     TaskInfo::guess_block_size(const unsigned int dofs_per_cell)
1132     {
1133       // user did not say a positive number, so we have to guess
1134       if (block_size == 0)
1135         {
1136           // we would like to have enough work to do, so as first guess, try
1137           // to get 16 times as many chunks as we have threads on the system.
1138           block_size = n_active_cells / (MultithreadInfo::n_threads() * 16 *
1139                                          vectorization_length);
1140 
1141           // if there are too few degrees of freedom per cell, need to
1142           // increase the block size
1143           const unsigned int minimum_parallel_grain_size = 200;
1144           if (dofs_per_cell * block_size < minimum_parallel_grain_size)
1145             block_size = (minimum_parallel_grain_size / dofs_per_cell + 1);
1146           if (dofs_per_cell * block_size > 10000)
1147             block_size /= 4;
1148 
1149           block_size =
1150             1 << static_cast<unsigned int>(std::log2(block_size + 1));
1151         }
1152       if (block_size > n_active_cells)
1153         block_size = std::max(1U, n_active_cells);
1154     }
1155 
1156 
1157 
1158     void
make_thread_graph_partition_color(DynamicSparsityPattern & connectivity_large,std::vector<unsigned int> & renumbering,std::vector<unsigned char> & irregular_cells,const bool)1159     TaskInfo::make_thread_graph_partition_color(
1160       DynamicSparsityPattern &    connectivity_large,
1161       std::vector<unsigned int> & renumbering,
1162       std::vector<unsigned char> &irregular_cells,
1163       const bool)
1164     {
1165       const unsigned int n_macro_cells = *(cell_partition_data.end() - 2);
1166       if (n_macro_cells == 0)
1167         return;
1168 
1169       Assert(vectorization_length > 0, ExcInternalError());
1170 
1171       unsigned int partition = 0, counter = 0;
1172 
1173       // Create connectivity graph for blocks based on connectivity graph for
1174       // cells.
1175       DynamicSparsityPattern connectivity(n_blocks, n_blocks);
1176       make_connectivity_cells_to_blocks(irregular_cells,
1177                                         connectivity_large,
1178                                         connectivity);
1179 
1180       // Create cell-block  partitioning.
1181 
1182       // For each block of cells, this variable saves to which partitions the
1183       // block belongs. Initialize all to -1 to mark them as not yet assigned
1184       // a partition.
1185       std::vector<unsigned int> cell_partition(n_blocks,
1186                                                numbers::invalid_unsigned_int);
1187 
1188       // In element j of this variable, one puts the old number of the block
1189       // that should be the jth block in the new numeration.
1190       std::vector<unsigned int> partition_list(n_blocks, 0);
1191       std::vector<unsigned int> partition_color_list(n_blocks, 0);
1192 
1193       // This vector points to the start of each partition.
1194       std::vector<unsigned int> partition_size(2, 0);
1195 
1196       // blocking_connectivity = true;
1197 
1198       // The cluster_size in make_partitioning defines that the no. of cells
1199       // in each partition should be a multiple of cluster_size.
1200       unsigned int cluster_size = 1;
1201 
1202       // Make the partitioning of the first layer of the blocks of cells.
1203       make_partitioning(connectivity,
1204                         cluster_size,
1205                         cell_partition,
1206                         partition_list,
1207                         partition_size,
1208                         partition);
1209 
1210       // Color the cells within each partition
1211       make_coloring_within_partitions_pre_blocked(connectivity,
1212                                                   partition,
1213                                                   cell_partition,
1214                                                   partition_list,
1215                                                   partition_size,
1216                                                   partition_color_list);
1217 
1218       partition_list = renumbering;
1219 
1220 #ifdef DEBUG
1221       // in debug mode, check that the partition color list is one-to-one
1222       {
1223         std::vector<unsigned int> sorted_pc_list(partition_color_list);
1224         std::sort(sorted_pc_list.begin(), sorted_pc_list.end());
1225         for (unsigned int i = 0; i < sorted_pc_list.size(); ++i)
1226           Assert(sorted_pc_list[i] == i, ExcInternalError());
1227       }
1228 #endif
1229 
1230       // set the start list for each block and compute the renumbering of
1231       // cells
1232       std::vector<unsigned int>  block_start(n_macro_cells + 1);
1233       std::vector<unsigned char> irregular(n_macro_cells);
1234 
1235       unsigned int mcell_start = 0;
1236       block_start[0]           = 0;
1237       for (unsigned int block = 0; block < n_blocks; block++)
1238         {
1239           block_start[block + 1] = block_start[block];
1240           for (unsigned int mcell = mcell_start;
1241                mcell < std::min(mcell_start + block_size, n_macro_cells);
1242                ++mcell)
1243             {
1244               unsigned int n_comp = (irregular_cells[mcell] > 0) ?
1245                                       irregular_cells[mcell] :
1246                                       vectorization_length;
1247               block_start[block + 1] += n_comp;
1248               ++counter;
1249             }
1250           mcell_start += block_size;
1251         }
1252       counter                    = 0;
1253       unsigned int counter_macro = 0;
1254       unsigned int block_size_last =
1255         n_macro_cells - block_size * (n_blocks - 1);
1256       if (block_size_last == 0)
1257         block_size_last = block_size;
1258 
1259       unsigned int tick = 0;
1260       for (unsigned int block = 0; block < n_blocks; block++)
1261         {
1262           unsigned int present_block = partition_color_list[block];
1263           for (unsigned int cell = block_start[present_block];
1264                cell < block_start[present_block + 1];
1265                ++cell)
1266             renumbering[counter++] = partition_list[cell];
1267           unsigned int this_block_size =
1268             (present_block == n_blocks - 1) ? block_size_last : block_size;
1269 
1270           // Also re-compute the content of cell_partition_data to
1271           // contain the numbers of cells, not blocks
1272           if (cell_partition_data[tick] == block)
1273             cell_partition_data[tick++] = counter_macro;
1274 
1275           for (unsigned int j = 0; j < this_block_size; j++)
1276             irregular[counter_macro++] =
1277               irregular_cells[present_block * block_size + j];
1278         }
1279       AssertDimension(tick + 1, cell_partition_data.size());
1280       cell_partition_data.back() = counter_macro;
1281 
1282       irregular_cells.swap(irregular);
1283       AssertDimension(counter, n_active_cells);
1284       AssertDimension(counter_macro, n_macro_cells);
1285 
1286       // check that the renumbering is one-to-one
1287 #ifdef DEBUG
1288       {
1289         std::vector<unsigned int> sorted_renumbering(renumbering);
1290         std::sort(sorted_renumbering.begin(), sorted_renumbering.end());
1291         for (unsigned int i = 0; i < sorted_renumbering.size(); ++i)
1292           Assert(sorted_renumbering[i] == i, ExcInternalError());
1293       }
1294 #endif
1295 
1296 
1297       update_task_info(
1298         partition); // Actually sets too much for partition color case
1299 
1300       AssertDimension(cell_partition_data.back(), n_macro_cells);
1301     }
1302 
1303 
1304 
1305     void
make_thread_graph(const std::vector<unsigned int> & cell_active_fe_index,DynamicSparsityPattern & connectivity,std::vector<unsigned int> & renumbering,std::vector<unsigned char> & irregular_cells,const bool hp_bool)1306     TaskInfo::make_thread_graph(
1307       const std::vector<unsigned int> &cell_active_fe_index,
1308       DynamicSparsityPattern &         connectivity,
1309       std::vector<unsigned int> &      renumbering,
1310       std::vector<unsigned char> &     irregular_cells,
1311       const bool                       hp_bool)
1312     {
1313       const unsigned int n_macro_cells = *(cell_partition_data.end() - 2);
1314       if (n_macro_cells == 0)
1315         return;
1316 
1317       Assert(vectorization_length > 0, ExcInternalError());
1318 
1319       // if we want to block before partitioning, create connectivity graph
1320       // for blocks based on connectivity graph for cells.
1321       DynamicSparsityPattern connectivity_blocks(n_blocks, n_blocks);
1322       make_connectivity_cells_to_blocks(irregular_cells,
1323                                         connectivity,
1324                                         connectivity_blocks);
1325 
1326       unsigned int n_blocks = 0;
1327       if (scheme == partition_color ||
1328           scheme == color) // blocking_connectivity == true
1329         n_blocks = this->n_blocks;
1330       else
1331         n_blocks = n_active_cells;
1332 
1333       // For each block of cells, this variable saves to which partitions the
1334       // block belongs. Initialize all to -1 to mark them as not yet assigned
1335       // a partition.
1336       std::vector<unsigned int> cell_partition(n_blocks,
1337                                                numbers::invalid_unsigned_int);
1338 
1339       // In element j of this variable, one puts the old number (but after
1340       // renumbering according to the input renumbering) of the block that
1341       // should be the jth block in the new numeration.
1342       std::vector<unsigned int> partition_list(n_blocks, 0);
1343       std::vector<unsigned int> partition_2layers_list(n_blocks, 0);
1344 
1345       // This vector points to the start of each partition.
1346       std::vector<unsigned int> partition_size(2, 0);
1347 
1348       unsigned int partition = 0;
1349 
1350       // Within the partitions we want to be able to block for the case that
1351       // we do not block already in the connectivity. The cluster_size in
1352       // make_partitioning defines that the no. of cells in each partition
1353       // should be a multiple of cluster_size.
1354       unsigned int cluster_size = 1;
1355       if (scheme == partition_partition)
1356         cluster_size = block_size * vectorization_length;
1357 
1358       // Make the partitioning of the first layer of the blocks of cells.
1359       if (scheme == partition_color || scheme == color)
1360         make_partitioning(connectivity_blocks,
1361                           cluster_size,
1362                           cell_partition,
1363                           partition_list,
1364                           partition_size,
1365                           partition);
1366       else
1367         make_partitioning(connectivity,
1368                           cluster_size,
1369                           cell_partition,
1370                           partition_list,
1371                           partition_size,
1372                           partition);
1373 
1374       // Partition or color second layer
1375       if (scheme == partition_partition)
1376 
1377         {
1378           // Partition within partitions.
1379           make_partitioning_within_partitions_post_blocked(
1380             connectivity,
1381             cell_active_fe_index,
1382             partition,
1383             cluster_size,
1384             hp_bool,
1385             cell_partition,
1386             partition_list,
1387             partition_size,
1388             partition_2layers_list,
1389             irregular_cells);
1390         }
1391       else if (scheme == partition_color || scheme == color)
1392         {
1393           make_coloring_within_partitions_pre_blocked(connectivity_blocks,
1394                                                       partition,
1395                                                       cell_partition,
1396                                                       partition_list,
1397                                                       partition_size,
1398                                                       partition_2layers_list);
1399         }
1400 
1401         // in debug mode, check that the partition_2layers_list is one-to-one
1402 #ifdef DEBUG
1403       {
1404         std::vector<unsigned int> sorted_pc_list(partition_2layers_list);
1405         std::sort(sorted_pc_list.begin(), sorted_pc_list.end());
1406         for (unsigned int i = 0; i < sorted_pc_list.size(); ++i)
1407           Assert(sorted_pc_list[i] == i, ExcInternalError());
1408       }
1409 #endif
1410 
1411       // Set the new renumbering
1412       std::vector<unsigned int> renumbering_in(n_active_cells, 0);
1413       renumbering_in.swap(renumbering);
1414       if (scheme == partition_partition) // blocking_connectivity == false
1415         {
1416           // This is the simple case. The renumbering is just a combination of
1417           // the renumbering that we were given as an input and the
1418           // renumbering of partition/coloring given in partition_2layers_list
1419           for (unsigned int j = 0; j < renumbering.size(); j++)
1420             renumbering[j] = renumbering_in[partition_2layers_list[j]];
1421           // Account for the ghost cells, finally.
1422           for (unsigned int i = 0; i < n_ghost_cells; ++i)
1423             renumbering.push_back(i + n_active_cells);
1424         }
1425       else
1426         {
1427           // set the start list for each block and compute the renumbering of
1428           // cells
1429           std::vector<unsigned int>  block_start(n_macro_cells + 1);
1430           std::vector<unsigned char> irregular(n_macro_cells);
1431 
1432           unsigned int counter     = 0;
1433           unsigned int mcell_start = 0;
1434           block_start[0]           = 0;
1435           for (unsigned int block = 0; block < n_blocks; block++)
1436             {
1437               block_start[block + 1] = block_start[block];
1438               for (unsigned int mcell = mcell_start;
1439                    mcell < std::min(mcell_start + block_size, n_macro_cells);
1440                    ++mcell)
1441                 {
1442                   unsigned int n_comp = (irregular_cells[mcell] > 0) ?
1443                                           irregular_cells[mcell] :
1444                                           vectorization_length;
1445                   block_start[block + 1] += n_comp;
1446                   ++counter;
1447                 }
1448               mcell_start += block_size;
1449             }
1450           counter                    = 0;
1451           unsigned int counter_macro = 0;
1452           unsigned int block_size_last =
1453             n_macro_cells - block_size * (n_blocks - 1);
1454           if (block_size_last == 0)
1455             block_size_last = block_size;
1456 
1457           unsigned int tick = 0;
1458           for (unsigned int block = 0; block < n_blocks; block++)
1459             {
1460               unsigned int present_block = partition_2layers_list[block];
1461               for (unsigned int cell = block_start[present_block];
1462                    cell < block_start[present_block + 1];
1463                    ++cell)
1464                 renumbering[counter++] = renumbering_in[cell];
1465               unsigned int this_block_size =
1466                 (present_block == n_blocks - 1) ? block_size_last : block_size;
1467 
1468               // Also re-compute the content of cell_partition_data to
1469               // contain the numbers of cells, not blocks
1470               if (cell_partition_data[tick] == block)
1471                 cell_partition_data[tick++] = counter_macro;
1472 
1473               for (unsigned int j = 0; j < this_block_size; j++)
1474                 irregular[counter_macro++] =
1475                   irregular_cells[present_block * block_size + j];
1476             }
1477           AssertDimension(tick + 1, cell_partition_data.size());
1478           cell_partition_data.back() = counter_macro;
1479 
1480           irregular_cells.swap(irregular);
1481           AssertDimension(counter, n_active_cells);
1482           AssertDimension(counter_macro, n_macro_cells);
1483           // check that the renumbering is one-to-one
1484 #ifdef DEBUG
1485           {
1486             std::vector<unsigned int> sorted_renumbering(renumbering);
1487             std::sort(sorted_renumbering.begin(), sorted_renumbering.end());
1488             for (unsigned int i = 0; i < sorted_renumbering.size(); ++i)
1489               Assert(sorted_renumbering[i] == i, ExcInternalError());
1490           }
1491 #endif
1492         }
1493 
1494       // Update the task_info with the more information for the thread graph.
1495       update_task_info(partition);
1496     }
1497 
1498 
1499 
1500     void
make_thread_graph_partition_partition(const std::vector<unsigned int> & cell_active_fe_index,DynamicSparsityPattern & connectivity,std::vector<unsigned int> & renumbering,std::vector<unsigned char> & irregular_cells,const bool hp_bool)1501     TaskInfo::make_thread_graph_partition_partition(
1502       const std::vector<unsigned int> &cell_active_fe_index,
1503       DynamicSparsityPattern &         connectivity,
1504       std::vector<unsigned int> &      renumbering,
1505       std::vector<unsigned char> &     irregular_cells,
1506       const bool                       hp_bool)
1507     {
1508       const unsigned int n_macro_cells = *(cell_partition_data.end() - 2);
1509       if (n_macro_cells == 0)
1510         return;
1511 
1512       const unsigned int cluster_size = block_size * vectorization_length;
1513 
1514       // Create cell-block  partitioning.
1515 
1516       // For each block of cells, this variable saves to which partitions the
1517       // block belongs. Initialize all to n_macro_cells to mark them as not
1518       // yet assigned a partition.
1519       std::vector<unsigned int> cell_partition(n_active_cells,
1520                                                numbers::invalid_unsigned_int);
1521 
1522 
1523       // In element j of this variable, one puts the old number of the block
1524       // that should be the jth block in the new numeration.
1525       std::vector<unsigned int> partition_list(n_active_cells, 0);
1526       std::vector<unsigned int> partition_partition_list(n_active_cells, 0);
1527 
1528       // This vector points to the start of each partition.
1529       std::vector<unsigned int> partition_size(2, 0);
1530 
1531       unsigned int partition = 0;
1532       // Here, we do not block inside the connectivity graph
1533       // blocking_connectivity = false;
1534 
1535       // Make the partitioning of the first layer of the blocks of cells.
1536       make_partitioning(connectivity,
1537                         cluster_size,
1538                         cell_partition,
1539                         partition_list,
1540                         partition_size,
1541                         partition);
1542 
1543       // Partition within partitions.
1544       make_partitioning_within_partitions_post_blocked(connectivity,
1545                                                        cell_active_fe_index,
1546                                                        partition,
1547                                                        cluster_size,
1548                                                        hp_bool,
1549                                                        cell_partition,
1550                                                        partition_list,
1551                                                        partition_size,
1552                                                        partition_partition_list,
1553                                                        irregular_cells);
1554 
1555       partition_list.swap(renumbering);
1556 
1557       for (unsigned int j = 0; j < renumbering.size(); j++)
1558         renumbering[j] = partition_list[partition_partition_list[j]];
1559 
1560       for (unsigned int i = 0; i < n_ghost_cells; ++i)
1561         renumbering.push_back(i + n_active_cells);
1562 
1563       update_task_info(partition);
1564     }
1565 
1566 
1567 
1568     void
make_connectivity_cells_to_blocks(const std::vector<unsigned char> & irregular_cells,const DynamicSparsityPattern & connectivity_cells,DynamicSparsityPattern & connectivity_blocks) const1569     TaskInfo::make_connectivity_cells_to_blocks(
1570       const std::vector<unsigned char> &irregular_cells,
1571       const DynamicSparsityPattern &    connectivity_cells,
1572       DynamicSparsityPattern &          connectivity_blocks) const
1573     {
1574       std::vector<std::vector<unsigned int>> cell_blocks(n_blocks);
1575       std::vector<unsigned int>              touched_cells(n_active_cells);
1576       unsigned int                           cell = 0;
1577       for (unsigned int i = 0, mcell = 0; i < n_blocks; ++i)
1578         {
1579           for (unsigned int c = 0;
1580                c < block_size && mcell < *(cell_partition_data.end() - 2);
1581                ++c, ++mcell)
1582             {
1583               unsigned int ncomp = (irregular_cells[mcell] > 0) ?
1584                                      irregular_cells[mcell] :
1585                                      vectorization_length;
1586               for (unsigned int c = 0; c < ncomp; ++c, ++cell)
1587                 {
1588                   cell_blocks[i].push_back(cell);
1589                   touched_cells[cell] = i;
1590                 }
1591             }
1592         }
1593       AssertDimension(cell, n_active_cells);
1594       for (unsigned int i = 0; i < cell_blocks.size(); ++i)
1595         for (unsigned int col = 0; col < cell_blocks[i].size(); ++col)
1596           {
1597             for (DynamicSparsityPattern::iterator it =
1598                    connectivity_cells.begin(cell_blocks[i][col]);
1599                  it != connectivity_cells.end(cell_blocks[i][col]);
1600                  ++it)
1601               {
1602                 if (touched_cells[it->column()] != i)
1603                   connectivity_blocks.add(i, touched_cells[it->column()]);
1604               }
1605           }
1606     }
1607 
1608 
1609 
1610     // Function to create partitioning on the second layer within each
1611     // partition. Version without preblocking.
1612     void
make_partitioning_within_partitions_post_blocked(const DynamicSparsityPattern & connectivity,const std::vector<unsigned int> & cell_active_fe_index,const unsigned int partition,const unsigned int cluster_size,const bool hp_bool,const std::vector<unsigned int> & cell_partition,const std::vector<unsigned int> & partition_list,const std::vector<unsigned int> & partition_size,std::vector<unsigned int> & partition_partition_list,std::vector<unsigned char> & irregular_cells)1613     TaskInfo::make_partitioning_within_partitions_post_blocked(
1614       const DynamicSparsityPattern &   connectivity,
1615       const std::vector<unsigned int> &cell_active_fe_index,
1616       const unsigned int               partition,
1617       const unsigned int               cluster_size,
1618       const bool                       hp_bool,
1619       const std::vector<unsigned int> &cell_partition,
1620       const std::vector<unsigned int> &partition_list,
1621       const std::vector<unsigned int> &partition_size,
1622       std::vector<unsigned int> &      partition_partition_list,
1623       std::vector<unsigned char> &     irregular_cells)
1624     {
1625       const unsigned int n_macro_cells = *(cell_partition_data.end() - 2);
1626       const unsigned int n_ghost_slots =
1627         *(cell_partition_data.end() - 1) - n_macro_cells;
1628 
1629       // List of cells in previous partition
1630       std::vector<unsigned int> neighbor_list;
1631       // List of cells in current partition for use as neighbors in next
1632       // partition
1633       std::vector<unsigned int> neighbor_neighbor_list;
1634 
1635       std::vector<unsigned int> renumbering(n_active_cells);
1636 
1637       irregular_cells.back() = 0;
1638       irregular_cells.resize(n_active_cells + n_ghost_slots);
1639 
1640       unsigned int max_fe_index = 0;
1641       for (const unsigned int fe_index : cell_active_fe_index)
1642         max_fe_index = std::max(fe_index, max_fe_index);
1643 
1644       Assert(!hp_bool || cell_active_fe_index.size() == n_active_cells,
1645              ExcInternalError());
1646 
1647       {
1648         unsigned int n_macro_cells_before = 0;
1649         // Create partitioning within partitions.
1650 
1651         // For each block of cells, this variable saves to which partitions
1652         // the block belongs. Initialize all to n_macro_cells to mark them as
1653         // not yet assigned a partition.
1654         std::vector<unsigned int> cell_partition_l2(
1655           n_active_cells, numbers::invalid_unsigned_int);
1656         partition_row_index.clear();
1657         partition_row_index.resize(partition + 1, 0);
1658         cell_partition_data.resize(1, 0);
1659 
1660         unsigned int counter = 0;
1661         unsigned int missing_macros;
1662         for (unsigned int part = 0; part < partition; ++part)
1663           {
1664             neighbor_neighbor_list.resize(0);
1665             neighbor_list.resize(0);
1666             bool         work              = true;
1667             unsigned int partition_l2      = 0;
1668             unsigned int start_up          = partition_size[part];
1669             unsigned int partition_counter = 0;
1670             while (work)
1671               {
1672                 if (neighbor_list.size() == 0)
1673                   {
1674                     work              = false;
1675                     partition_counter = 0;
1676                     for (unsigned int j = start_up;
1677                          j < partition_size[part + 1];
1678                          ++j)
1679                       if (cell_partition[partition_list[j]] == part &&
1680                           cell_partition_l2[partition_list[j]] ==
1681                             numbers::invalid_unsigned_int)
1682                         {
1683                           start_up          = j;
1684                           work              = true;
1685                           partition_counter = 1;
1686                           // To start up, set the start_up cell to partition
1687                           // and list all its neighbors.
1688                           AssertIndexRange(start_up, partition_size[part + 1]);
1689                           cell_partition_l2[partition_list[start_up]] =
1690                             partition_l2;
1691                           neighbor_neighbor_list.push_back(
1692                             partition_list[start_up]);
1693                           partition_partition_list[counter++] =
1694                             partition_list[start_up];
1695                           start_up++;
1696                           break;
1697                         }
1698                   }
1699                 else
1700                   {
1701                     partition_counter = 0;
1702                     for (const unsigned int neighbor : neighbor_list)
1703                       {
1704                         Assert(cell_partition[neighbor] == part,
1705                                ExcInternalError());
1706                         Assert(cell_partition_l2[neighbor] == partition_l2 - 1,
1707                                ExcInternalError());
1708                         auto       neighbor_it = connectivity.begin(neighbor);
1709                         const auto end_it      = connectivity.end(neighbor);
1710                         for (; neighbor_it != end_it; ++neighbor_it)
1711                           {
1712                             if (cell_partition[neighbor_it->column()] == part &&
1713                                 cell_partition_l2[neighbor_it->column()] ==
1714                                   numbers::invalid_unsigned_int)
1715                               {
1716                                 cell_partition_l2[neighbor_it->column()] =
1717                                   partition_l2;
1718                                 neighbor_neighbor_list.push_back(
1719                                   neighbor_it->column());
1720                                 partition_partition_list[counter++] =
1721                                   neighbor_it->column();
1722                                 partition_counter++;
1723                               }
1724                           }
1725                       }
1726                   }
1727                 if (partition_counter > 0)
1728                   {
1729                     int index_before = neighbor_neighbor_list.size(),
1730                         index        = index_before;
1731                     {
1732                       // put the cells into separate lists for each FE index
1733                       // within one partition-partition
1734                       missing_macros = 0;
1735                       std::vector<unsigned int> remaining_per_macro_cell(
1736                         max_fe_index + 1);
1737                       std::vector<std::vector<unsigned int>>
1738                                    renumbering_fe_index;
1739                       unsigned int cell;
1740                       bool         filled = true;
1741                       if (hp_bool == true)
1742                         {
1743                           renumbering_fe_index.resize(max_fe_index + 1);
1744                           for (cell = counter - partition_counter;
1745                                cell < counter;
1746                                ++cell)
1747                             {
1748                               renumbering_fe_index
1749                                 [cell_active_fe_index.empty() ?
1750                                    0 :
1751                                    cell_active_fe_index
1752                                      [partition_partition_list[cell]]]
1753                                   .push_back(partition_partition_list[cell]);
1754                             }
1755                           // check how many more cells are needed in the lists
1756                           for (unsigned int j = 0; j < max_fe_index + 1; j++)
1757                             {
1758                               remaining_per_macro_cell[j] =
1759                                 renumbering_fe_index[j].size() %
1760                                 vectorization_length;
1761                               if (remaining_per_macro_cell[j] != 0)
1762                                 filled = false;
1763                               missing_macros +=
1764                                 ((renumbering_fe_index[j].size() +
1765                                   vectorization_length - 1) /
1766                                  vectorization_length);
1767                             }
1768                         }
1769                       else
1770                         {
1771                           remaining_per_macro_cell.resize(1);
1772                           remaining_per_macro_cell[0] =
1773                             partition_counter % vectorization_length;
1774                           missing_macros =
1775                             partition_counter / vectorization_length;
1776                           if (remaining_per_macro_cell[0] != 0)
1777                             {
1778                               filled = false;
1779                               missing_macros++;
1780                             }
1781                         }
1782                       missing_macros =
1783                         cluster_size - (missing_macros % cluster_size);
1784 
1785                       // now we realized that there are some cells missing.
1786                       while (missing_macros > 0 || filled == false)
1787                         {
1788                           if (index == 0)
1789                             {
1790                               index = neighbor_neighbor_list.size();
1791                               if (index == index_before)
1792                                 {
1793                                   if (missing_macros != 0)
1794                                     {
1795                                       neighbor_neighbor_list.resize(0);
1796                                     }
1797                                   start_up--;
1798                                   break; // not connected - start again
1799                                 }
1800                               index_before = index;
1801                             }
1802                           index--;
1803                           unsigned int additional =
1804                             neighbor_neighbor_list[index];
1805 
1806                           // go through the neighbors of the last cell in the
1807                           // current partition and check if we find some to
1808                           // fill up with.
1809                           DynamicSparsityPattern::iterator neighbor =
1810                                                              connectivity.begin(
1811                                                                additional),
1812                                                            end =
1813                                                              connectivity.end(
1814                                                                additional);
1815                           for (; neighbor != end; ++neighbor)
1816                             {
1817                               if (cell_partition[neighbor->column()] == part &&
1818                                   cell_partition_l2[neighbor->column()] ==
1819                                     numbers::invalid_unsigned_int)
1820                                 {
1821                                   unsigned int this_index = 0;
1822                                   if (hp_bool == true)
1823                                     this_index =
1824                                       cell_active_fe_index.empty() ?
1825                                         0 :
1826                                         cell_active_fe_index[neighbor
1827                                                                ->column()];
1828 
1829                                   // Only add this cell if we need more macro
1830                                   // cells in the current block or if there is
1831                                   // a macro cell with the FE index that is
1832                                   // not yet fully populated
1833                                   if (missing_macros > 0 ||
1834                                       remaining_per_macro_cell[this_index] > 0)
1835                                     {
1836                                       cell_partition_l2[neighbor->column()] =
1837                                         partition_l2;
1838                                       neighbor_neighbor_list.push_back(
1839                                         neighbor->column());
1840                                       if (hp_bool == true)
1841                                         renumbering_fe_index[this_index]
1842                                           .push_back(neighbor->column());
1843                                       partition_partition_list[counter] =
1844                                         neighbor->column();
1845                                       counter++;
1846                                       partition_counter++;
1847                                       if (remaining_per_macro_cell
1848                                               [this_index] == 0 &&
1849                                           missing_macros > 0)
1850                                         missing_macros--;
1851                                       remaining_per_macro_cell[this_index]++;
1852                                       if (remaining_per_macro_cell
1853                                             [this_index] ==
1854                                           vectorization_length)
1855                                         {
1856                                           remaining_per_macro_cell[this_index] =
1857                                             0;
1858                                         }
1859                                       if (missing_macros == 0)
1860                                         {
1861                                           filled = true;
1862                                           for (unsigned int fe_ind = 0;
1863                                                fe_ind < max_fe_index + 1;
1864                                                ++fe_ind)
1865                                             if (remaining_per_macro_cell
1866                                                   [fe_ind] != 0)
1867                                               filled = false;
1868                                         }
1869                                       if (filled == true)
1870                                         break;
1871                                     }
1872                                 }
1873                             }
1874                         }
1875                       if (hp_bool == true)
1876                         {
1877                           // set the renumbering according to their active FE
1878                           // index within one partition-partition which was
1879                           // implicitly assumed above
1880                           cell = counter - partition_counter;
1881                           for (unsigned int j = 0; j < max_fe_index + 1; j++)
1882                             {
1883                               for (const unsigned int jj :
1884                                    renumbering_fe_index[j])
1885                                 renumbering[cell++] = jj;
1886                               if (renumbering_fe_index[j].size() %
1887                                     vectorization_length !=
1888                                   0)
1889                                 irregular_cells[renumbering_fe_index[j].size() /
1890                                                   vectorization_length +
1891                                                 n_macro_cells_before] =
1892                                   renumbering_fe_index[j].size() %
1893                                   vectorization_length;
1894                               n_macro_cells_before +=
1895                                 (renumbering_fe_index[j].size() +
1896                                  vectorization_length - 1) /
1897                                 vectorization_length;
1898                               renumbering_fe_index[j].resize(0);
1899                             }
1900                         }
1901                       else
1902                         {
1903                           n_macro_cells_before +=
1904                             partition_counter / vectorization_length;
1905                           if (partition_counter % vectorization_length != 0)
1906                             {
1907                               irregular_cells[n_macro_cells_before] =
1908                                 partition_counter % vectorization_length;
1909                               n_macro_cells_before++;
1910                             }
1911                         }
1912                     }
1913                     cell_partition_data.push_back(n_macro_cells_before);
1914                     partition_l2++;
1915                   }
1916                 neighbor_list = neighbor_neighbor_list;
1917                 neighbor_neighbor_list.resize(0);
1918               }
1919             partition_row_index[part + 1] =
1920               partition_row_index[part] + partition_l2;
1921           }
1922       }
1923       if (hp_bool == true)
1924         {
1925           partition_partition_list.swap(renumbering);
1926         }
1927     }
1928 
1929 
1930 
1931     // Function to create coloring on the second layer within each partition.
1932     // Version assumes preblocking.
1933     void
make_coloring_within_partitions_pre_blocked(const DynamicSparsityPattern & connectivity,const unsigned int partition,const std::vector<unsigned int> & cell_partition,const std::vector<unsigned int> & partition_list,const std::vector<unsigned int> & partition_size,std::vector<unsigned int> & partition_color_list)1934     TaskInfo::make_coloring_within_partitions_pre_blocked(
1935       const DynamicSparsityPattern &   connectivity,
1936       const unsigned int               partition,
1937       const std::vector<unsigned int> &cell_partition,
1938       const std::vector<unsigned int> &partition_list,
1939       const std::vector<unsigned int> &partition_size,
1940       std::vector<unsigned int> &      partition_color_list)
1941     {
1942       const unsigned int n_macro_cells = *(cell_partition_data.end() - 2);
1943       std::vector<unsigned int> cell_color(n_blocks, n_macro_cells);
1944       std::vector<bool>         color_finder;
1945 
1946       partition_row_index.resize(partition + 1);
1947       cell_partition_data.clear();
1948       unsigned int color_counter = 0, index_counter = 0;
1949       for (unsigned int part = 0; part < partition; part++)
1950         {
1951           partition_row_index[part] = index_counter;
1952           unsigned int max_color    = 0;
1953           for (unsigned int k = partition_size[part];
1954                k < partition_size[part + 1];
1955                k++)
1956             {
1957               unsigned int cell        = partition_list[k];
1958               unsigned int n_neighbors = connectivity.row_length(cell);
1959 
1960               // In the worst case, each neighbor has a different color. So we
1961               // find at least one available color between 0 and n_neighbors.
1962               color_finder.resize(n_neighbors + 1);
1963               for (unsigned int j = 0; j <= n_neighbors; ++j)
1964                 color_finder[j] = true;
1965               DynamicSparsityPattern::iterator neighbor =
1966                                                  connectivity.begin(cell),
1967                                                end = connectivity.end(cell);
1968               for (; neighbor != end; ++neighbor)
1969                 {
1970                   // Mark the color that a neighbor within the partition has
1971                   // as taken
1972                   if (cell_partition[neighbor->column()] == part &&
1973                       cell_color[neighbor->column()] <= n_neighbors)
1974                     color_finder[cell_color[neighbor->column()]] = false;
1975                 }
1976               // Choose the smallest color that is not taken for the block
1977               cell_color[cell] = 0;
1978               while (color_finder[cell_color[cell]] == false)
1979                 cell_color[cell]++;
1980               if (cell_color[cell] > max_color)
1981                 max_color = cell_color[cell];
1982             }
1983           // Reorder within partition: First, all blocks that belong the 0 and
1984           // then so on until those with color max (Note that the smaller the
1985           // number the larger the partition)
1986           for (unsigned int color = 0; color <= max_color; color++)
1987             {
1988               cell_partition_data.push_back(color_counter);
1989               index_counter++;
1990               for (unsigned int k = partition_size[part];
1991                    k < partition_size[part + 1];
1992                    k++)
1993                 {
1994                   unsigned int cell = partition_list[k];
1995                   if (cell_color[cell] == color)
1996                     {
1997                       partition_color_list[color_counter++] = cell;
1998                     }
1999                 }
2000             }
2001         }
2002       cell_partition_data.push_back(n_blocks);
2003       partition_row_index[partition] = index_counter;
2004       AssertDimension(color_counter, n_blocks);
2005     }
2006 
2007 
2008     // Function to create partitioning on the first layer.
2009     void
make_partitioning(const DynamicSparsityPattern & connectivity,const unsigned int cluster_size,std::vector<unsigned int> & cell_partition,std::vector<unsigned int> & partition_list,std::vector<unsigned int> & partition_size,unsigned int & partition) const2010     TaskInfo::make_partitioning(const DynamicSparsityPattern &connectivity,
2011                                 const unsigned int            cluster_size,
2012                                 std::vector<unsigned int> &   cell_partition,
2013                                 std::vector<unsigned int> &   partition_list,
2014                                 std::vector<unsigned int> &   partition_size,
2015                                 unsigned int &                partition) const
2016 
2017     {
2018       // For each block of cells, this variable saves to which partitions the
2019       // block belongs. Initialize all to n_macro_cells to mark them as not
2020       // yet assigned a partition.
2021       // std::vector<unsigned int> cell_partition (n_active_cells,
2022       //                                          numbers::invalid_unsigned_int);
2023       // List of cells in previous partition
2024       std::vector<unsigned int> neighbor_list;
2025       // List of cells in current partition for use as neighbors in next
2026       // partition
2027       std::vector<unsigned int> neighbor_neighbor_list;
2028 
2029       // In element j of this variable, one puts the old number of the block
2030       // that should be the jth block in the new numeration.
2031       // std::vector<unsigned int> partition_list(n_active_cells,0);
2032 
2033       // This vector points to the start of each partition.
2034       // std::vector<unsigned int> partition_size(2,0);
2035 
2036       partition            = 0;
2037       unsigned int counter = 0;
2038       unsigned int start_nonboundary =
2039         cell_partition_data.size() == 5 ?
2040           vectorization_length *
2041             (cell_partition_data[2] - cell_partition_data[1]) :
2042           0;
2043 
2044       const unsigned int n_macro_cells = *(cell_partition_data.end() - 2);
2045       if (n_macro_cells == 0)
2046         return;
2047       if (scheme == color)
2048         start_nonboundary = n_macro_cells;
2049       if (scheme == partition_color ||
2050           scheme == color) // blocking_connectivity == true
2051         start_nonboundary = ((start_nonboundary + block_size - 1) / block_size);
2052       unsigned int n_blocks;
2053       if (scheme == partition_color ||
2054           scheme == color) // blocking_connectivity == true
2055         n_blocks = this->n_blocks;
2056       else
2057         n_blocks = n_active_cells;
2058 
2059       if (start_nonboundary > n_blocks)
2060         start_nonboundary = n_blocks;
2061 
2062 
2063       unsigned int start_up  = 0;
2064       bool         work      = true;
2065       unsigned int remainder = cluster_size;
2066 
2067       // this performs a classical breath-first search in the connectivity
2068       // graph of the cells under the restriction that the size of the
2069       // partitions should be a multiple of the given block size
2070       while (work)
2071         {
2072           // put the cells with neighbors on remote MPI processes up front
2073           if (start_nonboundary > 0)
2074             {
2075               for (unsigned int cell = 0; cell < start_nonboundary; ++cell)
2076                 {
2077                   const unsigned int cell_nn = cell;
2078                   cell_partition[cell_nn]    = partition;
2079                   neighbor_list.push_back(cell_nn);
2080                   partition_list[counter++] = cell_nn;
2081                   partition_size.back()++;
2082                 }
2083               start_nonboundary = 0;
2084               remainder -= (start_nonboundary % cluster_size);
2085               if (remainder == cluster_size)
2086                 remainder = 0;
2087             }
2088           else
2089             {
2090               // To start up, set the start_up cell to partition and list all
2091               // its neighbors.
2092               cell_partition[start_up] = partition;
2093               neighbor_list.push_back(start_up);
2094               partition_list[counter++] = start_up;
2095               partition_size.back()++;
2096               start_up++;
2097               remainder--;
2098               if (remainder == cluster_size)
2099                 remainder = 0;
2100             }
2101           int index_before = neighbor_list.size(), index = index_before,
2102               index_stop = 0;
2103           while (remainder > 0)
2104             {
2105               if (index == index_stop)
2106                 {
2107                   index = neighbor_list.size();
2108                   if (index == index_before)
2109                     {
2110                       neighbor_list.resize(0);
2111                       goto not_connect;
2112                     }
2113                   index_stop   = index_before;
2114                   index_before = index;
2115                 }
2116               index--;
2117               unsigned int additional = neighbor_list[index];
2118               DynamicSparsityPattern::iterator neighbor =
2119                                                  connectivity.begin(additional),
2120                                                end =
2121                                                  connectivity.end(additional);
2122               for (; neighbor != end; ++neighbor)
2123                 {
2124                   if (cell_partition[neighbor->column()] ==
2125                       numbers::invalid_unsigned_int)
2126                     {
2127                       partition_size.back()++;
2128                       cell_partition[neighbor->column()] = partition;
2129                       neighbor_list.push_back(neighbor->column());
2130                       partition_list[counter++] = neighbor->column();
2131                       remainder--;
2132                       if (remainder == 0)
2133                         break;
2134                     }
2135                 }
2136             }
2137 
2138           while (neighbor_list.size() > 0)
2139             {
2140               partition++;
2141 
2142               // counter for number of cells so far in current partition
2143               unsigned int partition_counter = 0;
2144 
2145               // Mark the start of the new partition
2146               partition_size.push_back(partition_size.back());
2147 
2148               // Loop through the list of cells in previous partition and put
2149               // all their neighbors in current partition
2150               for (const unsigned int cell : neighbor_list)
2151                 {
2152                   Assert(cell_partition[cell] == partition - 1,
2153                          ExcInternalError());
2154                   auto       neighbor = connectivity.begin(cell);
2155                   const auto end      = connectivity.end(cell);
2156                   for (; neighbor != end; ++neighbor)
2157                     {
2158                       if (cell_partition[neighbor->column()] ==
2159                           numbers::invalid_unsigned_int)
2160                         {
2161                           partition_size.back()++;
2162                           cell_partition[neighbor->column()] = partition;
2163 
2164                           // collect the cells of the current partition for
2165                           // use as neighbors in next partition
2166                           neighbor_neighbor_list.push_back(neighbor->column());
2167                           partition_list[counter++] = neighbor->column();
2168                           partition_counter++;
2169                         }
2170                     }
2171                 }
2172               remainder = cluster_size - (partition_counter % cluster_size);
2173               if (remainder == cluster_size)
2174                 remainder = 0;
2175               int index_stop   = 0;
2176               int index_before = neighbor_neighbor_list.size(),
2177                   index        = index_before;
2178               while (remainder > 0)
2179                 {
2180                   if (index == index_stop)
2181                     {
2182                       index = neighbor_neighbor_list.size();
2183                       if (index == index_before)
2184                         {
2185                           neighbor_neighbor_list.resize(0);
2186                           break;
2187                         }
2188                       index_stop   = index_before;
2189                       index_before = index;
2190                     }
2191                   index--;
2192                   unsigned int additional = neighbor_neighbor_list[index];
2193                   DynamicSparsityPattern::iterator neighbor =
2194                                                      connectivity.begin(
2195                                                        additional),
2196                                                    end = connectivity.end(
2197                                                      additional);
2198                   for (; neighbor != end; ++neighbor)
2199                     {
2200                       if (cell_partition[neighbor->column()] ==
2201                           numbers::invalid_unsigned_int)
2202                         {
2203                           partition_size.back()++;
2204                           cell_partition[neighbor->column()] = partition;
2205                           neighbor_neighbor_list.push_back(neighbor->column());
2206                           partition_list[counter++] = neighbor->column();
2207                           remainder--;
2208                           if (remainder == 0)
2209                             break;
2210                         }
2211                     }
2212                 }
2213 
2214               neighbor_list = neighbor_neighbor_list;
2215               neighbor_neighbor_list.resize(0);
2216             }
2217         not_connect:
2218           // One has to check if the graph is not connected so we have to find
2219           // another partition.
2220           work = false;
2221           for (unsigned int j = start_up; j < n_blocks; ++j)
2222             if (cell_partition[j] == numbers::invalid_unsigned_int)
2223               {
2224                 start_up = j;
2225                 work     = true;
2226                 if (remainder == 0)
2227                   remainder = cluster_size;
2228                 break;
2229               }
2230         }
2231       if (remainder != 0)
2232         partition++;
2233 
2234       AssertDimension(partition_size[partition], n_blocks);
2235     }
2236 
2237 
2238     void
update_task_info(const unsigned int partition)2239     TaskInfo::update_task_info(const unsigned int partition)
2240     {
2241       evens             = (partition + 1) / 2;
2242       odds              = partition / 2;
2243       n_blocked_workers = odds - (odds + evens + 1) % 2;
2244       n_workers         = evens + odds - n_blocked_workers;
2245       // From here only used for partition partition option.
2246       partition_evens.resize(partition);
2247       partition_odds.resize(partition);
2248       partition_n_blocked_workers.resize(partition);
2249       partition_n_workers.resize(partition);
2250       for (unsigned int part = 0; part < partition; part++)
2251         {
2252           partition_evens[part] =
2253             (partition_row_index[part + 1] - partition_row_index[part] + 1) / 2;
2254           partition_odds[part] =
2255             (partition_row_index[part + 1] - partition_row_index[part]) / 2;
2256           partition_n_blocked_workers[part] =
2257             partition_odds[part] -
2258             (partition_odds[part] + partition_evens[part] + 1) % 2;
2259           partition_n_workers[part] = partition_evens[part] +
2260                                       partition_odds[part] -
2261                                       partition_n_blocked_workers[part];
2262         }
2263     }
2264   } // namespace MatrixFreeFunctions
2265 } // namespace internal
2266 
2267 
2268 
2269 // explicit instantiations of template functions
2270 template void
2271 internal::MatrixFreeFunctions::TaskInfo::print_memory_statistics<std::ostream>(
2272   std::ostream &,
2273   const std::size_t) const;
2274 template void
2275 internal::MatrixFreeFunctions::TaskInfo::print_memory_statistics<
2276   ConditionalOStream>(ConditionalOStream &, const std::size_t) const;
2277 
2278 
2279 DEAL_II_NAMESPACE_CLOSE
2280