1 // --------------------------------------------------------------------- 2 // 3 // Copyright (C) 2018 - 2020 by the deal.II authors 4 // 5 // This file is part of the deal.II library. 6 // 7 // The deal.II library is free software; you can use it, redistribute 8 // it, and/or modify it under the terms of the GNU Lesser General 9 // Public License as published by the Free Software Foundation; either 10 // version 2.1 of the License, or (at your option) any later version. 11 // The full text of the license can be found in the file LICENSE.md at 12 // the top level directory of deal.II. 13 // 14 // --------------------------------------------------------------------- 15 16 17 #include <deal.II/base/conditional_ostream.h> 18 #include <deal.II/base/memory_consumption.h> 19 #include <deal.II/base/mpi.h> 20 #include <deal.II/base/multithread_info.h> 21 #include <deal.II/base/parallel.h> 22 #include <deal.II/base/utilities.h> 23 24 #include <deal.II/matrix_free/task_info.h> 25 26 27 #ifdef DEAL_II_WITH_TBB 28 # include <tbb/blocked_range.h> 29 # include <tbb/parallel_for.h> 30 # include <tbb/task.h> 31 # include <tbb/task_scheduler_init.h> 32 #endif 33 34 #include <iostream> 35 #include <set> 36 37 DEAL_II_NAMESPACE_OPEN 38 39 40 41 /*-------------------- Implementation of the matrix-free loop --------------*/ 42 namespace internal 43 { 44 namespace MatrixFreeFunctions 45 { 46 #ifdef DEAL_II_WITH_TBB 47 48 // This defines the TBB data structures that are needed to schedule the 49 // partition-partition variant 50 51 namespace partition 52 { 53 class ActualCellWork 54 { 55 public: ActualCellWork(MFWorkerInterface ** worker_pointer,const unsigned int partition,const TaskInfo & task_info)56 ActualCellWork(MFWorkerInterface **worker_pointer, 57 const unsigned int partition, 58 const TaskInfo & task_info) 59 : worker(nullptr) 60 , worker_pointer(worker_pointer) 61 , partition(partition) 62 , task_info(task_info) 63 {} 64 ActualCellWork(MFWorkerInterface & worker,const unsigned int partition,const TaskInfo & task_info)65 ActualCellWork(MFWorkerInterface &worker, 66 const unsigned int partition, 67 const TaskInfo & task_info) 68 : worker(&worker) 69 , worker_pointer(nullptr) 70 , partition(partition) 71 , task_info(task_info) 72 {} 73 74 void operator ()() const75 operator()() const 76 { 77 MFWorkerInterface *used_worker = 78 worker != nullptr ? worker : *worker_pointer; 79 Assert(used_worker != nullptr, ExcInternalError()); 80 used_worker->cell( 81 std::make_pair(task_info.cell_partition_data[partition], 82 task_info.cell_partition_data[partition + 1])); 83 84 if (task_info.face_partition_data.empty() == false) 85 { 86 used_worker->face( 87 std::make_pair(task_info.face_partition_data[partition], 88 task_info.face_partition_data[partition + 1])); 89 90 used_worker->boundary(std::make_pair( 91 task_info.boundary_partition_data[partition], 92 task_info.boundary_partition_data[partition + 1])); 93 } 94 } 95 96 private: 97 MFWorkerInterface * worker; 98 MFWorkerInterface **worker_pointer; 99 const unsigned int partition; 100 const TaskInfo & task_info; 101 }; 102 103 class CellWork : public tbb::task 104 { 105 public: CellWork(MFWorkerInterface & worker,const unsigned int partition,const TaskInfo & task_info,const bool is_blocked)106 CellWork(MFWorkerInterface &worker, 107 const unsigned int partition, 108 const TaskInfo & task_info, 109 const bool is_blocked) 110 : dummy(nullptr) 111 , work(worker, partition, task_info) 112 , is_blocked(is_blocked) 113 {} 114 115 tbb::task * execute()116 execute() override 117 { 118 work(); 119 120 if (is_blocked == true) 121 tbb::empty_task::spawn(*dummy); 122 return nullptr; 123 } 124 125 tbb::empty_task *dummy; 126 127 private: 128 ActualCellWork work; 129 const bool is_blocked; 130 }; 131 132 133 134 class PartitionWork : public tbb::task 135 { 136 public: PartitionWork(MFWorkerInterface & function_in,const unsigned int partition_in,const TaskInfo & task_info_in,const bool is_blocked_in=false)137 PartitionWork(MFWorkerInterface &function_in, 138 const unsigned int partition_in, 139 const TaskInfo & task_info_in, 140 const bool is_blocked_in = false) 141 : dummy(nullptr) 142 , function(function_in) 143 , partition(partition_in) 144 , task_info(task_info_in) 145 , is_blocked(is_blocked_in) 146 {} 147 148 tbb::task * execute()149 execute() override 150 { 151 tbb::empty_task *root = 152 new (tbb::task::allocate_root()) tbb::empty_task; 153 const unsigned int evens = task_info.partition_evens[partition]; 154 const unsigned int odds = task_info.partition_odds[partition]; 155 const unsigned int n_blocked_workers = 156 task_info.partition_n_blocked_workers[partition]; 157 const unsigned int n_workers = 158 task_info.partition_n_workers[partition]; 159 std::vector<CellWork *> worker(n_workers); 160 std::vector<CellWork *> blocked_worker(n_blocked_workers); 161 162 root->set_ref_count(evens + 1); 163 for (unsigned int j = 0; j < evens; j++) 164 { 165 worker[j] = new (root->allocate_child()) 166 CellWork(function, 167 task_info.partition_row_index[partition] + 2 * j, 168 task_info, 169 false); 170 if (j > 0) 171 { 172 worker[j]->set_ref_count(2); 173 blocked_worker[j - 1]->dummy = 174 new (worker[j]->allocate_child()) tbb::empty_task; 175 tbb::task::spawn(*blocked_worker[j - 1]); 176 } 177 else 178 worker[j]->set_ref_count(1); 179 if (j < evens - 1) 180 { 181 blocked_worker[j] = new (worker[j]->allocate_child()) 182 CellWork(function, 183 task_info.partition_row_index[partition] + 2 * j + 184 1, 185 task_info, 186 true); 187 } 188 else 189 { 190 if (odds == evens) 191 { 192 worker[evens] = new (worker[j]->allocate_child()) 193 CellWork(function, 194 task_info.partition_row_index[partition] + 195 2 * j + 1, 196 task_info, 197 false); 198 tbb::task::spawn(*worker[evens]); 199 } 200 else 201 { 202 tbb::empty_task *child = 203 new (worker[j]->allocate_child()) tbb::empty_task(); 204 tbb::task::spawn(*child); 205 } 206 } 207 } 208 209 root->wait_for_all(); 210 root->destroy(*root); 211 if (is_blocked == true) 212 tbb::empty_task::spawn(*dummy); 213 return nullptr; 214 } 215 216 tbb::empty_task *dummy; 217 218 private: 219 MFWorkerInterface &function; 220 const unsigned int partition; 221 const TaskInfo & task_info; 222 const bool is_blocked; 223 }; 224 225 } // end of namespace partition 226 227 228 229 namespace color 230 { 231 class CellWork 232 { 233 public: CellWork(MFWorkerInterface & worker_in,const TaskInfo & task_info_in,const unsigned int partition_in)234 CellWork(MFWorkerInterface &worker_in, 235 const TaskInfo & task_info_in, 236 const unsigned int partition_in) 237 : worker(worker_in) 238 , task_info(task_info_in) 239 , partition(partition_in) 240 {} 241 242 void operator ()(const tbb::blocked_range<unsigned int> & r) const243 operator()(const tbb::blocked_range<unsigned int> &r) const 244 { 245 const unsigned int start_index = 246 task_info.cell_partition_data[partition] + 247 task_info.block_size * r.begin(); 248 const unsigned int end_index = 249 std::min(start_index + task_info.block_size * (r.end() - r.begin()), 250 task_info.cell_partition_data[partition + 1]); 251 worker.cell(std::make_pair(start_index, end_index)); 252 253 if (task_info.face_partition_data.empty() == false) 254 { 255 AssertThrow(false, ExcNotImplemented()); 256 } 257 } 258 259 private: 260 MFWorkerInterface &worker; 261 const TaskInfo & task_info; 262 const unsigned int partition; 263 }; 264 265 266 267 class PartitionWork : public tbb::task 268 { 269 public: PartitionWork(MFWorkerInterface & worker_in,const unsigned int partition_in,const TaskInfo & task_info_in,const bool is_blocked_in)270 PartitionWork(MFWorkerInterface &worker_in, 271 const unsigned int partition_in, 272 const TaskInfo & task_info_in, 273 const bool is_blocked_in) 274 : dummy(nullptr) 275 , worker(worker_in) 276 , partition(partition_in) 277 , task_info(task_info_in) 278 , is_blocked(is_blocked_in) 279 {} 280 281 tbb::task * execute()282 execute() override 283 { 284 const unsigned int n_chunks = 285 (task_info.cell_partition_data[partition + 1] - 286 task_info.cell_partition_data[partition] + task_info.block_size - 287 1) / 288 task_info.block_size; 289 parallel_for(tbb::blocked_range<unsigned int>(0, n_chunks, 1), 290 CellWork(worker, task_info, partition)); 291 if (is_blocked == true) 292 tbb::empty_task::spawn(*dummy); 293 return nullptr; 294 } 295 296 tbb::empty_task *dummy; 297 298 private: 299 MFWorkerInterface &worker; 300 const unsigned int partition; 301 const TaskInfo & task_info; 302 const bool is_blocked; 303 }; 304 305 } // end of namespace color 306 307 308 309 class MPICommunication : public tbb::task 310 { 311 public: MPICommunication(MFWorkerInterface & worker_in,const bool do_compress)312 MPICommunication(MFWorkerInterface &worker_in, const bool do_compress) 313 : worker(worker_in) 314 , do_compress(do_compress) 315 {} 316 317 tbb::task * execute()318 execute() override 319 { 320 if (do_compress == false) 321 worker.vector_update_ghosts_finish(); 322 else 323 worker.vector_compress_start(); 324 return nullptr; 325 } 326 327 private: 328 MFWorkerInterface &worker; 329 const bool do_compress; 330 }; 331 332 #endif // DEAL_II_WITH_TBB 333 334 335 336 void loop(MFWorkerInterface & funct) const337 TaskInfo::loop(MFWorkerInterface &funct) const 338 { 339 // If we use thread parallelism, we do not currently support to schedule 340 // pieces of updates within the loop, so this index will collect all 341 // calls in that case and work like a single complete loop over all 342 // cells 343 if (scheme != none) 344 funct.cell_loop_pre_range(numbers::invalid_unsigned_int); 345 else 346 funct.cell_loop_pre_range( 347 partition_row_index[partition_row_index.size() - 2]); 348 349 funct.vector_update_ghosts_start(); 350 351 #ifdef DEAL_II_WITH_TBB 352 353 if (scheme != none) 354 { 355 funct.zero_dst_vector_range(numbers::invalid_unsigned_int); 356 if (scheme == partition_partition) 357 { 358 tbb::empty_task *root = 359 new (tbb::task::allocate_root()) tbb::empty_task; 360 root->set_ref_count(evens + 1); 361 std::vector<partition::PartitionWork *> worker(n_workers); 362 std::vector<partition::PartitionWork *> blocked_worker( 363 n_blocked_workers); 364 MPICommunication *worker_compr = 365 new (root->allocate_child()) MPICommunication(funct, true); 366 worker_compr->set_ref_count(1); 367 for (unsigned int j = 0; j < evens; j++) 368 { 369 if (j > 0) 370 { 371 worker[j] = new (root->allocate_child()) 372 partition::PartitionWork(funct, 2 * j, *this, false); 373 worker[j]->set_ref_count(2); 374 blocked_worker[j - 1]->dummy = 375 new (worker[j]->allocate_child()) tbb::empty_task; 376 tbb::task::spawn(*blocked_worker[j - 1]); 377 } 378 else 379 { 380 worker[j] = new (worker_compr->allocate_child()) 381 partition::PartitionWork(funct, 2 * j, *this, false); 382 worker[j]->set_ref_count(2); 383 MPICommunication *worker_dist = 384 new (worker[j]->allocate_child()) 385 MPICommunication(funct, false); 386 tbb::task::spawn(*worker_dist); 387 } 388 if (j < evens - 1) 389 { 390 blocked_worker[j] = new (worker[j]->allocate_child()) 391 partition::PartitionWork(funct, 2 * j + 1, *this, true); 392 } 393 else 394 { 395 if (odds == evens) 396 { 397 worker[evens] = new (worker[j]->allocate_child()) 398 partition::PartitionWork(funct, 399 2 * j + 1, 400 *this, 401 false); 402 tbb::task::spawn(*worker[evens]); 403 } 404 else 405 { 406 tbb::empty_task *child = 407 new (worker[j]->allocate_child()) tbb::empty_task(); 408 tbb::task::spawn(*child); 409 } 410 } 411 } 412 413 root->wait_for_all(); 414 root->destroy(*root); 415 } 416 else // end of partition-partition, start of partition-color 417 { 418 // check whether there is only one partition. if not, build up the 419 // tree of partitions 420 if (odds > 0) 421 { 422 tbb::empty_task *root = 423 new (tbb::task::allocate_root()) tbb::empty_task; 424 root->set_ref_count(evens + 1); 425 const unsigned int n_blocked_workers = 426 odds - (odds + evens + 1) % 2; 427 const unsigned int n_workers = 428 cell_partition_data.size() - 1 - n_blocked_workers; 429 std::vector<color::PartitionWork *> worker(n_workers); 430 std::vector<color::PartitionWork *> blocked_worker( 431 n_blocked_workers); 432 unsigned int worker_index = 0, slice_index = 0; 433 int spawn_index_child = -2; 434 MPICommunication *worker_compr = 435 new (root->allocate_child()) MPICommunication(funct, true); 436 worker_compr->set_ref_count(1); 437 for (unsigned int part = 0; 438 part < partition_row_index.size() - 1; 439 part++) 440 { 441 if (part == 0) 442 worker[worker_index] = 443 new (worker_compr->allocate_child()) 444 color::PartitionWork(funct, 445 slice_index, 446 *this, 447 false); 448 else 449 worker[worker_index] = new (root->allocate_child()) 450 color::PartitionWork(funct, 451 slice_index, 452 *this, 453 false); 454 slice_index++; 455 for (; slice_index < partition_row_index[part + 1]; 456 slice_index++) 457 { 458 worker[worker_index]->set_ref_count(1); 459 worker_index++; 460 worker[worker_index] = 461 new (worker[worker_index - 1]->allocate_child()) 462 color::PartitionWork(funct, 463 slice_index, 464 *this, 465 false); 466 } 467 worker[worker_index]->set_ref_count(2); 468 if (part > 0) 469 { 470 blocked_worker[(part - 1) / 2]->dummy = 471 new (worker[worker_index]->allocate_child()) 472 tbb::empty_task; 473 worker_index++; 474 if (spawn_index_child == -1) 475 tbb::task::spawn(*blocked_worker[(part - 1) / 2]); 476 else 477 { 478 Assert(spawn_index_child >= 0, 479 ExcInternalError()); 480 tbb::task::spawn(*worker[spawn_index_child]); 481 } 482 spawn_index_child = -2; 483 } 484 else 485 { 486 MPICommunication *worker_dist = 487 new (worker[worker_index]->allocate_child()) 488 MPICommunication(funct, false); 489 tbb::task::spawn(*worker_dist); 490 worker_index++; 491 } 492 part += 1; 493 if (part < partition_row_index.size() - 1) 494 { 495 if (part < partition_row_index.size() - 2) 496 { 497 blocked_worker[part / 2] = 498 new (worker[worker_index - 1]->allocate_child()) 499 color::PartitionWork(funct, 500 slice_index, 501 *this, 502 true); 503 slice_index++; 504 if (slice_index < partition_row_index[part + 1]) 505 { 506 blocked_worker[part / 2]->set_ref_count(1); 507 worker[worker_index] = new ( 508 blocked_worker[part / 2]->allocate_child()) 509 color::PartitionWork(funct, 510 slice_index, 511 *this, 512 false); 513 slice_index++; 514 } 515 else 516 { 517 spawn_index_child = -1; 518 continue; 519 } 520 } 521 for (; slice_index < partition_row_index[part + 1]; 522 slice_index++) 523 { 524 if (slice_index > partition_row_index[part]) 525 { 526 worker[worker_index]->set_ref_count(1); 527 worker_index++; 528 } 529 worker[worker_index] = 530 new (worker[worker_index - 1]->allocate_child()) 531 color::PartitionWork(funct, 532 slice_index, 533 *this, 534 false); 535 } 536 spawn_index_child = worker_index; 537 worker_index++; 538 } 539 else 540 { 541 tbb::empty_task *final = 542 new (worker[worker_index - 1]->allocate_child()) 543 tbb::empty_task; 544 tbb::task::spawn(*final); 545 spawn_index_child = worker_index - 1; 546 } 547 } 548 if (evens == odds) 549 { 550 Assert(spawn_index_child >= 0, ExcInternalError()); 551 tbb::task::spawn(*worker[spawn_index_child]); 552 } 553 root->wait_for_all(); 554 root->destroy(*root); 555 } 556 // case when we only have one partition: this is the usual 557 // coloring scheme, and we just schedule a parallel for loop for 558 // each color 559 else 560 { 561 Assert(evens <= 1, ExcInternalError()); 562 funct.vector_update_ghosts_finish(); 563 564 for (unsigned int color = 0; color < partition_row_index[1]; 565 ++color) 566 { 567 tbb::empty_task *root = 568 new (tbb::task::allocate_root()) tbb::empty_task; 569 root->set_ref_count(2); 570 color::PartitionWork *worker = 571 new (root->allocate_child()) 572 color::PartitionWork(funct, color, *this, false); 573 tbb::empty_task::spawn(*worker); 574 root->wait_for_all(); 575 root->destroy(*root); 576 } 577 578 funct.vector_compress_start(); 579 } 580 } 581 } 582 else 583 #endif 584 // serial loop, go through up to three times and do the MPI transfer at 585 // the beginning/end of the second part 586 { 587 for (unsigned int part = 0; part < partition_row_index.size() - 2; 588 ++part) 589 { 590 if (part == 1) 591 funct.vector_update_ghosts_finish(); 592 593 for (unsigned int i = partition_row_index[part]; 594 i < partition_row_index[part + 1]; 595 ++i) 596 { 597 funct.cell_loop_pre_range(i); 598 funct.zero_dst_vector_range(i); 599 AssertIndexRange(i + 1, cell_partition_data.size()); 600 if (cell_partition_data[i + 1] > cell_partition_data[i]) 601 { 602 funct.cell(std::make_pair(cell_partition_data[i], 603 cell_partition_data[i + 1])); 604 } 605 606 if (face_partition_data.empty() == false) 607 { 608 if (face_partition_data[i + 1] > face_partition_data[i]) 609 funct.face(std::make_pair(face_partition_data[i], 610 face_partition_data[i + 1])); 611 if (boundary_partition_data[i + 1] > 612 boundary_partition_data[i]) 613 funct.boundary( 614 std::make_pair(boundary_partition_data[i], 615 boundary_partition_data[i + 1])); 616 } 617 funct.cell_loop_post_range(i); 618 } 619 620 if (part == 1) 621 funct.vector_compress_start(); 622 } 623 } 624 funct.vector_compress_finish(); 625 626 if (scheme != none) 627 funct.cell_loop_post_range(numbers::invalid_unsigned_int); 628 else 629 funct.cell_loop_post_range( 630 partition_row_index[partition_row_index.size() - 2]); 631 } 632 633 634 TaskInfo()635 TaskInfo::TaskInfo() 636 { 637 clear(); 638 } 639 640 641 642 void clear()643 TaskInfo::clear() 644 { 645 n_active_cells = 0; 646 n_ghost_cells = 0; 647 vectorization_length = 1; 648 block_size = 0; 649 n_blocks = 0; 650 scheme = none; 651 partition_row_index.clear(); 652 partition_row_index.resize(2); 653 cell_partition_data.clear(); 654 face_partition_data.clear(); 655 boundary_partition_data.clear(); 656 evens = 0; 657 odds = 0; 658 n_blocked_workers = 0; 659 n_workers = 0; 660 partition_evens.clear(); 661 partition_odds.clear(); 662 partition_n_blocked_workers.clear(); 663 partition_n_workers.clear(); 664 communicator = MPI_COMM_SELF; 665 my_pid = 0; 666 n_procs = 1; 667 } 668 669 670 671 template <typename StreamType> 672 void print_memory_statistics(StreamType & out,const std::size_t data_length) const673 TaskInfo::print_memory_statistics(StreamType & out, 674 const std::size_t data_length) const 675 { 676 Utilities::MPI::MinMaxAvg memory_c = 677 Utilities::MPI::min_max_avg(1e-6 * data_length, communicator); 678 if (n_procs < 2) 679 out << memory_c.min; 680 else 681 out << memory_c.min << "/" << memory_c.avg << "/" << memory_c.max; 682 out << " MB" << std::endl; 683 } 684 685 686 687 std::size_t memory_consumption() const688 TaskInfo::memory_consumption() const 689 { 690 return ( 691 sizeof(*this) + 692 MemoryConsumption::memory_consumption(partition_row_index) + 693 MemoryConsumption::memory_consumption(cell_partition_data) + 694 MemoryConsumption::memory_consumption(face_partition_data) + 695 MemoryConsumption::memory_consumption(boundary_partition_data) + 696 MemoryConsumption::memory_consumption(partition_evens) + 697 MemoryConsumption::memory_consumption(partition_odds) + 698 MemoryConsumption::memory_consumption(partition_n_blocked_workers) + 699 MemoryConsumption::memory_consumption(partition_n_workers)); 700 } 701 702 703 704 void make_boundary_cells_divisible(std::vector<unsigned int> & boundary_cells)705 TaskInfo::make_boundary_cells_divisible( 706 std::vector<unsigned int> &boundary_cells) 707 { 708 // try to make the number of boundary cells divisible by the number of 709 // vectors in vectorization 710 unsigned int fillup_needed = 711 (vectorization_length - boundary_cells.size() % vectorization_length) % 712 vectorization_length; 713 if (fillup_needed > 0 && boundary_cells.size() < n_active_cells) 714 { 715 // fill additional cells into the list of boundary cells to get a 716 // balanced number. Go through the indices successively until we 717 // found enough indices 718 std::vector<unsigned int> new_boundary_cells; 719 new_boundary_cells.reserve(boundary_cells.size()); 720 721 unsigned int next_free_slot = 0, bound_index = 0; 722 while (fillup_needed > 0 && bound_index < boundary_cells.size()) 723 { 724 if (next_free_slot < boundary_cells[bound_index]) 725 { 726 // check if there are enough cells to fill with in the 727 // current slot 728 if (next_free_slot + fillup_needed <= 729 boundary_cells[bound_index]) 730 { 731 for (unsigned int j = 732 boundary_cells[bound_index] - fillup_needed; 733 j < boundary_cells[bound_index]; 734 ++j) 735 new_boundary_cells.push_back(j); 736 fillup_needed = 0; 737 } 738 // ok, not enough indices, so just take them all up to the 739 // next boundary cell 740 else 741 { 742 for (unsigned int j = next_free_slot; 743 j < boundary_cells[bound_index]; 744 ++j) 745 new_boundary_cells.push_back(j); 746 fillup_needed -= 747 boundary_cells[bound_index] - next_free_slot; 748 } 749 } 750 new_boundary_cells.push_back(boundary_cells[bound_index]); 751 next_free_slot = boundary_cells[bound_index] + 1; 752 ++bound_index; 753 } 754 while (fillup_needed > 0 && 755 (new_boundary_cells.size() == 0 || 756 new_boundary_cells.back() < n_active_cells - 1)) 757 new_boundary_cells.push_back(new_boundary_cells.back() + 1); 758 while (bound_index < boundary_cells.size()) 759 new_boundary_cells.push_back(boundary_cells[bound_index++]); 760 761 boundary_cells.swap(new_boundary_cells); 762 } 763 764 // set the number of cells 765 std::sort(boundary_cells.begin(), boundary_cells.end()); 766 767 // check that number of boundary cells is divisible by 768 // vectorization_length or that it contains all cells 769 Assert(boundary_cells.size() % vectorization_length == 0 || 770 boundary_cells.size() == n_active_cells, 771 ExcInternalError()); 772 } 773 774 775 776 void create_blocks_serial(const std::vector<unsigned int> & cells_with_comm,const unsigned int dofs_per_cell,const bool categories_are_hp,const std::vector<unsigned int> & cell_vectorization_categories,const bool cell_vectorization_categories_strict,const std::vector<unsigned int> & parent_relation,std::vector<unsigned int> & renumbering,std::vector<unsigned char> & incompletely_filled_vectorization)777 TaskInfo::create_blocks_serial( 778 const std::vector<unsigned int> &cells_with_comm, 779 const unsigned int dofs_per_cell, 780 const bool categories_are_hp, 781 const std::vector<unsigned int> &cell_vectorization_categories, 782 const bool cell_vectorization_categories_strict, 783 const std::vector<unsigned int> &parent_relation, 784 std::vector<unsigned int> & renumbering, 785 std::vector<unsigned char> & incompletely_filled_vectorization) 786 { 787 // This function is decomposed into several steps to determine a good 788 // ordering that satisfies the following constraints: 789 // a. Only cells belonging to the same category (or next higher if the 790 // cell_vectorization_categories_strict is false) can be grouped into 791 // the same SIMD batch 792 // b. hp adaptive computations must form contiguous ranges for the same 793 // degree (category) in cell_partition_data 794 // c. We want to group the cells with the same parent in the same SIMD 795 // lane if possible 796 // d. The cell order should be similar to the initial one 797 // e. Form sets without MPI communication and those with to overlap 798 // communication with computation 799 // 800 // These constraints are satisfied by first grouping by the categories 801 // and, within the groups, to distinguish between cells with a parent 802 // and those without. All of this is set up with batches of cells (with 803 // padding if the size does not match). Then we define a vector of 804 // arrays where we define sorting criteria for the cell batches to 805 // satisfy the items b and d together, split by different parts to 806 // satisfy item e. 807 808 // Give the compiler a chance to detect that vectorization_length is a 809 // power of two, which allows it to replace integer divisions by shifts 810 unsigned int vectorization_length_bits = 0; 811 unsigned int my_length = vectorization_length; 812 while (my_length >>= 1) 813 ++vectorization_length_bits; 814 const unsigned int n_lanes = 1 << vectorization_length_bits; 815 816 // Step 1: find tight map of categories for not taking exceeding amounts 817 // of memory below. Sort the new categories by the numbers in the 818 // old one to ensure we respect the given rules 819 unsigned int n_categories = 1; 820 std::vector<unsigned int> tight_category_map(n_active_cells, 0); 821 if (cell_vectorization_categories.empty() == false) 822 { 823 AssertDimension(cell_vectorization_categories.size(), 824 n_active_cells + n_ghost_cells); 825 826 std::set<unsigned int> used_categories; 827 for (unsigned int i = 0; i < n_active_cells; ++i) 828 used_categories.insert(cell_vectorization_categories[i]); 829 std::vector<unsigned int> used_categories_vector( 830 used_categories.size()); 831 n_categories = 0; 832 for (const auto &it : used_categories) 833 used_categories_vector[n_categories++] = it; 834 for (unsigned int i = 0; i < n_active_cells; ++i) 835 { 836 const unsigned int index = 837 std::lower_bound(used_categories_vector.begin(), 838 used_categories_vector.end(), 839 cell_vectorization_categories[i]) - 840 used_categories_vector.begin(); 841 AssertIndexRange(index, used_categories_vector.size()); 842 tight_category_map[i] = index; 843 } 844 } 845 846 // Step 2: Sort the cells by the category. If we want to fill up the 847 // ranges in vectorization, promote some of the cells to a higher 848 // category 849 std::vector<std::vector<unsigned int>> renumbering_category(n_categories); 850 for (unsigned int i = 0; i < n_active_cells; ++i) 851 renumbering_category[tight_category_map[i]].push_back(i); 852 853 if (cell_vectorization_categories_strict == false && n_categories > 1) 854 for (unsigned int j = n_categories - 1; j > 0; --j) 855 { 856 unsigned int lower_index = j - 1; 857 while (renumbering_category[j].size() % n_lanes) 858 { 859 while (renumbering_category[j].size() % n_lanes && 860 !renumbering_category[lower_index].empty()) 861 { 862 renumbering_category[j].push_back( 863 renumbering_category[lower_index].back()); 864 renumbering_category[lower_index].pop_back(); 865 } 866 if (lower_index == 0) 867 break; 868 else 869 --lower_index; 870 } 871 } 872 873 // Step 3: Use the parent relation to find a good grouping of cells. To 874 // do this, we first put cells of each category defined above into two 875 // bins, those which we know can be grouped together by the given parent 876 // relation and those which cannot 877 std::vector<unsigned int> temporary_numbering; 878 temporary_numbering.reserve(n_active_cells + 879 (n_lanes - 1) * n_categories); 880 const unsigned int n_cells_per_parent = 881 std::count(parent_relation.begin(), parent_relation.end(), 0); 882 std::vector<unsigned int> category_size; 883 for (unsigned int j = 0; j < n_categories; ++j) 884 { 885 std::vector<std::pair<unsigned int, unsigned int>> grouped_cells; 886 std::vector<unsigned int> other_cells; 887 for (const unsigned int cell : renumbering_category[j]) 888 if (parent_relation.empty() || 889 parent_relation[cell] == numbers::invalid_unsigned_int) 890 other_cells.push_back(cell); 891 else 892 grouped_cells.emplace_back(parent_relation[cell], cell); 893 894 // Compute the number of cells per group 895 std::sort(grouped_cells.begin(), grouped_cells.end()); 896 std::vector<unsigned int> n_cells_per_group; 897 unsigned int length = 0; 898 for (unsigned int i = 0; i < grouped_cells.size(); ++i, ++length) 899 if (i > 0 && grouped_cells[i].first != grouped_cells[i - 1].first) 900 { 901 n_cells_per_group.push_back(length); 902 length = 0; 903 } 904 if (length > 0) 905 n_cells_per_group.push_back(length); 906 907 // Move groups that do not have the complete size (due to 908 // categories) to the 'other_cells'. The cells with correct group 909 // size are immediately appended to the temporary cell numbering 910 auto group_it = grouped_cells.begin(); 911 for (unsigned int length : n_cells_per_group) 912 if (length < n_cells_per_parent) 913 for (unsigned int j = 0; j < length; ++j) 914 other_cells.push_back((group_it++)->second); 915 else 916 { 917 // we should not have more cells in a group than in the first 918 // check we did above 919 AssertDimension(length, n_cells_per_parent); 920 for (unsigned int j = 0; j < length; ++j) 921 temporary_numbering.push_back((group_it++)->second); 922 } 923 924 // Sort the remaining cells and append them as well 925 std::sort(other_cells.begin(), other_cells.end()); 926 temporary_numbering.insert(temporary_numbering.end(), 927 other_cells.begin(), 928 other_cells.end()); 929 930 while (temporary_numbering.size() % n_lanes != 0) 931 temporary_numbering.push_back(numbers::invalid_unsigned_int); 932 933 category_size.push_back(temporary_numbering.size()); 934 } 935 936 // Step 4: Identify the batches with cells marked as "comm" 937 std::vector<bool> batch_with_comm(temporary_numbering.size() / n_lanes, 938 false); 939 std::vector<unsigned int> temporary_numbering_inverse(n_active_cells); 940 for (unsigned int i = 0; i < temporary_numbering.size(); ++i) 941 if (temporary_numbering[i] != numbers::invalid_unsigned_int) 942 temporary_numbering_inverse[temporary_numbering[i]] = i; 943 for (const unsigned int cell : cells_with_comm) 944 batch_with_comm[temporary_numbering_inverse[cell] / n_lanes] = true; 945 946 // Step 5: Sort the batches of cells by their last cell index to get 947 // good locality, assuming that the initial cell order is of good 948 // locality. In case we have hp calculations with categories, we need to 949 // sort also by the category. 950 std::vector<std::array<unsigned int, 3>> batch_order; 951 std::vector<std::array<unsigned int, 3>> batch_order_comm; 952 for (unsigned int i = 0; i < temporary_numbering.size(); i += n_lanes) 953 { 954 unsigned int max_index = 0; 955 for (unsigned int j = 0; j < n_lanes; ++j) 956 if (temporary_numbering[i + j] < numbers::invalid_unsigned_int) 957 max_index = std::max(temporary_numbering[i + j], max_index); 958 const unsigned int category_hp = 959 categories_are_hp ? 960 std::upper_bound(category_size.begin(), category_size.end(), i) - 961 category_size.begin() : 962 0; 963 const std::array<unsigned int, 3> next{{category_hp, max_index, i}}; 964 if (batch_with_comm[i / n_lanes]) 965 batch_order_comm.emplace_back(next); 966 else 967 batch_order.emplace_back(next); 968 } 969 970 std::sort(batch_order.begin(), batch_order.end()); 971 std::sort(batch_order_comm.begin(), batch_order_comm.end()); 972 973 // Step 6: Put the cells with communication in the middle of the cell 974 // range. For the MPI case, we need three groups to enable overlap for 975 // communication and computation (part before comm, part with comm, part 976 // after comm), whereas we need one for the other case. And in each 977 // case, we allow for a slot of "ghosted" cells. 978 std::vector<unsigned int> blocks; 979 if (n_procs == 1) 980 { 981 if (batch_order.empty()) 982 std::swap(batch_order_comm, batch_order); 983 Assert(batch_order_comm.empty(), ExcInternalError()); 984 partition_row_index.resize(3); 985 blocks = {0, static_cast<unsigned int>(batch_order.size())}; 986 } 987 else 988 { 989 partition_row_index.resize(5); 990 const unsigned int comm_begin = batch_order.size() / 2; 991 batch_order.insert(batch_order.begin() + comm_begin, 992 batch_order_comm.begin(), 993 batch_order_comm.end()); 994 const unsigned int comm_end = comm_begin + batch_order_comm.size(); 995 const unsigned int end = batch_order.size(); 996 blocks = {0, comm_begin, comm_end, end}; 997 } 998 999 // Step 7: Fill in the data by batches for the locally owned cells. 1000 const unsigned int n_cell_batches = batch_order.size(); 1001 const unsigned int n_ghost_batches = 1002 (n_ghost_cells + n_lanes - 1) / n_lanes; 1003 incompletely_filled_vectorization.resize(n_cell_batches + 1004 n_ghost_batches); 1005 1006 cell_partition_data.clear(); 1007 cell_partition_data.resize(1, 0); 1008 1009 renumbering.clear(); 1010 renumbering.resize(n_active_cells + n_ghost_cells, 1011 numbers::invalid_unsigned_int); 1012 1013 unsigned int counter = 0; 1014 for (unsigned int block = 0; block < blocks.size() - 1; ++block) 1015 { 1016 const unsigned int grain_size = 1017 std::max((2048U / dofs_per_cell) / 8 * 4, 2U); 1018 for (unsigned int k = blocks[block]; k < blocks[block + 1]; 1019 k += grain_size) 1020 cell_partition_data.push_back( 1021 std::min(k + grain_size, blocks[block + 1])); 1022 partition_row_index[block + 1] = cell_partition_data.size() - 1; 1023 1024 // Set the numbering according to the reordered temporary one 1025 for (unsigned int k = blocks[block]; k < blocks[block + 1]; ++k) 1026 { 1027 const unsigned int pos = batch_order[k][2]; 1028 unsigned int j = 0; 1029 for (; j < n_lanes && temporary_numbering[pos + j] != 1030 numbers::invalid_unsigned_int; 1031 ++j) 1032 renumbering[counter++] = temporary_numbering[pos + j]; 1033 if (j < n_lanes) 1034 incompletely_filled_vectorization[k] = j; 1035 } 1036 } 1037 AssertDimension(counter, n_active_cells); 1038 1039 // Step 8: Treat the ghost cells 1040 for (unsigned int cell = n_active_cells; 1041 cell < n_active_cells + n_ghost_cells; 1042 ++cell) 1043 { 1044 if (!cell_vectorization_categories.empty()) 1045 AssertDimension(cell_vectorization_categories[cell], 1046 cell_vectorization_categories[n_active_cells]); 1047 renumbering[cell] = cell; 1048 } 1049 if (n_ghost_cells % n_lanes) 1050 incompletely_filled_vectorization.back() = n_ghost_cells % n_lanes; 1051 cell_partition_data.push_back(n_cell_batches + n_ghost_batches); 1052 partition_row_index.back() = cell_partition_data.size() - 1; 1053 1054 #ifdef DEBUG 1055 std::vector<unsigned int> renumber_cpy(renumbering); 1056 std::sort(renumber_cpy.begin(), renumber_cpy.end()); 1057 for (unsigned int i = 0; i < renumber_cpy.size(); ++i) 1058 AssertDimension(i, renumber_cpy[i]); 1059 #endif 1060 } 1061 1062 1063 1064 void initial_setup_blocks_tasks(const std::vector<unsigned int> & boundary_cells,std::vector<unsigned int> & renumbering,std::vector<unsigned char> & incompletely_filled_vectorization)1065 TaskInfo::initial_setup_blocks_tasks( 1066 const std::vector<unsigned int> &boundary_cells, 1067 std::vector<unsigned int> & renumbering, 1068 std::vector<unsigned char> & incompletely_filled_vectorization) 1069 { 1070 const unsigned int n_macro_cells = 1071 (n_active_cells + vectorization_length - 1) / vectorization_length; 1072 const unsigned int n_ghost_slots = 1073 (n_ghost_cells + vectorization_length - 1) / vectorization_length; 1074 incompletely_filled_vectorization.resize(n_macro_cells + n_ghost_slots); 1075 if (n_macro_cells * vectorization_length > n_active_cells) 1076 incompletely_filled_vectorization[n_macro_cells - 1] = 1077 vectorization_length - 1078 (n_macro_cells * vectorization_length - n_active_cells); 1079 if (n_ghost_slots * vectorization_length > n_ghost_cells) 1080 incompletely_filled_vectorization[n_macro_cells + n_ghost_slots - 1] = 1081 vectorization_length - 1082 (n_ghost_slots * vectorization_length - n_ghost_cells); 1083 1084 std::vector<unsigned int> reverse_numbering( 1085 n_active_cells, numbers::invalid_unsigned_int); 1086 for (unsigned int j = 0; j < boundary_cells.size(); ++j) 1087 reverse_numbering[boundary_cells[j]] = j; 1088 unsigned int counter = boundary_cells.size(); 1089 for (unsigned int j = 0; j < n_active_cells; ++j) 1090 if (reverse_numbering[j] == numbers::invalid_unsigned_int) 1091 reverse_numbering[j] = counter++; 1092 1093 AssertDimension(counter, n_active_cells); 1094 renumbering = Utilities::invert_permutation(reverse_numbering); 1095 1096 for (unsigned int j = n_active_cells; j < n_active_cells + n_ghost_cells; 1097 ++j) 1098 renumbering.push_back(j); 1099 1100 // TODO: might be able to simplify this code by not relying on the cell 1101 // partition data while computing the thread graph 1102 cell_partition_data.clear(); 1103 cell_partition_data.push_back(0); 1104 if (n_procs > 1) 1105 { 1106 const unsigned int n_macro_boundary_cells = 1107 (boundary_cells.size() + vectorization_length - 1) / 1108 vectorization_length; 1109 cell_partition_data.push_back( 1110 (n_macro_cells - n_macro_boundary_cells) / 2); 1111 cell_partition_data.push_back(cell_partition_data[1] + 1112 n_macro_boundary_cells); 1113 } 1114 else 1115 AssertDimension(boundary_cells.size(), 0); 1116 cell_partition_data.push_back(n_macro_cells); 1117 cell_partition_data.push_back(cell_partition_data.back() + n_ghost_slots); 1118 partition_row_index.resize(n_procs > 1 ? 4 : 2); 1119 partition_row_index[0] = 0; 1120 partition_row_index[1] = 1; 1121 if (n_procs > 1) 1122 { 1123 partition_row_index[2] = 2; 1124 partition_row_index[3] = 3; 1125 } 1126 } 1127 1128 1129 1130 void guess_block_size(const unsigned int dofs_per_cell)1131 TaskInfo::guess_block_size(const unsigned int dofs_per_cell) 1132 { 1133 // user did not say a positive number, so we have to guess 1134 if (block_size == 0) 1135 { 1136 // we would like to have enough work to do, so as first guess, try 1137 // to get 16 times as many chunks as we have threads on the system. 1138 block_size = n_active_cells / (MultithreadInfo::n_threads() * 16 * 1139 vectorization_length); 1140 1141 // if there are too few degrees of freedom per cell, need to 1142 // increase the block size 1143 const unsigned int minimum_parallel_grain_size = 200; 1144 if (dofs_per_cell * block_size < minimum_parallel_grain_size) 1145 block_size = (minimum_parallel_grain_size / dofs_per_cell + 1); 1146 if (dofs_per_cell * block_size > 10000) 1147 block_size /= 4; 1148 1149 block_size = 1150 1 << static_cast<unsigned int>(std::log2(block_size + 1)); 1151 } 1152 if (block_size > n_active_cells) 1153 block_size = std::max(1U, n_active_cells); 1154 } 1155 1156 1157 1158 void make_thread_graph_partition_color(DynamicSparsityPattern & connectivity_large,std::vector<unsigned int> & renumbering,std::vector<unsigned char> & irregular_cells,const bool)1159 TaskInfo::make_thread_graph_partition_color( 1160 DynamicSparsityPattern & connectivity_large, 1161 std::vector<unsigned int> & renumbering, 1162 std::vector<unsigned char> &irregular_cells, 1163 const bool) 1164 { 1165 const unsigned int n_macro_cells = *(cell_partition_data.end() - 2); 1166 if (n_macro_cells == 0) 1167 return; 1168 1169 Assert(vectorization_length > 0, ExcInternalError()); 1170 1171 unsigned int partition = 0, counter = 0; 1172 1173 // Create connectivity graph for blocks based on connectivity graph for 1174 // cells. 1175 DynamicSparsityPattern connectivity(n_blocks, n_blocks); 1176 make_connectivity_cells_to_blocks(irregular_cells, 1177 connectivity_large, 1178 connectivity); 1179 1180 // Create cell-block partitioning. 1181 1182 // For each block of cells, this variable saves to which partitions the 1183 // block belongs. Initialize all to -1 to mark them as not yet assigned 1184 // a partition. 1185 std::vector<unsigned int> cell_partition(n_blocks, 1186 numbers::invalid_unsigned_int); 1187 1188 // In element j of this variable, one puts the old number of the block 1189 // that should be the jth block in the new numeration. 1190 std::vector<unsigned int> partition_list(n_blocks, 0); 1191 std::vector<unsigned int> partition_color_list(n_blocks, 0); 1192 1193 // This vector points to the start of each partition. 1194 std::vector<unsigned int> partition_size(2, 0); 1195 1196 // blocking_connectivity = true; 1197 1198 // The cluster_size in make_partitioning defines that the no. of cells 1199 // in each partition should be a multiple of cluster_size. 1200 unsigned int cluster_size = 1; 1201 1202 // Make the partitioning of the first layer of the blocks of cells. 1203 make_partitioning(connectivity, 1204 cluster_size, 1205 cell_partition, 1206 partition_list, 1207 partition_size, 1208 partition); 1209 1210 // Color the cells within each partition 1211 make_coloring_within_partitions_pre_blocked(connectivity, 1212 partition, 1213 cell_partition, 1214 partition_list, 1215 partition_size, 1216 partition_color_list); 1217 1218 partition_list = renumbering; 1219 1220 #ifdef DEBUG 1221 // in debug mode, check that the partition color list is one-to-one 1222 { 1223 std::vector<unsigned int> sorted_pc_list(partition_color_list); 1224 std::sort(sorted_pc_list.begin(), sorted_pc_list.end()); 1225 for (unsigned int i = 0; i < sorted_pc_list.size(); ++i) 1226 Assert(sorted_pc_list[i] == i, ExcInternalError()); 1227 } 1228 #endif 1229 1230 // set the start list for each block and compute the renumbering of 1231 // cells 1232 std::vector<unsigned int> block_start(n_macro_cells + 1); 1233 std::vector<unsigned char> irregular(n_macro_cells); 1234 1235 unsigned int mcell_start = 0; 1236 block_start[0] = 0; 1237 for (unsigned int block = 0; block < n_blocks; block++) 1238 { 1239 block_start[block + 1] = block_start[block]; 1240 for (unsigned int mcell = mcell_start; 1241 mcell < std::min(mcell_start + block_size, n_macro_cells); 1242 ++mcell) 1243 { 1244 unsigned int n_comp = (irregular_cells[mcell] > 0) ? 1245 irregular_cells[mcell] : 1246 vectorization_length; 1247 block_start[block + 1] += n_comp; 1248 ++counter; 1249 } 1250 mcell_start += block_size; 1251 } 1252 counter = 0; 1253 unsigned int counter_macro = 0; 1254 unsigned int block_size_last = 1255 n_macro_cells - block_size * (n_blocks - 1); 1256 if (block_size_last == 0) 1257 block_size_last = block_size; 1258 1259 unsigned int tick = 0; 1260 for (unsigned int block = 0; block < n_blocks; block++) 1261 { 1262 unsigned int present_block = partition_color_list[block]; 1263 for (unsigned int cell = block_start[present_block]; 1264 cell < block_start[present_block + 1]; 1265 ++cell) 1266 renumbering[counter++] = partition_list[cell]; 1267 unsigned int this_block_size = 1268 (present_block == n_blocks - 1) ? block_size_last : block_size; 1269 1270 // Also re-compute the content of cell_partition_data to 1271 // contain the numbers of cells, not blocks 1272 if (cell_partition_data[tick] == block) 1273 cell_partition_data[tick++] = counter_macro; 1274 1275 for (unsigned int j = 0; j < this_block_size; j++) 1276 irregular[counter_macro++] = 1277 irregular_cells[present_block * block_size + j]; 1278 } 1279 AssertDimension(tick + 1, cell_partition_data.size()); 1280 cell_partition_data.back() = counter_macro; 1281 1282 irregular_cells.swap(irregular); 1283 AssertDimension(counter, n_active_cells); 1284 AssertDimension(counter_macro, n_macro_cells); 1285 1286 // check that the renumbering is one-to-one 1287 #ifdef DEBUG 1288 { 1289 std::vector<unsigned int> sorted_renumbering(renumbering); 1290 std::sort(sorted_renumbering.begin(), sorted_renumbering.end()); 1291 for (unsigned int i = 0; i < sorted_renumbering.size(); ++i) 1292 Assert(sorted_renumbering[i] == i, ExcInternalError()); 1293 } 1294 #endif 1295 1296 1297 update_task_info( 1298 partition); // Actually sets too much for partition color case 1299 1300 AssertDimension(cell_partition_data.back(), n_macro_cells); 1301 } 1302 1303 1304 1305 void make_thread_graph(const std::vector<unsigned int> & cell_active_fe_index,DynamicSparsityPattern & connectivity,std::vector<unsigned int> & renumbering,std::vector<unsigned char> & irregular_cells,const bool hp_bool)1306 TaskInfo::make_thread_graph( 1307 const std::vector<unsigned int> &cell_active_fe_index, 1308 DynamicSparsityPattern & connectivity, 1309 std::vector<unsigned int> & renumbering, 1310 std::vector<unsigned char> & irregular_cells, 1311 const bool hp_bool) 1312 { 1313 const unsigned int n_macro_cells = *(cell_partition_data.end() - 2); 1314 if (n_macro_cells == 0) 1315 return; 1316 1317 Assert(vectorization_length > 0, ExcInternalError()); 1318 1319 // if we want to block before partitioning, create connectivity graph 1320 // for blocks based on connectivity graph for cells. 1321 DynamicSparsityPattern connectivity_blocks(n_blocks, n_blocks); 1322 make_connectivity_cells_to_blocks(irregular_cells, 1323 connectivity, 1324 connectivity_blocks); 1325 1326 unsigned int n_blocks = 0; 1327 if (scheme == partition_color || 1328 scheme == color) // blocking_connectivity == true 1329 n_blocks = this->n_blocks; 1330 else 1331 n_blocks = n_active_cells; 1332 1333 // For each block of cells, this variable saves to which partitions the 1334 // block belongs. Initialize all to -1 to mark them as not yet assigned 1335 // a partition. 1336 std::vector<unsigned int> cell_partition(n_blocks, 1337 numbers::invalid_unsigned_int); 1338 1339 // In element j of this variable, one puts the old number (but after 1340 // renumbering according to the input renumbering) of the block that 1341 // should be the jth block in the new numeration. 1342 std::vector<unsigned int> partition_list(n_blocks, 0); 1343 std::vector<unsigned int> partition_2layers_list(n_blocks, 0); 1344 1345 // This vector points to the start of each partition. 1346 std::vector<unsigned int> partition_size(2, 0); 1347 1348 unsigned int partition = 0; 1349 1350 // Within the partitions we want to be able to block for the case that 1351 // we do not block already in the connectivity. The cluster_size in 1352 // make_partitioning defines that the no. of cells in each partition 1353 // should be a multiple of cluster_size. 1354 unsigned int cluster_size = 1; 1355 if (scheme == partition_partition) 1356 cluster_size = block_size * vectorization_length; 1357 1358 // Make the partitioning of the first layer of the blocks of cells. 1359 if (scheme == partition_color || scheme == color) 1360 make_partitioning(connectivity_blocks, 1361 cluster_size, 1362 cell_partition, 1363 partition_list, 1364 partition_size, 1365 partition); 1366 else 1367 make_partitioning(connectivity, 1368 cluster_size, 1369 cell_partition, 1370 partition_list, 1371 partition_size, 1372 partition); 1373 1374 // Partition or color second layer 1375 if (scheme == partition_partition) 1376 1377 { 1378 // Partition within partitions. 1379 make_partitioning_within_partitions_post_blocked( 1380 connectivity, 1381 cell_active_fe_index, 1382 partition, 1383 cluster_size, 1384 hp_bool, 1385 cell_partition, 1386 partition_list, 1387 partition_size, 1388 partition_2layers_list, 1389 irregular_cells); 1390 } 1391 else if (scheme == partition_color || scheme == color) 1392 { 1393 make_coloring_within_partitions_pre_blocked(connectivity_blocks, 1394 partition, 1395 cell_partition, 1396 partition_list, 1397 partition_size, 1398 partition_2layers_list); 1399 } 1400 1401 // in debug mode, check that the partition_2layers_list is one-to-one 1402 #ifdef DEBUG 1403 { 1404 std::vector<unsigned int> sorted_pc_list(partition_2layers_list); 1405 std::sort(sorted_pc_list.begin(), sorted_pc_list.end()); 1406 for (unsigned int i = 0; i < sorted_pc_list.size(); ++i) 1407 Assert(sorted_pc_list[i] == i, ExcInternalError()); 1408 } 1409 #endif 1410 1411 // Set the new renumbering 1412 std::vector<unsigned int> renumbering_in(n_active_cells, 0); 1413 renumbering_in.swap(renumbering); 1414 if (scheme == partition_partition) // blocking_connectivity == false 1415 { 1416 // This is the simple case. The renumbering is just a combination of 1417 // the renumbering that we were given as an input and the 1418 // renumbering of partition/coloring given in partition_2layers_list 1419 for (unsigned int j = 0; j < renumbering.size(); j++) 1420 renumbering[j] = renumbering_in[partition_2layers_list[j]]; 1421 // Account for the ghost cells, finally. 1422 for (unsigned int i = 0; i < n_ghost_cells; ++i) 1423 renumbering.push_back(i + n_active_cells); 1424 } 1425 else 1426 { 1427 // set the start list for each block and compute the renumbering of 1428 // cells 1429 std::vector<unsigned int> block_start(n_macro_cells + 1); 1430 std::vector<unsigned char> irregular(n_macro_cells); 1431 1432 unsigned int counter = 0; 1433 unsigned int mcell_start = 0; 1434 block_start[0] = 0; 1435 for (unsigned int block = 0; block < n_blocks; block++) 1436 { 1437 block_start[block + 1] = block_start[block]; 1438 for (unsigned int mcell = mcell_start; 1439 mcell < std::min(mcell_start + block_size, n_macro_cells); 1440 ++mcell) 1441 { 1442 unsigned int n_comp = (irregular_cells[mcell] > 0) ? 1443 irregular_cells[mcell] : 1444 vectorization_length; 1445 block_start[block + 1] += n_comp; 1446 ++counter; 1447 } 1448 mcell_start += block_size; 1449 } 1450 counter = 0; 1451 unsigned int counter_macro = 0; 1452 unsigned int block_size_last = 1453 n_macro_cells - block_size * (n_blocks - 1); 1454 if (block_size_last == 0) 1455 block_size_last = block_size; 1456 1457 unsigned int tick = 0; 1458 for (unsigned int block = 0; block < n_blocks; block++) 1459 { 1460 unsigned int present_block = partition_2layers_list[block]; 1461 for (unsigned int cell = block_start[present_block]; 1462 cell < block_start[present_block + 1]; 1463 ++cell) 1464 renumbering[counter++] = renumbering_in[cell]; 1465 unsigned int this_block_size = 1466 (present_block == n_blocks - 1) ? block_size_last : block_size; 1467 1468 // Also re-compute the content of cell_partition_data to 1469 // contain the numbers of cells, not blocks 1470 if (cell_partition_data[tick] == block) 1471 cell_partition_data[tick++] = counter_macro; 1472 1473 for (unsigned int j = 0; j < this_block_size; j++) 1474 irregular[counter_macro++] = 1475 irregular_cells[present_block * block_size + j]; 1476 } 1477 AssertDimension(tick + 1, cell_partition_data.size()); 1478 cell_partition_data.back() = counter_macro; 1479 1480 irregular_cells.swap(irregular); 1481 AssertDimension(counter, n_active_cells); 1482 AssertDimension(counter_macro, n_macro_cells); 1483 // check that the renumbering is one-to-one 1484 #ifdef DEBUG 1485 { 1486 std::vector<unsigned int> sorted_renumbering(renumbering); 1487 std::sort(sorted_renumbering.begin(), sorted_renumbering.end()); 1488 for (unsigned int i = 0; i < sorted_renumbering.size(); ++i) 1489 Assert(sorted_renumbering[i] == i, ExcInternalError()); 1490 } 1491 #endif 1492 } 1493 1494 // Update the task_info with the more information for the thread graph. 1495 update_task_info(partition); 1496 } 1497 1498 1499 1500 void make_thread_graph_partition_partition(const std::vector<unsigned int> & cell_active_fe_index,DynamicSparsityPattern & connectivity,std::vector<unsigned int> & renumbering,std::vector<unsigned char> & irregular_cells,const bool hp_bool)1501 TaskInfo::make_thread_graph_partition_partition( 1502 const std::vector<unsigned int> &cell_active_fe_index, 1503 DynamicSparsityPattern & connectivity, 1504 std::vector<unsigned int> & renumbering, 1505 std::vector<unsigned char> & irregular_cells, 1506 const bool hp_bool) 1507 { 1508 const unsigned int n_macro_cells = *(cell_partition_data.end() - 2); 1509 if (n_macro_cells == 0) 1510 return; 1511 1512 const unsigned int cluster_size = block_size * vectorization_length; 1513 1514 // Create cell-block partitioning. 1515 1516 // For each block of cells, this variable saves to which partitions the 1517 // block belongs. Initialize all to n_macro_cells to mark them as not 1518 // yet assigned a partition. 1519 std::vector<unsigned int> cell_partition(n_active_cells, 1520 numbers::invalid_unsigned_int); 1521 1522 1523 // In element j of this variable, one puts the old number of the block 1524 // that should be the jth block in the new numeration. 1525 std::vector<unsigned int> partition_list(n_active_cells, 0); 1526 std::vector<unsigned int> partition_partition_list(n_active_cells, 0); 1527 1528 // This vector points to the start of each partition. 1529 std::vector<unsigned int> partition_size(2, 0); 1530 1531 unsigned int partition = 0; 1532 // Here, we do not block inside the connectivity graph 1533 // blocking_connectivity = false; 1534 1535 // Make the partitioning of the first layer of the blocks of cells. 1536 make_partitioning(connectivity, 1537 cluster_size, 1538 cell_partition, 1539 partition_list, 1540 partition_size, 1541 partition); 1542 1543 // Partition within partitions. 1544 make_partitioning_within_partitions_post_blocked(connectivity, 1545 cell_active_fe_index, 1546 partition, 1547 cluster_size, 1548 hp_bool, 1549 cell_partition, 1550 partition_list, 1551 partition_size, 1552 partition_partition_list, 1553 irregular_cells); 1554 1555 partition_list.swap(renumbering); 1556 1557 for (unsigned int j = 0; j < renumbering.size(); j++) 1558 renumbering[j] = partition_list[partition_partition_list[j]]; 1559 1560 for (unsigned int i = 0; i < n_ghost_cells; ++i) 1561 renumbering.push_back(i + n_active_cells); 1562 1563 update_task_info(partition); 1564 } 1565 1566 1567 1568 void make_connectivity_cells_to_blocks(const std::vector<unsigned char> & irregular_cells,const DynamicSparsityPattern & connectivity_cells,DynamicSparsityPattern & connectivity_blocks) const1569 TaskInfo::make_connectivity_cells_to_blocks( 1570 const std::vector<unsigned char> &irregular_cells, 1571 const DynamicSparsityPattern & connectivity_cells, 1572 DynamicSparsityPattern & connectivity_blocks) const 1573 { 1574 std::vector<std::vector<unsigned int>> cell_blocks(n_blocks); 1575 std::vector<unsigned int> touched_cells(n_active_cells); 1576 unsigned int cell = 0; 1577 for (unsigned int i = 0, mcell = 0; i < n_blocks; ++i) 1578 { 1579 for (unsigned int c = 0; 1580 c < block_size && mcell < *(cell_partition_data.end() - 2); 1581 ++c, ++mcell) 1582 { 1583 unsigned int ncomp = (irregular_cells[mcell] > 0) ? 1584 irregular_cells[mcell] : 1585 vectorization_length; 1586 for (unsigned int c = 0; c < ncomp; ++c, ++cell) 1587 { 1588 cell_blocks[i].push_back(cell); 1589 touched_cells[cell] = i; 1590 } 1591 } 1592 } 1593 AssertDimension(cell, n_active_cells); 1594 for (unsigned int i = 0; i < cell_blocks.size(); ++i) 1595 for (unsigned int col = 0; col < cell_blocks[i].size(); ++col) 1596 { 1597 for (DynamicSparsityPattern::iterator it = 1598 connectivity_cells.begin(cell_blocks[i][col]); 1599 it != connectivity_cells.end(cell_blocks[i][col]); 1600 ++it) 1601 { 1602 if (touched_cells[it->column()] != i) 1603 connectivity_blocks.add(i, touched_cells[it->column()]); 1604 } 1605 } 1606 } 1607 1608 1609 1610 // Function to create partitioning on the second layer within each 1611 // partition. Version without preblocking. 1612 void make_partitioning_within_partitions_post_blocked(const DynamicSparsityPattern & connectivity,const std::vector<unsigned int> & cell_active_fe_index,const unsigned int partition,const unsigned int cluster_size,const bool hp_bool,const std::vector<unsigned int> & cell_partition,const std::vector<unsigned int> & partition_list,const std::vector<unsigned int> & partition_size,std::vector<unsigned int> & partition_partition_list,std::vector<unsigned char> & irregular_cells)1613 TaskInfo::make_partitioning_within_partitions_post_blocked( 1614 const DynamicSparsityPattern & connectivity, 1615 const std::vector<unsigned int> &cell_active_fe_index, 1616 const unsigned int partition, 1617 const unsigned int cluster_size, 1618 const bool hp_bool, 1619 const std::vector<unsigned int> &cell_partition, 1620 const std::vector<unsigned int> &partition_list, 1621 const std::vector<unsigned int> &partition_size, 1622 std::vector<unsigned int> & partition_partition_list, 1623 std::vector<unsigned char> & irregular_cells) 1624 { 1625 const unsigned int n_macro_cells = *(cell_partition_data.end() - 2); 1626 const unsigned int n_ghost_slots = 1627 *(cell_partition_data.end() - 1) - n_macro_cells; 1628 1629 // List of cells in previous partition 1630 std::vector<unsigned int> neighbor_list; 1631 // List of cells in current partition for use as neighbors in next 1632 // partition 1633 std::vector<unsigned int> neighbor_neighbor_list; 1634 1635 std::vector<unsigned int> renumbering(n_active_cells); 1636 1637 irregular_cells.back() = 0; 1638 irregular_cells.resize(n_active_cells + n_ghost_slots); 1639 1640 unsigned int max_fe_index = 0; 1641 for (const unsigned int fe_index : cell_active_fe_index) 1642 max_fe_index = std::max(fe_index, max_fe_index); 1643 1644 Assert(!hp_bool || cell_active_fe_index.size() == n_active_cells, 1645 ExcInternalError()); 1646 1647 { 1648 unsigned int n_macro_cells_before = 0; 1649 // Create partitioning within partitions. 1650 1651 // For each block of cells, this variable saves to which partitions 1652 // the block belongs. Initialize all to n_macro_cells to mark them as 1653 // not yet assigned a partition. 1654 std::vector<unsigned int> cell_partition_l2( 1655 n_active_cells, numbers::invalid_unsigned_int); 1656 partition_row_index.clear(); 1657 partition_row_index.resize(partition + 1, 0); 1658 cell_partition_data.resize(1, 0); 1659 1660 unsigned int counter = 0; 1661 unsigned int missing_macros; 1662 for (unsigned int part = 0; part < partition; ++part) 1663 { 1664 neighbor_neighbor_list.resize(0); 1665 neighbor_list.resize(0); 1666 bool work = true; 1667 unsigned int partition_l2 = 0; 1668 unsigned int start_up = partition_size[part]; 1669 unsigned int partition_counter = 0; 1670 while (work) 1671 { 1672 if (neighbor_list.size() == 0) 1673 { 1674 work = false; 1675 partition_counter = 0; 1676 for (unsigned int j = start_up; 1677 j < partition_size[part + 1]; 1678 ++j) 1679 if (cell_partition[partition_list[j]] == part && 1680 cell_partition_l2[partition_list[j]] == 1681 numbers::invalid_unsigned_int) 1682 { 1683 start_up = j; 1684 work = true; 1685 partition_counter = 1; 1686 // To start up, set the start_up cell to partition 1687 // and list all its neighbors. 1688 AssertIndexRange(start_up, partition_size[part + 1]); 1689 cell_partition_l2[partition_list[start_up]] = 1690 partition_l2; 1691 neighbor_neighbor_list.push_back( 1692 partition_list[start_up]); 1693 partition_partition_list[counter++] = 1694 partition_list[start_up]; 1695 start_up++; 1696 break; 1697 } 1698 } 1699 else 1700 { 1701 partition_counter = 0; 1702 for (const unsigned int neighbor : neighbor_list) 1703 { 1704 Assert(cell_partition[neighbor] == part, 1705 ExcInternalError()); 1706 Assert(cell_partition_l2[neighbor] == partition_l2 - 1, 1707 ExcInternalError()); 1708 auto neighbor_it = connectivity.begin(neighbor); 1709 const auto end_it = connectivity.end(neighbor); 1710 for (; neighbor_it != end_it; ++neighbor_it) 1711 { 1712 if (cell_partition[neighbor_it->column()] == part && 1713 cell_partition_l2[neighbor_it->column()] == 1714 numbers::invalid_unsigned_int) 1715 { 1716 cell_partition_l2[neighbor_it->column()] = 1717 partition_l2; 1718 neighbor_neighbor_list.push_back( 1719 neighbor_it->column()); 1720 partition_partition_list[counter++] = 1721 neighbor_it->column(); 1722 partition_counter++; 1723 } 1724 } 1725 } 1726 } 1727 if (partition_counter > 0) 1728 { 1729 int index_before = neighbor_neighbor_list.size(), 1730 index = index_before; 1731 { 1732 // put the cells into separate lists for each FE index 1733 // within one partition-partition 1734 missing_macros = 0; 1735 std::vector<unsigned int> remaining_per_macro_cell( 1736 max_fe_index + 1); 1737 std::vector<std::vector<unsigned int>> 1738 renumbering_fe_index; 1739 unsigned int cell; 1740 bool filled = true; 1741 if (hp_bool == true) 1742 { 1743 renumbering_fe_index.resize(max_fe_index + 1); 1744 for (cell = counter - partition_counter; 1745 cell < counter; 1746 ++cell) 1747 { 1748 renumbering_fe_index 1749 [cell_active_fe_index.empty() ? 1750 0 : 1751 cell_active_fe_index 1752 [partition_partition_list[cell]]] 1753 .push_back(partition_partition_list[cell]); 1754 } 1755 // check how many more cells are needed in the lists 1756 for (unsigned int j = 0; j < max_fe_index + 1; j++) 1757 { 1758 remaining_per_macro_cell[j] = 1759 renumbering_fe_index[j].size() % 1760 vectorization_length; 1761 if (remaining_per_macro_cell[j] != 0) 1762 filled = false; 1763 missing_macros += 1764 ((renumbering_fe_index[j].size() + 1765 vectorization_length - 1) / 1766 vectorization_length); 1767 } 1768 } 1769 else 1770 { 1771 remaining_per_macro_cell.resize(1); 1772 remaining_per_macro_cell[0] = 1773 partition_counter % vectorization_length; 1774 missing_macros = 1775 partition_counter / vectorization_length; 1776 if (remaining_per_macro_cell[0] != 0) 1777 { 1778 filled = false; 1779 missing_macros++; 1780 } 1781 } 1782 missing_macros = 1783 cluster_size - (missing_macros % cluster_size); 1784 1785 // now we realized that there are some cells missing. 1786 while (missing_macros > 0 || filled == false) 1787 { 1788 if (index == 0) 1789 { 1790 index = neighbor_neighbor_list.size(); 1791 if (index == index_before) 1792 { 1793 if (missing_macros != 0) 1794 { 1795 neighbor_neighbor_list.resize(0); 1796 } 1797 start_up--; 1798 break; // not connected - start again 1799 } 1800 index_before = index; 1801 } 1802 index--; 1803 unsigned int additional = 1804 neighbor_neighbor_list[index]; 1805 1806 // go through the neighbors of the last cell in the 1807 // current partition and check if we find some to 1808 // fill up with. 1809 DynamicSparsityPattern::iterator neighbor = 1810 connectivity.begin( 1811 additional), 1812 end = 1813 connectivity.end( 1814 additional); 1815 for (; neighbor != end; ++neighbor) 1816 { 1817 if (cell_partition[neighbor->column()] == part && 1818 cell_partition_l2[neighbor->column()] == 1819 numbers::invalid_unsigned_int) 1820 { 1821 unsigned int this_index = 0; 1822 if (hp_bool == true) 1823 this_index = 1824 cell_active_fe_index.empty() ? 1825 0 : 1826 cell_active_fe_index[neighbor 1827 ->column()]; 1828 1829 // Only add this cell if we need more macro 1830 // cells in the current block or if there is 1831 // a macro cell with the FE index that is 1832 // not yet fully populated 1833 if (missing_macros > 0 || 1834 remaining_per_macro_cell[this_index] > 0) 1835 { 1836 cell_partition_l2[neighbor->column()] = 1837 partition_l2; 1838 neighbor_neighbor_list.push_back( 1839 neighbor->column()); 1840 if (hp_bool == true) 1841 renumbering_fe_index[this_index] 1842 .push_back(neighbor->column()); 1843 partition_partition_list[counter] = 1844 neighbor->column(); 1845 counter++; 1846 partition_counter++; 1847 if (remaining_per_macro_cell 1848 [this_index] == 0 && 1849 missing_macros > 0) 1850 missing_macros--; 1851 remaining_per_macro_cell[this_index]++; 1852 if (remaining_per_macro_cell 1853 [this_index] == 1854 vectorization_length) 1855 { 1856 remaining_per_macro_cell[this_index] = 1857 0; 1858 } 1859 if (missing_macros == 0) 1860 { 1861 filled = true; 1862 for (unsigned int fe_ind = 0; 1863 fe_ind < max_fe_index + 1; 1864 ++fe_ind) 1865 if (remaining_per_macro_cell 1866 [fe_ind] != 0) 1867 filled = false; 1868 } 1869 if (filled == true) 1870 break; 1871 } 1872 } 1873 } 1874 } 1875 if (hp_bool == true) 1876 { 1877 // set the renumbering according to their active FE 1878 // index within one partition-partition which was 1879 // implicitly assumed above 1880 cell = counter - partition_counter; 1881 for (unsigned int j = 0; j < max_fe_index + 1; j++) 1882 { 1883 for (const unsigned int jj : 1884 renumbering_fe_index[j]) 1885 renumbering[cell++] = jj; 1886 if (renumbering_fe_index[j].size() % 1887 vectorization_length != 1888 0) 1889 irregular_cells[renumbering_fe_index[j].size() / 1890 vectorization_length + 1891 n_macro_cells_before] = 1892 renumbering_fe_index[j].size() % 1893 vectorization_length; 1894 n_macro_cells_before += 1895 (renumbering_fe_index[j].size() + 1896 vectorization_length - 1) / 1897 vectorization_length; 1898 renumbering_fe_index[j].resize(0); 1899 } 1900 } 1901 else 1902 { 1903 n_macro_cells_before += 1904 partition_counter / vectorization_length; 1905 if (partition_counter % vectorization_length != 0) 1906 { 1907 irregular_cells[n_macro_cells_before] = 1908 partition_counter % vectorization_length; 1909 n_macro_cells_before++; 1910 } 1911 } 1912 } 1913 cell_partition_data.push_back(n_macro_cells_before); 1914 partition_l2++; 1915 } 1916 neighbor_list = neighbor_neighbor_list; 1917 neighbor_neighbor_list.resize(0); 1918 } 1919 partition_row_index[part + 1] = 1920 partition_row_index[part] + partition_l2; 1921 } 1922 } 1923 if (hp_bool == true) 1924 { 1925 partition_partition_list.swap(renumbering); 1926 } 1927 } 1928 1929 1930 1931 // Function to create coloring on the second layer within each partition. 1932 // Version assumes preblocking. 1933 void make_coloring_within_partitions_pre_blocked(const DynamicSparsityPattern & connectivity,const unsigned int partition,const std::vector<unsigned int> & cell_partition,const std::vector<unsigned int> & partition_list,const std::vector<unsigned int> & partition_size,std::vector<unsigned int> & partition_color_list)1934 TaskInfo::make_coloring_within_partitions_pre_blocked( 1935 const DynamicSparsityPattern & connectivity, 1936 const unsigned int partition, 1937 const std::vector<unsigned int> &cell_partition, 1938 const std::vector<unsigned int> &partition_list, 1939 const std::vector<unsigned int> &partition_size, 1940 std::vector<unsigned int> & partition_color_list) 1941 { 1942 const unsigned int n_macro_cells = *(cell_partition_data.end() - 2); 1943 std::vector<unsigned int> cell_color(n_blocks, n_macro_cells); 1944 std::vector<bool> color_finder; 1945 1946 partition_row_index.resize(partition + 1); 1947 cell_partition_data.clear(); 1948 unsigned int color_counter = 0, index_counter = 0; 1949 for (unsigned int part = 0; part < partition; part++) 1950 { 1951 partition_row_index[part] = index_counter; 1952 unsigned int max_color = 0; 1953 for (unsigned int k = partition_size[part]; 1954 k < partition_size[part + 1]; 1955 k++) 1956 { 1957 unsigned int cell = partition_list[k]; 1958 unsigned int n_neighbors = connectivity.row_length(cell); 1959 1960 // In the worst case, each neighbor has a different color. So we 1961 // find at least one available color between 0 and n_neighbors. 1962 color_finder.resize(n_neighbors + 1); 1963 for (unsigned int j = 0; j <= n_neighbors; ++j) 1964 color_finder[j] = true; 1965 DynamicSparsityPattern::iterator neighbor = 1966 connectivity.begin(cell), 1967 end = connectivity.end(cell); 1968 for (; neighbor != end; ++neighbor) 1969 { 1970 // Mark the color that a neighbor within the partition has 1971 // as taken 1972 if (cell_partition[neighbor->column()] == part && 1973 cell_color[neighbor->column()] <= n_neighbors) 1974 color_finder[cell_color[neighbor->column()]] = false; 1975 } 1976 // Choose the smallest color that is not taken for the block 1977 cell_color[cell] = 0; 1978 while (color_finder[cell_color[cell]] == false) 1979 cell_color[cell]++; 1980 if (cell_color[cell] > max_color) 1981 max_color = cell_color[cell]; 1982 } 1983 // Reorder within partition: First, all blocks that belong the 0 and 1984 // then so on until those with color max (Note that the smaller the 1985 // number the larger the partition) 1986 for (unsigned int color = 0; color <= max_color; color++) 1987 { 1988 cell_partition_data.push_back(color_counter); 1989 index_counter++; 1990 for (unsigned int k = partition_size[part]; 1991 k < partition_size[part + 1]; 1992 k++) 1993 { 1994 unsigned int cell = partition_list[k]; 1995 if (cell_color[cell] == color) 1996 { 1997 partition_color_list[color_counter++] = cell; 1998 } 1999 } 2000 } 2001 } 2002 cell_partition_data.push_back(n_blocks); 2003 partition_row_index[partition] = index_counter; 2004 AssertDimension(color_counter, n_blocks); 2005 } 2006 2007 2008 // Function to create partitioning on the first layer. 2009 void make_partitioning(const DynamicSparsityPattern & connectivity,const unsigned int cluster_size,std::vector<unsigned int> & cell_partition,std::vector<unsigned int> & partition_list,std::vector<unsigned int> & partition_size,unsigned int & partition) const2010 TaskInfo::make_partitioning(const DynamicSparsityPattern &connectivity, 2011 const unsigned int cluster_size, 2012 std::vector<unsigned int> & cell_partition, 2013 std::vector<unsigned int> & partition_list, 2014 std::vector<unsigned int> & partition_size, 2015 unsigned int & partition) const 2016 2017 { 2018 // For each block of cells, this variable saves to which partitions the 2019 // block belongs. Initialize all to n_macro_cells to mark them as not 2020 // yet assigned a partition. 2021 // std::vector<unsigned int> cell_partition (n_active_cells, 2022 // numbers::invalid_unsigned_int); 2023 // List of cells in previous partition 2024 std::vector<unsigned int> neighbor_list; 2025 // List of cells in current partition for use as neighbors in next 2026 // partition 2027 std::vector<unsigned int> neighbor_neighbor_list; 2028 2029 // In element j of this variable, one puts the old number of the block 2030 // that should be the jth block in the new numeration. 2031 // std::vector<unsigned int> partition_list(n_active_cells,0); 2032 2033 // This vector points to the start of each partition. 2034 // std::vector<unsigned int> partition_size(2,0); 2035 2036 partition = 0; 2037 unsigned int counter = 0; 2038 unsigned int start_nonboundary = 2039 cell_partition_data.size() == 5 ? 2040 vectorization_length * 2041 (cell_partition_data[2] - cell_partition_data[1]) : 2042 0; 2043 2044 const unsigned int n_macro_cells = *(cell_partition_data.end() - 2); 2045 if (n_macro_cells == 0) 2046 return; 2047 if (scheme == color) 2048 start_nonboundary = n_macro_cells; 2049 if (scheme == partition_color || 2050 scheme == color) // blocking_connectivity == true 2051 start_nonboundary = ((start_nonboundary + block_size - 1) / block_size); 2052 unsigned int n_blocks; 2053 if (scheme == partition_color || 2054 scheme == color) // blocking_connectivity == true 2055 n_blocks = this->n_blocks; 2056 else 2057 n_blocks = n_active_cells; 2058 2059 if (start_nonboundary > n_blocks) 2060 start_nonboundary = n_blocks; 2061 2062 2063 unsigned int start_up = 0; 2064 bool work = true; 2065 unsigned int remainder = cluster_size; 2066 2067 // this performs a classical breath-first search in the connectivity 2068 // graph of the cells under the restriction that the size of the 2069 // partitions should be a multiple of the given block size 2070 while (work) 2071 { 2072 // put the cells with neighbors on remote MPI processes up front 2073 if (start_nonboundary > 0) 2074 { 2075 for (unsigned int cell = 0; cell < start_nonboundary; ++cell) 2076 { 2077 const unsigned int cell_nn = cell; 2078 cell_partition[cell_nn] = partition; 2079 neighbor_list.push_back(cell_nn); 2080 partition_list[counter++] = cell_nn; 2081 partition_size.back()++; 2082 } 2083 start_nonboundary = 0; 2084 remainder -= (start_nonboundary % cluster_size); 2085 if (remainder == cluster_size) 2086 remainder = 0; 2087 } 2088 else 2089 { 2090 // To start up, set the start_up cell to partition and list all 2091 // its neighbors. 2092 cell_partition[start_up] = partition; 2093 neighbor_list.push_back(start_up); 2094 partition_list[counter++] = start_up; 2095 partition_size.back()++; 2096 start_up++; 2097 remainder--; 2098 if (remainder == cluster_size) 2099 remainder = 0; 2100 } 2101 int index_before = neighbor_list.size(), index = index_before, 2102 index_stop = 0; 2103 while (remainder > 0) 2104 { 2105 if (index == index_stop) 2106 { 2107 index = neighbor_list.size(); 2108 if (index == index_before) 2109 { 2110 neighbor_list.resize(0); 2111 goto not_connect; 2112 } 2113 index_stop = index_before; 2114 index_before = index; 2115 } 2116 index--; 2117 unsigned int additional = neighbor_list[index]; 2118 DynamicSparsityPattern::iterator neighbor = 2119 connectivity.begin(additional), 2120 end = 2121 connectivity.end(additional); 2122 for (; neighbor != end; ++neighbor) 2123 { 2124 if (cell_partition[neighbor->column()] == 2125 numbers::invalid_unsigned_int) 2126 { 2127 partition_size.back()++; 2128 cell_partition[neighbor->column()] = partition; 2129 neighbor_list.push_back(neighbor->column()); 2130 partition_list[counter++] = neighbor->column(); 2131 remainder--; 2132 if (remainder == 0) 2133 break; 2134 } 2135 } 2136 } 2137 2138 while (neighbor_list.size() > 0) 2139 { 2140 partition++; 2141 2142 // counter for number of cells so far in current partition 2143 unsigned int partition_counter = 0; 2144 2145 // Mark the start of the new partition 2146 partition_size.push_back(partition_size.back()); 2147 2148 // Loop through the list of cells in previous partition and put 2149 // all their neighbors in current partition 2150 for (const unsigned int cell : neighbor_list) 2151 { 2152 Assert(cell_partition[cell] == partition - 1, 2153 ExcInternalError()); 2154 auto neighbor = connectivity.begin(cell); 2155 const auto end = connectivity.end(cell); 2156 for (; neighbor != end; ++neighbor) 2157 { 2158 if (cell_partition[neighbor->column()] == 2159 numbers::invalid_unsigned_int) 2160 { 2161 partition_size.back()++; 2162 cell_partition[neighbor->column()] = partition; 2163 2164 // collect the cells of the current partition for 2165 // use as neighbors in next partition 2166 neighbor_neighbor_list.push_back(neighbor->column()); 2167 partition_list[counter++] = neighbor->column(); 2168 partition_counter++; 2169 } 2170 } 2171 } 2172 remainder = cluster_size - (partition_counter % cluster_size); 2173 if (remainder == cluster_size) 2174 remainder = 0; 2175 int index_stop = 0; 2176 int index_before = neighbor_neighbor_list.size(), 2177 index = index_before; 2178 while (remainder > 0) 2179 { 2180 if (index == index_stop) 2181 { 2182 index = neighbor_neighbor_list.size(); 2183 if (index == index_before) 2184 { 2185 neighbor_neighbor_list.resize(0); 2186 break; 2187 } 2188 index_stop = index_before; 2189 index_before = index; 2190 } 2191 index--; 2192 unsigned int additional = neighbor_neighbor_list[index]; 2193 DynamicSparsityPattern::iterator neighbor = 2194 connectivity.begin( 2195 additional), 2196 end = connectivity.end( 2197 additional); 2198 for (; neighbor != end; ++neighbor) 2199 { 2200 if (cell_partition[neighbor->column()] == 2201 numbers::invalid_unsigned_int) 2202 { 2203 partition_size.back()++; 2204 cell_partition[neighbor->column()] = partition; 2205 neighbor_neighbor_list.push_back(neighbor->column()); 2206 partition_list[counter++] = neighbor->column(); 2207 remainder--; 2208 if (remainder == 0) 2209 break; 2210 } 2211 } 2212 } 2213 2214 neighbor_list = neighbor_neighbor_list; 2215 neighbor_neighbor_list.resize(0); 2216 } 2217 not_connect: 2218 // One has to check if the graph is not connected so we have to find 2219 // another partition. 2220 work = false; 2221 for (unsigned int j = start_up; j < n_blocks; ++j) 2222 if (cell_partition[j] == numbers::invalid_unsigned_int) 2223 { 2224 start_up = j; 2225 work = true; 2226 if (remainder == 0) 2227 remainder = cluster_size; 2228 break; 2229 } 2230 } 2231 if (remainder != 0) 2232 partition++; 2233 2234 AssertDimension(partition_size[partition], n_blocks); 2235 } 2236 2237 2238 void update_task_info(const unsigned int partition)2239 TaskInfo::update_task_info(const unsigned int partition) 2240 { 2241 evens = (partition + 1) / 2; 2242 odds = partition / 2; 2243 n_blocked_workers = odds - (odds + evens + 1) % 2; 2244 n_workers = evens + odds - n_blocked_workers; 2245 // From here only used for partition partition option. 2246 partition_evens.resize(partition); 2247 partition_odds.resize(partition); 2248 partition_n_blocked_workers.resize(partition); 2249 partition_n_workers.resize(partition); 2250 for (unsigned int part = 0; part < partition; part++) 2251 { 2252 partition_evens[part] = 2253 (partition_row_index[part + 1] - partition_row_index[part] + 1) / 2; 2254 partition_odds[part] = 2255 (partition_row_index[part + 1] - partition_row_index[part]) / 2; 2256 partition_n_blocked_workers[part] = 2257 partition_odds[part] - 2258 (partition_odds[part] + partition_evens[part] + 1) % 2; 2259 partition_n_workers[part] = partition_evens[part] + 2260 partition_odds[part] - 2261 partition_n_blocked_workers[part]; 2262 } 2263 } 2264 } // namespace MatrixFreeFunctions 2265 } // namespace internal 2266 2267 2268 2269 // explicit instantiations of template functions 2270 template void 2271 internal::MatrixFreeFunctions::TaskInfo::print_memory_statistics<std::ostream>( 2272 std::ostream &, 2273 const std::size_t) const; 2274 template void 2275 internal::MatrixFreeFunctions::TaskInfo::print_memory_statistics< 2276 ConditionalOStream>(ConditionalOStream &, const std::size_t) const; 2277 2278 2279 DEAL_II_NAMESPACE_CLOSE 2280