// ---------------------------------------------------------------------
//
// Copyright (C) 2011 - 2020 by the deal.II authors
//
// This file is part of the deal.II library.
//
// The deal.II library is free software; you can use it, redistribute
// it, and/or modify it under the terms of the GNU Lesser General
// Public License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// The full text of the license can be found in the file LICENSE.md at
// the top level directory of deal.II.
//
// ---------------------------------------------------------------------

#ifndef dealii_la_parallel_vector_templates_h
#define dealii_la_parallel_vector_templates_h


#include <deal.II/base/config.h>

#include <deal.II/base/cuda.h>
#include <deal.II/base/cuda_size.h>

#include <deal.II/lac/exceptions.h>
#include <deal.II/lac/la_parallel_vector.h>
#include <deal.II/lac/petsc_vector.h>
#include <deal.II/lac/read_write_vector.h>
#include <deal.II/lac/trilinos_vector.h>
#include <deal.II/lac/vector_operations_internal.h>

#include <memory>


DEAL_II_NAMESPACE_OPEN


namespace LinearAlgebra
{
  namespace distributed
  {
    namespace internal
    {
      // In import_from_ghosted_array_finish() we might need to calculate
      // the maximal and minimal value for the given number type, which is
      // not straightforward for complex numbers. Therefore, comparison of
      // complex numbers is prohibited and throws an exception.
      template <typename Number>
      Number
      get_min(const Number a, const Number b)
      {
        return std::min(a, b);
      }

      template <typename Number>
      std::complex<Number>
      get_min(const std::complex<Number> a, const std::complex<Number>)
      {
        AssertThrow(false,
                    ExcMessage("VectorOperation::min not "
                               "implemented for complex numbers"));
        return a;
      }

      template <typename Number>
      Number
      get_max(const Number a, const Number b)
      {
        return std::max(a, b);
      }

      template <typename Number>
      std::complex<Number>
      get_max(const std::complex<Number> a, const std::complex<Number>)
      {
        AssertThrow(false,
                    ExcMessage("VectorOperation::max not "
                               "implemented for complex numbers"));
        return a;
      }
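
      // These helpers are dispatched from the element-wise loops in
      // Vector::import() further down in this file (see the
      // VectorOperation::min/max cases there): for real scalars they
      // reduce to std::min/std::max, whereas complex scalars admit no
      // total order, so any attempt to form a min or max aborts with
      // the exception above.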



      // Resize the underlying array on the host or on the device
      template <typename Number, typename MemorySpaceType>
      struct la_parallel_vector_templates_functions
      {
        static_assert(std::is_same<MemorySpaceType, MemorySpace::Host>::value ||
                        std::is_same<MemorySpaceType, MemorySpace::CUDA>::value,
                      "MemorySpace should be Host or CUDA");

        static void
        resize_val(
          const types::global_dof_index /*new_alloc_size*/,
          types::global_dof_index & /*allocated_size*/,
          ::dealii::MemorySpace::MemorySpaceData<Number, MemorySpaceType>
            & /*data*/)
        {}

        static void
        import(
          const ::dealii::LinearAlgebra::ReadWriteVector<Number> & /*V*/,
          ::dealii::VectorOperation::values /*operation*/,
          const std::shared_ptr<const ::dealii::Utilities::MPI::Partitioner> &
          /*communication_pattern*/,
          const IndexSet & /*locally_owned_elem*/,
          ::dealii::MemorySpace::MemorySpaceData<Number, MemorySpaceType>
            & /*data*/)
        {}

        template <typename RealType>
        static void
        linfty_norm_local(
          const ::dealii::MemorySpace::MemorySpaceData<Number, MemorySpaceType>
            & /*data*/,
          const unsigned int /*size*/,
          RealType & /*max*/)
        {}
      };

      template <typename Number>
      struct la_parallel_vector_templates_functions<Number,
                                                    ::dealii::MemorySpace::Host>
      {
        using size_type = types::global_dof_index;

        static void
        resize_val(const types::global_dof_index new_alloc_size,
                   types::global_dof_index &     allocated_size,
                   ::dealii::MemorySpace::
                     MemorySpaceData<Number, ::dealii::MemorySpace::Host> &data)
        {
          if (new_alloc_size > allocated_size)
            {
              Assert(((allocated_size > 0 && data.values != nullptr) ||
                      data.values == nullptr),
                     ExcInternalError());

              Number *new_val;
              Utilities::System::posix_memalign(
                reinterpret_cast<void **>(&new_val),
                64,
                sizeof(Number) * new_alloc_size);
              data.values = {new_val, [](Number *data) { std::free(data); }};

              allocated_size = new_alloc_size;
            }
          else if (new_alloc_size == 0)
            {
              data.values.reset();
              allocated_size = 0;
            }
        }
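
        // A note on the allocation above (an implementation detail, not
        // part of the interface): the 64-byte alignment requested from
        // posix_memalign() is chosen so that vectorized loops over
        // data.values can use aligned SIMD loads and stores up to the
        // widest supported instruction sets (e.g. AVX-512).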

        static void
        import(
          const ::dealii::LinearAlgebra::ReadWriteVector<Number> &V,
          ::dealii::VectorOperation::values                       operation,
          const std::shared_ptr<const ::dealii::Utilities::MPI::Partitioner>
            &             communication_pattern,
          const IndexSet &locally_owned_elem,
          ::dealii::MemorySpace::MemorySpaceData<Number,
                                                 ::dealii::MemorySpace::Host>
            &data)
        {
          Assert(
            (operation == ::dealii::VectorOperation::add) ||
              (operation == ::dealii::VectorOperation::insert),
            ExcMessage(
              "Only VectorOperation::add and VectorOperation::insert are allowed"));

          ::dealii::LinearAlgebra::distributed::
            Vector<Number, ::dealii::MemorySpace::Host>
              tmp_vector(communication_pattern);

          // fill entries from ReadWriteVector into the distributed vector,
          // including ghost entries. this is not really efficient right now
          // because indices are translated twice, once by nth_index_in_set(i)
          // and once for operator() of tmp_vector
          const IndexSet &v_stored = V.get_stored_elements();
          for (size_type i = 0; i < v_stored.n_elements(); ++i)
            tmp_vector(v_stored.nth_index_in_set(i)) = V.local_element(i);

          tmp_vector.compress(operation);

          // Copy the local elements of tmp_vector to the right place in val
          IndexSet tmp_index_set = tmp_vector.locally_owned_elements();
          if (operation == VectorOperation::add)
            {
              for (size_type i = 0; i < tmp_index_set.n_elements(); ++i)
                {
                  data.values[locally_owned_elem.index_within_set(
                    tmp_index_set.nth_index_in_set(i))] +=
                    tmp_vector.local_element(i);
                }
            }
          else
            {
              for (size_type i = 0; i < tmp_index_set.n_elements(); ++i)
                {
                  data.values[locally_owned_elem.index_within_set(
                    tmp_index_set.nth_index_in_set(i))] =
                    tmp_vector.local_element(i);
                }
            }
        }
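
        // A minimal usage sketch of the code path above (hypothetical
        // names, assuming `part` is an initialized
        // std::shared_ptr<const Utilities::MPI::Partitioner> and `owned`
        // the locally owned IndexSet); this is what a call to
        // Vector::import() with a host vector boils down to:
        //
        //   LinearAlgebra::ReadWriteVector<double> rw(owned);
        //   // ... fill rw ...
        //   LinearAlgebra::distributed::Vector<double> vec(part);
        //   vec.import(rw, VectorOperation::insert);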

        template <typename RealType>
        static void
        linfty_norm_local(const ::dealii::MemorySpace::MemorySpaceData<
                            Number,
                            ::dealii::MemorySpace::Host> &data,
                          const unsigned int              size,
                          RealType &                      max)
        {
          for (size_type i = 0; i < size; ++i)
            max =
              std::max(numbers::NumberTraits<Number>::abs(data.values[i]), max);
        }
      };

#ifdef DEAL_II_COMPILER_CUDA_AWARE
      template <typename Number>
      struct la_parallel_vector_templates_functions<Number,
                                                    ::dealii::MemorySpace::CUDA>
      {
        using size_type = types::global_dof_index;

        static void
        resize_val(const types::global_dof_index new_alloc_size,
                   types::global_dof_index &     allocated_size,
                   ::dealii::MemorySpace::
                     MemorySpaceData<Number, ::dealii::MemorySpace::CUDA> &data)
        {
          static_assert(
            std::is_same<Number, float>::value ||
              std::is_same<Number, double>::value,
            "Number should be float or double for CUDA memory space");

          if (new_alloc_size > allocated_size)
            {
              Assert(((allocated_size > 0 && data.values_dev != nullptr) ||
                      data.values_dev == nullptr),
                     ExcInternalError());

              Number *new_val_dev;
              Utilities::CUDA::malloc(new_val_dev, new_alloc_size);
              data.values_dev.reset(new_val_dev);

              allocated_size = new_alloc_size;
            }
          else if (new_alloc_size == 0)
            {
              data.values_dev.reset();
              allocated_size = 0;
            }
        }

        static void
        import(const ReadWriteVector<Number> &V,
               VectorOperation::values        operation,
               std::shared_ptr<const Utilities::MPI::Partitioner>
                               communication_pattern,
               const IndexSet &locally_owned_elem,
               ::dealii::MemorySpace::
                 MemorySpaceData<Number, ::dealii::MemorySpace::CUDA> &data)
        {
          Assert(
            (operation == ::dealii::VectorOperation::add) ||
              (operation == ::dealii::VectorOperation::insert),
            ExcMessage(
              "Only VectorOperation::add and VectorOperation::insert are allowed"));

          ::dealii::LinearAlgebra::distributed::
            Vector<Number, ::dealii::MemorySpace::CUDA>
              tmp_vector(communication_pattern);

          // fill entries from ReadWriteVector into the distributed vector,
          // including ghost entries. this is not really efficient right now
          // because indices are translated twice, once by nth_index_in_set(i)
          // and once for operator() of tmp_vector
          const IndexSet &       v_stored   = V.get_stored_elements();
          const size_type        n_elements = v_stored.n_elements();
          std::vector<size_type> indices(n_elements);
          for (size_type i = 0; i < n_elements; ++i)
            indices[i] = communication_pattern->global_to_local(
              v_stored.nth_index_in_set(i));
          // Move the indices to the device
          size_type *indices_dev;
          ::dealii::Utilities::CUDA::malloc(indices_dev, n_elements);
          ::dealii::Utilities::CUDA::copy_to_dev(indices, indices_dev);
          // Move the data to the device
          Number *V_dev;
          ::dealii::Utilities::CUDA::malloc(V_dev, n_elements);
          cudaError_t cuda_error_code = cudaMemcpy(V_dev,
                                                   V.begin(),
                                                   n_elements * sizeof(Number),
                                                   cudaMemcpyHostToDevice);
          AssertCuda(cuda_error_code);

          // Set the values in tmp_vector
          const int n_blocks =
            1 + n_elements / (::dealii::CUDAWrappers::chunk_size *
                              ::dealii::CUDAWrappers::block_size);
          ::dealii::LinearAlgebra::CUDAWrappers::kernel::set_permutated<Number>
            <<<n_blocks, ::dealii::CUDAWrappers::block_size>>>(
              indices_dev, tmp_vector.begin(), V_dev, n_elements);

          tmp_vector.compress(operation);

          // Copy the local elements of tmp_vector to the right place in val
          IndexSet        tmp_index_set  = tmp_vector.locally_owned_elements();
          const size_type tmp_n_elements = tmp_index_set.n_elements();
          indices.resize(tmp_n_elements);
          for (size_type i = 0; i < tmp_n_elements; ++i)
            indices[i] = locally_owned_elem.index_within_set(
              tmp_index_set.nth_index_in_set(i));
          ::dealii::Utilities::CUDA::free(indices_dev);
          ::dealii::Utilities::CUDA::malloc(indices_dev, tmp_n_elements);
          ::dealii::Utilities::CUDA::copy_to_dev(indices, indices_dev);

          if (operation == VectorOperation::add)
            ::dealii::LinearAlgebra::CUDAWrappers::kernel::add_permutated<
              Number><<<n_blocks, ::dealii::CUDAWrappers::block_size>>>(
              indices_dev,
              data.values_dev.get(),
              tmp_vector.begin(),
              tmp_n_elements);
          else
            ::dealii::LinearAlgebra::CUDAWrappers::kernel::set_permutated<
              Number><<<n_blocks, ::dealii::CUDAWrappers::block_size>>>(
              indices_dev,
              data.values_dev.get(),
              tmp_vector.begin(),
              tmp_n_elements);

          ::dealii::Utilities::CUDA::free(indices_dev);
          ::dealii::Utilities::CUDA::free(V_dev);
        }

        template <typename RealType>
        static void
        linfty_norm_local(const ::dealii::MemorySpace::MemorySpaceData<
                            Number,
                            ::dealii::MemorySpace::CUDA> &data,
                          const unsigned int              size,
                          RealType &                      result)
        {
          static_assert(std::is_same<Number, RealType>::value,
                        "RealType should be the same type as Number");

          Number *    result_device;
          cudaError_t error_code = cudaMalloc(&result_device, sizeof(Number));
          AssertCuda(error_code);
          error_code = cudaMemset(result_device, 0, sizeof(Number));
          AssertCuda(error_code);

          const int n_blocks = 1 + size / (::dealii::CUDAWrappers::chunk_size *
                                           ::dealii::CUDAWrappers::block_size);
          ::dealii::LinearAlgebra::CUDAWrappers::kernel::reduction<
            Number,
            ::dealii::LinearAlgebra::CUDAWrappers::kernel::LInfty<Number>>
            <<<dim3(n_blocks, 1), dim3(::dealii::CUDAWrappers::block_size)>>>(
              result_device, data.values_dev.get(), size);

          // Copy the result back to the host
          error_code = cudaMemcpy(&result,
                                  result_device,
                                  sizeof(Number),
                                  cudaMemcpyDeviceToHost);
          AssertCuda(error_code);
          // Free the memory on the device
          error_code = cudaFree(result_device);
          AssertCuda(error_code);
        }
      };
#endif
    } // namespace internal


    template <typename Number, typename MemorySpaceType>
    void
    Vector<Number, MemorySpaceType>::clear_mpi_requests()
    {
#ifdef DEAL_II_WITH_MPI
      for (auto &compress_request : compress_requests)
        {
          const int ierr = MPI_Request_free(&compress_request);
          AssertThrowMPI(ierr);
        }
      compress_requests.clear();
      for (auto &update_ghost_values_request : update_ghost_values_requests)
        {
          const int ierr = MPI_Request_free(&update_ghost_values_request);
          AssertThrowMPI(ierr);
        }
      update_ghost_values_requests.clear();
#endif
    }



    template <typename Number, typename MemorySpaceType>
    void
    Vector<Number, MemorySpaceType>::resize_val(const size_type new_alloc_size)
    {
      internal::la_parallel_vector_templates_functions<
        Number,
        MemorySpaceType>::resize_val(new_alloc_size, allocated_size, data);

      thread_loop_partitioner =
        std::make_shared<::dealii::parallel::internal::TBBPartitioner>();
    }



    template <typename Number, typename MemorySpaceType>
    void
    Vector<Number, MemorySpaceType>::reinit(const size_type size,
                                            const bool omit_zeroing_entries)
    {
      clear_mpi_requests();

      // check whether we need to reallocate
      resize_val(size);

      // delete previous content in import data
      import_data.values.reset();
      import_data.values_dev.reset();

      // set partitioner to serial version
      partitioner = std::make_shared<Utilities::MPI::Partitioner>(size);

      // set entries to zero if so requested
      if (omit_zeroing_entries == false)
        this->operator=(Number());
      else
        zero_out_ghosts();
    }
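
    // A hedged usage sketch of the serial reinit() above: the resulting
    // partitioner owns the whole index range locally and performs no MPI
    // communication, so this overload is mainly useful for serial
    // computations and tests:
    //
    //   LinearAlgebra::distributed::Vector<double> v;
    //   v.reinit(100); // 100 locally stored entries, zeroed out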



    template <typename Number, typename MemorySpaceType>
    template <typename Number2>
    void
    Vector<Number, MemorySpaceType>::reinit(
      const Vector<Number2, MemorySpaceType> &v,
      const bool                              omit_zeroing_entries)
    {
      clear_mpi_requests();
      Assert(v.partitioner.get() != nullptr, ExcNotInitialized());

      // check whether the partitioners are
      // different (check only if they are allocated
      // differently, not if the actual data is
      // different)
      if (partitioner.get() != v.partitioner.get())
        {
          partitioner = v.partitioner;
          const size_type new_allocated_size =
            partitioner->local_size() + partitioner->n_ghost_indices();
          resize_val(new_allocated_size);
        }

      if (omit_zeroing_entries == false)
        this->operator=(Number());
      else
        zero_out_ghosts();

      // do not reallocate import_data directly, but only upon request. It
      // is only used as temporary storage for compress() and
      // update_ghost_values, and we might have vectors where we never
      // call these methods and hence do not need to have the storage.
      import_data.values.reset();
      import_data.values_dev.reset();

      thread_loop_partitioner = v.thread_loop_partitioner;
    }



    template <typename Number, typename MemorySpaceType>
    void
    Vector<Number, MemorySpaceType>::reinit(
      const IndexSet &locally_owned_indices,
      const IndexSet &ghost_indices,
      const MPI_Comm  communicator)
    {
      // set up parallel partitioner with index sets and communicator
      reinit(std::make_shared<Utilities::MPI::Partitioner>(
        locally_owned_indices, ghost_indices, communicator));
    }
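
    // A hedged usage sketch of the ghosted reinit() above, assuming the
    // IndexSet objects `owned` and `ghosts` describe the current MPI rank:
    //
    //   LinearAlgebra::distributed::Vector<double> v;
    //   v.reinit(owned, ghosts, MPI_COMM_WORLD);
    //
    // The vector then stores the locally owned range followed by the
    // ghost entries in one contiguous array.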



    template <typename Number, typename MemorySpaceType>
    void
    Vector<Number, MemorySpaceType>::reinit(
      const IndexSet &locally_owned_indices,
      const MPI_Comm  communicator)
    {
      // set up parallel partitioner with index sets and communicator
      reinit(
        std::make_shared<Utilities::MPI::Partitioner>(locally_owned_indices,
                                                      communicator));
    }



    template <typename Number, typename MemorySpaceType>
    void
    Vector<Number, MemorySpaceType>::reinit(
      const std::shared_ptr<const Utilities::MPI::Partitioner> &partitioner_in)
    {
      clear_mpi_requests();
      partitioner = partitioner_in;

      // set vector size and allocate memory
      const size_type new_allocated_size =
        partitioner->local_size() + partitioner->n_ghost_indices();
      resize_val(new_allocated_size);

      // initialize to zero
      *this = Number();


      // do not reallocate import_data directly, but only upon request. It
      // is only used as temporary storage for compress() and
      // update_ghost_values, and we might have vectors where we never
      // call these methods and hence do not need to have the storage.
      import_data.values.reset();
      import_data.values_dev.reset();

      vector_is_ghosted = false;
    }
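
    // Vectors built on the same Partitioner object share its (potentially
    // expensive) communication setup, so reusing one shared_ptr for
    // several vectors, which the reinit() overload above enables, is
    // usually preferable to repeating the IndexSet based setup; a hedged
    // sketch:
    //
    //   const auto part = std::make_shared<Utilities::MPI::Partitioner>(
    //     owned, ghosts, MPI_COMM_WORLD);
    //   LinearAlgebra::distributed::Vector<double> u(part), v(part);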



    template <typename Number, typename MemorySpaceType>
    Vector<Number, MemorySpaceType>::Vector()
      : partitioner(std::make_shared<Utilities::MPI::Partitioner>())
      , allocated_size(0)
    {
      reinit(0);
    }



    template <typename Number, typename MemorySpaceType>
    Vector<Number, MemorySpaceType>::Vector(
      const Vector<Number, MemorySpaceType> &v)
      : Subscriptor()
      , allocated_size(0)
      , vector_is_ghosted(false)
    {
      reinit(v, true);

      thread_loop_partitioner = v.thread_loop_partitioner;

      const size_type this_size = local_size();
      if (this_size > 0)
        {
          dealii::internal::VectorOperations::
            functions<Number, Number, MemorySpaceType>::copy(
              thread_loop_partitioner, partitioner->local_size(), v.data, data);
        }
    }



    template <typename Number, typename MemorySpaceType>
    Vector<Number, MemorySpaceType>::Vector(const IndexSet &local_range,
                                            const IndexSet &ghost_indices,
                                            const MPI_Comm  communicator)
      : allocated_size(0)
      , vector_is_ghosted(false)
    {
      reinit(local_range, ghost_indices, communicator);
    }



    template <typename Number, typename MemorySpaceType>
    Vector<Number, MemorySpaceType>::Vector(const IndexSet &local_range,
                                            const MPI_Comm  communicator)
      : allocated_size(0)
      , vector_is_ghosted(false)
    {
      reinit(local_range, communicator);
    }



    template <typename Number, typename MemorySpaceType>
    Vector<Number, MemorySpaceType>::Vector(const size_type size)
      : allocated_size(0)
      , vector_is_ghosted(false)
    {
      reinit(size, false);
    }



    template <typename Number, typename MemorySpaceType>
    Vector<Number, MemorySpaceType>::Vector(
      const std::shared_ptr<const Utilities::MPI::Partitioner> &partitioner)
      : allocated_size(0)
      , vector_is_ghosted(false)
    {
      reinit(partitioner);
    }



    template <typename Number, typename MemorySpaceType>
    inline Vector<Number, MemorySpaceType>::~Vector()
    {
      try
        {
          clear_mpi_requests();
        }
      catch (...)
        {}
    }



    template <typename Number, typename MemorySpaceType>
    inline Vector<Number, MemorySpaceType> &
    Vector<Number, MemorySpaceType>::
    operator=(const Vector<Number, MemorySpaceType> &c)
    {
#ifdef _MSC_VER
      return this->operator=<Number>(c);
#else
      return this->template operator=<Number>(c);
#endif
    }



    template <typename Number, typename MemorySpaceType>
    template <typename Number2>
    inline Vector<Number, MemorySpaceType> &
    Vector<Number, MemorySpaceType>::
    operator=(const Vector<Number2, MemorySpaceType> &c)
    {
      Assert(c.partitioner.get() != nullptr, ExcNotInitialized());

      // we update ghost values whenever one of the input or output vectors
      // already held ghost values or when we import data from a vector with
      // the same local range but a different ghost layout
      bool must_update_ghost_values = c.vector_is_ghosted;

      // check whether the two vectors use the same parallel partitioner. if
      // not, check if all local ranges are the same (that way, we can
      // exchange data between different parallel layouts). One variant which
      // is included here and necessary for compatibility with the other
      // distributed vector classes (Trilinos, PETSc) is the case when vector
      // c does not have any ghosts (constructed without ghost elements given)
      // but the current vector does: In that case, we need to exchange data
      // even when neither of the two vectors had updated its ghost values.
      if (partitioner.get() == nullptr)
        reinit(c, true);
      else if (partitioner.get() != c.partitioner.get())
        {
          // local ranges are also the same if both partitioners are empty
          // (even if they happen to define the empty range as [0,0) or [c,c)
          // for some c!=0 in a different way).
          int local_ranges_are_identical =
            (partitioner->local_range() == c.partitioner->local_range() ||
             (partitioner->local_range().second ==
                partitioner->local_range().first &&
              c.partitioner->local_range().second ==
                c.partitioner->local_range().first));
          if ((c.partitioner->n_mpi_processes() > 1 &&
               Utilities::MPI::min(local_ranges_are_identical,
                                   c.partitioner->get_mpi_communicator()) ==
                 0) ||
              !local_ranges_are_identical)
            reinit(c, true);
          else
            must_update_ghost_values |= vector_is_ghosted;

          must_update_ghost_values |=
            (c.partitioner->ghost_indices_initialized() == false &&
             partitioner->ghost_indices_initialized() == true);
        }
      else
        must_update_ghost_values |= vector_is_ghosted;

      thread_loop_partitioner = c.thread_loop_partitioner;

      const size_type this_size = partitioner->local_size();
      if (this_size > 0)
        {
          dealii::internal::VectorOperations::
            functions<Number, Number2, MemorySpaceType>::copy(
              thread_loop_partitioner, this_size, c.data, data);
        }

      if (must_update_ghost_values)
        update_ghost_values();
      else
        zero_out_ghosts();
      return *this;
    }



    template <typename Number, typename MemorySpaceType>
    template <typename Number2>
    void
    Vector<Number, MemorySpaceType>::copy_locally_owned_data_from(
      const Vector<Number2, MemorySpaceType> &src)
    {
      AssertDimension(partitioner->local_size(), src.partitioner->local_size());
      if (partitioner->local_size() > 0)
        {
          dealii::internal::VectorOperations::
            functions<Number, Number2, MemorySpaceType>::copy(
              thread_loop_partitioner,
              partitioner->local_size(),
              src.data,
              data);
        }
    }



    template <typename Number, typename MemorySpaceType>
    template <typename MemorySpaceType2>
    void
    Vector<Number, MemorySpaceType>::import(
      const Vector<Number, MemorySpaceType2> &src,
      VectorOperation::values                 operation)
    {
      Assert(src.partitioner.get() != nullptr, ExcNotInitialized());
      Assert(partitioner->locally_owned_range() ==
               src.partitioner->locally_owned_range(),
             ExcMessage("Locally owned indices should be identical."));
      Assert(partitioner->ghost_indices() == src.partitioner->ghost_indices(),
             ExcMessage("Ghost indices should be identical."));
      ::dealii::internal::VectorOperations::
        functions<Number, Number, MemorySpaceType>::import(
          thread_loop_partitioner, allocated_size, operation, src.data, data);
    }



    template <typename Number, typename MemorySpaceType>
    void
    Vector<Number, MemorySpaceType>::compress(
      ::dealii::VectorOperation::values operation)
    {
      compress_start(0, operation);
      compress_finish(operation);
    }
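
    // A hedged usage sketch of compress(): after adding contributions to
    // entries owned by other ranks (for example during assembly into
    // ghost entries), compress(VectorOperation::add) sends them to the
    // owner and accumulates them there:
    //
    //   v(ghosted_index) += cell_contribution;
    //   v.compress(VectorOperation::add);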



    template <typename Number, typename MemorySpaceType>
    void
    Vector<Number, MemorySpaceType>::update_ghost_values() const
    {
      update_ghost_values_start();
      update_ghost_values_finish();
    }
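
    // The counterpart of compress(): after update_ghost_values(), the
    // ghost entries mirror the values held by their owning ranks and may
    // be read until they are invalidated again (e.g. by zero_out_ghosts()
    // or by writing into the vector); a hedged sketch:
    //
    //   v.update_ghost_values();
    //   const double g = v(ghosted_index); // now well defined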



    template <typename Number, typename MemorySpaceType>
    void
    Vector<Number, MemorySpaceType>::zero_out_ghosts() const
    {
      if (data.values != nullptr)
        std::fill_n(data.values.get() + partitioner->local_size(),
                    partitioner->n_ghost_indices(),
                    Number());
#ifdef DEAL_II_COMPILER_CUDA_AWARE
      if (data.values_dev != nullptr)
        {
          const cudaError_t cuda_error_code =
            cudaMemset(data.values_dev.get() + partitioner->local_size(),
                       0,
                       partitioner->n_ghost_indices() * sizeof(Number));
          AssertCuda(cuda_error_code);
        }
#endif

      vector_is_ghosted = false;
    }
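
    // Note that zero_out_ghosts() only touches the ghost range
    // [local_size(), local_size() + n_ghost_indices()) of the locally
    // stored array, on the host and, if allocated, on the device, and
    // afterwards marks the vector as non-ghosted.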



    template <typename Number, typename MemorySpaceType>
    void
    Vector<Number, MemorySpaceType>::compress_start(
      const unsigned int                communication_channel,
      ::dealii::VectorOperation::values operation)
    {
      AssertIndexRange(communication_channel, 200);
      Assert(vector_is_ghosted == false,
             ExcMessage("Cannot call compress() on a ghosted vector"));

#ifdef DEAL_II_WITH_MPI
      // make this function thread safe
      std::lock_guard<std::mutex> lock(mutex);

      // allocate import_data in case it is not set up yet
      if (partitioner->n_import_indices() > 0)
        {
#  if defined(DEAL_II_COMPILER_CUDA_AWARE) && \
    defined(DEAL_II_MPI_WITH_CUDA_SUPPORT)
          if (std::is_same<MemorySpaceType, dealii::MemorySpace::CUDA>::value)
            {
              if (import_data.values_dev == nullptr)
                import_data.values_dev.reset(
                  Utilities::CUDA::allocate_device_data<Number>(
                    partitioner->n_import_indices()));
            }
          else
#  endif
            {
#  if !defined(DEAL_II_COMPILER_CUDA_AWARE) && \
    defined(DEAL_II_MPI_WITH_CUDA_SUPPORT)
              static_assert(
                std::is_same<MemorySpaceType, dealii::MemorySpace::Host>::value,
                "This code path should only be compiled for CUDA-aware-MPI for MemorySpace::Host!");
#  endif
              if (import_data.values == nullptr)
                {
                  Number *new_val;
                  Utilities::System::posix_memalign(
                    reinterpret_cast<void **>(&new_val),
                    64,
                    sizeof(Number) * partitioner->n_import_indices());
                  import_data.values.reset(new_val);
                }
            }
        }

#  if defined DEAL_II_COMPILER_CUDA_AWARE && \
    !defined(DEAL_II_MPI_WITH_CUDA_SUPPORT)
      if (std::is_same<MemorySpaceType, dealii::MemorySpace::CUDA>::value)
        {
          // Move the data to the host and then move it back to the
          // device. We use values to store the elements because the function
          // uses a view of the array and thus we need the data on the host to
          // outlive the scope of the function.
          Number *new_val;
          Utilities::System::posix_memalign(reinterpret_cast<void **>(&new_val),
                                            64,
                                            sizeof(Number) * allocated_size);

          data.values = {new_val, [](Number *data) { std::free(data); }};

          cudaError_t cuda_error_code =
            cudaMemcpy(data.values.get(),
                       data.values_dev.get(),
                       allocated_size * sizeof(Number),
                       cudaMemcpyDeviceToHost);
          AssertCuda(cuda_error_code);
        }
#  endif

#  if defined(DEAL_II_COMPILER_CUDA_AWARE) && \
    defined(DEAL_II_MPI_WITH_CUDA_SUPPORT)
      if (std::is_same<MemorySpaceType, dealii::MemorySpace::CUDA>::value)
        {
          partitioner->import_from_ghosted_array_start(
            operation,
            communication_channel,
            ArrayView<Number, MemorySpace::CUDA>(
              data.values_dev.get() + partitioner->local_size(),
              partitioner->n_ghost_indices()),
            ArrayView<Number, MemorySpace::CUDA>(
              import_data.values_dev.get(), partitioner->n_import_indices()),
            compress_requests);
        }
      else
#  endif
        {
          partitioner->import_from_ghosted_array_start(
            operation,
            communication_channel,
            ArrayView<Number, MemorySpace::Host>(
              data.values.get() + partitioner->local_size(),
              partitioner->n_ghost_indices()),
            ArrayView<Number, MemorySpace::Host>(
              import_data.values.get(), partitioner->n_import_indices()),
            compress_requests);
        }
#else
      (void)communication_channel;
      (void)operation;
#endif
    }



    template <typename Number, typename MemorySpaceType>
    void
    Vector<Number, MemorySpaceType>::compress_finish(
      ::dealii::VectorOperation::values operation)
    {
#ifdef DEAL_II_WITH_MPI
      vector_is_ghosted = false;

      // in order to zero the ghost part of the vector, we need to call
      // import_from_ghosted_array_finish() regardless of whether
      // compress_requests.size() == 0

      // make this function thread safe
      std::lock_guard<std::mutex> lock(mutex);
#  if defined(DEAL_II_COMPILER_CUDA_AWARE) && \
    defined(DEAL_II_MPI_WITH_CUDA_SUPPORT)
      if (std::is_same<MemorySpaceType, MemorySpace::CUDA>::value)
        {
          Assert(partitioner->n_import_indices() == 0 ||
                   import_data.values_dev != nullptr,
                 ExcNotInitialized());
          partitioner
            ->import_from_ghosted_array_finish<Number, MemorySpace::CUDA>(
              operation,
              ArrayView<const Number, MemorySpace::CUDA>(
                import_data.values_dev.get(), partitioner->n_import_indices()),
              ArrayView<Number, MemorySpace::CUDA>(data.values_dev.get(),
                                                   partitioner->local_size()),
              ArrayView<Number, MemorySpace::CUDA>(
                data.values_dev.get() + partitioner->local_size(),
                partitioner->n_ghost_indices()),
              compress_requests);
        }
      else
#  endif
        {
          Assert(partitioner->n_import_indices() == 0 ||
                   import_data.values != nullptr,
                 ExcNotInitialized());
          partitioner
            ->import_from_ghosted_array_finish<Number, MemorySpace::Host>(
              operation,
              ArrayView<const Number, MemorySpace::Host>(
                import_data.values.get(), partitioner->n_import_indices()),
              ArrayView<Number, MemorySpace::Host>(data.values.get(),
                                                   partitioner->local_size()),
              ArrayView<Number, MemorySpace::Host>(
                data.values.get() + partitioner->local_size(),
                partitioner->n_ghost_indices()),
              compress_requests);
        }

#  if defined DEAL_II_COMPILER_CUDA_AWARE && \
    !defined DEAL_II_MPI_WITH_CUDA_SUPPORT
      // The communication is done on the host, so we need to
      // move the data back to the device.
      if (std::is_same<MemorySpaceType, MemorySpace::CUDA>::value)
        {
          cudaError_t cuda_error_code =
            cudaMemcpy(data.values_dev.get(),
                       data.values.get(),
                       allocated_size * sizeof(Number),
                       cudaMemcpyHostToDevice);
          AssertCuda(cuda_error_code);

          data.values.reset();
        }
#  endif
#else
      (void)operation;
#endif
    }



    template <typename Number, typename MemorySpaceType>
    void
    Vector<Number, MemorySpaceType>::update_ghost_values_start(
      const unsigned int communication_channel) const
    {
      AssertIndexRange(communication_channel, 200);
#ifdef DEAL_II_WITH_MPI
      // nothing to do when we have neither import nor ghost indices.
      if (partitioner->n_ghost_indices() == 0 &&
          partitioner->n_import_indices() == 0)
        return;

      // make this function thread safe
      std::lock_guard<std::mutex> lock(mutex);

      // allocate import_data in case it is not set up yet
      if (partitioner->n_import_indices() > 0)
        {
#  if defined(DEAL_II_COMPILER_CUDA_AWARE) && \
    defined(DEAL_II_MPI_WITH_CUDA_SUPPORT)
          Assert(
            (std::is_same<MemorySpaceType, dealii::MemorySpace::CUDA>::value),
            ExcMessage(
              "Using MemorySpace::CUDA only allowed if the code is compiled with a CUDA compiler!"));
          if (import_data.values_dev == nullptr)
            import_data.values_dev.reset(
              Utilities::CUDA::allocate_device_data<Number>(
                partitioner->n_import_indices()));
#  else
#    ifdef DEAL_II_MPI_WITH_CUDA_SUPPORT
          static_assert(
            std::is_same<MemorySpaceType, dealii::MemorySpace::Host>::value,
            "This code path should only be compiled for CUDA-aware-MPI for MemorySpace::Host!");
#    endif
          if (import_data.values == nullptr)
            {
              Number *new_val;
              Utilities::System::posix_memalign(
                reinterpret_cast<void **>(&new_val),
                64,
                sizeof(Number) * partitioner->n_import_indices());
              import_data.values.reset(new_val);
            }
#  endif
        }

#  if defined DEAL_II_COMPILER_CUDA_AWARE && \
    !defined(DEAL_II_MPI_WITH_CUDA_SUPPORT)
      // Move the data to the host and then move it back to the
      // device. We use values to store the elements because the function
      // uses a view of the array and thus we need the data on the host to
      // outlive the scope of the function.
      Number *new_val;
      Utilities::System::posix_memalign(reinterpret_cast<void **>(&new_val),
                                        64,
                                        sizeof(Number) * allocated_size);

      data.values = {new_val, [](Number *data) { std::free(data); }};

      cudaError_t cuda_error_code = cudaMemcpy(data.values.get(),
                                               data.values_dev.get(),
                                               allocated_size * sizeof(Number),
                                               cudaMemcpyDeviceToHost);
      AssertCuda(cuda_error_code);
#  endif

#  if !(defined(DEAL_II_COMPILER_CUDA_AWARE) && \
        defined(DEAL_II_MPI_WITH_CUDA_SUPPORT))
      partitioner->export_to_ghosted_array_start<Number, MemorySpace::Host>(
        communication_channel,
        ArrayView<const Number, MemorySpace::Host>(data.values.get(),
                                                   partitioner->local_size()),
        ArrayView<Number, MemorySpace::Host>(import_data.values.get(),
                                             partitioner->n_import_indices()),
        ArrayView<Number, MemorySpace::Host>(data.values.get() +
                                               partitioner->local_size(),
                                             partitioner->n_ghost_indices()),
        update_ghost_values_requests);
#  else
      partitioner->export_to_ghosted_array_start<Number, MemorySpace::CUDA>(
        communication_channel,
        ArrayView<const Number, MemorySpace::CUDA>(data.values_dev.get(),
                                                   partitioner->local_size()),
        ArrayView<Number, MemorySpace::CUDA>(import_data.values_dev.get(),
                                             partitioner->n_import_indices()),
        ArrayView<Number, MemorySpace::CUDA>(data.values_dev.get() +
                                               partitioner->local_size(),
                                             partitioner->n_ghost_indices()),
        update_ghost_values_requests);
#  endif

#else
      (void)communication_channel;
#endif
    }



    template <typename Number, typename MemorySpaceType>
    void
    Vector<Number, MemorySpaceType>::update_ghost_values_finish() const
    {
#ifdef DEAL_II_WITH_MPI
      // wait for both sends and receives to complete, even though only
      // receives are really necessary. this gives (much) better performance
      AssertDimension(partitioner->ghost_targets().size() +
                        partitioner->import_targets().size(),
                      update_ghost_values_requests.size());
      if (update_ghost_values_requests.size() > 0)
        {
          // make this function thread safe
          std::lock_guard<std::mutex> lock(mutex);

#  if !(defined(DEAL_II_COMPILER_CUDA_AWARE) && \
        defined(DEAL_II_MPI_WITH_CUDA_SUPPORT))
          partitioner->export_to_ghosted_array_finish(
            ArrayView<Number, MemorySpace::Host>(
              data.values.get() + partitioner->local_size(),
              partitioner->n_ghost_indices()),
            update_ghost_values_requests);
#  else
          partitioner->export_to_ghosted_array_finish(
            ArrayView<Number, MemorySpace::CUDA>(
              data.values_dev.get() + partitioner->local_size(),
              partitioner->n_ghost_indices()),
            update_ghost_values_requests);
#  endif
        }

#  if defined DEAL_II_COMPILER_CUDA_AWARE && \
    !defined DEAL_II_MPI_WITH_CUDA_SUPPORT
      // The communication is done on the host, so we need to
      // move the data back to the device.
      if (std::is_same<MemorySpaceType, MemorySpace::CUDA>::value)
        {
          cudaError_t cuda_error_code =
            cudaMemcpy(data.values_dev.get() + partitioner->local_size(),
                       data.values.get() + partitioner->local_size(),
                       partitioner->n_ghost_indices() * sizeof(Number),
                       cudaMemcpyHostToDevice);
          AssertCuda(cuda_error_code);

          data.values.reset();
        }
#  endif

#endif
      vector_is_ghosted = true;
    }



    template <typename Number, typename MemorySpaceType>
    void
    Vector<Number, MemorySpaceType>::import(
      const ReadWriteVector<Number> &                 V,
      VectorOperation::values                         operation,
      std::shared_ptr<const CommunicationPatternBase> communication_pattern)
    {
      // If no communication pattern is given, create one. Otherwise, use the
      // given one.
      std::shared_ptr<const Utilities::MPI::Partitioner> comm_pattern;
      if (communication_pattern.get() == nullptr)
        {
          // Split the IndexSet of V in locally owned elements and ghost indices
          // then create the communication pattern
          IndexSet locally_owned_elem = locally_owned_elements();
          IndexSet ghost_indices      = V.get_stored_elements();
          ghost_indices.subtract_set(locally_owned_elem);
          comm_pattern = std::make_shared<Utilities::MPI::Partitioner>(
            locally_owned_elem, ghost_indices, get_mpi_communicator());
        }
      else
        {
          comm_pattern =
            std::dynamic_pointer_cast<const Utilities::MPI::Partitioner>(
              communication_pattern);
          AssertThrow(comm_pattern != nullptr,
                      ExcMessage("The communication pattern is not of type "
                                 "Utilities::MPI::Partitioner."));
        }
      Vector<Number, ::dealii::MemorySpace::Host> tmp_vector(comm_pattern);

      data.copy_to(tmp_vector.begin(), local_size());

      // fill entries from ReadWriteVector into the distributed vector,
      // including ghost entries. this is not really efficient right now
      // because indices are translated twice, once by nth_index_in_set(i) and
      // once for operator() of tmp_vector
      const IndexSet &v_stored     = V.get_stored_elements();
      const size_type v_n_elements = v_stored.n_elements();
      switch (operation)
        {
          case VectorOperation::insert:
            {
              for (size_type i = 0; i < v_n_elements; ++i)
                tmp_vector(v_stored.nth_index_in_set(i)) = V.local_element(i);

              break;
            }
          case VectorOperation::add:
            {
              for (size_type i = 0; i < v_n_elements; ++i)
                tmp_vector(v_stored.nth_index_in_set(i)) += V.local_element(i);

              break;
            }
          case VectorOperation::min:
            {
              for (size_type i = 0; i < v_n_elements; ++i)
                tmp_vector(v_stored.nth_index_in_set(i)) =
                  internal::get_min(tmp_vector(v_stored.nth_index_in_set(i)),
                                    V.local_element(i));

              break;
            }
          case VectorOperation::max:
            {
              for (size_type i = 0; i < v_n_elements; ++i)
                tmp_vector(v_stored.nth_index_in_set(i)) =
                  internal::get_max(tmp_vector(v_stored.nth_index_in_set(i)),
                                    V.local_element(i));

              break;
            }
          default:
            {
              Assert(false, ExcMessage("This operation is not supported."));
            }
        }
      tmp_vector.compress(operation);

      data.copy_from(tmp_vector.begin(), local_size());
    }
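
    // As the comments above point out, this import() routes the data
    // through a temporary host vector and translates indices twice, so it
    // is meant for setup and I/O paths rather than inner loops. A hedged
    // sketch that reuses a communication pattern across repeated imports
    // (hypothetical names `owned`, `ghosts`, and `rw_vector`):
    //
    //   const auto pattern = std::make_shared<Utilities::MPI::Partitioner>(
    //     owned, ghosts, MPI_COMM_WORLD);
    //   v.import(rw_vector, VectorOperation::add, pattern);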

    template <typename Number, typename MemorySpaceType>
    void
    Vector<Number, MemorySpaceType>::swap(Vector<Number, MemorySpaceType> &v)
    {
#ifdef DEAL_II_WITH_MPI

#  ifdef DEBUG
      if (Utilities::MPI::job_supports_mpi())
        {
          // make sure that there are no outstanding requests from updating
          // ghost values or compress
          int flag = 1;
          if (update_ghost_values_requests.size() > 0)
            {
              const int ierr = MPI_Testall(update_ghost_values_requests.size(),
                                           update_ghost_values_requests.data(),
                                           &flag,
                                           MPI_STATUSES_IGNORE);
              AssertThrowMPI(ierr);
              Assert(flag == 1,
                     ExcMessage(
                       "MPI found unfinished update_ghost_values() requests "
                       "when calling swap, which is not allowed."));
            }
          if (compress_requests.size() > 0)
            {
              const int ierr = MPI_Testall(compress_requests.size(),
                                           compress_requests.data(),
                                           &flag,
                                           MPI_STATUSES_IGNORE);
              AssertThrowMPI(ierr);
              Assert(flag == 1,
                     ExcMessage("MPI found unfinished compress() requests "
                                "when calling swap, which is not allowed."));
            }
        }
#  endif

      std::swap(compress_requests, v.compress_requests);
      std::swap(update_ghost_values_requests, v.update_ghost_values_requests);
#endif

      std::swap(partitioner, v.partitioner);
      std::swap(thread_loop_partitioner, v.thread_loop_partitioner);
      std::swap(allocated_size, v.allocated_size);
      std::swap(data, v.data);
      std::swap(import_data, v.import_data);
      std::swap(vector_is_ghosted, v.vector_is_ghosted);
    }



    template <typename Number, typename MemorySpaceType>
    Vector<Number, MemorySpaceType> &
    Vector<Number, MemorySpaceType>::operator=(const Number s)
    {
      const size_type this_size = local_size();
      if (this_size > 0)
        {
          dealii::internal::VectorOperations::
            functions<Number, Number, MemorySpaceType>::set(
              thread_loop_partitioner, this_size, s, data);
        }

      // if we call Vector::operator=(0), we want to zero out all the entries
      // plus ghosts.
      if (s == Number())
        zero_out_ghosts();

      return *this;
    }
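
    // In particular, `v = 0;` is the idiomatic way to clear a vector: in
    // addition to the locally owned range it also zeroes the ghost
    // entries (via zero_out_ghosts() above), whereas assigning a non-zero
    // scalar leaves the ghost values untouched.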
1282 
1283 
1284 
1285     template <typename Number, typename MemorySpaceType>
1286     void
1287     Vector<Number, MemorySpaceType>::reinit(const VectorSpaceVector<Number> &V,
1288                                             const bool omit_zeroing_entries)
1289     {
1290       // Downcast. Throws an exception if invalid.
1291       using VectorType = Vector<Number, MemorySpaceType>;
1292       Assert(dynamic_cast<const VectorType *>(&V) != nullptr,
1293              ExcVectorTypeNotCompatible());
1294       const VectorType &down_V = dynamic_cast<const VectorType &>(V);
1295 
1296       reinit(down_V, omit_zeroing_entries);
1297     }
1298 
1299 
1300 
1301     template <typename Number, typename MemorySpaceType>
1302     Vector<Number, MemorySpaceType> &
1303     Vector<Number, MemorySpaceType>::
1304     operator+=(const VectorSpaceVector<Number> &vv)
1305     {
1306       // Downcast. Throws an exception if invalid.
1307       using VectorType = Vector<Number, MemorySpaceType>;
1308       Assert(dynamic_cast<const VectorType *>(&vv) != nullptr,
1309              ExcVectorTypeNotCompatible());
1310       const VectorType &v = dynamic_cast<const VectorType &>(vv);
1311 
1312       AssertDimension(local_size(), v.local_size());
1313 
1314       dealii::internal::VectorOperations::
1315         functions<Number, Number, MemorySpaceType>::add_vector(
1316           thread_loop_partitioner, partitioner->local_size(), v.data, data);
1317 
1318       if (vector_is_ghosted)
1319         update_ghost_values();
1320 
1321       return *this;
1322     }
1323 
1324 
1325 
1326     template <typename Number, typename MemorySpaceType>
1327     Vector<Number, MemorySpaceType> &
1328     Vector<Number, MemorySpaceType>::
1329     operator-=(const VectorSpaceVector<Number> &vv)
1330     {
1331       // Downcast. Throws an exception if invalid.
1332       using VectorType = Vector<Number, MemorySpaceType>;
1333       Assert(dynamic_cast<const VectorType *>(&vv) != nullptr,
1334              ExcVectorTypeNotCompatible());
1335       const VectorType &v = dynamic_cast<const VectorType &>(vv);
1336 
1337       AssertDimension(local_size(), v.local_size());
1338 
1339       dealii::internal::VectorOperations::
1340         functions<Number, Number, MemorySpaceType>::subtract_vector(
1341           thread_loop_partitioner, partitioner->local_size(), v.data, data);
1342 
1343       if (vector_is_ghosted)
1344         update_ghost_values();
1345 
1346       return *this;
1347     }



    template <typename Number, typename MemorySpaceType>
    void
    Vector<Number, MemorySpaceType>::add(const Number a)
    {
      AssertIsFinite(a);

      dealii::internal::VectorOperations::
        functions<Number, Number, MemorySpaceType>::add_factor(
          thread_loop_partitioner, partitioner->local_size(), a, data);

      if (vector_is_ghosted)
        update_ghost_values();
    }



    template <typename Number, typename MemorySpaceType>
    void
    Vector<Number, MemorySpaceType>::add_local(
      const Number                     a,
      const VectorSpaceVector<Number> &vv)
    {
      // Downcast. Throws an exception if invalid.
      using VectorType = Vector<Number, MemorySpaceType>;
      Assert(dynamic_cast<const VectorType *>(&vv) != nullptr,
             ExcVectorTypeNotCompatible());
      const VectorType &v = dynamic_cast<const VectorType &>(vv);

      AssertIsFinite(a);
      AssertDimension(local_size(), v.local_size());

      // nothing to do if a is zero
      if (a == Number(0.))
        return;

      dealii::internal::VectorOperations::
        functions<Number, Number, MemorySpaceType>::add_av(
          thread_loop_partitioner, partitioner->local_size(), a, v.data, data);
    }



    template <typename Number, typename MemorySpaceType>
    void
    Vector<Number, MemorySpaceType>::add(const Number                     a,
                                         const VectorSpaceVector<Number> &vv)
    {
      add_local(a, vv);

      if (vector_is_ghosted)
        update_ghost_values();
    }
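
    // Usage sketch (illustrative): this overload performs *this += a*v,
    // e.g.
    //
    //   u.add(2., v); // u_i += 2 * v_i on the locally owned range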



    template <typename Number, typename MemorySpaceType>
    void
    Vector<Number, MemorySpaceType>::add(const Number                     a,
                                         const VectorSpaceVector<Number> &vv,
                                         const Number                     b,
                                         const VectorSpaceVector<Number> &ww)
    {
      // Downcast. Throws an exception if invalid.
      using VectorType = Vector<Number, MemorySpaceType>;
      Assert(dynamic_cast<const VectorType *>(&vv) != nullptr,
             ExcVectorTypeNotCompatible());
      const VectorType &v = dynamic_cast<const VectorType &>(vv);
      Assert(dynamic_cast<const VectorType *>(&ww) != nullptr,
             ExcVectorTypeNotCompatible());
      const VectorType &w = dynamic_cast<const VectorType &>(ww);

      AssertIsFinite(a);
      AssertIsFinite(b);

      AssertDimension(local_size(), v.local_size());
      AssertDimension(local_size(), w.local_size());

      dealii::internal::VectorOperations::
        functions<Number, Number, MemorySpaceType>::add_avpbw(
          thread_loop_partitioner,
          partitioner->local_size(),
          a,
          b,
          v.data,
          w.data,
          data);

      if (vector_is_ghosted)
        update_ghost_values();
    }
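
    // Usage sketch (illustrative): this overload accumulates two scaled
    // vectors in a single sweep, *this += a*v + b*w, e.g.
    //
    //   u.add(2., v, 3., w); // u_i += 2 * v_i + 3 * w_i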



    template <typename Number, typename MemorySpaceType>
    void
    Vector<Number, MemorySpaceType>::add(const std::vector<size_type> &indices,
                                         const std::vector<Number> &   values)
    {
      AssertDimension(indices.size(), values.size());
      for (std::size_t i = 0; i < indices.size(); ++i)
        {
          this->operator()(indices[i]) += values[i];
        }
    }
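
    // Usage sketch (illustrative; the chosen global indices are assumed to
    // be stored on the calling process, since operator() only grants access
    // to locally stored entries):
    //
    //   const std::vector<types::global_dof_index> indices = {3, 17};
    //   const std::vector<double>                  values  = {1., 2.};
    //   u.add(indices, values); // u(3) += 1.; u(17) += 2.;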



    template <typename Number, typename MemorySpaceType>
    void
    Vector<Number, MemorySpaceType>::sadd(
      const Number                           x,
      const Vector<Number, MemorySpaceType> &v)
    {
      AssertIsFinite(x);
      AssertDimension(local_size(), v.local_size());

      dealii::internal::VectorOperations::
        functions<Number, Number, MemorySpaceType>::sadd_xv(
          thread_loop_partitioner, partitioner->local_size(), x, v.data, data);

      if (vector_is_ghosted)
        update_ghost_values();
    }



    template <typename Number, typename MemorySpaceType>
    void
    Vector<Number, MemorySpaceType>::sadd_local(
      const Number                     x,
      const Number                     a,
      const VectorSpaceVector<Number> &vv)
    {
      // Downcast. Throws an exception if invalid.
      using VectorType = Vector<Number, MemorySpaceType>;
      Assert((dynamic_cast<const VectorType *>(&vv) != nullptr),
             ExcVectorTypeNotCompatible());
      const VectorType &v = dynamic_cast<const VectorType &>(vv);

      AssertIsFinite(x);
      AssertIsFinite(a);
      AssertDimension(local_size(), v.local_size());

      dealii::internal::VectorOperations::
        functions<Number, Number, MemorySpaceType>::sadd_xav(
          thread_loop_partitioner,
          partitioner->local_size(),
          x,
          a,
          v.data,
          data);
    }



    template <typename Number, typename MemorySpaceType>
    void
    Vector<Number, MemorySpaceType>::sadd(const Number                     x,
                                          const Number                     a,
                                          const VectorSpaceVector<Number> &vv)
    {
      sadd_local(x, a, vv);

      if (vector_is_ghosted)
        update_ghost_values();
    }
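
    // Usage sketch (illustrative): sadd() scales the calling vector before
    // adding, i.e. *this = x * (*this) + a * v:
    //
    //   u.sadd(0.5, 2., v); // u_i = 0.5 * u_i + 2 * v_i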



    template <typename Number, typename MemorySpaceType>
    Vector<Number, MemorySpaceType> &
    Vector<Number, MemorySpaceType>::operator*=(const Number factor)
    {
      AssertIsFinite(factor);

      dealii::internal::VectorOperations::
        functions<Number, Number, MemorySpaceType>::multiply_factor(
          thread_loop_partitioner, partitioner->local_size(), factor, data);

      if (vector_is_ghosted)
        update_ghost_values();

      return *this;
    }



    template <typename Number, typename MemorySpaceType>
    Vector<Number, MemorySpaceType> &
    Vector<Number, MemorySpaceType>::operator/=(const Number factor)
    {
      operator*=(static_cast<Number>(1.) / factor);
      return *this;
    }
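
    // Note (editorial): a zero divisor turns 1/factor into a non-finite
    // number, which the AssertIsFinite check inside operator*= rejects in
    // debug builds. Usage sketch (illustrative):
    //
    //   u /= 4.; // equivalent to u *= 0.25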



    template <typename Number, typename MemorySpaceType>
    void
    Vector<Number, MemorySpaceType>::scale(const VectorSpaceVector<Number> &vv)
    {
      // Downcast. Throws an exception if invalid.
      using VectorType = Vector<Number, MemorySpaceType>;
      Assert(dynamic_cast<const VectorType *>(&vv) != nullptr,
             ExcVectorTypeNotCompatible());
      const VectorType &v = dynamic_cast<const VectorType &>(vv);

      AssertDimension(local_size(), v.local_size());

      dealii::internal::VectorOperations::
        functions<Number, Number, MemorySpaceType>::scale(
          thread_loop_partitioner, partitioner->local_size(), v.data, data);

      if (vector_is_ghosted)
        update_ghost_values();
    }
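
    // Usage sketch (illustrative): scale() multiplies entry by entry,
    // u_i *= v_i for all locally owned i:
    //
    //   u.scale(v);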



    template <typename Number, typename MemorySpaceType>
    void
    Vector<Number, MemorySpaceType>::equ(const Number                     a,
                                         const VectorSpaceVector<Number> &vv)
    {
      // Downcast. Throws an exception if invalid.
      using VectorType = Vector<Number, MemorySpaceType>;
      Assert(dynamic_cast<const VectorType *>(&vv) != nullptr,
             ExcVectorTypeNotCompatible());
      const VectorType &v = dynamic_cast<const VectorType &>(vv);

      AssertIsFinite(a);
      AssertDimension(local_size(), v.local_size());

      dealii::internal::VectorOperations::
        functions<Number, Number, MemorySpaceType>::equ_au(
          thread_loop_partitioner, partitioner->local_size(), a, v.data, data);

      if (vector_is_ghosted)
        update_ghost_values();
    }
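
    // Usage sketch (illustrative): equ() assigns a scaled copy without
    // reading the previous contents, *this = a * v:
    //
    //   u.equ(2., v); // u_i = 2 * v_i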



    template <typename Number, typename MemorySpaceType>
    bool
    Vector<Number, MemorySpaceType>::all_zero() const
    {
      // The l-infinity norm is zero exactly if every locally owned entry is
      // zero. Note that linfty_norm() is a collective operation, so this
      // function must be called on all MPI ranks.
      return linfty_norm() == 0.;
    }



    template <typename Number, typename MemorySpaceType>
    template <typename Number2>
    Number
    Vector<Number, MemorySpaceType>::inner_product_local(
      const Vector<Number2, MemorySpaceType> &v) const
    {
      if (PointerComparison::equal(this, &v))
        return norm_sqr_local();

      AssertDimension(partitioner->local_size(), v.partitioner->local_size());

      return dealii::internal::VectorOperations::
        functions<Number, Number2, MemorySpaceType>::dot(
          thread_loop_partitioner, partitioner->local_size(), v.data, data);
    }



    template <typename Number, typename MemorySpaceType>
    Number
    Vector<Number, MemorySpaceType>::
    operator*(const VectorSpaceVector<Number> &vv) const
    {
      // Downcast. Throws an exception if invalid.
      using VectorType = Vector<Number, MemorySpaceType>;
      Assert((dynamic_cast<const VectorType *>(&vv) != nullptr),
             ExcVectorTypeNotCompatible());
      const VectorType &v = dynamic_cast<const VectorType &>(vv);

      Number local_result = inner_product_local(v);
      if (partitioner->n_mpi_processes() > 1)
        return Utilities::MPI::sum(local_result,
                                   partitioner->get_mpi_communicator());
      else
        return local_result;
    }
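
    // Usage sketch (illustrative; u and v share a partitioner). The local
    // inner products are combined with an MPI sum, so every rank obtains
    // the same global value:
    //
    //   const double uv = u * v;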



    template <typename Number, typename MemorySpaceType>
    typename Vector<Number, MemorySpaceType>::real_type
    Vector<Number, MemorySpaceType>::norm_sqr_local() const
    {
      real_type sum;

      dealii::internal::VectorOperations::
        functions<Number, Number, MemorySpaceType>::norm_2(
          thread_loop_partitioner, partitioner->local_size(), sum, data);

      AssertIsFinite(sum);

      return sum;
    }



    template <typename Number, typename MemorySpaceType>
    Number
    Vector<Number, MemorySpaceType>::mean_value_local() const
    {
      Assert(size() != 0, ExcEmptyObject());

      if (partitioner->local_size() == 0)
        return Number();

      Number sum = ::dealii::internal::VectorOperations::
        functions<Number, Number, MemorySpaceType>::mean_value(
          thread_loop_partitioner, partitioner->local_size(), data);

      return sum / real_type(partitioner->local_size());
    }



    template <typename Number, typename MemorySpaceType>
    Number
    Vector<Number, MemorySpaceType>::mean_value() const
    {
      Number local_result = mean_value_local();
      if (partitioner->n_mpi_processes() > 1)
        return Utilities::MPI::sum(local_result * static_cast<real_type>(
                                                    partitioner->local_size()),
                                   partitioner->get_mpi_communicator()) /
               static_cast<real_type>(partitioner->size());
      else
        return local_result;
    }
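
    // For reference, the parallel formula implemented above: each rank
    // contributes its local mean weighted by its number of locally owned
    // entries, so
    //
    //   mean = (sum over ranks r of mean_r * n_r) / global_size,
    //
    // which equals the mean over all global entries.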



    template <typename Number, typename MemorySpaceType>
    typename Vector<Number, MemorySpaceType>::real_type
    Vector<Number, MemorySpaceType>::l1_norm_local() const
    {
      real_type sum;

      dealii::internal::VectorOperations::
        functions<Number, Number, MemorySpaceType>::norm_1(
          thread_loop_partitioner, partitioner->local_size(), sum, data);

      return sum;
    }



    template <typename Number, typename MemorySpaceType>
    typename Vector<Number, MemorySpaceType>::real_type
    Vector<Number, MemorySpaceType>::l1_norm() const
    {
      real_type local_result = l1_norm_local();
      if (partitioner->n_mpi_processes() > 1)
        return Utilities::MPI::sum(local_result,
                                   partitioner->get_mpi_communicator());
      else
        return local_result;
    }



    template <typename Number, typename MemorySpaceType>
    typename Vector<Number, MemorySpaceType>::real_type
    Vector<Number, MemorySpaceType>::norm_sqr() const
    {
      real_type local_result = norm_sqr_local();
      if (partitioner->n_mpi_processes() > 1)
        return Utilities::MPI::sum(local_result,
                                   partitioner->get_mpi_communicator());
      else
        return local_result;
    }



    template <typename Number, typename MemorySpaceType>
    typename Vector<Number, MemorySpaceType>::real_type
    Vector<Number, MemorySpaceType>::l2_norm() const
    {
      return std::sqrt(norm_sqr());
    }



    template <typename Number, typename MemorySpaceType>
    typename Vector<Number, MemorySpaceType>::real_type
    Vector<Number, MemorySpaceType>::lp_norm_local(const real_type p) const
    {
      real_type sum = 0.;

      dealii::internal::VectorOperations::
        functions<Number, Number, MemorySpaceType>::norm_p(
          thread_loop_partitioner, partitioner->local_size(), sum, p, data);

      return std::pow(sum, 1. / p);
    }



    template <typename Number, typename MemorySpaceType>
    typename Vector<Number, MemorySpaceType>::real_type
    Vector<Number, MemorySpaceType>::lp_norm(const real_type p) const
    {
      const real_type local_result = lp_norm_local(p);
      if (partitioner->n_mpi_processes() > 1)
        return std::pow(
          Utilities::MPI::sum(std::pow(local_result, p),
                              partitioner->get_mpi_communicator()),
          static_cast<real_type>(1.0 / p));
      else
        return local_result;
    }
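
    // For reference, the reduction implemented above: lp_norm_local()
    // already returns the p-th root of the local sum, so the parallel
    // version raises it back to the p-th power before the MPI sum, i.e.
    //
    //   ||u||_p = (sum over ranks r of sum_i |u_i|^p)^(1/p)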



    template <typename Number, typename MemorySpaceType>
    typename Vector<Number, MemorySpaceType>::real_type
    Vector<Number, MemorySpaceType>::linfty_norm_local() const
    {
      real_type max = 0.;

      const size_type local_size = partitioner->local_size();
      internal::la_parallel_vector_templates_functions<
        Number,
        MemorySpaceType>::linfty_norm_local(data, local_size, max);

      return max;
    }



    template <typename Number, typename MemorySpaceType>
    inline typename Vector<Number, MemorySpaceType>::real_type
    Vector<Number, MemorySpaceType>::linfty_norm() const
    {
      const real_type local_result = linfty_norm_local();
      if (partitioner->n_mpi_processes() > 1)
        return Utilities::MPI::max(local_result,
                                   partitioner->get_mpi_communicator());
      else
        return local_result;
    }
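
    // Usage sketch (illustrative): the norms are collective operations and
    // must therefore be called on every MPI rank:
    //
    //   const double n1   = u.l1_norm();     // MPI sum of local l1 norms
    //   const double n2   = u.l2_norm();     // sqrt of global sum of squares
    //   const double ninf = u.linfty_norm(); // MPI max of local maxima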



    template <typename Number, typename MemorySpaceType>
    Number
    Vector<Number, MemorySpaceType>::add_and_dot_local(
      const Number                           a,
      const Vector<Number, MemorySpaceType> &v,
      const Vector<Number, MemorySpaceType> &w)
    {
      const size_type vec_size = partitioner->local_size();
      AssertDimension(vec_size, v.local_size());
      AssertDimension(vec_size, w.local_size());

      Number sum = dealii::internal::VectorOperations::
        functions<Number, Number, MemorySpaceType>::add_and_dot(
          thread_loop_partitioner, vec_size, a, v.data, w.data, data);

      AssertIsFinite(sum);

      return sum;
    }



    template <typename Number, typename MemorySpaceType>
    Number
    Vector<Number, MemorySpaceType>::add_and_dot(
      const Number                     a,
      const VectorSpaceVector<Number> &vv,
      const VectorSpaceVector<Number> &ww)
    {
      // Downcast. Throws an exception if invalid.
      using VectorType = Vector<Number, MemorySpaceType>;
      Assert((dynamic_cast<const VectorType *>(&vv) != nullptr),
             ExcVectorTypeNotCompatible());
      const VectorType &v = dynamic_cast<const VectorType &>(vv);
      Assert((dynamic_cast<const VectorType *>(&ww) != nullptr),
             ExcVectorTypeNotCompatible());
      const VectorType &w = dynamic_cast<const VectorType &>(ww);

      Number local_result = add_and_dot_local(a, v, w);
      if (partitioner->n_mpi_processes() > 1)
        return Utilities::MPI::sum(local_result,
                                   partitioner->get_mpi_communicator());
      else
        return local_result;
    }
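
    // Usage sketch (illustrative): add_and_dot() fuses the update
    // *this += a*v with the subsequent inner product (*this) * w, saving
    // one sweep over memory compared to calling add() and operator*()
    // separately, which iterative solvers can exploit:
    //
    //   const double xw = x.add_and_dot(alpha, v, w); // x += alpha*v; x*w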



    template <typename Number, typename MemorySpaceType>
    inline bool
    Vector<Number, MemorySpaceType>::partitioners_are_compatible(
      const Utilities::MPI::Partitioner &part) const
    {
      return partitioner->is_compatible(part);
    }



    template <typename Number, typename MemorySpaceType>
    inline bool
    Vector<Number, MemorySpaceType>::partitioners_are_globally_compatible(
      const Utilities::MPI::Partitioner &part) const
    {
      return partitioner->is_globally_compatible(part);
    }



    template <typename Number, typename MemorySpaceType>
    std::size_t
    Vector<Number, MemorySpaceType>::memory_consumption() const
    {
      std::size_t memory = sizeof(*this);
      memory += sizeof(Number) * static_cast<std::size_t>(allocated_size);

      // if the partitioner is shared with other objects, count only a
      // fraction of its memory here, since no additional memory is actually
      // used for it
      if (partitioner.use_count() > 0)
        memory +=
          partitioner->memory_consumption() / partitioner.use_count() + 1;
      if (import_data.values != nullptr || import_data.values_dev != nullptr)
        memory += (static_cast<std::size_t>(partitioner->n_import_indices()) *
                   sizeof(Number));
      return memory;
    }



    template <typename Number, typename MemorySpaceType>
    void
    Vector<Number, MemorySpaceType>::print(std::ostream &     out,
                                           const unsigned int precision,
                                           const bool         scientific,
                                           const bool         across) const
    {
      Assert(partitioner.get() != nullptr, ExcInternalError());
      AssertThrow(out, ExcIO());
      std::ios::fmtflags old_flags     = out.flags();
      unsigned int       old_precision = out.precision(precision);

      if (scientific)
        out.setf(std::ios::scientific, std::ios::floatfield);
      else
        out.setf(std::ios::fixed, std::ios::floatfield);

      // to make the vector write out all the information in order, use as
      // many barriers as there are processors and start writing when it's our
      // turn
#ifdef DEAL_II_WITH_MPI
      if (partitioner->n_mpi_processes() > 1)
        for (unsigned int i = 0; i < partitioner->this_mpi_process(); ++i)
          {
            const int ierr = MPI_Barrier(partitioner->get_mpi_communicator());
            AssertThrowMPI(ierr);
          }
#endif

      std::vector<Number> stored_elements(allocated_size);
      data.copy_to(stored_elements.data(), allocated_size);

      out << "Process #" << partitioner->this_mpi_process() << std::endl
          << "Local range: [" << partitioner->local_range().first << ", "
          << partitioner->local_range().second
          << "), global size: " << partitioner->size() << std::endl
          << "Vector data:" << std::endl;
      if (across)
        for (size_type i = 0; i < partitioner->local_size(); ++i)
          out << stored_elements[i] << ' ';
      else
        for (size_type i = 0; i < partitioner->local_size(); ++i)
          out << stored_elements[i] << std::endl;
      out << std::endl;

      if (vector_is_ghosted)
        {
          out << "Ghost entries (global index / value):" << std::endl;
          if (across)
            for (size_type i = 0; i < partitioner->n_ghost_indices(); ++i)
              out << '(' << partitioner->ghost_indices().nth_index_in_set(i)
                  << '/' << stored_elements[partitioner->local_size() + i]
                  << ") ";
          else
            for (size_type i = 0; i < partitioner->n_ghost_indices(); ++i)
              out << '(' << partitioner->ghost_indices().nth_index_in_set(i)
                  << '/' << stored_elements[partitioner->local_size() + i]
                  << ")" << std::endl;
          out << std::endl;
        }
      out << std::flush;

#ifdef DEAL_II_WITH_MPI
      if (partitioner->n_mpi_processes() > 1)
        {
          int ierr = MPI_Barrier(partitioner->get_mpi_communicator());
          AssertThrowMPI(ierr);

          for (unsigned int i = partitioner->this_mpi_process() + 1;
               i < partitioner->n_mpi_processes();
               ++i)
            {
              ierr = MPI_Barrier(partitioner->get_mpi_communicator());
              AssertThrowMPI(ierr);
            }
        }
#endif

      AssertThrow(out, ExcIO());
      // reset output format
      out.flags(old_flags);
      out.precision(old_precision);
    }
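
    // Usage sketch (illustrative): print() is a collective call, since the
    // ranks order their output with MPI barriers, so it must be reached on
    // every process:
    //
    //   u.print(std::cout, 3, /*scientific=*/true, /*across=*/true);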

  } // end of namespace distributed
} // end of namespace LinearAlgebra


DEAL_II_NAMESPACE_CLOSE

#endif