1 /* Copyright 2009-2016 Francesco Biscani (bluescarni@gmail.com)
2 
3 This file is part of the Piranha library.
4 
5 The Piranha library is free software; you can redistribute it and/or modify
6 it under the terms of either:
7 
8   * the GNU Lesser General Public License as published by the Free
9     Software Foundation; either version 3 of the License, or (at your
10     option) any later version.
11 
12 or
13 
14   * the GNU General Public License as published by the Free Software
15     Foundation; either version 3 of the License, or (at your option) any
16     later version.
17 
18 or both in parallel, as here.
19 
20 The Piranha library is distributed in the hope that it will be useful, but
21 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
22 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
23 for more details.
24 
25 You should have received copies of the GNU General Public License and the
26 GNU Lesser General Public License along with the Piranha library.  If not,
27 see https://www.gnu.org/licenses/. */
28 
29 #ifndef PIRANHA_DETAIL_BASE_SERIES_MULTIPLIER_HPP
30 #define PIRANHA_DETAIL_BASE_SERIES_MULTIPLIER_HPP
31 
32 #include <algorithm>
33 #include <array>
34 #include <boost/numeric/conversion/cast.hpp>
35 #include <cmath>
36 #include <cstddef>
37 #include <iterator>
38 #include <limits>
39 #include <mutex>
40 #include <random>
41 #include <stdexcept>
42 #include <type_traits>
43 #include <utility>
44 #include <vector>
45 
46 #include "config.hpp"
47 #include "detail/atomic_flag_array.hpp"
48 #include "detail/atomic_lock_guard.hpp"
49 #include "exceptions.hpp"
50 #include "key_is_multipliable.hpp"
51 #include "math.hpp"
52 #include "mp_integer.hpp"
53 #include "mp_rational.hpp"
54 #include "safe_cast.hpp"
55 #include "series.hpp"
56 #include "settings.hpp"
57 #include "symbol_set.hpp"
58 #include "thread_pool.hpp"
59 #include "tuning.hpp"
60 #include "type_traits.hpp"
61 
62 namespace piranha
63 {
64 
65 namespace detail
66 {
67 
68 template <typename Series, typename Derived, typename = void>
69 struct base_series_multiplier_impl {
70     using term_type = typename Series::term_type;
71     using container_type = typename std::decay<decltype(std::declval<Series>()._container())>::type;
72     using c_size_type = typename container_type::size_type;
73     using v_size_type = typename std::vector<term_type const *>::size_type;
74     template <typename Term,
75               typename std::enable_if<!is_less_than_comparable<typename Term::key_type>::value, int>::type = 0>
fill_term_pointerspiranha::detail::base_series_multiplier_impl76     void fill_term_pointers(const container_type &c1, const container_type &c2, std::vector<Term const *> &v1,
77                             std::vector<Term const *> &v2)
78     {
79         // If the key is not less-than comparable, we can only copy over the pointers as they are.
80         std::transform(c1.begin(), c1.end(), std::back_inserter(v1), [](const term_type &t) { return &t; });
81         std::transform(c2.begin(), c2.end(), std::back_inserter(v2), [](const term_type &t) { return &t; });
82     }
83     template <typename Term,
84               typename std::enable_if<is_less_than_comparable<typename Term::key_type>::value, int>::type = 0>
fill_term_pointerspiranha::detail::base_series_multiplier_impl85     void fill_term_pointers(const container_type &c1, const container_type &c2, std::vector<Term const *> &v1,
86                             std::vector<Term const *> &v2)
87     {
88         // Fetch the number of threads from the derived class.
89         const unsigned n_threads = static_cast<Derived *>(this)->m_n_threads;
90         piranha_assert(n_threads > 0u);
91         // Threading functor.
92         auto thread_func
93             = [n_threads](unsigned thread_idx, const container_type *c, std::vector<term_type const *> *v) {
94                   piranha_assert(thread_idx < n_threads);
95                   // Total bucket count.
96                   const auto b_count = c->bucket_count();
97                   // Buckets per thread.
98                   const auto bpt = b_count / n_threads;
99                   // End index.
100                   const auto end
101                       = static_cast<c_size_type>((thread_idx == n_threads - 1u) ? b_count : (bpt * (thread_idx + 1u)));
102                   // Sorter.
103                   auto sorter = [](term_type const *p1, term_type const *p2) { return p1->m_key < p2->m_key; };
104                   v_size_type j = 0u;
105                   for (auto start = static_cast<c_size_type>(bpt * thread_idx); start < end; ++start) {
106                       const auto &b = c->_get_bucket_list(start);
107                       v_size_type tmp = 0u;
108                       for (const auto &t : b) {
109                           v->push_back(&t);
110                           ++tmp;
111                       }
112                       std::stable_sort(v->data() + j, v->data() + j + tmp, sorter);
113                       j += tmp;
114                   }
115               };
116         if (n_threads == 1u) {
117             thread_func(0u, &c1, &v1);
118             thread_func(0u, &c2, &v2);
119             return;
120         }
121         auto thread_wrapper = [&thread_func, n_threads](const container_type *c, std::vector<term_type const *> *v) {
122             // In the multi-threaded case, each thread needs to work on a separate vector.
123             // We will merge the vectors later.
124             using vv_t = std::vector<std::vector<Term const *>>;
125             using vv_size_t = typename vv_t::size_type;
126             vv_t vv(safe_cast<vv_size_t>(n_threads));
127             // Go with the threads.
128             future_list<void> ff_list;
129             try {
130                 for (unsigned i = 0u; i < n_threads; ++i) {
131                     ff_list.push_back(thread_pool::enqueue(i, thread_func, i, c, &(vv[static_cast<vv_size_t>(i)])));
132                 }
133                 // First let's wait for everything to finish.
134                 ff_list.wait_all();
135                 // Then, let's handle the exceptions.
136                 ff_list.get_all();
137             } catch (...) {
138                 ff_list.wait_all();
139                 throw;
140             }
141             // Last, we need to merge everything into v.
142             for (const auto &vi : vv) {
143                 v->insert(v->end(), vi.begin(), vi.end());
144             }
145         };
146         thread_wrapper(&c1, &v1);
147         thread_wrapper(&c2, &v2);
148     }
149 };
150 
151 template <typename Series, typename Derived>
152 struct base_series_multiplier_impl<Series, Derived, typename std::enable_if<is_mp_rational<
153                                                         typename Series::term_type::cf_type>::value>::type> {
154     // Useful shortcuts.
155     using term_type = typename Series::term_type;
156     using rat_type = typename term_type::cf_type;
157     using int_type = typename std::decay<decltype(std::declval<rat_type>().num())>::type;
158     using container_type = typename std::decay<decltype(std::declval<Series>()._container())>::type;
fill_term_pointerspiranha::detail::base_series_multiplier_impl159     void fill_term_pointers(const container_type &c1, const container_type &c2, std::vector<term_type const *> &v1,
160                             std::vector<term_type const *> &v2)
161     {
162         // Compute the least common multiplier.
163         m_lcm = 1;
164         auto it_f = c1.end();
165         int_type g;
166         for (auto it = c1.begin(); it != it_f; ++it) {
167             math::gcd3(g, m_lcm, it->m_cf.den());
168             math::mul3(m_lcm, m_lcm, it->m_cf.den());
169             int_type::_divexact(m_lcm, m_lcm, g);
170         }
171         it_f = c2.end();
172         for (auto it = c2.begin(); it != it_f; ++it) {
173             math::gcd3(g, m_lcm, it->m_cf.den());
174             math::mul3(m_lcm, m_lcm, it->m_cf.den());
175             int_type::_divexact(m_lcm, m_lcm, g);
176         }
177         // All these computations involve only positive numbers,
178         // the GCD must always be positive.
179         piranha_assert(m_lcm.sign() == 1);
180         // Copy over the terms and renormalise to lcm.
181         it_f = c1.end();
182         for (auto it = c1.begin(); it != it_f; ++it) {
183             // NOTE: these divisions are exact, we could take advantage of that.
184             m_terms1.push_back(term_type(rat_type(m_lcm / it->m_cf.den() * it->m_cf.num(), int_type(1)), it->m_key));
185         }
186         it_f = c2.end();
187         for (auto it = c2.begin(); it != it_f; ++it) {
188             m_terms2.push_back(term_type(rat_type(m_lcm / it->m_cf.den() * it->m_cf.num(), int_type(1)), it->m_key));
189         }
190         // Copy over the pointers.
191         std::transform(m_terms1.begin(), m_terms1.end(), std::back_inserter(v1), [](const term_type &t) { return &t; });
192         std::transform(m_terms2.begin(), m_terms2.end(), std::back_inserter(v2), [](const term_type &t) { return &t; });
193         piranha_assert(v1.size() == c1.size());
194         piranha_assert(v2.size() == c2.size());
195     }
196     std::vector<term_type> m_terms1;
197     std::vector<term_type> m_terms2;
198     int_type m_lcm;
199 };
200 }
201 
202 /// Base series multiplier.
203 /**
204  * This is a class that provides functionality useful to define a piranha::series_multiplier specialisation. Note that
205  * this class
206  * alone does *not* fulfill the requirements of a piranha::series_multiplier - it merely provides a set of utilities
207  * that can be
208  * useful for the implementation of a piranha::series_multiplier.
209  *
210  * ## Type requirements ##
211  *
212  * \p Series must satisfy piranha::is_series.
213  *
214  * ## Exception safety guarantee ##
215  *
216  * This class provides the strong exception safety guarantee.
217  *
218  * ## Move semantics ##
219  *
220  * Instances of this class cannot be copied, moved or assigned.
221  */
222 // Some performance ideas:
223 // - optimisation in case one series has 1 term with unitary key and both series same type: multiply directly
224 // coefficients;
225 // - optimisation for coefficient series that merges all args, similar to the rational optimisation;
226 // - optimisation for load balancing similar to the poly multiplier.
227 template <typename Series>
228 class base_series_multiplier : private detail::base_series_multiplier_impl<Series, base_series_multiplier<Series>>
229 {
230     PIRANHA_TT_CHECK(is_series, Series);
231     // Make friends with the base, so it can access protected/private members of this.
232     friend struct detail::base_series_multiplier_impl<Series, base_series_multiplier<Series>>;
233     // Alias for the series' container type.
234     using container_type = uncvref_t<decltype(std::declval<const Series &>()._container())>;
235 
236 public:
237     /// Alias for a vector of const pointers to series terms.
238     using v_ptr = std::vector<typename Series::term_type const *>;
239     /// The size type of base_series_multiplier::v_ptr.
240     using size_type = typename v_ptr::size_type;
241     /// The size type of \p Series.
242     using bucket_size_type = typename Series::size_type;
243 
244 private:
245     // The default limit functor: it will include all terms in the second series.
246     struct default_limit_functor {
default_limit_functorpiranha::base_series_multiplier::default_limit_functor247         default_limit_functor(const base_series_multiplier &m) : m_size2(m.m_v2.size())
248         {
249         }
operator ()piranha::base_series_multiplier::default_limit_functor250         size_type operator()(const size_type &) const
251         {
252             return m_size2;
253         }
254         const size_type m_size2;
255     };
256     // The purpose of this helper is to move in a coefficient series during insertion. For series,
257     // we know that moves leave the series in a valid state, and series multiplications do not benefit
258     // from an already-constructed destination - hence it is convenient to move them rather than copy.
259     template <typename Term, typename std::enable_if<!is_series<typename Term::cf_type>::value, int>::type = 0>
term_insertion(Term & t)260     static Term &term_insertion(Term &t)
261     {
262         return t;
263     }
264     template <typename Term, typename std::enable_if<is_series<typename Term::cf_type>::value, int>::type = 0>
term_insertion(Term & t)265     static Term term_insertion(Term &t)
266     {
267         return Term{std::move(t.m_cf), t.m_key};
268     }
269     // Implementation of finalise().
270     template <typename T,
271               typename std::enable_if<detail::is_mp_rational<typename T::term_type::cf_type>::value, int>::type = 0>
finalise_impl(T & s) const272     void finalise_impl(T &s) const
273     {
274         // Nothing to do if the lcm is unitary.
275         if (math::is_unitary(this->m_lcm)) {
276             return;
277         }
278         // NOTE: this has to be the square of the lcm, as in addition to uniformising
279         // the denominators in each series we are also multiplying the two series.
280         const auto l2 = this->m_lcm * this->m_lcm;
281         auto &container = s._container();
282         // Single thread implementation.
283         if (m_n_threads == 1u) {
284             for (const auto &t : container) {
285                 t.m_cf._set_den(l2);
286                 t.m_cf.canonicalise();
287             }
288             return;
289         }
290         // Multi-thread implementation.
291         // Buckets per thread.
292         const bucket_size_type bpt = static_cast<bucket_size_type>(container.bucket_count() / m_n_threads);
293         auto thread_func = [l2, &container, this, bpt](unsigned t_idx) {
294             bucket_size_type start_idx = static_cast<bucket_size_type>(t_idx * bpt);
295             // Special handling for the last thread.
296             const bucket_size_type end_idx = t_idx == (this->m_n_threads - 1u)
297                                                  ? container.bucket_count()
298                                                  : static_cast<bucket_size_type>((t_idx + 1u) * bpt);
299             for (; start_idx != end_idx; ++start_idx) {
300                 auto &list = container._get_bucket_list(start_idx);
301                 for (const auto &t : list) {
302                     t.m_cf._set_den(l2);
303                     t.m_cf.canonicalise();
304                 }
305             }
306         };
307         // Go with the threads.
308         future_list<decltype(thread_func(0u))> ff_list;
309         try {
310             for (unsigned i = 0u; i < m_n_threads; ++i) {
311                 ff_list.push_back(thread_pool::enqueue(i, thread_func, i));
312             }
313             // First let's wait for everything to finish.
314             ff_list.wait_all();
315             // Then, let's handle the exceptions.
316             ff_list.get_all();
317         } catch (...) {
318             ff_list.wait_all();
319             throw;
320         }
321     }
322     template <typename T,
323               typename std::enable_if<!detail::is_mp_rational<typename T::term_type::cf_type>::value, int>::type = 0>
finalise_impl(T &) const324     void finalise_impl(T &) const
325     {
326     }
327 
328 public:
329     /// Constructor.
330     /**
331      * This constructor will store in the base_series_multiplier::m_v1 and base_series_multiplier::m_v2 protected
332      * members references to the terms of the input series \p s1 and \p s2. \p m_v1 will store references to the larger
333      * series, \p m_v2 will store references to the smaller series. The constructor will also store a copy of the symbol
334      * set of \p s1 and \p s2 in the protected member base_series_multiplier::m_ss.
335      *
336      * If the coefficient type of \p Series is an instance of piranha::mp_rational, then the pointers in \p m_v1 and \p
337      * m_v2 will refer not to the original terms in \p s1 and \p s2 but to *copies* of these terms, in which all
338      * coefficients have unitary denominator and the numerators have all been multiplied by the global least common
339      * multiplier. This transformation allows to reduce the multiplication of series with rational coefficients to the
340      * multiplication of series with integral coefficients.
341      *
342      * If an operand is empty and the series type does not satisfy piranha::zero_is_absorbing, then a hidden
343      * private series consisting of a single term with zero coefficient is created, and the pointers in
344      * base_series_multiplier::m_v1 and/or base_series_multiplier::m_v2 will refer to this hidden instance. This ensures
345      * that a multiplication by a null series results in multiplications by a zero coefficient, which may not
346      * necessarily lead to a zero result (e.g., with IEEE floats inf times zero gives NaN).
347      *
348      * @param s1 first series.
349      * @param s2 second series.
350      *
351      * @throws std::invalid_argument if the symbol sets of \p s1 and \p s2 differ.
352      * @throws unspecified any exception thrown by:
353      * - thread_pool::use_threads(),
354      * - memory allocation errors in standard containers,
355      * - the construction of the term, coefficient and key types of \p Series,
356      * - the public interface of piranha::hash_set.
357      */
base_series_multiplier(const Series & s1,const Series & s2)358     explicit base_series_multiplier(const Series &s1, const Series &s2) : m_ss(s1.get_symbol_set())
359     {
360         if (unlikely(s1.get_symbol_set() != s2.get_symbol_set())) {
361             piranha_throw(std::invalid_argument, "incompatible arguments sets");
362         }
363         // The largest series goes first.
364         const Series *p1 = &s1, *p2 = &s2;
365         if (s1.size() < s2.size()) {
366             std::swap(p1, p2);
367         }
368         // This is just an optimisation, no troubles if there is a truncation due to static_cast.
369         m_v1.reserve(static_cast<size_type>(p1->size()));
370         m_v2.reserve(static_cast<size_type>(p2->size()));
371         container_type const *ctr1 = &p1->_container(), *ctr2 = &p2->_container();
372         // NOTE: if the zero element of Series is not absorbing, we need to create a temporary zero series in place
373         // of any factor that is zero, and then use it in the multiplication. This ensures a correct series
374         // multiplication result for coefficient types (such as IEEE floats) for which 0 times x is not necessarily
375         // always 0. The temporary zero series is stored in the m_zero_f member as a collection of 1 term with
376         // zero coefficient.
377         if (!zero_is_absorbing<Series>::value) {
378             using term_type = typename Series::term_type;
379             using cf_type = typename term_type::cf_type;
380             using key_type = typename term_type::key_type;
381             if (p1->empty()) {
382                 m_zero_f1.insert(term_type{cf_type(0), key_type(s1.get_symbol_set())});
383                 ctr1 = &m_zero_f1;
384             }
385             if (p2->empty()) {
386                 m_zero_f2.insert(term_type{cf_type(0), key_type(s1.get_symbol_set())});
387                 ctr2 = &m_zero_f2;
388             }
389         }
390         // Set the number of threads.
391         m_n_threads = (ctr1->size() && ctr2->size())
392                           ? thread_pool::use_threads(integer(ctr1->size()) * ctr2->size(),
393                                                      integer(settings::get_min_work_per_thread()))
394                           : 1u;
395         this->fill_term_pointers(*ctr1, *ctr2, m_v1, m_v2);
396     }
397 
398 private:
399     base_series_multiplier() = delete;
400     base_series_multiplier(const base_series_multiplier &) = delete;
401     base_series_multiplier(base_series_multiplier &&) = delete;
402     base_series_multiplier &operator=(const base_series_multiplier &) = delete;
403     base_series_multiplier &operator=(base_series_multiplier &&) = delete;
404 
405 protected:
406     /// Blocked multiplication.
407     /**
408      * \note
409      * If \p MultFunctor or \p LimitFunctor do not satisfy the requirements outlined below,
410      * a compile-time error will be produced.
411      *
412      * This method is logically equivalent to the following double loop:
413      *
414      * @code
415      * for (size_type i = start1; i < end1; ++i) {
416      *      const size_type limit = std::min(lf(i),m_v2.size());
417      *	for (size_type j = 0; j < limit; ++j) {
418      *		mf(i,j);
419      *	}
420      * }
421      * @endcode
422      *
423      * \p mf must be a function object with a call operator accepting two instances of
424      * base_series_multiplier::size_type and returning \p void. \p lf must be a function object
425      * with a call operator accepting and returning a base_series_multiplier::size_type.
426      *
427      * Internally, the double loops is decomposed in blocks of size tuning::get_multiplication_block_size() in an
428      * attempt to optimise cache memory access patterns.
429      *
430      * This method is meant to be used for series multiplication. \p mf is intended to be a function object that
431      * multiplies the <tt>i</tt>-th term of the first series by the <tt>j</tt>-th term of the second series.
432      * \p lf is intended to be a functor that establishes how many terms in the second series have to be multiplied by
433      * the <tt>i</tt>-th term of the first series.
434      *
435      * @param mf the multiplication functor.
436      * @param start1 start index in the first series.
437      * @param end1 end index in the first series.
438      * @param lf the limit functor.
439      *
440      * @throws std::invalid_argument if \p start1 is greater than \p end1 or greater than the size of
441      * base_series_multiplier::m_v1, or if \p end1 is greater than the size of base_series_multiplier::m_v1.
442      * @throws unspecified any exception thrown by the call operator of \p mf or \p sf, or piranha::safe_cast.
443      */
444     template <typename MultFunctor, typename LimitFunctor>
blocked_multiplication(const MultFunctor & mf,const size_type & start1,const size_type & end1,const LimitFunctor & lf) const445     void blocked_multiplication(const MultFunctor &mf, const size_type &start1, const size_type &end1,
446                                 const LimitFunctor &lf) const
447     {
448         PIRANHA_TT_CHECK(is_function_object, MultFunctor, void, const size_type &, const size_type &);
449         if (unlikely(start1 > end1 || start1 > m_v1.size() || end1 > m_v1.size())) {
450             piranha_throw(std::invalid_argument, "invalid bounds in blocked_multiplication");
451         }
452         // Block size and number of regular blocks.
453         const size_type bsize = safe_cast<size_type>(tuning::get_multiplication_block_size()),
454                         nblocks1 = static_cast<size_type>((end1 - start1) / bsize),
455                         nblocks2 = static_cast<size_type>(m_v2.size() / bsize);
456         // Start and end of last (possibly irregular) blocks.
457         const size_type i_ir_start = static_cast<size_type>(nblocks1 * bsize + start1), i_ir_end = end1;
458         const size_type j_ir_start = static_cast<size_type>(nblocks2 * bsize), j_ir_end = m_v2.size();
459         for (size_type n1 = 0u; n1 < nblocks1; ++n1) {
460             const size_type i_start = static_cast<size_type>(n1 * bsize + start1),
461                             i_end = static_cast<size_type>(i_start + bsize);
462             // regulars1 * regulars2
463             for (size_type n2 = 0u; n2 < nblocks2; ++n2) {
464                 const size_type j_start = static_cast<size_type>(n2 * bsize),
465                                 j_end = static_cast<size_type>(j_start + bsize);
466                 for (size_type i = i_start; i < i_end; ++i) {
467                     const size_type limit = std::min<size_type>(lf(i), j_end);
468                     for (size_type j = j_start; j < limit; ++j) {
469                         mf(i, j);
470                     }
471                 }
472             }
473             // regulars1 * rem2
474             for (size_type i = i_start; i < i_end; ++i) {
475                 const size_type limit = std::min<size_type>(lf(i), j_ir_end);
476                 for (size_type j = j_ir_start; j < limit; ++j) {
477                     mf(i, j);
478                 }
479             }
480         }
481         // rem1 * regulars2
482         for (size_type n2 = 0u; n2 < nblocks2; ++n2) {
483             const size_type j_start = static_cast<size_type>(n2 * bsize),
484                             j_end = static_cast<size_type>(j_start + bsize);
485             for (size_type i = i_ir_start; i < i_ir_end; ++i) {
486                 const size_type limit = std::min<size_type>(lf(i), j_end);
487                 for (size_type j = j_start; j < limit; ++j) {
488                     mf(i, j);
489                 }
490             }
491         }
492         // rem1 * rem2.
493         for (size_type i = i_ir_start; i < i_ir_end; ++i) {
494             const size_type limit = std::min<size_type>(lf(i), j_ir_end);
495             for (size_type j = j_ir_start; j < limit; ++j) {
496                 mf(i, j);
497             }
498         }
499     }
500     /// Blocked multiplication (convenience overload).
501     /**
502      * This method is equivalent to the other overload with a limit functor that will unconditionally
503      * always return the size of the second series.
504      *
505      * @param mf the multiplication functor.
506      * @param start1 start index in the first series.
507      * @param end1 end index in the first series.
508      *
509      * @throws unspecified any exception thrown by the other overload.
510      */
511     template <typename MultFunctor>
blocked_multiplication(const MultFunctor & mf,const size_type & start1,const size_type & end1) const512     void blocked_multiplication(const MultFunctor &mf, const size_type &start1, const size_type &end1) const
513     {
514         blocked_multiplication(mf, start1, end1, default_limit_functor{*this});
515     }
516     /// Estimate size of series multiplication.
517     /**
518      * \note
519      * If \p MultArity, \p MultFunctor or \p LimitFunctor do not satisfy the requirements outlined below,
520      * a compile-time error will be produced.
521      *
522      * This method expects a \p MultFunctor type exposing the same inteface as explained in blocked_multiplication().
523      * Additionally, \p MultFunctor must be constructible from a const reference to piranha::base_series_multiplier
524      * and a mutable reference to \p Series. \p MultFunctor objects will be constructed internally by this method,
525      * passing \p this and a temporary local \p Series object as construction parameters.
526      * It will be assumed that a call <tt>mf(i,j)</tt> multiplies the <tt>i</tt>-th
527      * term of the first series by the <tt>j</tt>-th term of the second series, accumulating the result into the \p
528      * Series
529      * passed as second parameter for construction.
530      *
531      * This method will apply a statistical approach to estimate the final size of the result of the multiplication of
532      * the first
533      * series by the second. It will perform random term-by-term multiplications
534      * and deduce the estimate from the number of term multiplications performed before finding the first duplicate
535      * term.
536      * The \p MultArity parameter represents the arity of term multiplications - that is, the number of terms generated
537      * by a single
538      * term-by-term multiplication. It must be strictly positive.
539      *
540      * The \p lf parameter must be a function object exposing the same inteface as explained in
541      * blocked_multiplication().
542      * This functor establishes how many terms in the second series must be multiplied by the <tt>i</tt>-th term of the
543      * first series.
544      *
545      * The number returned by this function is always at least 1. Multiple threads might be used by this method: in such
546      * a case, different
547      * instances of \p MultFunctor are constructed in different threads, but \p lf is shared among all threads.
548      *
549      * @param lf the limit functor.
550      *
551      * @return the estimated size of the multiplication of the first series by the second, always at least 1.
552      *
553      * @throws std::overflow_error in case of (unlikely) overflows in integral arithmetics.
554      * @throws unspecified any exception thrown by:
555      * - overflow errors in the conversion of piranha::integer to integral types,
556      * - the call operator of \p mf or \p lf, and the constructor of \p mf,
557      * - memory errors in standard containers,
558      * - the conversion operator of piranha::integer,
559      * - standard threading primitives,
560      * - thread_pool::enqueue(),
561      * - future_list::push_back().
562      */
563     template <std::size_t MultArity, typename MultFunctor, typename LimitFunctor>
estimate_final_series_size(const LimitFunctor & lf) const564     bucket_size_type estimate_final_series_size(const LimitFunctor &lf) const
565     {
566         PIRANHA_TT_CHECK(is_function_object, MultFunctor, void, const size_type &, const size_type &);
567         PIRANHA_TT_CHECK(std::is_constructible, MultFunctor, const base_series_multiplier &, Series &);
568         PIRANHA_TT_CHECK(is_function_object, LimitFunctor, size_type, const size_type &);
569         // Cache these.
570         const size_type size1 = m_v1.size(), size2 = m_v2.size();
571         constexpr std::size_t result_size = MultArity;
572         // If one of the two series is empty, just return 0.
573         if (unlikely(!size1 || !size2)) {
574             return 1u;
575         }
576         // If either series has a size of 1, just return size1 * size2 * result_size.
577         if (size1 == 1u || size2 == 1u) {
578             return static_cast<bucket_size_type>(integer(size1) * size2 * result_size);
579         }
580         // NOTE: Hard-coded number of trials.
581         // NOTE: here consider that in case of extremely sparse series with few terms this will incur in noticeable
582         // overhead, since we will need many term-by-term before encountering the first duplicate.
583         const unsigned n_trials = 15u;
584         // NOTE: Hard-coded value for the estimation multiplier.
585         // NOTE: This value should be tuned for performance/memory usage tradeoffs.
586         const unsigned multiplier = 2u;
587         // Number of threads to use. If there are more threads than trials, then reduce
588         // the number of actual threads to use.
589         // NOTE: this is a bit different from usual, where we do not care if the workload per thread is zero.
590         // We do like this because n_trials is a small number and there still seems to be benefit in running
591         // just 1 trial per thread.
592         const unsigned n_threads = (n_trials >= m_n_threads) ? m_n_threads : n_trials;
593         piranha_assert(n_threads > 0u);
594         // Trials per thread. This will always be at least 1.
595         const unsigned tpt = n_trials / n_threads;
596         piranha_assert(tpt >= 1u);
597         // The cumulative estimate.
598         integer c_estimate(0);
599         // Sync mutex - actually used only in multithreading.
600         std::mutex mut;
601         // The estimation functor.
602         auto estimator = [&lf, size1, n_threads, multiplier, tpt, n_trials, this, &c_estimate, &mut,
603                           result_size](unsigned thread_idx) {
604             piranha_assert(thread_idx < n_threads);
605             // Vectors of indices into m_v1.
606             std::vector<size_type> v_idx1(safe_cast<typename std::vector<size_type>::size_type>(size1));
607             std::iota(v_idx1.begin(), v_idx1.end(), size_type(0));
608             // Copy in order to reset to initial state later.
609             const auto v_idx1_copy = v_idx1;
610             // Random number engine.
611             std::mt19937 engine;
612             // Uniform int distribution.
613             using dist_type = std::uniform_int_distribution<size_type>;
614             dist_type dist;
615             // Init the accumulated estimation for averaging later.
616             integer acc(0);
617             // Number of trials for this thread - usual special casing for the last thread.
618             const unsigned cur_trials = (thread_idx == n_threads - 1u) ? (n_trials - thread_idx * tpt) : tpt;
619             // This should always be guaranteed because tpt is never 0.
620             piranha_assert(cur_trials > 0u);
621             // Create and setup the temp series.
622             Series tmp;
623             tmp.set_symbol_set(m_ss);
624             // Create the multiplier.
625             MultFunctor mf(*this, tmp);
626             // Go with the trials.
627             for (auto n = 0u; n < cur_trials; ++n) {
628                 // Seed the engine. The seed should be the global trial number, accounting for multiple
629                 // threads. This way the estimation will not depend on the number of threads.
630                 engine.seed(static_cast<std::mt19937::result_type>(tpt * thread_idx + n));
631                 // Reset the indices vector and re-randomise it.
632                 // NOTE: we need to do this as every run inside this for loop must be completely independent
633                 // of any previous run, we cannot keep any state.
634                 v_idx1 = v_idx1_copy;
635                 std::shuffle(v_idx1.begin(), v_idx1.end(), engine);
636                 // The counter. This will be increased each time a term-by-term multiplication
637                 // does not generate a duplicate term.
638                 size_type count = 0u;
639                 // This will be used to determine the average number of terms in s2
640                 // that participate in the multiplication.
641                 integer acc_s2(0);
642                 auto it1 = v_idx1.begin();
643                 for (; it1 != v_idx1.end(); ++it1) {
644                     // Get the limit idx in s2.
645                     const size_type limit = lf(*it1);
646                     // This is the upper limit of an open ended interval, so it needs
647                     // to be decreased by one in order to be used in dist. If zero, it means
648                     // there are no terms in v2 that can be multiplied by the current term in t1.
649                     if (limit == 0u) {
650                         continue;
651                     }
652                     acc_s2 += limit;
653                     // Pick a random index in m_v2 within the limit.
654                     const size_type idx2
655                         = dist(engine, typename dist_type::param_type(static_cast<size_type>(0u),
656                                                                       static_cast<size_type>(limit - 1u)));
657                     // Perform term multiplication.
658                     mf(*it1, idx2);
659                     // Check for unlikely overflows when increasing count.
660                     if (unlikely(result_size > std::numeric_limits<size_type>::max()
661                                  || count > std::numeric_limits<size_type>::max() - result_size)) {
662                         piranha_throw(std::overflow_error, "overflow error");
663                     }
664                     if (tmp.size() != count + result_size) {
665                         break;
666                     }
667                     // Increase cycle variables.
668                     count = static_cast<size_type>(count + result_size);
669                 }
670                 integer add;
671                 if (it1 == v_idx1.end()) {
672                     // We never found a duplicate. count is now the number of terms in s1
673                     // which actually participate in the multiplication, while acc_s2 / count
674                     // is the average number of terms in s2 that participate in the multiplication.
675                     // The result will be then count * acc_s2 / count = acc_s2.
676                     add = acc_s2;
677                 } else {
678                     // If we found a duplicate, we use the heuristic.
679                     add = integer(multiplier) * count * count;
680                 }
681                 // Fix if zero, so that the average later never results in zero.
682                 if (add.sign() == 0) {
683                     add = 1;
684                 }
685                 acc += add;
686                 // Reset tmp.
687                 tmp._container().clear();
688             }
689             // Accumulate in the shared variable.
690             if (n_threads == 1u) {
691                 // No locking needed.
692                 c_estimate += acc;
693             } else {
694                 std::lock_guard<std::mutex> lock(mut);
695                 c_estimate += acc;
696             }
697         };
698         // Run the estimation functor.
699         if (n_threads == 1u) {
700             estimator(0u);
701         } else {
702             future_list<void> f_list;
703             try {
704                 for (unsigned i = 0u; i < n_threads; ++i) {
705                     f_list.push_back(thread_pool::enqueue(i, estimator, i));
706                 }
707                 // First let's wait for everything to finish.
708                 f_list.wait_all();
709                 // Then, let's handle the exceptions.
710                 f_list.get_all();
711             } catch (...) {
712                 f_list.wait_all();
713                 throw;
714             }
715         }
716         piranha_assert(c_estimate >= n_trials);
717         // Return the mean.
718         return static_cast<bucket_size_type>(c_estimate / n_trials);
719     }
720     /// Estimate size of series multiplication (convenience overload)
721     /**
722      * @return the output of the other overload of estimate_final_series_size(), with a limit
723      * functor whose call operator will always return the size of the second series unconditionally.
724      *
725      * @throws unspecified any exception thrown by the other overload of estimate_final_series_size().
726      */
727     template <std::size_t MultArity, typename MultFunctor>
estimate_final_series_size() const728     bucket_size_type estimate_final_series_size() const
729     {
730         return estimate_final_series_size<MultArity, MultFunctor>(default_limit_functor{*this});
731     }
732     /// A plain multiplier functor.
733     /**
734      * \note
735      * If the key and coefficient types of \p Series do not satisfy piranha::key_is_multipliable, a compile-time error
736      * will
737      * be produced.
738      *
739      * This is a functor that conforms to the protocol expected by base_series_multiplier::blocked_multiplication() and
740      * base_series_multiplier::estimate_final_series_size().
741      *
742      * This functor requires that the key and coefficient types of \p Series satisfy piranha::key_is_multipliable, and
743      * it will
744      * use the key's <tt>multiply()</tt> method to perform term-by-term multiplications.
745      *
746      * If the \p FastMode boolean parameter is \p true, then the call operator will insert terms into the return value
747      * series
748      * using the low-level interface of piranha::hash_set, otherwise the call operator will use
749      * piranha::series::insert() for
750      * term insertion.
751      */
752     template <bool FastMode>
753     class plain_multiplier
754     {
755         using term_type = typename Series::term_type;
756         using key_type = typename term_type::key_type;
757         PIRANHA_TT_CHECK(key_is_multipliable, typename term_type::cf_type, key_type);
758         using it_type = decltype(std::declval<Series &>()._container().end());
759         static constexpr std::size_t m_arity = key_type::multiply_arity;
760 
761     public:
762         /// Constructor.
763         /**
764          * The constructor will store references to the input arguments internally.
765          *
766          * @param bsm a const reference to an instance of piranha::base_series_multiplier, from which
767          * the vectors of term pointers will be extracted.
768          * @param retval the \p Series instance into which terms resulting from multiplications will be inserted.
769          */
plain_multiplier(const base_series_multiplier & bsm,Series & retval)770         explicit plain_multiplier(const base_series_multiplier &bsm, Series &retval)
771             : m_v1(bsm.m_v1), m_v2(bsm.m_v2), m_retval(retval), m_c_end(retval._container().end())
772         {
773         }
774 
775     private:
776         plain_multiplier(const plain_multiplier &) = delete;
777         plain_multiplier(plain_multiplier &&) = delete;
778         plain_multiplier &operator=(const plain_multiplier &) = delete;
779         plain_multiplier &operator=(plain_multiplier &&) = delete;
780 
781     public:
782         /// Call operator.
783         /**
784          * The call operator will perform the multiplication of the <tt>i</tt>-th term of the first series by the
785          * <tt>j</tt>-th term of the second series, and it will insert the result into the return value.
786          *
787          * @param i index of a term in the first series.
788          * @param j index of a term in the second series.
789          *
790          * @throws unspecified any exception thrown by:
791          * - piranha::series::insert(),
792          * - the low-level interface of piranha::hash_set,
793          * - the in-place addition operator of the coefficient type,
794          * - term construction.
795          */
operator ()(const size_type & i,const size_type & j) const796         void operator()(const size_type &i, const size_type &j) const
797         {
798             // First perform the multiplication.
799             key_type::multiply(m_tmp_t, *m_v1[i], *m_v2[j], m_retval.get_symbol_set());
800             for (std::size_t n = 0u; n < m_arity; ++n) {
801                 auto &tmp_term = m_tmp_t[n];
802                 if (FastMode) {
803                     auto &container = m_retval._container();
804                     // Try to locate the term into retval.
805                     auto bucket_idx = container._bucket(tmp_term);
806                     const auto it = container._find(tmp_term, bucket_idx);
807                     if (it == m_c_end) {
808                         container._unique_insert(term_insertion(tmp_term), bucket_idx);
809                     } else {
810                         it->m_cf += tmp_term.m_cf;
811                     }
812                 } else {
813                     m_retval.insert(term_insertion(tmp_term));
814                 }
815             }
816         }
817 
818     private:
819         mutable std::array<term_type, m_arity> m_tmp_t;
820         const std::vector<term_type const *> &m_v1;
821         const std::vector<term_type const *> &m_v2;
822         Series &m_retval;
823         const it_type m_c_end;
824     };
825     /// Sanitise series.
826     /**
827      * When using the low-level interface of piranha::hash_set for term insertion, invariants might be violated
828      * both in piranha::hash_set and piranha::series. In particular:
829      *
830      * - terms may not be checked for compatibility or ignorability upon insertion,
831      * - the count of elements in piranha::hash_set might not be updated.
832      *
833      * This method can be used to fix these invariants: each term of \p retval will be checked for ignorability and
834      * compatibility,
835      * and the total count of terms in the series will be set to the number of non-ignorable terms. Ignorable terms will
836      * be erased.
837      *
838      * Note that in case of exceptions \p retval will likely be left in an inconsistent state which violates internal
839      * invariants.
840      * Calls to this function should always be wrapped in a try/catch block that makes sure that \p retval is cleared
841      * before
842      * re-throwing.
843      *
844      * @param retval the series to be sanitised.
845      * @param n_threads the number of threads to be used.
846      *
847      * @throws std::invalid_argument if \p n_threads is zero, or if one of the terms in \p retval is not compatible
848      * with the symbol set of \p retval.
849      * @throws std::overflow_error if the number of terms in \p retval overflows the maximum value representable by
850      * base_series_multiplier::bucket_size_type.
851      * @throws unspecified any exception thrown by:
852      * - the cast operator of piranha::integer,
853      * - standard threading primitives,
854      * - thread_pool::enqueue(),
855      * - future_list::push_back().
856      */
sanitise_series(Series & retval,unsigned n_threads)857     static void sanitise_series(Series &retval, unsigned n_threads)
858     {
859         using term_type = typename Series::term_type;
860         if (unlikely(n_threads == 0u)) {
861             piranha_throw(std::invalid_argument, "invalid number of threads");
862         }
863         auto &container = retval._container();
864         const auto &args = retval.get_symbol_set();
865         // Reset the size to zero before doing anything.
866         container._update_size(static_cast<bucket_size_type>(0u));
867         // Single-thread implementation.
868         if (n_threads == 1u) {
869             const auto it_end = container.end();
870             for (auto it = container.begin(); it != it_end;) {
871                 if (unlikely(!it->is_compatible(args))) {
872                     piranha_throw(std::invalid_argument, "incompatible term");
873                 }
874                 if (unlikely(container.size() == std::numeric_limits<bucket_size_type>::max())) {
875                     piranha_throw(std::overflow_error, "overflow error in the number of terms of a series");
876                 }
877                 // First update the size, it will be scaled back in the erase() method if necessary.
878                 container._update_size(static_cast<bucket_size_type>(container.size() + 1u));
879                 if (unlikely(it->is_ignorable(args))) {
880                     it = container.erase(it);
881                 } else {
882                     ++it;
883                 }
884             }
885             return;
886         }
887         // Multi-thread implementation.
888         const auto b_count = container.bucket_count();
889         std::mutex m;
890         integer global_count(0);
891         auto eraser = [b_count, &container, &m, &args, &global_count](const bucket_size_type &start,
892                                                                       const bucket_size_type &end) {
893             piranha_assert(start <= end && end <= b_count);
894             (void)b_count;
895             bucket_size_type count = 0u;
896             std::vector<term_type> term_list;
897             // Examine and count the terms bucket-by-bucket.
898             for (bucket_size_type i = start; i != end; ++i) {
899                 term_list.clear();
900                 const auto &bl = container._get_bucket_list(i);
901                 const auto it_f = bl.end();
902                 for (auto it = bl.begin(); it != it_f; ++it) {
903                     // Check first for compatibility.
904                     if (unlikely(!it->is_compatible(args))) {
905                         piranha_throw(std::invalid_argument, "incompatible term");
906                     }
907                     // Check for ignorability.
908                     if (unlikely(it->is_ignorable(args))) {
909                         term_list.push_back(*it);
910                     }
911                     // Update the count of terms.
912                     if (unlikely(count == std::numeric_limits<bucket_size_type>::max())) {
913                         piranha_throw(std::overflow_error, "overflow error in the number of terms of a series");
914                     }
915                     count = static_cast<bucket_size_type>(count + 1u);
916                 }
917                 for (auto it = term_list.begin(); it != term_list.end(); ++it) {
918                     // NOTE: must use _erase to avoid concurrent modifications
919                     // to the number of elements in the table.
920                     container._erase(container._find(*it, i));
921                     // Account for the erased term.
922                     piranha_assert(count > 0u);
923                     count = static_cast<bucket_size_type>(count - 1u);
924                 }
925             }
926             // Update the global count.
927             std::lock_guard<std::mutex> lock(m);
928             global_count += count;
929         };
930         future_list<decltype(eraser(bucket_size_type(), bucket_size_type()))> f_list;
931         try {
932             for (unsigned i = 0u; i < n_threads; ++i) {
933                 const auto start = static_cast<bucket_size_type>((b_count / n_threads) * i),
934                            end = static_cast<bucket_size_type>(
935                                (i == n_threads - 1u) ? b_count : (b_count / n_threads) * (i + 1u));
936                 f_list.push_back(thread_pool::enqueue(i, eraser, start, end));
937             }
938             // First let's wait for everything to finish.
939             f_list.wait_all();
940             // Then, let's handle the exceptions.
941             f_list.get_all();
942         } catch (...) {
943             f_list.wait_all();
944             // NOTE: there's not need to clear retval here - it was already in an inconsistent
945             // state coming into this method. We rather need to make sure sanitise_series() is always
946             // called in a try/catch block that clears retval in case of errors.
947             throw;
948         }
949         // Final update of the total count.
950         container._update_size(static_cast<bucket_size_type>(global_count));
951     }
952     /// A plain series multiplication routine.
953     /**
954      * \note
955      * If the key and coefficient types of \p Series do not satisfy piranha::key_is_multipliable, or \p LimitFunctor
956      * does
957      * not satisfy the requirements outlined in base_series_multiplier::blocked_multiplication(), a compile-time error
958      * will
959      * be produced.
960      *
961      * This method implements a generic series multiplication routine suitable for key types that satisfy
962      * piranha::key_is_multipliable.
963      * The implementation is either single-threaded or multi-threaded, depending on the sizes of the input series, and
964      * it will use
965      * either base_series_multiplier::plain_multiplier or a similar thread-safe multiplier for the term-by-term
966      * multiplications.
967      * The \p lf functor will be forwarded as limit functor to base_series_multiplier::blocked_multiplication()
968      * and base_series_multiplier::estimate_final_series_size().
969      *
970      * Note that, in multithreaded mode, \p lf will be shared among (and called concurrently from) all the threads.
971      *
972      * @param lf the limit functor (see base_series_multiplier::blocked_multiplication()).
973      *
974      * @return the series resulting from the multiplication of the two series used to construct \p this.
975      *
976      * @throws unspecified any exception thrown by:
977      * - piranha::safe_cast(),
978      * - base_series_multiplier::estimate_final_series_size(),
979      * - <tt>boost::numeric_cast()</tt>,
980      * - the public interface of piranha::hash_set,
981      * - base_series_multiplier::blocked_multiplication(),
982      * - base_series_multiplier::sanitise_series(),
983      * - the <tt>multiply()</tt> method of the key type of \p Series,
984      * - thread_pool::enqueue(),
985      * - future_list::push_back(),
986      * - the construction of terms,
987      * - in-place addition of coefficients.
988      */
989     template <typename LimitFunctor>
plain_multiplication(const LimitFunctor & lf) const990     Series plain_multiplication(const LimitFunctor &lf) const
991     {
992         // Shortcuts.
993         using term_type = typename Series::term_type;
994         using cf_type = typename term_type::cf_type;
995         using key_type = typename term_type::key_type;
996         PIRANHA_TT_CHECK(key_is_multipliable, cf_type, key_type);
997         constexpr std::size_t m_arity = key_type::multiply_arity;
998         // Setup the return value with the merged symbol set.
999         Series retval;
1000         retval.set_symbol_set(m_ss);
1001         // Do not do anything if one of the two series is empty.
1002         if (unlikely(m_v1.empty() || m_v2.empty())) {
1003             return retval;
1004         }
1005         const size_type size1 = m_v1.size(), size2 = m_v2.size();
1006         (void)size2;
1007         piranha_assert(size1 && size2);
1008         // Convert n_threads to size_type for convenience.
1009         const size_type n_threads = safe_cast<size_type>(m_n_threads);
1010         piranha_assert(n_threads);
1011         // Determine if we should estimate the size. We check the threshold, but we always
1012         // need to estimate in multithreaded mode.
1013         bool estimate = true;
1014         const auto e_thr = tuning::get_estimate_threshold();
1015         if (integer(m_v1.size()) * m_v2.size() < integer(e_thr) * e_thr && n_threads == 1u) {
1016             estimate = false;
1017         }
1018         if (estimate) {
1019             // Estimate and rehash.
1020             const auto est = estimate_final_series_size<m_arity, plain_multiplier<false>>(lf);
1021             // NOTE: use numeric cast here as safe_cast is expensive, going through an integer-double conversion,
1022             // and in this case the behaviour of numeric_cast is appropriate.
1023             const auto n_buckets = boost::numeric_cast<bucket_size_type>(
1024                 std::ceil(static_cast<double>(est) / retval._container().max_load_factor()));
1025             piranha_assert(n_buckets > 0u);
1026             // Check if we want to use the parallel memory set.
1027             // NOTE: it is important here that we use the same n_threads for multiplication and memset as
1028             // we tie together pinned threads with potentially different NUMA regions.
1029             const unsigned n_threads_rehash = tuning::get_parallel_memory_set() ? static_cast<unsigned>(n_threads) : 1u;
1030             retval._container().rehash(n_buckets, n_threads_rehash);
1031         }
1032         if (n_threads == 1u) {
1033             try {
1034                 // Single-thread case.
1035                 if (estimate) {
1036                     blocked_multiplication(plain_multiplier<true>(*this, retval), 0u, size1, lf);
1037                     // If we estimated beforehand, we need to sanitise the series.
1038                     sanitise_series(retval, static_cast<unsigned>(n_threads));
1039                 } else {
1040                     blocked_multiplication(plain_multiplier<false>(*this, retval), 0u, size1, lf);
1041                 }
1042                 finalise_series(retval);
1043                 return retval;
1044             } catch (...) {
1045                 retval._container().clear();
1046                 throw;
1047             }
1048         }
1049         // Multi-threaded case.
1050         piranha_assert(estimate);
1051         // Init the vector of spinlocks.
1052         detail::atomic_flag_array sl_array(safe_cast<std::size_t>(retval._container().bucket_count()));
1053         // Init the future list.
1054         future_list<void> f_list;
1055         // Thread block size.
1056         const auto block_size = size1 / n_threads;
1057         try {
1058             for (size_type idx = 0u; idx < n_threads; ++idx) {
1059                 // Thread functor.
1060                 auto tf = [idx, this, block_size, n_threads, &sl_array, &retval, &lf]() {
1061                     // Used to store the result of term multiplication.
1062                     std::array<term_type, key_type::multiply_arity> tmp_t;
1063                     // End of retval container (thread-safe).
1064                     const auto c_end = retval._container().end();
1065                     // Block functor.
1066                     // NOTE: this is very similar to the plain functor, but it does the bucket locking
1067                     // additionally.
1068                     auto f = [&c_end, &tmp_t, this, &retval, &sl_array](const size_type &i, const size_type &j) {
1069                         // Run the term multiplication.
1070                         key_type::multiply(tmp_t, *(this->m_v1[i]), *(this->m_v2[j]), retval.get_symbol_set());
1071                         for (std::size_t n = 0u; n < key_type::multiply_arity; ++n) {
1072                             auto &container = retval._container();
1073                             auto &tmp_term = tmp_t[n];
1074                             // Try to locate the term into retval.
1075                             auto bucket_idx = container._bucket(tmp_term);
1076                             // Lock the bucket.
1077                             detail::atomic_lock_guard alg(sl_array[static_cast<std::size_t>(bucket_idx)]);
1078                             const auto it = container._find(tmp_term, bucket_idx);
1079                             if (it == c_end) {
1080                                 container._unique_insert(term_insertion(tmp_term), bucket_idx);
1081                             } else {
1082                                 it->m_cf += tmp_term.m_cf;
1083                             }
1084                         }
1085                     };
1086                     // Thread block limit.
1087                     const auto e1
1088                         = (idx == n_threads - 1u) ? this->m_v1.size() : static_cast<size_type>((idx + 1u) * block_size);
1089                     this->blocked_multiplication(f, static_cast<size_type>(idx * block_size), e1, lf);
1090                 };
1091                 f_list.push_back(thread_pool::enqueue(static_cast<unsigned>(idx), tf));
1092             }
1093             f_list.wait_all();
1094             f_list.get_all();
1095             sanitise_series(retval, static_cast<unsigned>(n_threads));
1096             finalise_series(retval);
1097         } catch (...) {
1098             f_list.wait_all();
1099             // Clean up retval as it might be in an inconsistent state.
1100             retval._container().clear();
1101             throw;
1102         }
1103         return retval;
1104     }
1105     /// A plain series multiplication routine (convenience overload).
1106     /**
1107      * @return the output of the other overload of plain_multiplication(), with a limit
1108      * functor whose call operator will always return the size of the second series unconditionally.
1109      *
1110      * @throws unspecified any exception thrown by the other overload of plain_multiplication().
1111      */
plain_multiplication() const1112     Series plain_multiplication() const
1113     {
1114         return plain_multiplication(default_limit_functor{*this});
1115     }
1116     /// Finalise series.
1117     /**
1118      * This method will finalise the output \p s of a series multiplication undertaken via
1119      * piranha::base_series_multiplier.
1120      * Currently, this method will not do anything unless the coefficient type of \p Series is an instance of
1121      * piranha::mp_rational.
1122      * In this case, the coefficients of \p s will be normalised with respect to the least common multiplier computed in
1123      * the
1124      * constructor of piranha::base_series_multiplier.
1125      *
1126      * @param s the \p Series to be finalised.
1127      *
1128      * @throws unspecified any exception thrown by:
1129      * - thread_pool::enqueue(),
1130      * - future_list::push_back().
1131      */
finalise_series(Series & s) const1132     void finalise_series(Series &s) const
1133     {
1134         finalise_impl(s);
1135     }
1136 
1137 protected:
1138     /// Vector of const pointers to the terms in the larger series.
1139     mutable v_ptr m_v1;
1140     /// Vector of const pointers to the terms in the smaller series.
1141     mutable v_ptr m_v2;
1142     /// The symbol set of the series used during construction.
1143     const symbol_set m_ss;
1144     /// Number of threads.
1145     /**
1146      * This value will be set by the constructor, and it represents the number of threads
1147      * that will be used by the multiplier. The value is always at least 1 and it is calculated
1148      * via thread_pool::use_threads().
1149      */
1150     unsigned m_n_threads;
1151 
1152 private:
1153     // See the constructor for an explanation.
1154     container_type m_zero_f1;
1155     container_type m_zero_f2;
1156 };
1157 }
1158 
1159 #endif
1160