1 #ifndef LINGEN_SUBSTEP_CHARACTERISTICS_HPP_
2 #define LINGEN_SUBSTEP_CHARACTERISTICS_HPP_
3 
4 #include <iostream>
5 #include <memory>
6 #include <tuple>
7 
8 #include "cxx_mpz.hpp"
9 #include "fmt/format.h"
10 #include "fmt/printf.h"
11 #include "subdivision.hpp"
12 #include "lingen_platform.hpp"
13 #include "lingen_substep_schedule.hpp"
14 #include "lingen_tuning_cache.hpp"
15 #include "lingen_matpoly_ft.hpp"
16 #include "lingen_mul_substeps.hpp"
17 #include "lingen_call_companion.hpp"
18 
19 
20 /* This file is an offspring from lingen_tuning.cpp - some of it is not
21  * really polished to the expected level of a usable interface. It
22  * happens to be also used by the binary
23  * time_matpoly_ft_parallel_${gfp_layer}
24  */
25 
26 template<typename OP> struct microbench_dft;
27 template<typename OP> struct microbench_ift;
28 template<typename OP> struct microbench_conv;
29 
30 struct lingen_substep_characteristics {
31     typedef lingen_substep_characteristics ch_t;
32     typedef lingen_platform pc_t;
33     typedef lingen_substep_schedule sc_t;
34     typedef lingen_tuning_cache tc_t;
35 
36     abdst_field ab;
37     cxx_mpz p;
38     gmp_randstate_t & rstate;
39 
40     /* length of the input (E) for the call under consideration ; this is
41      * not the input length for the overall algorithm !
42      *
43      * We only use it for printing.
44      * */
45     size_t input_length;
46 
47     op_mul_or_mp_base::op_type_t op_type;
48 
49     /* We're talking products (or any other bilinear operation that uses
50      * transforms 1 by 1, e.g. MP) of matrices of size n0*n1 and n1*n2
51      */
52     unsigned int n0;
53     unsigned int n1;
54     unsigned int n2;
55 
56     /* length of the three operands.
57      * A has dimension n0*n1, length asize
58      * B has dimension n1*n2, length bsize
59      * C has dimension n1*n2, length csize
60      */
61     size_t asize, bsize, csize;
62 
63     public:
64 
is_valid_schedulelingen_substep_characteristics65     bool is_valid_schedule(lingen_platform, unsigned int mesh, lingen_substep_schedule const & S) const
66     {
67         unsigned int shrink0 = S.shrink0;
68         unsigned int shrink2 = S.shrink2;
69         unsigned int r = mesh;
70         subdivision mpi_split0(n0, r);
71         subdivision mpi_split1(n1, r);
72         subdivision mpi_split2(n2, r);
73         subdivision shrink_split0(mpi_split0.block_size_upper_bound(), shrink0);
74         subdivision shrink_split2(mpi_split2.block_size_upper_bound(), shrink2);
75         unsigned int nrs0(shrink_split0.block_size_upper_bound());
76         unsigned int nrs2(shrink_split2.block_size_upper_bound());
77         const unsigned int nr1 = mpi_split1.block_size_upper_bound();
78         unsigned int b0 = S.batch[0];
79         unsigned int b1 = S.batch[1];
80         unsigned int b2 = S.batch[2];
81         subdivision loop1 = subdivision::by_block_size(nr1, b1);
82         for(unsigned int round = 0 ; round < shrink0 * shrink2 ; round++) {
83             unsigned round0 = round % shrink0;
84             unsigned round2 = round / shrink0;
85             unsigned int i0, i1;
86             unsigned int j0, j1;
87             std::tie(i0, i1) = shrink_split0.nth_block(round0);
88             std::tie(j0, j1) = shrink_split2.nth_block(round2);
89             ASSERT_ALWAYS((i1 - i0) <= nrs0);
90             ASSERT_ALWAYS((j1 - j0) <= nrs2);
91             subdivision loop0 = subdivision::by_block_size(i1 - i0, b0);
92             subdivision loop2 = subdivision::by_block_size(j1 - j0, b2);
93 
94             if (loop0.nblocks() != 1 && loop2.nblocks() != 1)
95                 return false;
96             bool process_blocks_row_major = b0 == nrs0;
97 
98             for(unsigned int iloop1 = 0 ; iloop1 < loop1.nblocks() ; iloop1++) {
99                 if (process_blocks_row_major) {
100                     if (loop0.nblocks() != 1) return false;
101                     if (nrs0 != b0) return false;
102                 } else {
103                     if (loop2.nblocks() != 1) return false;
104                     if (nrs2 != b2) return false;
105                 }
106             }
107         }
108         return true;
109     }
110 
lingen_substep_characteristicslingen_substep_characteristics111     lingen_substep_characteristics(abdst_field ab, gmp_randstate_t & rstate, size_t input_length, op_mul_or_mp_base::op_type_t op_type, unsigned int n0, unsigned int n1, unsigned int n2, size_t asize, size_t bsize, size_t csize) :/*{{{*/
112         ab(ab),
113         rstate(rstate),
114         input_length(input_length),
115         op_type(op_type),
116         n0(n0), n1(n1), n2(n2),
117         asize(asize), bsize(bsize), csize(csize)
118     {
119 #ifdef SELECT_MPFQ_LAYER_u64k1
120         mpz_set_ui(p, 2);
121         // op = OP(asize, bsize);
122 #else
123         mpz_set(p, abfield_characteristic_srcptr(ab));
124         // op = OP(p, asize, bsize, n1);
125 #endif
126         // fft_alloc_sizes = op.fti.get_alloc_sizes();
127     }/*}}}*/
128 
max_threadslingen_substep_characteristics129     static inline int max_threads() {
130 #ifdef HAVE_OPENMP
131         return omp_get_max_threads();
132 #else
133         return 1;
134 #endif
135     }
136 
137     private:
has_cached_timelingen_substep_characteristics138     bool has_cached_time(tc_t const & C, lingen_substep_schedule::fft_type_t fft_type) const {/*{{{*/
139         lingen_tuning_cache::mul_or_mp_key K { op_type, fft_type, mpz_sizeinbase(p, 2), asize, bsize };
140         return C.has(K);
141     }/*}}}*/
142 
143     /* tl;dr: array of doubles is input, weighted_double is output.
144      *
145      * A parallelizable timing is created from an array of
146      * micro-measurements that tell how long (WCT) it takes to do k times
147      * the same operation in parallel, for k less than some bound
148      * (typically the number of threads, but it can be less if RAM
149      * allows only so many operations in parallel).
150      *
151      * We _use_ a parallelizable timing in the form of the parent
152      * weighted_double, essentially. This carries the info on the
153      * _amortized time_ (t) it takes to do some number (n) of operations.
154      * Originally t is initialized to what the array of concurrent
155      * timings says of what happens for just one thread.
156      *
157      * The gist of it is that we have the .parallelize() method. It
158      * changes the underlying weighted_double to reflect the amortized
159      * timing, again, but when some number of operations are done in
160      * parallel.
161      *
162      * This is in contrast with operator*(), which counts more operations
163      * to do but not in parallel.
164      */
165     class parallelizable_timing : public weighted_double/*{{{*/
166     {
167         std::vector<double> concurrent_timings;
168         bool counted_as_parallel = false;
max_parallel() const169         unsigned int max_parallel() const { return concurrent_timings.size() - 1; }
170         private:
get_concurrent_time(unsigned int k)171         double get_concurrent_time(unsigned int k) {
172             /* How long does it take (WCT) to do k operations in parallel ? */
173             /* Ideally, this should be almost constant as long as k is
174              * less than the number of threads, and then it should reach
175              * linear behaviour (but we're restricting to small k
176              * anyway).
177              * In practice, "constant" isn't really constant because
178              * there's some overhead. After all, all cores and
179              * hyperthreads are competing for cache, for example. So we
180              * allow a linear regression.
181              */
182             ASSERT_ALWAYS(k > 0);
183             ASSERT_ALWAYS(k <= max_parallel());
184             unsigned int kl=1, kh=1;
185             double tl=t, th=t;
186             for(unsigned int i = 1 ; i <= k ; i++) {
187                 if (concurrent_timings[i] < 0) continue;
188                 kl = i;
189                 tl = concurrent_timings[i];
190             }
191             for(unsigned int i = max_parallel() ; i >= k ; i--) {
192                 if (concurrent_timings[i] < 0) continue;
193                 kh = i;
194                 th = concurrent_timings[i];
195             }
196             if (kl == kh) return tl;
197             return tl + (th-tl)*(k-kl)/(kh-kl);
198         }
199         public:
parallelizable_timing(std::vector<double> const & c)200         parallelizable_timing(std::vector<double> const & c) : concurrent_timings(c) {
201             // ASSERT_ALWAYS(c.size() == (size_t) max_threads() + 1);
202             this->n = 1;
203             this->t = concurrent_timings[1];
204         }
205         /* This ctor is only for something that we will _never_
206          * parallelize, so that we have no use for an array of
207          * concurrent_timings.
208          * In other cases, we insist on providing the
209          * concurrent_timings array, via the other ctor.
210          */
parallelizable_timing(double d)211         parallelizable_timing(double d)
212             : weighted_double { 1, d }
213             , concurrent_timings { 0, d }
214         {
215         }
216 
parallelizable_timing()217         parallelizable_timing() : weighted_double { 0, 0 } {}
218 
operator *=(unsigned int x)219         parallelizable_timing& operator*=(unsigned int x) {
220             n *= x;
221             return *this;
222         }
operator *(unsigned int x) const223         parallelizable_timing operator*(unsigned int x) const {
224             parallelizable_timing X = *this;
225             X *= x;
226             return X;
227         }
parallelize(unsigned int k,unsigned int T)228         parallelizable_timing& parallelize(unsigned int k, unsigned int T) {
229             ASSERT_ALWAYS(!counted_as_parallel);
230             counted_as_parallel = true;
231             /* We suppose that on input, *this reprensents n operations
232              * that sequentially take time t, and that this time
233              * represents something single-threaded.  This function
234              * replaces the inner operation by the fact of doing k times
235              * this same thing, in parallel. The max level of expected
236              * parallelism is given by T. The field t is replaced then
237              * by the _average time_ that these operation take (that is,
238              * 1/k times the cumulated time it takes to do k operations).
239              * If perfect parallelism is achieved (all k operations done
240              * up to T times in parallel, with linear scaling), this
241              * effectively does
242              *  this->n *= k
243              *  this->t *= iceildiv(k, T) / (double) k;
244              * However, even with T <= max_threads(), this
245              * process is likely to give somewhat different results
246              * because not everything achieves perfect parallelism at the
247              * CPU level.
248              *
249              * Note that T should really be max_threads().
250              */
251             T = std::min(T, max_parallel());
252             unsigned int qk = k / T;
253             unsigned int rk = k - qk * T;
254             /* get_concurrent_time returns the wall-clock time needed to
255              * do T dfts in parallel
256              */
257             double tt = qk * get_concurrent_time(T);
258             if (rk) tt += get_concurrent_time(rk);
259             n *= k;
260             t = tt / k;
261             return *this;
262         }
operator double() const263         operator double() const { return n * t; }
operator +(parallelizable_timing const & b) const264         double operator+(parallelizable_timing const & b) const { return *this + (double) b; }
operator +(double y) const265         double operator+(double y) const { return (double) *this + y; }
266     };/*}}}*/
267 
268 
269     /* We can measure timings in one of three manners here.
270      *  - "cache-hot" do repeatedly the same thing, on identical data.
271      *  - "cache-cold", quick: do the calculation only once (malloc,
272      *  compute, free).
273      *  - "cache-cold", thorough: do the calculation on matrices of
274      *  different inputs.
275      *
276      * None is a true winner. "cache-hot" is easily optimistic, and both
277      * "cache-cold" options are somewhat pessimistic.
278      */
279 
280 // #define CALL_MEMBER_FN(object,ptrToMember)  ((object).*(ptrToMember))
281 
282     public:
283     // typedef double (lingen_substep_characteristics::*raw_timer_member)(unsigned int, unsigned int, unsigned int) const;
284 
285     private:
286     template<typename T>
complete_tveclingen_substep_characteristics287     void complete_tvec(std::ostream& os, std::vector<double> & tvec, T & F, unsigned int kl, unsigned int kh) const/*{{{*/
288     {
289         /* The ideal expected behaviour is that this should be constant.
290          * But it's not, since flint itself uses openmp. We're doing
291          * better, since we're coarser grain. But still, as timing goes,
292          * this means that we should be prepared to have a roughly linear
293          * curve.
294          */
295         if (kh <= kl + 1) return;
296         double tl = tvec[kl];
297         double th = tvec[kh];
298         if (th - tl <= 0.1 * tl) return;
299         unsigned int k = (kl + kh) / 2;
300         double tk = F(k);
301         os << fmt::format(FMT_STRING(" {}:{:.2g}"), k, tk);
302         os.flush();
303         tvec[k] = tk;
304         double linfit_tk = tl + (th-tl)*(k-kl)/(kh-kl);
305         if ((tk - linfit_tk) <= 0.1*linfit_tk) return;
306         complete_tvec(os, tvec, F, kl, k);
307         complete_tvec(os, tvec, F, k, kh);
308     }/*}}}*/
309 
310     public:
311     template<typename T>
fill_tveclingen_substep_characteristics312     std::vector<double> fill_tvec(std::ostream& os, T const & F) const {/*{{{*/
313         /* can't make my mind as to whether I should just output that to
314          * the terminal, or stow it in a string first...
315          */
316 
317         constexpr const char * Fname = T::name;
318 
319         unsigned int TMAX = max_threads();
320 
321         std::vector<double> tvec(TMAX + 1, -1);
322 
323         if (Fname == NULL) {
324             tvec[1] = tvec[TMAX] = 0;
325             return tvec;
326         }
327 
328         os << fmt::sprintf("# %s%s;%s (@%d) wct for %s by nthreads:",
329                 F.mesh > 1 ? "MPI-" : "",
330                 F.op.op_name(),
331                 lingen_substep_schedule::fft_name(encode_fft_type<typename T::OP::FFT>()),
332                 input_length,
333                 Fname
334                 );
335 
336         if (F.max_parallel() < TMAX) {
337             TMAX = F.max_parallel();
338             os << fmt::format(FMT_STRING(" [capped to {}]"), TMAX);
339         }
340 
341         unsigned int kl = 1;
342         double tl = F(kl);
343         tvec[kl] = tl;
344         os << fmt::format(FMT_STRING(" {}:{:.2g}"), kl, tl);
345         os.flush();
346 
347 
348         unsigned int kh = TMAX;
349         if (kh > 1) {
350             double th = F(kh);
351             tvec[kh] = th;
352             os << fmt::format(FMT_STRING(" {}:{:.2g}"), kh, th);
353             os.flush();
354         }
355 
356         complete_tvec(os, tvec, F, kl, kh);
357 
358         os << std::endl;
359         return tvec;
360     }/*}}}*/
361 
362     private:
363     template<typename T>
get_ft_time_from_cache_or_recomputelingen_substep_characteristics364     parallelizable_timing get_ft_time_from_cache_or_recompute(std::ostream& os, T const & F, tc_t::mul_or_mp_value::value_type & store) const
365     {
366         if (!store.empty()) {
367             /* what do we have in cache, to start with ? We need to know how
368              * many threads max were considered at the time when the tuning
369              * was made.
370              */
371             unsigned int th_cache = 0;
372             for(auto const & x : store) {
373                 if (x.first > th_cache)
374                     th_cache = x.first;
375             }
376             /* How many threads do we support max on the current platform ?
377              * This is not necessarily equal.
378              */
379             if (F.max_parallel() == th_cache) {
380                 std::vector<double> tvec(th_cache + 1, -1);
381                 for(auto const & x : store) {
382                     tvec[x.first] = x.second;
383                 }
384                 return parallelizable_timing(tvec);
385             }
386             os << fmt::format(FMT_STRING("# ignoring cached entry, computed for up to {} threads (here {} max)\n"), th_cache, F.max_parallel());
387             store.clear();
388         }
389 
390         std::vector<double> tvec = fill_tvec(os, F);
391         for(unsigned int i = 1 ; i < tvec.size() ; i++) {
392             if (tvec[i] >= 0) store.push_back( { i, tvec[i] } );
393         }
394         return parallelizable_timing(tvec);
395     }
396 
397     template<typename OP>
get_ft_timeslingen_substep_characteristics398     std::array<parallelizable_timing, 4> get_ft_times(std::ostream& os, OP const & op, pc_t const& P, unsigned int mesh, lingen_substep_schedule const & S, tc_t & C) const {/*{{{*/
399         lingen_tuning_cache::mul_or_mp_key K { op_type, S.fft_type, mpz_sizeinbase(p, 2), asize, bsize };
400         typedef lingen_tuning_cache::mul_or_mp_value cache_value_type;
401 
402         std::array<parallelizable_timing, 4> res;
403         cache_value_type cached_res;
404 
405         if (C.has(K))
406             cached_res = C[K];
407 
408         {
409             microbench_dft<OP> F(op, P, mesh, *this);
410             res[0] = get_ft_time_from_cache_or_recompute(os, F, cached_res[0]);
411             res[1] = res[0];
412             cached_res[1] = cached_res[0];
413         }
414 
415         {
416             microbench_ift<OP> F(op, P, mesh, *this);
417             res[2] = get_ft_time_from_cache_or_recompute(os, F, cached_res[2]);
418         }
419 
420         {
421             microbench_conv<OP> F(op, P, mesh, *this);
422             res[3] = get_ft_time_from_cache_or_recompute(os, F, cached_res[3]);
423         }
424 
425         /* A priori this is either all-fresh or all-old. But nothing in
426          * the code checks that the situation is indeed the same for the
427          * four items.
428          */
429         C[K] = cached_res;
430 
431         return res;
432     }/*}}}*/
433 
434     public:
435     /* {{{ std::tuple<size_t, size_t, size_t> get_operand_ram() const {
436         return {
437             n0 * n1 * asize * mpz_size(p) * sizeof(mp_limb_t),
438             n1 * n2 * bsize * mpz_size(p) * sizeof(mp_limb_t),
439             n0 * n2 * csize * mpz_size(p) * sizeof(mp_limb_t) };
440     }
441     *//*}}}*/
442 #if 0
443     int mesh_inner_size(pc_t const & P) const {/*{{{*/
444         return P.r;
445     }/*}}}*/
446 #endif
mpi_split0lingen_substep_characteristics447     subdivision mpi_split0(unsigned int mesh) const {/*{{{*/
448         return subdivision(n0, mesh);
449     }/*}}}*/
mpi_split1lingen_substep_characteristics450     subdivision mpi_split1(unsigned int mesh) const {/*{{{*/
451         return subdivision(n1, mesh);
452     }/*}}}*/
mpi_split2lingen_substep_characteristics453     subdivision mpi_split2(unsigned int mesh) const {/*{{{*/
454         return subdivision(n2, mesh);
455     }/*}}}*/
shrink_split0lingen_substep_characteristics456     subdivision shrink_split0(unsigned int mesh, unsigned int shrink0) const {/*{{{*/
457         unsigned int nr0 = mpi_split0(mesh).block_size_upper_bound();
458         return subdivision(nr0, shrink0);
459     }/*}}}*/
shrink_split2lingen_substep_characteristics460     subdivision shrink_split2(unsigned int mesh, unsigned int shrink2) const {/*{{{*/
461         unsigned int nr2 = mpi_split2(mesh).block_size_upper_bound();
462         return subdivision(nr2, shrink2);
463     }/*}}}*/
shrink_split0lingen_substep_characteristics464     subdivision shrink_split0(unsigned int mesh, sc_t const & S) const {/*{{{*/
465         return shrink_split0(mesh, S.shrink0);
466     }/*}}}*/
shrink_split2lingen_substep_characteristics467     subdivision shrink_split2(unsigned int mesh, sc_t const & S) const {/*{{{*/
468         return shrink_split2(mesh, S.shrink2);
469     }/*}}}*/
get_peak_ram_multiplierslingen_substep_characteristics470     std::array<std::array<unsigned int, 3>, 2> get_peak_ram_multipliers(unsigned int mesh, sc_t const & S) const { /* {{{ */
471         unsigned int nrs0 = shrink_split0(mesh, S).block_size_upper_bound();
472         unsigned int nrs2 = shrink_split2(mesh, S).block_size_upper_bound();
473 
474         unsigned int b0 = S.batch[0];
475         unsigned int b1 = S.batch[1];
476         unsigned int b2 = S.batch[2];
477         unsigned int mul0 = 0;
478         mul0 += b1 * mesh * b0;
479         mul0 += b1 * mesh * b2;
480         mul0 += nrs0*nrs2;
481         unsigned int mul1 = std::max(b0 * std::max(b1, b2), nrs0 * nrs2);
482         unsigned int mul12 = b0 * b2;
483 
484         /* *IF* the pragma omp parallel statements go with an appropriate
485          * num_threads() clause, then yes, it makes sense to do this
486          * min(). Otherwise, the multipliers will be max_threads() in all
487          * cases.
488          */
489         mul1 = std::min(mul1, (unsigned int) max_threads());
490         mul12 = std::min(mul12, (unsigned int) max_threads());
491 
492         /*
493         size_t live_transforms = fft_alloc_sizes[0] * mul0;
494         size_t maxtemp_ft = std::min(nrs0 * nrs2,
495                 (unsigned int) max_threads()) * fft_alloc_sizes[1];
496         size_t maxtemp_addmul = std::min(S.batch[0] * S.batch[2],
497                 (unsigned int) max_threads())
498             * (fft_alloc_sizes[1] + fft_alloc_sizes[2]);
499         return live_transforms + std::max(maxtemp_ft, maxtemp_addmul);
500         */
501         return {{
502             // if (mul1 * op.get_alloc_sizes()[1] < mul12 *
503             // (op.get_alloc_sizes()[1] + op.get_alloc_sizes()[2])) then
504             // the largest amount of ram is with the first of the two
505             // options below.
506             {{ mul0, mul12, mul12 }},
507             {{ mul0, mul1, 0 }}
508         }};
509     }
510     /*}}}*/
get_peak_ramlingen_substep_characteristics511     size_t get_peak_ram(op_mul_or_mp_base const & op, unsigned int mesh, sc_t const & S) const { /* {{{ */
512         auto multipliers = get_peak_ram_multipliers(mesh, S);
513         size_t rpeak = 0;
514         for(auto const & M : multipliers) {
515             size_t r = 0;
516             for(unsigned int i = 0 ; i < 3 ; i++)
517                 r += M[i] * op.get_alloc_sizes()[i];
518             if (r > rpeak) rpeak = r;
519         }
520         return rpeak;
521     }
522     /*}}}*/
get_peak_ramlingen_substep_characteristics523     size_t get_peak_ram(unsigned int mesh, sc_t const & S) const { /* {{{ */
524         std::shared_ptr<op_mul_or_mp_base> op = instantiate(S.fft_type);
525         return get_peak_ram(*op, mesh, S);
526     }
527     /*}}}*/
528 
fft_type_validlingen_substep_characteristics529     bool fft_type_valid(lingen_substep_schedule::fft_type_t fft_type) const {
530         try {
531             std::shared_ptr<op_mul_or_mp_base> op = instantiate(fft_type);
532 #ifdef SELECT_MPFQ_LAYER_u64k1
533             if (op_type == op_mul_or_mp_base::OP_MUL) {
534                 auto x = dynamic_cast<op_mul<gf2x_ternary_fft_info> const *>(op.get());
535                 if (x)
536                     return x->fti.K != 0;
537             }
538             if (op_type == op_mul_or_mp_base::OP_MP) {
539                 auto x = dynamic_cast<op_mp<gf2x_ternary_fft_info> const *>(op.get());
540                 if (x)
541                     return x->fti.K != 0;
542             }
543 #endif
544         } catch (std::exception const &) {
545             return false;
546         }
547         return true;
548     }
549 
550 
551     private:
552 
553     struct call_time_digest {
554         parallelizable_timing T_dft0;
555         parallelizable_timing T_dft2;
556         parallelizable_timing T_conv;
557         parallelizable_timing T_ift;
558         parallelizable_timing T_comm0;
559         parallelizable_timing T_comm2;
560         std::array<size_t, 3> fft_alloc_sizes;
totallingen_substep_characteristics::call_time_digest561         double total() const {
562             double tt = 0;
563             tt += T_dft0;
564             tt += T_dft2;
565             tt += T_conv;
566             tt += T_ift;
567             tt += T_comm0;
568             tt += T_comm2;
569             return tt;
570         }
571     };
572 
573     template<typename OP>
get_call_time_backendlingen_substep_characteristics574     call_time_digest get_call_time_backend(
575             std::ostream& os,
576             OP const & op,
577             pc_t const & P,
578             unsigned int mesh,
579             sc_t const & S,
580             tc_t & C,
581             bool do_timings) const
582     { /* {{{ */
583         ASSERT_FOR_STATIC_ANALYZER(mesh != 0);
584 
585         /* XXX Any change here must also be reflected in the mp_or_mul
586          * structure in lingen_matpoly_bigmatpoly_ft_common.hpp
587          */
588         parallelizable_timing T_dft0 { -1 };
589         parallelizable_timing T_dft2 { -1 };
590         parallelizable_timing T_ift  { -1 };
591         parallelizable_timing T_conv { -1 };
592 
593         if (do_timings) {
594             auto ft = get_ft_times(os, op, P, mesh, S, C);
595             /* These are just base values, we'll multiply them later on */
596             T_dft0 = ft[0];
597             T_dft2 = ft[1];
598             T_ift = ft[2];
599             T_conv = ft[3];
600         }
601 
602         /* Each of the r*r nodes has local matrices size (at most)
603          * nr0*nr1, nr1*nr2, and nr0*nr2. For the actual computations, we
604          * care more about the shrunk submatrices, though */
605         unsigned int nr1 = mpi_split1(mesh).block_size_upper_bound();
606 
607         /* The shrink parameters will divide the size of the local
608          * matrices we consider by numbers shrink0 and shrink2. This
609          * increases the time, and decreases the memory footprint */
610         unsigned int nrs0 = shrink_split0(mesh, S).block_size_upper_bound();
611         unsigned int nrs2 = shrink_split2(mesh, S).block_size_upper_bound();
612 
613         /* The 1-threaded timing will be obtained by setting T=batch=1.  */
614 
615         /* Locally, we'll add together r matrices of size nrs0*nrs2,
616          * which will be computed as products of vectors of size
617          * batch[0]*batch[1] and batch[1]*batch[2]. (with wither
618          * batch[0]==nrs0 or batch[2]==nrs2). The operation will be
619          * repeated nrs0/batch[0] * nr1/batch[1] * nrs2/batch[2] times
620          * (not all multipliers apply to everyone).
621          */
622 
623         T_dft0.parallelize(S.batch[0] * S.batch[1], P.T);
624         T_dft0 *= iceildiv(nrs0, S.batch[0]);
625         T_dft0 *= iceildiv(nr1, S.batch[1]);
626 
627         T_dft2.parallelize(S.batch[1] * S.batch[2], P.T);
628         T_dft2 *= iceildiv(nrs2, S.batch[2]);
629         T_dft2 *= iceildiv(nr1, S.batch[1]);
630 
631         T_conv.parallelize(S.batch[0] * S.batch[2], P.T);
632         T_conv *= mesh;
633         T_conv *= iceildiv(nr1, S.batch[1]) * S.batch[1];
634         T_conv *= iceildiv(nrs0, S.batch[0]);
635         T_conv *= iceildiv(nrs2, S.batch[2]);
636 
637         T_ift.parallelize(nrs0 * nrs2, P.T);
638 
639         /* Now the communication time _per call_ */
640         /* TODO: maybe include some model of latency... */
641         /* FIXME: This is wrong with the non-caching approach */
642         parallelizable_timing T = op.get_alloc_sizes()[0] / P.mpi_xput;
643         T *= S.batch[1] * iceildiv(nr1, S.batch[1]);
644         /* allgather over n nodes */
645         T *= mesh - 1;
646         parallelizable_timing T_comm0 = T;
647         parallelizable_timing T_comm2 = T;
648         T_comm0 *= S.batch[0] * iceildiv(nrs0, S.batch[0]);
649         T_comm2 *= S.batch[2] * iceildiv(nrs2, S.batch[2]);
650 
651         call_time_digest ret;
652 
653         ret.T_dft0 = T_dft0 *= S.shrink0 * S.shrink2;
654         ret.T_dft2 = T_dft2 *= S.shrink0 * S.shrink2;
655         ret.T_conv = T_conv *= S.shrink0 * S.shrink2;
656         ret.T_ift  = T_ift  *= S.shrink0 * S.shrink2;
657         ret.T_comm0 = T_comm0 *= S.shrink0 * S.shrink2;
658         ret.T_comm2 = T_comm2 *= S.shrink0 * S.shrink2;
659         ret.fft_alloc_sizes = op.get_alloc_sizes();
660 
661         /* This is for _one_ call only (one node in the tree).
662          * We must apply multiplier coefficients (typically 1<<depth) */
663         return ret;
664     }/*}}}*/
665 
instantiatelingen_substep_characteristics666     std::shared_ptr<op_mul_or_mp_base> instantiate(lingen_substep_schedule::fft_type_t fft_type) const
667     {
668         /* must create the op type from the lingen_substep_schedule
669          * (a.k.a. sc_t) type */
670         switch(op_type) {
671             case op_mul_or_mp_base::OP_MP:
672                 switch(fft_type) {
673                     case lingen_substep_schedule::FFT_NONE:
674                         return std::make_shared<op_mp<void>>(p, asize, bsize, n1);
675 #ifndef SELECT_MPFQ_LAYER_u64k1
676                     case lingen_substep_schedule::FFT_FLINT:
677                         return std::make_shared<op_mp<fft_transform_info>>(p, asize, bsize, n1);
678 #else
679                     case lingen_substep_schedule::FFT_CANTOR:
680                         return std::make_shared<op_mp<gf2x_cantor_fft_info>>(p, asize, bsize, n1);
681                     case lingen_substep_schedule::FFT_TERNARY:
682                         return std::make_shared<op_mp<gf2x_ternary_fft_info>>(p, asize, bsize, n1);
683 #endif
684                     default:
685                         throw std::runtime_error("invalid data (fft_type)");
686                 }
687             case op_mul_or_mp_base::OP_MUL:
688                 switch(fft_type) {
689                     case lingen_substep_schedule::FFT_NONE:
690                         return std::make_shared<op_mul<void>>(p, asize, bsize, n1);
691 #ifndef SELECT_MPFQ_LAYER_u64k1
692                     case lingen_substep_schedule::FFT_FLINT:
693                         return std::make_shared<op_mul<fft_transform_info>>(p, asize, bsize, n1);
694 #else
695                     case lingen_substep_schedule::FFT_CANTOR:
696                         return std::make_shared<op_mul<gf2x_cantor_fft_info>>(p, asize, bsize, n1);
697                     case lingen_substep_schedule::FFT_TERNARY:
698                         return std::make_shared<op_mul<gf2x_ternary_fft_info>>(p, asize, bsize, n1);
699 #endif
700                     default:
701                         throw std::runtime_error("invalid data (fft_type)");
702                 }
703             default:
704                 throw std::runtime_error("invalid data (op)");
705         }
706     }
707 
get_call_time_backendlingen_substep_characteristics708     call_time_digest get_call_time_backend(std::ostream& os, pc_t const & P, unsigned int mesh, sc_t const & S, tc_t & C, bool do_timings) const { /* {{{ */
709         switch(op_type) {
710             case op_mul_or_mp_base::OP_MP:
711                 switch(S.fft_type) {
712                     case lingen_substep_schedule::FFT_NONE:
713                         return get_call_time_backend(os, op_mp<void>(p, asize, bsize, n1), P, mesh, S, C, do_timings);
714 #ifndef SELECT_MPFQ_LAYER_u64k1
715                     case lingen_substep_schedule::FFT_FLINT:
716                         return get_call_time_backend(os, op_mp<fft_transform_info>(p, asize, bsize, n1), P, mesh, S, C, do_timings);
717 #else
718                     case lingen_substep_schedule::FFT_CANTOR:
719                         return get_call_time_backend(os, op_mp<gf2x_cantor_fft_info>(p, asize, bsize, n1), P, mesh, S, C, do_timings);
720                     case lingen_substep_schedule::FFT_TERNARY:
721                         return get_call_time_backend(os, op_mp<gf2x_ternary_fft_info>(p, asize, bsize, n1), P, mesh, S, C, do_timings);
722 #endif
723                     default:
724                         throw std::runtime_error("invalid data (fft_type)");
725                 }
726             case op_mul_or_mp_base::OP_MUL:
727                 switch(S.fft_type) {
728                     case lingen_substep_schedule::FFT_NONE:
729                         return get_call_time_backend(os, op_mul<void>(p, asize, bsize, n1), P, mesh, S, C, do_timings);
730 #ifndef SELECT_MPFQ_LAYER_u64k1
731                     case lingen_substep_schedule::FFT_FLINT:
732                         return get_call_time_backend(os, op_mul<fft_transform_info>(p, asize, bsize, n1), P, mesh, S, C, do_timings);
733 #else
734                     case lingen_substep_schedule::FFT_CANTOR:
735                         return get_call_time_backend(os, op_mul<gf2x_cantor_fft_info>(p, asize, bsize, n1), P, mesh, S, C, do_timings);
736                     case lingen_substep_schedule::FFT_TERNARY:
737                         return get_call_time_backend(os, op_mul<gf2x_ternary_fft_info>(p, asize, bsize, n1), P, mesh, S, C, do_timings);
738 #endif
739                     default:
740                         throw std::runtime_error("invalid data (fft_type)");
741                 }
742             default:
743                 throw std::runtime_error("invalid data (op)");
744         }
745 
746     }/*}}}*/
747 
748     public:
749 
get_companionlingen_substep_characteristics750     lingen_call_companion::mul_or_mp_times get_companion(std::ostream& os, pc_t const & P, unsigned int mesh, sc_t const & S, tc_t & C, size_t reserved_ram, bool do_timings) const { /* {{{ */
751         lingen_call_companion::mul_or_mp_times D { op_type };
752         D.S = S;
753         auto A = get_call_time_backend(os, P, mesh, S, C, do_timings);
754         double tt = A.total();
755         D.tt = { 1, tt };
756         D.t_dft_A = A.T_dft0;
757         D.t_dft_B = A.T_dft2;
758         D.t_conv = A.T_conv;
759         D.t_ift_C = A.T_ift;
760         D.t_dft_A_comm = A.T_comm0;
761         D.t_dft_B_comm = A.T_comm2;
762         D.fft_alloc_sizes = A.fft_alloc_sizes;
763         D.peak_ram_multipliers = get_peak_ram_multipliers(mesh, S);
764         D.asize = asize;
765         D.bsize = bsize;
766         D.csize = csize;
767         D.reserved_ram = reserved_ram;
768         return D;
769     }/*}}}*/
get_call_timelingen_substep_characteristics770     double get_call_time(std::ostream& os, pc_t const & P, unsigned int mesh, sc_t const & S, tc_t & C, bool do_timings) const {/*{{{*/
771         return get_call_time_backend(os, P, mesh, S, C, do_timings).total();
772     }/*}}}*/
773 
774     private:
775     struct sortop {
operator ()lingen_substep_characteristics::sortop776         bool operator()(std::tuple<double, unsigned int, lingen_substep_schedule> const & a, std::tuple<double, unsigned int, lingen_substep_schedule> const & b) const {
777             if (std::get<0>(a) != std::get<0>(b))
778                 return std::get<0>(a) < std::get<0>(b);
779             else
780                 return std::get<1>(a) > std::get<1>(b);
781         }
782     };
783 
784     public:
sort_scheduleslingen_substep_characteristics785     void sort_schedules(
786             std::ostream& os,
787             std::vector<lingen_substep_schedule>& schedules,
788             lingen_platform const & P,
789             unsigned int mesh,
790             lingen_tuning_cache & C,
791             bool do_timings) const
792     {/*{{{*/
793         std::vector<std::tuple<double, unsigned int, lingen_substep_schedule>> precomp;
794         for(auto & S : schedules) {
795             /* This should ensure that all timings are obtained from cache */
796             /* This may print timing info to the output stream */
797             double t = do_timings ? get_call_time(os, P, mesh, S, C, do_timings) : 0;
798             unsigned int B = S.batch[1] * std::min(S.batch[0], S.batch[2]);
799             precomp.emplace_back(t, B, std::move(S));
800         }
801         sort(precomp.begin(), precomp.end(), sortop());
802         schedules.clear();
803         for(auto & P : precomp) {
804             schedules.emplace_back(std::move(std::get<2>(P)));
805         }
806     }/*}}}*/
807 
get_and_report_call_timelingen_substep_characteristics808     double get_and_report_call_time(std::ostream& os, pc_t const & P, unsigned int mesh, sc_t const & S, tc_t & C, bool do_timings) const { /* {{{ */
809         bool cached = has_cached_time(C, S.fft_type);
810         std::shared_ptr<op_mul_or_mp_base> op = instantiate(S.fft_type);
811         os << fmt::sprintf("# %s%s;%s (@%zu) [shrink=(%u,%u) batch=(%u,%u,%u)] %s, ",
812                 mesh > 1 ? "MPI-" : "",
813                 op_mul_or_mp_base::op_name(op_type),
814                 S.fft_name(),
815                 input_length,
816                 S.shrink0,
817                 S.shrink2,
818                 S.batch[0],
819                 S.batch[1],
820                 S.batch[2],
821                 size_disp(get_peak_ram(*op, mesh, S)));
822         os << std::flush;
823         double tt = get_call_time(os, P, mesh, S, C, do_timings);
824         if (do_timings) {
825             os << fmt::sprintf("%.2f%s\n",
826                     tt,
827                     cached ? " [from cache]" : "");
828         } else {
829             os << "not timed\n";
830             tt = -1;
831         }
832         return tt;
833     }/*}}}*/
report_op_winnerlingen_substep_characteristics834     void report_op_winner(std::ostream& os, unsigned int mesh, sc_t const & S) const
835     {
836         std::shared_ptr<op_mul_or_mp_base> op = instantiate(S.fft_type);
837         os << fmt::sprintf("# %s%s;%s wins : %s\n",
838                 mesh > 1 ? "MPI-" : "",
839                 op_mul_or_mp_base::op_name(op_type),
840                 S.fft_name(),
841                 op->explain());
842     }
843 
844 #if 0
845     void report_size_stats_human(std::ostream& os, sc_t const & S) const {/*{{{*/
846         std::shared_ptr<op_mul_or_mp_base> op = instantiate(S.fft_type);
847         os << fmt::sprintf("# %s (per op): %s+%s+%s, transforms 3*%s\n",
848                 step_name(),
849                 size_disp(asize*mpz_size(p)*sizeof(mp_limb_t)),
850                 size_disp(bsize*mpz_size(p)*sizeof(mp_limb_t)),
851                 size_disp(csize*mpz_size(p)*sizeof(mp_limb_t)),
852                 size_disp(op->get_alloc_sizes()[0]));
853         os << fmt::sprintf("# %s (total for %u*%u * %u*%u): %s, transforms %s\n",
854                 step_name(),
855                 n0,n1,n1,n2,
856                 size_disp((n0*n1*asize+n1*n2*bsize+n0*n2*csize)*mpz_size(p)*sizeof(mp_limb_t)),
857                 size_disp((n0*n1+n1*n2+n0*n2)*op->get_alloc_sizes()[0]));
858     }/*}}}*/
859 #endif
860 };
861 
862 /* the three structs below return the WCT needed to do the
863  * operation named (name) doing it with (nparallel) threads working
864  * at the same time (at our level -- we don't preclude the use of
865  * more omp threads at the level below). The computation is repeated
866  * (n) times, and we work with allocated space that is (k) times
867  * bigger than what is necessary to accomodate the (nparallel)
868  * parallel calls.
869  */
870 
871 template<typename OP_T>
872 struct microbench_dft { /*{{{*/
873     typedef lingen_substep_characteristics ch_t;
874     typedef lingen_platform pc_t;
875     typedef OP_T OP;
876     OP op;
877     pc_t P;
878     unsigned int mesh;
879     ch_t const & U;
880     static constexpr const char * name = "dft";
microbench_dftmicrobench_dft881     microbench_dft(OP const & op, pc_t const& P, unsigned int mesh, ch_t const & U) : op(op), P(P), mesh(mesh), U(U) {}
max_parallelmicrobench_dft882     unsigned int max_parallel() const {
883         size_t R = 0;
884         /* storage for one input coeff and one output transform */
885         constexpr const unsigned int simd = matpoly::over_gf2 ? ULONG_BITS : 1;
886         R += iceildiv(U.asize, simd) * mpz_size(U.p) * sizeof(mp_limb_t);
887         R += op.get_alloc_sizes()[0];
888         /* plus the temp memory for the dft operation */
889         R += op.get_alloc_sizes()[1];
890         size_t n = P.available_ram / R;
891         /* TODO: we probably want to ask P about max_threads. */
892         n = std::min(n, (size_t) U.max_threads());
893         if (n == 0) throw std::runtime_error("not enough RAM");
894         return n;
895     }
operator ()microbench_dft896     double operator()(unsigned int nparallel) const
897     {
898         matpoly a(U.ab, nparallel, 1, U.asize);
899         a.zero_pad(U.asize);
900         a.fill_random(0, U.asize, U.rstate);
901         matpoly_ft<typename OP::FFT> ta(a.m, a.n, op.fti);
902         double tt = -wct_seconds();
903         matpoly_ft<typename OP::FFT>::dft(ta, a);
904         return tt + wct_seconds();
905     }
906 };/*}}}*/
907 template<typename OP_T>
908 struct microbench_ift { /*{{{*/
909     typedef lingen_substep_characteristics ch_t;
910     typedef lingen_platform pc_t;
911     typedef OP_T OP;
912     OP op;
913     pc_t P;
914     unsigned int mesh;
915     ch_t const & U;
916     static constexpr const char * name = "ift";
microbench_iftmicrobench_ift917     microbench_ift(OP const & op, pc_t const& P, unsigned int mesh, ch_t const & U) : op(op), P(P), mesh(mesh), U(U) {}
max_parallelmicrobench_ift918     unsigned int max_parallel() const {
919         size_t R = 0;
920         /* storage for one input transform and one output coeff */
921         constexpr const unsigned int simd = matpoly::over_gf2 ? ULONG_BITS : 1;
922         R += iceildiv(U.csize, simd) * mpz_size(U.p) * sizeof(mp_limb_t);
923         R += op.get_alloc_sizes()[0];
924         /* plus the temp memory for the ift operation */
925         R += op.get_alloc_sizes()[1];
926         size_t n = P.available_ram / R;
927         /* TODO: we probably want to ask P about max_threads. */
928         n = std::min(n, (size_t) U.max_threads());
929         if (n == 0) throw std::runtime_error("not enough RAM");
930         return n;
931     }
operator ()microbench_ift932     double operator()(unsigned int nparallel) const {
933         matpoly c(U.ab, nparallel, 1, U.csize);
934         matpoly_ft<typename OP::FFT> tc(c.m, c.n, op.fti);
935         /* This is important, since otherwise the inverse transform won't
936          * work */
937         c.set_size(U.csize);
938         tc.zero(); /* would be .fill_random(rstate) if we had it */
939         double tt = -wct_seconds();
940         matpoly_ft<typename OP::FFT>::ift(c, tc);
941         return tt + wct_seconds();
942     }
943 };/*}}}*/
944 template<typename OP_T>
945 struct microbench_conv { /*{{{*/
946     typedef lingen_substep_characteristics ch_t;
947     typedef lingen_platform pc_t;
948     typedef OP_T OP;
949     OP op;
950     pc_t P;
951     unsigned int mesh;
952     ch_t const & U;
953     static constexpr const char * name = "conv";
microbench_convmicrobench_conv954     microbench_conv(OP const & op, pc_t const& P, unsigned int mesh, ch_t const & U) : op(op), P(P), mesh(mesh), U(U) {}
max_parallelmicrobench_conv955     unsigned int max_parallel() const {
956         /* for k = nparallel, we need 2k+1 transforms in ram */
957         size_t R = 0;
958         /* storage for two input transform */
959         R += 2 * op.get_alloc_sizes()[0];
960         /* plus the temp memory for the addmul operation */
961         R += op.get_alloc_sizes()[1];
962         R += op.get_alloc_sizes()[2];
963         /* take into account the +1 transform */
964         size_t n = (P.available_ram - op.get_alloc_sizes()[0]) / R;
965         /* TODO: we probably want to ask P about max_threads. */
966         n = std::min(n, (size_t) U.max_threads());
967         if (n == 0) throw std::runtime_error("not enough RAM");
968         return n;
969     }
operator ()microbench_conv970     double operator()(unsigned int nparallel) const {
971         matpoly_ft<typename OP::FFT> tc( nparallel, 1, op.fti);
972         matpoly_ft<typename OP::FFT> tc0(nparallel, 1, op.fti);
973         matpoly_ft<typename OP::FFT> tc1(1, 1, op.fti);
974         tc0.zero(); /* would be .fill_random(rstate) if we had it */
975         tc1.zero(); /* would be .fill_random(rstate) if we had it */
976         double tt = -wct_seconds();
977         matpoly_ft<typename OP::FFT>::addcompose(tc, tc0, tc1);
978         return tt + wct_seconds();
979     }
980 };/*}}}*/
981 template<template<typename> class META_OP>
982 struct microbench_dft<META_OP<void>> { /*{{{*/
983     typedef lingen_substep_characteristics ch_t;
984     typedef lingen_platform pc_t;
985     typedef META_OP<void> OP;
986     OP op;
987     pc_t P;
988     unsigned int mesh;
989     ch_t const & U;
990     static constexpr const char * name = NULL;
microbench_dftmicrobench_dft991     microbench_dft(OP const & op, pc_t const& P, unsigned int mesh, ch_t const & U) : op(op), P(P), mesh(mesh), U(U) {}
max_parallelmicrobench_dft992     unsigned int max_parallel() const { return U.max_threads(); }
operator ()microbench_dft993     double operator()(unsigned int) const { return 0; }
994 };/*}}}*/
995 template<template<typename> class META_OP>
996 struct microbench_ift<META_OP<void>> { /*{{{*/
997     typedef lingen_substep_characteristics ch_t;
998     typedef lingen_platform pc_t;
999     typedef META_OP<void> OP;
1000     OP op;
1001     pc_t P;
1002     unsigned int mesh;
1003     ch_t const & U;
1004     static constexpr const char * name = NULL;
microbench_iftmicrobench_ift1005     microbench_ift(OP const & op, pc_t const& P, unsigned int mesh, ch_t const & U) : op(op), P(P), mesh(mesh), U(U) {}
max_parallelmicrobench_ift1006     unsigned int max_parallel() const { return U.max_threads(); }
operator ()microbench_ift1007     double operator()(unsigned int) const { return 0; }
1008 };/*}}}*/
1009 template<template<typename> class META_OP>
1010 struct microbench_conv<META_OP<void>> { /*{{{*/
1011     typedef lingen_substep_characteristics ch_t;
1012     typedef lingen_platform pc_t;
1013     typedef META_OP<void> OP;
1014     OP op;
1015     pc_t P;
1016     unsigned int mesh;
1017     ch_t const & U;
1018     static constexpr const char * name = "product";
microbench_convmicrobench_conv1019     microbench_conv(OP const & op, pc_t const& P, unsigned int mesh, ch_t const & U) : op(op), P(P), mesh(mesh), U(U) {}
max_parallelmicrobench_conv1020     unsigned int max_parallel() const { return U.max_threads(); }
operator ()microbench_conv1021     double operator()(unsigned int nparallel) const {
1022         constexpr const unsigned int splitwidth = matpoly::over_gf2 ? 64 : 1;
1023         matpoly a(U.ab, nparallel, splitwidth, U.asize);
1024         matpoly b(U.ab, splitwidth, 1, U.asize);
1025         a.zero_pad(U.asize);
1026         a.fill_random(0, U.asize, U.rstate);
1027         b.zero_pad(U.bsize);
1028         b.fill_random(0, U.bsize, U.rstate);
1029         matpoly c(U.ab, nparallel, 1, U.csize);
1030         c.zero_pad(U.csize);
1031         double tt = -wct_seconds();
1032         OP::addcompose(c, a, b);
1033         return (tt + wct_seconds()) / splitwidth;
1034     }
1035 };/*}}}*/
1036 
1037 #endif	/* LINGEN_SUBSTEP_CHARACTERISTICS_HPP_ */
1038