1 #include "cado.h" // IWYU pragma: keep
2 
3 #include <climits>                            // for UINT_MAX
4 #include <cstdint>                            // for SIZE_MAX
5 #include <cfloat>                              // for DBL_MAX
6 #include <cstdio>                              // for fputs, stderr
7 #include <cmath>                              // for log2
8 
9 #include <algorithm>                           // for max, sort, find, unique
10 #include <array>                               // for array
11 #include <iostream>                            // for operator<<, basic_ostream
12 #include <fstream> // ofstream // IWYU pragma: keep
13 #include <map>                                 // for map, operator!=, map<>...
14 #include <memory>                              // for allocator_traits<>::va...
15 #include <stdexcept>                           // for invalid_argument, over...
16 #include <string>                              // for string, operator<<
17 #include <tuple>                               // for tie, get, tuple
18 #include <utility>                             // for pair, make_pair
19 #include <vector>                              // for vector
20 
21 #include <gmp.h>                               // for mp_limb_t, mpz_size
22 
23 #include "lingen_tuning.hpp"
24 #include "cxx_mpz.hpp"                         // for cxx_mpz
25 #include "fmt/core.h"                          // for check_format_string
26 #include "fmt/format.h"                        // for basic_buffer::append
27 #include "fmt/ostream.h"                       // for formatbuf<>::int_type
28 #include "lingen_abfield.hpp"                  // for abdst_field
29 #include "lingen_bw_dimensions.hpp"            // for bw_dimensions
30 #include "lingen_call_companion.hpp"           // for lingen_call_companion
31 #include "lingen_matpoly_select.hpp"           // for matpoly, matpoly::over...
32 #include "lingen_mul_substeps_base.hpp"        // for op_mul_or_mp_base, op_...
33 #include "lingen_platform.hpp"                 // for lingen_platform
34 #include "lingen_qcode_select.hpp"             // for test_basecase
35 #include "lingen_substep_characteristics.hpp"  // for lingen_substep_charact...
36 #include "lingen_substep_schedule.hpp"         // for lingen_substep_schedule
37 #include "lingen_tuning_cache.hpp"             // for lingen_tuning_cache::c...
38 #include "macros.h"                            // for iceildiv, ASSERT_ALWAYS
39 #include "misc.h"                              // for size_disp
40 #include "params.h"                            // for cxx_param_list, param_...
41 #include "subdivision.hpp"                     // for subdivision
42 #include "timing.h"                            // for wct_seconds, weighted_...
43 
44 /* {{{ all_splits_of
45  * Given n>=1 return the list of all integers k (1<=k<=n) such
46  * that it is possible to divide n into k blocks (not necessarily of
47  * equal size) but such that all the splits that are obtained this way
48  * have different maximal block sizes.
49  *
50  * (in the binary case we impose that block sizes are always multiples of
51  * 8 -- this is done to limit the number of possible configurations)
52  *
53  * The returned list is such that k->iceildiv(n, k) actually performs the
54  * reversal of the list. And furthermore, for all k's such that k^2<=n,
55  * the obtained splits are different. Hence it suffices to iterate over
56  * all these k's
57  */
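/* Worked example (added illustration, not exhaustive): for n=12 the loop
 * below first collects k=1,2,3 (since 3*3 <= 12 < 4*4), none of which has
 * k*k == 12, and then appends iceildiv(12,3)=4, iceildiv(12,2)=6 and
 * iceildiv(12,1)=12, giving {1,2,3,4,6,12}. The maximal block sizes
 * iceildiv(12,k) over this list are 12,6,4,3,2,1, i.e. the same list read
 * backwards. In the binary case the same computation is applied to n/8.
 */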
58 std::vector<unsigned int> all_splits_of(unsigned int n)
59 {
60 #ifdef SELECT_MPFQ_LAYER_u64k1
61     n /= 8;
62 #endif
63     std::vector<unsigned int> res;
64     for(unsigned int k = 1 ; k * k <= n ; k++) res.push_back(k);
65     unsigned int j = res.size();
66     if (res[j-1] * res[j-1] == n) j--;
67     for( ; j-- ; ) res.push_back(iceildiv(n, res[j]));
68     return res;
69 }
70 /* }}} */
71 
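/* Sketch of what optimize() below does (added commentary): for the given
 * sub-step it enumerates candidate schedules as combinations of an allowed
 * fft type, shrink0/shrink2 factors taken from all_splits_of(), and batch
 * triples (b0,b1,b2); it keeps only those whose peak RAM fits within
 * P.available_ram minus the reserved amount, lets sort_schedules() rank
 * them (using timings or the cache, when enabled), and finally returns one
 * representative, the best ranked, of each "family" (operation + fft type).
 */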
72 std::vector<lingen_substep_schedule> optimize(
73         std::ostream& os,
74         lingen_substep_characteristics const & U,
75         lingen_platform const & P,
76         unsigned int mesh,
77         std::vector<lingen_substep_schedule::fft_type_t> const & allowed_ffts,
78         bool do_timings,
79         lingen_tuning_cache & C,
80         size_t reserved)
81 { /* {{{ */
82     unsigned int nr0 = U.mpi_split0(mesh).block_size_upper_bound();
83     unsigned int nr1 = U.mpi_split1(mesh).block_size_upper_bound();
84     unsigned int nr2 = U.mpi_split2(mesh).block_size_upper_bound();
85     size_t min_my_ram = SIZE_MAX;
86     lingen_substep_schedule S_lean;
87     S_lean.fft_type = lingen_substep_schedule::FFT_NONE; /* placate compiler */
88     std::vector<lingen_substep_schedule> all_schedules;
89 
90     unsigned int nvalid = 0;
91     for(lingen_substep_schedule::fft_type_t fft : allowed_ffts) {
92         if (!U.fft_type_valid(fft)) continue;
93         nvalid++;
94     }
95     if (!nvalid) {
96         throw std::invalid_argument("FFT choice restricted to zero valid FFTs, tuning_thresholds is probably wrong");
97     }
98 
99     for(lingen_substep_schedule::fft_type_t fft : allowed_ffts) {
100         if (!U.fft_type_valid(fft)) continue;
101         for(unsigned int shrink0 : all_splits_of(nr0)) {
102             for(unsigned int shrink2 : all_splits_of(nr2)) {
103                 if (fft == lingen_substep_schedule::FFT_NONE)
104                     if (shrink0 > 1 || shrink2 > 1) continue;
105                 unsigned int nrs0 = U.shrink_split0(mesh, shrink0).block_size_upper_bound();
106                 unsigned int nrs2 = U.shrink_split2(mesh, shrink2).block_size_upper_bound();
107                 /* first the splits with b0 == nrs0 */
108                 {
109                     unsigned int b0 = nrs0;
110                     for(unsigned int b1 : all_splits_of(nr1)) {
111                         for(unsigned int b2 : all_splits_of(nrs2)) {
112                             lingen_substep_schedule S;
113                             S.fft_type = fft;
114                             S.shrink0 = shrink0;
115                             S.shrink2 = shrink2;
116                             S.batch = {{ b0, b1, b2 }};
117                             size_t my_ram = U.get_peak_ram(mesh, S);
118                             if (reserved + my_ram <= P.available_ram) {
119                                 all_schedules.push_back(S);
120                             }
121                             if (my_ram < min_my_ram) {
122                                 min_my_ram = my_ram;
123                                 S_lean = S;
124                             }
125                         }
126                     }
127                 }
128                 /* then the splits with b2 == nrs2 */
129                 {
130                     unsigned int b2 = nrs2;
131                     for(unsigned int b1 : all_splits_of(nr1)) {
132                         for(unsigned int b0 : all_splits_of(nrs0)) {
133                             lingen_substep_schedule S;
134                             S.fft_type = fft;
135                             S.shrink0 = shrink0;
136                             S.shrink2 = shrink2;
137                             S.batch = {{ b0, b1, b2 }};
138                             size_t my_ram = U.get_peak_ram(mesh, S);
139                             if (reserved + my_ram <= P.available_ram) {
140                                 all_schedules.push_back(S);
141                             }
142                             if (my_ram < min_my_ram) {
143                                 min_my_ram = my_ram;
144                                 S_lean = S;
145                             }
146                         }
147                     }
148                 }
149             }
150         }
151     }
152 
153     std::sort(all_schedules.begin(), all_schedules.end());
154 
155     all_schedules.erase(
156             std::unique(all_schedules.begin(), all_schedules.end()),
157             all_schedules.end());
158 
159     if (all_schedules.empty()) {
160         char buf[20];
161         std::ostringstream os;
162         os << "Fatal error:"
163             << " it is not possible to complete this calculation with only "
164             << size_disp(P.available_ram, buf)
165             << " of memory for intermediate transforms.\n";
166         os << "Based on the cost for input length "
167             << U.input_length
168             << ", we need at the very least "
169             << size_disp(U.get_peak_ram(mesh, S_lean), buf);
170         os  << ", plus "
171             << size_disp(reserved, buf)
172             << " for reserved memory at upper levels";
173         os << " [with shrink="
174             << fmt::format(FMT_STRING("({},{})"),
175                     S_lean.shrink0, S_lean.shrink2)
176             << ", batch="
177             << fmt::format(FMT_STRING("({},{},{})"),
178                     S_lean.batch[0], S_lean.batch[1], S_lean.batch[2])
179             << "]\n";
180         fputs(os.str().c_str(), stderr);
181         throw std::overflow_error(os.str());
182     }
183 
184     U.sort_schedules(os, all_schedules, P, mesh, C, do_timings);
185 
186     std::map<std::string, lingen_substep_schedule> families;
187     std::vector<lingen_substep_schedule> res;
188 
189     for(auto const & S : all_schedules) {
190         std::string f = fmt::sprintf("%s%s;%s",
191                 mesh > 1 ? "MPI-" : "",
192                 op_mul_or_mp_base::op_name(U.op_type),
193                 S.fft_name());
194         if (families.find(f) == families.end()) {
195             families[f] = S;
196             res.push_back(S);
197         }
198     }
199 
200     return res;
201 }
202 /* }}} */
203 
204 struct lingen_tuner {
205     typedef lingen_platform pc_t;
206     typedef lingen_substep_schedule sc_t;
207 
208     abdst_field ab; /* imported from the dims struct */
209     cxx_mpz p;
210     unsigned int m,n;
211     size_t L;
212     lingen_platform P;
213     lingen_tuning_cache C;
214     gmp_randstate_t rstate;
215     const char * timing_cache_filename = NULL;
216     const char * schedule_filename = NULL;
217 
218     /* stop measuring the time taken by the basecase when it is
219      * more than this number times the time taken by the other
220      * alternatives
221      */
222     double basecase_keep_until = 1.8;
223 
224     struct tuning_thresholds_t : public std::map<std::string, unsigned int> {/*{{{*/
225         typedef std::map<std::string, unsigned int> super;
226         static constexpr const char * recursive = "recursive";
227         static constexpr const char * collective = "collective";
228         static constexpr const char * ternary = "ternary";
229         static constexpr const char * cantor = "cantor";
230         static constexpr const char * flint = "flint";
231         static constexpr const char * notiming = "notiming";
232         static const std::vector<const char *> thresholds_verbs;
233         static const std::vector<std::pair<lingen_substep_schedule::fft_type_t, const char *>> code_to_key;
234 
235         bool has(std::string const & key) const {
236             return find(key) != end();
237         }
238         private:
239         unsigned int& getref(std::string const & key) {
240             return ((super&)(*this))[key];
241         }
242         public:
243         unsigned int operator[](std::string const & key) const {
244             return has(key) ? at(key) : UINT_MAX;
245         }
246         tuning_thresholds_t(cxx_param_list & pl, std::ostream& os, lingen_platform const & P) {/*{{{*/
247             const char * tmp = param_list_lookup_string(pl, "tuning_thresholds");
248             if (!tmp) return;
249             std::string tlist = tmp;
250             for(size_t pos = 0 ; pos != std::string::npos ; ) {
251                 size_t next = tlist.find(',', pos);
252                 std::string tok;
253                 if (next == std::string::npos) {
254                     tok = tlist.substr(pos);
255                     pos = next;
256                 } else {
257                     tok = tlist.substr(pos, next - pos);
258                     pos = next + 1;
259                 }
260                 auto error = [&tok](std::string const& reason) {
261                     std::string base = fmt::format(
262                             "tuning_thresholds is bad:"
263                             " pair \"{}\" ", tok);
264                     throw std::invalid_argument(base + reason);
265                 };
266 
267                 size_t colon = tok.find(':');
268                 if (colon == std::string::npos)
269                     error("has no colon");
270 
271                 std::string algorithm = tok.substr(0, colon);
272                 if (std::find(thresholds_verbs.begin(), thresholds_verbs.end(), algorithm) == thresholds_verbs.end()) {
273                     std::ostringstream os;
274                     for(auto const & x : thresholds_verbs)
275                         os << " " << x;
276                     error(fmt::format(
277                                 "uses unrecognized key \"{}\""
278                                 " (recognized keys:{})",
279                                 algorithm, os.str()));
280                 }
281 
282                 unsigned int & dst(getref(algorithm));
283                 if (!(std::istringstream(tok.substr(colon + 1)) >> dst))
284                     error("has no understandable integer threshold");
285             }
286             if (has(collective) && P.r == 1) {
287                 const char * what = "interpreted as \"recursive\"";
288                 if (!has(recursive)) {
289                     getref(recursive) = getref(collective);
290                 } else if (getref(collective) < getref(recursive)) {
291                     getref(recursive) = getref(collective);
292                 } else {
293                     what = "ignored";
294                 }
295                 erase(find(collective));
296                 os << "# Note: the tuning threshold \"collective\" is "
297                     << what << " here, since we have a non-MPI run\n";
298             }
299         }/*}}}*/
300     };/*}}}*/
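    /* Hypothetical usage example (not from the original source): passing
     *    tuning_thresholds=recursive:10,flint:150,notiming:100000
     * on the command line yields the map
     *    { "flint": 150, "notiming": 100000, "recursive": 10 },
     * and any key that is absent from the list (e.g. "collective") reads
     * as UINT_MAX through operator[].
     */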
301 
302     tuning_thresholds_t tuning_thresholds;
303 
304     /* length(E), length(E_left), length(E_right), number of occurrences */
305     typedef std::tuple<size_t, size_t, size_t, unsigned int> weighted_call_t;
306     std::vector<unsigned int> mesh_all;
307     std::map<unsigned int, std::string> strat_name;
308 
309 
310     struct output_info {/*{{{*/
311         int quiet = 0;
312         const char * tuning_log_filename = NULL;
313         static void declare_usage(cxx_param_list & pl) {/*{{{*/
314             param_list_decl_usage(pl, "tuning_log_filename",
315                     "Output tuning log to this file\n");
316             param_list_decl_usage(pl, "tuning_quiet",
317                     "Silence tuning log\n");
318         }/*}}}*/
319         static void lookup_parameters(cxx_param_list & pl) {/*{{{*/
320             lingen_platform::lookup_parameters(pl);
321             param_list_lookup_string(pl, "tuning_quiet");
322             param_list_lookup_string(pl, "tuning_log_filename");
323         }/*}}}*/
324         output_info(cxx_param_list & pl) {
325             tuning_log_filename = param_list_lookup_string(pl, "tuning_log_filename");
326             param_list_parse_int(pl, "tuning_quiet", &quiet);
327         }
328     };/*}}}*/
329     static void declare_usage(cxx_param_list & pl) {/*{{{*/
330         lingen_platform::declare_usage(pl);
331         output_info::declare_usage(pl);
332         param_list_decl_usage(pl, "tuning_schedule_filename",
333                 "Save (and re-load if it exists) tuning schedule from this file");
334         param_list_decl_usage(pl, "tuning_timing_cache_filename",
335                 "Save (and re-load) timings for individual transforms in this file\n");
336         param_list_decl_usage(pl, "basecase-keep-until",
337                 "When tuning, stop measuring basecase timing when it exceeds the time of the recursive algorithm (counting its leaf calls) by this factor\n");
338         param_list_decl_usage(pl, "tuning_thresholds",
339                 "comma-separated list of thresholds, each given in the form <algorithm>:<threshold>. Recognized values for <algorithm> are a subset of recursive,gfp_plain,flint,cantor,gf2x_plain. Thresholds are integers corresponding to the input size of E\n");
340     }/*}}}*/
341     static void lookup_parameters(cxx_param_list & pl) {/*{{{*/
342         lingen_platform::lookup_parameters(pl);
343         output_info::lookup_parameters(pl);
344         param_list_lookup_string(pl, "tuning_schedule_filename");
345         param_list_lookup_string(pl, "tuning_timing_cache_filename");
346         param_list_lookup_string(pl, "basecase-keep-until");
347         param_list_lookup_string(pl, "tuning_thresholds");
348     }/*}}}*/
349 
350     lingen_tuner(std::ostream& os, bw_dimensions & d, size_t L, MPI_Comm comm, cxx_param_list & pl) :/*{{{*/
351         ab(d.ab),
352         m(d.m), n(d.n), L(L), P(comm, pl),
353         tuning_thresholds(pl, os, P)
354     {
355 #ifdef SELECT_MPFQ_LAYER_u64k1
356         mpz_set_ui(p, 2);
357 #else
358         mpz_set (p, abfield_characteristic_srcptr(ab));
359 #endif
360         gmp_randinit_default(rstate);
361         gmp_randseed_ui(rstate, 1);
362 
363         param_list_parse_double(pl, "basecase-keep-until", &basecase_keep_until);
364 
365         schedule_filename = param_list_lookup_string(pl, "tuning_schedule_filename");
366         /* only the leader will do the tuning, so only the leader cares
367          * about loading/saving it...
368          */
369         timing_cache_filename = param_list_lookup_string(pl, "tuning_timing_cache_filename");
370         int rank;
371         MPI_Comm_rank(P.comm, &rank);
372         if (rank == 0)
373             C.load(timing_cache_filename);
374     }/*}}}*/
375     // coverity[exn_spec_violation]
376     ~lingen_tuner() {/*{{{*/
377         int rank;
378         MPI_Comm_rank(P.comm, &rank);
379         if (rank == 0)
380             C.save(timing_cache_filename);
381         gmp_randclear(rstate);
382     }/*}}}*/
383     std::tuple<size_t, double> mpi_threshold_comm_and_time() {/*{{{*/
384         /* This is the time taken by gather() and scatter() right at the
385          * threshold point. This total time is independent of the
386          * threshold itself, in fact.
387          */
388         size_t data0 = m*(m+n)*(P.r*P.r-1);
389         data0 = data0 * L;
390 #ifdef SELECT_MPFQ_LAYER_u64k1
391         data0 = iceildiv(data0, ULONG_BITS) * sizeof(unsigned long);
392 #else
393         data0 = abvec_elt_stride(ab, data0);
394 #endif
395 
396         // We must **NOT** divide by r*r, because the problem is
397         // precisely caused by the fact that gather() and scatter() all
398         // imply one contention point which is the central node.
399         // data0 = data0 / (r*r);
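        /* Rough illustration with made-up numbers (not a measured figure):
         * over GF(2) with m=n=64, r=2 and L=10^6, data0 is
         * 64*128*(4-1)*10^6 bits, i.e. about 3.1e9 bytes once packed into
         * 64-bit words, so the returned pair is roughly (6.1e9 bytes,
         * 6.1e9/mpi_xput seconds) for the combined gather()+scatter()
         * traffic through the central node.
         */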
400         std::tuple<size_t, double> vv { 2 * data0, 2 * data0 / P.mpi_xput};
401         return vv;
402     }/*}}}*/
403     double compute_and_report_basecase(std::ostream& os, size_t length, bool do_timings) { /*{{{*/
404         double tt;
405 
406         lingen_tuning_cache::basecase_key K { mpz_sizeinbase(p, 2), m, n, length, P.openmp_threads };
407 
408         os << fmt::format(FMT_STRING("# basecase (@{}): "), length);
409 
410         if (!do_timings) {
411             C[K] = { 0 };
412             os << "not timed\n";
413             return C[K];
414         }
415 
416         if (!C.has(K)) {
417             os << std::flush;
418             tt = wct_seconds();
419             test_basecase(ab, m, n, length, rstate);
420             tt = wct_seconds() - tt;
421             C[K] = { tt };
422             os << fmt::sprintf("%.2f\n", tt);
423         } else {
424             os << fmt::sprintf("%.2f [from cache]\n", C[K]);
425         }
426         return C[K];
427     }/*}}}*/
428     std::vector<weighted_call_t> calls_and_weights_at_depth(int i) {/*{{{*/
429         /*
430          * Let L = (Q << (i+1)) + (u << i) + v, with u={0,1} and
431          * 0<=v<(1<<i). We have L % (1 << i) = v. Note that u=(L>>i)%2.
432          * Let U = u << i.
433          *
434          * input length at depth i, a.k.a. length(E), is either
435          * L>>i=floor(L/2^i)=2Q+u or one above.  The number of calls is:
436          *
437          * v times:
438          *         length(E) = 2Q + u + 1
439          *         length(E_left) = Q + 1
440          *         length(E_right) = Q + u
441          * otherwise:
442          *         length(E) = 2Q + u
443          *         length(E_left) = Q + u
444          *         length(E_right) = Q
445          * And in all cases:
446          *         length(pi_left)  = ceiling(alpha * length(E_left))
447          *         length(pi_right) = ceiling(alpha * length(E_right))
448          *
449          * The length of the chopped copy of E is:
450          *         length(E_right) + length(pi_right) - 1
451          *
452          * The length of the product pi_left*pi_right is
453          *      ceiling(alpha * length(E_left)) + ceiling(alpha * length(E_right)) - 1
454          * This should also be less than or equal to
455          *      ceiling(alpha * length(E))
456          * So that in cases where the former bound is larger than the
457          * latter, we're almost sure that the top coefficients multiply
458          * to a zero matrix.
459          *
460          */
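        /* Numerical check (added illustration): with L=1000 and i=4 the
         * formulas below give Q=31, u=0, v=8, hence v=8 calls with
         * length(E)=2Q+u+1=63 and (1<<4)-v=8 calls with length(E)=2Q+u=62;
         * indeed 8*63+8*62=1000. With i=3 they give Q=62, u=1, v=0, i.e.
         * all 8 calls have length(E)=125, and 8*125=1000.
         */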
461 
462         size_t Q = L >> (i+1);
463         size_t u = (L >> i) & 1;
464         size_t v = L % (1 << i);
465 
466         /* at i == floor level, we have (1<<(i-1)) < L <= (1<<i), Q = 0, and:
467          *
468          * if v, then u=0 and v=L. we have v calls with size 1 and (1<<i)-v
469          * with size 0 on the first read, but the calls with size 0 don't
470          * really exist, so that in fact we're seeing leftovers of calls
471          * with size 1 at the level above. So we actually have _at depth
472          * i proper_ 2*v-(1<<i) calls with size 1, and that's all.
473          *
474          * if v==0, then u=1. we have all calls with size 1, code below
475          * is fine
476          */
477 
478         if (v && !((L-1)>>i)) {
479             ASSERT_ALWAYS(Q == 0 && u == 0);
480             weighted_call_t w1 { 2*Q + u + 1, Q + 1, Q + u, 2*v - (1 << i) };
481             std::vector<weighted_call_t> res {{ w1 }};
482             return res;
483         } else if (v) {
484             weighted_call_t w0 { 2*Q + u,     Q + u, Q,     (1 << i) - v };
485             weighted_call_t w1 { 2*Q + u + 1, Q + 1, Q + u, v };
486             std::vector<weighted_call_t> res {{ w0, w1 }};
487             return res;
488         } else {
489             weighted_call_t w0 { 2*Q + u,     Q + u, Q,     (1 << i) - v };
490             std::vector<weighted_call_t> res {{ w0 }};
491             return res;
492         }
493     }/*}}}*/
494     lingen_substep_characteristics substep(weighted_call_t const & cw, op_mul_or_mp_base::op_type_t op) { /* {{{ */
495         size_t length_E;
496         size_t length_E_left;
497         size_t length_E_right;
498         unsigned int weight;
499 
500         std::tie(length_E, length_E_left, length_E_right, weight) = cw;
501 
502         ASSERT_ALWAYS(length_E >= 2);
503         ASSERT_ALWAYS(weight);
504 
505         size_t asize, bsize, csize;
506 
507         if (op == op_mul_or_mp_base::OP_MP) {
508             csize = length_E_right;
509             /* pi is the identity matrix for zero coefficients, but the
510              * identity matrix already has length 1.
511              */
512             bsize = 1 + iceildiv(m * length_E_left, m+n);
513             asize = csize + bsize - 1;
514         } else {
515             asize = 1 + iceildiv(m * length_E_left, m+n);
516             bsize = 1 + iceildiv(m * length_E_right, m+n);
517             csize = asize + bsize - 1;
518         }
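        /* Example of the size bookkeeping above (added illustration,
         * assuming m=n so that m/(m+n)=1/2): for length_E=100 split as
         * 50+50, MP uses csize=50, bsize=1+ceil(50/2)=26, asize=75,
         * whereas MUL uses asize=bsize=26 and csize=51.
         */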
519 
520         ASSERT_ALWAYS(asize);
521         ASSERT_ALWAYS(bsize);
522 
523         return lingen_substep_characteristics(
524                 ab, rstate, length_E,
525                 op,
526                 op == op_mul_or_mp_base::OP_MP ? m : (m+n),
527                 m+n, m+n,
528                 asize, bsize, csize);
529     } /* }}} */
530     bool recursion_makes_sense(size_t L) const {/*{{{*/
531         return L >= 2;
532     }/*}}}*/
533     struct tuner_persistent_data {/*{{{*/
534         typedef std::map<size_t, std::pair<unsigned int, double>, lingen_tuning_cache::coarse_compare> level_strategy_map;
535         lingen_hints hints;
536         lingen_hints const & stored_hints;
537         level_strategy_map best;
538         /* The "minimum mesh" field is only used by
539          * tune_local_at_depth. This determines the "mesh" of calls
540          * that will be tried. Options are:
541          *
542          * mesh=0 (if 0>=minimum_mesh) : basecase
543          * mesh=1 (if 1>=minimum_mesh) : recursive, single-node
544          * other                       : recursive, multi-node
545          *
546          * Currently, because scatter_mat and gather_mat are limited to
547          * 1-n and n-1 conversions, we transition from single- to
548          * multi-node all in one go.
549          */
550         unsigned int minimum_mesh = 0;
551         double last_save = 0;
552         size_t peak = 0;
553         int ipeak = 0;
554         size_t upper_threshold = 0;
555         bool impose_hints;
556         tuner_persistent_data(lingen_hints const & stored_hints) : stored_hints(stored_hints) {
557             last_save = wct_seconds();
558             impose_hints = !stored_hints.empty();
559         }
560     };/*}}}*/
561     lingen_call_companion::mul_or_mp_times tune_local_at_depth_mp_or_mul(
562             std::ostream& os,
563             tuner_persistent_data & persist,
564             weighted_call_t cw,
565             int depth,
566             unsigned int mesh,
567             std::vector<lingen_substep_schedule::fft_type_t> const & allowed_ffts,
568             bool do_timings,
569             op_mul_or_mp_base::op_type_t op_type)/*{{{*/
570     {
571         lingen_call_companion::mul_or_mp_times U { op_type };
572         lingen_hints const & stored_hints(persist.stored_hints);
573         double & last_save(persist.last_save);
574         size_t & peak(persist.peak);
575         int & ipeak(persist.ipeak);
576 
577         /* For input length L, the reserved
578          * storage at depth i is
579          *   RMP'(i)  = [m/r][(m+n)/r][(1+\alpha)(L-2\ell_i)] + [(m+n)/r]^2*[2\alpha\ell_i]
580          *   RMUL'(i) = [m/r][(m+n)/r][(1+\alpha)(L-2\ell_i)] + [(m+n)/r]^2*[4\alpha\ell_i]
581          * with the notations \alpha=m/(m+n), \ell_i=L/2^(i+1), and
582          * [] denotes ceiling.
583          * The details of the computation are in the comments in
584          * lingen.cpp
585          *
586          * Note that here, we're using P.r and not the mesh parameter,
587          * because what matters is what we're reserving at the _upper_
588          * levels.  And most importantly, chances are that the largest
589          * share of the reserved memory comes from the topmost levels
590          * anyway, so we surmise that P.r is appropriate there.
591          */
592         size_t base_E  = iceildiv(m,P.r)*iceildiv(m+n,P.r)*mpz_size(p)*sizeof(mp_limb_t);
593         size_t base_pi = iceildiv(m+n,P.r)*iceildiv(m+n,P.r)*mpz_size(p)*sizeof(mp_limb_t);
594         constexpr const unsigned int simd = matpoly::over_gf2 ? ULONG_BITS : 1;
595         size_t reserved_base = base_E * iceildiv(L - (L >> depth), simd);
596         size_t reserved_mp  = base_pi * iceildiv(iceildiv(m * iceildiv(L, 1<<depth), m+n), simd);
597         size_t reserved_mul = base_pi * iceildiv(iceildiv(m * iceildiv(2*L, 1<<depth), m+n), simd);
598         reserved_mp += reserved_base;
599         reserved_mul += reserved_base;
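        /* Toy illustration of the reserved sizes just computed (made-up
         * parameters, prime-field case): with m=n=2, P.r=1, a one-limb
         * prime (mpz_size(p)=1, 8-byte limbs), L=1024 and depth=1, we get
         * base_E=2*4*8=64, base_pi=4*4*8=128,
         * reserved_base=64*(1024-512)=32768, and
         * reserved_mp=128*iceildiv(2*512,4)+32768=65536,
         * reserved_mul=128*iceildiv(2*1024,4)+32768=98304 bytes.
         */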
600 
601         size_t reserved = op_type == op_mul_or_mp_base::OP_MP ? reserved_mp : reserved_mul;
602 
603         os << fmt::format(FMT_STRING("# {} reserved storage = {}\n"),
604                 op_mul_or_mp_base::op_name(op_type),
605                 size_disp(reserved));
606 
607         size_t L, Lleft, Lright;
608         unsigned int weight;
609         std::tie(L, Lleft, Lright, weight) = cw;
610         lingen_call_companion::key K { depth, L };
611         ASSERT_ALWAYS(weight);
612 
613         ASSERT_ALWAYS (recursion_makes_sense(L));
614         auto step = substep(cw, op_type);
615         bool print_here = true;
616 #if 0
617         if (print_here)
618             step.report_size_stats_human(os);
619 #endif
620 
621         std::vector<lingen_substep_schedule> SS;
622 
623         if (stored_hints.find(K) != stored_hints.end()) {
624             lingen_substep_schedule S = stored_hints.at(K)[op_type].S;
625             if (step.is_valid_schedule(P, mesh, S)) {
626                 os << "# Using imposed schedule " << S.serialize() << "\n";
627                 SS.push_back(S);
628             } else {
629                 os << "# IGNORING invalid schedule " << S.serialize() << "\n";
630             }
631         }
632 
633         if (SS.empty()) {
634             /* get the schedule by trying all possibilities for the
635              * shrink and batch options. */
636             SS = optimize(os, step, P, mesh,
637                     allowed_ffts, do_timings, C, reserved);
638         }
639 
640         for(auto const & S : SS)
641             step.get_and_report_call_time(os, P, mesh, S, C, do_timings);
642 
643         lingen_substep_schedule S = SS.front();
644 
645         if (print_here)
646             step.report_op_winner(os, mesh, S);
647 
648         if (do_timings) {
649             if (wct_seconds() > last_save + 10) {
650                 int rank;
651                 MPI_Comm_rank(P.comm, &rank);
652                 if (rank == 0)
653                     C.save(timing_cache_filename);
654                 last_save = wct_seconds();
655             }
656         }
657 
658         U = step.get_companion(os, P, mesh, S, C, reserved, do_timings);
659         os << "#\n";
660 
661         size_t mm = U.ram_total();
662         if (mm > peak) { ipeak = depth; peak = mm; }
663 
664         return U;
665     }/*}}}*/
666     struct configurations_to_test {/*{{{*/
667         /* This is the list of dimensions or the MPI mesh that
668          * we're going to try. For the moment, it's limited to
669          * either {1*1, r*r}, or only one of them. In principle,
670          * the framework could be extended to support further
671          * mesh sizes, but that requires support in scatter_mat
672          * and gather_mat to dispatch from m nodes to n nodes.
673          */
674         std::vector<unsigned int> mesh_sizes;
675         std::vector<lingen_substep_schedule::fft_type_t> ffts;
676         /* The various choices for the "shrink" and "batch" options are
677          * not listed here, but are rather computed on the fly in the
678          * optimize() function. This seems more convenient there, as it's
679          * easier to do pruning based on the mesh sizes and such.
680          */
681         bool do_timings = true;
682         configurations_to_test() {
683             ffts = { lingen_substep_schedule::FFT_NONE };
684 #ifndef SELECT_MPFQ_LAYER_u64k1
685             ffts.push_back(lingen_substep_schedule::FFT_FLINT);
686 #else
687             ffts.push_back(lingen_substep_schedule::FFT_TERNARY);
688             ffts.push_back(lingen_substep_schedule::FFT_CANTOR);
689 #endif
690         }
691         bool from_stored_hints(std::ostream& os, lingen_tuner & tuner, tuner_persistent_data & persist, lingen_call_companion::key const & K) {/*{{{*/
692             lingen_hints const & stored_hints(persist.stored_hints);
693             lingen_platform const & P(tuner.P);
694 
695             if (stored_hints.find(K) == stored_hints.end())
696                 return false;
697             os << ("# Re-using stored schedule\n");
698             /* This is _not_ a complete set ! We still must compute the
699              * timings, the RAM, and so on.
700              *
701              * This is done via tune_local_at_depth_mp_or_mul, which
702              * reads stored_hints a second time. In particular, it
703              * fetches the info of the previously selected fft types for
704              * both operations, and eventually sets U.mp and U.mul. This
705              * implies that we do not need to restrict the list of
706              * allowed ffts here.
707              */
708             unsigned int mesh = stored_hints.at(K).mesh;
709 
710             if (mesh == UINT_MAX) mesh = P.r;
711             if (mesh > 1 && mesh != P.r) {
712                 throw std::runtime_error(
713                         fmt::sprintf(
714                             "stored schedule is invalid,"
715                             " we cannot (yet) run on a %d*%d grid"
716                             " a schedule meant for a %d*%d grid\n",
717                             P.r, P.r,
718                             mesh, mesh));
719             }
720 
721             mesh_sizes = { mesh };
722             os << fmt::format(FMT_STRING("# Forcing {} at depth {} L={}\n"),
723                     tuner.strat_name[mesh], K.depth, K.L);
724 
725             return true;
726         }/* }}} */
727         std::string explain(tuning_thresholds_t const & T, std::string const & k) {
728             if (T.has(k)) {
729                 return fmt::format(FMT_STRING(" tuning_threshold[{}]={}"), k, T[k]);
730             } else {
731                 return fmt::format(FMT_STRING(" tuning_threshold[{}]=undef"), k);
732             }
733         }
734         typedef tuning_thresholds_t T_t;
735         bool from_thresholds_mesh_level(std::ostream& os, lingen_tuner & tuner, tuner_persistent_data & persist, lingen_call_companion::key const & K) {/*{{{*/
736             T_t const & T(tuner.tuning_thresholds);
737             lingen_platform const & P(tuner.P);
738             unsigned int L = K.L;
739             /* first decision: do we force recursion. There are various
740              * reasons to do so, since we have various thresholds that
741              * implicitly enable recursion */
742             std::vector<std::pair<unsigned int, const char *>> all_rec;
743             for(auto k : T_t::thresholds_verbs) {
744                 if (k == T_t::notiming) continue;
745                 if (T.has(k))
746                     all_rec.push_back(std::make_pair(T[k], k));
747             }
748             /* If no threshold talks about being recursive, then it's
749              * left to our decision. We keep all possible mesh sizes a
750              * priori. However, we must pay attention to the fact that we
751              * may know already that some mesh sizes should be discarded
752              * (this can only happen if at an earlier level, the current mesh
753              * size X was beaten by a larger one Y, which means that the
754              * minimum mesh size was at most X for the previous size, and
755              * is now at most Y, not more).
756              */
757             if (all_rec.empty()) {
758                 mesh_sizes.clear();
759                 if (persist.minimum_mesh <= 0) mesh_sizes.push_back(0);
760                 if (persist.minimum_mesh <= 1) mesh_sizes.push_back(1);
761                 if (P.r != 1 && persist.minimum_mesh <= P.r)
762                     mesh_sizes.push_back(P.r);
763                 return true;
764             }
765             std::sort(all_rec.begin(), all_rec.end());
766 
767             mesh_sizes.clear();
768             unsigned int next_mesh = 1;
769             std::string explainer;
770             for(auto const & tk : all_rec) {
771                 if (L < tk.first)
772                     break;
773                 if (tk.second == T_t::collective)
774                     next_mesh = P.r;
775 
776                 mesh_sizes = { next_mesh };
777                 explainer = tk.second;
778             }
779             std::ostringstream explanation;
780             bool done = false;
781             if (mesh_sizes.empty()) {
782                 /* Given that we have identified at least one threshold
783                  * that says "go recursive" and that we decided that we
784                  * _don't_ go recursive, the conclusion is here that we
785                  * want to do basecase only.
786                  */
787                 mesh_sizes = { 0 };
788                 explanation << explain(T, all_rec.front().second);
789                 done = true;
790             } else {
791                 explanation << explain(T, explainer);
792                 constexpr const char * ck = T_t::collective;
793                 if (!T.has(ck) && P.r > 1) {
794                     mesh_sizes.push_back(P.r);
795                     explanation << explain(T, ck);
796                 }
797             }
798 
799             os << "# Testing only";
800             for(auto mesh : mesh_sizes)
801                 os << " " << tuner.strat_name[mesh];
802             os << fmt::format(FMT_STRING(" at depth {} L={} since"), K.depth, K.L)
803                 << explanation.str()
804                 << "\n";
805             return done;
806         }/*}}}*/
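        /* Hedged walk-through of from_thresholds_mesh_level above, with
         * hypothetical thresholds: given
         * tuning_thresholds=recursive:20,collective:100 on an r=2 grid,
         * L=10 selects only the basecase ({0}), L=50 selects only
         * single-node recursion ({1}), and L=200 selects only the
         * collective 2*2 mesh ({2}); in the last two cases the function
         * returns false, so that the fft choice is examined next.
         */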
807         bool from_thresholds_fft_level(std::ostream& os, lingen_tuner & tuner, lingen_call_companion::key const & K) {/*{{{*/
808             T_t const & T(tuner.tuning_thresholds);
809             /* second decision: should we restrict the set of ffts ? */
810             /* In the following, we rely on the fact that the asymptotic
811              * ordering is known in advance. If we add more cases, this
812              * assumption will likely not hold anymore
813              *
814              * Note that by design, if we arrive here, the function
815              * from_thresholds_mesh_level did not settle the decision,
816              * meaning that L is larger than at least one of the recursive
817              * thresholds. Therefore, checking against "recursive" or
818              * "collective" doesn't make sense.
819              */
820             unsigned int L = K.L;
821             ffts.clear();
822             /* Okay, it's a bit of spaghetti, but the small and concise
823              * loop would not be easier to understand.
824              */
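            /* Summary of the intended decision table below (added
             * commentary): in the binary case, with thresholds
             * ternary:t1 <= cantor:t2 both given, L<t1 keeps FFT_NONE,
             * t1<=L<t2 keeps FFT_TERNARY, and L>=t2 keeps FFT_CANTOR; if
             * only one of the two thresholds is given, the side it does
             * not decide keeps more than one candidate in the running,
             * and with no fft threshold at all every fft type stays in
             * the race. The prime-field branch applies the same idea to
             * the single "flint" threshold.
             */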
825             std::ostringstream explanation;
826 #ifdef SELECT_MPFQ_LAYER_u64k1
827             bool hc = T.has(T_t::cantor);
828             bool ht = T.has(T_t::ternary);
829             unsigned int tc = T[T_t::cantor];
830             unsigned int tt = T[T_t::ternary];
831             if (hc && ht) {
832                 if (tc >= tt) {
833                     if (L >= tc) {
834                         ffts.push_back(lingen_substep_schedule::FFT_CANTOR);
835                         explanation << explain(T, T_t::cantor);
836                     } else if (L >= tt) {
837                         ffts.push_back(lingen_substep_schedule::FFT_TERNARY);
838                         explanation << explain(T, T_t::ternary);
839                         explanation << explain(T, T_t::cantor);
840                     } else {
841                         ffts.push_back(lingen_substep_schedule::FFT_NONE);
842                         explanation << explain(T, T_t::recursive);
843                         explanation << explain(T, T_t::ternary);
844                     }
845                 } else if (tt >= tc) {
846                     /* quite unlikely except for ridiculously small
847                      * matrices perhaps */
848                     if (L >= tt) {
849                         ffts.push_back(lingen_substep_schedule::FFT_TERNARY);
850                         explanation << explain(T, T_t::ternary);
851                     } else if (L >= tc) {
852                         ffts.push_back(lingen_substep_schedule::FFT_CANTOR);
853                         explanation << explain(T, T_t::cantor);
854                         explanation << explain(T, T_t::ternary);
855                     } else {
856                         ffts.push_back(lingen_substep_schedule::FFT_NONE);
857                         explanation << explain(T, T_t::recursive);
858                         explanation << explain(T, T_t::cantor);
859                     }
860                 }
861             } else if (hc) {
862                 if (L >= tc) {
863                     ffts.push_back(lingen_substep_schedule::FFT_CANTOR);
864                     explanation << explain(T, T_t::cantor);
865                 } else {
866                     ffts.push_back(lingen_substep_schedule::FFT_NONE);
867                     ffts.push_back(lingen_substep_schedule::FFT_TERNARY);
868                     explanation << explain(T, T_t::recursive);
869                     explanation << explain(T, T_t::cantor);
870                 }
871             } else if (ht) {
872                 if (L >= tt) {
873                     ffts.push_back(lingen_substep_schedule::FFT_TERNARY);
874                     ffts.push_back(lingen_substep_schedule::FFT_CANTOR);
875                     explanation << explain(T, T_t::ternary);
876                 } else {
877                     ffts.push_back(lingen_substep_schedule::FFT_NONE);
878                     explanation << explain(T, T_t::recursive);
879                     explanation << explain(T, T_t::ternary);
880                 }
881             } else {
882                 ffts.push_back(lingen_substep_schedule::FFT_NONE);
883                 ffts.push_back(lingen_substep_schedule::FFT_TERNARY);
884                 ffts.push_back(lingen_substep_schedule::FFT_CANTOR);
885                 explanation << explain(T, T_t::recursive);
886             }
887 #else
888             bool hf = T.has(T_t::flint);
889             unsigned int tf = T[T_t::flint];
890             if (hf) {
891                 if (L >= tf) {
892                     ffts.push_back(lingen_substep_schedule::FFT_FLINT);
893                     explanation << explain(T, T_t::flint);
894                 } else {
895                     ffts.push_back(lingen_substep_schedule::FFT_NONE);
896                     explanation << explain(T, T_t::recursive);
897                     explanation << explain(T, T_t::flint);
898                 }
899             } else {
900                 ffts.push_back(lingen_substep_schedule::FFT_NONE);
901                 ffts.push_back(lingen_substep_schedule::FFT_FLINT);
902                 explanation << explain(T, T_t::recursive);
903             }
904 #endif
905             os << "# Testing only";
906             for(auto fft : ffts)
907                 os << " " << lingen_substep_schedule::fft_name(fft);
908             os << fmt::format(FMT_STRING(" at depth {} L={} since"), K.depth, K.L)
909                 << explanation.str()
910                 << "\n";
911             return true;
912         }/*}}}*/
913     };/*}}}*/
914 
915     /* Determine which configurations will undergo testing. This covers
916      * the single/collective, basecase/recursive choices, as well as the
917      * fft choice. This does NOT cover the shrink and batch setting,
918      * which are covered in tune_local_at_depth_mp_or_mul() and optimize()
919      */
920     configurations_to_test find_configurations_to_test(std::ostream & os,  tuner_persistent_data & persist, lingen_call_companion::key const & K) {/*{{{*/
921         unsigned int L = K.L;
922         configurations_to_test C;
923         bool impose_hints(persist.impose_hints);
924 
925         if (C.from_stored_hints(os, *this, persist, K))
926             return C;
927 
928         if (L >= tuning_thresholds[tuning_thresholds_t::notiming])
929             C.do_timings = false;
930 
931         if (!recursion_makes_sense(L)) {
932             C.mesh_sizes = { 0 };
933             C.ffts.clear();
934             return C;
935         }
936 
937         if (impose_hints) {
938             os << "# No stored schedule found in provided file,"
939                 " computing new one\n";
940         }
941         /* Try with the tuning_thresholds */
942 
943         if (C.from_thresholds_mesh_level(os, *this, persist, K)) {
944             /* then the decision was trivial because tuning_thresholds
945              * told us nothing. No need to bother with fft types either.
946              */
947             return C;
948         }
949 
950         C.from_thresholds_fft_level(os, *this, K);
951         return C;
952     }/*}}}*/
953 
954     bool tune_local_at_depth(std::ostream& os, tuner_persistent_data & persist, int depth)/*{{{*/
955     {
956         tuner_persistent_data::level_strategy_map & best(persist.best);
957         unsigned int & minimum_mesh(persist.minimum_mesh);
958         lingen_hints & hints(persist.hints);
959 
960         auto cws = calls_and_weights_at_depth(depth);
961 
962         // output first goes to a temp string until we're sure that we're
963         // doing timings.
964 
965         std::ostringstream os_pre;
966         bool timed_something = false;
967 
968         os_pre << fmt::format(FMT_STRING("####################### Measuring time at depth {} #######################\n"), depth);
969 
970         ASSERT_ALWAYS(cws.size() <= 2);
971 
972         /* Given a mesh size (or 0 to mean "basecase"), this returns a
973          * pair (number of calls, sum of the time over all calls)
974          */
975         std::map<unsigned int, std::pair<unsigned int, double>> mesh_tt_weighted;
976 
977         lingen_call_companion U_typical;
978 
979         for(size_t idx = 0 ; idx < cws.size() ; idx++) {
980             auto const & cw(cws[idx]);
981             size_t L, Lleft, Lright;
982             unsigned int weight;
983             std::tie(L, Lleft, Lright, weight) = cw;
984             if (!L) continue;
985             double ratio = weight / (double) (1U << depth);
986             os_pre << fmt::sprintf("# input size %zu, %u times [%.1f%%]\n",
987                     L, weight, 100*ratio);
988         }
989         os_pre << "#\n";
990 
991         bool do_all_timings = true;
992 
993         unsigned int ncalls_at_depth = 0;
994 
995         /* This goes to the "preliminary" buffer until we're sure that
996          * there's timing to print!
997          */
998         std::ostream * p_talk = &os_pre;
999 
1000         for(size_t idx = 0 ; idx < cws.size() ; idx++) {
1001             auto const & cw(cws[idx]);
1002             size_t L, Lleft, Lright;
1003             unsigned int weight;
1004             std::tie(L, Lleft, Lright, weight) = cw;
1005 
1006             /* the weight is the number of calls that must be made
1007              * with this input length. The sum of weights at this
1008              * depth must equal the total input length, whatever the
1009              * level */
1010             ASSERT_ALWAYS(weight);
1011 
1012             if (!L) {
1013                 /* This is a non-call. It's only here in situations where
1014                  * at depth D, we have x calls of size 1, and then
1015                  * ((1<<D)-x) calls of size 0, which obviously correspond
1016                  * to nothing.
1017                  *
1018                  * One may question the meaning of the "weight" of these
1019                  * non-calls. I'd hazard the idea that 0 is most
1020                  * appropriate.
1021                  */
1022                 best[L] = { 0, 0 };
1023                 mesh_tt_weighted[0].first = 0; // += weight ?
1024                 continue;
1025             }
1026 
1027             lingen_call_companion::key K { depth, L };
1028 
1029             /* We **MUST** create hints[K], at this point. We will gain
1030              * _some_ insight from stored_hints[] and maybe from the
1031              * thresholds passed on the command line, but in any case we
1032              * will have to recompute the timings and the RAM usage */
1033 
1034             if (hints.find(K) != hints.end()) {
1035                 /* Then we only have to adjust, I think. At any rate, if
1036                  * we end up doing hints[K]=U here, we're going to
1037                  * overwrite the previously computed data, which sounds like a
1038                  * bad idea. */
1039                 hints[K].total_ncalls += weight;
1040                 ncalls_at_depth += weight;
1041                 continue;
1042             }
1043 
1044             configurations_to_test CF = find_configurations_to_test(os_pre, persist, K);
1045 
1046             std::map<unsigned int, lingen_call_companion> mesh_res;
1047             std::map<unsigned int, double> mesh_tt;
1048             std::map<unsigned int, double> mesh_tt_children;
1049 
1050             if (!CF.do_timings) do_all_timings = false;
1051 
1052             if (!CF.do_timings && CF.mesh_sizes.size() != 1)
1053                 throw std::invalid_argument("Cannot have \"notiming\" in tuning_thresholds without specified thresholds for all choices");
1054 
1055             if (CF.do_timings && !timed_something) {
1056                 os << os_pre.str();
1057                 os_pre.str("");
1058                 p_talk = &os;
1059                 timed_something = true;
1060             }
1061 
1062             for(auto mesh : CF.mesh_sizes) {
1063                 lingen_call_companion U;
1064                 U.mesh = mesh;
1065                 if (mesh == 0) {
1066                     U.ttb = compute_and_report_basecase(*p_talk, L, CF.do_timings);
1067                     mesh_res[mesh] = U;
1068                     if (!CF.do_timings) continue;
1069                     mesh_tt[mesh] = U.ttb;
1070                     mesh_tt_weighted[mesh].first += weight;
1071                     mesh_tt_weighted[mesh].second += mesh_tt[mesh] * weight;
1072                 } else {
1073                     U.mp = tune_local_at_depth_mp_or_mul(
1074                             *p_talk,
1075                             persist, cw, depth, mesh,
1076                             CF.ffts, CF.do_timings,
1077                             op_mul_or_mp_base::OP_MP);
1078                     U.mul = tune_local_at_depth_mp_or_mul(
1079                             *p_talk,
1080                             persist, cw, depth, mesh,
1081                             CF.ffts, CF.do_timings,
1082                             op_mul_or_mp_base::OP_MUL);
1083                     mesh_res[mesh] = U;
1084                     if (!CF.do_timings) continue;
1085                     mesh_tt[mesh] = U.mp.tt.t + U.mul.tt.t;
1086                     mesh_tt_children[mesh] += best[Lleft].second;
1087                     mesh_tt_children[mesh] += best[Lright].second;
1088                     mesh_tt[mesh] += mesh_tt_children[mesh];
1089                     mesh_tt_weighted[mesh].first += weight;
1090                     mesh_tt_weighted[mesh].second += mesh_tt[mesh] * weight;
1091                 }
1092             }
1093 
1094             /* find the best mesh value */
1095             unsigned int meshbest = UINT_MAX;
1096             double ttbest = DBL_MAX;
1097             if (CF.do_timings) {
1098                 for(auto x : mesh_tt) {
1099                     if (x.second < ttbest) {
1100                         meshbest = x.first;
1101                         ttbest = x.second;
1102                     }
1103                 }
1104             } else {
1105                 meshbest = CF.mesh_sizes.front();
1106             }
1107 
1108             lingen_call_companion U = mesh_res[meshbest];
1109             U.total_ncalls = 0;
1110             U.complete = true;
1111             /* we no longer store ttb in the call companions which
1112              * are intended for recursion */
1113             hints[K] = U;
1114             U_typical = U;
1115 
1116             if (CF.do_timings) {
1117                 /* discard mesh values that are less than the winner and
1118                  * appear to be slow enough that we don't think they'll
1119                  * ever catch up.
1120                  */
1121                 for(auto x : mesh_tt) {
1122                     if (x.first == meshbest || x.first > meshbest)
1123                         continue;
1124                     if (x.second >= basecase_keep_until * mesh_tt[meshbest]) {
1125                         (*p_talk) << fmt::sprintf("# Discarding %s from now on\n",
1126                                 strat_name[x.first]);
1127                         minimum_mesh = x.first + 1;
1128                     }
1129                 }
1130             }
1131 
1132             /* At this point, we started using collective operations,
1133              * which means that we don't want to go back. A priori.
1134              * But it's quite difficult indeed because we might see
1135              * some spurious results at small sizes. */
1136             // if (mesh > 1 && minimum_mesh <= 1) minimum_mesh = 2;
1137 
1138             best[L] = { meshbest, CF.do_timings ? mesh_tt[meshbest] : -1 };
1139 
1140             hints[K].total_ncalls += weight;
1141             ncalls_at_depth += weight;
1142 
1143         }
1144 
1145         /* Now give a summary at this level */
1146 
1147         size_t L0 = std::get<0>(cws.front());
1148         size_t L1 = std::get<0>(cws.back());
1149         /* calls_and_weights_at_depth must return a sorted list */
1150         ASSERT_ALWAYS(L0 <= L1);
1151         unsigned int mesh0 = best[L0].first;
1152         unsigned int mesh1 = best[L1].first;
1153 
1154         if (do_all_timings) {
1155             for(auto mesh : mesh_all) {
1156                 if (mesh_tt_weighted.find(mesh) != mesh_tt_weighted.end()) {
1157                     double tt = mesh_tt_weighted[mesh].second;
1158                     // std::string rescaled;
1159                     if (mesh_tt_weighted[mesh].first != (1U << depth)) {
1160                         double ratio = mesh_tt_weighted[mesh].first / (double) (1U << depth);
1161                         // rescaled = fmt::sprintf("[rescaled from %.1f%%] ", 100*ratio);
1162                         tt /= ratio;
1163                     }
1164                     (*p_talk) << fmt::sprintf("# %s (%u calls): %.2f [%.1fd]\n",
1165                             strat_name[mesh],
1166                             ncalls_at_depth,
1167                             // rescaled,
1168                             tt, tt / 86400);
1169                 }
1170             }
1171 
1172             double tt_total = best[L0].second * std::get<3>(cws.front())
1173                             + best[L1].second * std::get<3>(cws.back());
1174 
1175             if (mesh0 == mesh1) {
1176                 (*p_talk) << fmt::sprintf("# BEST: %s: %.2f [%.1fd]\n",
1177                         strat_name[mesh0],
1178                         tt_total, tt_total / 86400);
1179             } else {
1180                 (*p_talk) << fmt::sprintf("# BEST: mix of %s and %s: %.2f [%.1fd]\n",
1181                         strat_name[mesh0],
1182                         strat_name[mesh1],
1183                         tt_total, tt_total / 86400);
1184             }
1185         }
1186 
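        /* If at least one of the two lengths at this depth goes
         * recursive, report the memory footprint of whichever of MP or
         * MUL dominates, taken from a typical call companion */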
1187         if (mesh0 || mesh1) {
1188             lingen_call_companion U = U_typical;
1189             if (U.mp.ram_total() > U.mul.ram_total()) {
1190                 (*p_talk) << fmt::sprintf("#   (memory(MP): %s, incl %s reserved)\n",
1191                         size_disp(U.mp.ram_total()),
1192                         size_disp(U.mp.reserved_ram));
1193             } else {
1194                 (*p_talk) << fmt::sprintf("#   (memory(MUL): %s, incl %s reserved)\n",
1195                         size_disp(U.mul.ram_total()),
1196                         size_disp(U.mul.reserved_ram));
1197             }
1198 
1199         }
1200         return timed_something;
1201     }/*}}}*/
1202     lingen_hints tune_local(std::ostream& os, lingen_hints & stored_hints) {/*{{{*/
1203         size_t N = m*n*L/(m+n);
1204         char buf[20];
1205 
1206         /* Same mechanism as in tune_local_at_depth: output goes to a
1207          * separate stringstream until we are sure that there is
1208          * interesting output somewhere.
1209          */
1210         std::ostringstream os_pre;
1211         bool timed_something = false;
1212         std::ostream * p_talk = &os_pre;
1213 
1214         (*p_talk) << fmt::sprintf("# Measuring lingen data"
1215                 " for N ~ %zu m=%u n=%u"
1216                 " for a %zu-bit prime p,"
1217                 " using a %u*%u grid of %u-thread nodes"
1218                 " [max target RAM = %s]\n",
1219                 N, m, n, mpz_sizeinbase(p, 2),
1220                 P.r, P.r, P.T,
1221                 size_disp(P.available_ram, buf));
1222 #ifdef HAVE_OPENMP
1223         (*p_talk) << fmt::sprintf("# Note: non-cached basecase measurements"
1224                 " are done using openmp as it is configured"
1225                 " for the running code, that is, with %d threads\n",
1226                 P.openmp_threads);
1227 #endif
1228 
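        /* Candidate strategies: mesh 0 is the quadratic basecase, mesh 1
         * is recursive on a single node, and mesh P.r is recursive over
         * the whole P.r*P.r grid of nodes (only when P.r > 1) */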
1229         mesh_all.push_back(0); strat_name[0] = "basecase";
1230         mesh_all.push_back(1); strat_name[1] = "recursive(single-node)";
1231         if (P.r > 1) {
1232             mesh_all.push_back(P.r);
1233             strat_name[P.r] = fmt::sprintf("recursive(%d*%d-nodes)", P.r, P.r);
1234         }
1235 
1236         int fl = log2(L-1) + 1; /* ceil(log_2(L)) */
1237 
1238         tuner_persistent_data persist(stored_hints);
1239 
1240         /* with basecase_keep_until == 0, we never measure the basecase */
1241         if (basecase_keep_until == 0)
1242             persist.minimum_mesh = 1;
1243 
1244         if (persist.impose_hints) {
1245             (*p_talk) << fmt::sprintf("# While we are doing timings here,"
1246                     " we'll take schedule decisions based on the hints"
1247                     " found in %s when they apply\n", schedule_filename);
1248         }
1249 
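        /* Walk the recursion tree from the deepest level (depth fl,
         * smallest inputs) up to depth 0 (the full length L). As soon as
         * one depth produces real timings, flush the buffered preamble to
         * the real output stream and keep writing there directly */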
1250         for(int i = fl ; i>=0 ; i--) {
1251             bool timed_here = tune_local_at_depth((*p_talk), persist, i);
1252             if (timed_here && !timed_something) {
1253                 os << os_pre.str();
1254                 os_pre.str(""); /* actually clear the pre-buffer, which has just been flushed to os */
1255                 p_talk = &os;
1256                 timed_something = true;
1257             }
1258         }
1259 
1260         tuner_persistent_data::level_strategy_map & best(persist.best);
1261         lingen_hints & hints(persist.hints);
1262         size_t peak(persist.peak);
1263         int ipeak(persist.ipeak);
1264 
1265         /*****************************************************************/
1266         /* make the mesh sizes monotonic. In truth, we can only do this
1267          * safely for mesh size 0 (basecase), since otherwise we would
1268          * have an inconsistency in the sub-block sizes, which would make
1269          * the batch values invalid. */
1270         /* keys in the hint table are sorted as "top-level first" */
1271         std::map<unsigned int, lingen_call_companion::key> max_win_per_mesh;
1272         for(auto const & x : hints) {
1273             if (max_win_per_mesh.size() == mesh_all.size()) break;
1274             if (max_win_per_mesh.find(x.second.mesh) == max_win_per_mesh.end())
1275                 max_win_per_mesh[x.second.mesh] = x.first;
1276         }
1277         for(auto & x : hints) {
1278             for(auto const & y : max_win_per_mesh) {
1279                 if (y.first) continue;  // see above
1280                 if (x.second.mesh > y.first && x.first < y.second) {
1281                     (*p_talk) << fmt::format(FMT_STRING("## forcing {} at ({}) since it is known to win at ({})\n"), strat_name[y.first],
1282                             x.first, y.second);
1283                     x.second.mesh = y.first;
1284                 }
1285             }
1286         }
1287 
1288         os << ("#########################  Summary of timing data  #######################\n");
1289         if (!tuning_thresholds.empty()) {
1290             std::ostringstream ss;
1291             for(auto const & x : tuning_thresholds) {
1292                 if (!ss.str().empty()) ss << ",";
1293                 ss << x.first << ':' << x.second;
1294             }
1295             os << fmt::sprintf("# Using explicit tuning_thresholds=%s (from command-line)\n", ss.str());
1296         }
1297         if (!timed_something) {
1298             os << "# note: no real timing data is reported, since the notiming\n"
1299                << "#       threshold has explicitly prevented all measurements\n";
1300         }
1301 
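        /* Communication volume and time at lingen_mpi_threshold; this is
         * added to the expected total time below, and also used to derive
         * the per-unit gather/scatter costs stored in the hints */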
1302         size_t size_com0;
1303         double tt_com0;
1304         std::tie(size_com0, tt_com0) = mpi_threshold_comm_and_time();
1305         os << fmt::sprintf("# Communication time at lingen_mpi_threshold (%s): %.2f [%.1fd]\n", size_disp(size_com0, buf), tt_com0, tt_com0/86400);
1306         double time_best = best[L].second;
1307         if (time_best != -1) {
1308             time_best += tt_com0;
1309             os << fmt::sprintf("# Expected total time: %.2f [%.1fd], peak memory %s (at depth %d)\n", time_best, time_best / 86400, size_disp(peak, buf), ipeak);
1310         }
1311         hints.ipeak=ipeak;
1312         hints.peak=peak;
1313         os << fmt::sprintf("(%u,%u,%u,%.1f,%.1f)\n", m, n, P.r, time_best, (double) peak/1024./1024./1024.);
1314 
1315         /* Scatter/gather time is strictly linear in the length anyway: split the communication time evenly between the two and normalize per length unit */
1316         hints.tt_gather_per_unit = tt_com0 / 2 / L;
1317         hints.tt_scatter_per_unit = tt_com0 / 2 / L;
1318 
1319         return hints;
1320     }/*}}}*/
1321     lingen_hints tune(std::ostream& os) {/*{{{*/
1322         int rank;
1323         MPI_Comm_rank(P.comm, &rank);
1324         lingen_hints hints;
1325 
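        /* Tuning is done on rank 0 only: optionally seed the tuner with a
         * previously saved schedule, run the local tuning, and save the
         * resulting schedule if none was read */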
1326         if (rank == 0) {
1327             lingen_hints stored_hints;
1328             if (schedule_filename) {
1329                 std::ifstream is(schedule_filename);
1330                 if (is && is >> stored_hints) {
1331                     /* This one _always_ goes to stdout */
1332                     std::cout << fmt::sprintf("# Read tuning schedule from %s\n", schedule_filename);
1333                 } else {
1334                     std::cerr << fmt::sprintf("# Failed to read tuning schedule from %s\n", schedule_filename);
1335                 }
1336             }
1337             hints = tune_local(os, stored_hints);
1338             if (schedule_filename && stored_hints.empty()) {
1339                 std::ofstream sched_os(schedule_filename);
1340                 if (sched_os && sched_os << hints) {
1341                     /* This one _always_ goes to stdout */
1342                     std::cout << fmt::sprintf("# Written tuning schedule to %s\n", schedule_filename);
1343                 } else {
1344                     std::cerr << fmt::sprintf("# Failed to write tuning schedule to %s\n", schedule_filename);
1345                 }
1346             }
1347         }
1348 
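        /* Make the hints computed on rank 0 available to all ranks of the
         * communicator */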
1349         hints.share(0, P.comm);
1350 
1351         return hints;
1352     }/*}}}*/
1353 };
1354 
1355 const std::vector<const char *>
1356 lingen_tuner::tuning_thresholds_t::thresholds_verbs
1357 {
1358     recursive,
1359     collective,
1360     // "collective" should allow finer grain, so as to allow
1361     // multiple mesh sizes.
1362     // "plain" makes no sense per se, as it's the base thing
1363     // to use as soon as we go recursive
1364 #ifdef SELECT_MPFQ_LAYER_u64k1
1365     ternary,
1366     cantor,
1367 #else
1368     flint,
1369 #endif
1370     notiming,
1371 };
1372 
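/* Map each FFT variant of the current arithmetic layer to the keyword
 * that refers to it in the tuning_thresholds parameter */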
1373 const std::vector<std::pair<lingen_substep_schedule::fft_type_t, const char *>> lingen_tuner::tuning_thresholds_t::code_to_key
1374 #ifdef SELECT_MPFQ_LAYER_u64k1
1375 {
1376     { lingen_substep_schedule::FFT_NONE, recursive, },
1377     { lingen_substep_schedule::FFT_TERNARY, ternary, },
1378     { lingen_substep_schedule::FFT_CANTOR, cantor },
1379 };
1380 #else
1381 {
1382     { lingen_substep_schedule::FFT_NONE, recursive, },
1383     { lingen_substep_schedule::FFT_FLINT, flint, },
1384 };
1385 #endif
1386 
1387 /* Out-of-class definitions of the threshold keywords: needed for conformance, as these constexpr static members are odr-used */
1388 constexpr const char * lingen_tuner::tuning_thresholds_t::recursive;
1389 constexpr const char * lingen_tuner::tuning_thresholds_t::collective;
1390 constexpr const char * lingen_tuner::tuning_thresholds_t::ternary;
1391 constexpr const char * lingen_tuner::tuning_thresholds_t::cantor;
1392 constexpr const char * lingen_tuner::tuning_thresholds_t::flint;
1393 constexpr const char * lingen_tuner::tuning_thresholds_t::notiming;
1394 
1395 
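/* Public entry point: choose the output stream according to the output
 * options (quiet, a dedicated tuning log file, or stdout) and run the
 * tuner on it */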
1396 lingen_hints lingen_tuning(bw_dimensions & d, size_t L, MPI_Comm comm, cxx_param_list & pl)
1397 {
1398     lingen_tuner::output_info O(pl);
1399     if (O.quiet) {
1400         std::ostringstream os;
1401         return lingen_tuner(os, d, L, comm, pl).tune(os);
1402     } else if (O.tuning_log_filename) {
1403         std::ofstream os(O.tuning_log_filename, std::ios_base::out);
1404         return lingen_tuner(os, d, L, comm, pl).tune(os);
1405     } else {
1406         return lingen_tuner(std::cout, d, L, comm, pl).tune(std::cout);
1407     }
1408 }
1409 
1410 void lingen_tuning_decl_usage(cxx_param_list & pl)
1411 {
1412     lingen_tuner::declare_usage(pl);
1413 }
1414 
1415 void lingen_tuning_lookup_parameters(cxx_param_list & pl)
1416 {
1417     lingen_tuner::lookup_parameters(pl);
1418 }
1419 
1420