1 #include "cado.h" // IWYU pragma: keep
2
3 #include <climits> // for UINT_MAX
4 #include <cstdint> // for SIZE_MAX
5 #include <cfloat> // for DBL_MAX
6 #include <cstdio> // for fputs, stderr
7 #include <cmath> // for log2
8
9 #include <algorithm> // for max, sort, find, unique
10 #include <array> // for array
11 #include <iostream> // for operator<<, basic_ostream
12 #include <fstream> // ofstream // IWYU pragma: keep
13 #include <map> // for map, operator!=, map<>...
14 #include <memory> // for allocator_traits<>::va...
15 #include <stdexcept> // for invalid_argument, over...
16 #include <string> // for string, operator<<
17 #include <tuple> // for tie, get, tuple
18 #include <utility> // for pair, make_pair
19 #include <vector> // for vector
20
21 #include <gmp.h> // for mp_limb_t, mpz_size
22
23 #include "lingen_tuning.hpp"
24 #include "cxx_mpz.hpp" // for cxx_mpz
25 #include "fmt/core.h" // for check_format_string
26 #include "fmt/format.h" // for basic_buffer::append
27 #include "fmt/ostream.h" // for formatbuf<>::int_type
28 #include "lingen_abfield.hpp" // for abdst_field
29 #include "lingen_bw_dimensions.hpp" // for bw_dimensions
30 #include "lingen_call_companion.hpp" // for lingen_call_companion
31 #include "lingen_matpoly_select.hpp" // for matpoly, matpoly::over...
32 #include "lingen_mul_substeps_base.hpp" // for op_mul_or_mp_base, op_...
33 #include "lingen_platform.hpp" // for lingen_platform
34 #include "lingen_qcode_select.hpp" // for test_basecase
35 #include "lingen_substep_characteristics.hpp" // for lingen_substep_charact...
36 #include "lingen_substep_schedule.hpp" // for lingen_substep_schedule
37 #include "lingen_tuning_cache.hpp" // for lingen_tuning_cache::c...
38 #include "macros.h" // for iceildiv, ASSERT_ALWAYS
39 #include "misc.h" // for size_disp
40 #include "params.h" // for cxx_param_list, param_...
41 #include "subdivision.hpp" // for subdivision
42 #include "timing.h" // for wct_seconds, weighted_...
43
44 /* {{{ all_splits_of
45 * Given n>=1 return the list of all integers k (1<=k<=n) such
46 * that it is possible to divide n into k blocks (not necessarily of
47 * equal size) but such that all the splits that are obtained this way
48 * have different maximal block sizes.
49 *
50 * (in the binary case we impose that block sizes are always multiples of
51 * 8 -- this is done to limit the number of possible configurations)
52 *
53 * The returned list is such that k->iceildiv(n, k) actually performs the
54 * reversal of the list. And furthermore, for all k's such that k^2<=n,
55 * the obtained splits are different. Hence it suffices to iterate over
56 * all these k's
57 */
all_splits_of(unsigned int n)58 std::vector<unsigned int> all_splits_of(unsigned int n)
59 {
60 #ifdef SELECT_MPFQ_LAYER_u64k1
61 n /= 8;
62 #endif
63 std::vector<unsigned int> res;
64 for(unsigned int k = 1 ; k * k <= n ; k++) res.push_back(k);
65 unsigned int j = res.size();
66 if (res[j-1] * res[j-1] == n) j--;
67 for( ; j-- ; ) res.push_back(iceildiv(n, res[j]));
68 return res;
69 }
70 /* }}} */
71
optimize(std::ostream & os,lingen_substep_characteristics const & U,lingen_platform const & P,unsigned int mesh,std::vector<lingen_substep_schedule::fft_type_t> const & allowed_ffts,bool do_timings,lingen_tuning_cache & C,size_t reserved)72 std::vector<lingen_substep_schedule> optimize(
73 std::ostream& os,
74 lingen_substep_characteristics const & U,
75 lingen_platform const & P,
76 unsigned int mesh,
77 std::vector<lingen_substep_schedule::fft_type_t> const & allowed_ffts,
78 bool do_timings,
79 lingen_tuning_cache & C,
80 size_t reserved)
81 { /* {{{ */
82 unsigned int nr0 = U.mpi_split0(mesh).block_size_upper_bound();
83 unsigned int nr1 = U.mpi_split1(mesh).block_size_upper_bound();
84 unsigned int nr2 = U.mpi_split2(mesh).block_size_upper_bound();
85 size_t min_my_ram = SIZE_MAX;
86 lingen_substep_schedule S_lean;
87 S_lean.fft_type = lingen_substep_schedule::FFT_NONE; /* placate compiler */
88 std::vector<lingen_substep_schedule> all_schedules;
89
90 unsigned int nvalid = 0;
91 for(lingen_substep_schedule::fft_type_t fft : allowed_ffts) {
92 if (!U.fft_type_valid(fft)) continue;
93 nvalid++;
94 }
95 if (!nvalid) {
96 throw std::invalid_argument("FFT choice restricted to zero valid FFTs, tuning_thresholds is probably wrong");
97 }
98
99 for(lingen_substep_schedule::fft_type_t fft : allowed_ffts) {
100 if (!U.fft_type_valid(fft)) continue;
101 for(unsigned int shrink0 : all_splits_of(nr0)) {
102 for(unsigned int shrink2 : all_splits_of(nr2)) {
103 if (fft == lingen_substep_schedule::FFT_NONE)
104 if (shrink0 > 1 || shrink2 > 1) continue;
105 unsigned int nrs0 = U.shrink_split0(mesh, shrink0).block_size_upper_bound();
106 unsigned int nrs2 = U.shrink_split2(mesh, shrink2).block_size_upper_bound();
107 /* first the splits with b0 == nrs0 */
108 {
109 unsigned int b0 = nrs0;
110 for(unsigned int b1 : all_splits_of(nr1)) {
111 for(unsigned int b2 : all_splits_of(nrs2)) {
112 lingen_substep_schedule S;
113 S.fft_type = fft;
114 S.shrink0 = shrink0;
115 S.shrink2 = shrink2;
116 S.batch = {{ b0, b1, b2 }};
117 size_t my_ram = U.get_peak_ram(mesh, S);
118 if (reserved + my_ram <= P.available_ram) {
119 all_schedules.push_back(S);
120 }
121 if (my_ram < min_my_ram) {
122 min_my_ram = my_ram;
123 S_lean = S;
124 }
125 }
126 }
127 }
128 /* then the splits with b2 == nrs2 */
129 {
130 unsigned int b2 = nrs2;
131 for(unsigned int b1 : all_splits_of(nr1)) {
132 for(unsigned int b0 : all_splits_of(nrs0)) {
133 lingen_substep_schedule S;
134 S.fft_type = fft;
135 S.shrink0 = shrink0;
136 S.shrink2 = shrink2;
137 S.batch = {{ b0, b1, b2 }};
138 size_t my_ram = U.get_peak_ram(mesh, S);
139 if (reserved + my_ram <= P.available_ram) {
140 all_schedules.push_back(S);
141 }
142 if (my_ram < min_my_ram) {
143 min_my_ram = my_ram;
144 S_lean = S;
145 }
146 }
147 }
148 }
149 }
150 }
151 }
152
153 std::sort(all_schedules.begin(), all_schedules.end());
154
155 all_schedules.erase(
156 std::unique(all_schedules.begin(), all_schedules.end()),
157 all_schedules.end());
158
159 if (all_schedules.empty()) {
160 char buf[20];
161 std::ostringstream os;
162 os << "Fatal error:"
163 << " it is not possible to complete this calculation with only "
164 << size_disp(P.available_ram, buf)
165 << " of memory for intermediate transforms.\n";
166 os << "Based on the cost for input length "
167 << U.input_length
168 << ", we need at the very least "
169 << size_disp(U.get_peak_ram(mesh, S_lean), buf);
170 os << ", plus "
171 << size_disp(reserved, buf)
172 << " for reserved memory at upper levels";
173 os << " [with shrink="
174 << fmt::format(FMT_STRING("({},{})"),
175 S_lean.shrink0, S_lean.shrink2)
176 << ", batch="
177 << fmt::format(FMT_STRING("({},{},{})"),
178 S_lean.batch[0], S_lean.batch[1], S_lean.batch[2])
179 << "]\n";
180 fputs(os.str().c_str(), stderr);
181 throw std::overflow_error(os.str());
182 }
183
184 U.sort_schedules(os, all_schedules, P, mesh, C, do_timings);
185
186 std::map<std::string, lingen_substep_schedule> families;
187 std::vector<lingen_substep_schedule> res;
188
189 for(auto const & S : all_schedules) {
190 std::string f = fmt::sprintf("%s%s;%s",
191 mesh > 1 ? "MPI-" : "",
192 op_mul_or_mp_base::op_name(U.op_type),
193 S.fft_name());
194 if (families.find(f) == families.end()) {
195 families[f] = S;
196 res.push_back(S);
197 }
198 }
199
200 return res;
201 }
202 /* }}} */
203
204 struct lingen_tuner {
205 typedef lingen_platform pc_t;
206 typedef lingen_substep_schedule sc_t;
207
208 abdst_field ab; /* imported from the dims struct */
209 cxx_mpz p;
210 unsigned int m,n;
211 size_t L;
212 lingen_platform P;
213 lingen_tuning_cache C;
214 gmp_randstate_t rstate;
215 const char * timing_cache_filename = NULL;
216 const char * schedule_filename = NULL;
217
218 /* stop measuring the time taken by the basecase when it is
219 * more than this number times the time taken by the other
220 * alternatives
221 */
222 double basecase_keep_until = 1.8;
223
224 struct tuning_thresholds_t : public std::map<std::string, unsigned int> {/*{{{*/
225 typedef std::map<std::string, unsigned int> super;
226 static constexpr const char * recursive = "recursive";
227 static constexpr const char * collective = "collective";
228 static constexpr const char * ternary = "ternary";
229 static constexpr const char * cantor = "cantor";
230 static constexpr const char * flint = "flint";
231 static constexpr const char * notiming = "notiming";
232 static const std::vector<const char *> thresholds_verbs;
233 static const std::vector<std::pair<lingen_substep_schedule::fft_type_t, const char *>> code_to_key;
234
haslingen_tuner::tuning_thresholds_t235 bool has(std::string const & key) const {
236 return find(key) != end();
237 }
238 private:
getreflingen_tuner::tuning_thresholds_t239 unsigned int& getref(std::string const & key) {
240 return ((super&)(*this))[key];
241 }
242 public:
operator []lingen_tuner::tuning_thresholds_t243 unsigned int operator[](std::string const & key) const {
244 return has(key) ? at(key) : UINT_MAX;
245 }
tuning_thresholds_tlingen_tuner::tuning_thresholds_t246 tuning_thresholds_t(cxx_param_list & pl, std::ostream& os, lingen_platform const & P) {/*{{{*/
247 const char * tmp = param_list_lookup_string(pl, "tuning_thresholds");
248 if (!tmp) return;
249 std::string tlist = tmp;
250 for(size_t pos = 0 ; pos != std::string::npos ; ) {
251 size_t next = tlist.find(',', pos);
252 std::string tok;
253 if (next == std::string::npos) {
254 tok = tlist.substr(pos);
255 pos = next;
256 } else {
257 tok = tlist.substr(pos, next - pos);
258 pos = next + 1;
259 }
260 auto error = [&tok](std::string const& reason) {
261 std::string base = fmt::format(
262 "tuning_thresholds is bad:"
263 " pair \"{}\" ", tok);
264 throw std::invalid_argument(base + reason);
265 };
266
267 size_t colon = tok.find(':');
268 if (colon == std::string::npos)
269 error("has no colon");
270
271 std::string algorithm = tok.substr(0, colon);
272 if (std::find(thresholds_verbs.begin(), thresholds_verbs.end(), algorithm) == thresholds_verbs.end()) {
273 std::ostringstream os;
274 for(auto const & x : thresholds_verbs)
275 os << " " << x;
276 error(fmt::format(
277 "uses unrecognized key \"{}\""
278 " (recognized keys:{})",
279 algorithm, os.str()));
280 }
281
282 unsigned int & dst(getref(algorithm));
283 if (!(std::istringstream(tok.substr(colon + 1)) >> dst))
284 error("has no understandable integer threshold");
285 }
286 if (has(collective) && P.r == 1) {
287 const char * what = "interpreted as \"recursive\"";
288 if (!has(recursive)) {
289 getref(recursive) = getref(collective);
290 } else if (getref(collective) < getref(recursive)) {
291 getref(recursive) = getref(collective);
292 } else {
293 what = "ignored";
294 }
295 erase(find(collective));
296 os << "# Note: the tuning threshold \"collective\" is "
297 << what << " here, since we have a non-MPI run\n";
298 }
299 }/*}}}*/
300 };/*}}}*/
301
302 tuning_thresholds_t tuning_thresholds;
303
304 /* length(E), length(E_left), length(E_right), number of occurrences */
305 typedef std::tuple<size_t, size_t, size_t, unsigned int> weighted_call_t;
306 std::vector<unsigned int> mesh_all;
307 std::map<unsigned int, std::string> strat_name;
308
309
310 struct output_info {/*{{{*/
311 int quiet = 0;
312 const char * tuning_log_filename = NULL;
declare_usagelingen_tuner::output_info313 static void declare_usage(cxx_param_list & pl) {/*{{{*/
314 param_list_decl_usage(pl, "tuning_log_filename",
315 "Output tuning log to this file\n");
316 param_list_decl_usage(pl, "tuning_quiet",
317 "Silence tuning log\n");
318 }/*}}}*/
lookup_parameterslingen_tuner::output_info319 static void lookup_parameters(cxx_param_list & pl) {/*{{{*/
320 lingen_platform::lookup_parameters(pl);
321 param_list_lookup_string(pl, "tuning_quiet");
322 param_list_lookup_string(pl, "tuning_log_filename");
323 }/*}}}*/
output_infolingen_tuner::output_info324 output_info(cxx_param_list & pl) {
325 tuning_log_filename = param_list_lookup_string(pl, "tuning_log_filename");
326 param_list_parse_int(pl, "tuning_quiet", &quiet);
327 }
328 };/*}}}*/
declare_usagelingen_tuner329 static void declare_usage(cxx_param_list & pl) {/*{{{*/
330 lingen_platform::declare_usage(pl);
331 output_info::declare_usage(pl);
332 param_list_decl_usage(pl, "tuning_schedule_filename",
333 "Save (and re-load if it exists) tuning schedule from this file");
334 param_list_decl_usage(pl, "tuning_timing_cache_filename",
335 "Save (and re-load) timings for individual transforms in this file\n");
336 param_list_decl_usage(pl, "basecase-keep-until",
337 "When tuning, stop measuring basecase timing when it exceeds the time of the recursive algorithm (counting its leaf calls) by this factor\n");
338 param_list_decl_usage(pl, "tuning_thresholds",
339 "comma-separated list of threshols, given in the form <algorithm>:<threshold> value. Recognized values for <algorithm> are a subset of recursive,gfp_plain,flint,cantor,gf2x_plain. Thresholds are integers corresponding to the input size of E\n");
340 }/*}}}*/
lookup_parameterslingen_tuner341 static void lookup_parameters(cxx_param_list & pl) {/*{{{*/
342 lingen_platform::lookup_parameters(pl);
343 output_info::lookup_parameters(pl);
344 param_list_lookup_string(pl, "tuning_schedule_filename");
345 param_list_lookup_string(pl, "tuning_timing_cache_filename");
346 param_list_lookup_string(pl, "basecase-keep-until");
347 param_list_lookup_string(pl, "tuning_thresholds");
348 }/*}}}*/
349
lingen_tunerlingen_tuner350 lingen_tuner(std::ostream& os, bw_dimensions & d, size_t L, MPI_Comm comm, cxx_param_list & pl) :/*{{{*/
351 ab(d.ab),
352 m(d.m), n(d.n), L(L), P(comm, pl),
353 tuning_thresholds(pl, os, P)
354 {
355 #ifdef SELECT_MPFQ_LAYER_u64k1
356 mpz_set_ui(p, 2);
357 #else
358 mpz_set (p, abfield_characteristic_srcptr(ab));
359 #endif
360 gmp_randinit_default(rstate);
361 gmp_randseed_ui(rstate, 1);
362
363 param_list_parse_double(pl, "basecase-keep-until", &basecase_keep_until);
364
365 schedule_filename = param_list_lookup_string(pl, "tuning_schedule_filename");
366 /* only the leader will do the tuning, so only the leader cares
367 * about loading/saving it...
368 */
369 timing_cache_filename = param_list_lookup_string(pl, "tuning_timing_cache_filename");
370 int rank;
371 MPI_Comm_rank(P.comm, &rank);
372 if (rank == 0)
373 C.load(timing_cache_filename);
374 }/*}}}*/
375 // coverity[exn_spec_violation]
~lingen_tunerlingen_tuner376 ~lingen_tuner() {/*{{{*/
377 int rank;
378 MPI_Comm_rank(P.comm, &rank);
379 if (rank == 0)
380 C.save(timing_cache_filename);
381 gmp_randclear(rstate);
382 }/*}}}*/
mpi_threshold_comm_and_timelingen_tuner383 std::tuple<size_t, double> mpi_threshold_comm_and_time() {/*{{{*/
384 /* This is the time taken by gather() and scatter() right at the
385 * threshold point. This total time is independent of the
386 * threshold itself, in fact.
387 */
388 size_t data0 = m*(m+n)*(P.r*P.r-1);
389 data0 = data0 * L;
390 #ifdef SELECT_MPFQ_LAYER_u64k1
391 data0 = iceildiv(data0, ULONG_BITS) * sizeof(unsigned long);
392 #else
393 data0 = abvec_elt_stride(ab, data0);
394 #endif
395
396 // We must **NOT** divide by r*r, because the problem is
397 // precisely caused by the fact that gather() and scatter() all
398 // imply one contention point which is the central node.
399 // data0 = data0 / (r*r);
400 std::tuple<size_t, double> vv { 2 * data0, 2 * data0 / P.mpi_xput};
401 return vv;
402 }/*}}}*/
compute_and_report_basecaselingen_tuner403 double compute_and_report_basecase(std::ostream& os, size_t length, bool do_timings) { /*{{{*/
404 double tt;
405
406 lingen_tuning_cache::basecase_key K { mpz_sizeinbase(p, 2), m, n, length, P.openmp_threads };
407
408 os << fmt::format(FMT_STRING("# basecase (@{}): "), length);
409
410 if (!do_timings) {
411 C[K] = { 0 };
412 os << "not timed\n";
413 return C[K];
414 }
415
416 if (!C.has(K)) {
417 os << std::flush;
418 tt = wct_seconds();
419 test_basecase(ab, m, n, length, rstate);
420 tt = wct_seconds() - tt;
421 C[K] = { tt };
422 os << fmt::sprintf("%.2f\n", tt);
423 } else {
424 os << fmt::sprintf("%.2f [from cache]\n", C[K]);
425 }
426 return C[K];
427 }/*}}}*/
calls_and_weights_at_depthlingen_tuner428 std::vector<weighted_call_t> calls_and_weights_at_depth(int i) {/*{{{*/
429 /*
430 * Let L = (Q << (i+1)) + (u << i) + v, with u={0,1} and
431 * 0<=v<(1<<i). We have L % (1 << i) = v. Note that u=(L>>i)%2.
432 * Let U = u << i.
433 *
434 * input length at depth i, a.k.a. length(E), is either
435 * L>>i=floor(L/2^i)=2Q+u or one above. The number of calls is:
436 *
437 * v times:
438 * length(E) = 2Q + u + 1
439 * length(E_left) = Q + 1
440 * length(E_right) = Q + u
441 * otherwise:
442 * length(E) = 2Q + u
443 * length(E_left) = Q + u
444 * length(E_right) = Q
445 * And in all cases:
446 * length(pi_left) = ceiling(alpha * length(E_left))
447 * length(pi_right) = ceiling(alpha * length(E_right))
448 *
449 * The length of the chopped copy of E is:
450 * length(E_right) + length(pi_right) - 1
451 *
452 * The length of the product pi_left*pi_right is
453 * ceiling(alpha * length(E_left)) + ceiling(alpha * length(E_right)) - 1
454 * This should also be less than or equal to
455 * ceiling(alpha * length(E))
456 * So that in cases where the former bound is larger than the
457 * latter, we're almost sure that the top coefficients multiply
458 * to a zero matrix.
459 *
460 */
461
462 size_t Q = L >> (i+1);
463 size_t u = (L >> i) & 1;
464 size_t v = L % (1 << i);
465
466 /* at i == floor level, we have (1<<(i-1)) < L <= (1<<i), Q = 0, and:
467 *
468 * if v, then u=0 and v=L. we have v calls with size 1 and (1<<i)-v
469 * with size 0 on the first read, but the calls with size 0 don't
470 * really exist, so that in fact we're seeing leftovers of calls
471 * with size 1 at the level above. So we actually have _at depth
472 * i proper_ 2*v-(1<<i) calls with size 1, and that's all.
473 *
474 * if v==0, then u=1. we have all calls with size 1, code below
475 * is fine
476 */
477
478 if (v && !((L-1)>>i)) {
479 ASSERT_ALWAYS(Q == 0 && u == 0);
480 weighted_call_t w1 { 2*Q + u + 1, Q + 1, Q + u, 2*v - (1 << i) };
481 std::vector<weighted_call_t> res {{ w1 }};
482 return res;
483 } else if (v) {
484 weighted_call_t w0 { 2*Q + u, Q + u, Q, (1 << i) - v };
485 weighted_call_t w1 { 2*Q + u + 1, Q + 1, Q + u, v };
486 std::vector<weighted_call_t> res {{ w0, w1 }};
487 return res;
488 } else {
489 weighted_call_t w0 { 2*Q + u, Q + u, Q, (1 << i) - v };
490 std::vector<weighted_call_t> res {{ w0 }};
491 return res;
492 }
493 }/*}}}*/
substeplingen_tuner494 lingen_substep_characteristics substep(weighted_call_t const & cw, op_mul_or_mp_base::op_type_t op) { /* {{{ */
495 size_t length_E;
496 size_t length_E_left;
497 size_t length_E_right;
498 unsigned int weight;
499
500 std::tie(length_E, length_E_left, length_E_right, weight) = cw;
501
502 ASSERT_ALWAYS(length_E >= 2);
503 ASSERT_ALWAYS(weight);
504
505 size_t asize, bsize, csize;
506
507 if (op == op_mul_or_mp_base::OP_MP) {
508 csize = length_E_right;
509 /* pi is the identity matrix for zero coefficients, but the
510 * identity matrix already has length 1.
511 */
512 bsize = 1 + iceildiv(m * length_E_left, m+n);
513 asize = csize + bsize - 1;
514 } else {
515 asize = 1 + iceildiv(m * length_E_left, m+n);
516 bsize = 1 + iceildiv(m * length_E_right, m+n);
517 csize = asize + bsize - 1;
518 }
519
520 ASSERT_ALWAYS(asize);
521 ASSERT_ALWAYS(bsize);
522
523 return lingen_substep_characteristics(
524 ab, rstate, length_E,
525 op,
526 op == op_mul_or_mp_base::OP_MP ? m : (m+n),
527 m+n, m+n,
528 asize, bsize, csize);
529 } /* }}} */
recursion_makes_senselingen_tuner530 bool recursion_makes_sense(size_t L) const {/*{{{*/
531 return L >= 2;
532 }/*}}}*/
533 struct tuner_persistent_data {/*{{{*/
534 typedef std::map<size_t, std::pair<unsigned int, double>, lingen_tuning_cache::coarse_compare> level_strategy_map;
535 lingen_hints hints;
536 lingen_hints const & stored_hints;
537 level_strategy_map best;
538 /* The "minimum mesh" field is only used by
539 * tune_local_at_depth. This determines the "mesh" of calls
540 * that will be tried. Options are:
541 *
542 * mesh=0 (if 0>=minimum_mesh) : basecase
543 * mesh=1 (if 1>=minimum_mesh) : recursive, single-node
544 * other : recursive, multi-node
545 *
546 * Currently, because scatter_mat and gather_mat are limited to
547 * 1-n and n-1 conversions, we transition to single- to
548 * multi-node all in one go.
549 */
550 unsigned int minimum_mesh = 0;
551 double last_save = 0;
552 size_t peak = 0;
553 int ipeak = 0;
554 size_t upper_threshold = 0;
555 bool impose_hints;
tuner_persistent_datalingen_tuner::tuner_persistent_data556 tuner_persistent_data(lingen_hints const & stored_hints) : stored_hints(stored_hints) {
557 last_save = wct_seconds();
558 impose_hints = !stored_hints.empty();
559 }
560 };/*}}}*/
tune_local_at_depth_mp_or_mullingen_tuner561 lingen_call_companion::mul_or_mp_times tune_local_at_depth_mp_or_mul(
562 std::ostream& os,
563 tuner_persistent_data & persist,
564 weighted_call_t cw,
565 int depth,
566 unsigned int mesh,
567 std::vector<lingen_substep_schedule::fft_type_t> const & allowed_ffts,
568 bool do_timings,
569 op_mul_or_mp_base::op_type_t op_type)/*{{{*/
570 {
571 lingen_call_companion::mul_or_mp_times U { op_type };
572 lingen_hints const & stored_hints(persist.stored_hints);
573 double & last_save(persist.last_save);
574 size_t & peak(persist.peak);
575 int & ipeak(persist.ipeak);
576
577 /* For input length L, the reserved
578 * storage at depth i is
579 * RMP'(i) = [m/r][(m+n)/r][(1+\alpha)(L-2\ell_i)] + [(m+n)/r]^2*[2\alpha\ell_i]
580 * RMUL'(i) = [m/r][(m+n)/r][(1+\alpha)(L-2\ell_i)] + [(m+n)/r]^2*[4\alpha\ell_i]
581 * with the notations \alpha=m/(m+n), \ell_i=L/2^(i+1), and
582 * [] denotes ceiling.
583 * The details of the computation are in the comments in
584 * lingen.cpp
585 *
586 * Note that here, we're using P.r and not the mesh parameter,
587 * because what matters is what we're reserving at the _upper_
588 * levels. And most importantly, chances are that the largest
589 * share of the reserved memory comes from the topmost levels
590 * anyway, so we surmise that P.r is appropriate there.
591 */
592 size_t base_E = iceildiv(m,P.r)*iceildiv(m+n,P.r)*mpz_size(p)*sizeof(mp_limb_t);
593 size_t base_pi = iceildiv(m+n,P.r)*iceildiv(m+n,P.r)*mpz_size(p)*sizeof(mp_limb_t);
594 constexpr const unsigned int simd = matpoly::over_gf2 ? ULONG_BITS : 1;
595 size_t reserved_base = base_E * iceildiv(L - (L >> depth), simd);
596 size_t reserved_mp = base_pi * iceildiv(iceildiv(m * iceildiv(L, 1<<depth), m+n), simd);
597 size_t reserved_mul = base_pi * iceildiv(iceildiv(m * iceildiv(2*L, 1<<depth), m+n), simd);
598 reserved_mp += reserved_base;
599 reserved_mul += reserved_base;
600
601 size_t reserved = op_type == op_mul_or_mp_base::OP_MP ? reserved_mp : reserved_mul;
602
603 os << fmt::format(FMT_STRING("# {} reserved storage = {}\n"),
604 op_mul_or_mp_base::op_name(op_type),
605 size_disp(reserved));
606
607 size_t L, Lleft, Lright;
608 unsigned int weight;
609 std::tie(L, Lleft, Lright, weight) = cw;
610 lingen_call_companion::key K { depth, L };
611 ASSERT_ALWAYS(weight);
612
613 ASSERT_ALWAYS (recursion_makes_sense(L));
614 auto step = substep(cw, op_type);
615 bool print_here = true;
616 #if 0
617 if (print_here)
618 step.report_size_stats_human(os);
619 #endif
620
621 std::vector<lingen_substep_schedule> SS;
622
623 if (stored_hints.find(K) != stored_hints.end()) {
624 lingen_substep_schedule S = stored_hints.at(K)[op_type].S;
625 if (step.is_valid_schedule(P, mesh, S)) {
626 os << "# Using imposed schedule " << S.serialize() << "\n";
627 SS.push_back(S);
628 } else {
629 os << "# IGNORING invalid schedule " << S.serialize() << "\n";
630 }
631 }
632
633 if (SS.empty()) {
634 /* get the schedule by trying all possibilities for the
635 * shrink and batch options. */
636 SS = optimize(os, step, P, mesh,
637 allowed_ffts, do_timings, C, reserved);
638 }
639
640 for(auto const & S : SS)
641 step.get_and_report_call_time(os, P, mesh, S, C, do_timings);
642
643 lingen_substep_schedule S = SS.front();
644
645 if (print_here)
646 step.report_op_winner(os, mesh, S);
647
648 if (do_timings) {
649 if (wct_seconds() > last_save + 10) {
650 int rank;
651 MPI_Comm_rank(P.comm, &rank);
652 if (rank == 0)
653 C.save(timing_cache_filename);
654 last_save = wct_seconds();
655 }
656 }
657
658 U = step.get_companion(os, P, mesh, S, C, reserved, do_timings);
659 os << "#\n";
660
661 size_t mm = U.ram_total();
662 if (mm > peak) { ipeak = depth; peak = mm; }
663
664 return U;
665 }/*}}}*/
666 struct configurations_to_test {/*{{{*/
667 /* This is the list of dimensions or the MPI mesh that
668 * we're going to try. For the moment, it's limited to
669 * either {1*1, r*r}, or only one of them. In principle,
670 * the framework could be extended to support further
671 * mesh sizes, but that requires support in scatter_mat
672 * and gather_mat to dispatch from m nodes to n nodes.
673 */
674 std::vector<unsigned int> mesh_sizes;
675 std::vector<lingen_substep_schedule::fft_type_t> ffts;
676 /* The various choices for the "shrink" and "batch" options are
677 * not listed here, but are rather computed on the fly in the
678 * optimize() function. This seems more convenient there, as it's
679 * easier to do pruning based on the mesh sizes and such.
680 */
681 bool do_timings = true;
configurations_to_testlingen_tuner::configurations_to_test682 configurations_to_test() {
683 ffts = { lingen_substep_schedule::FFT_NONE };
684 #ifndef SELECT_MPFQ_LAYER_u64k1
685 ffts.push_back(lingen_substep_schedule::FFT_FLINT);
686 #else
687 ffts.push_back(lingen_substep_schedule::FFT_TERNARY);
688 ffts.push_back(lingen_substep_schedule::FFT_CANTOR);
689 #endif
690 }
from_stored_hintslingen_tuner::configurations_to_test691 bool from_stored_hints(std::ostream& os, lingen_tuner & tuner, tuner_persistent_data & persist, lingen_call_companion::key const & K) {/*{{{*/
692 lingen_hints const & stored_hints(persist.stored_hints);
693 lingen_platform const & P(tuner.P);
694
695 if (stored_hints.find(K) == stored_hints.end())
696 return false;
697 os << ("# Re-using stored schedule\n");
698 /* This is _not_ a complete set ! We still must compute the
699 * timings, the RAM, and so on.
700 *
701 * This is done via tune_local_at_depth_mp_or_mul, which
702 * reads stored_hints a second time. In particular, it
703 * fetches the info of the previously selected fft types for
704 * both operations, and eventually sets U.mp and U.mul. This
705 * implies that we do not need to restrict the list of
706 * allowed ffts here.
707 */
708 unsigned int mesh = stored_hints.at(K).mesh;
709
710 if (mesh == UINT_MAX) mesh = P.r;
711 if (mesh > 1 && mesh != P.r) {
712 throw std::runtime_error(
713 fmt::sprintf(
714 "stored schedule is invalid,"
715 " we cannot (yet) run on a %d*%d grid"
716 " a schedule meant for a %d*%d grid\n",
717 P.r, P.r,
718 mesh, mesh));
719 }
720
721 mesh_sizes = { mesh };
722 os << fmt::format(FMT_STRING("# Forcing {} at depth {} L={}\n"),
723 tuner.strat_name[mesh], K.depth, K.L);
724
725 return true;
726 }/* }}} */
explainlingen_tuner::configurations_to_test727 std::string explain(tuning_thresholds_t const & T, std::string const & k) {
728 if (T.has(k)) {
729 return fmt::format(FMT_STRING(" tuning_threshold[{}]={}"), k, T[k]);
730 } else {
731 return fmt::format(FMT_STRING(" tuning_threshold[{}]=undef"), k);
732 }
733 }
734 typedef tuning_thresholds_t T_t;
from_thresholds_mesh_levellingen_tuner::configurations_to_test735 bool from_thresholds_mesh_level(std::ostream& os, lingen_tuner & tuner, tuner_persistent_data & persist, lingen_call_companion::key const & K) {/*{{{*/
736 T_t const & T(tuner.tuning_thresholds);
737 lingen_platform const & P(tuner.P);
738 unsigned int L = K.L;
739 /* first decision: do we force recursion. There are various
740 * reasons to do so, since we have various thresholds that
741 * implicitly enable recursion */
742 std::vector<std::pair<unsigned int, const char *>> all_rec;
743 for(auto k : T_t::thresholds_verbs) {
744 if (k == T_t::notiming) continue;
745 if (T.has(k))
746 all_rec.push_back(std::make_pair(T[k], k));
747 }
748 /* If no threshold talks about being recursive, then it's
749 * left to our decision. We keep all possible mesh sizes a
750 * priori. However, we must pay attention to the fact that we
751 * may know already that some mesh sizes should be discarded
752 * (this can only happen if at an earlier level, the current mesh
753 * size X was beaten by a larger one Y, which means that the
754 * minimum mesh size was at most X for the previous size, and
755 * is not at most Y, not more).
756 */
757 if (all_rec.empty()) {
758 mesh_sizes.clear();
759 if (persist.minimum_mesh <= 0) mesh_sizes.push_back(0);
760 if (persist.minimum_mesh <= 1) mesh_sizes.push_back(1);
761 if (P.r != 1 && persist.minimum_mesh <= P.r)
762 mesh_sizes.push_back(P.r);
763 return true;
764 }
765 std::sort(all_rec.begin(), all_rec.end());
766
767 mesh_sizes.clear();
768 unsigned int next_mesh = 1;
769 std::string explainer;
770 for(auto const & tk : all_rec) {
771 if (L < tk.first)
772 break;
773 if (tk.second == T_t::collective)
774 next_mesh = P.r;
775
776 mesh_sizes = { next_mesh };
777 explainer = tk.second;
778 }
779 std::ostringstream explanation;
780 bool done = false;
781 if (mesh_sizes.empty()) {
782 /* Given that we have identified at least one threshold
783 * that says "go recursive" and that we decided that we
784 * _don't_ go recursive, the conclusion is here that we
785 * want to do basecase only.
786 */
787 mesh_sizes = { 0 };
788 explanation << explain(T, all_rec.front().second);
789 done = true;
790 } else {
791 explanation << explain(T, explainer);
792 constexpr const char * ck = T_t::collective;
793 if (!T.has(ck) && P.r > 1) {
794 mesh_sizes.push_back(P.r);
795 explanation << explain(T, ck);
796 }
797 }
798
799 os << "# Testing only";
800 for(auto mesh : mesh_sizes)
801 os << " " << tuner.strat_name[mesh];
802 os << fmt::format(FMT_STRING(" at depth {} L={} since"), K.depth, K.L)
803 << explanation.str()
804 << "\n";
805 return done;
806 }/*}}}*/
from_thresholds_fft_levellingen_tuner::configurations_to_test807 bool from_thresholds_fft_level(std::ostream& os, lingen_tuner & tuner, lingen_call_companion::key const & K) {/*{{{*/
808 T_t const & T(tuner.tuning_thresholds);
809 /* second decision: should we restrict the set of ffts ? */
810 /* In the following, we rely on the fact that the asymptotic
811 * ordering is known in advance. If we add more cases, this
812 * assumption will likely not hold anymore
813 *
814 * Note that by design, if we arrive here, we completed the
815 * function find_configurations_mesh_level and got a true value,
816 * meaning that L is larger than at least one of the recursive
817 * thresholds. Therefore, checking against "recursive" or
818 * "collective" doesn't make sense.
819 */
820 unsigned int L = K.L;
821 ffts.clear();
822 /* Okay, it's a bit of spaghetti, but the small and concise
823 * loop would not be easier to understand.
824 */
825 std::ostringstream explanation;
826 #ifdef SELECT_MPFQ_LAYER_u64k1
827 bool hc = T.has(T_t::cantor);
828 bool ht = T.has(T_t::ternary);
829 unsigned int tc = T[T_t::cantor];
830 unsigned int tt = T[T_t::ternary];
831 if (hc && ht) {
832 if (tc >= tt) {
833 if (L >= tc) {
834 ffts.push_back(lingen_substep_schedule::FFT_CANTOR);
835 explanation << explain(T, T_t::cantor);
836 } else if (L >= tt) {
837 ffts.push_back(lingen_substep_schedule::FFT_TERNARY);
838 explanation << explain(T, T_t::ternary);
839 explanation << explain(T, T_t::cantor);
840 } else {
841 ffts.push_back(lingen_substep_schedule::FFT_NONE);
842 explanation << explain(T, T_t::recursive);
843 explanation << explain(T, T_t::ternary);
844 }
845 } else if (tt >= tc) {
846 /* quite unlikely except for ridiculously small
847 * matrices perhaps */
848 if (L >= tt) {
849 ffts.push_back(lingen_substep_schedule::FFT_TERNARY);
850 explanation << explain(T, T_t::ternary);
851 } else if (L >= tc) {
852 ffts.push_back(lingen_substep_schedule::FFT_CANTOR);
853 explanation << explain(T, T_t::cantor);
854 explanation << explain(T, T_t::ternary);
855 } else {
856 ffts.push_back(lingen_substep_schedule::FFT_NONE);
857 explanation << explain(T, T_t::recursive);
858 explanation << explain(T, T_t::cantor);
859 }
860 }
861 } else if (hc) {
862 if (L >= tc) {
863 ffts.push_back(lingen_substep_schedule::FFT_CANTOR);
864 explanation << explain(T, T_t::cantor);
865 } else {
866 ffts.push_back(lingen_substep_schedule::FFT_NONE);
867 ffts.push_back(lingen_substep_schedule::FFT_TERNARY);
868 explanation << explain(T, T_t::recursive);
869 explanation << explain(T, T_t::cantor);
870 }
871 } else if (ht) {
872 if (L >= tt) {
873 ffts.push_back(lingen_substep_schedule::FFT_TERNARY);
874 ffts.push_back(lingen_substep_schedule::FFT_CANTOR);
875 explanation << explain(T, T_t::ternary);
876 } else {
877 ffts.push_back(lingen_substep_schedule::FFT_NONE);
878 explanation << explain(T, T_t::recursive);
879 explanation << explain(T, T_t::ternary);
880 }
881 } else {
882 ffts.push_back(lingen_substep_schedule::FFT_NONE);
883 ffts.push_back(lingen_substep_schedule::FFT_TERNARY);
884 ffts.push_back(lingen_substep_schedule::FFT_CANTOR);
885 explanation << explain(T, T_t::recursive);
886 }
887 #else
888 bool hf = T.has(T_t::flint);
889 bool tf = T[T_t::flint];
890 if (hf) {
891 if (L >= tf) {
892 ffts.push_back(lingen_substep_schedule::FFT_FLINT);
893 explanation << explain(T, T_t::flint);
894 } else {
895 ffts.push_back(lingen_substep_schedule::FFT_NONE);
896 explanation << explain(T, T_t::recursive);
897 explanation << explain(T, T_t::flint);
898 }
899 } else {
900 ffts.push_back(lingen_substep_schedule::FFT_NONE);
901 ffts.push_back(lingen_substep_schedule::FFT_FLINT);
902 explanation << explain(T, T_t::recursive);
903 }
904 #endif
905 os << "# Testing only";
906 for(auto fft : ffts)
907 os << " " << lingen_substep_schedule::fft_name(fft);
908 os << fmt::format(FMT_STRING(" at depth {} L={} since"), K.depth, K.L)
909 << explanation.str()
910 << "\n";
911 return true;
912 }/*}}}*/
913 };/*}}}*/
914
915 /* Determine which configurations will undergo testing. This covers
916 * the single/collective, basecase/recursive choices, as well as the
917 * fft choice. This does NOT cover the shrink and batch setting,
918 * which are covered in tune_local_at_depth_mp_or_mul() and optimize()
919 */
find_configurations_to_testlingen_tuner920 configurations_to_test find_configurations_to_test(std::ostream & os, tuner_persistent_data & persist, lingen_call_companion::key const & K) {/*{{{*/
921 unsigned int L = K.L;
922 configurations_to_test C;
923 bool impose_hints(persist.impose_hints);
924
925 if (C.from_stored_hints(os, *this, persist, K))
926 return C;
927
928 if (L >= tuning_thresholds[tuning_thresholds_t::notiming])
929 C.do_timings = false;
930
931 if (!recursion_makes_sense(L)) {
932 C.mesh_sizes = { 0 };
933 C.ffts.clear();
934 return C;
935 }
936
937 if (impose_hints) {
938 os << "# No stored schedule found in provided file,"
939 " computing new one\n";
940 }
941 /* Try with the tuning_thresholds */
942
943 if (C.from_thresholds_mesh_level(os, *this, persist, K)) {
944 /* then the decision was trivial because lingen_thresholds
945 * told us nothing. No need to bother with fft types either.
946 */
947 return C;
948 }
949
950 C.from_thresholds_fft_level(os, *this, K);
951 return C;
952 }/*}}}*/
953
tune_local_at_depthlingen_tuner954 bool tune_local_at_depth(std::ostream& os, tuner_persistent_data & persist, int depth)/*{{{*/
955 {
956 tuner_persistent_data::level_strategy_map & best(persist.best);
957 unsigned int & minimum_mesh(persist.minimum_mesh);
958 lingen_hints & hints(persist.hints);
959
960 auto cws = calls_and_weights_at_depth(depth);
961
962 // output first goes to a temp string until we're sure that we're
963 // doing timings.
964
965 std::ostringstream os_pre;
966 bool timed_something = false;
967
968 os_pre << fmt::format(FMT_STRING("####################### Measuring time at depth {} #######################\n"), depth);
969
970 ASSERT_ALWAYS(cws.size() <= 2);
971
972 /* Given a mesh size (or 0 to mean "basecase"), this returns a
973 * pair (number of calls, sum of the time over all calls)
974 */
975 std::map<unsigned int, std::pair<unsigned int, double>> mesh_tt_weighted;
976
977 lingen_call_companion U_typical;
978
979 for(size_t idx = 0 ; idx < cws.size() ; idx++) {
980 auto const & cw(cws[idx]);
981 size_t L, Lleft, Lright;
982 unsigned int weight;
983 std::tie(L, Lleft, Lright, weight) = cw;
984 if (!L) continue;
985 double ratio = weight / (double) (1U << depth);
986 os_pre << fmt::sprintf("# input size %zu, %u times [%.1f%%]\n",
987 L, weight, 100*ratio);
988 }
989 os_pre << "#\n";
990
991 bool do_all_timings = true;
992
993 unsigned int ncalls_at_depth = 0;
994
995 /* This goes to the "preliminary" buffer until we're sure that
996 * there's timing to print!
997 */
998 std::ostream * p_talk = &os_pre;
999
1000 for(size_t idx = 0 ; idx < cws.size() ; idx++) {
1001 auto const & cw(cws[idx]);
1002 size_t L, Lleft, Lright;
1003 unsigned int weight;
1004 std::tie(L, Lleft, Lright, weight) = cw;
1005
1006 /* the weight is the number of calls that must be made
1007 * with this input length. The sum of weights at this
1008 * depth must equal the total input length, whatever the
1009 * level */
1010 ASSERT_ALWAYS(weight);
1011
1012 if (!L) {
1013 /* This is a non-call. It's only here in situations where
1014 * at depth D, we have x calls of size 1, and then
1015 * ((1<<D)-x) calls of size 0, which obviously correspond
1016 * to nothing.
1017 *
1018 * One may question the meaning of the "weight" of these
1019 * non-calls. I'd hazard the idea that 0 is most
1020 * appropriate.
1021 */
1022 best[L] = { 0, 0 };
1023 mesh_tt_weighted[0].first = 0; // += weight ?
1024 continue;
1025 }
1026
1027 lingen_call_companion::key K { depth, L };
1028
1029 /* We **MUST** create hints[K], at this point. We will gain
1030 * _some_ insight from stored_hints[] and maybe from the
1031 * thresholds passed on the command line, but in any case we
1032 * will have to recompute the timings and the RAM usage */
1033
1034 if (hints.find(K) != hints.end()) {
1035 /* Then we only have to adjust, I think. At any rate, if
1036 * we end up doing hints[K]=U here, we're going to
1037 * overwrite the previously computed data, which sounds a
1038 * bad idea. */
1039 hints[K].total_ncalls += weight;
1040 ncalls_at_depth += weight;
1041 continue;
1042 }
1043
1044 configurations_to_test CF = find_configurations_to_test(os_pre, persist, K);
1045
1046 std::map<unsigned int, lingen_call_companion> mesh_res;
1047 std::map<unsigned int, double> mesh_tt;
1048 std::map<unsigned int, double> mesh_tt_children;
1049
1050 if (!CF.do_timings) do_all_timings = false;
1051
1052 if (!CF.do_timings && CF.mesh_sizes.size() != 1)
1053 throw std::invalid_argument("Cannot have \"notiming\" in tuning_thresholds without specified thresholds for all choices");
1054
1055 if (CF.do_timings && !timed_something) {
1056 os << os_pre.str();
1057 os_pre.str();
1058 p_talk = &os;
1059 timed_something = true;
1060 }
1061
1062 for(auto mesh : CF.mesh_sizes) {
1063 lingen_call_companion U;
1064 U.mesh = mesh;
1065 if (mesh == 0) {
1066 U.ttb = compute_and_report_basecase(*p_talk, L, CF.do_timings);
1067 mesh_res[mesh] = U;
1068 if (!CF.do_timings) continue;
1069 mesh_tt[mesh] = U.ttb;
1070 mesh_tt_weighted[mesh].first += weight;
1071 mesh_tt_weighted[mesh].second += mesh_tt[mesh] * weight;
1072 } else {
1073 U.mp = tune_local_at_depth_mp_or_mul(
1074 *p_talk,
1075 persist, cw, depth, mesh,
1076 CF.ffts, CF.do_timings,
1077 op_mul_or_mp_base::OP_MP);
1078 U.mul = tune_local_at_depth_mp_or_mul(
1079 *p_talk,
1080 persist, cw, depth, mesh,
1081 CF.ffts, CF.do_timings,
1082 op_mul_or_mp_base::OP_MUL);
1083 mesh_res[mesh] = U;
1084 if (!CF.do_timings) continue;
1085 mesh_tt[mesh] = U.mp.tt.t + U.mul.tt.t;
1086 mesh_tt_children[mesh] += best[Lleft].second;
1087 mesh_tt_children[mesh] += best[Lright].second;
1088 mesh_tt[mesh] += mesh_tt_children[mesh];
1089 mesh_tt_weighted[mesh].first += weight;
1090 mesh_tt_weighted[mesh].second += mesh_tt[mesh] * weight;
1091 }
1092 }
1093
1094 /* find the best mesh value */
1095 unsigned int meshbest = UINT_MAX;
1096 double ttbest = DBL_MAX;
1097 if (CF.do_timings) {
1098 for(auto x : mesh_tt) {
1099 if (x.second < ttbest) {
1100 meshbest = x.first;
1101 ttbest = x.second;
1102 }
1103 }
1104 } else {
1105 meshbest = CF.mesh_sizes.front();
1106 }
1107
1108 lingen_call_companion U = mesh_res[meshbest];
1109 U.total_ncalls = 0;
1110 U.complete = true;
1111 /* we no longer store ttb in the call companions which
1112 * are intended for recursion */
1113 hints[K] = U;
1114 U_typical = U;
1115
1116 if (CF.do_timings) {
1117 /* discard mesh values that are less than the winner and
1118 * appear to be slow enough that we don't think they'll
1119 * ever catch up.
1120 */
1121 for(auto x : mesh_tt) {
1122 if (x.first == meshbest || x.first > meshbest)
1123 continue;
1124 if (x.second >= basecase_keep_until * mesh_tt[meshbest]) {
1125 (*p_talk) << fmt::sprintf("# Discarding %s from now on\n",
1126 strat_name[x.first]);
1127 minimum_mesh = x.first + 1;
1128 }
1129 }
1130 }
1131
1132 /* At this point, we started using collective operations,
1133 * which means that we don't want to go back. A priori.
1134 * But it's quite difficult indeed because we might see
1135 * some spurious results at small sizes. */
1136 // if (mesh > 1 && minimum_mesh <= 1) minimum_mesh = 2;
1137
1138 best[L] = { meshbest, CF.do_timings ? mesh_tt[meshbest] : -1 };
1139
1140 hints[K].total_ncalls += weight;
1141 ncalls_at_depth += weight;
1142
1143 }
1144
1145 /* Now give a summary at this level */
1146
1147 size_t L0 = std::get<0>(cws.front());
1148 size_t L1 = std::get<0>(cws.back());
1149 /* calls_and_weights_at_depth must return a sorted list */
1150 ASSERT_ALWAYS(L0 <= L1);
1151 unsigned int mesh0 = best[L0].first;
1152 unsigned int mesh1 = best[L1].first;
1153
1154 if (do_all_timings) {
1155 for(auto mesh : mesh_all) {
1156 if (mesh_tt_weighted.find(mesh) != mesh_tt_weighted.end()) {
1157 double tt = mesh_tt_weighted[mesh].second;
1158 // std::string rescaled;
1159 if (mesh_tt_weighted[mesh].first != (1U << depth)) {
1160 double ratio = mesh_tt_weighted[mesh].first / (double) (1U << depth);
1161 // rescaled = fmt::sprintf("[rescaled from %.1f%%] ", 100*ratio);
1162 tt /= ratio;
1163 }
1164 (*p_talk) << fmt::sprintf("# %s (%u calls): %.2f [%.1fd]\n",
1165 strat_name[mesh],
1166 ncalls_at_depth,
1167 // rescaled,
1168 tt, tt / 86400);
1169 }
1170 }
1171
1172 double tt_total = best[L0].second * std::get<3>(cws.front())
1173 + best[L1].second * std::get<3>(cws.back());
1174
1175 if (mesh0 == mesh1) {
1176 (*p_talk) << fmt::sprintf("# BEST: %s: %.2f [%.1fd]\n",
1177 strat_name[mesh0],
1178 tt_total, tt_total / 86400);
1179 } else {
1180 (*p_talk) << fmt::sprintf("# BEST: mix of %s and %s: %.2f [%.1fd]\n",
1181 strat_name[mesh0],
1182 strat_name[mesh1],
1183 tt_total, tt_total / 86400);
1184 }
1185 }
1186
1187 if (mesh0 || mesh1) {
1188 lingen_call_companion U = U_typical;
1189 if (U.mp.ram_total() > U.mul.ram_total()) {
1190 (*p_talk) << fmt::sprintf("# (memory(MP): %s, incl %s reserved)\n",
1191 size_disp(U.mp.ram_total()),
1192 size_disp(U.mp.reserved_ram));
1193 } else {
1194 (*p_talk) << fmt::sprintf("# (memory(MUL): %s, incl %s reserved)\n",
1195 size_disp(U.mul.ram_total()),
1196 size_disp(U.mul.reserved_ram));
1197 }
1198
1199 }
1200 return timed_something;
1201 }/*}}}*/
tune_locallingen_tuner1202 lingen_hints tune_local(std::ostream& os, lingen_hints & stored_hints) {/*{{{*/
1203 size_t N = m*n*L/(m+n);
1204 char buf[20];
1205
1206 /* same mechanism in tune_local_at_depth ; output goes to a
1207 * separate stringstream until we're sure that there's
1208 * interesting output somewhere.
1209 */
1210 std::ostringstream os_pre;
1211 bool timed_something = false;
1212 std::ostream * p_talk = &os_pre;
1213
1214 (*p_talk) << fmt::sprintf("# Measuring lingen data"
1215 " for N ~ %zu m=%u n=%u"
1216 " for a %zu-bit prime p,"
1217 " using a %u*%u grid of %u-thread nodes"
1218 " [max target RAM = %s]\n",
1219 N, m, n, mpz_sizeinbase(p, 2),
1220 P.r, P.r, P.T,
1221 size_disp(P.available_ram, buf));
1222 #ifdef HAVE_OPENMP
1223 (*p_talk) << fmt::sprintf("# Note: non-cached basecase measurements"
1224 " are done using openmp as it is configured"
1225 " for the running code, that is, with %d threads\n",
1226 P.openmp_threads);
1227 #endif
1228
1229 mesh_all.push_back(0); strat_name[0] = "basecase";
1230 mesh_all.push_back(1); strat_name[1] = "recursive(single-node)";
1231 if (P.r > 1) {
1232 mesh_all.push_back(P.r);
1233 strat_name[P.r] = fmt::sprintf("recursive(%d*%d-nodes)", P.r, P.r);
1234 }
1235
1236 int fl = log2(L-1) + 1; /* ceil(log_2(L)) */
1237
1238 tuner_persistent_data persist(stored_hints);
1239
1240 /* with basecase_keep_until == 0, then we never measure basecase */
1241 if (basecase_keep_until == 0)
1242 persist.minimum_mesh = 1;
1243
1244 if (persist.impose_hints) {
1245 (*p_talk) << fmt::sprintf("# While we are doing timings here,"
1246 " we'll take schedule decisions based on the hints"
1247 " found in %s when they apply\n", schedule_filename);
1248 }
1249
1250 for(int i = fl ; i>=0 ; i--) {
1251 bool timed_here = tune_local_at_depth((*p_talk), persist, i);
1252 if (timed_here && !timed_something) {
1253 os << os_pre.str();
1254 os_pre.str();
1255 p_talk = &os;
1256 timed_something = true;
1257 }
1258 }
1259
1260 tuner_persistent_data::level_strategy_map & best(persist.best);
1261 lingen_hints & hints(persist.hints);
1262 size_t peak(persist.peak);
1263 int ipeak(persist.ipeak);
1264
1265 /*****************************************************************/
1266 /* make the mesh sizes monotonic. In truth, we can only do this
1267 * safely for mesh size 0 (basecase), since otherwise we would
1268 * have an inconsistency in the sub-block sizes, which would make
1269 * the batch values invalid. */
1270 /* keys in the hint table are sorted as "top-level first" */
1271 std::map<unsigned int, lingen_call_companion::key> max_win_per_mesh;
1272 for(auto const & x : hints) {
1273 if (max_win_per_mesh.size() == mesh_all.size()) break;
1274 if (max_win_per_mesh.find(x.second.mesh) != max_win_per_mesh.end())
1275 max_win_per_mesh[x.second.mesh] = x.first;
1276 }
1277 for(auto & x : hints) {
1278 for(auto const & y : max_win_per_mesh) {
1279 if (y.first) continue; // see above
1280 if (x.second.mesh > y.first && x.first < y.second) {
1281 (*p_talk) << fmt::format(FMT_STRING("## forcing %s at ({}) since it is known to win at ({})\n"), strat_name[y.first],
1282 y.first, y.second);
1283 x.second.mesh = y.first;
1284 }
1285 }
1286 }
1287
1288 os << ("######################### Summary of timing data #######################\n");
1289 if (!tuning_thresholds.empty()) {
1290 std::ostringstream ss;
1291 for(auto const & x : tuning_thresholds) {
1292 if (!ss.str().empty()) ss << ",";
1293 ss << x.first << ':' << x.second;
1294 }
1295 os << fmt::sprintf("# Using explicit tuning_thresholds=%s (from command-line)\n", ss.str());
1296 }
1297 if (!timed_something) {
1298 os << "# note: no real timing data is reported, since the notiming\n"
1299 << "# threshold has explicitly prevented all measurements\n";
1300 }
1301
1302 size_t size_com0;
1303 double tt_com0;
1304 std::tie(size_com0, tt_com0) = mpi_threshold_comm_and_time();
1305 os << fmt::sprintf("# Communication time at lingen_mpi_threshold (%s): %.2f [%.1fd]\n", size_disp(size_com0, buf), tt_com0, tt_com0/86400);
1306 double time_best = best[L].second;
1307 if (time_best != -1) {
1308 time_best += tt_com0;
1309 os << fmt::sprintf("# Expected total time: %.2f [%.1fd], peak memory %s (at depth %d)\n", time_best, time_best / 86400, size_disp(peak, buf), ipeak);
1310 }
1311 hints.ipeak=ipeak;
1312 hints.peak=peak;
1313 os << fmt::sprintf("(%u,%u,%u,%.1f,%1.f)\n",m,n,P.r,time_best,(double)peak/1024./1024./1024.);
1314
1315 /* This one is strictly linear anyway */
1316 hints.tt_gather_per_unit = tt_com0 / 2 / L;
1317 hints.tt_scatter_per_unit = tt_com0 / 2 / L;
1318
1319 return hints;
1320 }/*}}}*/
tunelingen_tuner1321 lingen_hints tune(std::ostream& os) {/*{{{*/
1322 int rank;
1323 MPI_Comm_rank(P.comm, &rank);
1324 lingen_hints hints;
1325
1326 if (rank == 0) {
1327 lingen_hints stored_hints;
1328 if (schedule_filename) {
1329 std::ifstream is(schedule_filename);
1330 if (is && is >> stored_hints) {
1331 /* This one _always_ goes to stdout */
1332 std::cout << fmt::sprintf("# Read tuning schedule from %s\n", schedule_filename);
1333 } else {
1334 std::cerr << fmt::sprintf("# Failed to read tuning schedule from %s\n", schedule_filename);
1335 }
1336 }
1337 hints = tune_local(os, stored_hints);
1338 if (schedule_filename && stored_hints.empty()) {
1339 std::ofstream os(schedule_filename);
1340 if (os && os << hints) {
1341 /* This one _always_ goes to stdout */
1342 std::cout << fmt::sprintf("# Written tuning schedule to %s\n", schedule_filename);
1343 } else {
1344 std::cerr << fmt::sprintf("# Failed to write tuning schedule to %s\n", schedule_filename);
1345 }
1346 }
1347 }
1348
1349 hints.share(0, P.comm);
1350
1351 return hints;
1352 }/*}}}*/
1353 };
1354
1355 const std::vector<const char *>
1356 lingen_tuner::tuning_thresholds_t::thresholds_verbs
1357 {
1358 recursive,
1359 collective,
1360 // "collective" should allow finer grain, so as to allow
1361 // multiple mesh sizes.
1362 // "plain" makes no sense per se, as it's the base thing
1363 // to use as soon as we go recursive
1364 #ifdef SELECT_MPFQ_LAYER_u64k1
1365 ternary,
1366 cantor,
1367 #else
1368 flint,
1369 #endif
1370 notiming,
1371 };
1372
1373 const std::vector<std::pair<lingen_substep_schedule::fft_type_t, const char *>> lingen_tuner::tuning_thresholds_t::code_to_key
1374 #ifdef SELECT_MPFQ_LAYER_u64k1
1375 {
1376 { lingen_substep_schedule::FFT_NONE, recursive, },
1377 { lingen_substep_schedule::FFT_TERNARY, ternary, },
1378 { lingen_substep_schedule::FFT_CANTOR, cantor },
1379 };
1380 #else
1381 {
1382 { lingen_substep_schedule::FFT_NONE, recursive, },
1383 { lingen_substep_schedule::FFT_FLINT, flint, },
1384 };
1385 #endif
1386
1387 /* Need this for conformance */
1388 constexpr const char * lingen_tuner::tuning_thresholds_t::recursive;
1389 constexpr const char * lingen_tuner::tuning_thresholds_t::collective;
1390 constexpr const char * lingen_tuner::tuning_thresholds_t::ternary;
1391 constexpr const char * lingen_tuner::tuning_thresholds_t::cantor;
1392 constexpr const char * lingen_tuner::tuning_thresholds_t::flint;
1393 constexpr const char * lingen_tuner::tuning_thresholds_t::notiming;
1394
1395
lingen_tuning(bw_dimensions & d,size_t L,MPI_Comm comm,cxx_param_list & pl)1396 lingen_hints lingen_tuning(bw_dimensions & d, size_t L, MPI_Comm comm, cxx_param_list & pl)
1397 {
1398 lingen_tuner::output_info O(pl);
1399 if (O.quiet) {
1400 std::ostringstream os;
1401 return lingen_tuner(os, d, L, comm, pl).tune(os);
1402 } else if (O.tuning_log_filename) {
1403 std::ofstream os(O.tuning_log_filename, std::ios_base::out);
1404 return lingen_tuner(os, d, L, comm, pl).tune(os);
1405 } else {
1406 return lingen_tuner(std::cout, d, L, comm, pl).tune(std::cout);
1407 }
1408 }
1409
lingen_tuning_decl_usage(cxx_param_list & pl)1410 void lingen_tuning_decl_usage(cxx_param_list & pl)
1411 {
1412 lingen_tuner::declare_usage(pl);
1413 }
1414
lingen_tuning_lookup_parameters(cxx_param_list & pl)1415 void lingen_tuning_lookup_parameters(cxx_param_list & pl)
1416 {
1417 lingen_tuner::lookup_parameters(pl);
1418 }
1419
1420