1 #include "cado.h" // IWYU pragma: keep
2
3 // IWYU pragma: no_include <sys/param.h>
4 // IWYU pragma: no_include <memory>
5
6 #ifdef SELECT_MPFQ_LAYER_u64k1
7 #error "lingen_tune_cutoffs does not work with binary, at least for the moment"
8 #endif
9
10 #include <cstdio>
11 #include <cstdlib>
12 #include <cstring>
13 #include <climits> // for UINT_MAX
14 #include <cstdint> // for uint64_t
15 #ifdef HAVE_SIGHUP
16 #include <csignal>
17 #endif
18 #include <vector>
19 #include <utility>
20 #include <map>
21 #include <string>
22 #include <iostream> // std::cout
23 #include <sstream> // IWYU pragma: keep
24 #include <type_traits> // for __strip_reference_wrapper...
25
26 #include <gmp.h> // for mp_limb_t, gmp_randclear
27
28 #include "timing.h" // for seconds, wct_seconds, cpu...
29 #include "tree_stats.hpp" // for tree_stats
30
31 #include "macros.h"
32 #include "cxx_mpz.hpp"
33 #ifndef SELECT_MPFQ_LAYER_u64k1
34 #include "lingen_polymat.hpp"
35 #endif
36 #include "lingen_matpoly_select.hpp"
37 #include "lingen_fft_select.hpp" // IWYU pragma: keep
38 // #include "lingen_bigpolymat.h" // 20150826: deleted.
39 #include "lingen_matpoly_ft.hpp"
40 #include "lingen_abfield.hpp" // IWYU pragma: keep
41 #include "lingen_bw_dimensions.hpp"
42 #include "lingen_tune_cutoffs.hpp"
43 #include "params.h"
44
45 using namespace std;
46
lingen_tune_cutoffs_decl_usage(cxx_param_list & pl)47 void lingen_tune_cutoffs_decl_usage(cxx_param_list & pl)
48 {
49 param_list_decl_usage(pl, "B", "minimum end of bench span window");
50 param_list_decl_usage(pl, "catchsig", "enable intercepting ^C");
51 }
52
lingen_tune_cutoffs_lookup_parameters(cxx_param_list & pl)53 void lingen_tune_cutoffs_lookup_parameters(cxx_param_list & pl)
54 {
55 param_list_lookup_string(pl, "B");
56 param_list_lookup_string(pl, "catchsig");
57 }
58
59 /* interface to C programs for list of cutoffs we compute *//*{{{*/
60 /* This is the simple C-type cutoff table which is used down the line by
61 * C programs */
62
63 struct cutoff_list_s {
64 unsigned int k; /* this is the number of coefficients (length) of
65 * pi. Admittedly, it is slightly bogus to have
66 * this as a main variable. Length [k] for pi
67 * corresponds to an "input length" of [k*(m+n)/m].
68 * In the MP case, this means that E has length
69 * [input_length+k-1], and the output E_right has
70 * length [input_length]. */
71 unsigned int choice; /* semantics vary. For FFT-based stuff,
72 * this is the depth reduction from the
73 * flit default.
74 */
75 };
76
77 typedef cutoff_list_s * cutoff_list;
78
79 /* cutoff list *ALWAYS* finish with UINT_MAX, UINT_MAX */
cutoff_list_get(cutoff_list cl,unsigned int k)80 unsigned int cutoff_list_get(cutoff_list cl, unsigned int k)
81 {
82 if (!cl) return 0;
83 if (k < cl->k) return 0;
84 for( ; k >= cl->k ; cl++);
85 return (--cl)->choice;
86 }
87 /*}}}*/
88
89 /* The -B argument may be used to request printing of timing results at
90 * least up to this input length */
91 unsigned int bench_atleast_uptothis = 0;
92
93 /* timer backends *//*{{{*/
94 #ifdef HAVE_GCC_STYLE_AMD64_INLINE_ASM
95 struct timer_rdtsc {
operator ()timer_rdtsc96 inline double operator()() const {
97 uint64_t c = cputicks();
98 return c/1.0e9;
99 }
timer_unittimer_rdtsc100 static const char * timer_unit() { return "Gccyc"; }
101 };
102 #endif
103
104
105 struct timer_rusage {
operator ()timer_rusage106 inline double operator()() const { return seconds(); }
timer_unittimer_rusage107 static const char * timer_unit() { return "seconds (rusage)"; }
108 };
109 struct timer_wct {
operator ()timer_wct110 inline double operator()() const { return wct_seconds(); }
timer_unittimer_wct111 static const char * timer_unit() { return "seconds (wct)"; }
112 };
113 /* It is important, when doing MPI benches, that we use this algorithm */
114 struct timer_wct_synchronized {
operator ()timer_wct_synchronized115 inline double operator()() const {
116 double d = wct_seconds();
117 MPI_Allreduce(MPI_IN_PLACE, &d, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
118 return d;
119 }
120 };
121 /*}}}*/
122
123 /* how do we do one single measurements ? We'll do it many times, but how
124 * long ? Here is the place for making the corresponding choices.
125 */
126 struct measurement_choice {
127 int enough_repeats;
128 double minimum_time;
129 double enough_time;
measurement_choicemeasurement_choice130 measurement_choice() {
131 /* defaults */
132 enough_repeats = 1000;
133 minimum_time = 0.5;
134 enough_time = 1.0;
135 }
136 };
137
138 template<typename T>
139 struct small_bench {
140 measurement_choice mc;
141 double& tfinal;
142 double last_starting_tt;
143 /* accumulated_time so far. Can be increased either with
144 * - add_since_last() (which resets our timer)
145 * - set_since_last() (which does not)
146 * - inject() (which does not either, since we do not care about our
147 * own timer in this case).
148 */
149 double tt;
150
151 int n;
small_benchsmall_bench152 explicit small_bench(double& t, measurement_choice mc = measurement_choice()) : mc(mc), tfinal(t) {tt=n=0;reset();}
153 /* are we done ? when we are, we set tfinal to the average value. */
donesmall_bench154 inline int done() {
155 if (tt < mc.enough_time) {
156 if (tt < mc.minimum_time && n < mc.enough_repeats)
157 return 0;
158 }
159 // fprintf(stderr, "%.2f %d\n", tt, n);
160 tfinal = tt / n;
161 return 1;
162 }
operator ++small_bench163 inline small_bench& operator++() { n++; return *this; }
resetsmall_bench164 inline void reset() { last_starting_tt = T()(); }
165 /* adds to tt the time since the last timer reset, and resets the
166 * timer. */
add_since_lastsmall_bench167 inline void add_since_last(int weight = 1) {
168 double now = T()();
169 tt += weight * (now - last_starting_tt);
170 last_starting_tt = now;
171 }
172 /* *sets* tt to the time since the last timer reset. Do not reset the
173 * timer. */
set_since_lastsmall_bench174 inline void set_since_last(int weight = 1) {
175 double now = T()();
176 tt = weight * (now - last_starting_tt);
177 }
injectsmall_bench178 inline void inject(double t, int weight = 1) { tt += weight*t; }
179 };
180
181 /* {{{ cutoff_finder and its child small_bench */
182 template<typename Timer_backend = timer_rusage>
183 struct cutoff_finder {
184 measurement_choice mc;
185 double scale;
186 int stable_cutoff_break;
187 unsigned int ntests;
188 int stable;
189 vector<double> benches;
190 vector<bool> meaningful;
191 vector<pair<unsigned int, pair<vector<double>, int> > > all_results;
192 map<int, string> method_names;
193 double slowness_ratio;
194 unsigned int age_slow_discard;
195
cutoff_findercutoff_finder196 cutoff_finder(unsigned int ntests, measurement_choice mc = measurement_choice())
197 :
198 mc(mc),
199 ntests(ntests),
200 benches(ntests),
201 meaningful(ntests, true)
202 {
203 /* Fill defaults */
204 scale = 1.1;
205 stable_cutoff_break = 4;
206 stable = 0;
207 slowness_ratio = 2;
208 age_slow_discard = 5;
209 }
210
set_method_namecutoff_finder211 inline void set_method_name(int k, string const& s) {
212 method_names.insert(make_pair(k, s));
213 }
method_namecutoff_finder214 inline string method_name(int i) const {
215 ostringstream v;
216 map<int, string>::const_iterator z = method_names.find(i);
217 if (z == method_names.end()) {
218 v << "method " << i;
219 return v.str();
220 }
221 return z->second;
222 }
223
next_lengthcutoff_finder224 inline unsigned int next_length(unsigned int k) const {
225 return MAX(k+1, scale*k);
226 }
227
donecutoff_finder228 inline int done() const {
229 int nactive=0;
230 for(unsigned int i = 0 ; i < ntests ; i++) {
231 nactive += meaningful[i];
232 }
233 if (nactive > 1) return 0;
234 return !(stable < stable_cutoff_break);
235 }
236
still_meaningful_to_testcutoff_finder237 inline bool still_meaningful_to_test(int i) { return meaningful[i]; }
238
summarize_for_this_lengthcutoff_finder239 string summarize_for_this_length(unsigned int k)
240 {
241 std::ostringstream comments;
242 int best = -1;
243 for(unsigned int i = 0 ; i < ntests ; i++) {
244 if (meaningful[i] && (best < 0 || benches[i] < benches[best])) best = i;
245 }
246 ASSERT_ALWAYS(best >= 0);
247 all_results.push_back(make_pair(k,
248 make_pair(benches, best)));
249
250 if (all_results.empty() || best != all_results.back().second.second) {
251 stable = 0;
252 } else {
253 stable++;
254 }
255
256 vector<bool> was_meaningful = meaningful;
257
258 /* try to discard those which are slow. We live on the assumption
259 * that the higher-numbered strategies win eventually, therefore
260 * we don't discard them if [best] is smaller currently */
261 for(int i = 0 ; i < best ; i++) {
262 /* already dead ? */
263 if (!meaningful[i]) continue;
264
265 /* for how long has it been slow ? */
266 unsigned int age_slow = 0;
267 for( ; age_slow < all_results.size() ; age_slow++) {
268 vector<double> const& these(all_results[all_results.size()-1-age_slow].second.first);
269 int best_there = all_results[all_results.size()-1-age_slow].second.second;
270 /* do not discard this method if a less advanced one is
271 * still alive.
272 */
273 if (best_there < i) break;
274 /* not so slow */
275 if (these[i] <= these[best_there] * slowness_ratio) break;
276 }
277 if (age_slow >= age_slow_discard) {
278 /* ok, discard */
279 comments << " ; discarding " << method_name(i);
280 meaningful[i] = false;
281 }
282 }
283 ostringstream s;
284 s << k;
285 for(unsigned int i = 0 ; i < ntests ; i++) {
286 if (was_meaningful[i]) {
287 s << " " << benches[i];
288 if ((int) i==best) s << '#';
289 } else {
290 s << " *";
291 }
292 benches[i] = 999999;
293 }
294 s << " " << method_name(best) << comments.str();
295 return s.str();
296 }
297
micro_benchcutoff_finder298 small_bench<Timer_backend> micro_bench(int i) {
299 return small_bench<Timer_backend>(benches[i], mc);
300 }
micro_benchcutoff_finder301 small_bench<Timer_backend> micro_bench(int i, measurement_choice mc1) {
302 return small_bench<Timer_backend>(benches[i], mc1);
303 }
304
305 /* This is really limited to karatsuba-like cuttofs. It's ugly */
export_best_tablecutoff_finder306 vector<pair<unsigned int, int> > export_best_table()
307 {
308 vector<pair<unsigned int, int> > steps;
309 steps.push_back(make_pair(1,0));
310
311 typedef vector<pair<unsigned int, pair<vector<double>, int> > >::const_iterator it_t;
312 for(it_t it = all_results.begin() ; it != all_results.end() ; ++it) {
313 unsigned int size = it->first;
314 int best = it->second.second;
315 if (steps.empty() || best != steps.back().second) {
316 steps.push_back(make_pair(size, best));
317 }
318 }
319 return steps;
320 }
export_kara_cutoff_datacutoff_finder321 vector<pair<unsigned int, int> > export_kara_cutoff_data(struct polymat_cutoff_info * dst)
322 {
323 /* This size will eventually feed the ->subdivide field for the
324 * cutoff info */
325 unsigned int first_kara_size = UINT_MAX;
326
327 /* This size will eventually feed the ->cut field for the
328 * cutoff info. For sizes above this, we will _always_ use
329 * karatsuba. */
330 unsigned int first_alwayskara_size = UINT_MAX;
331
332 vector<pair<unsigned int, int> > steps;
333 steps.push_back(make_pair(1,0));
334
335 typedef vector<pair<unsigned int, pair<vector<double>, int> > >::const_iterator it_t;
336 for(it_t it = all_results.begin() ; it != all_results.end() ; ++it) {
337 unsigned int size = it->first;
338 vector<double> const& benches(it->second.first);
339 int best = it->second.second;
340 /* In case fft wins, we invent something which will use
341 * karatsuba still */
342 if (best > 1) best = benches[1] < benches[0];
343 if (steps.empty() || best != steps.back().second) {
344 steps.push_back(make_pair(size, best));
345 if (best == 1 && size < first_kara_size)
346 first_kara_size = size;
347 /* assign it multiple times */
348 first_alwayskara_size = size;
349 }
350 }
351 dst->cut = first_alwayskara_size;
352 dst->subdivide = first_kara_size;
353 dst->table_size = steps.size();
354 dst->table = (unsigned int (*)[2]) realloc(dst->table,
355 dst->table_size * sizeof(unsigned int[2]));
356 for(unsigned int v = 0 ; v < dst->table_size ; v++) {
357 dst->table[v][0] = steps[v].first;
358 dst->table[v][1] = steps[v].second;
359 };
360 return steps;
361 }
export_kara_cutoff_data_force_kara_nowcutoff_finder362 vector<pair<unsigned int, int> > export_kara_cutoff_data_force_kara_now(struct polymat_cutoff_info * dst, unsigned int size)
363 {
364 vector<double> allz(ntests);
365 all_results.push_back(make_pair(size, make_pair(allz, 1)));
366 vector<pair<unsigned int, int> > x = export_kara_cutoff_data(dst);
367 all_results.pop_back();
368 return x;
369 }
print_resultcutoff_finder370 static string print_result(vector<pair<unsigned int, int> > const& tab) {
371 ostringstream s;
372 s << "{";
373 typedef vector<pair<unsigned int, int> >::const_iterator it_t;
374 for(it_t y = tab.begin() ; y != tab.end() ; y++) {
375 s << " { " << y->first << ", " << y->second << " },";
376 }
377 s << " }";
378 return s.str();
379 }
380
381 };
382 /* }}} */
383
384 #ifdef HAVE_SIGHUP
385 double last_hup = 0;
386 int hup_caught = 0;
387
sighandler(int sig MAYBE_UNUSED)388 void sighandler(int sig MAYBE_UNUSED)
389 {
390 double t = wct_seconds();
391 if (t < last_hup + 0.5) {
392 fprintf(stderr, "Interrupt twice in half a second; exiting\n");
393 exit(1);
394 }
395 last_hup = wct_seconds();
396 hup_caught++;
397 }
398
399
catch_control_signals()400 void catch_control_signals()
401 {
402 struct sigaction sa[1];
403 memset(sa, 0, sizeof(sa));
404 sa->sa_handler = sighandler;
405 sigaction(SIGHUP, sa, NULL);
406 sigaction(SIGINT, sa, NULL);
407
408 /*
409 * should play sigprocmask and friends
410 sigset_t sset[1];
411 sigemptyset(sset);
412 sigaddset(sset, SIGHUP);
413 */
414 }
415 #endif
416
417 /* This code will first try to bench the basic operations (first
418 * local; global are later) of middle product E*pi and product pi*pi.
419 * The goal is to see where is the good value for the thresholds:
420 * polymat_mul_kara_threshold
421 * polymat_mp_kara_threshold
422 */
423
lingen_tune_mul_fti_depth(abdst_field ab,unsigned int m,unsigned int n,cutoff_list * cl_out)424 void lingen_tune_mul_fti_depth(abdst_field ab, unsigned int m, unsigned int n, cutoff_list *cl_out)/*{{{*/
425 {
426 gmp_randstate_t rstate;
427 gmp_randinit_default(rstate);
428 gmp_randseed_ui(rstate, 1);
429
430 typedef timer_rdtsc timer_t;
431
432 int nadjs=7;
433 measurement_choice mc;
434 mc.enough_time = 2.0;
435 mc.minimum_time = 0.1;
436 mc.enough_repeats = 100;
437
438 measurement_choice mcout = mc;
439 mcout.enough_repeats = 2;
440 mc.enough_time = 1.0;
441
442 cutoff_finder<timer_t> finder(nadjs, mcout);
443 finder.slowness_ratio = 1.4;
444 // finder.age_slow_discard = 5;
445 finder.scale = 1.01;
446
447 for(int i = 0 ; i < nadjs ; i++) {
448 ostringstream o;
449 o << "depth_adj=" << nadjs-1-i;
450 finder.set_method_name(i, o.str());
451 }
452
453 cout << "# Tuning FFT depth adjustment (mul) for"
454 << " m=" << m
455 << ", n=" << n
456 << "\n";
457
458 cout << "# Timings reported in " << timer_t::timer_unit() << "\n";
459 cout << "# inputlength, ncoeffs, various depth adjustments";
460 cout << "\n";
461
462 /* for multiplication */
463 cout << "# Note: for input length k within lingen,"
464 << " we use ncoeffs=k*m/(m+n) = "<<(double)m/(m+n)<<"*k\n";
465
466 /* This is for forcing the bench to run until a large length. This is
467 * unnecessary here */
468 unsigned int min_bench = 0;
469
470 /* Beware, k is the length of piL, not a degree. Hence length 1
471 * clearly makes no sense */
472 for(unsigned int k = 2 ; !hup_caught && (k < min_bench || !finder.done()) ; k=finder.next_length(k)) {
473 unsigned int input_length = (m+n) * k / m;
474
475 abvec A, B, C;
476 void *tA, *tB, *tC;
477
478 mpz_t p;
479 mpz_init(p);
480 abfield_characteristic(ab, p);
481
482 abvec_init(ab, &A, k);
483 abvec_init(ab, &B, k);
484 abvec_init(ab, &C, 2*k-1);
485
486 abvec_random(ab, A, k, rstate);
487 abvec_random(ab, B, k, rstate);
488 abvec_set_zero(ab, C, 2*k-1);
489
490 ostringstream extra_info;
491
492 for(int index = 0 ; index < nadjs ; index++) {
493 struct fft_transform_info fti[1];
494 fft_transform_info_init_fppol(fti, p, k, k, m+n);
495 fft_transform_info_set_first_guess(fti);
496 if (index == 0) {
497 extra_info << "(from " << fti->depth << ")";
498 }
499 if (nadjs-1-index >= fti->depth) {
500 finder.benches[index] = 999999999;
501 continue;
502 }
503 if (fti->depth >= 11 && index > 0) {
504 finder.benches[index] = 999999999;
505 continue;
506 }
507 fft_transform_info_adjust_depth(fti, nadjs-1-index);
508
509 size_t fft_alloc_sizes[3];
510 fft_transform_info_get_alloc_sizes(fti, fft_alloc_sizes);
511 void * tt = malloc(fft_alloc_sizes[2]);
512 void * qt = malloc(fft_alloc_sizes[1]);
513
514 tA = malloc(fft_alloc_sizes[0]);
515 tB = malloc(fft_alloc_sizes[0]);
516 tC = malloc(fft_alloc_sizes[0]);
517
518 typedef small_bench<timer_t> bt;
519 for(bt x = finder.micro_bench(index); !x.done(); ++x) {
520
521 double t_dftA=0;
522 for(bt y = bt(t_dftA, mc); !y.done(); ++y) {
523 fft_prepare(fti, tA);
524 fft_dft(fti, tA, (mp_limb_t*)A, k, qt);
525 y.set_since_last();
526 }
527 x.inject(t_dftA, (m+n)*(m+n));
528
529 double t_dftB=0;
530 for(bt y = bt(t_dftB, mc); !y.done(); ++y) {
531 fft_prepare(fti, tB);
532 fft_dft(fti, tB, (mp_limb_t*)B, k, qt);
533 y.set_since_last();
534 }
535 x.inject(t_dftB, (m+n)*(m+n));
536
537 double t_conv=0;
538 for(bt y = bt(t_conv, mc); !y.done(); ++y) {
539 fft_prepare(fti, tC);
540 fft_zero(fti, tC);
541 fft_addcompose(fti, tC, tA, tB, tt, qt);
542 y.set_since_last();
543 }
544 x.inject(t_conv, (m+n)*(m+n)*(m+n));
545
546 double t_iftC=0;
547 for(bt y = bt(t_conv, mc); !y.done(); ++y) {
548 fft_prepare(fti, tC);
549 fft_ift(fti, (mp_limb_t*)C, 2*k-1, tC, qt);
550 y.set_since_last();
551 }
552 x.inject(t_iftC, (m+n)*(m+n));
553 }
554 free(tA);
555 free(tB);
556 free(tC);
557 free(tt);
558 free(qt);
559 }
560
561 abvec_clear(ab, &A, k);
562 abvec_clear(ab, &B, k);
563 abvec_clear(ab, &C, 2*k-1);
564
565 mpz_clear(p);
566
567 cout << input_length
568 << " " << finder.summarize_for_this_length(k)
569 << extra_info.str()
570 << "\n";
571 }
572 hup_caught = 0;
573
574 vector<pair<unsigned int, int> > table = finder.export_best_table();
575
576 typedef vector<pair<unsigned int, int> >::iterator it_t;
577 for(it_t x = table.begin() ; x != table.end() ; ++x)
578 x->second = nadjs-1-x->second;
579
580 cout << "/* FFT depth adjustments for "
581 << (m)<<"*"<<(m+n)
582 <<" times "
583 << (m+n)<<"*"<<(m+n)<<" products */\n";
584 cout << "#define MUL_FTI_DEPTH_ADJ_" <<m+n<<"_"<<(m+n)<<"_"<<(m+n)
585 << " " << finder.print_result(table) << endl;
586
587 if (cl_out) {
588 *cl_out = (cutoff_list) malloc((table.size()+1)*sizeof(**cl_out));
589 for(unsigned int i = 0 ; i < table.size(); i++) {
590 (*cl_out)[i].k = table[i].first;
591 (*cl_out)[i].choice = table[i].second;
592 }
593 (*cl_out)[table.size()].k = UINT_MAX;
594 (*cl_out)[table.size()].choice = UINT_MAX;
595 }
596
597 gmp_randclear(rstate);
598 }/*}}}*/
lingen_tune_mp_fti_depth(abdst_field ab,unsigned int m,unsigned int n,cutoff_list * cl_out)599 void lingen_tune_mp_fti_depth(abdst_field ab, unsigned int m, unsigned int n, cutoff_list * cl_out)/*{{{*/
600 {
601 gmp_randstate_t rstate;
602 gmp_randinit_default(rstate);
603 gmp_randseed_ui(rstate, 1);
604
605 typedef timer_rdtsc timer_t;
606
607 int nadjs=7;
608 measurement_choice mc;
609 mc.enough_time = 2.0;
610 mc.minimum_time = 0.1;
611 mc.enough_repeats = 100;
612
613 measurement_choice mcout = mc;
614 mcout.enough_repeats = 2;
615 mc.enough_time = 1.0;
616
617 cutoff_finder<timer_t> finder(nadjs, mcout);
618 finder.slowness_ratio = 1.4;
619 // finder.age_slow_discard = 5;
620 finder.scale = 1.01;
621
622 for(int i = 0 ; i < nadjs ; i++) {
623 ostringstream o;
624 o << "depth_adj=" << nadjs-1-i;
625 finder.set_method_name(i, o.str());
626 }
627
628 cout << "# Tuning FFT depth adjustment (mp) for"
629 << " m=" << m
630 << ", n=" << n
631 << "\n";
632
633 cout << "# Timings reported in " << timer_t::timer_unit() << "\n";
634 cout << "# inputlength, ncoeffs, various depth adjustments";
635 cout << "\n";
636
637 /* for multiplication */
638 cout << "# Note: for input length k within lingen,"
639 << " we use ncoeffs=k*m/(m+n) = "<<(double)m/(m+n)<<"*k\n";
640
641 /* This is for forcing the bench to run until a large length. This is
642 * unnecessary here */
643 unsigned int min_bench = 0;
644
645 /* Beware, k is the length of piL, not a degree. Hence length 1
646 * clearly makes no sense */
647 for(unsigned int k = 2 ; !hup_caught && (k < min_bench || !finder.done()) ; k=finder.next_length(k)) {
648 unsigned int input_length = (m+n) * k / m;
649
650 unsigned int E_length = k + input_length - 1;
651
652 abvec A, B, C;
653 void *tA, *tB, *tC;
654
655 mpz_t p;
656 mpz_init(p);
657 abfield_characteristic(ab, p);
658
659 abvec_init(ab, &A, E_length);
660 abvec_init(ab, &B, k);
661 abvec_init(ab, &C, input_length);
662
663 abvec_random(ab, A, E_length, rstate);
664 abvec_random(ab, B, k, rstate);
665 abvec_set_zero(ab, C, input_length);
666
667 ostringstream extra_info;
668
669 for(int index = 0 ; index < nadjs ; index++) {
670 struct fft_transform_info fti[1];
671 fft_transform_info_init_fppol_mp(fti, p, k, E_length, m+n);
672 fft_transform_info_set_first_guess(fti);
673 if (index == 0) {
674 extra_info << "(from " << fti->depth << ")";
675 }
676 if (nadjs-1-index >= fti->depth) {
677 finder.benches[index] = 999999999;
678 continue;
679 }
680 if (fti->depth >= 11 && index > 0) {
681 finder.benches[index] = 999999999;
682 continue;
683 }
684 fft_transform_info_adjust_depth(fti, nadjs-1-index);
685
686 size_t fft_alloc_sizes[3];
687 fft_transform_info_get_alloc_sizes(fti, fft_alloc_sizes);
688 void * tt = malloc(fft_alloc_sizes[2]);
689 void * qt = malloc(fft_alloc_sizes[1]);
690
691 tA = malloc(fft_alloc_sizes[0]);
692 tB = malloc(fft_alloc_sizes[0]);
693 tC = malloc(fft_alloc_sizes[0]);
694
695 typedef small_bench<timer_t> bt;
696 for(bt x = finder.micro_bench(index); !x.done(); ++x) {
697
698 double t_dftA=0;
699 for(bt y = bt(t_dftA, mc); !y.done(); ++y) {
700 fft_prepare(fti, tA);
701 fft_dft(fti, tA, (mp_limb_t*)A, E_length, qt);
702 y.set_since_last();
703 }
704 x.inject(t_dftA, m*(m+n));
705
706 double t_dftB=0;
707 for(bt y = bt(t_dftB, mc); !y.done(); ++y) {
708 fft_prepare(fti, tB);
709 fft_dft(fti, tB, (mp_limb_t*)B, k, qt);
710 y.set_since_last();
711 }
712 x.inject(t_dftB, (m+n)*(m+n));
713
714 double t_conv=0;
715 for(bt y = bt(t_conv, mc); !y.done(); ++y) {
716 fft_prepare(fti, tC);
717 fft_zero(fti, tC);
718 fft_addcompose(fti, tC, tA, tB, tt, qt);
719 y.set_since_last();
720 }
721 x.inject(t_conv, m*(m+n)*(m+n));
722
723 double t_iftC=0;
724 for(bt y = bt(t_conv, mc); !y.done(); ++y) {
725 fft_prepare(fti, tC);
726 fft_ift(fti, (mp_limb_t*)C, input_length, tC, qt);
727 y.set_since_last();
728 }
729 x.inject(t_iftC, m*(m+n));
730 }
731 free(tA);
732 free(tB);
733 free(tC);
734 free(tt);
735 free(qt);
736 }
737
738 abvec_clear(ab, &A, E_length);
739 abvec_clear(ab, &B, k);
740 abvec_clear(ab, &C, input_length);
741
742
743 cout << input_length
744 << " " << finder.summarize_for_this_length(k)
745 << extra_info.str()
746 << "\n";
747 }
748 hup_caught = 0;
749
750 vector<pair<unsigned int, int> > table = finder.export_best_table();
751
752 typedef vector<pair<unsigned int, int> >::iterator it_t;
753 for(it_t x = table.begin() ; x != table.end() ; ++x)
754 x->second = nadjs-1-x->second;
755
756 cout << "/* FFT depth adjustments for "
757 << (m)<<"*"<<(m+n)
758 <<" times "
759 << (m+n)<<"*"<<(m+n)<<" middle-products */\n";
760 cout << "#define MP_FTI_DEPTH_ADJ_" <<m<<"_"<<(m+n)<<"_"<<(m+n)
761 << " " << finder.print_result(table) << endl;
762
763 if (cl_out) {
764 *cl_out = (cutoff_list) malloc((table.size()+1)*sizeof(**cl_out));
765 for(unsigned int i = 0 ; i < table.size(); i++) {
766 (*cl_out)[i].k = table[i].first;
767 (*cl_out)[i].choice = table[i].second;
768 }
769 (*cl_out)[table.size()].k = UINT_MAX;
770 (*cl_out)[table.size()].choice = UINT_MAX;
771 }
772
773 gmp_randclear(rstate);
774 }/*}}}*/
775
776
lingen_tune_mul(abdst_field ab,unsigned int m,unsigned int n,cutoff_list cl MAYBE_UNUSED)777 void lingen_tune_mul(abdst_field ab, unsigned int m, unsigned int n, cutoff_list cl MAYBE_UNUSED)/*{{{*/
778 {
779 typedef fft_transform_info fft_type;
780 gmp_randstate_t rstate;
781 gmp_randinit_default(rstate);
782 gmp_randseed_ui(rstate, 1);
783 #define TUNE_MUL_FINDER_NMETHODS 4
784
785 typedef timer_rusage timer_t;
786 cutoff_finder<timer_rusage> finder(TUNE_MUL_FINDER_NMETHODS);
787 finder.set_method_name(0, "polymat-basecase");
788 finder.set_method_name(1, "polymat-karatsuba");
789 finder.set_method_name(2, "matpoly-kronecker-schönhage");
790 finder.set_method_name(3, "matpoly-ft-kronecker-schönhage-caching");
791
792 polymat_cutoff_info always_basecase[1];
793 polymat_cutoff_info improved[1];
794
795 polymat_cutoff_info_init(always_basecase);
796 polymat_cutoff_info_init(improved);
797
798 cout << "# Tuning "
799 << (m+n)<<"*"<<(m+n)
800 <<" times "
801 << (m+n)<<"*"<<(m+n)<<" products\n";
802
803 cout << "# inputlength, ncoeffs";
804 for(unsigned int i = 0 ; i < finder.ntests ; i++) {
805 cout << ", " << finder.method_name(i);
806 }
807 cout << "\n";
808 cout << "# Note: for input length k within lingen,"
809 << " we use ncoeffs=k*m/(m+n) = "<<(double)m/(m+n)<<"*k\n";
810 /* input length k means we consider the pi polynomial which is
811 * created by k successive steps. So this mulitplication, in effect,
812 * occurs at the end of a recursive procedure of length 2k, which
813 * collects the pi matrices of its two child calls.
814 */
815
816 /* make sure we don't stop the bench absurdly early. We set the bench
817 * as input_length \approx 2000 */
818 unsigned int min_bench = bench_atleast_uptothis * m / (m+n);
819
820 /* Beware, k is the length of piL, not a degree. Hence length 1
821 * clearly makes no sense */
822 for(unsigned int k = 2 ; !hup_caught && (k < min_bench || !finder.done()) ; k=finder.next_length(k)) {
823 unsigned int input_length = (m+n) * k / m;
824 polymat piL (ab, m+n, m+n, k);
825 polymat piR (ab, m+n, m+n, k);
826 polymat pi (ab, m+n, m+n, 2*k-1);
827 polymat piref(ab, m+n, m+n, 2*k-1);
828 piL.fill_random(k, rstate);
829 piR.fill_random(k, rstate);
830
831 ostringstream extra_info;
832
833 if (finder.still_meaningful_to_test(0)) {
834 /* disable kara for a minute */
835 polymat_set_mul_kara_cutoff(always_basecase, NULL);
836 for(small_bench<timer_t> x = finder.micro_bench(0); !x.done(); ++x) {
837 pi.mul(piL, piR);
838 x.set_since_last();
839 }
840 if (piref.get_size() == 0) {
841 piref = std::move(pi);
842 // fprintf(stderr, "BASIS0\n");
843 } else if (pi.cmp(piref) != 0) {
844 fprintf(stderr, "MISMATCH0!\n");
845 }
846 }
847
848 if (finder.still_meaningful_to_test(1)) {
849 /* This temporarily sets the cutoff table to enable karatsuba for
850 * length >=k (hence for this test) at least, possibly using
851 * karatsuba one or more times in the recursive calls depending
852 * on what has been measured as best so far.
853 */
854 finder.export_kara_cutoff_data_force_kara_now(improved, k);
855 polymat_set_mul_kara_cutoff(improved, NULL);
856 for(small_bench<timer_t> x = finder.micro_bench(1); !x.done(); ++x) {
857 pi.mul(piL, piR);
858 x.set_since_last();
859 }
860 if (piref.get_size() == 0) {
861 piref = std::move(pi);
862 // fprintf(stderr, "BASIS1\n");
863 } else if (pi.cmp(piref) != 0) {
864 fprintf(stderr, "MISMATCH1!\n");
865 }
866 }
867
868 /* don't exaggerate our memory requirements */
869 matpoly xpiL;
870 xpiL.set_polymat(piL);
871 piL = polymat();
872
873 matpoly xpiR;
874 xpiR.set_polymat(piR);
875 piR = polymat();
876
877 matpoly xpiref;
878 if (piref.get_size()) {
879 xpiref.set_polymat(piref);
880 piref = polymat();
881 }
882
883 matpoly xpi(ab, m+n, m+n, 2*k-1);
884 pi = polymat();
885
886 /* we should make the effort of converting polymat to matpoly,
887 * right ? */
888
889 if (finder.still_meaningful_to_test(2)) {
890 /* The matpoly layer is just completetly different -- and gets
891 * faster quite early on... */
892 for(small_bench<timer_t> x = finder.micro_bench(2); !x.done(); ++x) {
893 xpi = matpoly::mul(xpiL, xpiR);
894 x.set_since_last();
895 }
896 if (xpiref.get_size() == 0) {
897 xpiref = std::move(xpi);
898 // fprintf(stderr, "BASIS2\n");
899 } else if (xpi.cmp(xpiref) != 0) {
900 fprintf(stderr, "MISMATCH2!\n");
901 }
902 }
903
904 if (finder.still_meaningful_to_test(3)) {
905 unsigned int adj = cutoff_list_get(cl, k);
906 {
907 ostringstream o;
908 o << "matpoly-ft-kronecker-schönhage-caching@adj" << adj;
909 finder.set_method_name(3, o.str());
910 }
911 #if 0
912 matpoly_ft tpi, tpiL, tpiR;
913 mpz_t p;
914 mpz_init(p);
915 abfield_characteristic(ab, p);
916 struct fft_transform_info fti[1];
917 fft_get_transform_info_fppol(fti, p, xpiL->size, xpiR->size, xpiL->n);
918 int s = 0;
919 matpoly_clear(ab, xpi);
920 matpoly_init(ab, xpi, m+n, m+n, xpiL->size + xpiR->size - 1);
921 for(small_bench<timer_t> x = finder.micro_bench(3); !x.done(); ++x) {
922 matpoly_ft_init(fti, ab, tpiL, xpiL->m, xpiL->n);
923 matpoly_ft_init(fti, ab, tpiR, xpiR->m, xpiR->n);
924 matpoly_ft_init(fti, ab, tpi, xpiL->m, xpiR->n);
925 matpoly_ft_dft(fti, ab, tpiL, xpiL);
926 matpoly_ft_dft(fti, ab, tpiR, xpiR);
927 matpoly_ft_mul(fti, ab, tpi, tpiL, tpiR);
928 xpi->size = xpiL->size + xpiR->size - 1;
929 ASSERT_ALWAYS(xpi->size <= xpi->alloc);
930 matpoly_ft_ift(fti, ab, xpi, tpi);
931 matpoly_ft_clear(fti, ab, tpiL);
932 matpoly_ft_clear(fti, ab, tpiR);
933 matpoly_ft_clear(ab, tpi, fti);
934 x.set_since_last();
935 s++;
936 }
937 mpz_clear(p);
938 #else
939 /* TODO: matpoly_mul_caching_adj, being the back-end of
940 * matpoly_mul_caching, is likely to gain a stats argument
941 * someday (not if deemed useless, though). If this happens,
942 * we'll have to add the "stats" argument here only so that the
943 * whole thing compiles. Alas this won't work out of the box, the
944 * stats structure must receive information regarding the
945 * planned substeps, prior to the call. So either we fix the
946 * code here, either we loosen our API complexity to this
947 * regard, or we ditch this tune-cutoffs file entirely.
948 * (anyway the more important plingen_tuning code also needs
949 * some update so that this change can happen).
950 */
951 tree_stats stats;
952 for(small_bench<timer_t> x = finder.micro_bench(3); !x.done(); ++x) {
953 xpi = matpoly_ft<fft_type>::mul_caching_adj(stats, xpiL, xpiR, adj, NULL);
954 x.set_since_last();
955 }
956 #endif
957 if (xpiref.get_size() == 0) {
958 xpiref = std::move(xpi);
959 } else if (xpi.cmp(xpiref) != 0) {
960 fprintf(stderr, "MISMATCH3!\n");
961 }
962 }
963
964 cout << input_length
965 << " " << finder.summarize_for_this_length(k)
966 << extra_info.str()
967 << "\n";
968 }
969 hup_caught = 0;
970
971
972 vector<pair<unsigned int, int> > table = finder.export_kara_cutoff_data(improved);
973 polymat_set_mul_kara_cutoff(improved, NULL);
974
975 cout << "/* Cutoffs (0=basecase, 1=kara) for "
976 << (m+n)<<"*"<<(m+n)
977 <<" times "
978 << (m+n)<<"*"<<(m+n)<<" products: */\n";
979 cout << "#define MUL_CUTOFFS_" <<(m+n)<<"_"<<(m+n)<<"_"<<(m+n)
980 << " " << finder.print_result(table) << endl;
981 gmp_randclear(rstate);
982
983 polymat_cutoff_info_clear(always_basecase);
984 polymat_cutoff_info_clear(improved);
985 }/*}}}*/
986
lingen_tune_mp(abdst_field ab,unsigned int m,unsigned int n,cutoff_list cl MAYBE_UNUSED)987 void lingen_tune_mp(abdst_field ab, unsigned int m, unsigned int n, cutoff_list cl MAYBE_UNUSED)/*{{{*/
988 {
989 typedef fft_transform_info fft_type;
990 gmp_randstate_t rstate;
991 gmp_randinit_default(rstate);
992 gmp_randseed_ui(rstate, 1);
993 #define TUNE_MP_FINDER_NMETHODS 4
994
995 typedef timer_rusage timer_t;
996 cutoff_finder<timer_rusage> finder(TUNE_MP_FINDER_NMETHODS);
997 finder.set_method_name(0, "polymat-basecase");
998 finder.set_method_name(1, "polymat-karatsuba");
999 finder.set_method_name(2, "matpoly-kronecker-schönhage");
1000 finder.set_method_name(3, "matpoly-ft-kronecker-schönhage-caching");
1001
1002 polymat_cutoff_info always_basecase[1];
1003 polymat_cutoff_info improved[1];
1004
1005 polymat_cutoff_info_init(always_basecase);
1006 polymat_cutoff_info_init(improved);
1007
1008 cout << "# Tuning "
1009 << (m)<<"*"<<(m+n)
1010 <<" times "
1011 << (m+n)<<"*"<<(m+n)<<" middle-products\n";
1012
1013 cout << "# inputlength, ncoeffs";
1014 for(unsigned int i = 0 ; i < finder.ntests ; i++) {
1015 cout << ", " << finder.method_name(i);
1016 }
1017 cout << "\n";
1018
1019 cout << "# Note: for input length k within lingen, "
1020 << " we use MP((1+m/(m+n))k,m/(m+n)k)->k"
1021 << " = MP(" <<1+(double)m/(m+n)<<"*k"
1022 <<", "
1023 <<(double)m/(m+n)<<"*k"
1024 << ")\n";
1025 /* we use the same semantics as for testing the product. input length
1026 * k means we want to count how it takes to do the required MP with
1027 * the pi matrix which has been computed after k successive steps.
1028 * IOW, this is what happens at the middle of a recursive procedure
1029 * on length 2k. In terms of degrees, this will be:
1030 * 2k times m/(m+n)k --> k *BUT*...
1031 * we want coefficients [k..2k[ ; but then, the contribution from the
1032 * first (k-m/(m+n)k) coefficients is useless. Therefore, we rewrite
1033 * the polynomial lengths as:
1034 * (1+m/(m+n))k times m/(m+n)k --> k
1035 */
1036
1037 /* make sure we don't stop the bench absurdly early. We set the bench
1038 * as input_length \approx 2000 */
1039 unsigned int min_bench = bench_atleast_uptothis * m / (m+n);
1040
1041 /* Beware, k is a length of piL, not a degree. Hence length 1 clearly
1042 * makes no sense */
1043 for(unsigned int k = 2 ; !hup_caught && (k < min_bench || !finder.done()) ; k=finder.next_length(k)) {
1044 unsigned int input_length = (m+n) * k / m;
1045 /* MP(degree a, degree b) -> degree b-a
1046 * length la=a+1, length lb=b+1 -> length b-a+1 = lb-la+1
1047 * we do want lc=input_length, we have set la=k, whence lb =
1048 * k+input_length-1
1049 */
1050 unsigned int E_length = k + input_length - 1;
1051 polymat E(ab, m, m+n, E_length);
1052 polymat piL(ab, m+n, m+n, k);
1053 polymat ER(ab, m, m+n, input_length);
1054 polymat ERref(ab, m, m+n, input_length);
1055 E.fill_random(E_length, rstate);
1056 piL.fill_random(k, rstate);
1057
1058 ostringstream extra_info;
1059
1060 if (finder.still_meaningful_to_test(0)) {
1061 /* disable kara for a minute */
1062 polymat_set_mp_kara_cutoff(always_basecase, NULL);
1063 for(small_bench<timer_t> x = finder.micro_bench(0); !x.done(); ++x) {
1064 ER.mp(E, piL);
1065 x.set_since_last();
1066 }
1067 if (ERref.get_size() == 0) {
1068 ERref = std::move(ER);
1069 // fprintf(stderr, "BASIS0\n");
1070 } else if (ER.cmp(ERref) != 0) {
1071 fprintf(stderr, "MISMATCH0!\n");
1072 }
1073 }
1074
1075 if (finder.still_meaningful_to_test(1)) {
1076 /* This temporarily sets the cutoff table to enable karatsuba for
1077 * length >=k (hence for this test) at least, possibly using
1078 * karatsuba one or more times in the recursive calls depending
1079 * on what has been measured as best so far.
1080 */
1081 finder.export_kara_cutoff_data_force_kara_now(improved, k);
1082 polymat_set_mp_kara_cutoff(improved, NULL);
1083 for(small_bench<timer_t> x = finder.micro_bench(1); !x.done(); ++x) {
1084 ER.mp(E, piL);
1085 x.set_since_last();
1086 }
1087 if (ERref.get_size() == 0) {
1088 ERref = std::move(ER);
1089 // fprintf(stderr, "BASIS1\n");
1090 } else if (ER.cmp(ERref) != 0) {
1091 fprintf(stderr, "MISMATCH1!\n");
1092 }
1093 }
1094
1095 /* don't exaggerate our memory requirements */
1096 matpoly xpiL(ab, m+n, m+n, k);
1097 xpiL.set_polymat(piL);
1098 piL = polymat();
1099
1100 matpoly xE(ab, m, m+n, E_length);
1101 xE.set_polymat(E);
1102 E = polymat();
1103
1104 matpoly xERref(ab, m, m+n, input_length);
1105 if (ERref.get_size()) {
1106 xERref.set_polymat(ERref);
1107 }
1108 ERref = polymat();
1109
1110 matpoly xER(ab, m, m+n, input_length);
1111 ER = polymat();
1112
1113 /* we should make the effort of converting polymat to matpoly,
1114 * right ? */
1115
1116 if (finder.still_meaningful_to_test(2)) {
1117 /* The matpoly layer is just completetly different -- and gets
1118 * faster quite early on... */
1119 for(small_bench<timer_t> x = finder.micro_bench(2); !x.done(); ++x) {
1120 xER = matpoly::mp(xE, xpiL);
1121 x.set_since_last();
1122 }
1123 if (xERref.get_size() == 0) {
1124 xERref = std::move(xER);
1125 // fprintf(stderr, "BASIS2\n");
1126 } else if (xER.cmp(xERref) != 0) {
1127 fprintf(stderr, "MISMATCH2!\n");
1128 }
1129 }
1130
1131 if (finder.still_meaningful_to_test(3)) {
1132 unsigned int adj = UINT_MAX;
1133 if (cl) {
1134 adj = cutoff_list_get(cl, k);
1135 ostringstream o;
1136 o << "matpoly-ft-kronecker-schönhage-caching@adj" << adj;
1137 finder.set_method_name(3, o.str());
1138 }
1139 #if 0
1140 matpoly_ft tER, tpiL, tE;
1141 mpz_t p;
1142 mpz_init(p);
1143 abfield_characteristic(ab, p);
1144 struct fft_transform_info fti[1];
1145 fft_get_transform_info_fppol_mp(fti, p, xpiL->size, xE->size, xpiL->m);
1146 int s = 0;
1147 matpoly_clear(ab, xER);
1148 matpoly_init(ab, xER, m, m+n, input_length);
1149 for(small_bench<timer_t> x = finder.micro_bench(3); !x.done(); ++x) {
1150 matpoly_ft_init(fti, ab, tpiL, xpiL->m, xpiL->n);
1151 matpoly_ft_init(fti, ab, tE, xE->m, xE->n);
1152 matpoly_ft_init(fti, ab, tER, xE->m, xpiL->n);
1153 matpoly_ft_dft(fti, ab, tE, xE);
1154 matpoly_ft_dft(fti, ab, tpiL, xpiL);
1155 /* length E_length * length k ==> length input_length
1156 * with E_length = input_length + k - 1
1157 *
1158 * we'll shift by k-1 coefficients, because k is the
1159 * smallest length
1160 */
1161 matpoly_ft_mul(fti, ab, tER, tE, tpiL);
1162 xER->size = input_length;
1163 ASSERT_ALWAYS(xER->size <= xER->alloc);
1164 matpoly_ft_ift_mp(fti, ab, xER, tER, k-1);
1165 matpoly_ft_clear(fti, ab, tpiL);
1166 matpoly_ft_clear(fti, ab, tE);
1167 matpoly_ft_clear(ab, tER, fti);
1168 x.set_since_last();
1169 s++;
1170 }
1171 mpz_clear(p);
1172 #endif
1173 /* see remark above about matpoly_mul_caching gaining a stats
1174 * argument someday */
1175 tree_stats stats;
1176 for(small_bench<timer_t> x = finder.micro_bench(3); !x.done(); ++x) {
1177 xER = matpoly_ft<fft_type>::mp_caching_adj(stats, xE, xpiL, adj, NULL);
1178 x.set_since_last();
1179 }
1180 if (xERref.get_size() == 0) {
1181 xERref = std::move(xER);
1182 } else if (xER.cmp(xERref) != 0) {
1183 fprintf(stderr, "MISMATCH3!\n");
1184 }
1185 }
1186
1187 cout << input_length
1188 << " " << finder.summarize_for_this_length(k)
1189 << extra_info.str()
1190 << "\n";
1191 }
1192 hup_caught = 0;
1193
1194 vector<pair<unsigned int, int> > table = finder.export_kara_cutoff_data(improved);
1195 polymat_set_mp_kara_cutoff(improved, NULL);
1196
1197 cout << "/* Cutoffs (0=basecase, 1=kara) for "
1198 << (m)<<"*"<<(m+n)
1199 <<" times "
1200 << (m+n)<<"*"<<(m+n)<<" middle-products */\n";
1201 cout << "#define MP_CUTOFFS_" <<m<<"_"<<(m+n)<<"_"<<(m+n)
1202 << " " << finder.print_result(table) << endl;
1203 gmp_randclear(rstate);
1204
1205 polymat_cutoff_info_clear(always_basecase);
1206 polymat_cutoff_info_clear(improved);
1207 }/*}}}*/
1208
1209 #if 0
1210 /* 20150826: bigpolymat deleted */
1211 void lingen_tune_bigmul(abdst_field ab, unsigned int m, unsigned int n, unsigned int m1, unsigned int n1, MPI_Comm comm)/*{{{*/
1212 {
1213 int rank;
1214 MPI_Comm_rank(comm, &rank);
1215 gmp_randstate_t rstate;
1216 gmp_randinit_default(rstate);
1217 gmp_randseed_ui(rstate, 1);
1218 /* arguments to the ctor:
1219 * 2 : we are benching 2 methods
1220 * 1 : size makes sense only for >=1
1221 */
1222 cutoff_finder<> finder(2, 1);
1223
1224 bigpolymat model;
1225 bigpolymat_init_model(model, comm, m1, n1);
1226
1227 /* Beware, k is a length, not a degree. Hence length 1 clearly makes
1228 * no sense */
1229 for(unsigned int k = 2 ; !finder.done() ; k=finder.next_length(k)) {
1230 /* Note: we are benching degree k, but the degree we are
1231 * interested in for pi is k*m/(m+n) */
1232 polymat pi, piL, piR;
1233 bigpolymat bpi, bpiL, bpiR;
1234 polymat_init(ab, piL, m+n, m+n, k);
1235 polymat_init(ab, piR, m+n, m+n, k);
1236 polymat_init(ab, pi, m+n, m+n, 2*k-1);
1237 bigpolymat_init(ab, bpiL, model, m+n, m+n, k);
1238 bigpolymat_init(ab, bpiR, model, m+n, m+n, k);
1239 bigpolymat_init(ab, bpi, model, m+n, m+n, 2*k-1);
1240 polymat_fill_random(ab, piL, k, rstate);
1241 polymat_fill_random(ab, piR, k, rstate);
1242 polymat_fill_random(ab, bigpolymat_my_cell(bpiL), k, rstate);
1243 polymat_fill_random(ab, bigpolymat_my_cell(bpiR), k, rstate);
1244
1245 double ttloc;
1246 if (rank == 0) {
1247 for(small_bench<timer_t> x = finder.micro_bench<timer_wct>(ttloc); !x.done(); ++x) {
1248 polymat_mul(ab, pi, piL, piR);
1249 x.set_since_last();
1250 }
1251 }
1252 MPI_Bcast(&ttloc, 1, MPI_DOUBLE, 0, comm);
1253
1254 double ttmpi;
1255 for(auto x = finder.micro_bench<timer_wct_synchronized>(ttmpi); !x.done(); ++x) {
1256 bigpolymat_mul(ab, bpi, bpiL, bpiR);
1257 x.set_since_last();
1258 }
1259 if (rank == 0)
1260 printf("%d %1.6f %1.6f %1.1f\n", k, ttloc, ttmpi, ttmpi/ttloc);
1261 finder.new_winner(k, ttmpi < ttloc); /* < : mpi wins: 1 */
1262
1263 polymat_clear(ab, piL);
1264 polymat_clear(ab, piR);
1265 polymat_clear(ab, pi);
1266 bigpolymat_clear(ab, bpiL);
1267 bigpolymat_clear(ab, bpiR);
1268 bigpolymat_clear(ab, bpi);
1269 }
1270
1271 bigpolymat_clear_model(model);
1272
1273 if (rank == 0) {
1274 cout << "/* Cutoffs for "<<(m+n)<<"*"<<(m+n)<<"*"<<(m+n)<<" MPI products: */\n";
1275 cout << "#define MUL_MPI_CUTOFFS_" <<(m+n)<<"_"<<(m+n)<<"_"<<(m+n)
1276 << " " << finder.result() << endl;
1277 }
1278 gmp_randclear(rstate);
1279 }/*}}}*/
1280 #endif
1281
lingen_tune_cutoffs(bw_dimensions & d,MPI_Comm comm MAYBE_UNUSED,cxx_param_list & pl)1282 void lingen_tune_cutoffs(bw_dimensions & d, MPI_Comm comm MAYBE_UNUSED, cxx_param_list & pl)
1283 {
1284 cxx_mpz p;
1285 gmp_randstate_t rstate;
1286
1287 int catchsig=0;
1288
1289 setbuf(stdout, NULL);
1290 setbuf(stderr, NULL);
1291
1292 param_list_parse_uint(pl, "B", &bench_atleast_uptothis);
1293 param_list_parse_int(pl, "catchsig", &catchsig);
1294
1295 abdst_field ab = d.ab;
1296 unsigned int m = d.m;
1297 unsigned int n = d.n;
1298 abfield_characteristic(d.ab, p);
1299
1300 gmp_randinit_default(rstate);
1301 gmp_randseed_ui(rstate, 1);
1302
1303 if (catchsig) {
1304 #ifdef HAVE_SIGHUP
1305 catch_control_signals();
1306 #else
1307 fprintf(stderr, "Warning: support for signal trapping not enabled at compile time\n");
1308 #endif
1309 }
1310
1311 printf("# Tuning for m=%d n=%d p=[%zu %u-bits words]\n",
1312 m, n, mpz_size(p), GMP_LIMB_BITS);
1313
1314 /* XXX BUG: with depth adjustment==0 early on, we get check failures.
1315 * Must investigate */
1316
1317 cutoff_list cl_mp = NULL;
1318 // lingen_tune_mp_fti_depth(ab, m, n, &cl_mp);
1319 lingen_tune_mp(ab, m, n, cl_mp);
1320
1321 cutoff_list cl_mul = NULL;
1322 // lingen_tune_mul_fti_depth(ab, m, n, &cl_mul);
1323 lingen_tune_mul(ab, m, n, cl_mul);
1324
1325 // int tune_bm_basecase = 1;
1326 int tune_mp = 1;
1327 /* record the list of cutoffs, and the method which wins from there
1328 */
1329
1330 if (tune_mp) {
1331 /* Now for benching mp and plain mul */
1332 unsigned int maxtune = 10000 / (m*n);
1333 /* Bench the products which would come together with a k-steps
1334 * basecase algorithm. IOW a 2k, one-level recursive call incurs
1335 * twice the k-steps basecase, plus once the timings counted here
1336 * (presently, this has Karatsuba complexity)
1337 */
1338 for(unsigned int k = 10 ; k < maxtune ; k+= k/10) {
1339 unsigned int sE = k*(m+2*n)/(m+n);
1340 unsigned int spi = k*m/(m+n);
1341 polymat E(ab, m, m+n, sE);
1342 polymat piL(ab, m+n, m+n, spi);
1343 polymat piR(ab, m+n, m+n, spi);
1344 polymat pi(ab, m+n, m+n, spi*2);
1345 polymat Er(ab, m, m+n, sE-spi+1);
1346 E.set_size(sE);
1347 for(unsigned int v = 0 ; v < E.m * E.n * E.get_size() ; v++) {
1348 abrandom(ab, abvec_coeff_ptr(ab, E.x, v), rstate);
1349 }
1350 piL.set_size(spi);
1351 piR.set_size(spi);
1352 for(unsigned int v = 0 ; v < piL.m * piL.n * piL.get_size() ; v++) {
1353 abrandom(ab, abvec_coeff_ptr(ab, piL.x, v), rstate);
1354 abrandom(ab, abvec_coeff_ptr(ab, piR.x, v), rstate);
1355 }
1356 double ttmp = 0, ttmul = 0;
1357 ttmp -= seconds();
1358 Er.mp(E, piL);
1359 ttmp += seconds();
1360 ttmul -= seconds();
1361 pi.mul(piL, piR);
1362 ttmul += seconds();
1363 double ttmpq = ttmp / (k*k);
1364 double ttmulq = ttmul / (k*k);
1365 double ttmpk = ttmp / pow(k, 1.58);
1366 double ttmulk = ttmul / pow(k, 1.58);
1367 printf("%u [%.2e+%.2e = %.2e] [%.2e+%.2e = %.2e]\n",
1368 k,
1369 ttmpq, ttmulq, ttmpq + ttmulq,
1370 ttmpk, ttmulk, ttmpk + ttmulk
1371 );
1372 // (seconds()-tt) / (k*k)); // ((sE-spi) * spi) / (m*(m+n)*(m+n)));
1373 // printf("%zu %.2e\n", E.get_size(), (seconds()-tt) / (k*k)); // (spi * spi) / ((m+n)*(m+n)*(m+n)));
1374 }
1375 }
1376
1377 #if 0 /* for basecase with polymat */
1378 if (tune_bm_basecase) {
1379 bmstatus bm;
1380 bmstatus_init(bm, m, n);
1381 abfield_specify(bm->d.ab, MPFQ_PRIME_MPZ, (mpz_srcptr) p);
1382 unsigned int t0 = bm->t;
1383 unsigned int maxtune = 10000 / (m * n);
1384 for(unsigned int k = 10 ; k < maxtune ; k += k/10) {
1385 unsigned int * delta = new unsigned int[m + n];
1386 polymat E, pi;
1387 polymat_init(ab, pi, 0, 0, 0);
1388 polymat_init(ab, E, m, m+n, maxtune);
1389 E->size = k;
1390 for(unsigned int v = 0 ; v < E->m * E->n * E->size ; v++) {
1391 abrandom(ab, E->x[v], rstate);
1392 }
1393 double tt = seconds();
1394 for(unsigned int j = 0 ; j < m + n ; delta[j++]=1);
1395 bm->t = 1;
1396 bw_lingen_basecase(bm, pi, E, delta);
1397 printf("%zu %.2e\n", E->size, (seconds()-tt) / (k * k));
1398 polymat_clear(ab, pi);
1399 polymat_clear(ab, E);
1400 delete[] delta;
1401 }
1402 bmstatus_clear(bm);
1403 }
1404 #endif
1405
1406 #if 0
1407 /* This should normally be reasonably quick, and running it every
1408 * time can be considered as an option */
1409 if (rank == 0) {
1410 lingen_tune_mul(ab, m, n);
1411 lingen_tune_mp(ab, m, n);
1412 }
1413 extern polymat_cutoff_info polymat_mul_kara_cutoff;
1414 extern polymat_cutoff_info polymat_mp_kara_cutoff;
1415 bigpolymat_bcast_polymat_cutoff(&polymat_mul_kara_cutoff, 0, comm);
1416 bigpolymat_bcast_polymat_cutoff(&polymat_mp_kara_cutoff, 0, comm);
1417
1418 lingen_tune_bigmul(ab, m, n, mpi[0]*thr[0], mpi[1]*thr[1], comm);
1419 #endif
1420
1421 gmp_randclear(rstate);
1422
1423 return;
1424 }
1425
1426
1427