1 #include "cado.h" // IWYU pragma: keep
2 
3 // IWYU pragma: no_include <sys/param.h>
4 // IWYU pragma: no_include <memory>
5 
6 #ifdef SELECT_MPFQ_LAYER_u64k1
7 #error "lingen_tune_cutoffs does not work with binary, at least for the moment"
8 #endif
9 
10 #include <cstdio>
11 #include <cstdlib>
12 #include <cstring>
13 #include <climits>                         // for UINT_MAX
14 #include <cstdint>                         // for uint64_t
15 #ifdef  HAVE_SIGHUP
16 #include <csignal>
17 #endif
18 #include <vector>
19 #include <utility>
20 #include <map>
21 #include <string>
22 #include <iostream>     // std::cout
23 #include <sstream> // IWYU pragma: keep
24 #include <type_traits>                      // for __strip_reference_wrapper...
25 
26 #include <gmp.h>                            // for mp_limb_t, gmp_randclear
27 
28 #include "timing.h"                         // for seconds, wct_seconds, cpu...
29 #include "tree_stats.hpp"                   // for tree_stats
30 
31 #include "macros.h"
32 #include "cxx_mpz.hpp"
33 #ifndef SELECT_MPFQ_LAYER_u64k1
34 #include "lingen_polymat.hpp"
35 #endif
36 #include "lingen_matpoly_select.hpp"
37 #include "lingen_fft_select.hpp" // IWYU pragma: keep
38 // #include "lingen_bigpolymat.h" // 20150826: deleted.
39 #include "lingen_matpoly_ft.hpp"
40 #include "lingen_abfield.hpp" // IWYU pragma: keep
41 #include "lingen_bw_dimensions.hpp"
42 #include "lingen_tune_cutoffs.hpp"
43 #include "params.h"
44 
45 using namespace std;
46 
lingen_tune_cutoffs_decl_usage(cxx_param_list & pl)47 void lingen_tune_cutoffs_decl_usage(cxx_param_list & pl)
48 {
49     param_list_decl_usage(pl, "B",        "minimum end of bench span window");
50     param_list_decl_usage(pl, "catchsig", "enable intercepting ^C");
51 }
52 
lingen_tune_cutoffs_lookup_parameters(cxx_param_list & pl)53 void lingen_tune_cutoffs_lookup_parameters(cxx_param_list & pl)
54 {
55     param_list_lookup_string(pl, "B");
56     param_list_lookup_string(pl, "catchsig");
57 }
58 
59 /* interface to C programs for list of cutoffs we compute *//*{{{*/
60 /* This is the simple C-type cutoff table which is used down the line by
61  * C programs */
62 
63 struct cutoff_list_s {
64     unsigned int k;     /* this is the number of coefficients (length) of
65                          * pi.  Admittedly,  it is slightly bogus to have
66                          * this as a main variable.  Length [k] for pi
67                          * corresponds to an "input length" of [k*(m+n)/m].
68                          * In the MP case, this means that E has length
69                          * [input_length+k-1], and the output E_right has
70                          * length [input_length]. */
71     unsigned int choice;        /* semantics vary. For FFT-based stuff,
72                                  * this is the depth reduction from the
73                                  * flit default.
74                                  */
75 };
76 
77 typedef cutoff_list_s * cutoff_list;
78 
79 /* cutoff list *ALWAYS* finish with UINT_MAX, UINT_MAX */
cutoff_list_get(cutoff_list cl,unsigned int k)80 unsigned int cutoff_list_get(cutoff_list cl, unsigned int k)
81 {
82     if (!cl) return 0;
83     if (k < cl->k) return 0;
84     for( ; k >= cl->k ; cl++);
85     return (--cl)->choice;
86 }
87 /*}}}*/
88 
89 /* The -B argument may be used to request printing of timing results at
90  * least up to this input length */
91 unsigned int bench_atleast_uptothis = 0;
92 
93 /* timer backends *//*{{{*/
94 #ifdef  HAVE_GCC_STYLE_AMD64_INLINE_ASM
95 struct timer_rdtsc {
operator ()timer_rdtsc96     inline double operator()() const {
97         uint64_t c = cputicks();
98         return c/1.0e9;
99     }
timer_unittimer_rdtsc100     static const char * timer_unit() { return "Gccyc"; }
101 };
102 #endif
103 
104 
105 struct timer_rusage {
operator ()timer_rusage106     inline double operator()() const { return seconds(); }
timer_unittimer_rusage107     static const char * timer_unit() { return "seconds (rusage)"; }
108 };
109 struct timer_wct {
operator ()timer_wct110     inline double operator()() const { return wct_seconds(); }
timer_unittimer_wct111     static const char * timer_unit() { return "seconds (wct)"; }
112 };
113 /* It is important, when doing MPI benches, that we use this algorithm */
114 struct timer_wct_synchronized {
operator ()timer_wct_synchronized115     inline double operator()() const {
116         double d = wct_seconds();
117         MPI_Allreduce(MPI_IN_PLACE, &d, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
118         return d;
119     }
120 };
121 /*}}}*/
122 
123 /* how do we do one single measurements ? We'll do it many times, but how
124  * long ? Here is the place for making the corresponding choices.
125  */
126 struct measurement_choice {
127     int enough_repeats;
128     double minimum_time;
129     double enough_time;
measurement_choicemeasurement_choice130     measurement_choice() {
131         /* defaults */
132         enough_repeats = 1000;
133         minimum_time = 0.5;
134         enough_time = 1.0;
135     }
136 };
137 
138 template<typename T>
139 struct small_bench {
140     measurement_choice mc;
141     double& tfinal;
142     double last_starting_tt;
143     /* accumulated_time so far. Can be increased either with
144      * - add_since_last() (which resets our timer)
145      * - set_since_last() (which does not)
146      * - inject() (which does not either, since we do not care about our
147      *   own timer in this case).
148      */
149     double tt;
150 
151     int n;
small_benchsmall_bench152     explicit small_bench(double& t, measurement_choice mc = measurement_choice()) : mc(mc), tfinal(t) {tt=n=0;reset();}
153     /* are we done ? when we are, we set tfinal to the average value. */
donesmall_bench154     inline int done() {
155         if (tt < mc.enough_time) {
156             if (tt < mc.minimum_time && n < mc.enough_repeats)
157                 return 0;
158         }
159         // fprintf(stderr, "%.2f %d\n", tt, n);
160         tfinal = tt / n;
161         return 1;
162     }
operator ++small_bench163     inline small_bench& operator++() { n++; return *this; }
resetsmall_bench164     inline void reset() { last_starting_tt = T()(); }
165     /* adds to tt the time since the last timer reset, and resets the
166      * timer.  */
add_since_lastsmall_bench167     inline void add_since_last(int weight = 1) {
168         double now = T()();
169         tt += weight * (now - last_starting_tt);
170         last_starting_tt = now;
171     }
172     /* *sets* tt to the time since the last timer reset. Do not reset the
173      * timer.  */
set_since_lastsmall_bench174     inline void set_since_last(int weight = 1) {
175         double now = T()();
176         tt = weight * (now - last_starting_tt);
177     }
injectsmall_bench178     inline void inject(double t, int weight = 1) { tt += weight*t; }
179 };
180 
181 /* {{{ cutoff_finder and its child small_bench */
182 template<typename Timer_backend = timer_rusage>
183 struct cutoff_finder {
184     measurement_choice mc;
185     double scale;
186     int stable_cutoff_break;
187     unsigned int ntests;
188     int stable;
189     vector<double> benches;
190     vector<bool> meaningful;
191     vector<pair<unsigned int, pair<vector<double>, int> > > all_results;
192     map<int, string> method_names;
193         double slowness_ratio;
194         unsigned int age_slow_discard;
195 
cutoff_findercutoff_finder196     cutoff_finder(unsigned int ntests, measurement_choice mc = measurement_choice())
197         :
198         mc(mc),
199         ntests(ntests),
200         benches(ntests),
201         meaningful(ntests, true)
202     {
203         /* Fill defaults */
204         scale = 1.1;
205         stable_cutoff_break = 4;
206         stable = 0;
207         slowness_ratio = 2;
208         age_slow_discard = 5;
209     }
210 
set_method_namecutoff_finder211     inline void set_method_name(int k, string const& s) {
212         method_names.insert(make_pair(k, s));
213     }
method_namecutoff_finder214     inline string method_name(int i) const {
215         ostringstream v;
216         map<int, string>::const_iterator z = method_names.find(i);
217         if (z == method_names.end()) {
218             v << "method " << i;
219             return v.str();
220         }
221         return z->second;
222     }
223 
next_lengthcutoff_finder224     inline unsigned int next_length(unsigned int k) const {
225         return MAX(k+1, scale*k);
226     }
227 
donecutoff_finder228     inline int done() const {
229         int nactive=0;
230         for(unsigned int i = 0 ; i < ntests ; i++) {
231             nactive += meaningful[i];
232         }
233         if (nactive > 1) return 0;
234         return !(stable < stable_cutoff_break);
235     }
236 
still_meaningful_to_testcutoff_finder237     inline bool still_meaningful_to_test(int i) { return meaningful[i]; }
238 
summarize_for_this_lengthcutoff_finder239     string summarize_for_this_length(unsigned int k)
240     {
241         std::ostringstream comments;
242         int best = -1;
243         for(unsigned int i = 0 ; i < ntests ; i++) {
244             if (meaningful[i] && (best < 0 || benches[i] < benches[best])) best = i;
245         }
246         ASSERT_ALWAYS(best >= 0);
247         all_results.push_back(make_pair(k,
248                     make_pair(benches, best)));
249 
250         if (all_results.empty() || best != all_results.back().second.second) {
251             stable = 0;
252         } else {
253             stable++;
254         }
255 
256         vector<bool> was_meaningful = meaningful;
257 
258         /* try to discard those which are slow. We live on the assumption
259          * that the higher-numbered strategies win eventually, therefore
260          * we don't discard them if [best] is smaller currently */
261         for(int i = 0 ; i < best ; i++) {
262             /* already dead ? */
263             if (!meaningful[i]) continue;
264 
265             /* for how long has it been slow ? */
266             unsigned int age_slow = 0;
267             for( ; age_slow < all_results.size() ; age_slow++) {
268                 vector<double> const& these(all_results[all_results.size()-1-age_slow].second.first);
269                 int best_there = all_results[all_results.size()-1-age_slow].second.second;
270                 /* do not discard this method if a less advanced one is
271                  * still alive.
272                  */
273                 if (best_there < i) break;
274                 /* not so slow */
275                 if (these[i] <= these[best_there] * slowness_ratio) break;
276             }
277             if (age_slow >= age_slow_discard) {
278                 /* ok, discard */
279                 comments << "  ; discarding " << method_name(i);
280                 meaningful[i] = false;
281             }
282         }
283         ostringstream s;
284         s << k;
285         for(unsigned int i = 0 ; i < ntests ; i++) {
286             if (was_meaningful[i]) {
287                 s << " " << benches[i];
288                 if ((int) i==best) s << '#';
289             } else {
290                 s << " *";
291             }
292             benches[i] = 999999;
293         }
294         s << " " << method_name(best) << comments.str();
295         return s.str();
296     }
297 
micro_benchcutoff_finder298     small_bench<Timer_backend> micro_bench(int i) {
299         return small_bench<Timer_backend>(benches[i], mc);
300     }
micro_benchcutoff_finder301     small_bench<Timer_backend> micro_bench(int i, measurement_choice mc1) {
302         return small_bench<Timer_backend>(benches[i], mc1);
303     }
304 
305     /* This is really limited to karatsuba-like cuttofs. It's ugly */
export_best_tablecutoff_finder306     vector<pair<unsigned int, int> > export_best_table()
307     {
308         vector<pair<unsigned int, int> > steps;
309         steps.push_back(make_pair(1,0));
310 
311         typedef vector<pair<unsigned int, pair<vector<double>, int> > >::const_iterator it_t;
312         for(it_t it = all_results.begin() ; it != all_results.end() ; ++it) {
313             unsigned int size = it->first;
314             int best = it->second.second;
315             if (steps.empty() || best != steps.back().second) {
316                 steps.push_back(make_pair(size, best));
317             }
318         }
319         return steps;
320     }
export_kara_cutoff_datacutoff_finder321     vector<pair<unsigned int, int> > export_kara_cutoff_data(struct polymat_cutoff_info * dst)
322     {
323         /* This size will eventually feed the ->subdivide field for the
324          * cutoff info */
325         unsigned int first_kara_size = UINT_MAX;
326 
327         /* This size will eventually feed the ->cut field for the
328          * cutoff info. For sizes above this, we will _always_ use
329          * karatsuba. */
330         unsigned int first_alwayskara_size = UINT_MAX;
331 
332         vector<pair<unsigned int, int> > steps;
333         steps.push_back(make_pair(1,0));
334 
335         typedef vector<pair<unsigned int, pair<vector<double>, int> > >::const_iterator it_t;
336         for(it_t it = all_results.begin() ; it != all_results.end() ; ++it) {
337             unsigned int size = it->first;
338             vector<double> const& benches(it->second.first);
339             int best = it->second.second;
340             /* In case fft wins, we invent something which will use
341              * karatsuba still */
342             if (best > 1) best = benches[1] < benches[0];
343             if (steps.empty() || best != steps.back().second) {
344                 steps.push_back(make_pair(size, best));
345                 if (best == 1 && size < first_kara_size)
346                     first_kara_size = size;
347                 /* assign it multiple times */
348                 first_alwayskara_size = size;
349             }
350         }
351         dst->cut = first_alwayskara_size;
352         dst->subdivide = first_kara_size;
353         dst->table_size = steps.size();
354         dst->table = (unsigned int (*)[2]) realloc(dst->table,
355                 dst->table_size * sizeof(unsigned int[2]));
356         for(unsigned int v = 0 ; v < dst->table_size ; v++) {
357             dst->table[v][0] = steps[v].first;
358             dst->table[v][1] = steps[v].second;
359         };
360         return steps;
361     }
export_kara_cutoff_data_force_kara_nowcutoff_finder362     vector<pair<unsigned int, int> > export_kara_cutoff_data_force_kara_now(struct polymat_cutoff_info * dst, unsigned int size)
363     {
364         vector<double> allz(ntests);
365         all_results.push_back(make_pair(size, make_pair(allz, 1)));
366         vector<pair<unsigned int, int> > x = export_kara_cutoff_data(dst);
367         all_results.pop_back();
368         return x;
369     }
print_resultcutoff_finder370     static string print_result(vector<pair<unsigned int, int> > const& tab) {
371         ostringstream s;
372         s << "{";
373         typedef vector<pair<unsigned int, int> >::const_iterator it_t;
374         for(it_t y = tab.begin() ; y != tab.end() ; y++) {
375             s << " { " << y->first << ", " << y->second << " },";
376         }
377         s << " }";
378         return s.str();
379     }
380 
381 };
382 /* }}} */
383 
384 #ifdef  HAVE_SIGHUP
385 double last_hup = 0;
386 int hup_caught = 0;
387 
sighandler(int sig MAYBE_UNUSED)388 void sighandler(int sig MAYBE_UNUSED)
389 {
390     double t = wct_seconds();
391     if (t < last_hup + 0.5) {
392         fprintf(stderr, "Interrupt twice in half a second; exiting\n");
393         exit(1);
394     }
395     last_hup = wct_seconds();
396     hup_caught++;
397 }
398 
399 
catch_control_signals()400 void catch_control_signals()
401 {
402     struct sigaction sa[1];
403     memset(sa, 0, sizeof(sa));
404     sa->sa_handler = sighandler;
405     sigaction(SIGHUP, sa, NULL);
406     sigaction(SIGINT, sa, NULL);
407 
408     /*
409      * should play sigprocmask and friends
410     sigset_t sset[1];
411     sigemptyset(sset);
412     sigaddset(sset, SIGHUP);
413     */
414 }
415 #endif
416 
417 /* This code will first try to bench the basic operations (first
418  * local; global are later) of middle product E*pi and product pi*pi.
419  * The goal is to see where is the good value for the thresholds:
420  * polymat_mul_kara_threshold
421  * polymat_mp_kara_threshold
422  */
423 
lingen_tune_mul_fti_depth(abdst_field ab,unsigned int m,unsigned int n,cutoff_list * cl_out)424 void lingen_tune_mul_fti_depth(abdst_field ab, unsigned int m, unsigned int n, cutoff_list *cl_out)/*{{{*/
425 {
426     gmp_randstate_t rstate;
427     gmp_randinit_default(rstate);
428     gmp_randseed_ui(rstate, 1);
429 
430     typedef timer_rdtsc timer_t;
431 
432     int nadjs=7;
433     measurement_choice mc;
434     mc.enough_time = 2.0;
435     mc.minimum_time = 0.1;
436     mc.enough_repeats = 100;
437 
438     measurement_choice mcout = mc;
439     mcout.enough_repeats = 2;
440     mc.enough_time = 1.0;
441 
442     cutoff_finder<timer_t> finder(nadjs, mcout);
443     finder.slowness_ratio = 1.4;
444     // finder.age_slow_discard = 5;
445     finder.scale = 1.01;
446 
447     for(int i = 0 ; i < nadjs ; i++) {
448         ostringstream o;
449         o << "depth_adj=" << nadjs-1-i;
450         finder.set_method_name(i, o.str());
451     }
452 
453     cout << "# Tuning FFT depth adjustment (mul) for"
454                 << " m=" << m
455                 << ", n=" << n
456                 << "\n";
457 
458     cout << "# Timings reported in " << timer_t::timer_unit() << "\n";
459     cout << "# inputlength, ncoeffs, various depth adjustments";
460     cout << "\n";
461 
462     /* for multiplication */
463     cout << "# Note: for input length k within lingen,"
464         << " we use ncoeffs=k*m/(m+n) = "<<(double)m/(m+n)<<"*k\n";
465 
466     /* This is for forcing the bench to run until a large length. This is
467      * unnecessary here */
468     unsigned int min_bench = 0;
469 
470     /* Beware, k is the length of piL, not a degree. Hence length 1
471      * clearly makes no sense */
472     for(unsigned int k = 2 ; !hup_caught && (k < min_bench || !finder.done()) ; k=finder.next_length(k)) {
473         unsigned int input_length = (m+n) * k / m;
474 
475         abvec   A,   B,   C;
476         void  *tA, *tB, *tC;
477 
478         mpz_t p;
479         mpz_init(p);
480         abfield_characteristic(ab, p);
481 
482         abvec_init(ab, &A, k);
483         abvec_init(ab, &B, k);
484         abvec_init(ab, &C, 2*k-1);
485 
486         abvec_random(ab, A, k, rstate);
487         abvec_random(ab, B, k, rstate);
488         abvec_set_zero(ab, C, 2*k-1);
489 
490         ostringstream extra_info;
491 
492         for(int index = 0 ; index < nadjs ; index++) {
493             struct fft_transform_info fti[1];
494             fft_transform_info_init_fppol(fti, p, k, k, m+n);
495             fft_transform_info_set_first_guess(fti);
496             if (index == 0) {
497                 extra_info << "(from " << fti->depth << ")";
498             }
499             if (nadjs-1-index >= fti->depth) {
500                 finder.benches[index] = 999999999;
501                 continue;
502             }
503             if (fti->depth >= 11 && index > 0) {
504                 finder.benches[index] = 999999999;
505                 continue;
506             }
507             fft_transform_info_adjust_depth(fti, nadjs-1-index);
508 
509             size_t fft_alloc_sizes[3];
510             fft_transform_info_get_alloc_sizes(fti, fft_alloc_sizes);
511             void * tt = malloc(fft_alloc_sizes[2]);
512             void * qt = malloc(fft_alloc_sizes[1]);
513 
514             tA = malloc(fft_alloc_sizes[0]);
515             tB = malloc(fft_alloc_sizes[0]);
516             tC = malloc(fft_alloc_sizes[0]);
517 
518             typedef small_bench<timer_t> bt;
519             for(bt x = finder.micro_bench(index); !x.done(); ++x) {
520 
521                 double t_dftA=0;
522                 for(bt y = bt(t_dftA, mc); !y.done(); ++y) {
523                     fft_prepare(fti, tA);
524                     fft_dft(fti, tA, (mp_limb_t*)A, k, qt);
525                     y.set_since_last();
526                 }
527                 x.inject(t_dftA, (m+n)*(m+n));
528 
529                 double t_dftB=0;
530                 for(bt y = bt(t_dftB, mc); !y.done(); ++y) {
531                     fft_prepare(fti, tB);
532                     fft_dft(fti, tB, (mp_limb_t*)B, k, qt);
533                     y.set_since_last();
534                 }
535                 x.inject(t_dftB, (m+n)*(m+n));
536 
537                 double t_conv=0;
538                 for(bt y = bt(t_conv, mc); !y.done(); ++y) {
539                     fft_prepare(fti, tC);
540                     fft_zero(fti, tC);
541                     fft_addcompose(fti, tC, tA, tB, tt, qt);
542                     y.set_since_last();
543                 }
544                 x.inject(t_conv, (m+n)*(m+n)*(m+n));
545 
546                 double t_iftC=0;
547                 for(bt y = bt(t_conv, mc); !y.done(); ++y) {
548                     fft_prepare(fti, tC);
549                     fft_ift(fti, (mp_limb_t*)C, 2*k-1, tC, qt);
550                     y.set_since_last();
551                 }
552                 x.inject(t_iftC, (m+n)*(m+n));
553             }
554             free(tA);
555             free(tB);
556             free(tC);
557             free(tt);
558             free(qt);
559         }
560 
561         abvec_clear(ab, &A, k);
562         abvec_clear(ab, &B, k);
563         abvec_clear(ab, &C, 2*k-1);
564 
565         mpz_clear(p);
566 
567         cout << input_length
568             << " " << finder.summarize_for_this_length(k)
569             << extra_info.str()
570             << "\n";
571     }
572     hup_caught = 0;
573 
574     vector<pair<unsigned int, int> > table = finder.export_best_table();
575 
576     typedef vector<pair<unsigned int, int> >::iterator it_t;
577     for(it_t x = table.begin() ; x != table.end() ; ++x)
578         x->second = nadjs-1-x->second;
579 
580     cout << "/* FFT depth adjustments for "
581                 << (m)<<"*"<<(m+n)
582                 <<" times "
583                 << (m+n)<<"*"<<(m+n)<<" products */\n";
584     cout << "#define MUL_FTI_DEPTH_ADJ_" <<m+n<<"_"<<(m+n)<<"_"<<(m+n)
585         << " " << finder.print_result(table) << endl;
586 
587     if (cl_out) {
588         *cl_out = (cutoff_list) malloc((table.size()+1)*sizeof(**cl_out));
589         for(unsigned int i = 0 ; i < table.size(); i++) {
590             (*cl_out)[i].k = table[i].first;
591             (*cl_out)[i].choice = table[i].second;
592         }
593         (*cl_out)[table.size()].k      = UINT_MAX;
594         (*cl_out)[table.size()].choice = UINT_MAX;
595     }
596 
597     gmp_randclear(rstate);
598 }/*}}}*/
lingen_tune_mp_fti_depth(abdst_field ab,unsigned int m,unsigned int n,cutoff_list * cl_out)599 void lingen_tune_mp_fti_depth(abdst_field ab, unsigned int m, unsigned int n, cutoff_list * cl_out)/*{{{*/
600 {
601     gmp_randstate_t rstate;
602     gmp_randinit_default(rstate);
603     gmp_randseed_ui(rstate, 1);
604 
605     typedef timer_rdtsc timer_t;
606 
607     int nadjs=7;
608     measurement_choice mc;
609     mc.enough_time = 2.0;
610     mc.minimum_time = 0.1;
611     mc.enough_repeats = 100;
612 
613     measurement_choice mcout = mc;
614     mcout.enough_repeats = 2;
615     mc.enough_time = 1.0;
616 
617     cutoff_finder<timer_t> finder(nadjs, mcout);
618     finder.slowness_ratio = 1.4;
619     // finder.age_slow_discard = 5;
620     finder.scale = 1.01;
621 
622     for(int i = 0 ; i < nadjs ; i++) {
623         ostringstream o;
624         o << "depth_adj=" << nadjs-1-i;
625         finder.set_method_name(i, o.str());
626     }
627 
628     cout << "# Tuning FFT depth adjustment (mp) for"
629                 << " m=" << m
630                 << ", n=" << n
631                 << "\n";
632 
633     cout << "# Timings reported in " << timer_t::timer_unit() << "\n";
634     cout << "# inputlength, ncoeffs, various depth adjustments";
635     cout << "\n";
636 
637     /* for multiplication */
638     cout << "# Note: for input length k within lingen,"
639         << " we use ncoeffs=k*m/(m+n) = "<<(double)m/(m+n)<<"*k\n";
640 
641     /* This is for forcing the bench to run until a large length. This is
642      * unnecessary here */
643     unsigned int min_bench = 0;
644 
645     /* Beware, k is the length of piL, not a degree. Hence length 1
646      * clearly makes no sense */
647     for(unsigned int k = 2 ; !hup_caught && (k < min_bench || !finder.done()) ; k=finder.next_length(k)) {
648         unsigned int input_length = (m+n) * k / m;
649 
650         unsigned int E_length = k + input_length - 1;
651 
652         abvec   A,   B,   C;
653         void  *tA, *tB, *tC;
654 
655         mpz_t p;
656         mpz_init(p);
657         abfield_characteristic(ab, p);
658 
659         abvec_init(ab, &A, E_length);
660         abvec_init(ab, &B, k);
661         abvec_init(ab, &C, input_length);
662 
663         abvec_random(ab, A, E_length, rstate);
664         abvec_random(ab, B, k, rstate);
665         abvec_set_zero(ab, C, input_length);
666 
667         ostringstream extra_info;
668 
669         for(int index = 0 ; index < nadjs ; index++) {
670             struct fft_transform_info fti[1];
671             fft_transform_info_init_fppol_mp(fti, p, k, E_length, m+n);
672             fft_transform_info_set_first_guess(fti);
673             if (index == 0) {
674                 extra_info << "(from " << fti->depth << ")";
675             }
676             if (nadjs-1-index >= fti->depth) {
677                 finder.benches[index] = 999999999;
678                 continue;
679             }
680             if (fti->depth >= 11 && index > 0) {
681                 finder.benches[index] = 999999999;
682                 continue;
683             }
684             fft_transform_info_adjust_depth(fti, nadjs-1-index);
685 
686             size_t fft_alloc_sizes[3];
687             fft_transform_info_get_alloc_sizes(fti, fft_alloc_sizes);
688             void * tt = malloc(fft_alloc_sizes[2]);
689             void * qt = malloc(fft_alloc_sizes[1]);
690 
691             tA = malloc(fft_alloc_sizes[0]);
692             tB = malloc(fft_alloc_sizes[0]);
693             tC = malloc(fft_alloc_sizes[0]);
694 
695             typedef small_bench<timer_t> bt;
696             for(bt x = finder.micro_bench(index); !x.done(); ++x) {
697 
698                 double t_dftA=0;
699                 for(bt y = bt(t_dftA, mc); !y.done(); ++y) {
700                     fft_prepare(fti, tA);
701                     fft_dft(fti, tA, (mp_limb_t*)A, E_length, qt);
702                     y.set_since_last();
703                 }
704                 x.inject(t_dftA, m*(m+n));
705 
706                 double t_dftB=0;
707                 for(bt y = bt(t_dftB, mc); !y.done(); ++y) {
708                     fft_prepare(fti, tB);
709                     fft_dft(fti, tB, (mp_limb_t*)B, k, qt);
710                     y.set_since_last();
711                 }
712                 x.inject(t_dftB, (m+n)*(m+n));
713 
714                 double t_conv=0;
715                 for(bt y = bt(t_conv, mc); !y.done(); ++y) {
716                     fft_prepare(fti, tC);
717                     fft_zero(fti, tC);
718                     fft_addcompose(fti, tC, tA, tB, tt, qt);
719                     y.set_since_last();
720                 }
721                 x.inject(t_conv, m*(m+n)*(m+n));
722 
723                 double t_iftC=0;
724                 for(bt y = bt(t_conv, mc); !y.done(); ++y) {
725                     fft_prepare(fti, tC);
726                     fft_ift(fti, (mp_limb_t*)C, input_length, tC, qt);
727                     y.set_since_last();
728                 }
729                 x.inject(t_iftC, m*(m+n));
730             }
731             free(tA);
732             free(tB);
733             free(tC);
734             free(tt);
735             free(qt);
736         }
737 
738         abvec_clear(ab, &A, E_length);
739         abvec_clear(ab, &B, k);
740         abvec_clear(ab, &C, input_length);
741 
742 
743         cout << input_length
744             << " " << finder.summarize_for_this_length(k)
745             << extra_info.str()
746             << "\n";
747     }
748     hup_caught = 0;
749 
750     vector<pair<unsigned int, int> > table = finder.export_best_table();
751 
752     typedef vector<pair<unsigned int, int> >::iterator it_t;
753     for(it_t x = table.begin() ; x != table.end() ; ++x)
754         x->second = nadjs-1-x->second;
755 
756     cout << "/* FFT depth adjustments for "
757                 << (m)<<"*"<<(m+n)
758                 <<" times "
759                 << (m+n)<<"*"<<(m+n)<<" middle-products */\n";
760     cout << "#define MP_FTI_DEPTH_ADJ_" <<m<<"_"<<(m+n)<<"_"<<(m+n)
761         << " " << finder.print_result(table) << endl;
762 
763     if (cl_out) {
764         *cl_out = (cutoff_list) malloc((table.size()+1)*sizeof(**cl_out));
765         for(unsigned int i = 0 ; i < table.size(); i++) {
766             (*cl_out)[i].k = table[i].first;
767             (*cl_out)[i].choice = table[i].second;
768         }
769         (*cl_out)[table.size()].k = UINT_MAX;
770         (*cl_out)[table.size()].choice = UINT_MAX;
771     }
772 
773     gmp_randclear(rstate);
774 }/*}}}*/
775 
776 
lingen_tune_mul(abdst_field ab,unsigned int m,unsigned int n,cutoff_list cl MAYBE_UNUSED)777 void lingen_tune_mul(abdst_field ab, unsigned int m, unsigned int n, cutoff_list cl MAYBE_UNUSED)/*{{{*/
778 {
779     typedef fft_transform_info fft_type;
780     gmp_randstate_t rstate;
781     gmp_randinit_default(rstate);
782     gmp_randseed_ui(rstate, 1);
783 #define TUNE_MUL_FINDER_NMETHODS 4
784 
785     typedef timer_rusage timer_t;
786     cutoff_finder<timer_rusage> finder(TUNE_MUL_FINDER_NMETHODS);
787     finder.set_method_name(0, "polymat-basecase");
788     finder.set_method_name(1, "polymat-karatsuba");
789     finder.set_method_name(2, "matpoly-kronecker-schönhage");
790     finder.set_method_name(3, "matpoly-ft-kronecker-schönhage-caching");
791 
792     polymat_cutoff_info always_basecase[1];
793     polymat_cutoff_info improved[1];
794 
795     polymat_cutoff_info_init(always_basecase);
796     polymat_cutoff_info_init(improved);
797 
798     cout << "# Tuning "
799                 << (m+n)<<"*"<<(m+n)
800                 <<" times "
801                 << (m+n)<<"*"<<(m+n)<<" products\n";
802 
803     cout << "# inputlength, ncoeffs";
804     for(unsigned int i = 0 ; i < finder.ntests ; i++) {
805         cout << ", " << finder.method_name(i);
806     }
807     cout << "\n";
808     cout << "# Note: for input length k within lingen,"
809         << " we use ncoeffs=k*m/(m+n) = "<<(double)m/(m+n)<<"*k\n";
810     /* input length k means we consider the pi polynomial which is
811      * created by k successive steps. So this mulitplication, in effect,
812      * occurs at the end of a recursive procedure of length 2k, which
813      * collects the pi matrices of its two child calls.
814      */
815 
816     /* make sure we don't stop the bench absurdly early. We set the bench
817      * as input_length \approx 2000 */
818     unsigned int min_bench = bench_atleast_uptothis * m / (m+n);
819 
820     /* Beware, k is the length of piL, not a degree. Hence length 1
821      * clearly makes no sense */
822     for(unsigned int k = 2 ; !hup_caught && (k < min_bench || !finder.done()) ; k=finder.next_length(k)) {
823         unsigned int input_length = (m+n) * k / m;
824         polymat piL  (ab, m+n, m+n, k);
825         polymat piR  (ab, m+n, m+n, k);
826         polymat pi   (ab, m+n, m+n, 2*k-1);
827         polymat piref(ab, m+n, m+n, 2*k-1);
828         piL.fill_random(k, rstate);
829         piR.fill_random(k, rstate);
830 
831         ostringstream extra_info;
832 
833         if (finder.still_meaningful_to_test(0)) {
834             /* disable kara for a minute */
835             polymat_set_mul_kara_cutoff(always_basecase, NULL);
836             for(small_bench<timer_t> x = finder.micro_bench(0); !x.done(); ++x) {
837                 pi.mul(piL, piR);
838                 x.set_since_last();
839             }
840             if (piref.get_size() == 0) {
841                 piref = std::move(pi);
842                 // fprintf(stderr, "BASIS0\n");
843             } else if (pi.cmp(piref) != 0) {
844                 fprintf(stderr, "MISMATCH0!\n");
845             }
846         }
847 
848         if (finder.still_meaningful_to_test(1)) {
849             /* This temporarily sets the cutoff table to enable karatsuba for
850              * length >=k (hence for this test) at least, possibly using
851              * karatsuba one or more times in the recursive calls depending
852              * on what has been measured as best so far.
853              */
854             finder.export_kara_cutoff_data_force_kara_now(improved, k);
855             polymat_set_mul_kara_cutoff(improved, NULL);
856             for(small_bench<timer_t> x = finder.micro_bench(1); !x.done(); ++x) {
857                 pi.mul(piL, piR);
858                 x.set_since_last();
859             }
860             if (piref.get_size() == 0) {
861                 piref = std::move(pi);
862                 // fprintf(stderr, "BASIS1\n");
863             } else if (pi.cmp(piref) != 0) {
864                 fprintf(stderr, "MISMATCH1!\n");
865             }
866         }
867 
868         /* don't exaggerate our memory requirements */
869         matpoly xpiL;
870         xpiL.set_polymat(piL);
871         piL = polymat();
872 
873         matpoly xpiR;
874         xpiR.set_polymat(piR);
875         piR = polymat();
876 
877         matpoly xpiref;
878         if (piref.get_size()) {
879             xpiref.set_polymat(piref);
880             piref = polymat();
881         }
882 
883         matpoly xpi(ab, m+n, m+n, 2*k-1);
884         pi = polymat();
885 
886         /* we should make the effort of converting polymat to matpoly,
887          * right ? */
888 
889         if (finder.still_meaningful_to_test(2)) {
890             /* The matpoly layer is just completetly different -- and gets
891              * faster quite early on... */
892             for(small_bench<timer_t> x = finder.micro_bench(2); !x.done(); ++x) {
893                 xpi = matpoly::mul(xpiL, xpiR);
894                 x.set_since_last();
895             }
896             if (xpiref.get_size() == 0) {
897                 xpiref = std::move(xpi);
898                 // fprintf(stderr, "BASIS2\n");
899             } else if (xpi.cmp(xpiref) != 0) {
900                 fprintf(stderr, "MISMATCH2!\n");
901             }
902         }
903 
904         if (finder.still_meaningful_to_test(3)) {
905             unsigned int adj = cutoff_list_get(cl, k);
906             {
907                 ostringstream o;
908                 o << "matpoly-ft-kronecker-schönhage-caching@adj" << adj;
909                 finder.set_method_name(3, o.str());
910             }
911 #if 0
912             matpoly_ft tpi, tpiL, tpiR;
913             mpz_t p;
914             mpz_init(p);
915             abfield_characteristic(ab, p);
916             struct fft_transform_info fti[1];
917             fft_get_transform_info_fppol(fti, p, xpiL->size, xpiR->size, xpiL->n);
918             int s = 0;
919             matpoly_clear(ab, xpi);
920             matpoly_init(ab, xpi, m+n, m+n, xpiL->size + xpiR->size - 1);
921             for(small_bench<timer_t> x = finder.micro_bench(3); !x.done(); ++x) {
922                 matpoly_ft_init(fti, ab, tpiL, xpiL->m, xpiL->n);
923                 matpoly_ft_init(fti, ab, tpiR, xpiR->m, xpiR->n);
924                 matpoly_ft_init(fti, ab, tpi, xpiL->m, xpiR->n);
925                 matpoly_ft_dft(fti, ab, tpiL, xpiL);
926                 matpoly_ft_dft(fti, ab, tpiR, xpiR);
927                 matpoly_ft_mul(fti, ab, tpi, tpiL, tpiR);
928                 xpi->size = xpiL->size + xpiR->size - 1;
929                 ASSERT_ALWAYS(xpi->size <= xpi->alloc);
930                 matpoly_ft_ift(fti, ab, xpi, tpi);
931                 matpoly_ft_clear(fti, ab, tpiL);
932                 matpoly_ft_clear(fti, ab, tpiR);
933                 matpoly_ft_clear(ab, tpi,  fti);
934                 x.set_since_last();
935                 s++;
936             }
937             mpz_clear(p);
938 #else
939             /* TODO: matpoly_mul_caching_adj, being the back-end of
940              * matpoly_mul_caching, is likely to gain a stats argument
941              * someday (not if deemed useless, though). If this happens,
942              * we'll have to add the "stats" argument here only so that the
943              * whole thing compiles. Alas this won't work out of the box, the
944              * stats structure must receive information regarding the
945              * planned substeps, prior to the call. So either we fix the
946              * code here, either we loosen our API complexity to this
947              * regard, or we ditch this tune-cutoffs file entirely.
948              * (anyway the more important plingen_tuning code also needs
949              * some update so that this change can happen).
950              */
951             tree_stats stats;
952             for(small_bench<timer_t> x = finder.micro_bench(3); !x.done(); ++x) {
953                 xpi = matpoly_ft<fft_type>::mul_caching_adj(stats, xpiL, xpiR, adj, NULL);
954                 x.set_since_last();
955             }
956 #endif
957             if (xpiref.get_size() == 0) {
958                 xpiref = std::move(xpi);
959             } else if (xpi.cmp(xpiref) != 0) {
960                 fprintf(stderr, "MISMATCH3!\n");
961             }
962         }
963 
964         cout << input_length
965             << " " << finder.summarize_for_this_length(k)
966             << extra_info.str()
967             << "\n";
968     }
969     hup_caught = 0;
970 
971 
972     vector<pair<unsigned int, int> > table = finder.export_kara_cutoff_data(improved);
973     polymat_set_mul_kara_cutoff(improved, NULL);
974 
975     cout << "/* Cutoffs (0=basecase, 1=kara) for "
976                 << (m+n)<<"*"<<(m+n)
977                 <<" times "
978                 << (m+n)<<"*"<<(m+n)<<" products: */\n";
979     cout << "#define MUL_CUTOFFS_" <<(m+n)<<"_"<<(m+n)<<"_"<<(m+n)
980         << " " << finder.print_result(table) << endl;
981     gmp_randclear(rstate);
982 
983     polymat_cutoff_info_clear(always_basecase);
984     polymat_cutoff_info_clear(improved);
985 }/*}}}*/
986 
lingen_tune_mp(abdst_field ab,unsigned int m,unsigned int n,cutoff_list cl MAYBE_UNUSED)987 void lingen_tune_mp(abdst_field ab, unsigned int m, unsigned int n, cutoff_list cl MAYBE_UNUSED)/*{{{*/
988 {
989     typedef fft_transform_info fft_type;
990     gmp_randstate_t rstate;
991     gmp_randinit_default(rstate);
992     gmp_randseed_ui(rstate, 1);
993 #define TUNE_MP_FINDER_NMETHODS 4
994 
995     typedef timer_rusage timer_t;
996     cutoff_finder<timer_rusage> finder(TUNE_MP_FINDER_NMETHODS);
997     finder.set_method_name(0, "polymat-basecase");
998     finder.set_method_name(1, "polymat-karatsuba");
999     finder.set_method_name(2, "matpoly-kronecker-schönhage");
1000     finder.set_method_name(3, "matpoly-ft-kronecker-schönhage-caching");
1001 
1002     polymat_cutoff_info always_basecase[1];
1003     polymat_cutoff_info improved[1];
1004 
1005     polymat_cutoff_info_init(always_basecase);
1006     polymat_cutoff_info_init(improved);
1007 
1008     cout << "# Tuning "
1009                 << (m)<<"*"<<(m+n)
1010                 <<" times "
1011                 << (m+n)<<"*"<<(m+n)<<" middle-products\n";
1012 
1013     cout << "# inputlength, ncoeffs";
1014     for(unsigned int i = 0 ; i < finder.ntests ; i++) {
1015         cout << ", " << finder.method_name(i);
1016     }
1017     cout << "\n";
1018 
1019     cout << "# Note: for input length k within lingen, "
1020          << " we use MP((1+m/(m+n))k,m/(m+n)k)->k"
1021          << " = MP("    <<1+(double)m/(m+n)<<"*k"
1022                         <<", "
1023                         <<(double)m/(m+n)<<"*k"
1024                         << ")\n";
1025     /* we use the same semantics as for testing the product. input length
1026      * k means we want to count how it takes to do the required MP with
1027      * the pi matrix which has been computed after k successive steps.
1028      * IOW, this is what happens at the middle of a recursive procedure
1029      * on length 2k. In terms of degrees, this will be:
1030      *     2k times m/(m+n)k --> k  *BUT*...
1031      * we want coefficients [k..2k[ ; but then, the contribution from the
1032      * first (k-m/(m+n)k) coefficients is useless. Therefore, we rewrite
1033      * the polynomial lengths as:
1034      *      (1+m/(m+n))k times m/(m+n)k --> k
1035      */
1036 
1037     /* make sure we don't stop the bench absurdly early. We set the bench
1038      * as input_length \approx 2000 */
1039     unsigned int min_bench = bench_atleast_uptothis * m / (m+n);
1040 
1041     /* Beware, k is a length of piL, not a degree. Hence length 1 clearly
1042      * makes no sense */
1043     for(unsigned int k = 2 ; !hup_caught && (k < min_bench || !finder.done()) ; k=finder.next_length(k)) {
1044         unsigned int input_length = (m+n) * k / m;
1045         /* MP(degree a, degree b) -> degree b-a
1046          *    length la=a+1, length lb=b+1 -> length b-a+1 = lb-la+1
1047          * we do want lc=input_length, we have set la=k, whence lb =
1048          * k+input_length-1
1049          */
1050         unsigned int E_length = k + input_length - 1;
1051         polymat E(ab, m, m+n, E_length);
1052         polymat piL(ab, m+n, m+n, k);
1053         polymat ER(ab, m, m+n, input_length);
1054         polymat ERref(ab, m, m+n, input_length);
1055         E.fill_random(E_length, rstate);
1056         piL.fill_random(k, rstate);
1057 
1058         ostringstream extra_info;
1059 
1060         if (finder.still_meaningful_to_test(0)) {
1061             /* disable kara for a minute */
1062             polymat_set_mp_kara_cutoff(always_basecase, NULL);
1063             for(small_bench<timer_t> x = finder.micro_bench(0); !x.done(); ++x) {
1064                 ER.mp(E, piL);
1065                 x.set_since_last();
1066             }
1067             if (ERref.get_size() == 0) {
1068                 ERref = std::move(ER);
1069                 // fprintf(stderr, "BASIS0\n");
1070             } else if (ER.cmp(ERref) != 0) {
1071                 fprintf(stderr, "MISMATCH0!\n");
1072             }
1073         }
1074 
1075         if (finder.still_meaningful_to_test(1)) {
1076             /* This temporarily sets the cutoff table to enable karatsuba for
1077              * length >=k (hence for this test) at least, possibly using
1078              * karatsuba one or more times in the recursive calls depending
1079              * on what has been measured as best so far.
1080              */
1081             finder.export_kara_cutoff_data_force_kara_now(improved, k);
1082             polymat_set_mp_kara_cutoff(improved, NULL);
1083             for(small_bench<timer_t> x = finder.micro_bench(1); !x.done(); ++x) {
1084                 ER.mp(E, piL);
1085                 x.set_since_last();
1086             }
1087             if (ERref.get_size() == 0) {
1088                 ERref = std::move(ER);
1089                 // fprintf(stderr, "BASIS1\n");
1090             } else if (ER.cmp(ERref) != 0) {
1091                 fprintf(stderr, "MISMATCH1!\n");
1092             }
1093         }
1094 
1095         /* don't exaggerate our memory requirements */
1096         matpoly xpiL(ab, m+n, m+n, k);
1097         xpiL.set_polymat(piL);
1098         piL = polymat();
1099 
1100         matpoly xE(ab, m, m+n, E_length);
1101         xE.set_polymat(E);
1102         E = polymat();
1103 
1104         matpoly xERref(ab, m, m+n, input_length);
1105         if (ERref.get_size()) {
1106             xERref.set_polymat(ERref);
1107         }
1108         ERref = polymat();
1109 
1110         matpoly xER(ab, m, m+n, input_length);
1111         ER = polymat();
1112 
1113         /* we should make the effort of converting polymat to matpoly,
1114          * right ? */
1115 
1116         if (finder.still_meaningful_to_test(2)) {
1117             /* The matpoly layer is just completetly different -- and gets
1118              * faster quite early on... */
1119             for(small_bench<timer_t> x = finder.micro_bench(2); !x.done(); ++x) {
1120                 xER = matpoly::mp(xE, xpiL);
1121                 x.set_since_last();
1122             }
1123             if (xERref.get_size() == 0) {
1124                 xERref = std::move(xER);
1125                 // fprintf(stderr, "BASIS2\n");
1126             } else if (xER.cmp(xERref) != 0) {
1127                 fprintf(stderr, "MISMATCH2!\n");
1128             }
1129         }
1130 
1131         if (finder.still_meaningful_to_test(3)) {
1132             unsigned int adj = UINT_MAX;
1133             if (cl) {
1134                 adj = cutoff_list_get(cl, k);
1135                 ostringstream o;
1136                 o << "matpoly-ft-kronecker-schönhage-caching@adj" << adj;
1137                 finder.set_method_name(3, o.str());
1138             }
1139 #if 0
1140             matpoly_ft tER, tpiL, tE;
1141             mpz_t p;
1142             mpz_init(p);
1143             abfield_characteristic(ab, p);
1144             struct fft_transform_info fti[1];
1145             fft_get_transform_info_fppol_mp(fti, p, xpiL->size, xE->size, xpiL->m);
1146             int s = 0;
1147             matpoly_clear(ab, xER);
1148             matpoly_init(ab, xER, m, m+n, input_length);
1149             for(small_bench<timer_t> x = finder.micro_bench(3); !x.done(); ++x) {
1150                 matpoly_ft_init(fti, ab, tpiL, xpiL->m, xpiL->n);
1151                 matpoly_ft_init(fti, ab, tE, xE->m, xE->n);
1152                 matpoly_ft_init(fti, ab, tER, xE->m, xpiL->n);
1153                 matpoly_ft_dft(fti, ab, tE, xE);
1154                 matpoly_ft_dft(fti, ab, tpiL, xpiL);
1155                 /* length E_length * length k ==> length input_length
1156                  * with E_length = input_length + k - 1
1157                  *
1158                  * we'll shift by k-1 coefficients, because k is the
1159                  * smallest length
1160                  */
1161                 matpoly_ft_mul(fti, ab, tER, tE, tpiL);
1162                 xER->size = input_length;
1163                 ASSERT_ALWAYS(xER->size <= xER->alloc);
1164                 matpoly_ft_ift_mp(fti, ab, xER, tER, k-1);
1165                 matpoly_ft_clear(fti, ab, tpiL);
1166                 matpoly_ft_clear(fti, ab, tE);
1167                 matpoly_ft_clear(ab, tER,  fti);
1168                 x.set_since_last();
1169                 s++;
1170             }
1171             mpz_clear(p);
1172 #endif
1173             /* see remark above about matpoly_mul_caching gaining a stats
1174              * argument someday */
1175             tree_stats stats;
1176             for(small_bench<timer_t> x = finder.micro_bench(3); !x.done(); ++x) {
1177                 xER = matpoly_ft<fft_type>::mp_caching_adj(stats, xE, xpiL, adj, NULL);
1178                 x.set_since_last();
1179             }
1180             if (xERref.get_size() == 0) {
1181                 xERref = std::move(xER);
1182             } else if (xER.cmp(xERref) != 0) {
1183                 fprintf(stderr, "MISMATCH3!\n");
1184             }
1185         }
1186 
1187         cout << input_length
1188             << " " << finder.summarize_for_this_length(k)
1189             << extra_info.str()
1190             << "\n";
1191     }
1192     hup_caught = 0;
1193 
1194     vector<pair<unsigned int, int> > table = finder.export_kara_cutoff_data(improved);
1195     polymat_set_mp_kara_cutoff(improved, NULL);
1196 
1197     cout << "/* Cutoffs (0=basecase, 1=kara) for "
1198                 << (m)<<"*"<<(m+n)
1199                 <<" times "
1200                 << (m+n)<<"*"<<(m+n)<<" middle-products */\n";
1201     cout << "#define MP_CUTOFFS_" <<m<<"_"<<(m+n)<<"_"<<(m+n)
1202         << " " << finder.print_result(table) << endl;
1203     gmp_randclear(rstate);
1204 
1205     polymat_cutoff_info_clear(always_basecase);
1206     polymat_cutoff_info_clear(improved);
1207 }/*}}}*/
1208 
1209 #if 0
1210 /* 20150826: bigpolymat deleted */
1211 void lingen_tune_bigmul(abdst_field ab, unsigned int m, unsigned int n, unsigned int m1, unsigned int n1, MPI_Comm comm)/*{{{*/
1212 {
1213     int rank;
1214     MPI_Comm_rank(comm, &rank);
1215     gmp_randstate_t rstate;
1216     gmp_randinit_default(rstate);
1217     gmp_randseed_ui(rstate, 1);
1218     /* arguments to the ctor:
1219      * 2 : we are benching 2 methods
1220      * 1 : size makes sense only for >=1
1221      */
1222     cutoff_finder<> finder(2, 1);
1223 
1224     bigpolymat model;
1225     bigpolymat_init_model(model, comm, m1, n1);
1226 
1227     /* Beware, k is a length, not a degree. Hence length 1 clearly makes
1228      * no sense */
1229     for(unsigned int k = 2 ; !finder.done() ; k=finder.next_length(k)) {
1230         /* Note: we are benching degree k, but the degree we are
1231          * interested in for pi is k*m/(m+n) */
1232         polymat pi, piL, piR;
1233         bigpolymat bpi, bpiL, bpiR;
1234         polymat_init(ab, piL, m+n, m+n, k);
1235         polymat_init(ab, piR, m+n, m+n, k);
1236         polymat_init(ab, pi, m+n, m+n, 2*k-1);
1237         bigpolymat_init(ab, bpiL, model, m+n, m+n, k);
1238         bigpolymat_init(ab, bpiR, model, m+n, m+n, k);
1239         bigpolymat_init(ab, bpi, model, m+n, m+n, 2*k-1);
1240         polymat_fill_random(ab, piL, k, rstate);
1241         polymat_fill_random(ab, piR, k, rstate);
1242         polymat_fill_random(ab, bigpolymat_my_cell(bpiL), k, rstate);
1243         polymat_fill_random(ab, bigpolymat_my_cell(bpiR), k, rstate);
1244 
1245         double ttloc;
1246         if (rank == 0) {
1247             for(small_bench<timer_t> x = finder.micro_bench<timer_wct>(ttloc); !x.done(); ++x) {
1248                 polymat_mul(ab, pi, piL, piR);
1249                 x.set_since_last();
1250             }
1251         }
1252         MPI_Bcast(&ttloc, 1, MPI_DOUBLE, 0, comm);
1253 
1254         double ttmpi;
1255         for(auto x = finder.micro_bench<timer_wct_synchronized>(ttmpi); !x.done(); ++x) {
1256             bigpolymat_mul(ab, bpi, bpiL, bpiR);
1257             x.set_since_last();
1258         }
1259         if (rank == 0)
1260             printf("%d %1.6f %1.6f %1.1f\n", k, ttloc, ttmpi, ttmpi/ttloc);
1261         finder.new_winner(k, ttmpi < ttloc); /* < : mpi wins: 1 */
1262 
1263         polymat_clear(ab, piL);
1264         polymat_clear(ab, piR);
1265         polymat_clear(ab, pi);
1266         bigpolymat_clear(ab, bpiL);
1267         bigpolymat_clear(ab, bpiR);
1268         bigpolymat_clear(ab, bpi);
1269     }
1270 
1271     bigpolymat_clear_model(model);
1272 
1273     if (rank == 0) {
1274         cout << "/* Cutoffs for "<<(m+n)<<"*"<<(m+n)<<"*"<<(m+n)<<" MPI products: */\n";
1275         cout << "#define MUL_MPI_CUTOFFS_" <<(m+n)<<"_"<<(m+n)<<"_"<<(m+n)
1276             << " " << finder.result() << endl;
1277     }
1278     gmp_randclear(rstate);
1279 }/*}}}*/
1280 #endif
1281 
lingen_tune_cutoffs(bw_dimensions & d,MPI_Comm comm MAYBE_UNUSED,cxx_param_list & pl)1282 void lingen_tune_cutoffs(bw_dimensions & d, MPI_Comm comm MAYBE_UNUSED, cxx_param_list & pl)
1283 {
1284     cxx_mpz p;
1285     gmp_randstate_t rstate;
1286 
1287     int catchsig=0;
1288 
1289     setbuf(stdout, NULL);
1290     setbuf(stderr, NULL);
1291 
1292     param_list_parse_uint(pl, "B", &bench_atleast_uptothis);
1293     param_list_parse_int(pl, "catchsig", &catchsig);
1294 
1295     abdst_field ab = d.ab;
1296     unsigned int m = d.m;
1297     unsigned int n = d.n;
1298     abfield_characteristic(d.ab, p);
1299 
1300     gmp_randinit_default(rstate);
1301     gmp_randseed_ui(rstate, 1);
1302 
1303     if (catchsig) {
1304 #ifdef  HAVE_SIGHUP
1305         catch_control_signals();
1306 #else
1307         fprintf(stderr, "Warning: support for signal trapping not enabled at compile time\n");
1308 #endif
1309     }
1310 
1311     printf("# Tuning for m=%d n=%d p=[%zu %u-bits words]\n",
1312             m, n, mpz_size(p), GMP_LIMB_BITS);
1313 
1314     /* XXX BUG: with depth adjustment==0 early on, we get check failures.
1315      * Must investigate */
1316 
1317     cutoff_list cl_mp = NULL;
1318     // lingen_tune_mp_fti_depth(ab, m, n, &cl_mp);
1319     lingen_tune_mp(ab, m, n, cl_mp);
1320 
1321     cutoff_list cl_mul = NULL;
1322     // lingen_tune_mul_fti_depth(ab, m, n, &cl_mul);
1323     lingen_tune_mul(ab, m, n, cl_mul);
1324 
1325     // int tune_bm_basecase = 1;
1326     int tune_mp = 1;
1327     /* record the list of cutoffs, and the method which wins from there
1328      */
1329 
1330     if (tune_mp) {
1331         /* Now for benching mp and plain mul */
1332         unsigned int maxtune = 10000 / (m*n);
1333         /* Bench the products which would come together with a k-steps
1334          * basecase algorithm. IOW a 2k, one-level recursive call incurs
1335          * twice the k-steps basecase, plus once the timings counted here
1336          * (presently, this has Karatsuba complexity)
1337          */
1338         for(unsigned int k = 10 ; k < maxtune ; k+= k/10) {
1339             unsigned int sE = k*(m+2*n)/(m+n);
1340             unsigned int spi = k*m/(m+n);
1341             polymat E(ab, m, m+n, sE);
1342             polymat piL(ab, m+n, m+n, spi);
1343             polymat piR(ab, m+n, m+n, spi);
1344             polymat pi(ab, m+n, m+n, spi*2);
1345             polymat Er(ab, m, m+n, sE-spi+1);
1346             E.set_size(sE);
1347             for(unsigned int v = 0 ; v < E.m * E.n * E.get_size() ; v++) {
1348                 abrandom(ab, abvec_coeff_ptr(ab, E.x, v), rstate);
1349             }
1350             piL.set_size(spi);
1351             piR.set_size(spi);
1352             for(unsigned int v = 0 ; v < piL.m * piL.n * piL.get_size() ; v++) {
1353                 abrandom(ab, abvec_coeff_ptr(ab, piL.x, v), rstate);
1354                 abrandom(ab, abvec_coeff_ptr(ab, piR.x, v), rstate);
1355             }
1356             double ttmp = 0, ttmul = 0;
1357             ttmp -= seconds();
1358             Er.mp(E, piL);
1359             ttmp += seconds();
1360             ttmul -= seconds();
1361             pi.mul(piL, piR);
1362             ttmul += seconds();
1363             double ttmpq = ttmp / (k*k);
1364             double ttmulq = ttmul / (k*k);
1365             double ttmpk = ttmp / pow(k, 1.58);
1366             double ttmulk = ttmul / pow(k, 1.58);
1367             printf("%u [%.2e+%.2e = %.2e] [%.2e+%.2e = %.2e]\n",
1368                     k,
1369                     ttmpq, ttmulq, ttmpq + ttmulq,
1370                     ttmpk, ttmulk, ttmpk + ttmulk
1371                     );
1372             // (seconds()-tt) / (k*k)); // ((sE-spi) * spi) / (m*(m+n)*(m+n)));
1373             // printf("%zu %.2e\n", E.get_size(), (seconds()-tt) / (k*k)); // (spi * spi) / ((m+n)*(m+n)*(m+n)));
1374         }
1375     }
1376 
1377 #if 0   /* for basecase with polymat */
1378     if (tune_bm_basecase) {
1379         bmstatus bm;
1380         bmstatus_init(bm, m, n);
1381         abfield_specify(bm->d.ab, MPFQ_PRIME_MPZ, (mpz_srcptr) p);
1382         unsigned int t0 = bm->t;
1383         unsigned int maxtune = 10000 / (m * n);
1384         for(unsigned int k = 10 ; k < maxtune ; k += k/10) {
1385             unsigned int * delta = new unsigned int[m + n];
1386             polymat E, pi;
1387             polymat_init(ab, pi, 0, 0, 0);
1388             polymat_init(ab, E, m, m+n, maxtune);
1389             E->size = k;
1390             for(unsigned int v = 0 ; v < E->m * E->n * E->size ; v++) {
1391                 abrandom(ab, E->x[v], rstate);
1392             }
1393             double tt = seconds();
1394             for(unsigned int j = 0 ; j < m + n ; delta[j++]=1);
1395             bm->t = 1;
1396             bw_lingen_basecase(bm, pi, E, delta);
1397             printf("%zu %.2e\n", E->size, (seconds()-tt) / (k * k));
1398             polymat_clear(ab, pi);
1399             polymat_clear(ab, E);
1400             delete[] delta;
1401         }
1402         bmstatus_clear(bm);
1403     }
1404 #endif
1405 
1406 #if 0
1407     /* This should normally be reasonably quick, and running it every
1408      * time can be considered as an option */
1409     if (rank == 0) {
1410         lingen_tune_mul(ab, m, n);
1411         lingen_tune_mp(ab, m, n);
1412     }
1413     extern polymat_cutoff_info polymat_mul_kara_cutoff;
1414     extern polymat_cutoff_info polymat_mp_kara_cutoff;
1415     bigpolymat_bcast_polymat_cutoff(&polymat_mul_kara_cutoff, 0, comm);
1416     bigpolymat_bcast_polymat_cutoff(&polymat_mp_kara_cutoff, 0, comm);
1417 
1418     lingen_tune_bigmul(ab, m, n, mpi[0]*thr[0], mpi[1]*thr[1], comm);
1419 #endif
1420 
1421     gmp_randclear(rstate);
1422 
1423     return;
1424 }
1425 
1426 
1427