xref: /netbsd/external/lgpl3/gmp/dist/tune/tuneup.c (revision 122966f8)
1 /* Create tuned thresholds for various algorithms.
2 
3 Copyright 1999-2003, 2005, 2006, 2008-2017 Free Software Foundation, Inc.
4 
5 This file is part of the GNU MP Library.
6 
7 The GNU MP Library is free software; you can redistribute it and/or modify
8 it under the terms of either:
9 
10   * the GNU Lesser General Public License as published by the Free
11     Software Foundation; either version 3 of the License, or (at your
12     option) any later version.
13 
14 or
15 
16   * the GNU General Public License as published by the Free Software
17     Foundation; either version 2 of the License, or (at your option) any
18     later version.
19 
20 or both in parallel, as here.
21 
22 The GNU MP Library is distributed in the hope that it will be useful, but
23 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25 for more details.
26 
27 You should have received copies of the GNU General Public License and the
28 GNU Lesser General Public License along with the GNU MP Library.  If not,
29 see https://www.gnu.org/licenses/.  */
30 
31 
32 /* Usage: tuneup [-t] [-t] [-p precision]
33 
34    -t turns on some diagnostic traces, a second -t turns on more traces.
35 
36    Notes:
37 
38    The code here isn't a vision of loveliness, mainly because it's subject
39    to ongoing changes according to new things wanting to be tuned, and
40    practical requirements of systems tested.
41 
42    Sometimes running the program twice produces slightly different results.
43    This is probably because there's so little separating algorithms near
44    their crossover, and on that basis it should make little or no difference
45    to the final speed of the relevant routines, but nothing has been done to
46    check that carefully.
47 
48    Algorithm:
49 
50    The thresholds are determined as follows.  A crossover may not be a
51    single size but rather a range where it oscillates between method A or
52    method B faster.  If the threshold is set making B used where A is faster
53    (or vice versa) that's bad.  Badness is the percentage time lost and
54    total badness is the sum of this over all sizes measured.  The threshold
55    is set to minimize total badness.
56 
57    Suppose, as sizes increase, method B becomes faster than method A.  The
58    effect of the rule is that, as you look at increasing sizes, isolated
59    points where B is faster are ignored, but when it's consistently faster,
60    or faster on balance, then the threshold is set there.  The same result
61    is obtained thinking in the other direction of A becoming faster at
62    smaller sizes.
63 
64    In practice the thresholds tend to be chosen to bring on the next
65    algorithm fairly quickly.
66 
67    This rule is attractive because it's got a basis in reason and is fairly
68    easy to implement, but no work has been done to actually compare it in
69    absolute terms to other possibilities.
70 
71    Implementation:
72 
73    In a normal library build the thresholds are constants.  To tune them
74    selected objects are recompiled with the thresholds as global variables
75    instead.  #define TUNE_PROGRAM_BUILD does this, with help from code at
76    the end of gmp-impl.h, and rules in tune/Makefile.am.
77 
78    MUL_TOOM22_THRESHOLD for example uses a recompiled mpn_mul_n.  The
79    threshold is set to "size+1" to avoid karatsuba, or to "size" to use one
80    level, but recurse into the basecase.
81 
82    MUL_TOOM33_THRESHOLD makes use of the tuned MUL_TOOM22_THRESHOLD value.
83    Other routines in turn will make use of both of those.  Naturally the
84    dependants must be tuned first.
85 
86    In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling,
87    just a threshold based on comparing two routines (mpn_divrem_1 and
88    mpn_divexact_1), and no further use of the value determined.
89 
90    Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being
91    just comparisons between certain routines on representative data.
92 
93    Shortcuts are applied when native (assembler) versions of routines exist.
94    For instance a native mpn_sqr_basecase is assumed to be always faster
95    than mpn_mul_basecase, with no measuring.
96 
97    No attempt is made to tune within assembler routines, for instance
98    DIVREM_1_NORM_THRESHOLD.  An assembler mpn_divrem_1 is expected to be
99    written and tuned all by hand.  Assembler routines that might have hard
100    limits are recompiled though, to make them accept a bigger range of sizes
101    than normal, eg. mpn_sqr_basecase to compare against mpn_toom2_sqr.
102 
103    Limitations:
104 
105    The FFTs aren't subject to the same badness rule as the other thresholds,
106    so each k is probably being brought on a touch early.  This isn't likely
107    to make a difference, and the simpler probing means fewer tests.
108 
109 */
110 
111 #define TUNE_PROGRAM_BUILD  1   /* for gmp-impl.h */
112 
113 #include "config.h"
114 
115 #include <math.h>
116 #include <stdio.h>
117 #include <stdlib.h>
118 #include <time.h>
119 #if HAVE_UNISTD_H
120 #include <unistd.h>
121 #endif
122 
123 #include "gmp-impl.h"
124 #include "longlong.h"
125 
126 #include "tests.h"
127 #include "speed.h"
128 
129 #if !HAVE_DECL_OPTARG
130 extern char *optarg;
131 extern int optind, opterr;
132 #endif
133 
134 
135 #define DEFAULT_MAX_SIZE   1000  /* limbs */
136 
137 #if WANT_FFT
138 mp_size_t  option_fft_max_size = 50000;  /* limbs */
139 #else
140 mp_size_t  option_fft_max_size = 0;
141 #endif
142 int        option_trace = 0;
143 int        option_fft_trace = 0;
144 struct speed_params  s;
145 
146 struct dat_t {
147   mp_size_t  size;
148   double     d;
149 } *dat = NULL;
150 int  ndat = 0;
151 int  allocdat = 0;
152 
153 /* This is not defined if mpn_sqr_basecase doesn't declare a limit.  In that
154    case use zero here, which for params.max_size means no limit.  */
155 #ifndef TUNE_SQR_TOOM2_MAX
156 #define TUNE_SQR_TOOM2_MAX  0
157 #endif
158 
159 mp_size_t  mul_toom22_threshold         = MP_SIZE_T_MAX;
160 mp_size_t  mul_toom33_threshold         = MUL_TOOM33_THRESHOLD_LIMIT;
161 mp_size_t  mul_toom44_threshold         = MUL_TOOM44_THRESHOLD_LIMIT;
162 mp_size_t  mul_toom6h_threshold         = MUL_TOOM6H_THRESHOLD_LIMIT;
163 mp_size_t  mul_toom8h_threshold         = MUL_TOOM8H_THRESHOLD_LIMIT;
164 mp_size_t  mul_toom32_to_toom43_threshold = MP_SIZE_T_MAX;
165 mp_size_t  mul_toom32_to_toom53_threshold = MP_SIZE_T_MAX;
166 mp_size_t  mul_toom42_to_toom53_threshold = MP_SIZE_T_MAX;
167 mp_size_t  mul_toom42_to_toom63_threshold = MP_SIZE_T_MAX;
168 mp_size_t  mul_toom43_to_toom54_threshold = MP_SIZE_T_MAX;
169 mp_size_t  mul_fft_threshold            = MP_SIZE_T_MAX;
170 mp_size_t  mul_fft_modf_threshold       = MP_SIZE_T_MAX;
171 mp_size_t  sqr_basecase_threshold       = MP_SIZE_T_MAX;
172 mp_size_t  sqr_toom2_threshold
173   = (TUNE_SQR_TOOM2_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_TOOM2_MAX);
174 mp_size_t  sqr_toom3_threshold          = SQR_TOOM3_THRESHOLD_LIMIT;
175 mp_size_t  sqr_toom4_threshold          = SQR_TOOM4_THRESHOLD_LIMIT;
176 mp_size_t  sqr_toom6_threshold          = SQR_TOOM6_THRESHOLD_LIMIT;
177 mp_size_t  sqr_toom8_threshold          = SQR_TOOM8_THRESHOLD_LIMIT;
178 mp_size_t  sqr_fft_threshold            = MP_SIZE_T_MAX;
179 mp_size_t  sqr_fft_modf_threshold       = MP_SIZE_T_MAX;
180 mp_size_t  mullo_basecase_threshold     = MP_SIZE_T_MAX;
181 mp_size_t  mullo_dc_threshold           = MP_SIZE_T_MAX;
182 mp_size_t  mullo_mul_n_threshold        = MP_SIZE_T_MAX;
183 mp_size_t  sqrlo_basecase_threshold     = MP_SIZE_T_MAX;
184 mp_size_t  sqrlo_dc_threshold           = MP_SIZE_T_MAX;
185 mp_size_t  sqrlo_sqr_threshold          = MP_SIZE_T_MAX;
186 mp_size_t  mulmid_toom42_threshold      = MP_SIZE_T_MAX;
187 mp_size_t  mulmod_bnm1_threshold        = MP_SIZE_T_MAX;
188 mp_size_t  sqrmod_bnm1_threshold        = MP_SIZE_T_MAX;
189 mp_size_t  div_qr_2_pi2_threshold       = MP_SIZE_T_MAX;
190 mp_size_t  dc_div_qr_threshold          = MP_SIZE_T_MAX;
191 mp_size_t  dc_divappr_q_threshold       = MP_SIZE_T_MAX;
192 mp_size_t  mu_div_qr_threshold          = MP_SIZE_T_MAX;
193 mp_size_t  mu_divappr_q_threshold       = MP_SIZE_T_MAX;
194 mp_size_t  mupi_div_qr_threshold        = MP_SIZE_T_MAX;
195 mp_size_t  mu_div_q_threshold           = MP_SIZE_T_MAX;
196 mp_size_t  dc_bdiv_qr_threshold         = MP_SIZE_T_MAX;
197 mp_size_t  dc_bdiv_q_threshold          = MP_SIZE_T_MAX;
198 mp_size_t  mu_bdiv_qr_threshold         = MP_SIZE_T_MAX;
199 mp_size_t  mu_bdiv_q_threshold          = MP_SIZE_T_MAX;
200 mp_size_t  inv_mulmod_bnm1_threshold    = MP_SIZE_T_MAX;
201 mp_size_t  inv_newton_threshold         = MP_SIZE_T_MAX;
202 mp_size_t  inv_appr_threshold           = MP_SIZE_T_MAX;
203 mp_size_t  binv_newton_threshold        = MP_SIZE_T_MAX;
204 mp_size_t  redc_1_to_redc_2_threshold   = MP_SIZE_T_MAX;
205 mp_size_t  redc_1_to_redc_n_threshold   = MP_SIZE_T_MAX;
206 mp_size_t  redc_2_to_redc_n_threshold   = MP_SIZE_T_MAX;
207 mp_size_t  matrix22_strassen_threshold  = MP_SIZE_T_MAX;
208 mp_size_t  hgcd_threshold               = MP_SIZE_T_MAX;
209 mp_size_t  hgcd_appr_threshold          = MP_SIZE_T_MAX;
210 mp_size_t  hgcd_reduce_threshold        = MP_SIZE_T_MAX;
211 mp_size_t  gcd_dc_threshold             = MP_SIZE_T_MAX;
212 mp_size_t  gcdext_dc_threshold          = MP_SIZE_T_MAX;
213 int	   div_qr_1n_pi1_method		= 0;
214 mp_size_t  div_qr_1_norm_threshold      = MP_SIZE_T_MAX;
215 mp_size_t  div_qr_1_unnorm_threshold    = MP_SIZE_T_MAX;
216 mp_size_t  divrem_1_norm_threshold      = MP_SIZE_T_MAX;
217 mp_size_t  divrem_1_unnorm_threshold    = MP_SIZE_T_MAX;
218 mp_size_t  mod_1_norm_threshold         = MP_SIZE_T_MAX;
219 mp_size_t  mod_1_unnorm_threshold       = MP_SIZE_T_MAX;
220 int	   mod_1_1p_method		= 0;
221 mp_size_t  mod_1n_to_mod_1_1_threshold  = MP_SIZE_T_MAX;
222 mp_size_t  mod_1u_to_mod_1_1_threshold  = MP_SIZE_T_MAX;
223 mp_size_t  mod_1_1_to_mod_1_2_threshold = MP_SIZE_T_MAX;
224 mp_size_t  mod_1_2_to_mod_1_4_threshold = MP_SIZE_T_MAX;
225 mp_size_t  preinv_mod_1_to_mod_1_threshold = MP_SIZE_T_MAX;
226 mp_size_t  divrem_2_threshold           = MP_SIZE_T_MAX;
227 mp_size_t  get_str_dc_threshold         = MP_SIZE_T_MAX;
228 mp_size_t  get_str_precompute_threshold = MP_SIZE_T_MAX;
229 mp_size_t  set_str_dc_threshold         = MP_SIZE_T_MAX;
230 mp_size_t  set_str_precompute_threshold = MP_SIZE_T_MAX;
231 mp_size_t  fac_odd_threshold            = 0;
232 mp_size_t  fac_dsc_threshold            = FAC_DSC_THRESHOLD_LIMIT;
233 
234 mp_size_t  fft_modf_sqr_threshold = MP_SIZE_T_MAX;
235 mp_size_t  fft_modf_mul_threshold = MP_SIZE_T_MAX;
236 
237 struct param_t {
238   const char        *name;
239   speed_function_t  function;
240   speed_function_t  function2;
241   double            step_factor;    /* how much to step relatively */
242   int               step;           /* how much to step absolutely */
243   double            function_fudge; /* multiplier for "function" speeds */
244   int               stop_since_change;
245   double            stop_factor;
246   mp_size_t         min_size;
247   int               min_is_always;
248   mp_size_t         max_size;
249   mp_size_t         check_size;
250   mp_size_t         size_extra;
251 
252 #define DATA_HIGH_LT_R  1
253 #define DATA_HIGH_GE_R  2
254   int               data_high;
255 
256   int               noprint;
257 };
258 
259 
260 /* These are normally undefined when false, which suits "#if" fine.
261    But give them zero values so they can be used in plain C "if"s.  */
262 #ifndef UDIV_PREINV_ALWAYS
263 #define UDIV_PREINV_ALWAYS 0
264 #endif
265 #ifndef HAVE_NATIVE_mpn_divexact_1
266 #define HAVE_NATIVE_mpn_divexact_1 0
267 #endif
268 #ifndef HAVE_NATIVE_mpn_div_qr_1n_pi1
269 #define HAVE_NATIVE_mpn_div_qr_1n_pi1 0
270 #endif
271 #ifndef HAVE_NATIVE_mpn_divrem_1
272 #define HAVE_NATIVE_mpn_divrem_1 0
273 #endif
274 #ifndef HAVE_NATIVE_mpn_divrem_2
275 #define HAVE_NATIVE_mpn_divrem_2 0
276 #endif
277 #ifndef HAVE_NATIVE_mpn_mod_1
278 #define HAVE_NATIVE_mpn_mod_1 0
279 #endif
280 #ifndef HAVE_NATIVE_mpn_mod_1_1p
281 #define HAVE_NATIVE_mpn_mod_1_1p 0
282 #endif
283 #ifndef HAVE_NATIVE_mpn_modexact_1_odd
284 #define HAVE_NATIVE_mpn_modexact_1_odd 0
285 #endif
286 #ifndef HAVE_NATIVE_mpn_preinv_divrem_1
287 #define HAVE_NATIVE_mpn_preinv_divrem_1 0
288 #endif
289 #ifndef HAVE_NATIVE_mpn_preinv_mod_1
290 #define HAVE_NATIVE_mpn_preinv_mod_1 0
291 #endif
292 #ifndef HAVE_NATIVE_mpn_sqr_basecase
293 #define HAVE_NATIVE_mpn_sqr_basecase 0
294 #endif
295 
296 
297 #define MAX3(a,b,c)  MAX (MAX (a, b), c)
298 
299 mp_limb_t
randlimb_norm(void)300 randlimb_norm (void)
301 {
302   mp_limb_t  n;
303   mpn_random (&n, 1);
304   n |= GMP_NUMB_HIGHBIT;
305   return n;
306 }
307 
308 #define GMP_NUMB_HALFMASK  ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1)
309 
310 mp_limb_t
randlimb_half(void)311 randlimb_half (void)
312 {
313   mp_limb_t  n;
314   mpn_random (&n, 1);
315   n &= GMP_NUMB_HALFMASK;
316   n += (n==0);
317   return n;
318 }
319 
320 
321 /* Add an entry to the end of the dat[] array, reallocing to make it bigger
322    if necessary.  */
323 void
add_dat(mp_size_t size,double d)324 add_dat (mp_size_t size, double d)
325 {
326 #define ALLOCDAT_STEP  500
327 
328   ASSERT_ALWAYS (ndat <= allocdat);
329 
330   if (ndat == allocdat)
331     {
332       dat = (struct dat_t *) __gmp_allocate_or_reallocate
333         (dat, allocdat * sizeof(dat[0]),
334          (allocdat+ALLOCDAT_STEP) * sizeof(dat[0]));
335       allocdat += ALLOCDAT_STEP;
336     }
337 
338   dat[ndat].size = size;
339   dat[ndat].d = d;
340   ndat++;
341 }
342 
343 
344 /* Return the threshold size based on the data accumulated. */
345 mp_size_t
analyze_dat(int final)346 analyze_dat (int final)
347 {
348   double  x, min_x;
349   int     j, min_j;
350 
351   /* If the threshold is set at dat[0].size, any positive values are bad. */
352   x = 0.0;
353   for (j = 0; j < ndat; j++)
354     if (dat[j].d > 0.0)
355       x += dat[j].d;
356 
357   if (option_trace >= 2 && final)
358     {
359       printf ("\n");
360       printf ("x is the sum of the badness from setting thresh at given size\n");
361       printf ("  (minimum x is sought)\n");
362       printf ("size=%ld  first x=%.4f\n", (long) dat[j].size, x);
363     }
364 
365   min_x = x;
366   min_j = 0;
367 
368 
369   /* When stepping to the next dat[j].size, positive values are no longer
370      bad (so subtracted), negative values become bad (so add the absolute
371      value, meaning subtract). */
372   for (j = 0; j < ndat; x -= dat[j].d, j++)
373     {
374       if (option_trace >= 2 && final)
375         printf ("size=%ld  x=%.4f\n", (long) dat[j].size, x);
376 
377       if (x < min_x)
378         {
379           min_x = x;
380           min_j = j;
381         }
382     }
383 
384   return min_j;
385 }
386 
387 
388 /* Measuring for recompiled mpn/generic/div_qr_1.c,
389  * mpn/generic/divrem_1.c, mpn/generic/mod_1.c and mpz/fac_ui.c */
390 
391 mp_limb_t mpn_div_qr_1_tune (mp_ptr, mp_limb_t *, mp_srcptr, mp_size_t, mp_limb_t);
392 
393 #if defined (__cplusplus)
394 extern "C" {
395 #endif
396 
397 mp_limb_t mpn_divrem_1_tune (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
398 mp_limb_t mpn_mod_1_tune (mp_srcptr, mp_size_t, mp_limb_t);
399 void mpz_fac_ui_tune (mpz_ptr, unsigned long);
400 
401 #if defined (__cplusplus)
402 }
403 #endif
404 
405 double
speed_mpn_mod_1_tune(struct speed_params * s)406 speed_mpn_mod_1_tune (struct speed_params *s)
407 {
408   SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune);
409 }
410 double
speed_mpn_divrem_1_tune(struct speed_params * s)411 speed_mpn_divrem_1_tune (struct speed_params *s)
412 {
413   SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune);
414 }
415 double
speed_mpz_fac_ui_tune(struct speed_params * s)416 speed_mpz_fac_ui_tune (struct speed_params *s)
417 {
418   SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui_tune);
419 }
420 double
speed_mpn_div_qr_1_tune(struct speed_params * s)421 speed_mpn_div_qr_1_tune (struct speed_params *s)
422 {
423   SPEED_ROUTINE_MPN_DIV_QR_1 (mpn_div_qr_1_tune);
424 }
425 
426 double
tuneup_measure(speed_function_t fun,const struct param_t * param,struct speed_params * s)427 tuneup_measure (speed_function_t fun,
428                 const struct param_t *param,
429                 struct speed_params *s)
430 {
431   static struct param_t  dummy;
432   double   t;
433   TMP_DECL;
434 
435   if (! param)
436     param = &dummy;
437 
438   s->size += param->size_extra;
439 
440   TMP_MARK;
441   SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0);
442   SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0);
443 
444   mpn_random (s->xp, s->size);
445   mpn_random (s->yp, s->size);
446 
447   switch (param->data_high) {
448   case DATA_HIGH_LT_R:
449     s->xp[s->size-1] %= s->r;
450     s->yp[s->size-1] %= s->r;
451     break;
452   case DATA_HIGH_GE_R:
453     s->xp[s->size-1] |= s->r;
454     s->yp[s->size-1] |= s->r;
455     break;
456   }
457 
458   t = speed_measure (fun, s);
459 
460   s->size -= param->size_extra;
461 
462   TMP_FREE;
463   return t;
464 }
465 
466 
467 #define PRINT_WIDTH  31
468 
469 void
print_define_start(const char * name)470 print_define_start (const char *name)
471 {
472   printf ("#define %-*s  ", PRINT_WIDTH, name);
473   if (option_trace)
474     printf ("...\n");
475 }
476 
477 void
print_define_end_remark(const char * name,mp_size_t value,const char * remark)478 print_define_end_remark (const char *name, mp_size_t value, const char *remark)
479 {
480   if (option_trace)
481     printf ("#define %-*s  ", PRINT_WIDTH, name);
482 
483   if (value == MP_SIZE_T_MAX)
484     printf ("MP_SIZE_T_MAX");
485   else
486     printf ("%5ld", (long) value);
487 
488   if (remark != NULL)
489     printf ("  /* %s */", remark);
490   printf ("\n");
491   fflush (stdout);
492 }
493 
494 void
print_define_end(const char * name,mp_size_t value)495 print_define_end (const char *name, mp_size_t value)
496 {
497   const char  *remark;
498   if (value == MP_SIZE_T_MAX)
499     remark = "never";
500   else if (value == 0)
501     remark = "always";
502   else
503     remark = NULL;
504   print_define_end_remark (name, value, remark);
505 }
506 
507 void
print_define(const char * name,mp_size_t value)508 print_define (const char *name, mp_size_t value)
509 {
510   print_define_start (name);
511   print_define_end (name, value);
512 }
513 
514 void
print_define_remark(const char * name,mp_size_t value,const char * remark)515 print_define_remark (const char *name, mp_size_t value, const char *remark)
516 {
517   print_define_start (name);
518   print_define_end_remark (name, value, remark);
519 }
520 
521 void
print_define_with_speedup(const char * name,mp_size_t value,mp_size_t runner_up,double speedup)522 print_define_with_speedup (const char *name, mp_size_t value,
523 			   mp_size_t runner_up, double speedup)
524 {
525   char buf[100];
526 #if __STDC_VERSION__ >= 199901L
527   snprintf (buf, sizeof buf, "%.2f%% faster than %ld",
528 	    100.0 * (speedup - 1), runner_up);
529 #else
530   sprintf (buf, "%.2f%% faster than %ld",
531 	    100.0 * (speedup - 1), runner_up);
532 #endif
533   print_define_remark (name, value, buf);
534 }
535 
536 void
one(mp_size_t * threshold,struct param_t * param)537 one (mp_size_t *threshold, struct param_t *param)
538 {
539   int  since_positive, since_thresh_change;
540   int  thresh_idx, new_thresh_idx;
541 
542 #define DEFAULT(x,n)  do { if (! (x))  (x) = (n); } while (0)
543 
544   DEFAULT (param->function_fudge, 1.0);
545   DEFAULT (param->function2, param->function);
546   DEFAULT (param->step_factor, 0.01);  /* small steps by default */
547   DEFAULT (param->step, 1);            /* small steps by default */
548   DEFAULT (param->stop_since_change, 80);
549   DEFAULT (param->stop_factor, 1.2);
550   DEFAULT (param->min_size, 10);
551   DEFAULT (param->max_size, DEFAULT_MAX_SIZE);
552 
553   if (param->check_size != 0)
554     {
555       double   t1, t2;
556       s.size = param->check_size;
557 
558       *threshold = s.size+1;
559       t1 = tuneup_measure (param->function, param, &s);
560 
561       *threshold = s.size;
562       t2 = tuneup_measure (param->function2, param, &s);
563       if (t1 == -1.0 || t2 == -1.0)
564         {
565           printf ("Oops, can't run both functions at size %ld\n",
566                   (long) s.size);
567           abort ();
568         }
569       t1 *= param->function_fudge;
570 
571       /* ask that t2 is at least 4% below t1 */
572       if (t1 < t2*1.04)
573         {
574           if (option_trace)
575             printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2);
576           *threshold = MP_SIZE_T_MAX;
577           if (! param->noprint)
578             print_define (param->name, *threshold);
579           return;
580         }
581 
582       if (option_trace >= 2)
583         printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n",
584                 (long) s.size, t1, t2);
585     }
586 
587   if (! param->noprint || option_trace)
588     print_define_start (param->name);
589 
590   ndat = 0;
591   since_positive = 0;
592   since_thresh_change = 0;
593   thresh_idx = 0;
594 
595   if (option_trace >= 2)
596     {
597       printf ("             algorithm-A  algorithm-B   ratio  possible\n");
598       printf ("              (seconds)    (seconds)    diff    thresh\n");
599     }
600 
601   for (s.size = param->min_size;
602        s.size < param->max_size;
603        s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), param->step))
604     {
605       double   ti, tiplus1, d;
606 
607       /*
608         FIXME: check minimum size requirements are met, possibly by just
609         checking for the -1 returns from the speed functions.
610       */
611 
612       /* using method A at this size */
613       *threshold = s.size+1;
614       ti = tuneup_measure (param->function, param, &s);
615       if (ti == -1.0)
616         abort ();
617       ti *= param->function_fudge;
618 
619       /* using method B at this size */
620       *threshold = s.size;
621       tiplus1 = tuneup_measure (param->function2, param, &s);
622       if (tiplus1 == -1.0)
623         abort ();
624 
625       /* Calculate the fraction by which the one or the other routine is
626          slower.  */
627       if (tiplus1 >= ti)
628         d = (tiplus1 - ti) / tiplus1;  /* negative */
629       else
630         d = (tiplus1 - ti) / ti;       /* positive */
631 
632       add_dat (s.size, d);
633 
634       new_thresh_idx = analyze_dat (0);
635 
636       if (option_trace >= 2)
637         printf ("size=%ld  %.9f  %.9f  % .4f %c  %ld\n",
638                 (long) s.size, ti, tiplus1, d,
639                 ti > tiplus1 ? '#' : ' ',
640                 (long) dat[new_thresh_idx].size);
641 
642       /* Stop if the last time method i was faster was more than a
643          certain number of measurements ago.  */
644 #define STOP_SINCE_POSITIVE  200
645       if (d >= 0)
646         since_positive = 0;
647       else
648         if (++since_positive > STOP_SINCE_POSITIVE)
649           {
650             if (option_trace >= 1)
651               printf ("stopped due to since_positive (%d)\n",
652                       STOP_SINCE_POSITIVE);
653             break;
654           }
655 
656       /* Stop if method A has become slower by a certain factor. */
657       if (ti >= tiplus1 * param->stop_factor)
658         {
659           if (option_trace >= 1)
660             printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n",
661                     param->stop_factor);
662           break;
663         }
664 
665       /* Stop if the threshold implied hasn't changed in a certain
666          number of measurements.  (It's this condition that usually
667          stops the loop.) */
668       if (thresh_idx != new_thresh_idx)
669         since_thresh_change = 0, thresh_idx = new_thresh_idx;
670       else
671         if (++since_thresh_change > param->stop_since_change)
672           {
673             if (option_trace >= 1)
674               printf ("stopped due to since_thresh_change (%d)\n",
675                       param->stop_since_change);
676             break;
677           }
678 
679       /* Stop if the threshold implied is more than a certain number of
680          measurements ago.  */
681 #define STOP_SINCE_AFTER   500
682       if (ndat - thresh_idx > STOP_SINCE_AFTER)
683         {
684           if (option_trace >= 1)
685             printf ("stopped due to ndat - thresh_idx > amount (%d)\n",
686                     STOP_SINCE_AFTER);
687           break;
688         }
689 
690       /* Stop when the size limit is reached before the end of the
691          crossover, but only show this as an error for >= the default max
692          size.  FIXME: Maybe should make it a param choice whether this is
693          an error.  */
694       if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE)
695         {
696           fprintf (stderr, "%s\n", param->name);
697           fprintf (stderr, "sizes %ld to %ld total %d measurements\n",
698                    (long) dat[0].size, (long) dat[ndat-1].size, ndat);
699           fprintf (stderr, "    max size reached before end of crossover\n");
700           break;
701         }
702     }
703 
704   if (option_trace >= 1)
705     printf ("sizes %ld to %ld total %d measurements\n",
706             (long) dat[0].size, (long) dat[ndat-1].size, ndat);
707 
708   *threshold = dat[analyze_dat (1)].size;
709 
710   if (param->min_is_always)
711     {
712       if (*threshold == param->min_size)
713         *threshold = 0;
714     }
715 
716   if (! param->noprint || option_trace)
717     print_define_end (param->name, *threshold);
718 }
719 
720 /* Time N different FUNCTIONS with the same parameters and size, to
721    select the fastest. Since *_METHOD defines start numbering from
722    one, if functions[i] is fastest, the value of the define is i+1.
723    Also output a comment with speedup compared to the next fastest
724    function. The NAME argument is used only for trace output.
725 
726    Returns the index of the fastest function.
727 */
728 int
one_method(int n,speed_function_t * functions,const char * name,const char * define,const struct param_t * param)729 one_method (int n, speed_function_t *functions,
730 	    const char *name, const char *define,
731 	    const struct param_t *param)
732 {
733   double *t;
734   int i;
735   int method;
736   int method_runner_up;
737 
738   TMP_DECL;
739   TMP_MARK;
740   t = (double*) TMP_ALLOC (n * sizeof (*t));
741 
742   for (i = 0; i < n; i++)
743     {
744       t[i] = tuneup_measure (functions[i], param, &s);
745       if (option_trace >= 1)
746 	printf ("size=%ld, %s, method %d %.9f\n",
747 		(long) s.size, name, i + 1, t[i]);
748       if (t[i] == -1.0)
749 	{
750 	  printf ("Oops, can't measure all %s methods\n", name);
751 	  abort ();
752 	}
753     }
754   method = 0;
755   for (i = 1; i < n; i++)
756     if (t[i] < t[method])
757       method = i;
758 
759   method_runner_up = (method == 0);
760   for (i = 0; i < n; i++)
761     if (i != method && t[i] < t[method_runner_up])
762       method_runner_up = i;
763 
764   print_define_with_speedup (define, method + 1, method_runner_up + 1,
765 			     t[method_runner_up] / t[method]);
766 
767   TMP_FREE;
768   return method;
769 }
770 
771 
772 /* Special probing for the fft thresholds.  The size restrictions on the
773    FFTs mean the graph of time vs size has a step effect.  See this for
774    example using
775 
776        ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9
777        gnuplot foo.gnuplot
778 
779    The current approach is to compare routines at the midpoint of relevant
780    steps.  Arguably a more sophisticated system of threshold data is wanted
781    if this step effect remains. */
782 
783 struct fft_param_t {
784   const char        *table_name;
785   const char        *threshold_name;
786   const char        *modf_threshold_name;
787   mp_size_t         *p_threshold;
788   mp_size_t         *p_modf_threshold;
789   mp_size_t         first_size;
790   mp_size_t         max_size;
791   speed_function_t  function;
792   speed_function_t  mul_modf_function;
793   speed_function_t  mul_function;
794   mp_size_t         sqr;
795 };
796 
797 
798 /* mpn_mul_fft requires pl a multiple of 2^k limbs, but with
799    N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple
800    of 2^(k-1) bits. */
801 
802 mp_size_t
fft_step_size(int k)803 fft_step_size (int k)
804 {
805   mp_size_t  step;
806 
807   step = MAX ((mp_size_t) 1 << (k-1), GMP_LIMB_BITS) / GMP_LIMB_BITS;
808   step *= (mp_size_t) 1 << k;
809 
810   if (step <= 0)
811     {
812       printf ("Can't handle k=%d\n", k);
813       abort ();
814     }
815 
816   return step;
817 }
818 
819 mp_size_t
fft_next_size(mp_size_t pl,int k)820 fft_next_size (mp_size_t pl, int k)
821 {
822   mp_size_t  m = fft_step_size (k);
823 
824 /*    printf ("[k=%d %ld] %ld ->", k, m, pl); */
825 
826   if (pl == 0 || (pl & (m-1)) != 0)
827     pl = (pl | (m-1)) + 1;
828 
829 /*    printf (" %ld\n", pl); */
830   return pl;
831 }
832 
833 #define NMAX_DEFAULT 1000000
834 #define MAX_REPS 25
835 #define MIN_REPS 5
836 
837 static inline size_t
mpn_mul_fft_lcm(size_t a,unsigned int k)838 mpn_mul_fft_lcm (size_t a, unsigned int k)
839 {
840   unsigned int l = k;
841 
842   while (a % 2 == 0 && k > 0)
843     {
844       a >>= 1;
845       k--;
846     }
847   return a << l;
848 }
849 
850 mp_size_t
fftfill(mp_size_t pl,int k,int sqr)851 fftfill (mp_size_t pl, int k, int sqr)
852 {
853   mp_size_t maxLK;
854   mp_bitcnt_t N, Nprime, nprime, M;
855 
856   N = pl * GMP_NUMB_BITS;
857   M = N >> k;
858 
859   maxLK = mpn_mul_fft_lcm ((unsigned long) GMP_NUMB_BITS, k);
860 
861   Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK;
862   nprime = Nprime / GMP_NUMB_BITS;
863   if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
864     {
865       size_t K2;
866       for (;;)
867 	{
868 	  K2 = 1L << mpn_fft_best_k (nprime, sqr);
869 	  if ((nprime & (K2 - 1)) == 0)
870 	    break;
871 	  nprime = (nprime + K2 - 1) & -K2;
872 	  Nprime = nprime * GMP_LIMB_BITS;
873 	}
874     }
875   ASSERT_ALWAYS (nprime < pl);
876 
877   return Nprime;
878 }
879 
880 static int
compare_double(const void * ap,const void * bp)881 compare_double (const void *ap, const void *bp)
882 {
883   double a = * (const double *) ap;
884   double b = * (const double *) bp;
885 
886   if (a < b)
887     return -1;
888   else if (a > b)
889     return 1;
890   else
891     return 0;
892 }
893 
894 double
median(double * times,int n)895 median (double *times, int n)
896 {
897   qsort (times, n, sizeof (double), compare_double);
898   return times[n/2];
899 }
900 
901 #define FFT_CACHE_SIZE 25
902 typedef struct fft_cache
903 {
904   mp_size_t n;
905   double time;
906 } fft_cache_t;
907 
908 fft_cache_t fft_cache[FFT_CACHE_SIZE];
909 
910 double
cached_measure(mp_ptr rp,mp_srcptr ap,mp_srcptr bp,mp_size_t n,int k,int n_measurements)911 cached_measure (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, int k,
912 		int n_measurements)
913 {
914   int i;
915   double t, ttab[MAX_REPS];
916 
917   if (fft_cache[k].n == n)
918     return fft_cache[k].time;
919 
920   for (i = 0; i < n_measurements; i++)
921     {
922       speed_starttime ();
923       mpn_mul_fft (rp, n, ap, n, bp, n, k);
924       ttab[i] = speed_endtime ();
925     }
926 
927   t = median (ttab, n_measurements);
928   fft_cache[k].n = n;
929   fft_cache[k].time = t;
930   return t;
931 }
932 
933 #define INSERT_FFTTAB(idx, nval, kval)					\
934   do {									\
935     fft_tab[idx].n = nval;						\
936     fft_tab[idx].k = kval;						\
937     fft_tab[idx+1].n = (1 << 27) - 1;	/* sentinel, 27b wide field */	\
938     fft_tab[idx+1].k = (1 <<  5) - 1;					\
939   } while (0)
940 
941 int
fftmes(mp_size_t nmin,mp_size_t nmax,int initial_k,struct fft_param_t * p,int idx,int print)942 fftmes (mp_size_t nmin, mp_size_t nmax, int initial_k, struct fft_param_t *p, int idx, int print)
943 {
944   mp_size_t n, n1, prev_n1;
945   int k, best_k, last_best_k, kmax;
946   int eff, prev_eff;
947   double t0, t1;
948   int n_measurements;
949   mp_limb_t *ap, *bp, *rp;
950   mp_size_t alloc;
951   struct fft_table_nk *fft_tab;
952 
953   fft_tab = mpn_fft_table3[p->sqr];
954 
955   for (k = 0; k < FFT_CACHE_SIZE; k++)
956     fft_cache[k].n = 0;
957 
958   if (nmin < (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
959     {
960       nmin = (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD);
961     }
962 
963   if (print)
964     printf ("#define %s%*s", p->table_name, 38, "");
965 
966   if (idx == 0)
967     {
968       INSERT_FFTTAB (0, nmin, initial_k);
969 
970       if (print)
971 	{
972 	  printf ("\\\n  { ");
973 	  printf ("{%7u,%2u}", fft_tab[0].n, fft_tab[0].k);
974 	}
975 
976       idx = 1;
977     }
978 
979   ap = (mp_ptr) malloc (sizeof (mp_limb_t));
980   if (p->sqr)
981     bp = ap;
982   else
983     bp = (mp_ptr) malloc (sizeof (mp_limb_t));
984   rp = (mp_ptr) malloc (sizeof (mp_limb_t));
985   alloc = 1;
986 
987   /* Round n to comply to initial k value */
988   n = (nmin + ((1ul << initial_k) - 1)) & (MP_SIZE_T_MAX << initial_k);
989 
990   n_measurements = (18 - initial_k) | 1;
991   n_measurements = MAX (n_measurements, MIN_REPS);
992   n_measurements = MIN (n_measurements, MAX_REPS);
993 
994   last_best_k = initial_k;
995   best_k = initial_k;
996 
997   while (n < nmax)
998     {
999       int start_k, end_k;
1000 
1001       /* Assume the current best k is best until we hit its next FFT step.  */
1002       t0 = 99999;
1003 
1004       prev_n1 = n + 1;
1005 
1006       start_k = MAX (4, best_k - 4);
1007       end_k = MIN (24, best_k + 4);
1008       for (k = start_k; k <= end_k; k++)
1009 	{
1010           n1 = mpn_fft_next_size (prev_n1, k);
1011 
1012 	  eff = 200 * (n1 * GMP_NUMB_BITS >> k) / fftfill (n1, k, p->sqr);
1013 
1014 	  if (eff < 70)		/* avoid measuring too slow fft:s */
1015 	    continue;
1016 
1017 	  if (n1 > alloc)
1018 	    {
1019 	      alloc = n1;
1020 	      if (p->sqr)
1021 		{
1022 		  ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t));
1023 		  rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t));
1024 		  ap = bp = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t));
1025 		  mpn_random (ap, alloc);
1026 		  rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t));
1027 		}
1028 	      else
1029 		{
1030 		  ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t));
1031 		  bp = (mp_ptr) realloc (bp, sizeof (mp_limb_t));
1032 		  rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t));
1033 		  ap = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t));
1034 		  mpn_random (ap, alloc);
1035 		  bp = (mp_ptr) realloc (bp, alloc * sizeof (mp_limb_t));
1036 		  mpn_random (bp, alloc);
1037 		  rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t));
1038 		}
1039 	    }
1040 
1041 	  t1 = cached_measure (rp, ap, bp, n1, k, n_measurements);
1042 
1043 	  if (t1 * n_measurements > 0.3)
1044 	    n_measurements -= 2;
1045 	  n_measurements = MAX (n_measurements, MIN_REPS);
1046 
1047 	  if (t1 < t0)
1048 	    {
1049 	      best_k = k;
1050 	      t0 = t1;
1051 	    }
1052 	}
1053 
1054       n1 = mpn_fft_next_size (prev_n1, best_k);
1055 
1056       if (last_best_k != best_k)
1057 	{
1058 	  ASSERT_ALWAYS ((prev_n1 & ((1ul << last_best_k) - 1)) == 1);
1059 
1060 	  if (idx >= FFT_TABLE3_SIZE)
1061 	    {
1062 	      printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
1063 	      abort ();
1064 	    }
1065 	  INSERT_FFTTAB (idx, prev_n1 >> last_best_k, best_k);
1066 
1067 	  if (print)
1068 	    {
1069 	      printf (", ");
1070 	      if (idx % 4 == 0)
1071 		printf ("\\\n    ");
1072 	      printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
1073 	    }
1074 
1075 	  if (option_trace >= 2)
1076 	    {
1077 	      printf ("{%lu,%u}\n", prev_n1, best_k);
1078 	      fflush (stdout);
1079 	    }
1080 
1081 	  last_best_k = best_k;
1082 	  idx++;
1083 	}
1084 
1085       for (;;)
1086 	{
1087 	  prev_n1 = n1;
1088 	  prev_eff = fftfill (prev_n1, best_k, p->sqr);
1089 	  n1 = mpn_fft_next_size (prev_n1 + 1, best_k);
1090 	  eff = fftfill (n1, best_k, p->sqr);
1091 
1092 	  if (eff != prev_eff)
1093 	    break;
1094 	}
1095 
1096       n = prev_n1;
1097     }
1098 
1099   kmax = sizeof (mp_size_t) * 4;	/* GMP_MP_SIZE_T_BITS / 2 */
1100   kmax = MIN (kmax, 25-1);
1101   for (k = last_best_k + 1; k <= kmax; k++)
1102     {
1103       if (idx >= FFT_TABLE3_SIZE)
1104 	{
1105 	  printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
1106 	  abort ();
1107 	}
1108       INSERT_FFTTAB (idx, ((1ul << (2*k-2)) + 1) >> (k-1), k);
1109 
1110       if (print)
1111 	{
1112 	  printf (", ");
1113 	  if (idx % 4 == 0)
1114 	    printf ("\\\n    ");
1115 	  printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
1116 	}
1117 
1118       idx++;
1119     }
1120 
1121   if (print)
1122     printf (" }\n");
1123 
1124   free (ap);
1125   if (! p->sqr)
1126     free (bp);
1127   free (rp);
1128 
1129   return idx;
1130 }
1131 
1132 void
fft(struct fft_param_t * p)1133 fft (struct fft_param_t *p)
1134 {
1135   mp_size_t  size;
1136   int        k, idx, initial_k;
1137 
1138   /*** Generate MUL_FFT_MODF_THRESHOLD / SQR_FFT_MODF_THRESHOLD ***/
1139 
1140 #if 1
1141   {
1142     /* Use plain one() mechanism, for some reasonable initial values of k.  The
1143        advantage is that we don't depend on mpn_fft_table3, which can therefore
1144        leave it completely uninitialized.  */
1145 
1146     static struct param_t param;
1147     mp_size_t thres, best_thres;
1148     int best_k;
1149     char buf[20];
1150 
1151     best_thres = MP_SIZE_T_MAX;
1152     best_k = -1;
1153 
1154     for (k = 5; k <= 7; k++)
1155       {
1156 	param.name = p->modf_threshold_name;
1157 	param.min_size = 100;
1158 	param.max_size = 2000;
1159 	param.function  = p->mul_function;
1160 	param.step_factor = 0.0;
1161 	param.step = 4;
1162 	param.function2 = p->mul_modf_function;
1163 	param.noprint = 1;
1164 	s.r = k;
1165 	one (&thres, &param);
1166 	if (thres < best_thres)
1167 	  {
1168 	    best_thres = thres;
1169 	    best_k = k;
1170 	  }
1171       }
1172 
1173     *(p->p_modf_threshold) = best_thres;
1174     sprintf (buf, "k = %d", best_k);
1175     print_define_remark (p->modf_threshold_name, best_thres, buf);
1176     initial_k = best_k;
1177   }
1178 #else
1179   size = p->first_size;
1180   for (;;)
1181     {
1182       double  tk, tm;
1183 
1184       size = mpn_fft_next_size (size+1, mpn_fft_best_k (size+1, p->sqr));
1185       k = mpn_fft_best_k (size, p->sqr);
1186 
1187       if (size >= p->max_size)
1188         break;
1189 
1190       s.size = size + fft_step_size (k) / 2;
1191       s.r = k;
1192       tk = tuneup_measure (p->mul_modf_function, NULL, &s);
1193       if (tk == -1.0)
1194         abort ();
1195 
1196       tm = tuneup_measure (p->mul_function, NULL, &s);
1197       if (tm == -1.0)
1198         abort ();
1199 
1200       if (option_trace >= 2)
1201         printf ("at %ld   size=%ld  k=%d  %.9f   size=%ld modf %.9f\n",
1202                 (long) size,
1203                 (long) size + fft_step_size (k) / 2, k, tk,
1204                 (long) s.size, tm);
1205 
1206       if (tk < tm)
1207         {
1208 	  *p->p_modf_threshold = s.size;
1209 	  print_define (p->modf_threshold_name, *p->p_modf_threshold);
1210 	  break;
1211         }
1212     }
1213   initial_k = ?;
1214 #endif
1215 
1216   /*** Generate MUL_FFT_TABLE3 / SQR_FFT_TABLE3 ***/
1217 
1218   idx = fftmes (*p->p_modf_threshold, p->max_size, initial_k, p, 0, 1);
1219   printf ("#define %s_SIZE %d\n", p->table_name, idx);
1220 
1221   /*** Generate MUL_FFT_THRESHOLD / SQR_FFT_THRESHOLD ***/
1222 
1223   size = 2 * *p->p_modf_threshold;	/* OK? */
1224   for (;;)
1225     {
1226       double  tk, tm;
1227       mp_size_t mulmod_size, mul_size;;
1228 
1229       if (size >= p->max_size)
1230         break;
1231 
1232       mulmod_size = mpn_mulmod_bnm1_next_size (2 * (size + 1)) / 2;
1233       mul_size = (size + mulmod_size) / 2;	/* middle of step */
1234 
1235       s.size = mulmod_size;
1236       tk = tuneup_measure (p->function, NULL, &s);
1237       if (tk == -1.0)
1238         abort ();
1239 
1240       s.size = mul_size;
1241       tm = tuneup_measure (p->mul_function, NULL, &s);
1242       if (tm == -1.0)
1243         abort ();
1244 
1245       if (option_trace >= 2)
1246         printf ("at %ld   size=%ld  %.9f   size=%ld mul %.9f\n",
1247                 (long) size,
1248                 (long) mulmod_size, tk,
1249                 (long) mul_size, tm);
1250 
1251       size = mulmod_size;
1252 
1253       if (tk < tm)
1254         {
1255 	  *p->p_threshold = s.size;
1256 	  print_define (p->threshold_name, *p->p_threshold);
1257 	  break;
1258         }
1259     }
1260 }
1261 
1262 /* Compare mpn_mul_1 to whatever fast exact single-limb division we have.  This
1263    is currently mpn_divexact_1, but will become mpn_bdiv_1_qr_pi2 or somesuch.
1264    This is used in get_str and set_str.  */
1265 void
relspeed_div_1_vs_mul_1(void)1266 relspeed_div_1_vs_mul_1 (void)
1267 {
1268 #define max_opsize 100
1269   mp_size_t n;
1270   long j;
1271   mp_limb_t rp[max_opsize];
1272   mp_limb_t ap[max_opsize];
1273   double multime, divtime;
1274 
1275   mpn_random (ap, max_opsize);
1276 
1277   multime = 0;
1278   for (n = max_opsize; n > 1; n--)
1279     {
1280       mpn_mul_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
1281       speed_starttime ();
1282       for (j = speed_precision; j != 0 ; j--)
1283 	mpn_mul_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
1284       multime += speed_endtime () / n;
1285     }
1286 
1287   divtime = 0;
1288   for (n = max_opsize; n > 1; n--)
1289     {
1290       /* Make input divisible for good measure.  */
1291       ap[n - 1] = mpn_mul_1 (ap, ap, n - 1, MP_BASES_BIG_BASE_10);
1292 
1293 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 || ! HAVE_NATIVE_mpn_divexact_1
1294 	  mpn_pi1_bdiv_q_1 (rp, ap, n, MP_BASES_BIG_BASE_10,
1295 			    MP_BASES_BIG_BASE_BINVERTED_10,
1296 			    MP_BASES_BIG_BASE_CTZ_10);
1297 #else
1298 	  mpn_divexact_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
1299 #endif
1300       speed_starttime ();
1301       for (j = speed_precision; j != 0 ; j--)
1302 	{
1303 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 || ! HAVE_NATIVE_mpn_divexact_1
1304 	  mpn_pi1_bdiv_q_1 (rp, ap, n, MP_BASES_BIG_BASE_10,
1305 			    MP_BASES_BIG_BASE_BINVERTED_10,
1306 			    MP_BASES_BIG_BASE_CTZ_10);
1307 #else
1308 	  mpn_divexact_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
1309 #endif
1310 	}
1311       divtime += speed_endtime () / n;
1312     }
1313 
1314   print_define ("DIV_1_VS_MUL_1_PERCENT", (int) (100 * divtime/multime));
1315 }
1316 
1317 
1318 /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
1319    giving wrong results.  */
1320 void
tune_mul_n(void)1321 tune_mul_n (void)
1322 {
1323   static struct param_t  param;
1324   mp_size_t next_toom_start;
1325   int something_changed;
1326 
1327   param.function = speed_mpn_mul_n;
1328 
1329   param.name = "MUL_TOOM22_THRESHOLD";
1330   param.min_size = MAX (4, MPN_TOOM22_MUL_MINSIZE);
1331   param.max_size = MUL_TOOM22_THRESHOLD_LIMIT-1;
1332   one (&mul_toom22_threshold, &param);
1333 
1334   param.noprint = 1;
1335 
1336   /* Threshold sequence loop.  Disable functions that would be used in a very
1337      narrow range, re-measuring things when that happens.  */
1338   something_changed = 1;
1339   while (something_changed)
1340     {
1341       something_changed = 0;
1342 
1343 	next_toom_start = mul_toom22_threshold;
1344 
1345 	if (mul_toom33_threshold != 0)
1346 	  {
1347 	    param.name = "MUL_TOOM33_THRESHOLD";
1348 	    param.min_size = MAX (next_toom_start, MPN_TOOM33_MUL_MINSIZE);
1349 	    param.max_size = MUL_TOOM33_THRESHOLD_LIMIT-1;
1350 	    one (&mul_toom33_threshold, &param);
1351 
1352 	    if (next_toom_start * 1.05 >= mul_toom33_threshold)
1353 	      {
1354 		mul_toom33_threshold = 0;
1355 		something_changed = 1;
1356 	      }
1357 	  }
1358 
1359 	next_toom_start = MAX (next_toom_start, mul_toom33_threshold);
1360 
1361 	if (mul_toom44_threshold != 0)
1362 	  {
1363 	    param.name = "MUL_TOOM44_THRESHOLD";
1364 	    param.min_size = MAX (next_toom_start, MPN_TOOM44_MUL_MINSIZE);
1365 	    param.max_size = MUL_TOOM44_THRESHOLD_LIMIT-1;
1366 	    one (&mul_toom44_threshold, &param);
1367 
1368 	    if (next_toom_start * 1.05 >= mul_toom44_threshold)
1369 	      {
1370 		mul_toom44_threshold = 0;
1371 		something_changed = 1;
1372 	      }
1373 	  }
1374 
1375 	next_toom_start = MAX (next_toom_start, mul_toom44_threshold);
1376 
1377 	if (mul_toom6h_threshold != 0)
1378 	  {
1379 	    param.name = "MUL_TOOM6H_THRESHOLD";
1380 	    param.min_size = MAX (next_toom_start, MPN_TOOM6H_MUL_MINSIZE);
1381 	    param.max_size = MUL_TOOM6H_THRESHOLD_LIMIT-1;
1382 	    one (&mul_toom6h_threshold, &param);
1383 
1384 	    if (next_toom_start * 1.05 >= mul_toom6h_threshold)
1385 	      {
1386 		mul_toom6h_threshold = 0;
1387 		something_changed = 1;
1388 	      }
1389 	  }
1390 
1391 	next_toom_start = MAX (next_toom_start, mul_toom6h_threshold);
1392 
1393 	if (mul_toom8h_threshold != 0)
1394 	  {
1395 	    param.name = "MUL_TOOM8H_THRESHOLD";
1396 	    param.min_size = MAX (next_toom_start, MPN_TOOM8H_MUL_MINSIZE);
1397 	    param.max_size = MUL_TOOM8H_THRESHOLD_LIMIT-1;
1398 	    one (&mul_toom8h_threshold, &param);
1399 
1400 	    if (next_toom_start * 1.05 >= mul_toom8h_threshold)
1401 	      {
1402 		mul_toom8h_threshold = 0;
1403 		something_changed = 1;
1404 	      }
1405 	  }
1406     }
1407 
1408     print_define ("MUL_TOOM33_THRESHOLD", MUL_TOOM33_THRESHOLD);
1409     print_define ("MUL_TOOM44_THRESHOLD", MUL_TOOM44_THRESHOLD);
1410     print_define ("MUL_TOOM6H_THRESHOLD", MUL_TOOM6H_THRESHOLD);
1411     print_define ("MUL_TOOM8H_THRESHOLD", MUL_TOOM8H_THRESHOLD);
1412 
1413   /* disabled until tuned */
1414   MUL_FFT_THRESHOLD = MP_SIZE_T_MAX;
1415 }
1416 
1417 void
tune_mul(void)1418 tune_mul (void)
1419 {
1420   static struct param_t  param;
1421   mp_size_t thres;
1422 
1423   param.noprint = 1;
1424 
1425   param.function = speed_mpn_toom32_for_toom43_mul;
1426   param.function2 = speed_mpn_toom43_for_toom32_mul;
1427   param.name = "MUL_TOOM32_TO_TOOM43_THRESHOLD";
1428   param.min_size = MPN_TOOM43_MUL_MINSIZE * 24 / 17;
1429   one (&thres, &param);
1430   mul_toom32_to_toom43_threshold = thres * 17 / 24;
1431   print_define ("MUL_TOOM32_TO_TOOM43_THRESHOLD", mul_toom32_to_toom43_threshold);
1432 
1433   param.function = speed_mpn_toom32_for_toom53_mul;
1434   param.function2 = speed_mpn_toom53_for_toom32_mul;
1435   param.name = "MUL_TOOM32_TO_TOOM53_THRESHOLD";
1436   param.min_size = MPN_TOOM53_MUL_MINSIZE * 30 / 19;
1437   one (&thres, &param);
1438   mul_toom32_to_toom53_threshold = thres * 19 / 30;
1439   print_define ("MUL_TOOM32_TO_TOOM53_THRESHOLD", mul_toom32_to_toom53_threshold);
1440 
1441   param.function = speed_mpn_toom42_for_toom53_mul;
1442   param.function2 = speed_mpn_toom53_for_toom42_mul;
1443   param.name = "MUL_TOOM42_TO_TOOM53_THRESHOLD";
1444   param.min_size = MPN_TOOM53_MUL_MINSIZE * 20 / 11;
1445   one (&thres, &param);
1446   mul_toom42_to_toom53_threshold = thres * 11 / 20;
1447   print_define ("MUL_TOOM42_TO_TOOM53_THRESHOLD", mul_toom42_to_toom53_threshold);
1448 
1449   param.function = speed_mpn_toom42_mul;
1450   param.function2 = speed_mpn_toom63_mul;
1451   param.name = "MUL_TOOM42_TO_TOOM63_THRESHOLD";
1452   param.min_size = MPN_TOOM63_MUL_MINSIZE * 2;
1453   one (&thres, &param);
1454   mul_toom42_to_toom63_threshold = thres / 2;
1455   print_define ("MUL_TOOM42_TO_TOOM63_THRESHOLD", mul_toom42_to_toom63_threshold);
1456 
1457   /* Use ratio 5/6 when measuring, the middle of the range 2/3 to 1. */
1458   param.function = speed_mpn_toom43_for_toom54_mul;
1459   param.function2 = speed_mpn_toom54_for_toom43_mul;
1460   param.name = "MUL_TOOM43_TO_TOOM54_THRESHOLD";
1461   param.min_size = MPN_TOOM54_MUL_MINSIZE * 6 / 5;
1462   one (&thres, &param);
1463   mul_toom43_to_toom54_threshold = thres * 5 / 6;
1464   print_define ("MUL_TOOM43_TO_TOOM54_THRESHOLD", mul_toom43_to_toom54_threshold);
1465 }
1466 
1467 
1468 void
tune_mullo(void)1469 tune_mullo (void)
1470 {
1471   static struct param_t  param;
1472 
1473   param.function = speed_mpn_mullo_n;
1474 
1475   param.name = "MULLO_BASECASE_THRESHOLD";
1476   param.min_size = 2;
1477   param.min_is_always = 1;
1478   param.max_size = MULLO_BASECASE_THRESHOLD_LIMIT-1;
1479   param.stop_factor = 1.5;
1480   param.noprint = 1;
1481   one (&mullo_basecase_threshold, &param);
1482 
1483   param.name = "MULLO_DC_THRESHOLD";
1484   param.min_size = 8;
1485   param.min_is_always = 0;
1486   param.max_size = 1000;
1487   one (&mullo_dc_threshold, &param);
1488 
1489   if (mullo_basecase_threshold >= mullo_dc_threshold)
1490     {
1491       print_define ("MULLO_BASECASE_THRESHOLD", mullo_dc_threshold);
1492       print_define_remark ("MULLO_DC_THRESHOLD", 0, "never mpn_mullo_basecase");
1493     }
1494   else
1495     {
1496       print_define ("MULLO_BASECASE_THRESHOLD", mullo_basecase_threshold);
1497       print_define ("MULLO_DC_THRESHOLD", mullo_dc_threshold);
1498     }
1499 
1500   if (WANT_FFT && mul_fft_threshold < MP_SIZE_T_MAX / 2)
1501     {
1502       param.name = "MULLO_MUL_N_THRESHOLD";
1503       param.min_size = mullo_dc_threshold;
1504       param.max_size = 2 * mul_fft_threshold;
1505       param.noprint = 0;
1506       param.step_factor = 0.03;
1507       one (&mullo_mul_n_threshold, &param);
1508     }
1509   else
1510     print_define_remark ("MULLO_MUL_N_THRESHOLD", MP_SIZE_T_MAX,
1511 			 "without FFT use mullo forever");
1512 }
1513 
1514 void
tune_sqrlo(void)1515 tune_sqrlo (void)
1516 {
1517   static struct param_t  param;
1518 
1519   param.function = speed_mpn_sqrlo;
1520 
1521   param.name = "SQRLO_BASECASE_THRESHOLD";
1522   param.min_size = 2;
1523   param.min_is_always = 1;
1524   param.max_size = SQRLO_BASECASE_THRESHOLD_LIMIT-1;
1525   param.stop_factor = 1.5;
1526   param.noprint = 1;
1527   one (&sqrlo_basecase_threshold, &param);
1528 
1529   param.name = "SQRLO_DC_THRESHOLD";
1530   param.min_size = 8;
1531   param.min_is_always = 0;
1532   param.max_size = SQRLO_DC_THRESHOLD_LIMIT-1;
1533   one (&sqrlo_dc_threshold, &param);
1534 
1535   if (sqrlo_basecase_threshold >= sqrlo_dc_threshold)
1536     {
1537       print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_dc_threshold);
1538       print_define_remark ("SQRLO_DC_THRESHOLD", 0, "never mpn_sqrlo_basecase");
1539     }
1540   else
1541     {
1542       print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_basecase_threshold);
1543       print_define ("SQRLO_DC_THRESHOLD", sqrlo_dc_threshold);
1544     }
1545 
1546   if (WANT_FFT && sqr_fft_threshold < MP_SIZE_T_MAX / 2)
1547     {
1548       param.name = "SQRLO_SQR_THRESHOLD";
1549       param.min_size = sqrlo_dc_threshold;
1550       param.max_size = 2 * sqr_fft_threshold;
1551       param.noprint = 0;
1552       param.step_factor = 0.03;
1553       one (&sqrlo_sqr_threshold, &param);
1554     }
1555   else
1556     print_define_remark ("SQRLO_SQR_THRESHOLD", MP_SIZE_T_MAX,
1557 			 "without FFT use sqrlo forever");
1558 }
1559 
1560 void
tune_mulmid(void)1561 tune_mulmid (void)
1562 {
1563   static struct param_t  param;
1564 
1565   param.name = "MULMID_TOOM42_THRESHOLD";
1566   param.function = speed_mpn_mulmid_n;
1567   param.min_size = 4;
1568   param.max_size = 100;
1569   one (&mulmid_toom42_threshold, &param);
1570 }
1571 
1572 void
tune_mulmod_bnm1(void)1573 tune_mulmod_bnm1 (void)
1574 {
1575   static struct param_t  param;
1576 
1577   param.name = "MULMOD_BNM1_THRESHOLD";
1578   param.function = speed_mpn_mulmod_bnm1;
1579   param.min_size = 4;
1580   param.max_size = 100;
1581   one (&mulmod_bnm1_threshold, &param);
1582 }
1583 
1584 void
tune_sqrmod_bnm1(void)1585 tune_sqrmod_bnm1 (void)
1586 {
1587   static struct param_t  param;
1588 
1589   param.name = "SQRMOD_BNM1_THRESHOLD";
1590   param.function = speed_mpn_sqrmod_bnm1;
1591   param.min_size = 4;
1592   param.max_size = 100;
1593   one (&sqrmod_bnm1_threshold, &param);
1594 }
1595 
1596 
1597 /* Start the basecase from 3, since 1 is a special case, and if mul_basecase
1598    is faster only at size==2 then we don't want to bother with extra code
1599    just for that.  Start karatsuba from 4 same as MUL above.  */
1600 
1601 void
tune_sqr(void)1602 tune_sqr (void)
1603 {
1604   /* disabled until tuned */
1605   SQR_FFT_THRESHOLD = MP_SIZE_T_MAX;
1606 
1607   if (HAVE_NATIVE_mpn_sqr_basecase)
1608     {
1609       print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)");
1610       sqr_basecase_threshold = 0;
1611     }
1612   else
1613     {
1614       static struct param_t  param;
1615       param.name = "SQR_BASECASE_THRESHOLD";
1616       param.function = speed_mpn_sqr;
1617       param.min_size = 3;
1618       param.min_is_always = 1;
1619       param.max_size = TUNE_SQR_TOOM2_MAX;
1620       param.noprint = 1;
1621       one (&sqr_basecase_threshold, &param);
1622     }
1623 
1624   {
1625     static struct param_t  param;
1626     param.name = "SQR_TOOM2_THRESHOLD";
1627     param.function = speed_mpn_sqr;
1628     param.min_size = MAX (4, MPN_TOOM2_SQR_MINSIZE);
1629     param.max_size = TUNE_SQR_TOOM2_MAX;
1630     param.noprint = 1;
1631     one (&sqr_toom2_threshold, &param);
1632 
1633     if (! HAVE_NATIVE_mpn_sqr_basecase
1634         && sqr_toom2_threshold < sqr_basecase_threshold)
1635       {
1636         /* Karatsuba becomes faster than mul_basecase before
1637            sqr_basecase does.  Arrange for the expression
1638            "BELOW_THRESHOLD (un, SQR_TOOM2_THRESHOLD))" which
1639            selects mpn_sqr_basecase in mpn_sqr to be false, by setting
1640            SQR_TOOM2_THRESHOLD to zero, making
1641            SQR_BASECASE_THRESHOLD the toom2 threshold.  */
1642 
1643         sqr_basecase_threshold = SQR_TOOM2_THRESHOLD;
1644         SQR_TOOM2_THRESHOLD = 0;
1645 
1646         print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold,
1647                              "toom2");
1648         print_define_remark ("SQR_TOOM2_THRESHOLD",SQR_TOOM2_THRESHOLD,
1649                              "never sqr_basecase");
1650       }
1651     else
1652       {
1653         if (! HAVE_NATIVE_mpn_sqr_basecase)
1654           print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold);
1655         print_define ("SQR_TOOM2_THRESHOLD", SQR_TOOM2_THRESHOLD);
1656       }
1657   }
1658 
1659   {
1660     static struct param_t  param;
1661     mp_size_t next_toom_start;
1662     int something_changed;
1663 
1664     param.function = speed_mpn_sqr;
1665     param.noprint = 1;
1666 
1667   /* Threshold sequence loop.  Disable functions that would be used in a very
1668      narrow range, re-measuring things when that happens.  */
1669     something_changed = 1;
1670     while (something_changed)
1671       {
1672 	something_changed = 0;
1673 
1674 	next_toom_start = MAX (sqr_toom2_threshold, sqr_basecase_threshold);
1675 
1676 	sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT;
1677 	param.name = "SQR_TOOM3_THRESHOLD";
1678 	param.min_size = MAX (next_toom_start, MPN_TOOM3_SQR_MINSIZE);
1679 	param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1;
1680 	one (&sqr_toom3_threshold, &param);
1681 
1682 	next_toom_start = MAX (next_toom_start, sqr_toom3_threshold);
1683 
1684 	if (sqr_toom4_threshold != 0)
1685 	  {
1686 	    param.name = "SQR_TOOM4_THRESHOLD";
1687 	    sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT;
1688 	    param.min_size = MAX (next_toom_start, MPN_TOOM4_SQR_MINSIZE);
1689 	    param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1;
1690 	    one (&sqr_toom4_threshold, &param);
1691 
1692 	    if (next_toom_start * 1.05 >= sqr_toom4_threshold)
1693 	      {
1694 		sqr_toom4_threshold = 0;
1695 		something_changed = 1;
1696 	      }
1697 	  }
1698 
1699 	next_toom_start = MAX (next_toom_start, sqr_toom4_threshold);
1700 
1701 	if (sqr_toom6_threshold != 0)
1702 	  {
1703 	    param.name = "SQR_TOOM6_THRESHOLD";
1704 	    sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT;
1705 	    param.min_size = MAX (next_toom_start, MPN_TOOM6_SQR_MINSIZE);
1706 	    param.max_size = SQR_TOOM6_THRESHOLD_LIMIT-1;
1707 	    one (&sqr_toom6_threshold, &param);
1708 
1709 	    if (next_toom_start * 1.05 >= sqr_toom6_threshold)
1710 	      {
1711 		sqr_toom6_threshold = 0;
1712 		something_changed = 1;
1713 	      }
1714 	  }
1715 
1716 	next_toom_start = MAX (next_toom_start, sqr_toom6_threshold);
1717 
1718 	if (sqr_toom8_threshold != 0)
1719 	  {
1720 	    param.name = "SQR_TOOM8_THRESHOLD";
1721 	    sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT;
1722 	    param.min_size = MAX (next_toom_start, MPN_TOOM8_SQR_MINSIZE);
1723 	    param.max_size = SQR_TOOM8_THRESHOLD_LIMIT-1;
1724 	    one (&sqr_toom8_threshold, &param);
1725 
1726 	    if (next_toom_start * 1.05 >= sqr_toom8_threshold)
1727 	      {
1728 		sqr_toom8_threshold = 0;
1729 		something_changed = 1;
1730 	      }
1731 	  }
1732       }
1733 
1734     print_define ("SQR_TOOM3_THRESHOLD", SQR_TOOM3_THRESHOLD);
1735     print_define ("SQR_TOOM4_THRESHOLD", SQR_TOOM4_THRESHOLD);
1736     print_define ("SQR_TOOM6_THRESHOLD", SQR_TOOM6_THRESHOLD);
1737     print_define ("SQR_TOOM8_THRESHOLD", SQR_TOOM8_THRESHOLD);
1738   }
1739 }
1740 
1741 
1742 void
tune_dc_div(void)1743 tune_dc_div (void)
1744 {
1745   s.r = 0;		/* clear to make speed function do 2n/n */
1746   {
1747     static struct param_t  param;
1748     param.name = "DC_DIV_QR_THRESHOLD";
1749     param.function = speed_mpn_sbpi1_div_qr;
1750     param.function2 = speed_mpn_dcpi1_div_qr;
1751     param.min_size = 6;
1752     one (&dc_div_qr_threshold, &param);
1753   }
1754   {
1755     static struct param_t  param;
1756     param.name = "DC_DIVAPPR_Q_THRESHOLD";
1757     param.function = speed_mpn_sbpi1_divappr_q;
1758     param.function2 = speed_mpn_dcpi1_divappr_q;
1759     param.min_size = 6;
1760     one (&dc_divappr_q_threshold, &param);
1761   }
1762 }
1763 
1764 static double
speed_mpn_sbordcpi1_div_qr(struct speed_params * s)1765 speed_mpn_sbordcpi1_div_qr (struct speed_params *s)
1766 {
1767   if (s->size < DC_DIV_QR_THRESHOLD)
1768     return speed_mpn_sbpi1_div_qr (s);
1769   else
1770     return speed_mpn_dcpi1_div_qr (s);
1771 }
1772 
1773 void
tune_mu_div(void)1774 tune_mu_div (void)
1775 {
1776   s.r = 0;		/* clear to make speed function do 2n/n */
1777   {
1778     static struct param_t  param;
1779     param.name = "MU_DIV_QR_THRESHOLD";
1780     param.function = speed_mpn_dcpi1_div_qr;
1781     param.function2 = speed_mpn_mu_div_qr;
1782     param.min_size = mul_toom22_threshold;
1783     param.max_size = 5000;
1784     param.step_factor = 0.02;
1785     one (&mu_div_qr_threshold, &param);
1786   }
1787   {
1788     static struct param_t  param;
1789     param.name = "MU_DIVAPPR_Q_THRESHOLD";
1790     param.function = speed_mpn_dcpi1_divappr_q;
1791     param.function2 = speed_mpn_mu_divappr_q;
1792     param.min_size = mul_toom22_threshold;
1793     param.max_size = 5000;
1794     param.step_factor = 0.02;
1795     one (&mu_divappr_q_threshold, &param);
1796   }
1797   {
1798     static struct param_t  param;
1799     param.name = "MUPI_DIV_QR_THRESHOLD";
1800     param.function = speed_mpn_sbordcpi1_div_qr;
1801     param.function2 = speed_mpn_mupi_div_qr;
1802     param.min_size = 6;
1803     param.min_is_always = 1;
1804     param.max_size = 1000;
1805     param.step_factor = 0.02;
1806     one (&mupi_div_qr_threshold, &param);
1807   }
1808 }
1809 
1810 void
tune_dc_bdiv(void)1811 tune_dc_bdiv (void)
1812 {
1813   s.r = 0;		/* clear to make speed function do 2n/n*/
1814   {
1815     static struct param_t  param;
1816     param.name = "DC_BDIV_QR_THRESHOLD";
1817     param.function = speed_mpn_sbpi1_bdiv_qr;
1818     param.function2 = speed_mpn_dcpi1_bdiv_qr;
1819     param.min_size = 4;
1820     one (&dc_bdiv_qr_threshold, &param);
1821   }
1822   {
1823     static struct param_t  param;
1824     param.name = "DC_BDIV_Q_THRESHOLD";
1825     param.function = speed_mpn_sbpi1_bdiv_q;
1826     param.function2 = speed_mpn_dcpi1_bdiv_q;
1827     param.min_size = 4;
1828     one (&dc_bdiv_q_threshold, &param);
1829   }
1830 }
1831 
1832 void
tune_mu_bdiv(void)1833 tune_mu_bdiv (void)
1834 {
1835   s.r = 0;		/* clear to make speed function do 2n/n*/
1836   {
1837     static struct param_t  param;
1838     param.name = "MU_BDIV_QR_THRESHOLD";
1839     param.function = speed_mpn_dcpi1_bdiv_qr;
1840     param.function2 = speed_mpn_mu_bdiv_qr;
1841     param.min_size = dc_bdiv_qr_threshold;
1842     param.max_size = 5000;
1843     param.step_factor = 0.02;
1844     one (&mu_bdiv_qr_threshold, &param);
1845   }
1846   {
1847     static struct param_t  param;
1848     param.name = "MU_BDIV_Q_THRESHOLD";
1849     param.function = speed_mpn_dcpi1_bdiv_q;
1850     param.function2 = speed_mpn_mu_bdiv_q;
1851     param.min_size = dc_bdiv_q_threshold;
1852     param.max_size = 5000;
1853     param.step_factor = 0.02;
1854     one (&mu_bdiv_q_threshold, &param);
1855   }
1856 }
1857 
1858 void
tune_invertappr(void)1859 tune_invertappr (void)
1860 {
1861   static struct param_t  param;
1862 
1863   param.function = speed_mpn_ni_invertappr;
1864   param.name = "INV_MULMOD_BNM1_THRESHOLD";
1865   param.min_size = 5;
1866   one (&inv_mulmod_bnm1_threshold, &param);
1867 
1868   param.function = speed_mpn_invertappr;
1869   param.name = "INV_NEWTON_THRESHOLD";
1870   param.min_size = 5;
1871   one (&inv_newton_threshold, &param);
1872 }
1873 
1874 void
tune_invert(void)1875 tune_invert (void)
1876 {
1877   static struct param_t  param;
1878 
1879   param.function = speed_mpn_invert;
1880   param.name = "INV_APPR_THRESHOLD";
1881   param.min_size = 5;
1882   one (&inv_appr_threshold, &param);
1883 }
1884 
1885 void
tune_binvert(void)1886 tune_binvert (void)
1887 {
1888   static struct param_t  param;
1889 
1890   param.function = speed_mpn_binvert;
1891   param.name = "BINV_NEWTON_THRESHOLD";
1892   param.min_size = 8;		/* pointless with smaller operands */
1893   one (&binv_newton_threshold, &param);
1894 }
1895 
1896 void
tune_redc(void)1897 tune_redc (void)
1898 {
1899 #define TUNE_REDC_2_MAX 100
1900 #if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
1901 #define WANT_REDC_2 1
1902 #endif
1903 
1904 #if WANT_REDC_2
1905   {
1906     static struct param_t  param;
1907     param.name = "REDC_1_TO_REDC_2_THRESHOLD";
1908     param.function = speed_mpn_redc_1;
1909     param.function2 = speed_mpn_redc_2;
1910     param.min_size = 1;
1911     param.min_is_always = 1;
1912     param.max_size = TUNE_REDC_2_MAX;
1913     param.noprint = 1;
1914     param.stop_factor = 1.5;
1915     one (&redc_1_to_redc_2_threshold, &param);
1916   }
1917   {
1918     static struct param_t  param;
1919     param.name = "REDC_2_TO_REDC_N_THRESHOLD";
1920     param.function = speed_mpn_redc_2;
1921     param.function2 = speed_mpn_redc_n;
1922     param.min_size = 16;
1923     param.noprint = 1;
1924     one (&redc_2_to_redc_n_threshold, &param);
1925   }
1926   if (redc_1_to_redc_2_threshold >= redc_2_to_redc_n_threshold)
1927     {
1928       redc_2_to_redc_n_threshold = 0;	/* disable redc_2 */
1929 
1930       /* Never use redc2, measure redc_1 -> redc_n cutoff, store result as
1931 	 REDC_1_TO_REDC_2_THRESHOLD.  */
1932       {
1933 	static struct param_t  param;
1934 	param.name = "REDC_1_TO_REDC_2_THRESHOLD";
1935 	param.function = speed_mpn_redc_1;
1936 	param.function2 = speed_mpn_redc_n;
1937 	param.min_size = 16;
1938 	param.noprint = 1;
1939 	one (&redc_1_to_redc_2_threshold, &param);
1940       }
1941     }
1942   print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_1_TO_REDC_2_THRESHOLD);
1943   print_define ("REDC_2_TO_REDC_N_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD);
1944 #else
1945   {
1946     static struct param_t  param;
1947     param.name = "REDC_1_TO_REDC_N_THRESHOLD";
1948     param.function = speed_mpn_redc_1;
1949     param.function2 = speed_mpn_redc_n;
1950     param.min_size = 16;
1951     one (&redc_1_to_redc_n_threshold, &param);
1952   }
1953 #endif
1954 }
1955 
1956 void
tune_matrix22_mul(void)1957 tune_matrix22_mul (void)
1958 {
1959   static struct param_t  param;
1960   param.name = "MATRIX22_STRASSEN_THRESHOLD";
1961   param.function = speed_mpn_matrix22_mul;
1962   param.min_size = 2;
1963   one (&matrix22_strassen_threshold, &param);
1964 }
1965 
1966 void
tune_hgcd2(void)1967 tune_hgcd2 (void)
1968 {
1969   static struct param_t  param;
1970   hgcd2_func_t *f[5] =
1971     { mpn_hgcd2_1,
1972       mpn_hgcd2_2,
1973       mpn_hgcd2_3,
1974       mpn_hgcd2_4,
1975       mpn_hgcd2_5 };
1976   speed_function_t speed_f[5] =
1977     { speed_mpn_hgcd2_1,
1978       speed_mpn_hgcd2_2,
1979       speed_mpn_hgcd2_3,
1980       speed_mpn_hgcd2_4,
1981       speed_mpn_hgcd2_5 };
1982   int best;
1983 
1984   s.size = 1;
1985   best = one_method (5, speed_f, "mpn_hgcd2", "HGCD2_DIV1_METHOD", &param);
1986 
1987   /* Use selected function when tuning hgcd and gcd */
1988   hgcd2_func = f[best];
1989 }
1990 
1991 void
tune_hgcd(void)1992 tune_hgcd (void)
1993 {
1994   static struct param_t  param;
1995   param.name = "HGCD_THRESHOLD";
1996   param.function = speed_mpn_hgcd;
1997   /* We seem to get strange results for small sizes */
1998   param.min_size = 30;
1999   one (&hgcd_threshold, &param);
2000 }
2001 
2002 void
tune_hgcd_appr(void)2003 tune_hgcd_appr (void)
2004 {
2005   static struct param_t  param;
2006   param.name = "HGCD_APPR_THRESHOLD";
2007   param.function = speed_mpn_hgcd_appr;
2008   /* We seem to get strange results for small sizes */
2009   param.min_size = 50;
2010   param.stop_since_change = 150;
2011   one (&hgcd_appr_threshold, &param);
2012 }
2013 
2014 void
tune_hgcd_reduce(void)2015 tune_hgcd_reduce (void)
2016 {
2017   static struct param_t  param;
2018   param.name = "HGCD_REDUCE_THRESHOLD";
2019   param.function = speed_mpn_hgcd_reduce;
2020   param.min_size = 30;
2021   param.max_size = 7000;
2022   param.step_factor = 0.04;
2023   one (&hgcd_reduce_threshold, &param);
2024 }
2025 
2026 void
tune_gcd_dc(void)2027 tune_gcd_dc (void)
2028 {
2029   static struct param_t  param;
2030   param.name = "GCD_DC_THRESHOLD";
2031   param.function = speed_mpn_gcd;
2032   param.min_size = hgcd_threshold;
2033   param.max_size = 3000;
2034   param.step_factor = 0.02;
2035   one (&gcd_dc_threshold, &param);
2036 }
2037 
2038 void
tune_gcdext_dc(void)2039 tune_gcdext_dc (void)
2040 {
2041   static struct param_t  param;
2042   param.name = "GCDEXT_DC_THRESHOLD";
2043   param.function = speed_mpn_gcdext;
2044   param.min_size = hgcd_threshold;
2045   param.max_size = 3000;
2046   param.step_factor = 0.02;
2047   one (&gcdext_dc_threshold, &param);
2048 }
2049 
2050 /* In tune_powm_sec we compute the table used by the win_size function.  The
2051    cutoff points are in exponent bits, disregarding other operand sizes.  It is
2052    not possible to use the one framework since it currently uses a granularity
2053    of full limbs.
2054 */
2055 
2056 /* This win_size replaces the variant in the powm code, allowing us to
2057    control k in the k-ary algorithms.  */
2058 int winsize;
2059 int
win_size(mp_bitcnt_t eb)2060 win_size (mp_bitcnt_t eb)
2061 {
2062   return winsize;
2063 }
2064 
2065 void
tune_powm_sec(void)2066 tune_powm_sec (void)
2067 {
2068   mp_size_t n;
2069   int k, i;
2070   mp_size_t itch;
2071   mp_bitcnt_t nbits, nbits_next, possible_nbits_cutoff;
2072   const int n_max = 3000 / GMP_NUMB_BITS;
2073 #define n_measurements  5
2074   mp_ptr rp, bp, ep, mp, tp;
2075   double ttab[n_measurements], tk, tkp1;
2076   TMP_DECL;
2077   TMP_MARK;
2078 
2079   possible_nbits_cutoff = 0;
2080 
2081   k = 1;
2082 
2083   winsize = 10;			/* the itch function needs this */
2084   itch = mpn_sec_powm_itch (n_max, n_max * GMP_NUMB_BITS, n_max);
2085 
2086   rp = TMP_ALLOC_LIMBS (n_max);
2087   bp = TMP_ALLOC_LIMBS (n_max);
2088   ep = TMP_ALLOC_LIMBS (n_max);
2089   mp = TMP_ALLOC_LIMBS (n_max);
2090   tp = TMP_ALLOC_LIMBS (itch);
2091 
2092   mpn_random (bp, n_max);
2093   mpn_random (mp, n_max);
2094   mp[0] |= 1;
2095 
2096 /* How about taking the M operand size into account?
2097 
2098    An operation R=powm(B,E,N) will take time O(log(E)*M(log(N))) (assuming
2099    B = O(M)).
2100 
2101    Using k-ary and no sliding window, the precomputation will need time
2102    O(2^(k-1)*M(log(N))) and the main computation will need O(log(E)*S(N)) +
2103    O(log(E)/k*M(N)), for the squarings, multiplications, respectively.
2104 
2105    An operation R=powm_sec(B,E,N) will take time like powm.
2106 
2107    Using k-ary, the precomputation will need time O(2^k*M(log(N))) and the
2108    main computation will need O(log(E)*S(N)) + O(log(E)/k*M(N)) +
2109    O(log(E)/k*2^k*log(N)), for the squarings, multiplications, and full
2110    table reads, respectively.  */
2111 
2112   printf ("#define POWM_SEC_TABLE  ");
2113 
2114   /* For nbits == 1, we should always use k == 1, so no need to tune
2115      that. Starting with nbits == 2 also ensure that nbits always is
2116      larger than the windowsize k+1. */
2117   for (nbits = 2; nbits <= n_max * GMP_NUMB_BITS; )
2118     {
2119       n = (nbits - 1) / GMP_NUMB_BITS + 1;
2120 
2121       /* Generate E such that sliding-window for k and k+1 works equally
2122 	 well/poorly (but sliding is not used in powm_sec, of course). */
2123       for (i = 0; i < n; i++)
2124 	ep[i] = ~CNST_LIMB(0);
2125 
2126       winsize = k;
2127       for (i = 0; i < n_measurements; i++)
2128 	{
2129 	  speed_starttime ();
2130 	  mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp);
2131 	  ttab[i] = speed_endtime ();
2132 	}
2133       tk = median (ttab, n_measurements);
2134 
2135       winsize = k + 1;
2136       speed_starttime ();
2137       for (i = 0; i < n_measurements; i++)
2138 	{
2139 	  speed_starttime ();
2140 	  mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp);
2141 	  ttab[i] = speed_endtime ();
2142 	}
2143       tkp1 = median (ttab, n_measurements);
2144 /*
2145       printf ("testing: %ld, %d", nbits, k, ep[n-1]);
2146       printf ("   %10.5f  %10.5f\n", tk, tkp1);
2147 */
2148       if (tkp1 < tk)
2149 	{
2150 	  if (possible_nbits_cutoff)
2151 	    {
2152 	      /* Two consecutive sizes indicate k increase, obey.  */
2153 
2154 	      /* Must always have x[k] >= k */
2155 	      ASSERT_ALWAYS (possible_nbits_cutoff >= k);
2156 
2157 	      if (k > 1)
2158 		printf (",");
2159 	      printf ("%ld", (long) possible_nbits_cutoff);
2160 	      k++;
2161 	      possible_nbits_cutoff = 0;
2162 	    }
2163 	  else
2164 	    {
2165 	      /* One measurement indicate k increase, save nbits for further
2166 		 consideration.  */
2167 	      /* The new larger k gets used for sizes > the cutoff
2168 		 value, hence the cutoff should be one less than the
2169 		 smallest size where it gives a speedup. */
2170 	      possible_nbits_cutoff = nbits - 1;
2171 	    }
2172 	}
2173       else
2174 	possible_nbits_cutoff = 0;
2175 
2176       nbits_next = nbits * 65 / 64;
2177       nbits = nbits_next + (nbits_next == nbits);
2178     }
2179   printf ("\n");
2180   TMP_FREE;
2181 }
2182 
2183 
2184 /* size_extra==1 reflects the fact that with high<divisor one division is
2185    always skipped.  Forcing high<divisor while testing ensures consistency
2186    while stepping through sizes, ie. that size-1 divides will be done each
2187    time.
2188 
2189    min_size==2 and min_is_always are used so that if plain division is only
2190    better at size==1 then don't bother including that code just for that
2191    case, instead go with preinv always and get a size saving.  */
2192 
2193 #define DIV_1_PARAMS                    \
2194   param.check_size = 256;               \
2195   param.min_size = 2;                   \
2196   param.min_is_always = 1;              \
2197   param.data_high = DATA_HIGH_LT_R;     \
2198   param.size_extra = 1;                 \
2199   param.stop_factor = 2.0;
2200 
2201 
2202 double (*tuned_speed_mpn_divrem_1) (struct speed_params *);
2203 
2204 void
tune_divrem_1(void)2205 tune_divrem_1 (void)
2206 {
2207   /* plain version by default */
2208   tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1;
2209 
2210   /* No support for tuning native assembler code, do that by hand and put
2211      the results in the .asm file, there's no need for such thresholds to
2212      appear in gmp-mparam.h.  */
2213   if (HAVE_NATIVE_mpn_divrem_1)
2214     return;
2215 
2216   if (GMP_NAIL_BITS != 0)
2217     {
2218       print_define_remark ("DIVREM_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
2219                            "no preinv with nails");
2220       print_define_remark ("DIVREM_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
2221                            "no preinv with nails");
2222       return;
2223     }
2224 
2225   if (UDIV_PREINV_ALWAYS)
2226     {
2227       print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always");
2228       print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L);
2229       return;
2230     }
2231 
2232   tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune;
2233 
2234   /* Tune for the integer part of mpn_divrem_1.  This will very possibly be
2235      a bit out for the fractional part, but that's too bad, the integer part
2236      is more important. */
2237   {
2238     static struct param_t  param;
2239     param.name = "DIVREM_1_NORM_THRESHOLD";
2240     DIV_1_PARAMS;
2241     s.r = randlimb_norm ();
2242     param.function = speed_mpn_divrem_1_tune;
2243     one (&divrem_1_norm_threshold, &param);
2244   }
2245   {
2246     static struct param_t  param;
2247     param.name = "DIVREM_1_UNNORM_THRESHOLD";
2248     DIV_1_PARAMS;
2249     s.r = randlimb_half ();
2250     param.function = speed_mpn_divrem_1_tune;
2251     one (&divrem_1_unnorm_threshold, &param);
2252   }
2253 }
2254 
2255 void
tune_div_qr_1(void)2256 tune_div_qr_1 (void)
2257 {
2258   if (!HAVE_NATIVE_mpn_div_qr_1n_pi1)
2259     {
2260       static struct param_t  param;
2261       speed_function_t f[2] =
2262 	{
2263 	 speed_mpn_div_qr_1n_pi1_1,
2264 	 speed_mpn_div_qr_1n_pi1_2,
2265 	};
2266 
2267       s.size = 10;
2268       s.r = randlimb_norm ();
2269 
2270       one_method (2, f, "mpn_div_qr_1n_pi1", "DIV_QR_1N_PI1_METHOD", &param);
2271     }
2272 
2273   {
2274     static struct param_t  param;
2275     param.name = "DIV_QR_1_NORM_THRESHOLD";
2276     DIV_1_PARAMS;
2277     param.min_size = 1;
2278     param.min_is_always = 0;
2279     s.r = randlimb_norm ();
2280     param.function = speed_mpn_div_qr_1_tune;
2281     one (&div_qr_1_norm_threshold, &param);
2282   }
2283   {
2284     static struct param_t  param;
2285     param.name = "DIV_QR_1_UNNORM_THRESHOLD";
2286     DIV_1_PARAMS;
2287     param.min_size = 1;
2288     param.min_is_always = 0;
2289     s.r = randlimb_half();
2290     param.function = speed_mpn_div_qr_1_tune;
2291     one (&div_qr_1_unnorm_threshold, &param);
2292   }
2293 }
2294 
2295 
2296 void
tune_mod_1(void)2297 tune_mod_1 (void)
2298 {
2299   /* No support for tuning native assembler code, do that by hand and put
2300      the results in the .asm file, there's no need for such thresholds to
2301      appear in gmp-mparam.h.  */
2302   if (HAVE_NATIVE_mpn_mod_1)
2303     return;
2304 
2305   if (GMP_NAIL_BITS != 0)
2306     {
2307       print_define_remark ("MOD_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
2308                            "no preinv with nails");
2309       print_define_remark ("MOD_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
2310                            "no preinv with nails");
2311       return;
2312     }
2313 
2314   if (!HAVE_NATIVE_mpn_mod_1_1p)
2315     {
2316       static struct param_t  param;
2317       speed_function_t f[2] =
2318 	{
2319 	 speed_mpn_mod_1_1_1,
2320 	 speed_mpn_mod_1_1_2,
2321 	};
2322 
2323       s.size = 10;
2324       s.r = randlimb_half ();
2325       one_method (2, f, "mpn_mod_1_1", "MOD_1_1P_METHOD", &param);
2326     }
2327 
2328   if (UDIV_PREINV_ALWAYS)
2329     {
2330       print_define ("MOD_1_NORM_THRESHOLD", 0L);
2331       print_define ("MOD_1_UNNORM_THRESHOLD", 0L);
2332     }
2333   else
2334     {
2335       {
2336 	static struct param_t  param;
2337 	param.name = "MOD_1_NORM_THRESHOLD";
2338 	DIV_1_PARAMS;
2339 	s.r = randlimb_norm ();
2340 	param.function = speed_mpn_mod_1_tune;
2341 	one (&mod_1_norm_threshold, &param);
2342       }
2343       {
2344 	static struct param_t  param;
2345 	param.name = "MOD_1_UNNORM_THRESHOLD";
2346 	DIV_1_PARAMS;
2347 	s.r = randlimb_half ();
2348 	param.function = speed_mpn_mod_1_tune;
2349 	one (&mod_1_unnorm_threshold, &param);
2350       }
2351     }
2352   {
2353     static struct param_t  param;
2354 
2355     param.check_size = 256;
2356 
2357     s.r = randlimb_norm ();
2358     param.function = speed_mpn_mod_1_tune;
2359 
2360     param.name = "MOD_1N_TO_MOD_1_1_THRESHOLD";
2361     param.min_size = 2;
2362     one (&mod_1n_to_mod_1_1_threshold, &param);
2363   }
2364 
2365   {
2366     static struct param_t  param;
2367 
2368     param.check_size = 256;
2369     s.r = randlimb_half ();
2370     param.noprint = 1;
2371 
2372     param.function = speed_mpn_mod_1_1;
2373     param.function2 = speed_mpn_mod_1_2;
2374     param.min_is_always = 1;
2375     param.name = "MOD_1_1_TO_MOD_1_2_THRESHOLD";
2376     param.min_size = 2;
2377     one (&mod_1_1_to_mod_1_2_threshold, &param);
2378 
2379     param.function = speed_mpn_mod_1_2;
2380     param.function2 = speed_mpn_mod_1_4;
2381     param.min_is_always = 1;
2382     param.name = "MOD_1_2_TO_MOD_1_4_THRESHOLD";
2383     param.min_size = 1;
2384     one (&mod_1_2_to_mod_1_4_threshold, &param);
2385 
2386     if (mod_1_1_to_mod_1_2_threshold >= mod_1_2_to_mod_1_4_threshold)
2387       {
2388 	/* Never use mod_1_2, measure mod_1_1 -> mod_1_4 */
2389 	mod_1_2_to_mod_1_4_threshold = 0;
2390 
2391 	param.function = speed_mpn_mod_1_1;
2392 	param.function2 = speed_mpn_mod_1_4;
2393 	param.min_is_always = 1;
2394 	param.name = "MOD_1_1_TO_MOD_1_4_THRESHOLD fake";
2395 	param.min_size = 2;
2396 	one (&mod_1_1_to_mod_1_2_threshold, &param);
2397       }
2398 
2399     param.function = speed_mpn_mod_1_tune;
2400     param.function2 = NULL;
2401     param.name = "MOD_1U_TO_MOD_1_1_THRESHOLD";
2402     param.min_size = 2;
2403     param.min_is_always = 0;
2404     one (&mod_1u_to_mod_1_1_threshold, &param);
2405 
2406     if (mod_1u_to_mod_1_1_threshold >= mod_1_1_to_mod_1_2_threshold)
2407       mod_1_1_to_mod_1_2_threshold = 0;
2408     if (mod_1u_to_mod_1_1_threshold >= mod_1_2_to_mod_1_4_threshold)
2409       mod_1_2_to_mod_1_4_threshold = 0;
2410 
2411     print_define_remark ("MOD_1U_TO_MOD_1_1_THRESHOLD", mod_1u_to_mod_1_1_threshold, NULL);
2412     print_define_remark ("MOD_1_1_TO_MOD_1_2_THRESHOLD", mod_1_1_to_mod_1_2_threshold,
2413 			 mod_1_1_to_mod_1_2_threshold == 0 ? "never mpn_mod_1_1p" : NULL);
2414     print_define_remark ("MOD_1_2_TO_MOD_1_4_THRESHOLD", mod_1_2_to_mod_1_4_threshold,
2415 			 mod_1_2_to_mod_1_4_threshold == 0 ? "never mpn_mod_1s_2p" : NULL);
2416   }
2417 
2418   {
2419     static struct param_t  param;
2420 
2421     param.check_size = 256;
2422 
2423     param.name = "PREINV_MOD_1_TO_MOD_1_THRESHOLD";
2424     s.r = randlimb_norm ();
2425     param.function = speed_mpn_preinv_mod_1;
2426     param.function2 = speed_mpn_mod_1_tune;
2427     param.min_size = 1;
2428     one (&preinv_mod_1_to_mod_1_threshold, &param);
2429   }
2430 }
2431 
2432 
2433 /* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would
2434    imply that udiv_qrnnd_preinv is worth using, but it seems most
2435    straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div
2436    directly.  */
2437 
2438 void
tune_preinv_divrem_1(void)2439 tune_preinv_divrem_1 (void)
2440 {
2441   static struct param_t  param;
2442   speed_function_t  divrem_1;
2443   const char        *divrem_1_name;
2444   double            t1, t2;
2445 
2446   if (GMP_NAIL_BITS != 0)
2447     {
2448       print_define_remark ("USE_PREINV_DIVREM_1", 0, "no preinv with nails");
2449       return;
2450     }
2451 
2452   /* Any native version of mpn_preinv_divrem_1 is assumed to exist because
2453      it's faster than mpn_divrem_1.  */
2454   if (HAVE_NATIVE_mpn_preinv_divrem_1)
2455     {
2456       print_define_remark ("USE_PREINV_DIVREM_1", 1, "native");
2457       return;
2458     }
2459 
2460   /* If udiv_qrnnd_preinv is the only division method then of course
2461      mpn_preinv_divrem_1 should be used.  */
2462   if (UDIV_PREINV_ALWAYS)
2463     {
2464       print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always");
2465       return;
2466     }
2467 
2468   /* If we've got an assembler version of mpn_divrem_1, then compare against
2469      that, not the mpn_divrem_1_div generic C.  */
2470   if (HAVE_NATIVE_mpn_divrem_1)
2471     {
2472       divrem_1 = speed_mpn_divrem_1;
2473       divrem_1_name = "mpn_divrem_1";
2474     }
2475   else
2476     {
2477       divrem_1 = speed_mpn_divrem_1_div;
2478       divrem_1_name = "mpn_divrem_1_div";
2479     }
2480 
2481   param.data_high = DATA_HIGH_LT_R; /* allow skip one division */
2482   s.size = 200;                     /* generous but not too big */
2483   /* Divisor, nonzero.  Unnormalized so as to exercise the shift!=0 case,
2484      since in general that's probably most common, though in fact for a
2485      64-bit limb mp_bases[10].big_base is normalized.  */
2486   s.r = urandom() & (GMP_NUMB_MASK >> 4);
2487   if (s.r == 0) s.r = 123;
2488 
2489   t1 = tuneup_measure (speed_mpn_preinv_divrem_1, &param, &s);
2490   t2 = tuneup_measure (divrem_1, &param, &s);
2491   if (t1 == -1.0 || t2 == -1.0)
2492     {
2493       printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n",
2494               divrem_1_name, (long) s.size);
2495       abort ();
2496     }
2497   if (option_trace >= 1)
2498     printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n",
2499             (long) s.size, t1, divrem_1_name, t2);
2500 
2501   print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL);
2502 }
2503 
2504 
2505 
2506 void
tune_divrem_2(void)2507 tune_divrem_2 (void)
2508 {
2509   static struct param_t  param;
2510 
2511   /* No support for tuning native assembler code, do that by hand and put
2512      the results in the .asm file, and there's no need for such thresholds
2513      to appear in gmp-mparam.h.  */
2514   if (HAVE_NATIVE_mpn_divrem_2)
2515     return;
2516 
2517   if (GMP_NAIL_BITS != 0)
2518     {
2519       print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX,
2520                            "no preinv with nails");
2521       return;
2522     }
2523 
2524   if (UDIV_PREINV_ALWAYS)
2525     {
2526       print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always");
2527       return;
2528     }
2529 
2530   /* Tune for the integer part of mpn_divrem_2.  This will very possibly be
2531      a bit out for the fractional part, but that's too bad, the integer part
2532      is more important.
2533 
2534      min_size must be >=2 since nsize>=2 is required, but is set to 4 to save
2535      code space if plain division is better only at size==2 or size==3. */
2536   param.name = "DIVREM_2_THRESHOLD";
2537   param.check_size = 256;
2538   param.min_size = 4;
2539   param.min_is_always = 1;
2540   param.size_extra = 2;      /* does qsize==nsize-2 divisions */
2541   param.stop_factor = 2.0;
2542 
2543   s.r = randlimb_norm ();
2544   param.function = speed_mpn_divrem_2;
2545   one (&divrem_2_threshold, &param);
2546 }
2547 
2548 void
tune_div_qr_2(void)2549 tune_div_qr_2 (void)
2550 {
2551   static struct param_t  param;
2552   param.name = "DIV_QR_2_PI2_THRESHOLD";
2553   param.function = speed_mpn_div_qr_2n;
2554   param.check_size = 500;
2555   param.min_size = 4;
2556   one (&div_qr_2_pi2_threshold, &param);
2557 }
2558 
2559 /* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so
2560    tune for that.  Its speed can differ on odd or even divisor, so take an
2561    average threshold for the two.
2562 
2563    mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1
2564    might not vary that way, but don't test this since high<divisor isn't
2565    expected to occur often with small divisors.  */
2566 
2567 void
tune_divexact_1(void)2568 tune_divexact_1 (void)
2569 {
2570   static struct param_t  param;
2571   mp_size_t  thresh[2], average;
2572   int        low, i;
2573 
2574   /* Any native mpn_divexact_1 is assumed to incorporate all the speed of a
2575      full mpn_divrem_1.  */
2576   if (HAVE_NATIVE_mpn_divexact_1)
2577     {
2578       print_define_remark ("DIVEXACT_1_THRESHOLD", 0, "always (native)");
2579       return;
2580     }
2581 
2582   ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL);
2583 
2584   param.name = "DIVEXACT_1_THRESHOLD";
2585   param.data_high = DATA_HIGH_GE_R;
2586   param.check_size = 256;
2587   param.min_size = 2;
2588   param.stop_factor = 1.5;
2589   param.function  = tuned_speed_mpn_divrem_1;
2590   param.function2 = speed_mpn_divexact_1;
2591   param.noprint = 1;
2592 
2593   print_define_start (param.name);
2594 
2595   for (low = 0; low <= 1; low++)
2596     {
2597       s.r = randlimb_half();
2598       if (low == 0)
2599         s.r |= 1;
2600       else
2601         s.r &= ~CNST_LIMB(7);
2602 
2603       one (&thresh[low], &param);
2604       if (option_trace)
2605         printf ("low=%d thresh %ld\n", low, (long) thresh[low]);
2606 
2607       if (thresh[low] == MP_SIZE_T_MAX)
2608         {
2609           average = MP_SIZE_T_MAX;
2610           goto divexact_1_done;
2611         }
2612     }
2613 
2614   if (option_trace)
2615     {
2616       printf ("average of:");
2617       for (i = 0; i < numberof(thresh); i++)
2618         printf (" %ld", (long) thresh[i]);
2619       printf ("\n");
2620     }
2621 
2622   average = 0;
2623   for (i = 0; i < numberof(thresh); i++)
2624     average += thresh[i];
2625   average /= numberof(thresh);
2626 
2627   /* If divexact turns out to be better as early as 3 limbs, then use it
2628      always, so as to reduce code size and conditional jumps.  */
2629   if (average <= 3)
2630     average = 0;
2631 
2632  divexact_1_done:
2633   print_define_end (param.name, average);
2634 }
2635 
2636 
2637 /* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the
2638    same as mpn_mod_1, but this might not be true of an assembler
2639    implementation.  The threshold used is an average based on data where a
2640    divide can be skipped and where it can't.
2641 
2642    If modexact turns out to be better as early as 3 limbs, then use it
2643    always, so as to reduce code size and conditional jumps.  */
2644 
2645 void
tune_modexact_1_odd(void)2646 tune_modexact_1_odd (void)
2647 {
2648   static struct param_t  param;
2649   mp_size_t  thresh_lt, thresh_ge, average;
2650 
2651 #if 0
2652   /* Any native mpn_modexact_1_odd is assumed to incorporate all the speed
2653      of a full mpn_mod_1.  */
2654   if (HAVE_NATIVE_mpn_modexact_1_odd)
2655     {
2656       print_define_remark ("BMOD_1_TO_MOD_1_THRESHOLD", MP_SIZE_T_MAX, "always bmod_1");
2657       return;
2658     }
2659 #endif
2660 
2661   param.name = "BMOD_1_TO_MOD_1_THRESHOLD";
2662   param.check_size = 256;
2663   param.min_size = 2;
2664   param.stop_factor = 1.5;
2665   param.function  = speed_mpn_modexact_1c_odd;
2666   param.function2 = speed_mpn_mod_1_tune;
2667   param.noprint = 1;
2668   s.r = randlimb_half () | 1;
2669 
2670   print_define_start (param.name);
2671 
2672   param.data_high = DATA_HIGH_LT_R;
2673   one (&thresh_lt, &param);
2674   if (option_trace)
2675     printf ("lt thresh %ld\n", (long) thresh_lt);
2676 
2677   average = thresh_lt;
2678   if (thresh_lt != MP_SIZE_T_MAX)
2679     {
2680       param.data_high = DATA_HIGH_GE_R;
2681       one (&thresh_ge, &param);
2682       if (option_trace)
2683         printf ("ge thresh %ld\n", (long) thresh_ge);
2684 
2685       if (thresh_ge != MP_SIZE_T_MAX)
2686         {
2687           average = (thresh_ge + thresh_lt) / 2;
2688           if (thresh_ge <= 3)
2689             average = 0;
2690         }
2691     }
2692 
2693   print_define_end (param.name, average);
2694 }
2695 
2696 
2697 void
tune_jacobi_base(void)2698 tune_jacobi_base (void)
2699 {
2700   static struct param_t  param;
2701   speed_function_t f[4] =
2702     {
2703      speed_mpn_jacobi_base_1,
2704      speed_mpn_jacobi_base_2,
2705      speed_mpn_jacobi_base_3,
2706      speed_mpn_jacobi_base_4,
2707     };
2708 
2709   s.size = GMP_LIMB_BITS * 3 / 4;
2710 
2711   one_method (4, f, "mpn_jacobi_base", "JACOBI_BASE_METHOD", &param);
2712 }
2713 
2714 
2715 void
tune_get_str(void)2716 tune_get_str (void)
2717 {
2718   /* Tune for decimal, it being most common.  Some rough testing suggests
2719      other bases are different, but not by very much.  */
2720   s.r = 10;
2721   {
2722     static struct param_t  param;
2723     GET_STR_PRECOMPUTE_THRESHOLD = 0;
2724     param.name = "GET_STR_DC_THRESHOLD";
2725     param.function = speed_mpn_get_str;
2726     param.min_size = 4;
2727     param.max_size = GET_STR_THRESHOLD_LIMIT;
2728     one (&get_str_dc_threshold, &param);
2729   }
2730   {
2731     static struct param_t  param;
2732     param.name = "GET_STR_PRECOMPUTE_THRESHOLD";
2733     param.function = speed_mpn_get_str;
2734     param.min_size = GET_STR_DC_THRESHOLD;
2735     param.max_size = GET_STR_THRESHOLD_LIMIT;
2736     one (&get_str_precompute_threshold, &param);
2737   }
2738 }
2739 
2740 
2741 double
speed_mpn_pre_set_str(struct speed_params * s)2742 speed_mpn_pre_set_str (struct speed_params *s)
2743 {
2744   unsigned char *str;
2745   mp_ptr     wp;
2746   mp_size_t  wn;
2747   unsigned   i;
2748   int        base;
2749   double     t;
2750   mp_ptr powtab_mem, tp;
2751   powers_t powtab[GMP_LIMB_BITS];
2752   mp_size_t un;
2753   int chars_per_limb;
2754   size_t n_pows;
2755   powers_t *pt;
2756   TMP_DECL;
2757 
2758   SPEED_RESTRICT_COND (s->size >= 1);
2759 
2760   base = s->r == 0 ? 10 : s->r;
2761   SPEED_RESTRICT_COND (base >= 2 && base <= 256);
2762 
2763   TMP_MARK;
2764 
2765   str = (unsigned char *) TMP_ALLOC (s->size);
2766   for (i = 0; i < s->size; i++)
2767     str[i] = s->xp[i] % base;
2768 
2769   LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base);
2770   SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);
2771 
2772   /* use this during development to check wn is big enough */
2773   /*
2774   ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn);
2775   */
2776 
2777   speed_operand_src (s, (mp_ptr) str, s->size/GMP_LIMB_BYTES);
2778   speed_operand_dst (s, wp, wn);
2779   speed_cache_fill (s);
2780 
2781   chars_per_limb = mp_bases[base].chars_per_limb;
2782   un = s->size / chars_per_limb + 1;
2783   powtab_mem = TMP_BALLOC_LIMBS (mpn_str_powtab_alloc (un));
2784   n_pows = mpn_compute_powtab (powtab, powtab_mem, un, base);
2785   pt = powtab + n_pows;
2786   tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un));
2787 
2788   speed_starttime ();
2789   i = s->reps;
2790   do
2791     {
2792       mpn_pre_set_str (wp, str, s->size, pt, tp);
2793     }
2794   while (--i != 0);
2795   t = speed_endtime ();
2796 
2797   TMP_FREE;
2798   return t;
2799 }
2800 
2801 void
tune_set_str(void)2802 tune_set_str (void)
2803 {
2804   s.r = 10;  /* decimal */
2805   {
2806     static struct param_t  param;
2807     SET_STR_PRECOMPUTE_THRESHOLD = 0;
2808     param.step_factor = 0.01;
2809     param.name = "SET_STR_DC_THRESHOLD";
2810     param.function = speed_mpn_pre_set_str;
2811     param.min_size = 100;
2812     param.max_size = 50000;
2813     one (&set_str_dc_threshold, &param);
2814   }
2815   {
2816     static struct param_t  param;
2817     param.step_factor = 0.02;
2818     param.name = "SET_STR_PRECOMPUTE_THRESHOLD";
2819     param.function = speed_mpn_set_str;
2820     param.min_size = SET_STR_DC_THRESHOLD;
2821     param.max_size = 100000;
2822     one (&set_str_precompute_threshold, &param);
2823   }
2824 }
2825 
2826 
2827 void
tune_fft_mul(void)2828 tune_fft_mul (void)
2829 {
2830   static struct fft_param_t  param;
2831 
2832   if (option_fft_max_size == 0)
2833     return;
2834 
2835   param.table_name          = "MUL_FFT_TABLE3";
2836   param.threshold_name      = "MUL_FFT_THRESHOLD";
2837   param.p_threshold         = &mul_fft_threshold;
2838   param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD";
2839   param.p_modf_threshold    = &mul_fft_modf_threshold;
2840   param.first_size          = MUL_TOOM33_THRESHOLD / 2;
2841   param.max_size            = option_fft_max_size;
2842   param.function            = speed_mpn_fft_mul;
2843   param.mul_modf_function   = speed_mpn_mul_fft;
2844   param.mul_function        = speed_mpn_mul_n;
2845   param.sqr = 0;
2846   fft (&param);
2847 }
2848 
2849 
2850 void
tune_fft_sqr(void)2851 tune_fft_sqr (void)
2852 {
2853   static struct fft_param_t  param;
2854 
2855   if (option_fft_max_size == 0)
2856     return;
2857 
2858   param.table_name          = "SQR_FFT_TABLE3";
2859   param.threshold_name      = "SQR_FFT_THRESHOLD";
2860   param.p_threshold         = &sqr_fft_threshold;
2861   param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD";
2862   param.p_modf_threshold    = &sqr_fft_modf_threshold;
2863   param.first_size          = SQR_TOOM3_THRESHOLD / 2;
2864   param.max_size            = option_fft_max_size;
2865   param.function            = speed_mpn_fft_sqr;
2866   param.mul_modf_function   = speed_mpn_mul_fft_sqr;
2867   param.mul_function        = speed_mpn_sqr;
2868   param.sqr = 1;
2869   fft (&param);
2870 }
2871 
2872 void
tune_fac_ui(void)2873 tune_fac_ui (void)
2874 {
2875   static struct param_t  param;
2876 
2877   param.function = speed_mpz_fac_ui_tune;
2878 
2879   param.name = "FAC_DSC_THRESHOLD";
2880   param.min_size = 70;
2881   param.max_size = FAC_DSC_THRESHOLD_LIMIT;
2882   one (&fac_dsc_threshold, &param);
2883 
2884   param.name = "FAC_ODD_THRESHOLD";
2885   param.min_size = 22;
2886   param.stop_factor = 1.7;
2887   param.min_is_always = 1;
2888   one (&fac_odd_threshold, &param);
2889 }
2890 
2891 void
all(void)2892 all (void)
2893 {
2894   time_t  start_time, end_time;
2895   TMP_DECL;
2896 
2897   TMP_MARK;
2898   SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0);
2899   SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0);
2900 
2901   mpn_random (s.xp_block, SPEED_BLOCK_SIZE);
2902   mpn_random (s.yp_block, SPEED_BLOCK_SIZE);
2903 
2904   fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST);
2905 
2906   speed_time_init ();
2907   fprintf (stderr, "Using: %s\n", speed_time_string);
2908 
2909   fprintf (stderr, "speed_precision %d", speed_precision);
2910   if (speed_unittime == 1.0)
2911     fprintf (stderr, ", speed_unittime 1 cycle");
2912   else
2913     fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime);
2914   if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
2915     fprintf (stderr, ", CPU freq unknown\n");
2916   else
2917     fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
2918 
2919   fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n",
2920            DEFAULT_MAX_SIZE, (long) option_fft_max_size);
2921   fprintf (stderr, "\n");
2922 
2923   time (&start_time);
2924   {
2925     struct tm  *tp;
2926     tp = localtime (&start_time);
2927     printf ("/* Generated by tuneup.c, %d-%02d-%02d, ",
2928             tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
2929 
2930 #ifdef __GNUC__
2931     /* gcc sub-minor version doesn't seem to come through as a define */
2932     printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__);
2933 #define PRINTED_COMPILER
2934 #endif
2935 #if defined (__SUNPRO_C)
2936     printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100);
2937 #define PRINTED_COMPILER
2938 #endif
2939 #if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION)
2940     /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */
2941     printf ("MIPSpro C %d.%d.%d */\n",
2942 	    _COMPILER_VERSION / 100,
2943 	    _COMPILER_VERSION / 10 % 10,
2944 	    _COMPILER_VERSION % 10);
2945 #define PRINTED_COMPILER
2946 #endif
2947 #if defined (__DECC) && defined (__DECC_VER)
2948     printf ("DEC C %d */\n", __DECC_VER);
2949 #define PRINTED_COMPILER
2950 #endif
2951 #if ! defined (PRINTED_COMPILER)
2952     printf ("system compiler */\n");
2953 #endif
2954   }
2955   printf ("\n");
2956 
2957   tune_divrem_1 ();
2958   tune_mod_1 ();
2959   tune_preinv_divrem_1 ();
2960   tune_div_qr_1 ();
2961 #if 0
2962   tune_divrem_2 ();
2963 #endif
2964   tune_div_qr_2 ();
2965   tune_divexact_1 ();
2966   tune_modexact_1_odd ();
2967   printf("\n");
2968 
2969   relspeed_div_1_vs_mul_1 ();
2970   printf("\n");
2971 
2972   tune_mul_n ();
2973   printf("\n");
2974 
2975   tune_mul ();
2976   printf("\n");
2977 
2978   tune_sqr ();
2979   printf("\n");
2980 
2981   tune_mulmid ();
2982   printf("\n");
2983 
2984   tune_mulmod_bnm1 ();
2985   tune_sqrmod_bnm1 ();
2986   printf("\n");
2987 
2988   tune_fft_mul ();
2989   printf("\n");
2990 
2991   tune_fft_sqr ();
2992   printf ("\n");
2993 
2994   tune_mullo ();
2995   tune_sqrlo ();
2996   printf("\n");
2997 
2998   tune_dc_div ();
2999   tune_dc_bdiv ();
3000 
3001   printf("\n");
3002   tune_invertappr ();
3003   tune_invert ();
3004   printf("\n");
3005 
3006   tune_binvert ();
3007   tune_redc ();
3008   printf("\n");
3009 
3010   tune_mu_div ();
3011   tune_mu_bdiv ();
3012   printf("\n");
3013 
3014   tune_powm_sec ();
3015   printf("\n");
3016 
3017   tune_get_str ();
3018   tune_set_str ();
3019   printf("\n");
3020 
3021   tune_fac_ui ();
3022   printf("\n");
3023 
3024   tune_matrix22_mul ();
3025   tune_hgcd2 ();
3026   tune_hgcd ();
3027   tune_hgcd_appr ();
3028   tune_hgcd_reduce();
3029   tune_gcd_dc ();
3030   tune_gcdext_dc ();
3031   tune_jacobi_base ();
3032   printf("\n");
3033 
3034   time (&end_time);
3035   printf ("/* Tuneup completed successfully, took %ld seconds */\n",
3036           (long) (end_time - start_time));
3037 
3038   TMP_FREE;
3039 }
3040 
3041 
3042 int
main(int argc,char * argv[])3043 main (int argc, char *argv[])
3044 {
3045   int  opt;
3046 
3047   /* Unbuffered so if output is redirected to a file it isn't lost if the
3048      program is killed part way through.  */
3049   setbuf (stdout, NULL);
3050   setbuf (stderr, NULL);
3051 
3052   while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF)
3053     {
3054       switch (opt) {
3055       case 'f':
3056         if (optarg[0] == 't')
3057           option_fft_trace = 2;
3058         else
3059           option_fft_max_size = atol (optarg);
3060         break;
3061       case 'o':
3062         speed_option_set (optarg);
3063         break;
3064       case 'p':
3065         speed_precision = atoi (optarg);
3066         break;
3067       case 't':
3068         option_trace++;
3069         break;
3070       case '?':
3071         exit(1);
3072       }
3073     }
3074 
3075   all ();
3076   exit (0);
3077 }
3078