1 /* Create tuned thresholds for various algorithms.
2 
3 Copyright 1999, 2000, 2001, 2002, 2003, 2005, 2006, 2008, 2009, 2010,
4 2011, 2012 Free Software Foundation, Inc.
5 
6 This file is part of the GNU MP Library.
7 
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
12 
13 The GNU MP Library is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16 License for more details.
17 
18 You should have received a copy of the GNU Lesser General Public License
19 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20 
21 */
22 
23 
24 /* Usage: tuneup [-t] [-t] [-p precision]
25 
26    -t turns on some diagnostic traces, a second -t turns on more traces.
27 
28    Algorithm:
29 
30    The thresholds are determined as follows: two algorithms A and B are
31    compared over a range of sizes. At each point, we define "badness" to
32    be the percentage time lost if algorithm B is chosen over algorithm A.
33    Then total badness is the sum of this over all sizes measured.  The
34    threshold is set to minimize total badness.
35 
36    In practice the thresholds tend to be chosen to bring on algorithm B
37    fairly quickly.
38 
39    Implementation:
40 
41    In a normal library build the thresholds are constants.  To tune them
42    selected objects are recompiled with the thresholds as global variables
43    instead.  #define TUNE_PROGRAM_BUILD does this, with help from code at
44    the end of gmp-impl.h, and rules in tune/Makefile.am.
45 
46    MUL_KARATSUBA_THRESHOLD for example uses a recompiled mpn_mul_n.  The
47    threshold is set to "size+1" to avoid karatsuba, or to "size" to use one
48    level, but recurse into the basecase.
49 
50    MUL_TOOM3_THRESHOLD makes use of the tuned MUL_KARATSUBA_THRESHOLD value.
51    Other routines in turn will make use of both of those.  Naturally the
52    dependants must be tuned first.
53 
54    In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling,
55    just a threshold based on comparing two routines (mpn_divrem_1 and
56    mpn_divexact_1), and no further use of the value determined.
57 
58    Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being
59    just comparisons between certain routines on representative data.
60 
61    Shortcuts are applied when native (assembler) versions of routines exist.
62    For instance a native mpn_sqr_basecase is assumed to be always faster
63    than mpn_mul_basecase, with no measuring.
64 
65    No attempt is made to tune within assembler routines, for instance
66    DIVREM_1_NORM_THRESHOLD.  An assembler mpn_divrem_1 is expected to be
67    written and tuned all by hand.  Assembler routines that might have hard
68    limits are recompiled though, to make them accept a bigger range of sizes
69    than normal, eg. mpn_sqr_basecase to compare against mpn_kara_sqr_n.
70 
71    Code:
72      - main : checks for various command line options and calls all()
73      - all : prints the tuneup message, date and compiler, then calls
74              each of the individual tuning functions in turn, e.g.
75              tune_mul()
76      - tune_blah() : tunes function of type blah, e.g. tune_mul() tunes the
77              karatsuba and toom cutoffs. It sets up a param struct with the
78              following parameters:
79                a) name : the name of the threshold being tuned, e.g.
80                   MUL_TOOM3_THRESHOLD
81                b) function : the first function being compared (this must be
82                   of the form speed_blah and the function speed_blah will
83                   exist in speed.h and speed.c
84                c) function2 : the second function being compared (if set to
85                   NULL, this is automatically set to equal function
86                d) step_factor : the size of the step between sizes,
87                   set to 0.01 by default, i.e. 1% increments
88                e) function_fudge : multiplier for the speed of function, used
89                   to adjust for overheads, by default set to 1.0
90                f) stop_since_change is a stop condition. If the threshold
91                   has not changed for this many iterations, then stop. This
92                   is set to 80 iterations by default.
93                g) stop_factor : this is another stop factor. If method B
94                   becomes faster by at least this factor, then stop. By
95                   default this is set to 1.2, i.e. 20% faster.
96                h) min_size : the minimum size to start comparing from.
97                i) min_is_always : if this is set to 1, then if the threshold
98                   just ends up being min_size, then the threshold is actually
99                   set to 0, i.e. algorithm B is always used.
100                j) max_size : the maximum size to compare up to. By default this
101                   is set to DEFAULT_MAX_SIZE which is 1000 limbs.
102                h) check_size : if set, will check that the given starting size
103                   is valid for both algorithms and that algorithm B is at least
104                   4% slower than algorithm A at that point.
105                i) size_extra : this is a bias added to each size when doing
106                   measurements. It is subtracted off after each measurement.
107                   It is basically used for shifting a threshold from the
108                   measured value.
109                j) data_high : if set to 1, the high limb of xp and yp are set to
110                   be less than s->r, if set to 2, the high limb of xp and yp are
111                   set to be greater than or equal to s->r
112                k) noprint : if set, the threshold is computed but not printed.
113 
114              After setting all the appropriate parameters, the function one() is
115              called. It takes a reference to a parameter, e.g. mul_toom3_threshold
116              which is defined in a table below. That threshold will have been given
117              some initial value (usually MP_SIZE_T_MAX) in the table. It also takes
118              a reference to the param struct.
119         - one() : does repeated timings over the given range of sizes always setting
120             the threshold to size+1 for function and size for function2.
121 
122         N.B: the functions that need to be rebuilt to use variable thresholds must be
123         added to the Makefile.am file (and automake run) before tune can work.
124 
125 */
126 
127 #define TUNE_PROGRAM_BUILD  1   /* for gmp-impl.h */
128 
129 #include "config.h"
130 
131 #include <math.h>
132 #include <stdio.h>
133 #include <stdlib.h>
134 #include <time.h>
135 #if HAVE_UNISTD_H
136 #include <unistd.h>
137 #endif
138 
139 #include "mpir.h"
140 #include "gmp-impl.h"
141 #include "longlong.h"
142 
143 #include "tests.h"
144 #include "speed.h"
145 
146 #if !HAVE_DECL_OPTARG
147 extern char *optarg;
148 extern int optind, opterr;
149 #endif
150 
151 
152 #define DEFAULT_MAX_SIZE   1000  /* limbs */
153 
154 #if WANT_FFT
155 mp_size_t  option_fft_max_size = 50000;  /* limbs */
156 #else
157 mp_size_t  option_fft_max_size = 0;
158 #endif
159 int        option_trace = 0;
160 int        option_fft_trace = 0;
161 struct speed_params  s;
162 
163 struct dat_t {
164   mp_size_t  size;
165   double     d;
166 } *dat = NULL;
167 int  ndat = 0;
168 int  allocdat = 0;
169 
170 /* This is not defined if mpn_sqr_basecase doesn't declare a limit.  In that
171    case use zero here, which for params.max_size means no limit.  */
172 #ifndef TUNE_SQR_KARATSUBA_MAX
173 #define TUNE_SQR_KARATSUBA_MAX  0
174 #endif
175 
176 mp_size_t  mul_karatsuba_threshold      = MP_SIZE_T_MAX;
177 mp_size_t  mul_toom3_threshold          = MUL_TOOM3_THRESHOLD_LIMIT;
178 mp_size_t  mul_toom4_threshold          = MUL_TOOM4_THRESHOLD_LIMIT;
179 mp_size_t  mul_toom8h_threshold         = MUL_TOOM8H_THRESHOLD_LIMIT;
180 mp_size_t  mul_fft_full_threshold       = MP_SIZE_T_MAX;
181 mp_size_t  sqr_basecase_threshold       = MP_SIZE_T_MAX;
182 mp_size_t  sqr_karatsuba_threshold
183   = (TUNE_SQR_KARATSUBA_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_KARATSUBA_MAX);
184 mp_size_t  sqr_toom3_threshold          = SQR_TOOM3_THRESHOLD_LIMIT;
185 mp_size_t  sqr_toom4_threshold          = SQR_TOOM4_THRESHOLD_LIMIT;
186 mp_size_t  sqr_toom8_threshold          = SQR_TOOM8_THRESHOLD_LIMIT;
187 mp_size_t  sqr_fft_full_threshold       = MP_SIZE_T_MAX;
188 mp_size_t  mulmod_2expm1_threshold	= MP_SIZE_T_MAX;
189 mp_size_t  mullow_basecase_threshold    = MP_SIZE_T_MAX;
190 mp_size_t  mullow_dc_threshold          = MP_SIZE_T_MAX;
191 mp_size_t  mullow_mul_threshold         = MP_SIZE_T_MAX;
192 mp_size_t  mulmid_toom42_threshold      = MP_SIZE_T_MAX;
193 mp_size_t  mulhigh_basecase_threshold   = MP_SIZE_T_MAX;
194 mp_size_t  mulhigh_dc_threshold         = MP_SIZE_T_MAX;
195 mp_size_t  mulhigh_mul_threshold        = MP_SIZE_T_MAX;
196 mp_size_t  div_sb_preinv_threshold      = MP_SIZE_T_MAX;
197 mp_size_t  dc_div_qr_threshold          = MP_SIZE_T_MAX;
198 mp_size_t  inv_div_qr_threshold         = MP_SIZE_T_MAX;
199 mp_size_t  inv_divappr_q_n_threshold    = MP_SIZE_T_MAX;
200 mp_size_t  dc_div_q_threshold           = MP_SIZE_T_MAX;
201 mp_size_t  inv_div_q_threshold          = MP_SIZE_T_MAX;
202 mp_size_t  dc_divappr_q_threshold       = MP_SIZE_T_MAX;
203 mp_size_t  inv_divappr_q_threshold      = MP_SIZE_T_MAX;
204 mp_size_t  dc_bdiv_qr_threshold         = MP_SIZE_T_MAX;
205 mp_size_t  dc_bdiv_q_threshold          = MP_SIZE_T_MAX;
206 mp_size_t  binv_newton_threshold        = MP_SIZE_T_MAX;
207 mp_size_t  redc_1_to_redc_2_threshold   = MP_SIZE_T_MAX;
208 mp_size_t  redc_1_to_redc_n_threshold   = MP_SIZE_T_MAX;
209 mp_size_t  redc_2_to_redc_n_threshold   = MP_SIZE_T_MAX;
210 mp_size_t  matrix22_strassen_threshold  = MP_SIZE_T_MAX;
211 mp_size_t  hgcd_threshold               = MP_SIZE_T_MAX;
212 mp_size_t  hgcd_appr_threshold          = MP_SIZE_T_MAX;
213 mp_size_t  hgcd_reduce_threshold        = MP_SIZE_T_MAX;
214 mp_size_t  gcd_dc_threshold             = MP_SIZE_T_MAX;
215 mp_size_t  gcdext_dc_threshold          = MP_SIZE_T_MAX;
216 mp_size_t  divrem_1_norm_threshold      = MP_SIZE_T_MAX;
217 mp_size_t  divrem_1_unnorm_threshold    = MP_SIZE_T_MAX;
218 mp_size_t  mod_1_norm_threshold         = MP_SIZE_T_MAX;
219 mp_size_t  mod_1_1_threshold            = MP_SIZE_T_MAX;
220 mp_size_t  mod_1_2_threshold            = MP_SIZE_T_MAX;
221 mp_size_t  mod_1_3_threshold            = MP_SIZE_T_MAX;
222 mp_size_t  mod_1_unnorm_threshold       = MP_SIZE_T_MAX;
223 mp_size_t  divrem_2_threshold           = MP_SIZE_T_MAX;
224 mp_size_t  get_str_dc_threshold         = MP_SIZE_T_MAX;
225 mp_size_t  get_str_precompute_threshold = MP_SIZE_T_MAX;
226 mp_size_t  set_str_dc_threshold         = MP_SIZE_T_MAX;
227 mp_size_t  set_str_precompute_threshold = MP_SIZE_T_MAX;
228 mp_size_t  rootrem_threshold            = MP_SIZE_T_MAX;
229 mp_size_t  divrem_hensel_qr_1_threshold = MP_SIZE_T_MAX;
230 mp_size_t  rsh_divrem_hensel_qr_1_threshold = MP_SIZE_T_MAX;
231 mp_size_t  divrem_euclid_hensel_threshold = MP_SIZE_T_MAX;
232 mp_size_t  fac_odd_threshold            = 0;
233 mp_size_t  fac_dsc_threshold            = FAC_DSC_THRESHOLD_LIMIT;
234 
235 struct param_t {
236   const char        *name;
237   speed_function_t  function;
238   speed_function_t  function2;
239   double            step_factor;    /* how much to step sizes (rounded down) */
240   double            function_fudge; /* multiplier for "function" speeds */
241   int               stop_since_change;
242   double            stop_factor;
243   mp_size_t         min_size;
244   int               min_is_always;
245   mp_size_t         max_size;
246   mp_size_t         check_size;
247   mp_size_t         size_extra;
248 
249 #define DATA_HIGH_LT_R  1
250 #define DATA_HIGH_GE_R  2
251   int               data_high;
252 
253   int               noprint;
254 };
255 
256 
257 /* These are normally undefined when false, which suits "#if" fine.
258    But give them zero values so they can be used in plain C "if"s.  */
259 #ifndef UDIV_PREINV_ALWAYS
260 #define UDIV_PREINV_ALWAYS 0
261 #endif
262 #ifndef HAVE_NATIVE_mpn_divexact_1
263 #define HAVE_NATIVE_mpn_divexact_1 0
264 #endif
265 #ifndef HAVE_NATIVE_mpn_divrem_1
266 #define HAVE_NATIVE_mpn_divrem_1 0
267 #endif
268 #ifndef HAVE_NATIVE_mpn_divrem_2
269 #define HAVE_NATIVE_mpn_divrem_2 0
270 #endif
271 #ifndef HAVE_NATIVE_mpn_mod_1
272 #define HAVE_NATIVE_mpn_mod_1 0
273 #endif
274 #ifndef HAVE_NATIVE_mpn_modexact_1_odd
275 #define HAVE_NATIVE_mpn_modexact_1_odd 0
276 #endif
277 #ifndef HAVE_NATIVE_mpn_preinv_divrem_1
278 #define HAVE_NATIVE_mpn_preinv_divrem_1 0
279 #endif
280 #ifndef HAVE_NATIVE_mpn_preinv_mod_1
281 #define HAVE_NATIVE_mpn_preinv_mod_1 0
282 #endif
283 #ifndef HAVE_NATIVE_mpn_sqr_basecase
284 #define HAVE_NATIVE_mpn_sqr_basecase 0
285 #endif
286 
287 
288 #define MAX3(a,b,c)  MAX (MAX (a, b), c)
289 
290 mp_limb_t
randlimb_norm(gmp_randstate_t rands)291 randlimb_norm (gmp_randstate_t rands)
292 {
293   mp_limb_t  n;
294   mpn_randomb (&n,rands, 1);
295   n |= GMP_NUMB_HIGHBIT;
296   return n;
297 }
298 
299 #define GMP_NUMB_HALFMASK  ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1)
300 
301 mp_limb_t
randlimb_half(gmp_randstate_t rands)302 randlimb_half (gmp_randstate_t rands)
303 {
304   mp_limb_t  n;
305   mpn_randomb (&n, rands,1);
306   n &= GMP_NUMB_HALFMASK;
307   n += (n==0);
308   return n;
309 }
310 
311 
312 /* Add an entry to the end of the dat[] array, reallocing to make it bigger
313    if necessary.  */
314 void
add_dat(mp_size_t size,double d)315 add_dat (mp_size_t size, double d)
316 {
317 #define ALLOCDAT_STEP  500
318 
319   ASSERT_ALWAYS (ndat <= allocdat);
320 
321   if (ndat == allocdat)
322     {
323       dat = (struct dat_t *) __gmp_allocate_or_reallocate
324         (dat, allocdat * sizeof(dat[0]),
325          (allocdat+ALLOCDAT_STEP) * sizeof(dat[0]));
326       allocdat += ALLOCDAT_STEP;
327     }
328 
329   dat[ndat].size = size;
330   dat[ndat].d = d;
331   ndat++;
332 }
333 
334 
335 /* Return the threshold size based on the data accumulated. */
336 mp_size_t
analyze_dat(int final)337 analyze_dat (int final)
338 {
339   double  x, min_x;
340   int     j, min_j;
341 
342   /* If the threshold is set at dat[0].size, any positive values are bad. */
343   x = 0.0;
344   for (j = 0; j < ndat; j++)
345     if (dat[j].d > 0.0)
346       x += dat[j].d;
347 
348   if (option_trace >= 2 && final)
349     {
350       printf ("\n");
351       printf ("x is the sum of the badness from setting thresh at given size\n");
352       printf ("  (minimum x is sought)\n");
353       printf ("size=%ld  first x=%.4f\n", (long) dat[j].size, x);
354     }
355 
356   min_x = x;
357   min_j = 0;
358 
359 
360   /* When stepping to the next dat[j].size, positive values are no longer
361      bad (so subtracted), negative values become bad (so add the absolute
362      value, meaning subtract). */
363   for (j = 0; j < ndat; x -= dat[j].d, j++)
364     {
365       if (option_trace >= 2 && final)
366         printf ("size=%ld  x=%.4f\n", (long) dat[j].size, x);
367 
368       if (x < min_x)
369         {
370           min_x = x;
371           min_j = j;
372         }
373     }
374 
375   return min_j;
376 }
377 
378 
379 /* Measuring for recompiled mpn/generic/divrem_1.c and mpn/generic/mod_1.c */
380 
381 mp_limb_t mpn_divrem_1_tune _PROTO ((mp_ptr qp, mp_size_t xsize,
382                                     mp_srcptr ap, mp_size_t size,
383                                     mp_limb_t d));
384 mp_limb_t mpn_mod_1_tune _PROTO ((mp_srcptr ap, mp_size_t size, mp_limb_t d));
385 
386 void mpz_fac_ui_tune _PROTO ((mpz_ptr, mpir_ui));
387 
388 double
speed_mpn_mod_1_tune(struct speed_params * s)389 speed_mpn_mod_1_tune (struct speed_params *s)
390 {
391   SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune);
392 }
393 double
speed_mpn_divrem_1_tune(struct speed_params * s)394 speed_mpn_divrem_1_tune (struct speed_params *s)
395 {
396   SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune);
397 }
398 
399 double
speed_mpz_fac_ui_tune(struct speed_params * s)400 speed_mpz_fac_ui_tune (struct speed_params *s)
401 {
402   SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui_tune);
403 }
404 
405 double
tuneup_measure(speed_function_t fun,gmp_randstate_t rands,const struct param_t * param,struct speed_params * s)406 tuneup_measure (speed_function_t fun,gmp_randstate_t rands,
407                 const struct param_t *param,
408                 struct speed_params *s)
409 {
410   static struct param_t  dummy;
411   double   t;
412   TMP_DECL;
413 
414   if (! param)
415     param = &dummy;
416 
417   s->size += param->size_extra;
418 
419   TMP_MARK;
420   SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0);
421   SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0);
422 
423   mpn_randomb (s->xp, rands, s->size);
424   mpn_randomb (s->yp, rands, s->size);
425 
426   switch (param->data_high) {
427   case DATA_HIGH_LT_R:
428     s->xp[s->size-1] %= s->r;
429     s->yp[s->size-1] %= s->r;
430     break;
431   case DATA_HIGH_GE_R:
432     s->xp[s->size-1] |= s->r;
433     s->yp[s->size-1] |= s->r;
434     break;
435   }
436 
437   t = speed_measure (fun, s);
438 
439   s->size -= param->size_extra;
440 
441   TMP_FREE;
442   return t;
443 }
444 
445 
446 #define PRINT_WIDTH  28
447 
448 void
print_define_start(const char * name)449 print_define_start (const char *name)
450 {
451   printf ("#define %-*s  ", PRINT_WIDTH, name);
452   if (option_trace)
453     printf ("...\n");
454 }
455 
456 void
print_define_end_remark(const char * name,mp_size_t value,const char * remark)457 print_define_end_remark (const char *name, mp_size_t value, const char *remark)
458 {
459   if (option_trace)
460     printf ("#define %-*s  ", PRINT_WIDTH, name);
461 
462   if (value == MP_SIZE_T_MAX)
463     printf ("MP_SIZE_T_MAX");
464   else
465     printf ("%5ld", (long) value);
466 
467   if (remark != NULL)
468     printf ("  /* %s */", remark);
469   printf ("\n");
470 }
471 
472 void
print_define_end(const char * name,mp_size_t value)473 print_define_end (const char *name, mp_size_t value)
474 {
475   const char  *remark;
476   if (value == MP_SIZE_T_MAX)
477     remark = "never";
478   else if (value == 0)
479     remark = "always";
480   else
481     remark = NULL;
482   print_define_end_remark (name, value, remark);
483 }
484 
485 void
print_define(const char * name,mp_size_t value)486 print_define (const char *name, mp_size_t value)
487 {
488   print_define_start (name);
489   print_define_end (name, value);
490 }
491 
492 void
print_define_remark(const char * name,mp_size_t value,const char * remark)493 print_define_remark (const char *name, mp_size_t value, const char *remark)
494 {
495   print_define_start (name);
496   print_define_end_remark (name, value, remark);
497 }
498 
499 
500 void
one(mp_size_t * threshold,gmp_randstate_t rands,struct param_t * param)501 one (mp_size_t *threshold, gmp_randstate_t rands,struct param_t *param)
502 {
503   int  since_positive, since_thresh_change;
504   int  thresh_idx, new_thresh_idx;
505 
506 #define DEFAULT(x,n)  do { if (! (x))  (x) = (n); } while (0)
507 
508   DEFAULT (param->function_fudge, 1.0);
509   DEFAULT (param->function2, param->function);
510   DEFAULT (param->step_factor, 0.01);  /* small steps by default */
511   DEFAULT (param->stop_since_change, 80);
512   DEFAULT (param->stop_factor, 1.2);
513   DEFAULT (param->min_size, 10);
514   DEFAULT (param->max_size, DEFAULT_MAX_SIZE);
515 
516   if (param->check_size != 0)
517     {
518       double   t1, t2;
519       s.size = param->check_size;
520 
521       *threshold = s.size+1;
522       t1 = tuneup_measure (param->function, rands,param, &s);
523 
524       *threshold = s.size;
525       t2 = tuneup_measure (param->function2, rands, param, &s);
526       if (t1 == -1.0 || t2 == -1.0)
527         {
528           printf ("Oops, can't run both functions at size %ld\n",
529                   (long) s.size);
530           abort ();
531         }
532       t1 *= param->function_fudge;
533 
534       /* ask that t2 is at least 4% below t1 */
535       if (t1 < t2*1.04)
536         {
537           if (option_trace)
538             printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2);
539           *threshold = MP_SIZE_T_MAX;
540           if (! param->noprint)
541             print_define (param->name, *threshold);
542           return;
543         }
544 
545       if (option_trace >= 2)
546         printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n",
547                 (long) s.size, t1, t2);
548     }
549 
550   if (! param->noprint || option_trace)
551     print_define_start (param->name);
552 
553   ndat = 0;
554   since_positive = 0;
555   since_thresh_change = 0;
556   thresh_idx = 0;
557 
558   if (option_trace >= 2)
559     {
560       printf ("             algorithm-A  algorithm-B   ratio  possible\n");
561       printf ("              (seconds)    (seconds)    diff    thresh\n");
562     }
563 
564   for (s.size = param->min_size;
565        s.size < param->max_size;
566        s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), 1))
567     {
568       double   ti, tiplus1, d;
569 
570       /* If there's a size limit and it's reached then it should still
571          be sensible to analyze the data since we want the threshold put
572          either at or near the limit.  */
573       if (s.size >= param->max_size)
574         {
575           if (option_trace)
576             printf ("Reached maximum size (%ld) without otherwise stopping\n",
577                     (long) param->max_size);
578           break;
579         }
580 
581       /*
582         FIXME: check minimum size requirements are met, possibly by just
583         checking for the -1 returns from the speed functions.
584       */
585 
586       /* using method A at this size */
587       *threshold = s.size+1;
588       ti = tuneup_measure (param->function, rands,param, &s);
589       if (ti == -1.0)
590         abort ();
591       ti *= param->function_fudge;
592 
593       /* using method B at this size */
594       *threshold = s.size;
595       tiplus1 = tuneup_measure (param->function2, rands, param, &s);
596       if (tiplus1 == -1.0)
597         abort ();
598 
599       /* Calculate the fraction by which the one or the other routine is
600          slower.  */
601       if (tiplus1 >= ti)
602         d = (tiplus1 - ti) / tiplus1;  /* negative */
603       else
604         d = (tiplus1 - ti) / ti;       /* positive */
605 
606       add_dat (s.size, d);
607 
608       new_thresh_idx = analyze_dat (0);
609 
610       if (option_trace >= 2)
611         printf ("size=%ld  %.9f  %.9f  % .4f %c  %ld\n",
612                 (long) s.size, ti, tiplus1, d,
613                 ti > tiplus1 ? '#' : ' ',
614                 (long) dat[new_thresh_idx].size);
615 
616       /* Stop if the last time method i was faster was more than a
617          certain number of measurements ago.  */
618 #define STOP_SINCE_POSITIVE  200
619       if (d >= 0)
620         since_positive = 0;
621       else
622         if (++since_positive > STOP_SINCE_POSITIVE)
623           {
624             if (option_trace >= 1)
625               printf ("stopped due to since_positive (%d)\n",
626                       STOP_SINCE_POSITIVE);
627             break;
628           }
629 
630       /* Stop if method A has become slower by a certain factor. */
631       if (ti >= tiplus1 * param->stop_factor)
632         {
633           if (option_trace >= 1)
634             printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n",
635                     param->stop_factor);
636           break;
637         }
638 
639       /* Stop if the threshold implied hasn't changed in a certain
640          number of measurements.  (It's this condition that ususally
641          stops the loop.) */
642       if (thresh_idx != new_thresh_idx)
643         since_thresh_change = 0, thresh_idx = new_thresh_idx;
644       else
645         if (++since_thresh_change > param->stop_since_change)
646           {
647             if (option_trace >= 1)
648               printf ("stopped due to since_thresh_change (%d)\n",
649                       param->stop_since_change);
650             break;
651           }
652 
653       /* Stop if the threshold implied is more than a certain number of
654          measurements ago.  */
655 #define STOP_SINCE_AFTER   500
656       if (ndat - thresh_idx > STOP_SINCE_AFTER)
657         {
658           if (option_trace >= 1)
659             printf ("stopped due to ndat - thresh_idx > amount (%d)\n",
660                     STOP_SINCE_AFTER);
661           break;
662         }
663 
664       /* Stop when the size limit is reached before the end of the
665          crossover, but only show this as an error for >= the default max
666          size.  FIXME: Maybe should make it a param choice whether this is
667          an error.  */
668       if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE)
669         {
670           fprintf (stderr, "%s\n", param->name);
671           fprintf (stderr, "sizes %ld to %ld total %d measurements\n",
672                    (long) dat[0].size, (long) dat[ndat-1].size, ndat);
673           fprintf (stderr, "    max size reached before end of crossover\n");
674           break;
675         }
676     }
677 
678   if (option_trace >= 1)
679     printf ("sizes %ld to %ld total %d measurements\n",
680             (long) dat[0].size, (long) dat[ndat-1].size, ndat);
681 
682   *threshold = dat[analyze_dat (1)].size;
683 
684   if (param->min_is_always)
685     {
686       if (*threshold == param->min_size)
687         *threshold = 0;
688     }
689 
690   if (! param->noprint || option_trace)
691     print_define_end (param->name, *threshold);
692 }
693 
694 
695 /* Special probing for the fft thresholds.  The size restrictions on the
696    FFTs mean the graph of time vs size has a step effect.  See this for
697    example using
698 
699        ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9
700        gnuplot foo.gnuplot
701 
702    The current approach is to compare routines at the midpoint of relevant
703    steps.  Arguably a more sophisticated system of threshold data is wanted
704    if this step effect remains. */
705 
706 struct fft_param_t {
707   const char        *threshold_name;
708   mp_size_t         *p_threshold;
709   mp_size_t         first_size;
710   mp_size_t         max_size;
711   speed_function_t  function;
712   speed_function_t  mul_function;
713   mp_size_t         sqr;
714 };
715 
716 mp_size_t
fft_step_size(int size)717 fft_step_size (int size)
718 {
719   mp_size_t  step;
720 
721   step = mpir_fft_adjust_limbs(size + 1) - size;
722 
723   if (step <= 0)
724     {
725       printf ("Can't handle size=%d\n", size);
726       abort ();
727     }
728 
729   return step;
730 }
731 
732 void
fft(struct fft_param_t * p,gmp_randstate_t rands)733 fft (struct fft_param_t *p,gmp_randstate_t rands)
734 {
735   mp_size_t  size;
736   int        i, k;
737 
738   *p->p_threshold = MP_SIZE_T_MAX;
739 
740   option_trace = MAX (option_trace, option_fft_trace);
741 
742   size = p->first_size;
743 
744   /* Declare an FFT faster than a plain toom3 etc multiplication found as
745      soon as one faster measurement obtained.  A multiplication in the
746      middle of the FFT step is tested.  */
747   for (;;)
748     {
749       double  tk, tm;
750 
751       size = mpir_fft_adjust_limbs (size+1);
752 
753       if (size >= p->max_size)
754         break;
755 
756       s.size = size + fft_step_size (size) / 2;
757 
758       tk = tuneup_measure (p->function, rands, NULL, &s);
759       if (tk == -1.0)
760         abort ();
761 
762       tm = tuneup_measure (p->mul_function, rands, NULL, &s);
763       if (tm == -1.0)
764         abort ();
765 
766       if (option_trace >= 2)
767         printf ("at %ld   size=%ld   %.9f   size=%ld %s mul %.9f\n",
768                 (long) size,
769                 (long) size + fft_step_size (size) / 2, tk,
770                 (long) s.size, "full", tm);
771 
772       if (tk < tm)
773         {
774             *p->p_threshold = s.size;
775             print_define (p->threshold_name,      *p->p_threshold);
776             break;
777         }
778     }
779 
780 }
781 
782 /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
783    giving wrong results.  */
784 void
tune_mul(gmp_randstate_t rands)785 tune_mul (gmp_randstate_t rands)
786 {
787   static struct param_t  param;
788 
789   param.function = speed_mpn_mul_n;
790 
791   param.name = "MUL_KARATSUBA_THRESHOLD";
792   param.min_size = MAX (4, MPN_KARA_MUL_N_MINSIZE);
793   param.max_size = MUL_KARATSUBA_THRESHOLD_LIMIT-1;
794   one (&mul_karatsuba_threshold, rands,&param);
795 
796   param.name = "MUL_TOOM3_THRESHOLD";
797   param.min_size = MAX (mul_karatsuba_threshold, MPN_TOOM3_MUL_N_MINSIZE);
798   param.max_size = MUL_TOOM3_THRESHOLD_LIMIT-1;
799   one (&mul_toom3_threshold, rands, &param);
800 
801   param.name = "MUL_TOOM4_THRESHOLD";
802   param.min_size = MAX (mul_toom3_threshold, MPN_TOOM4_MUL_N_MINSIZE);
803   param.max_size = MUL_TOOM4_THRESHOLD_LIMIT-1;
804   one (&mul_toom4_threshold, rands, &param);
805 
806   param.name = "MUL_TOOM8H_THRESHOLD";
807   param.min_size = MAX (mul_toom4_threshold, MPN_TOOM8H_MUL_MINSIZE);
808   param.max_size = MUL_TOOM8H_THRESHOLD_LIMIT-1;
809   one (&mul_toom8h_threshold, rands, &param);
810 
811   /* disabled until tuned */
812   MUL_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX;
813 }
814 
815 
816 /* This was written by the tuneup challenged tege.  Kevin, please delete
817    this comment when you've reviewed/rewritten this.  :-) */
818 void
tune_mullow(gmp_randstate_t rands)819 tune_mullow (gmp_randstate_t rands)
820 {
821   static struct param_t  param;
822 
823   param.function = speed_mpn_mullow_n;
824 
825   param.name = "MULLOW_BASECASE_THRESHOLD";
826   param.min_size = 3;
827   param.min_is_always = 1;
828   //param.max_size = MULLOW_BASECASE_THRESHOLD_LIMIT-1;
829   one (&mullow_basecase_threshold, rands, &param);
830 
831   param.min_is_always = 0;	/* ??? */
832 
833   param.name = "MULLOW_DC_THRESHOLD";
834   param.min_size = mullow_basecase_threshold;
835   param.max_size = 1000;
836   one (&mullow_dc_threshold, rands, &param);
837 
838   param.name = "MULLOW_MUL_THRESHOLD";
839   param.min_size = mullow_dc_threshold;
840   param.max_size = 10000;
841   one (&mullow_mul_threshold, rands, &param);
842 
843   /* disabled until tuned */
844   MUL_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX;
845 }
846 
847 void
tune_mulmid(gmp_randstate_t rands)848 tune_mulmid (gmp_randstate_t rands)
849 {
850   static struct param_t  param;
851 
852   param.name = "MULMID_TOOM42_THRESHOLD";
853   param.function = speed_mpn_mulmid_n;
854   param.min_size = 4;
855   param.max_size = 100;
856   one (&mulmid_toom42_threshold, rands, &param);
857 
858   /* disabled until tuned */
859   MUL_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX;
860 }
861 
862 void
tune_mulmod_2expm1(gmp_randstate_t rands)863 tune_mulmod_2expm1 (gmp_randstate_t rands)
864 {
865   static struct param_t  param;
866   param.function = speed_mpn_mulmod_2expm1;
867   param.name = "MULMOD_2EXPM1_THRESHOLD";
868   param.min_size = 1;
869   //param.max_size =  ?? ;
870   one (&mulmod_2expm1_threshold, rands, &param);
871   /* disabled until tuned */
872   MUL_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX;    // ??????????????
873 }
874 
875 void
tune_mulhigh(gmp_randstate_t rands)876 tune_mulhigh (gmp_randstate_t rands)
877 {
878   static struct param_t  param;
879 
880   param.function = speed_mpn_mulhigh_n;
881 
882   param.name = "MULHIGH_BASECASE_THRESHOLD";
883   param.min_size = 3;
884   param.min_is_always = 3;
885   //param.max_size = MULHIGH_BASECASE_THRESHOLD_LIMIT-1;
886   one (&mulhigh_basecase_threshold, rands, &param);
887 
888   param.min_is_always = 0;	/* ??? */
889 
890   param.name = "MULHIGH_DC_THRESHOLD";
891   param.min_size = MAX(mulhigh_basecase_threshold,4);
892   param.max_size = 1000;
893   one (&mulhigh_dc_threshold, rands, &param);
894 
895   param.name = "MULHIGH_MUL_THRESHOLD";
896   param.min_size = mulhigh_dc_threshold;
897   param.max_size = 10000;
898   one (&mulhigh_mul_threshold, rands, &param);
899 
900   /* disabled until tuned */
901   MUL_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX;
902 }
903 
904 void
tune_rootrem(gmp_randstate_t rands)905 tune_rootrem (gmp_randstate_t rands)
906 {
907 
908   static struct param_t  param;
909   s.r=5;   // tune for 5th root
910   param.function = speed_mpn_rootrem;
911   param.name = "ROOTREM_THRESHOLD";
912   param.min_size = 1;
913   one (&rootrem_threshold, rands, &param);
914 }
915 
916 void
tune_divrem_hensel_qr_1(gmp_randstate_t rands)917 tune_divrem_hensel_qr_1 (gmp_randstate_t rands)
918 {
919   static struct param_t  param;
920   param.function = speed_mpn_divrem_hensel_qr_1;
921   param.name = "DIVREM_HENSEL_QR_1_THRESHOLD";
922   param.min_size = 2;
923   one (&divrem_hensel_qr_1_threshold, rands, &param);
924 }
925 
926 void
tune_rsh_divrem_hensel_qr_1(gmp_randstate_t rands)927 tune_rsh_divrem_hensel_qr_1 (gmp_randstate_t rands)
928 {
929   static struct param_t  param;
930   param.function = speed_mpn_rsh_divrem_hensel_qr_1;
931   param.name = "RSH_DIVREM_HENSEL_QR_1_THRESHOLD";
932   param.min_size = 3;
933   one (&rsh_divrem_hensel_qr_1_threshold, rands, &param);
934 }
935 
936 void
tune_divrem_euclid_hensel(gmp_randstate_t rands)937 tune_divrem_euclid_hensel (gmp_randstate_t rands)
938 {
939   static struct param_t  param;
940   param.function = speed_mpn_divrem_1;
941   param.name = "DIVREM_EUCLID_HENSEL_THRESHOLD";
942   param.min_size = 8;
943   s.r=0x81234567;// tune for this divisor
944   one (&divrem_euclid_hensel_threshold, rands, &param);
945 }
946 
947 // for tuning  we dont care if the divisors go out of range as it doesn't affect the runtime
tune_mod_1_k(gmp_randstate_t rands)948 void tune_mod_1_k (gmp_randstate_t rands)
949 {
950   static struct param_t  param;
951 
952   param.function = speed_mpn_divrem_euclidean_r_1;
953 
954   param.name = "MOD_1_1_THRESHOLD";
955   param.min_size = 3;
956   one (&mod_1_1_threshold, rands, &param);
957 
958   param.name = "MOD_1_2_THRESHOLD";
959   param.min_size = MAX(mod_1_1_threshold,4);
960   //param.max_size = 1000;
961   one (&mod_1_2_threshold, rands, &param);
962 
963   param.name = "MOD_1_3_THRESHOLD";
964   param.min_size = MAX(mod_1_2_threshold,5);
965   //param.max_size = 10000;
966   one (&mod_1_3_threshold, rands, &param);
967 
968 }
969 
970 
971 
972 /* Start the basecase from 3, since 1 is a special case, and if mul_basecase
973    is faster only at size==2 then we don't want to bother with extra code
974    just for that.  Start karatsuba from 4 same as MUL above.  */
975 
976 void
tune_sqr(gmp_randstate_t rands)977 tune_sqr (gmp_randstate_t rands)
978 {
979   /* disabled until tuned */
980   SQR_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX;
981 
982   if (HAVE_NATIVE_mpn_sqr_basecase)
983     {
984       print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)");
985       sqr_basecase_threshold = 0;
986     }
987   else
988     {
989       static struct param_t  param;
990       param.name = "SQR_BASECASE_THRESHOLD";
991       param.function = speed_mpn_sqr;
992       param.min_size = 3;
993       param.min_is_always = 1;
994       param.max_size = TUNE_SQR_KARATSUBA_MAX;
995       param.noprint = 1;
996       one (&sqr_basecase_threshold, rands, &param);
997     }
998 
999   {
1000     static struct param_t  param;
1001     param.name = "SQR_KARATSUBA_THRESHOLD";
1002     param.function = speed_mpn_sqr;
1003     param.min_size = MAX (4, MPN_KARA_SQR_N_MINSIZE);
1004     param.max_size = TUNE_SQR_KARATSUBA_MAX;
1005     param.noprint = 1;
1006     one (&sqr_karatsuba_threshold, rands, &param);
1007 
1008     if (! HAVE_NATIVE_mpn_sqr_basecase
1009         && sqr_karatsuba_threshold < sqr_basecase_threshold)
1010       {
1011         /* Karatsuba becomes faster than mul_basecase before
1012            sqr_basecase does.  Arrange for the expression
1013            "BELOW_THRESHOLD (un, SQR_KARATSUBA_THRESHOLD))" which
1014            selects mpn_sqr_basecase in mpn_sqr to be false, by setting
1015            SQR_KARATSUBA_THRESHOLD to zero, making
1016            SQR_BASECASE_THRESHOLD the karatsuba threshold.  */
1017 
1018         sqr_basecase_threshold = SQR_KARATSUBA_THRESHOLD;
1019         SQR_KARATSUBA_THRESHOLD = 0;
1020 
1021         print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold,
1022                              "karatsuba");
1023         print_define_remark ("SQR_KARATSUBA_THRESHOLD",SQR_KARATSUBA_THRESHOLD,
1024                              "never sqr_basecase");
1025       }
1026     else
1027       {
1028         if (! HAVE_NATIVE_mpn_sqr_basecase)
1029           print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold);
1030         print_define ("SQR_KARATSUBA_THRESHOLD", SQR_KARATSUBA_THRESHOLD);
1031       }
1032   }
1033 
1034   {
1035     static struct param_t  param;
1036     param.function = speed_mpn_sqr;
1037 
1038   {
1039     param.name = "SQR_TOOM3_THRESHOLD";
1040     param.min_size = MAX3 (MPN_TOOM3_SQR_N_MINSIZE,
1041                            SQR_KARATSUBA_THRESHOLD, SQR_BASECASE_THRESHOLD);
1042     param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1;
1043     one (&sqr_toom3_threshold, rands, &param);
1044   }
1045 
1046   {
1047     param.name = "SQR_TOOM4_THRESHOLD";
1048     param.min_size = MAX (MPN_TOOM4_SQR_N_MINSIZE, sqr_toom3_threshold);
1049     param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1;
1050     one (&sqr_toom4_threshold, rands, &param);
1051   }
1052 
1053   {
1054     param.name = "SQR_TOOM8_THRESHOLD";
1055     param.min_size = MAX (MPN_TOOM8_SQR_N_MINSIZE, sqr_toom4_threshold);
1056     param.max_size = SQR_TOOM8_THRESHOLD_LIMIT-1;
1057     one (&sqr_toom8_threshold, rands, &param);
1058   }
1059   }
1060 }
1061 
1062 void
tune_dc_div(gmp_randstate_t rands)1063 tune_dc_div (gmp_randstate_t rands)
1064 {
1065   {
1066   static struct param_t  param;
1067   param.name = "DC_DIV_QR_THRESHOLD";
1068   param.function = speed_mpn_dc_div_qr_n;
1069   param.step_factor = 0.02;
1070   one (&dc_div_qr_threshold, rands, &param);
1071   }
1072 
1073   {
1074   static struct param_t  param;
1075   param.name = "INV_DIV_QR_THRESHOLD";
1076   param.max_size = 10000;
1077   param.function = speed_mpn_inv_div_qr;
1078   param.min_size = dc_div_qr_threshold;
1079   param.step_factor = 0.02;
1080   one (&inv_div_qr_threshold, rands, &param);
1081   }
1082 
1083   {
1084   static struct param_t  param;
1085   param.name = "INV_DIVAPPR_Q_N_THRESHOLD";
1086   param.function = speed_mpn_inv_divappr_q;
1087   param.max_size = 10000;
1088   param.min_size = dc_divappr_q_threshold;
1089   param.step_factor = 0.1;
1090   param.stop_factor = 0.2;
1091   one (&inv_divappr_q_n_threshold, rands, &param);
1092   }
1093 }
1094 
1095 void
tune_tdiv_q(gmp_randstate_t rands)1096 tune_tdiv_q (gmp_randstate_t rands)
1097 {
1098   {
1099   static struct param_t  param;
1100   param.name = "DC_DIV_Q_THRESHOLD";
1101   param.function = speed_mpn_tdiv_q1;
1102   param.step_factor = 0.02;
1103   one (&dc_div_q_threshold, rands, &param);
1104   }
1105 
1106   {
1107   static struct param_t  param;
1108   param.name = "INV_DIV_Q_THRESHOLD";
1109   param.function = speed_mpn_tdiv_q1;
1110   param.max_size = 10000;
1111   param.min_size = dc_div_q_threshold;
1112   param.step_factor = 0.02;
1113   one (&inv_div_q_threshold, rands, &param);
1114   }
1115 
1116   {
1117   static struct param_t  param;
1118   param.name = "DC_DIVAPPR_Q_THRESHOLD";
1119   param.function = speed_mpn_tdiv_q2;
1120   param.step_factor = 0.02;
1121   one (&dc_divappr_q_threshold, rands, &param);
1122   }
1123 
1124   {
1125   static struct param_t  param;
1126   param.name = "INV_DIVAPPR_Q_THRESHOLD";
1127   param.function = speed_mpn_tdiv_q2;
1128   param.max_size = 20000;
1129   param.min_size = dc_divappr_q_threshold;
1130   param.step_factor = 0.1;
1131   one (&inv_divappr_q_threshold, rands, &param);
1132   }
1133 }
1134 
1135 void
tune_dc_bdiv(gmp_randstate_t rands)1136 tune_dc_bdiv (gmp_randstate_t rands)
1137 {
1138   {
1139   static struct param_t  param;
1140   param.name = "DC_BDIV_QR_THRESHOLD";
1141   param.function = speed_mpn_dc_bdiv_qr_n;
1142   param.step_factor = 0.02;
1143   one (&dc_bdiv_qr_threshold, rands, &param);
1144   }
1145 
1146   {
1147   static struct param_t  param;
1148   param.name = "DC_BDIV_Q_THRESHOLD";
1149   param.function = speed_mpn_dc_bdiv_q;
1150   param.step_factor = 0.02;
1151   one (&dc_bdiv_q_threshold, rands, &param);
1152   }
1153 }
1154 
1155 void
tune_binvert(gmp_randstate_t rands)1156 tune_binvert (gmp_randstate_t rands)
1157 {
1158   static struct param_t  param;
1159 
1160   param.function = speed_mpn_binvert;
1161   param.name = "BINV_NEWTON_THRESHOLD";
1162   param.min_size = 8;		/* pointless with smaller operands */
1163   one (&binv_newton_threshold, rands, &param);
1164 }
1165 
1166 void
tune_redc(gmp_randstate_t rands)1167 tune_redc (gmp_randstate_t rands)
1168 {
1169 #define TUNE_REDC_2_MAX 100
1170 #if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
1171 #define WANT_REDC_2 1
1172 #endif
1173 
1174 #if WANT_REDC_2
1175   {
1176     static struct param_t  param;
1177     param.name = "REDC_1_TO_REDC_2_THRESHOLD";
1178     param.function = speed_mpn_redc_1;
1179     param.function2 = speed_mpn_redc_2;
1180     param.min_size = 1;
1181     param.min_is_always = 1;
1182     param.max_size = TUNE_REDC_2_MAX;
1183     param.noprint = 1;
1184     param.stop_factor = 1.5;
1185     one (&redc_1_to_redc_2_threshold, rands, &param);
1186   }
1187   {
1188     static struct param_t  param;
1189     param.name = "REDC_2_TO_REDC_N_THRESHOLD";
1190     param.function = speed_mpn_redc_2;
1191     param.function2 = speed_mpn_redc_n;
1192     param.min_size = 16;
1193     param.noprint = 1;
1194     one (&redc_2_to_redc_n_threshold, rands, &param);
1195   }
1196   if (redc_1_to_redc_2_threshold >= redc_2_to_redc_n_threshold)
1197     {
1198       redc_2_to_redc_n_threshold = 0;	/* disable redc_2 */
1199 
1200       /* Never use redc2, measure redc_1 -> redc_n cutoff, store result as
1201 	 REDC_1_TO_REDC_2_THRESHOLD.  */
1202       {
1203 	static struct param_t  param;
1204 	param.name = "REDC_1_TO_REDC_2_THRESHOLD";
1205 	param.function = speed_mpn_redc_1;
1206 	param.function2 = speed_mpn_redc_n;
1207 	param.min_size = 16;
1208 	param.noprint = 1;
1209 	one (&redc_1_to_redc_2_threshold, rands, &param);
1210       }
1211     }
1212   print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_1_TO_REDC_2_THRESHOLD);
1213   print_define ("REDC_2_TO_REDC_N_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD);
1214 #else
1215   {
1216     static struct param_t  param;
1217     param.name = "REDC_1_TO_REDC_N_THRESHOLD";
1218     param.function = speed_mpn_redc_1;
1219     param.function2 = speed_mpn_redc_n;
1220     param.min_size = 16;
1221     one (&redc_1_to_redc_n_threshold, rands, &param);
1222   }
1223 #endif
1224 }
1225 
1226 void
tune_matrix22_mul(gmp_randstate_t rands)1227 tune_matrix22_mul (gmp_randstate_t rands)
1228 {
1229   static struct param_t  param;
1230   param.name = "MATRIX22_STRASSEN_THRESHOLD";
1231   param.function = speed_mpn_matrix22_mul;
1232   param.min_size = 2;
1233   one (&matrix22_strassen_threshold, rands, &param);
1234 }
1235 
1236 void
tune_hgcd(gmp_randstate_t rands)1237 tune_hgcd (gmp_randstate_t rands)
1238 {
1239   static struct param_t  param;
1240   param.name = "HGCD_THRESHOLD";
1241   param.function = speed_mpn_hgcd;
1242   /* We seem to get strange results for small sizes */
1243   param.min_size = 30;
1244   one (&hgcd_threshold, rands, &param);
1245 }
1246 
1247 void
tune_hgcd_appr(gmp_randstate_t rands)1248 tune_hgcd_appr (gmp_randstate_t rands)
1249 {
1250   static struct param_t  param;
1251   param.name = "HGCD_APPR_THRESHOLD";
1252   param.function = speed_mpn_hgcd_appr;
1253   /* We seem to get strange results for small sizes */
1254   param.min_size = 50;
1255   param.stop_since_change = 150;
1256   one (&hgcd_appr_threshold, rands, &param);
1257 }
1258 
1259 void
tune_hgcd_reduce(gmp_randstate_t rands)1260 tune_hgcd_reduce (gmp_randstate_t rands)
1261 {
1262   static struct param_t  param;
1263   param.name = "HGCD_REDUCE_THRESHOLD";
1264   param.function = speed_mpn_hgcd_reduce;
1265   param.min_size = 30;
1266   param.max_size = 7000;
1267   param.step_factor = 0.04;
1268   one (&hgcd_reduce_threshold, rands, &param);
1269 }
1270 
1271 void
tune_gcd_dc(gmp_randstate_t rands)1272 tune_gcd_dc (gmp_randstate_t rands)
1273 {
1274   static struct param_t  param;
1275   param.name = "GCD_DC_THRESHOLD";
1276   param.function = speed_mpn_gcd;
1277   param.min_size = hgcd_threshold;
1278   param.max_size = 3000;
1279   param.step_factor = 0.02;
1280   one (&gcd_dc_threshold, rands, &param);
1281 }
1282 
1283 void
tune_gcdext_dc(gmp_randstate_t rands)1284 tune_gcdext_dc (gmp_randstate_t rands)
1285 {
1286   static struct param_t  param;
1287   param.name = "GCDEXT_DC_THRESHOLD";
1288   param.function = speed_mpn_gcdext;
1289   param.min_size = hgcd_threshold;
1290   param.max_size = 3000;
1291   param.step_factor = 0.02;
1292   one (&gcdext_dc_threshold, rands, &param);
1293 }
1294 
1295 
1296 
1297 /* size_extra==1 reflects the fact that with high<divisor one division is
1298    always skipped.  Forcing high<divisor while testing ensures consistency
1299    while stepping through sizes, ie. that size-1 divides will be done each
1300    time.
1301 
1302    min_size==2 and min_is_always are used so that if plain division is only
1303    better at size==1 then don't bother including that code just for that
1304    case, instead go with preinv always and get a size saving.  */
1305 
1306 #define DIV_1_PARAMS                    \
1307   param.check_size = 256;               \
1308   param.min_size = 2;                   \
1309   param.min_is_always = 1;              \
1310   param.data_high = DATA_HIGH_LT_R;     \
1311   param.size_extra = 1;                 \
1312   param.stop_factor = 2.0;
1313 
1314 
1315 double (*tuned_speed_mpn_divrem_1) _PROTO ((struct speed_params *s));
1316 
1317 void
tune_divrem_1(gmp_randstate_t rands)1318 tune_divrem_1 (gmp_randstate_t rands)
1319 {
1320   /* plain version by default */
1321   tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1;
1322 
1323   /* No support for tuning native assembler code, do that by hand and put
1324      the results in the .asm file, there's no need for such thresholds to
1325      appear in gmp-mparam.h.  */
1326   if (HAVE_NATIVE_mpn_divrem_1)
1327     return;
1328 
1329   if (GMP_NAIL_BITS != 0)
1330     {
1331       print_define_remark ("DIVREM_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
1332                            "no preinv with nails");
1333       print_define_remark ("DIVREM_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
1334                            "no preinv with nails");
1335       return;
1336     }
1337 
1338   if (UDIV_PREINV_ALWAYS)
1339     {
1340       print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always");
1341       print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L);
1342       return;
1343     }
1344 
1345   tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune;
1346 
1347   /* Tune for the integer part of mpn_divrem_1.  This will very possibly be
1348      a bit out for the fractional part, but that's too bad, the integer part
1349      is more important. */
1350   {
1351     static struct param_t  param;
1352     param.name = "DIVREM_1_NORM_THRESHOLD";
1353     DIV_1_PARAMS;
1354     s.r = randlimb_norm (rands);
1355     param.function = speed_mpn_divrem_1_tune;
1356     one (&divrem_1_norm_threshold, rands, &param);
1357   }
1358   {
1359     static struct param_t  param;
1360     param.name = "DIVREM_1_UNNORM_THRESHOLD";
1361     DIV_1_PARAMS;
1362     s.r = randlimb_half (rands);
1363     param.function = speed_mpn_divrem_1_tune;
1364     one (&divrem_1_unnorm_threshold, rands, &param);
1365   }
1366 }
1367 
1368 
1369 double (*tuned_speed_mpn_mod_1) _PROTO ((struct speed_params *s));
1370 
1371 void
tune_mod_1(gmp_randstate_t rands)1372 tune_mod_1 (gmp_randstate_t rands)
1373 {
1374   /* plain version by default */
1375   tuned_speed_mpn_mod_1 = speed_mpn_mod_1;
1376 
1377   /* No support for tuning native assembler code, do that by hand and put
1378      the results in the .asm file, there's no need for such thresholds to
1379      appear in gmp-mparam.h.  */
1380   if (HAVE_NATIVE_mpn_mod_1)
1381     return;
1382 
1383   if (GMP_NAIL_BITS != 0)
1384     {
1385       print_define_remark ("MOD_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
1386                            "no preinv with nails");
1387       print_define_remark ("MOD_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
1388                            "no preinv with nails");
1389       return;
1390     }
1391 
1392   if (UDIV_PREINV_ALWAYS)
1393     {
1394       print_define ("MOD_1_NORM_THRESHOLD", 0L);
1395       print_define ("MOD_1_UNNORM_THRESHOLD", 0L);
1396       return;
1397     }
1398 
1399   tuned_speed_mpn_mod_1 = speed_mpn_mod_1_tune;
1400 
1401   {
1402     static struct param_t  param;
1403     param.name = "MOD_1_NORM_THRESHOLD";
1404     DIV_1_PARAMS;
1405     s.r = randlimb_norm (rands);
1406     param.function = speed_mpn_mod_1_tune;
1407     one (&mod_1_norm_threshold, rands, &param);
1408   }
1409   {
1410     static struct param_t  param;
1411     param.name = "MOD_1_UNNORM_THRESHOLD";
1412     DIV_1_PARAMS;
1413     s.r = randlimb_half (rands);
1414     param.function = speed_mpn_mod_1_tune;
1415     one (&mod_1_unnorm_threshold, rands, &param);
1416   }
1417 }
1418 
1419 
1420 /* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would
1421    imply that udiv_qrnnd_preinv is worth using, but it seems most
1422    straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div
1423    directly.  */
1424 
1425 void
tune_preinv_divrem_1(gmp_randstate_t rands)1426 tune_preinv_divrem_1 (gmp_randstate_t rands)
1427 {
1428   static struct param_t  param;
1429   speed_function_t  divrem_1;
1430   const char        *divrem_1_name;
1431   double            t1, t2;
1432 
1433   if (GMP_NAIL_BITS != 0)
1434     {
1435       print_define_remark ("USE_PREINV_DIVREM_1", 0, "no preinv with nails");
1436       return;
1437     }
1438 
1439   /* Any native version of mpn_preinv_divrem_1 is assumed to exist because
1440      it's faster than mpn_divrem_1.  */
1441   if (HAVE_NATIVE_mpn_preinv_divrem_1)
1442     {
1443       print_define_remark ("USE_PREINV_DIVREM_1", 1, "native");
1444       return;
1445     }
1446 
1447   /* If udiv_qrnnd_preinv is the only division method then of course
1448      mpn_preinv_divrem_1 should be used.  */
1449   if (UDIV_PREINV_ALWAYS)
1450     {
1451       print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always");
1452       return;
1453     }
1454 
1455   /* If we've got an assembler version of mpn_divrem_1, then compare against
1456      that, not the mpn_divrem_1_div generic C.  */
1457   if (HAVE_NATIVE_mpn_divrem_1)
1458     {
1459       divrem_1 = speed_mpn_divrem_1;
1460       divrem_1_name = "mpn_divrem_1";
1461     }
1462   else
1463     {
1464       divrem_1 = speed_mpn_divrem_1_div;
1465       divrem_1_name = "mpn_divrem_1_div";
1466     }
1467 
1468   param.data_high = DATA_HIGH_LT_R; /* allow skip one division */
1469   s.size = 200;                     /* generous but not too big */
1470   /* Divisor, nonzero.  Unnormalized so as to exercise the shift!=0 case,
1471      since in general that's probably most common, though in fact for a
1472      64-bit limb mp_bases[10].big_base is normalized.  */
1473   s.r = urandom(rands) & (GMP_NUMB_MASK >> 4);
1474   if (s.r == 0) s.r = 123;
1475 
1476   t1 = tuneup_measure (speed_mpn_preinv_divrem_1, rands, &param, &s);
1477   t2 = tuneup_measure (divrem_1, rands, &param, &s);
1478   if (t1 == -1.0 || t2 == -1.0)
1479     {
1480       printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n",
1481               divrem_1_name, (long) s.size);
1482       abort ();
1483     }
1484   if (option_trace >= 1)
1485     printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n",
1486             (long) s.size, t1, divrem_1_name, t2);
1487 
1488   print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL);
1489 }
1490 
1491 
1492 /* A non-zero MOD_1_UNNORM_THRESHOLD (or MOD_1_NORM_THRESHOLD) would imply
1493    that udiv_qrnnd_preinv is worth using, but it seems most straightforward
1494    to compare mpn_preinv_mod_1 and mpn_mod_1_div directly.  */
1495 
1496 void
tune_preinv_mod_1(gmp_randstate_t rands)1497 tune_preinv_mod_1 (gmp_randstate_t rands)
1498 {
1499   static struct param_t  param;
1500   speed_function_t  mod_1;
1501   const char        *mod_1_name;
1502   double            t1, t2;
1503 
1504   /* Any native version of mpn_preinv_mod_1 is assumed to exist because it's
1505      faster than mpn_mod_1.  */
1506   if (HAVE_NATIVE_mpn_preinv_mod_1)
1507     {
1508       print_define_remark ("USE_PREINV_MOD_1", 1, "native");
1509       return;
1510     }
1511 
1512   if (GMP_NAIL_BITS != 0)
1513     {
1514       print_define_remark ("USE_PREINV_MOD_1", 0, "no preinv with nails");
1515       return;
1516     }
1517 
1518   /* If udiv_qrnnd_preinv is the only division method then of course
1519      mpn_preinv_mod_1 should be used.  */
1520   if (UDIV_PREINV_ALWAYS)
1521     {
1522       print_define_remark ("USE_PREINV_MOD_1", 1, "preinv always");
1523       return;
1524     }
1525 
1526   /* If we've got an assembler version of mpn_mod_1, then compare against
1527      that, not the mpn_mod_1_div generic C.  */
1528   if (HAVE_NATIVE_mpn_mod_1)
1529     {
1530       mod_1 = speed_mpn_mod_1;
1531       mod_1_name = "mpn_mod_1";
1532     }
1533   else
1534     {
1535       mod_1 = speed_mpn_mod_1_div;
1536       mod_1_name = "mpn_mod_1_div";
1537     }
1538 
1539   param.data_high = DATA_HIGH_LT_R; /* let mpn_mod_1 skip one division */
1540   s.size = 200;                     /* generous but not too big */
1541   s.r = randlimb_norm(rands);            /* divisor */
1542 
1543   t1 = tuneup_measure (speed_mpn_preinv_mod_1, rands, &param, &s);
1544   t2 = tuneup_measure (mod_1, rands, &param, &s);
1545   if (t1 == -1.0 || t2 == -1.0)
1546     {
1547       printf ("Oops, can't measure mpn_preinv_mod_1 and %s at %ld\n",
1548               mod_1_name, (long) s.size);
1549       abort ();
1550     }
1551   if (option_trace >= 1)
1552     printf ("size=%ld, mpn_preinv_mod_1 %.9f, %s %.9f\n",
1553             (long) s.size, t1, mod_1_name, t2);
1554 
1555   print_define_remark ("USE_PREINV_MOD_1", (mp_size_t) (t1 < t2), NULL);
1556 }
1557 
1558 
1559 void
tune_divrem_2(gmp_randstate_t rands)1560 tune_divrem_2 (gmp_randstate_t rands)
1561 {
1562   static struct param_t  param;
1563 
1564   /* No support for tuning native assembler code, do that by hand and put
1565      the results in the .asm file, and there's no need for such thresholds
1566      to appear in gmp-mparam.h.  */
1567   if (HAVE_NATIVE_mpn_divrem_2)
1568     return;
1569 
1570   if (GMP_NAIL_BITS != 0)
1571     {
1572       print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX,
1573                            "no preinv with nails");
1574       return;
1575     }
1576 
1577   if (UDIV_PREINV_ALWAYS)
1578     {
1579       print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always");
1580       return;
1581     }
1582 
1583   /* Tune for the integer part of mpn_divrem_2.  This will very possibly be
1584      a bit out for the fractional part, but that's too bad, the integer part
1585      is more important.
1586 
1587      min_size must be >=2 since nsize>=2 is required, but is set to 4 to save
1588      code space if plain division is better only at size==2 or size==3. */
1589   param.name = "DIVREM_2_THRESHOLD";
1590   param.check_size = 256;
1591   param.min_size = 4;
1592   param.min_is_always = 1;
1593   param.size_extra = 2;      /* does qsize==nsize-2 divisions */
1594   param.stop_factor = 2.0;
1595 
1596   s.r = randlimb_norm (rands);
1597   param.function = speed_mpn_divrem_2;
1598   one (&divrem_2_threshold, rands, &param);
1599 }
1600 
1601 
1602 /* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so
1603    tune for that.  Its speed can differ on odd or even divisor, so take an
1604    average threshold for the two.
1605 
1606    mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1
1607    might not vary that way, but don't test this since high<divisor isn't
1608    expected to occur often with small divisors.  */
1609 
1610 void
tune_divexact_1(gmp_randstate_t rands)1611 tune_divexact_1 (gmp_randstate_t rands)
1612 {
1613   static struct param_t  param;
1614   mp_size_t  thresh[2], average;
1615   int        low, i;
1616 
1617   /* Any native mpn_divexact_1 is assumed to incorporate all the speed of a
1618      full mpn_divrem_1.  */
1619   if (HAVE_NATIVE_mpn_divexact_1)
1620     {
1621       print_define_remark ("DIVEXACT_1_THRESHOLD", 0, "always (native)");
1622       return;
1623     }
1624 
1625   ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL);
1626 
1627   param.name = "DIVEXACT_1_THRESHOLD";
1628   param.data_high = DATA_HIGH_GE_R;
1629   param.check_size = 256;
1630   param.min_size = 2;
1631   param.stop_factor = 1.5;
1632   param.function  = tuned_speed_mpn_divrem_1;
1633   param.function2 = speed_mpn_divexact_1;
1634   param.noprint = 1;
1635 
1636   print_define_start (param.name);
1637 
1638   for (low = 0; low <= 1; low++)
1639     {
1640       s.r = randlimb_half(rands);
1641       if (low == 0)
1642         s.r |= 1;
1643       else
1644         s.r &= ~CNST_LIMB(7);
1645 
1646       one (&thresh[low], rands, &param);
1647       if (option_trace)
1648         printf ("low=%d thresh %ld\n", low, (long) thresh[low]);
1649 
1650       if (thresh[low] == MP_SIZE_T_MAX)
1651         {
1652           average = MP_SIZE_T_MAX;
1653           goto divexact_1_done;
1654         }
1655     }
1656 
1657   if (option_trace)
1658     {
1659       printf ("average of:");
1660       for (i = 0; i < numberof(thresh); i++)
1661         printf (" %ld", (long) thresh[i]);
1662       printf ("\n");
1663     }
1664 
1665   average = 0;
1666   for (i = 0; i < numberof(thresh); i++)
1667     average += thresh[i];
1668   average /= numberof(thresh);
1669 
1670   /* If divexact turns out to be better as early as 3 limbs, then use it
1671      always, so as to reduce code size and conditional jumps.  */
1672   if (average <= 3)
1673     average = 0;
1674 
1675  divexact_1_done:
1676   print_define_end (param.name, average);
1677 }
1678 
1679 
1680 /* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the
1681    same as mpn_mod_1, but this might not be true of an assembler
1682    implementation.  The threshold used is an average based on data where a
1683    divide can be skipped and where it can't.
1684 
1685    If modexact turns out to be better as early as 3 limbs, then use it
1686    always, so as to reduce code size and conditional jumps.  */
1687 
1688 void
tune_modexact_1_odd(gmp_randstate_t rands)1689 tune_modexact_1_odd (gmp_randstate_t rands)
1690 {
1691   static struct param_t  param;
1692   mp_size_t  thresh_lt, thresh_ge, average;
1693 
1694   /* Any native mpn_modexact_1_odd is assumed to incorporate all the speed
1695      of a full mpn_mod_1.  */
1696   if (HAVE_NATIVE_mpn_modexact_1_odd)
1697     {
1698       print_define_remark ("MODEXACT_1_ODD_THRESHOLD", 0, "always (native)");
1699       return;
1700     }
1701 
1702   ASSERT_ALWAYS (tuned_speed_mpn_mod_1 != NULL);
1703 
1704   param.name = "MODEXACT_1_ODD_THRESHOLD";
1705   param.check_size = 256;
1706   param.min_size = 2;
1707   param.stop_factor = 1.5;
1708   param.function  = tuned_speed_mpn_mod_1;
1709   param.function2 = speed_mpn_modexact_1c_odd;
1710   param.noprint = 1;
1711   s.r = randlimb_half (rands) | 1;
1712 
1713   print_define_start (param.name);
1714 
1715   param.data_high = DATA_HIGH_LT_R;
1716   one (&thresh_lt, rands, &param);
1717   if (option_trace)
1718     printf ("lt thresh %ld\n", (long) thresh_lt);
1719 
1720   average = thresh_lt;
1721   if (thresh_lt != MP_SIZE_T_MAX)
1722     {
1723       param.data_high = DATA_HIGH_GE_R;
1724       one (&thresh_ge, rands, &param);
1725       if (option_trace)
1726         printf ("ge thresh %ld\n", (long) thresh_ge);
1727 
1728       if (thresh_ge != MP_SIZE_T_MAX)
1729         {
1730           average = (thresh_ge + thresh_lt) / 2;
1731           if (thresh_ge <= 3)
1732             average = 0;
1733         }
1734     }
1735 
1736   print_define_end (param.name, average);
1737 }
1738 
1739 
1740 void
tune_jacobi_base(gmp_randstate_t rands)1741 tune_jacobi_base (gmp_randstate_t rands)
1742 {
1743   static struct param_t  param;
1744   double   t1, t2, t3, t4;
1745   int      method;
1746 
1747   s.size = GMP_LIMB_BITS * 3 / 4;
1748 
1749   t1 = tuneup_measure (speed_mpn_jacobi_base_1, rands, &param, &s);
1750   if (option_trace >= 1)
1751     printf ("size=%ld, mpn_jacobi_base_1 %.9f\n", (long) s.size, t1);
1752 
1753   t2 = tuneup_measure (speed_mpn_jacobi_base_2, rands, &param, &s);
1754   if (option_trace >= 1)
1755     printf ("size=%ld, mpn_jacobi_base_2 %.9f\n", (long) s.size, t2);
1756 
1757   t3 = tuneup_measure (speed_mpn_jacobi_base_3, rands, &param, &s);
1758   if (option_trace >= 1)
1759     printf ("size=%ld, mpn_jacobi_base_3 %.9f\n", (long) s.size, t3);
1760 
1761   t4 = tuneup_measure (speed_mpn_jacobi_base_4, rands, &param, &s);
1762   if (option_trace >= 1)
1763     printf ("size=%ld, mpn_jacobi_base_4 %.9f\n", (long) s.size, t4);
1764 
1765   if (t1 == -1.0 || t2 == -1.0 || t3 == -1.0 || t4 == -1.0)
1766     {
1767       printf ("Oops, can't measure all mpn_jacobi_base methods at %ld\n",
1768               (long) s.size);
1769       abort ();
1770     }
1771 
1772   if (t1 < t2 && t1 < t3 && t1 < t4)
1773     method = 1;
1774   else if (t2 < t3 && t2 < t4)
1775     method = 2;
1776   else if (t3 < t4)
1777     method = 3;
1778   else
1779     method = 4;
1780 
1781   print_define ("JACOBI_BASE_METHOD", method);
1782 }
1783 
1784 
1785 
1786 void
tune_get_str(gmp_randstate_t rands)1787 tune_get_str (gmp_randstate_t rands)
1788 {
1789   /* Tune for decimal, it being most common.  Some rough testing suggests
1790      other bases are different, but not by very much.  */
1791   s.r = 10;
1792   {
1793     static struct param_t  param;
1794     GET_STR_PRECOMPUTE_THRESHOLD = 0;
1795     param.name = "GET_STR_DC_THRESHOLD";
1796     param.function = speed_mpn_get_str;
1797     param.min_size = 4;
1798     param.max_size = GET_STR_THRESHOLD_LIMIT;
1799     one (&get_str_dc_threshold, rands, &param);
1800   }
1801   {
1802     static struct param_t  param;
1803     param.name = "GET_STR_PRECOMPUTE_THRESHOLD";
1804     param.function = speed_mpn_get_str;
1805     param.min_size = GET_STR_DC_THRESHOLD;
1806     param.max_size = GET_STR_THRESHOLD_LIMIT;
1807     one (&get_str_precompute_threshold, rands, &param);
1808   }
1809 }
1810 
1811 double
speed_mpn_pre_set_str(struct speed_params * s)1812 speed_mpn_pre_set_str (struct speed_params *s)
1813 {
1814   unsigned char *str;
1815   mp_ptr     wp;
1816   mp_size_t  wn;
1817   unsigned   i;
1818   int        base;
1819   double     t;
1820   mp_ptr powtab_mem, tp;
1821   powers_t powtab[GMP_LIMB_BITS];
1822   mp_size_t un;
1823   int chars_per_limb;
1824   TMP_DECL;
1825 
1826   SPEED_RESTRICT_COND (s->size >= 1);
1827 
1828   base = s->r == 0 ? 10 : s->r;
1829   SPEED_RESTRICT_COND (base >= 2 && base <= 256);
1830 
1831   TMP_MARK;
1832 
1833   str = TMP_ALLOC (s->size);
1834   for (i = 0; i < s->size; i++)
1835     str[i] = s->xp[i] % base;
1836 
1837   wn = ((mp_size_t) (s->size / mp_bases[base].chars_per_bit_exactly))
1838     / GMP_LIMB_BITS + 2;
1839   SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);
1840 
1841   /* use this during development to check wn is big enough */
1842   /*
1843   ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn);
1844   */
1845 
1846   speed_operand_src (s, (mp_ptr) str, s->size/BYTES_PER_MP_LIMB);
1847   speed_operand_dst (s, wp, wn);
1848   speed_cache_fill (s);
1849 
1850   chars_per_limb = mp_bases[base].chars_per_limb;
1851   un = s->size / chars_per_limb + 1;
1852   powtab_mem = TMP_BALLOC_LIMBS (mpn_dc_set_str_powtab_alloc (un));
1853   mpn_set_str_compute_powtab (powtab, powtab_mem, un, base);
1854   tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un));
1855 
1856   speed_starttime ();
1857   i = s->reps;
1858   do
1859     {
1860       mpn_pre_set_str (wp, str, s->size, powtab, tp);
1861     }
1862   while (--i != 0);
1863   t = speed_endtime ();
1864 
1865   TMP_FREE;
1866   return t;
1867 }
1868 
1869 void
tune_set_str(gmp_randstate_t rands)1870 tune_set_str (gmp_randstate_t rands)
1871 {
1872   s.r = 10;  /* decimal */
1873   {
1874     static struct param_t  param;
1875     SET_STR_PRECOMPUTE_THRESHOLD = 0;
1876     param.step_factor = 0.01;
1877     param.name = "SET_STR_DC_THRESHOLD";
1878     param.function = speed_mpn_pre_set_str;
1879     param.min_size = 100;
1880     param.max_size = 50000;
1881     one (&set_str_dc_threshold, rands, &param);
1882   }
1883   {
1884     static struct param_t  param;
1885     param.step_factor = 0.02;
1886     param.name = "SET_STR_PRECOMPUTE_THRESHOLD";
1887     param.function = speed_mpn_set_str;
1888     param.min_size = SET_STR_DC_THRESHOLD;
1889     param.max_size = 100000;
1890     one (&set_str_precompute_threshold, rands, &param);
1891   }
1892 }
1893 
1894 void
tune_fft(gmp_randstate_t state)1895 tune_fft(gmp_randstate_t state)
1896 {
1897     mp_bitcnt_t depth, w, depth1, w1;
1898     clock_t start, end;
1899     double elapsed;
1900     double best = 0.0;
1901     mp_size_t best_off, off, best_d, best_w, num_twos, num_printed;
1902 
1903     if (option_fft_max_size == 0)
1904        return;
1905 
1906     printf("/* fft_tuning -- autogenerated by tune-fft */\n\n");
1907     printf("#define FFT_TAB \\\n");
1908     fflush(stdout);
1909 
1910     printf("   { "); fflush(stdout);
1911     for (depth = 6; depth <= 10; depth++)
1912     {
1913         printf("{ "); fflush(stdout);
1914         for (w = 1; w <= 2; w++)
1915         {
1916             int iters = 100*((mp_size_t) 1 << (3*(10 - depth)/2)), i;
1917 
1918             mp_size_t n = ((mp_limb_t)1<<depth);
1919             mp_bitcnt_t bits1 = (n*w - (depth + 1))/2;
1920             mp_size_t len1 = 2*n;
1921             mp_size_t len2 = 2*n;
1922 
1923             mp_bitcnt_t b1 = len1*bits1, b2 = len2*bits1;
1924             mp_size_t n1, n2;
1925             mp_size_t j;
1926             mp_limb_t * i1, *i2, *r1;
1927 
1928             n1 = (b1 - 1)/GMP_LIMB_BITS + 1;
1929             n2 = (b2 - 1)/GMP_LIMB_BITS + 1;
1930 
1931             i1 = malloc(2*(n1 + n2)*sizeof(mp_limb_t));
1932             i2 = i1 + n1;
1933             r1 = i2 + n2;
1934 
1935             mpn_urandomb(i1, state, b1);
1936             mpn_urandomb(i2, state, b2);
1937 
1938             best_off = -1;
1939 
1940             for (off = 0; off <= 4; off++)
1941             {
1942                start = clock();
1943                for (i = 0; i < iters; i++)
1944                   mpn_mul_trunc_sqrt2(r1, i1, n1, i2, n2, depth - off, w*((mp_size_t)1 << (off*2)));
1945                end = clock();
1946 
1947                elapsed = ((double) (end - start)) / CLOCKS_PER_SEC;
1948 
1949                if (elapsed < best || best_off == -1)
1950                {
1951                   best_off = off;
1952                   best = elapsed;
1953                }
1954             }
1955 
1956             printf("%ld", best_off);
1957             if (w != 2) printf(",");
1958             printf(" "); fflush(stdout);
1959 
1960             free(i1);
1961         }
1962         printf("}");
1963         if (depth != 10) printf(",");
1964         printf(" "); fflush(stdout);
1965     }
1966 
1967     printf("}\n\n");
1968 
1969     best_d = 12;
1970     best_w = 1;
1971     best_off = -1;
1972     num_printed = 0;
1973     num_twos = 0;
1974 
1975     printf("#define MULMOD_TAB \\\n");
1976     fflush(stdout);
1977     printf("   { "); fflush(stdout);
1978     for (depth = 12; best_off != 1 && !(num_printed >= 25 && best_off == 2 && num_twos >= 5) ; depth++)
1979     {
1980         for (w = 1; w <= 2; w++)
1981         {
1982             int iters = 100*((mp_size_t) 1 << (3*(18 - depth)/2)), i;
1983             mp_size_t n = ((mp_limb_t)1<<depth);
1984             mp_bitcnt_t bits = n*w;
1985             mp_size_t int_limbs = (bits - 1)/GMP_LIMB_BITS + 1;
1986             mp_size_t j;
1987             mp_limb_t c, * i1, * i2, * r1, * tt;
1988 
1989             if (depth <= 21) iters = 32*((mp_size_t) 1 << (21 - depth));
1990             else iters = MAX(32/((mp_size_t) 1 << (depth - 21)), 1);
1991 
1992             i1 = malloc(6*(int_limbs+1)*sizeof(mp_limb_t));
1993             i2 = i1 + int_limbs + 1;
1994             r1 = i2 + int_limbs + 1;
1995             tt = r1 + 2*(int_limbs + 1);
1996 
1997             mpn_urandomb(i1, state, int_limbs*GMP_LIMB_BITS);
1998             mpn_urandomb(i2, state, int_limbs*GMP_LIMB_BITS);
1999             i1[int_limbs] = 0;
2000             i2[int_limbs] = 0;
2001 
2002             depth1 = 1;
2003             while ((((mp_limb_t)1)<<depth1) < bits) depth1++;
2004             depth1 = depth1/2;
2005 
2006             w1 = bits/(((mp_limb_t)1)<<(2*depth1));
2007 
2008             best_off = -1;
2009 
2010             for (off = 0; off <= 4; off++)
2011             {
2012                start = clock();
2013                for (i = 0; i < iters; i++)
2014                   mpir_fft_mulmod_2expp1(r1, i1, i2, int_limbs, depth1 - off, w1*((mp_size_t)1 << (off*2)));
2015                end = clock();
2016 
2017                elapsed = ((double) (end - start)) / CLOCKS_PER_SEC;
2018 
2019                if (best_off == -1 || elapsed < best)
2020                {
2021                   best_off = off;
2022                   best = elapsed;
2023                }
2024             }
2025 
2026             start = clock();
2027             for (i = 0; i < iters; i++)
2028                 mpn_mulmod_2expp1_basecase(r1, i1, i2, 0, bits, tt);
2029             end = clock();
2030 
2031             elapsed = ((double) (end - start)) / CLOCKS_PER_SEC;
2032             if (elapsed < best)
2033             {
2034                 best_d = depth + (w == 2);
2035                 best_w = w + 1 - 2*(w == 2);
2036             }
2037 
2038             printf("%ld", best_off);
2039             if (best_off == 2)
2040                num_twos++;
2041             else
2042                num_twos = 0;
2043             num_printed++;
2044             if (w != 2) printf(", "); fflush(stdout);
2045 
2046             free(i1);
2047         }
2048         printf(", "); fflush(stdout);
2049     }
2050     if (best_off == 2)
2051     {
2052        printf("2, 2, 2, 2, 2, 1, 1 }\n\n");
2053        num_printed += 6;
2054     } else
2055        printf("1 }\n\n");
2056 
2057     printf("#define FFT_N_NUM %ld\n\n", num_printed + 1);
2058 
2059     printf("#define FFT_MULMOD_2EXPP1_CUTOFF %ld\n\n", ((mp_limb_t) 1 << best_d)*best_w/(2*GMP_LIMB_BITS));
2060 }
2061 
2062 void
tune_fac_ui(gmp_randstate_t rands)2063 tune_fac_ui (gmp_randstate_t rands)
2064 {
2065   static struct param_t  param;
2066 
2067   param.function = speed_mpz_fac_ui_tune;
2068 
2069   param.name = "FAC_DSC_THRESHOLD";
2070   param.min_size = 70;
2071   param.max_size = FAC_DSC_THRESHOLD_LIMIT;
2072   one (&fac_dsc_threshold, rands, &param);
2073 
2074   param.name = "FAC_ODD_THRESHOLD";
2075   param.min_size = 22;
2076   param.stop_factor = 1.7;
2077   param.min_is_always = 1;
2078   one (&fac_odd_threshold, rands, &param);
2079 }
2080 
2081 void
tune_fft_mul(gmp_randstate_t rands)2082 tune_fft_mul (gmp_randstate_t rands)
2083 {
2084   static struct fft_param_t  param;
2085 
2086   if (option_fft_max_size == 0)
2087     return;
2088 
2089   param.threshold_name      = "MUL_FFT_FULL_THRESHOLD";
2090   param.p_threshold         = &mul_fft_full_threshold;
2091   param.first_size          = MUL_TOOM8H_THRESHOLD / 2;
2092   param.max_size            = option_fft_max_size;
2093   param.function            = speed_mpn_mul_fft_main;
2094   param.mul_function        = speed_mpn_mul_n;
2095   param.sqr = 0;
2096   fft (&param,rands);
2097 }
2098 
2099 
2100 void
tune_fft_sqr(gmp_randstate_t rands)2101 tune_fft_sqr (gmp_randstate_t rands)
2102 {
2103   static struct fft_param_t  param;
2104 
2105   if (option_fft_max_size == 0)
2106     return;
2107 
2108   param.threshold_name      = "SQR_FFT_FULL_THRESHOLD";
2109   param.p_threshold         = &sqr_fft_full_threshold;
2110   param.first_size          = SQR_TOOM8_THRESHOLD / 2;
2111   param.max_size            = option_fft_max_size;
2112   param.function            = speed_mpn_sqr_fft_main;
2113   param.mul_function        = speed_mpn_sqr;
2114   param.sqr = 0;
2115   fft (&param,rands);
2116 }
2117 
2118 #ifdef _MSC_VER
2119 #define GMP_MPARAM_H_SUGGEST "vc_gmp_mparam.h"
2120 #endif
2121 
2122 void
all(gmp_randstate_t rands)2123 all (gmp_randstate_t rands)
2124 {
2125   time_t  start_time, end_time;
2126   TMP_DECL;
2127 
2128   TMP_MARK;
2129   SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0);
2130   SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0);
2131 
2132   mpn_randomb (s.xp_block, rands, SPEED_BLOCK_SIZE);
2133   mpn_randomb (s.yp_block, rands, SPEED_BLOCK_SIZE);
2134 
2135   fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST);
2136 
2137   speed_time_init ();
2138   fprintf (stderr, "Using: %s\n", speed_time_string);
2139 
2140   fprintf (stderr, "speed_precision %d", speed_precision);
2141   if (speed_unittime == 1.0)
2142     fprintf (stderr, ", speed_unittime 1 cycle");
2143   else
2144     fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime);
2145   if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
2146     fprintf (stderr, ", CPU freq unknown\n");
2147   else
2148     fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
2149 
2150   fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n",
2151            DEFAULT_MAX_SIZE, (long) option_fft_max_size);
2152   fprintf (stderr, "\n");
2153 
2154   time (&start_time);
2155   {
2156     struct tm  *tp;
2157     tp = localtime (&start_time);
2158     printf ("/* Generated by tuneup.c, %d-%02d-%02d, ",
2159             tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
2160 
2161 #ifdef __GNUC__
2162     /* gcc sub-minor version doesn't seem to come through as a define */
2163     printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__);
2164 #define PRINTED_COMPILER
2165 #endif
2166 #if defined (__SUNPRO_C)
2167     printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100);
2168 #define PRINTED_COMPILER
2169 #endif
2170 #if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION)
2171     /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */
2172     printf ("MIPSpro C %d.%d.%d */\n",
2173 	    _COMPILER_VERSION / 100,
2174 	    _COMPILER_VERSION / 10 % 10,
2175 	    _COMPILER_VERSION % 10);
2176 #define PRINTED_COMPILER
2177 #endif
2178 #if defined (__DECC) && defined (__DECC_VER)
2179     printf ("DEC C %d */\n", __DECC_VER);
2180 #define PRINTED_COMPILER
2181 #endif
2182 #if ! defined (PRINTED_COMPILER)
2183     printf ("system compiler */\n");
2184 #endif
2185   }
2186   printf ("\n");
2187 
2188   tune_mul (rands);
2189   printf("\n");
2190 
2191   tune_sqr (rands);
2192   printf("\n");
2193 
2194   tune_divrem_1 (rands);
2195   tune_mod_1 (rands);
2196   tune_preinv_divrem_1 (rands);
2197   tune_preinv_mod_1 (rands);
2198   tune_divrem_2 (rands);
2199   tune_divexact_1 (rands);
2200   tune_modexact_1_odd (rands);
2201   tune_mod_1_k(rands);
2202   tune_divrem_hensel_qr_1(rands);
2203   tune_rsh_divrem_hensel_qr_1(rands);
2204   tune_divrem_euclid_hensel(rands);
2205   printf("\n");
2206 
2207   tune_fft_mul (rands);
2208   printf("\n");
2209 
2210   tune_fft_sqr (rands);
2211   printf ("\n");
2212 
2213   tune_mullow (rands);
2214   printf("\n");
2215   tune_mulmid (rands);
2216   printf("\n");
2217   tune_mulhigh (rands);
2218   printf("\n");
2219 
2220   tune_mulmod_2expm1(rands);
2221   printf("\n");
2222 
2223   /* dc_div_qr_n, dc_divappr_q, inv_div_qr, inv_divappr_q */
2224   tune_dc_div (rands);
2225 
2226   /* mpn_tdiv_q : balanced */
2227   tune_tdiv_q (rands);
2228 
2229   /* dc_bdiv_qr_n, dc_bdiv_q */
2230   tune_dc_bdiv (rands);
2231   printf("\n");
2232 
2233   tune_binvert (rands);
2234   tune_redc (rands);
2235   printf("\n");
2236 
2237   tune_rootrem(rands);
2238   printf("\n");
2239 
2240   tune_matrix22_mul (rands);
2241   tune_hgcd (rands);
2242   tune_hgcd_appr (rands);
2243   tune_hgcd_reduce(rands);
2244   tune_gcd_dc (rands);
2245   tune_gcdext_dc (rands);
2246   tune_jacobi_base (rands);
2247   printf("\n");
2248 
2249   tune_get_str (rands);
2250   tune_set_str (rands);
2251   printf("\n");
2252 
2253   tune_fac_ui (rands);
2254   printf("\n");
2255 
2256   tune_fft (rands);
2257   printf("\n");
2258 
2259   time (&end_time);
2260   printf ("/* Tuneup completed successfully, took %ld seconds */\n",
2261           end_time - start_time);
2262 
2263   TMP_FREE;
2264 }
2265 
2266 
2267 int
main(int argc,char * argv[])2268 main (int argc, char *argv[])
2269 {
2270   int  opt;
2271   gmp_randstate_t rands;
2272 
2273   gmp_randinit_default(rands);
2274   /* Unbuffered so if output is redirected to a file it isn't lost if the
2275      program is killed part way through.  */
2276   setbuf (stdout, NULL);
2277   setbuf (stderr, NULL);
2278 
2279   while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF)
2280     {
2281       switch (opt) {
2282       case 'f':
2283         if (optarg[0] == 't')
2284           option_fft_trace = 2;
2285         else
2286           option_fft_max_size = atol (optarg);
2287         break;
2288       case 'o':
2289         speed_option_set (optarg);
2290         break;
2291       case 'p':
2292         speed_precision = atoi (optarg);
2293         break;
2294       case 't':
2295         option_trace++;
2296         break;
2297       case '?':
2298         exit(1);
2299       }
2300     }
2301 
2302   all (rands);
2303   exit (0);
2304 }
2305