1 /* Create tuned thresholds for various algorithms.
2
3 Copyright 1999-2003, 2005, 2006, 2008-2017 Free Software Foundation, Inc.
4
5 This file is part of the GNU MP Library.
6
7 The GNU MP Library is free software; you can redistribute it and/or modify
8 it under the terms of either:
9
10 * the GNU Lesser General Public License as published by the Free
11 Software Foundation; either version 3 of the License, or (at your
12 option) any later version.
13
14 or
15
16 * the GNU General Public License as published by the Free Software
17 Foundation; either version 2 of the License, or (at your option) any
18 later version.
19
20 or both in parallel, as here.
21
22 The GNU MP Library is distributed in the hope that it will be useful, but
23 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 for more details.
26
27 You should have received copies of the GNU General Public License and the
28 GNU Lesser General Public License along with the GNU MP Library. If not,
29 see https://www.gnu.org/licenses/. */
30
31
32 /* Usage: tuneup [-t] [-t] [-p precision]
33
34 -t turns on some diagnostic traces, a second -t turns on more traces.
35
36 Notes:
37
38 The code here isn't a vision of loveliness, mainly because it's subject
39 to ongoing changes according to new things wanting to be tuned, and
40 practical requirements of systems tested.
41
42 Sometimes running the program twice produces slightly different results.
43 This is probably because there's so little separating algorithms near
44 their crossover, and on that basis it should make little or no difference
45 to the final speed of the relevant routines, but nothing has been done to
46 check that carefully.
47
48 Algorithm:
49
50 The thresholds are determined as follows. A crossover may not be a
51 single size but rather a range where it oscillates between method A or
52 method B faster. If the threshold is set making B used where A is faster
53 (or vice versa) that's bad. Badness is the percentage time lost and
54 total badness is the sum of this over all sizes measured. The threshold
55 is set to minimize total badness.
56
57 Suppose, as sizes increase, method B becomes faster than method A. The
58 effect of the rule is that, as you look at increasing sizes, isolated
59 points where B is faster are ignored, but when it's consistently faster,
60 or faster on balance, then the threshold is set there. The same result
61 is obtained thinking in the other direction of A becoming faster at
62 smaller sizes.
63
64 In practice the thresholds tend to be chosen to bring on the next
65 algorithm fairly quickly.
66
67 This rule is attractive because it's got a basis in reason and is fairly
68 easy to implement, but no work has been done to actually compare it in
69 absolute terms to other possibilities.
70
71 Implementation:
72
73 In a normal library build the thresholds are constants. To tune them
74 selected objects are recompiled with the thresholds as global variables
75 instead. #define TUNE_PROGRAM_BUILD does this, with help from code at
76 the end of gmp-impl.h, and rules in tune/Makefile.am.
77
78 MUL_TOOM22_THRESHOLD for example uses a recompiled mpn_mul_n. The
79 threshold is set to "size+1" to avoid karatsuba, or to "size" to use one
80 level, but recurse into the basecase.
81
82 MUL_TOOM33_THRESHOLD makes use of the tuned MUL_TOOM22_THRESHOLD value.
83 Other routines in turn will make use of both of those. Naturally the
84 dependants must be tuned first.
85
86 In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling,
87 just a threshold based on comparing two routines (mpn_divrem_1 and
88 mpn_divexact_1), and no further use of the value determined.
89
90 Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being
91 just comparisons between certain routines on representative data.
92
93 Shortcuts are applied when native (assembler) versions of routines exist.
94 For instance a native mpn_sqr_basecase is assumed to be always faster
95 than mpn_mul_basecase, with no measuring.
96
97 No attempt is made to tune within assembler routines, for instance
98 DIVREM_1_NORM_THRESHOLD. An assembler mpn_divrem_1 is expected to be
99 written and tuned all by hand. Assembler routines that might have hard
100 limits are recompiled though, to make them accept a bigger range of sizes
101 than normal, eg. mpn_sqr_basecase to compare against mpn_toom2_sqr.
102
103 Limitations:
104
105 The FFTs aren't subject to the same badness rule as the other thresholds,
106 so each k is probably being brought on a touch early. This isn't likely
107 to make a difference, and the simpler probing means fewer tests.
108
109 */
110
111 #define TUNE_PROGRAM_BUILD 1 /* for gmp-impl.h */
112
113 #include "config.h"
114
115 #include <math.h>
116 #include <stdio.h>
117 #include <stdlib.h>
118 #include <time.h>
119 #if HAVE_UNISTD_H
120 #include <unistd.h>
121 #endif
122
123 #include "gmp-impl.h"
124 #include "longlong.h"
125
126 #include "tests.h"
127 #include "speed.h"
128
129 #if !HAVE_DECL_OPTARG
130 extern char *optarg;
131 extern int optind, opterr;
132 #endif
133
134
135 #define DEFAULT_MAX_SIZE 1000 /* limbs */
136
137 #if WANT_FFT
138 mp_size_t option_fft_max_size = 50000; /* limbs */
139 #else
140 mp_size_t option_fft_max_size = 0;
141 #endif
142 int option_trace = 0;
143 int option_fft_trace = 0;
144 struct speed_params s;
145
146 struct dat_t {
147 mp_size_t size;
148 double d;
149 } *dat = NULL;
150 int ndat = 0;
151 int allocdat = 0;
152
153 /* This is not defined if mpn_sqr_basecase doesn't declare a limit. In that
154 case use zero here, which for params.max_size means no limit. */
155 #ifndef TUNE_SQR_TOOM2_MAX
156 #define TUNE_SQR_TOOM2_MAX 0
157 #endif
158
159 mp_size_t mul_toom22_threshold = MP_SIZE_T_MAX;
160 mp_size_t mul_toom33_threshold = MUL_TOOM33_THRESHOLD_LIMIT;
161 mp_size_t mul_toom44_threshold = MUL_TOOM44_THRESHOLD_LIMIT;
162 mp_size_t mul_toom6h_threshold = MUL_TOOM6H_THRESHOLD_LIMIT;
163 mp_size_t mul_toom8h_threshold = MUL_TOOM8H_THRESHOLD_LIMIT;
164 mp_size_t mul_toom32_to_toom43_threshold = MP_SIZE_T_MAX;
165 mp_size_t mul_toom32_to_toom53_threshold = MP_SIZE_T_MAX;
166 mp_size_t mul_toom42_to_toom53_threshold = MP_SIZE_T_MAX;
167 mp_size_t mul_toom42_to_toom63_threshold = MP_SIZE_T_MAX;
168 mp_size_t mul_toom43_to_toom54_threshold = MP_SIZE_T_MAX;
169 mp_size_t mul_fft_threshold = MP_SIZE_T_MAX;
170 mp_size_t mul_fft_modf_threshold = MP_SIZE_T_MAX;
171 mp_size_t sqr_basecase_threshold = MP_SIZE_T_MAX;
172 mp_size_t sqr_toom2_threshold
173 = (TUNE_SQR_TOOM2_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_TOOM2_MAX);
174 mp_size_t sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT;
175 mp_size_t sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT;
176 mp_size_t sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT;
177 mp_size_t sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT;
178 mp_size_t sqr_fft_threshold = MP_SIZE_T_MAX;
179 mp_size_t sqr_fft_modf_threshold = MP_SIZE_T_MAX;
180 mp_size_t mullo_basecase_threshold = MP_SIZE_T_MAX;
181 mp_size_t mullo_dc_threshold = MP_SIZE_T_MAX;
182 mp_size_t mullo_mul_n_threshold = MP_SIZE_T_MAX;
183 mp_size_t sqrlo_basecase_threshold = MP_SIZE_T_MAX;
184 mp_size_t sqrlo_dc_threshold = MP_SIZE_T_MAX;
185 mp_size_t sqrlo_sqr_threshold = MP_SIZE_T_MAX;
186 mp_size_t mulmid_toom42_threshold = MP_SIZE_T_MAX;
187 mp_size_t mulmod_bnm1_threshold = MP_SIZE_T_MAX;
188 mp_size_t sqrmod_bnm1_threshold = MP_SIZE_T_MAX;
189 mp_size_t div_qr_2_pi2_threshold = MP_SIZE_T_MAX;
190 mp_size_t dc_div_qr_threshold = MP_SIZE_T_MAX;
191 mp_size_t dc_divappr_q_threshold = MP_SIZE_T_MAX;
192 mp_size_t mu_div_qr_threshold = MP_SIZE_T_MAX;
193 mp_size_t mu_divappr_q_threshold = MP_SIZE_T_MAX;
194 mp_size_t mupi_div_qr_threshold = MP_SIZE_T_MAX;
195 mp_size_t mu_div_q_threshold = MP_SIZE_T_MAX;
196 mp_size_t dc_bdiv_qr_threshold = MP_SIZE_T_MAX;
197 mp_size_t dc_bdiv_q_threshold = MP_SIZE_T_MAX;
198 mp_size_t mu_bdiv_qr_threshold = MP_SIZE_T_MAX;
199 mp_size_t mu_bdiv_q_threshold = MP_SIZE_T_MAX;
200 mp_size_t inv_mulmod_bnm1_threshold = MP_SIZE_T_MAX;
201 mp_size_t inv_newton_threshold = MP_SIZE_T_MAX;
202 mp_size_t inv_appr_threshold = MP_SIZE_T_MAX;
203 mp_size_t binv_newton_threshold = MP_SIZE_T_MAX;
204 mp_size_t redc_1_to_redc_2_threshold = MP_SIZE_T_MAX;
205 mp_size_t redc_1_to_redc_n_threshold = MP_SIZE_T_MAX;
206 mp_size_t redc_2_to_redc_n_threshold = MP_SIZE_T_MAX;
207 mp_size_t matrix22_strassen_threshold = MP_SIZE_T_MAX;
208 mp_size_t hgcd_threshold = MP_SIZE_T_MAX;
209 mp_size_t hgcd_appr_threshold = MP_SIZE_T_MAX;
210 mp_size_t hgcd_reduce_threshold = MP_SIZE_T_MAX;
211 mp_size_t gcd_dc_threshold = MP_SIZE_T_MAX;
212 mp_size_t gcdext_dc_threshold = MP_SIZE_T_MAX;
213 int div_qr_1n_pi1_method = 0;
214 mp_size_t div_qr_1_norm_threshold = MP_SIZE_T_MAX;
215 mp_size_t div_qr_1_unnorm_threshold = MP_SIZE_T_MAX;
216 mp_size_t divrem_1_norm_threshold = MP_SIZE_T_MAX;
217 mp_size_t divrem_1_unnorm_threshold = MP_SIZE_T_MAX;
218 mp_size_t mod_1_norm_threshold = MP_SIZE_T_MAX;
219 mp_size_t mod_1_unnorm_threshold = MP_SIZE_T_MAX;
220 int mod_1_1p_method = 0;
221 mp_size_t mod_1n_to_mod_1_1_threshold = MP_SIZE_T_MAX;
222 mp_size_t mod_1u_to_mod_1_1_threshold = MP_SIZE_T_MAX;
223 mp_size_t mod_1_1_to_mod_1_2_threshold = MP_SIZE_T_MAX;
224 mp_size_t mod_1_2_to_mod_1_4_threshold = MP_SIZE_T_MAX;
225 mp_size_t preinv_mod_1_to_mod_1_threshold = MP_SIZE_T_MAX;
226 mp_size_t divrem_2_threshold = MP_SIZE_T_MAX;
227 mp_size_t get_str_dc_threshold = MP_SIZE_T_MAX;
228 mp_size_t get_str_precompute_threshold = MP_SIZE_T_MAX;
229 mp_size_t set_str_dc_threshold = MP_SIZE_T_MAX;
230 mp_size_t set_str_precompute_threshold = MP_SIZE_T_MAX;
231 mp_size_t fac_odd_threshold = 0;
232 mp_size_t fac_dsc_threshold = FAC_DSC_THRESHOLD_LIMIT;
233
234 mp_size_t fft_modf_sqr_threshold = MP_SIZE_T_MAX;
235 mp_size_t fft_modf_mul_threshold = MP_SIZE_T_MAX;
236
237 struct param_t {
238 const char *name;
239 speed_function_t function;
240 speed_function_t function2;
241 double step_factor; /* how much to step relatively */
242 int step; /* how much to step absolutely */
243 double function_fudge; /* multiplier for "function" speeds */
244 int stop_since_change;
245 double stop_factor;
246 mp_size_t min_size;
247 int min_is_always;
248 mp_size_t max_size;
249 mp_size_t check_size;
250 mp_size_t size_extra;
251
252 #define DATA_HIGH_LT_R 1
253 #define DATA_HIGH_GE_R 2
254 int data_high;
255
256 int noprint;
257 };
258
259
260 /* These are normally undefined when false, which suits "#if" fine.
261 But give them zero values so they can be used in plain C "if"s. */
262 #ifndef UDIV_PREINV_ALWAYS
263 #define UDIV_PREINV_ALWAYS 0
264 #endif
265 #ifndef HAVE_NATIVE_mpn_divexact_1
266 #define HAVE_NATIVE_mpn_divexact_1 0
267 #endif
268 #ifndef HAVE_NATIVE_mpn_div_qr_1n_pi1
269 #define HAVE_NATIVE_mpn_div_qr_1n_pi1 0
270 #endif
271 #ifndef HAVE_NATIVE_mpn_divrem_1
272 #define HAVE_NATIVE_mpn_divrem_1 0
273 #endif
274 #ifndef HAVE_NATIVE_mpn_divrem_2
275 #define HAVE_NATIVE_mpn_divrem_2 0
276 #endif
277 #ifndef HAVE_NATIVE_mpn_mod_1
278 #define HAVE_NATIVE_mpn_mod_1 0
279 #endif
280 #ifndef HAVE_NATIVE_mpn_mod_1_1p
281 #define HAVE_NATIVE_mpn_mod_1_1p 0
282 #endif
283 #ifndef HAVE_NATIVE_mpn_modexact_1_odd
284 #define HAVE_NATIVE_mpn_modexact_1_odd 0
285 #endif
286 #ifndef HAVE_NATIVE_mpn_preinv_divrem_1
287 #define HAVE_NATIVE_mpn_preinv_divrem_1 0
288 #endif
289 #ifndef HAVE_NATIVE_mpn_preinv_mod_1
290 #define HAVE_NATIVE_mpn_preinv_mod_1 0
291 #endif
292 #ifndef HAVE_NATIVE_mpn_sqr_basecase
293 #define HAVE_NATIVE_mpn_sqr_basecase 0
294 #endif
295
296
297 #define MAX3(a,b,c) MAX (MAX (a, b), c)
298
299 mp_limb_t
randlimb_norm(void)300 randlimb_norm (void)
301 {
302 mp_limb_t n;
303 mpn_random (&n, 1);
304 n |= GMP_NUMB_HIGHBIT;
305 return n;
306 }
307
308 #define GMP_NUMB_HALFMASK ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1)
309
310 mp_limb_t
randlimb_half(void)311 randlimb_half (void)
312 {
313 mp_limb_t n;
314 mpn_random (&n, 1);
315 n &= GMP_NUMB_HALFMASK;
316 n += (n==0);
317 return n;
318 }
319
320
321 /* Add an entry to the end of the dat[] array, reallocing to make it bigger
322 if necessary. */
323 void
add_dat(mp_size_t size,double d)324 add_dat (mp_size_t size, double d)
325 {
326 #define ALLOCDAT_STEP 500
327
328 ASSERT_ALWAYS (ndat <= allocdat);
329
330 if (ndat == allocdat)
331 {
332 dat = (struct dat_t *) __gmp_allocate_or_reallocate
333 (dat, allocdat * sizeof(dat[0]),
334 (allocdat+ALLOCDAT_STEP) * sizeof(dat[0]));
335 allocdat += ALLOCDAT_STEP;
336 }
337
338 dat[ndat].size = size;
339 dat[ndat].d = d;
340 ndat++;
341 }
342
343
344 /* Return the threshold size based on the data accumulated. */
345 mp_size_t
analyze_dat(int final)346 analyze_dat (int final)
347 {
348 double x, min_x;
349 int j, min_j;
350
351 /* If the threshold is set at dat[0].size, any positive values are bad. */
352 x = 0.0;
353 for (j = 0; j < ndat; j++)
354 if (dat[j].d > 0.0)
355 x += dat[j].d;
356
357 if (option_trace >= 2 && final)
358 {
359 printf ("\n");
360 printf ("x is the sum of the badness from setting thresh at given size\n");
361 printf (" (minimum x is sought)\n");
362 printf ("size=%ld first x=%.4f\n", (long) dat[j].size, x);
363 }
364
365 min_x = x;
366 min_j = 0;
367
368
369 /* When stepping to the next dat[j].size, positive values are no longer
370 bad (so subtracted), negative values become bad (so add the absolute
371 value, meaning subtract). */
372 for (j = 0; j < ndat; x -= dat[j].d, j++)
373 {
374 if (option_trace >= 2 && final)
375 printf ("size=%ld x=%.4f\n", (long) dat[j].size, x);
376
377 if (x < min_x)
378 {
379 min_x = x;
380 min_j = j;
381 }
382 }
383
384 return min_j;
385 }
386
387
388 /* Measuring for recompiled mpn/generic/div_qr_1.c,
389 * mpn/generic/divrem_1.c, mpn/generic/mod_1.c and mpz/fac_ui.c */
390
391 mp_limb_t mpn_div_qr_1_tune (mp_ptr, mp_limb_t *, mp_srcptr, mp_size_t, mp_limb_t);
392
393 #if defined (__cplusplus)
394 extern "C" {
395 #endif
396
397 mp_limb_t mpn_divrem_1_tune (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
398 mp_limb_t mpn_mod_1_tune (mp_srcptr, mp_size_t, mp_limb_t);
399 void mpz_fac_ui_tune (mpz_ptr, unsigned long);
400
401 #if defined (__cplusplus)
402 }
403 #endif
404
405 double
speed_mpn_mod_1_tune(struct speed_params * s)406 speed_mpn_mod_1_tune (struct speed_params *s)
407 {
408 SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune);
409 }
410 double
speed_mpn_divrem_1_tune(struct speed_params * s)411 speed_mpn_divrem_1_tune (struct speed_params *s)
412 {
413 SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune);
414 }
415 double
speed_mpz_fac_ui_tune(struct speed_params * s)416 speed_mpz_fac_ui_tune (struct speed_params *s)
417 {
418 SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui_tune);
419 }
420 double
speed_mpn_div_qr_1_tune(struct speed_params * s)421 speed_mpn_div_qr_1_tune (struct speed_params *s)
422 {
423 SPEED_ROUTINE_MPN_DIV_QR_1 (mpn_div_qr_1_tune);
424 }
425
426 double
tuneup_measure(speed_function_t fun,const struct param_t * param,struct speed_params * s)427 tuneup_measure (speed_function_t fun,
428 const struct param_t *param,
429 struct speed_params *s)
430 {
431 static struct param_t dummy;
432 double t;
433 TMP_DECL;
434
435 if (! param)
436 param = &dummy;
437
438 s->size += param->size_extra;
439
440 TMP_MARK;
441 SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0);
442 SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0);
443
444 mpn_random (s->xp, s->size);
445 mpn_random (s->yp, s->size);
446
447 switch (param->data_high) {
448 case DATA_HIGH_LT_R:
449 s->xp[s->size-1] %= s->r;
450 s->yp[s->size-1] %= s->r;
451 break;
452 case DATA_HIGH_GE_R:
453 s->xp[s->size-1] |= s->r;
454 s->yp[s->size-1] |= s->r;
455 break;
456 }
457
458 t = speed_measure (fun, s);
459
460 s->size -= param->size_extra;
461
462 TMP_FREE;
463 return t;
464 }
465
466
467 #define PRINT_WIDTH 31
468
469 void
print_define_start(const char * name)470 print_define_start (const char *name)
471 {
472 printf ("#define %-*s ", PRINT_WIDTH, name);
473 if (option_trace)
474 printf ("...\n");
475 }
476
477 void
print_define_end_remark(const char * name,mp_size_t value,const char * remark)478 print_define_end_remark (const char *name, mp_size_t value, const char *remark)
479 {
480 if (option_trace)
481 printf ("#define %-*s ", PRINT_WIDTH, name);
482
483 if (value == MP_SIZE_T_MAX)
484 printf ("MP_SIZE_T_MAX");
485 else
486 printf ("%5ld", (long) value);
487
488 if (remark != NULL)
489 printf (" /* %s */", remark);
490 printf ("\n");
491 fflush (stdout);
492 }
493
494 void
print_define_end(const char * name,mp_size_t value)495 print_define_end (const char *name, mp_size_t value)
496 {
497 const char *remark;
498 if (value == MP_SIZE_T_MAX)
499 remark = "never";
500 else if (value == 0)
501 remark = "always";
502 else
503 remark = NULL;
504 print_define_end_remark (name, value, remark);
505 }
506
507 void
print_define(const char * name,mp_size_t value)508 print_define (const char *name, mp_size_t value)
509 {
510 print_define_start (name);
511 print_define_end (name, value);
512 }
513
514 void
print_define_remark(const char * name,mp_size_t value,const char * remark)515 print_define_remark (const char *name, mp_size_t value, const char *remark)
516 {
517 print_define_start (name);
518 print_define_end_remark (name, value, remark);
519 }
520
521 void
print_define_with_speedup(const char * name,mp_size_t value,mp_size_t runner_up,double speedup)522 print_define_with_speedup (const char *name, mp_size_t value,
523 mp_size_t runner_up, double speedup)
524 {
525 char buf[100];
526 #if __STDC_VERSION__ >= 199901L
527 snprintf (buf, sizeof buf, "%.2f%% faster than %ld",
528 100.0 * (speedup - 1), runner_up);
529 #else
530 sprintf (buf, "%.2f%% faster than %ld",
531 100.0 * (speedup - 1), runner_up);
532 #endif
533 print_define_remark (name, value, buf);
534 }
535
536 void
one(mp_size_t * threshold,struct param_t * param)537 one (mp_size_t *threshold, struct param_t *param)
538 {
539 int since_positive, since_thresh_change;
540 int thresh_idx, new_thresh_idx;
541
542 #define DEFAULT(x,n) do { if (! (x)) (x) = (n); } while (0)
543
544 DEFAULT (param->function_fudge, 1.0);
545 DEFAULT (param->function2, param->function);
546 DEFAULT (param->step_factor, 0.01); /* small steps by default */
547 DEFAULT (param->step, 1); /* small steps by default */
548 DEFAULT (param->stop_since_change, 80);
549 DEFAULT (param->stop_factor, 1.2);
550 DEFAULT (param->min_size, 10);
551 DEFAULT (param->max_size, DEFAULT_MAX_SIZE);
552
553 if (param->check_size != 0)
554 {
555 double t1, t2;
556 s.size = param->check_size;
557
558 *threshold = s.size+1;
559 t1 = tuneup_measure (param->function, param, &s);
560
561 *threshold = s.size;
562 t2 = tuneup_measure (param->function2, param, &s);
563 if (t1 == -1.0 || t2 == -1.0)
564 {
565 printf ("Oops, can't run both functions at size %ld\n",
566 (long) s.size);
567 abort ();
568 }
569 t1 *= param->function_fudge;
570
571 /* ask that t2 is at least 4% below t1 */
572 if (t1 < t2*1.04)
573 {
574 if (option_trace)
575 printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2);
576 *threshold = MP_SIZE_T_MAX;
577 if (! param->noprint)
578 print_define (param->name, *threshold);
579 return;
580 }
581
582 if (option_trace >= 2)
583 printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n",
584 (long) s.size, t1, t2);
585 }
586
587 if (! param->noprint || option_trace)
588 print_define_start (param->name);
589
590 ndat = 0;
591 since_positive = 0;
592 since_thresh_change = 0;
593 thresh_idx = 0;
594
595 if (option_trace >= 2)
596 {
597 printf (" algorithm-A algorithm-B ratio possible\n");
598 printf (" (seconds) (seconds) diff thresh\n");
599 }
600
601 for (s.size = param->min_size;
602 s.size < param->max_size;
603 s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), param->step))
604 {
605 double ti, tiplus1, d;
606
607 /*
608 FIXME: check minimum size requirements are met, possibly by just
609 checking for the -1 returns from the speed functions.
610 */
611
612 /* using method A at this size */
613 *threshold = s.size+1;
614 ti = tuneup_measure (param->function, param, &s);
615 if (ti == -1.0)
616 abort ();
617 ti *= param->function_fudge;
618
619 /* using method B at this size */
620 *threshold = s.size;
621 tiplus1 = tuneup_measure (param->function2, param, &s);
622 if (tiplus1 == -1.0)
623 abort ();
624
625 /* Calculate the fraction by which the one or the other routine is
626 slower. */
627 if (tiplus1 >= ti)
628 d = (tiplus1 - ti) / tiplus1; /* negative */
629 else
630 d = (tiplus1 - ti) / ti; /* positive */
631
632 add_dat (s.size, d);
633
634 new_thresh_idx = analyze_dat (0);
635
636 if (option_trace >= 2)
637 printf ("size=%ld %.9f %.9f % .4f %c %ld\n",
638 (long) s.size, ti, tiplus1, d,
639 ti > tiplus1 ? '#' : ' ',
640 (long) dat[new_thresh_idx].size);
641
642 /* Stop if the last time method i was faster was more than a
643 certain number of measurements ago. */
644 #define STOP_SINCE_POSITIVE 200
645 if (d >= 0)
646 since_positive = 0;
647 else
648 if (++since_positive > STOP_SINCE_POSITIVE)
649 {
650 if (option_trace >= 1)
651 printf ("stopped due to since_positive (%d)\n",
652 STOP_SINCE_POSITIVE);
653 break;
654 }
655
656 /* Stop if method A has become slower by a certain factor. */
657 if (ti >= tiplus1 * param->stop_factor)
658 {
659 if (option_trace >= 1)
660 printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n",
661 param->stop_factor);
662 break;
663 }
664
665 /* Stop if the threshold implied hasn't changed in a certain
666 number of measurements. (It's this condition that usually
667 stops the loop.) */
668 if (thresh_idx != new_thresh_idx)
669 since_thresh_change = 0, thresh_idx = new_thresh_idx;
670 else
671 if (++since_thresh_change > param->stop_since_change)
672 {
673 if (option_trace >= 1)
674 printf ("stopped due to since_thresh_change (%d)\n",
675 param->stop_since_change);
676 break;
677 }
678
679 /* Stop if the threshold implied is more than a certain number of
680 measurements ago. */
681 #define STOP_SINCE_AFTER 500
682 if (ndat - thresh_idx > STOP_SINCE_AFTER)
683 {
684 if (option_trace >= 1)
685 printf ("stopped due to ndat - thresh_idx > amount (%d)\n",
686 STOP_SINCE_AFTER);
687 break;
688 }
689
690 /* Stop when the size limit is reached before the end of the
691 crossover, but only show this as an error for >= the default max
692 size. FIXME: Maybe should make it a param choice whether this is
693 an error. */
694 if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE)
695 {
696 fprintf (stderr, "%s\n", param->name);
697 fprintf (stderr, "sizes %ld to %ld total %d measurements\n",
698 (long) dat[0].size, (long) dat[ndat-1].size, ndat);
699 fprintf (stderr, " max size reached before end of crossover\n");
700 break;
701 }
702 }
703
704 if (option_trace >= 1)
705 printf ("sizes %ld to %ld total %d measurements\n",
706 (long) dat[0].size, (long) dat[ndat-1].size, ndat);
707
708 *threshold = dat[analyze_dat (1)].size;
709
710 if (param->min_is_always)
711 {
712 if (*threshold == param->min_size)
713 *threshold = 0;
714 }
715
716 if (! param->noprint || option_trace)
717 print_define_end (param->name, *threshold);
718 }
719
720 /* Time N different FUNCTIONS with the same parameters and size, to
721 select the fastest. Since *_METHOD defines start numbering from
722 one, if functions[i] is fastest, the value of the define is i+1.
723 Also output a comment with speedup compared to the next fastest
724 function. The NAME argument is used only for trace output.
725
726 Returns the index of the fastest function.
727 */
728 int
one_method(int n,speed_function_t * functions,const char * name,const char * define,const struct param_t * param)729 one_method (int n, speed_function_t *functions,
730 const char *name, const char *define,
731 const struct param_t *param)
732 {
733 double *t;
734 int i;
735 int method;
736 int method_runner_up;
737
738 TMP_DECL;
739 TMP_MARK;
740 t = (double*) TMP_ALLOC (n * sizeof (*t));
741
742 for (i = 0; i < n; i++)
743 {
744 t[i] = tuneup_measure (functions[i], param, &s);
745 if (option_trace >= 1)
746 printf ("size=%ld, %s, method %d %.9f\n",
747 (long) s.size, name, i + 1, t[i]);
748 if (t[i] == -1.0)
749 {
750 printf ("Oops, can't measure all %s methods\n", name);
751 abort ();
752 }
753 }
754 method = 0;
755 for (i = 1; i < n; i++)
756 if (t[i] < t[method])
757 method = i;
758
759 method_runner_up = (method == 0);
760 for (i = 0; i < n; i++)
761 if (i != method && t[i] < t[method_runner_up])
762 method_runner_up = i;
763
764 print_define_with_speedup (define, method + 1, method_runner_up + 1,
765 t[method_runner_up] / t[method]);
766
767 TMP_FREE;
768 return method;
769 }
770
771
772 /* Special probing for the fft thresholds. The size restrictions on the
773 FFTs mean the graph of time vs size has a step effect. See this for
774 example using
775
776 ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9
777 gnuplot foo.gnuplot
778
779 The current approach is to compare routines at the midpoint of relevant
780 steps. Arguably a more sophisticated system of threshold data is wanted
781 if this step effect remains. */
782
783 struct fft_param_t {
784 const char *table_name;
785 const char *threshold_name;
786 const char *modf_threshold_name;
787 mp_size_t *p_threshold;
788 mp_size_t *p_modf_threshold;
789 mp_size_t first_size;
790 mp_size_t max_size;
791 speed_function_t function;
792 speed_function_t mul_modf_function;
793 speed_function_t mul_function;
794 mp_size_t sqr;
795 };
796
797
798 /* mpn_mul_fft requires pl a multiple of 2^k limbs, but with
799 N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple
800 of 2^(k-1) bits. */
801
802 mp_size_t
fft_step_size(int k)803 fft_step_size (int k)
804 {
805 mp_size_t step;
806
807 step = MAX ((mp_size_t) 1 << (k-1), GMP_LIMB_BITS) / GMP_LIMB_BITS;
808 step *= (mp_size_t) 1 << k;
809
810 if (step <= 0)
811 {
812 printf ("Can't handle k=%d\n", k);
813 abort ();
814 }
815
816 return step;
817 }
818
819 mp_size_t
fft_next_size(mp_size_t pl,int k)820 fft_next_size (mp_size_t pl, int k)
821 {
822 mp_size_t m = fft_step_size (k);
823
824 /* printf ("[k=%d %ld] %ld ->", k, m, pl); */
825
826 if (pl == 0 || (pl & (m-1)) != 0)
827 pl = (pl | (m-1)) + 1;
828
829 /* printf (" %ld\n", pl); */
830 return pl;
831 }
832
833 #define NMAX_DEFAULT 1000000
834 #define MAX_REPS 25
835 #define MIN_REPS 5
836
837 static inline size_t
mpn_mul_fft_lcm(size_t a,unsigned int k)838 mpn_mul_fft_lcm (size_t a, unsigned int k)
839 {
840 unsigned int l = k;
841
842 while (a % 2 == 0 && k > 0)
843 {
844 a >>= 1;
845 k--;
846 }
847 return a << l;
848 }
849
850 mp_size_t
fftfill(mp_size_t pl,int k,int sqr)851 fftfill (mp_size_t pl, int k, int sqr)
852 {
853 mp_size_t maxLK;
854 mp_bitcnt_t N, Nprime, nprime, M;
855
856 N = pl * GMP_NUMB_BITS;
857 M = N >> k;
858
859 maxLK = mpn_mul_fft_lcm ((unsigned long) GMP_NUMB_BITS, k);
860
861 Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK;
862 nprime = Nprime / GMP_NUMB_BITS;
863 if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
864 {
865 size_t K2;
866 for (;;)
867 {
868 K2 = 1L << mpn_fft_best_k (nprime, sqr);
869 if ((nprime & (K2 - 1)) == 0)
870 break;
871 nprime = (nprime + K2 - 1) & -K2;
872 Nprime = nprime * GMP_LIMB_BITS;
873 }
874 }
875 ASSERT_ALWAYS (nprime < pl);
876
877 return Nprime;
878 }
879
880 static int
compare_double(const void * ap,const void * bp)881 compare_double (const void *ap, const void *bp)
882 {
883 double a = * (const double *) ap;
884 double b = * (const double *) bp;
885
886 if (a < b)
887 return -1;
888 else if (a > b)
889 return 1;
890 else
891 return 0;
892 }
893
894 double
median(double * times,int n)895 median (double *times, int n)
896 {
897 qsort (times, n, sizeof (double), compare_double);
898 return times[n/2];
899 }
900
901 #define FFT_CACHE_SIZE 25
902 typedef struct fft_cache
903 {
904 mp_size_t n;
905 double time;
906 } fft_cache_t;
907
908 fft_cache_t fft_cache[FFT_CACHE_SIZE];
909
910 double
cached_measure(mp_ptr rp,mp_srcptr ap,mp_srcptr bp,mp_size_t n,int k,int n_measurements)911 cached_measure (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, int k,
912 int n_measurements)
913 {
914 int i;
915 double t, ttab[MAX_REPS];
916
917 if (fft_cache[k].n == n)
918 return fft_cache[k].time;
919
920 for (i = 0; i < n_measurements; i++)
921 {
922 speed_starttime ();
923 mpn_mul_fft (rp, n, ap, n, bp, n, k);
924 ttab[i] = speed_endtime ();
925 }
926
927 t = median (ttab, n_measurements);
928 fft_cache[k].n = n;
929 fft_cache[k].time = t;
930 return t;
931 }
932
933 #define INSERT_FFTTAB(idx, nval, kval) \
934 do { \
935 fft_tab[idx].n = nval; \
936 fft_tab[idx].k = kval; \
937 fft_tab[idx+1].n = (1 << 27) - 1; /* sentinel, 27b wide field */ \
938 fft_tab[idx+1].k = (1 << 5) - 1; \
939 } while (0)
940
941 int
fftmes(mp_size_t nmin,mp_size_t nmax,int initial_k,struct fft_param_t * p,int idx,int print)942 fftmes (mp_size_t nmin, mp_size_t nmax, int initial_k, struct fft_param_t *p, int idx, int print)
943 {
944 mp_size_t n, n1, prev_n1;
945 int k, best_k, last_best_k, kmax;
946 int eff, prev_eff;
947 double t0, t1;
948 int n_measurements;
949 mp_limb_t *ap, *bp, *rp;
950 mp_size_t alloc;
951 struct fft_table_nk *fft_tab;
952
953 fft_tab = mpn_fft_table3[p->sqr];
954
955 for (k = 0; k < FFT_CACHE_SIZE; k++)
956 fft_cache[k].n = 0;
957
958 if (nmin < (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
959 {
960 nmin = (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD);
961 }
962
963 if (print)
964 printf ("#define %s%*s", p->table_name, 38, "");
965
966 if (idx == 0)
967 {
968 INSERT_FFTTAB (0, nmin, initial_k);
969
970 if (print)
971 {
972 printf ("\\\n { ");
973 printf ("{%7u,%2u}", fft_tab[0].n, fft_tab[0].k);
974 }
975
976 idx = 1;
977 }
978
979 ap = (mp_ptr) malloc (sizeof (mp_limb_t));
980 if (p->sqr)
981 bp = ap;
982 else
983 bp = (mp_ptr) malloc (sizeof (mp_limb_t));
984 rp = (mp_ptr) malloc (sizeof (mp_limb_t));
985 alloc = 1;
986
987 /* Round n to comply to initial k value */
988 n = (nmin + ((1ul << initial_k) - 1)) & (MP_SIZE_T_MAX << initial_k);
989
990 n_measurements = (18 - initial_k) | 1;
991 n_measurements = MAX (n_measurements, MIN_REPS);
992 n_measurements = MIN (n_measurements, MAX_REPS);
993
994 last_best_k = initial_k;
995 best_k = initial_k;
996
997 while (n < nmax)
998 {
999 int start_k, end_k;
1000
1001 /* Assume the current best k is best until we hit its next FFT step. */
1002 t0 = 99999;
1003
1004 prev_n1 = n + 1;
1005
1006 start_k = MAX (4, best_k - 4);
1007 end_k = MIN (24, best_k + 4);
1008 for (k = start_k; k <= end_k; k++)
1009 {
1010 n1 = mpn_fft_next_size (prev_n1, k);
1011
1012 eff = 200 * (n1 * GMP_NUMB_BITS >> k) / fftfill (n1, k, p->sqr);
1013
1014 if (eff < 70) /* avoid measuring too slow fft:s */
1015 continue;
1016
1017 if (n1 > alloc)
1018 {
1019 alloc = n1;
1020 if (p->sqr)
1021 {
1022 ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t));
1023 rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t));
1024 ap = bp = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t));
1025 mpn_random (ap, alloc);
1026 rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t));
1027 }
1028 else
1029 {
1030 ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t));
1031 bp = (mp_ptr) realloc (bp, sizeof (mp_limb_t));
1032 rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t));
1033 ap = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t));
1034 mpn_random (ap, alloc);
1035 bp = (mp_ptr) realloc (bp, alloc * sizeof (mp_limb_t));
1036 mpn_random (bp, alloc);
1037 rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t));
1038 }
1039 }
1040
1041 t1 = cached_measure (rp, ap, bp, n1, k, n_measurements);
1042
1043 if (t1 * n_measurements > 0.3)
1044 n_measurements -= 2;
1045 n_measurements = MAX (n_measurements, MIN_REPS);
1046
1047 if (t1 < t0)
1048 {
1049 best_k = k;
1050 t0 = t1;
1051 }
1052 }
1053
1054 n1 = mpn_fft_next_size (prev_n1, best_k);
1055
1056 if (last_best_k != best_k)
1057 {
1058 ASSERT_ALWAYS ((prev_n1 & ((1ul << last_best_k) - 1)) == 1);
1059
1060 if (idx >= FFT_TABLE3_SIZE)
1061 {
1062 printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
1063 abort ();
1064 }
1065 INSERT_FFTTAB (idx, prev_n1 >> last_best_k, best_k);
1066
1067 if (print)
1068 {
1069 printf (", ");
1070 if (idx % 4 == 0)
1071 printf ("\\\n ");
1072 printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
1073 }
1074
1075 if (option_trace >= 2)
1076 {
1077 printf ("{%lu,%u}\n", prev_n1, best_k);
1078 fflush (stdout);
1079 }
1080
1081 last_best_k = best_k;
1082 idx++;
1083 }
1084
1085 for (;;)
1086 {
1087 prev_n1 = n1;
1088 prev_eff = fftfill (prev_n1, best_k, p->sqr);
1089 n1 = mpn_fft_next_size (prev_n1 + 1, best_k);
1090 eff = fftfill (n1, best_k, p->sqr);
1091
1092 if (eff != prev_eff)
1093 break;
1094 }
1095
1096 n = prev_n1;
1097 }
1098
1099 kmax = sizeof (mp_size_t) * 4; /* GMP_MP_SIZE_T_BITS / 2 */
1100 kmax = MIN (kmax, 25-1);
1101 for (k = last_best_k + 1; k <= kmax; k++)
1102 {
1103 if (idx >= FFT_TABLE3_SIZE)
1104 {
1105 printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
1106 abort ();
1107 }
1108 INSERT_FFTTAB (idx, ((1ul << (2*k-2)) + 1) >> (k-1), k);
1109
1110 if (print)
1111 {
1112 printf (", ");
1113 if (idx % 4 == 0)
1114 printf ("\\\n ");
1115 printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
1116 }
1117
1118 idx++;
1119 }
1120
1121 if (print)
1122 printf (" }\n");
1123
1124 free (ap);
1125 if (! p->sqr)
1126 free (bp);
1127 free (rp);
1128
1129 return idx;
1130 }
1131
1132 void
fft(struct fft_param_t * p)1133 fft (struct fft_param_t *p)
1134 {
1135 mp_size_t size;
1136 int k, idx, initial_k;
1137
1138 /*** Generate MUL_FFT_MODF_THRESHOLD / SQR_FFT_MODF_THRESHOLD ***/
1139
1140 #if 1
1141 {
1142 /* Use plain one() mechanism, for some reasonable initial values of k. The
1143 advantage is that we don't depend on mpn_fft_table3, which can therefore
1144 leave it completely uninitialized. */
1145
1146 static struct param_t param;
1147 mp_size_t thres, best_thres;
1148 int best_k;
1149 char buf[20];
1150
1151 best_thres = MP_SIZE_T_MAX;
1152 best_k = -1;
1153
1154 for (k = 5; k <= 7; k++)
1155 {
1156 param.name = p->modf_threshold_name;
1157 param.min_size = 100;
1158 param.max_size = 2000;
1159 param.function = p->mul_function;
1160 param.step_factor = 0.0;
1161 param.step = 4;
1162 param.function2 = p->mul_modf_function;
1163 param.noprint = 1;
1164 s.r = k;
1165 one (&thres, ¶m);
1166 if (thres < best_thres)
1167 {
1168 best_thres = thres;
1169 best_k = k;
1170 }
1171 }
1172
1173 *(p->p_modf_threshold) = best_thres;
1174 sprintf (buf, "k = %d", best_k);
1175 print_define_remark (p->modf_threshold_name, best_thres, buf);
1176 initial_k = best_k;
1177 }
1178 #else
1179 size = p->first_size;
1180 for (;;)
1181 {
1182 double tk, tm;
1183
1184 size = mpn_fft_next_size (size+1, mpn_fft_best_k (size+1, p->sqr));
1185 k = mpn_fft_best_k (size, p->sqr);
1186
1187 if (size >= p->max_size)
1188 break;
1189
1190 s.size = size + fft_step_size (k) / 2;
1191 s.r = k;
1192 tk = tuneup_measure (p->mul_modf_function, NULL, &s);
1193 if (tk == -1.0)
1194 abort ();
1195
1196 tm = tuneup_measure (p->mul_function, NULL, &s);
1197 if (tm == -1.0)
1198 abort ();
1199
1200 if (option_trace >= 2)
1201 printf ("at %ld size=%ld k=%d %.9f size=%ld modf %.9f\n",
1202 (long) size,
1203 (long) size + fft_step_size (k) / 2, k, tk,
1204 (long) s.size, tm);
1205
1206 if (tk < tm)
1207 {
1208 *p->p_modf_threshold = s.size;
1209 print_define (p->modf_threshold_name, *p->p_modf_threshold);
1210 break;
1211 }
1212 }
1213 initial_k = ?;
1214 #endif
1215
1216 /*** Generate MUL_FFT_TABLE3 / SQR_FFT_TABLE3 ***/
1217
1218 idx = fftmes (*p->p_modf_threshold, p->max_size, initial_k, p, 0, 1);
1219 printf ("#define %s_SIZE %d\n", p->table_name, idx);
1220
1221 /*** Generate MUL_FFT_THRESHOLD / SQR_FFT_THRESHOLD ***/
1222
1223 size = 2 * *p->p_modf_threshold; /* OK? */
1224 for (;;)
1225 {
1226 double tk, tm;
1227 mp_size_t mulmod_size, mul_size;;
1228
1229 if (size >= p->max_size)
1230 break;
1231
1232 mulmod_size = mpn_mulmod_bnm1_next_size (2 * (size + 1)) / 2;
1233 mul_size = (size + mulmod_size) / 2; /* middle of step */
1234
1235 s.size = mulmod_size;
1236 tk = tuneup_measure (p->function, NULL, &s);
1237 if (tk == -1.0)
1238 abort ();
1239
1240 s.size = mul_size;
1241 tm = tuneup_measure (p->mul_function, NULL, &s);
1242 if (tm == -1.0)
1243 abort ();
1244
1245 if (option_trace >= 2)
1246 printf ("at %ld size=%ld %.9f size=%ld mul %.9f\n",
1247 (long) size,
1248 (long) mulmod_size, tk,
1249 (long) mul_size, tm);
1250
1251 size = mulmod_size;
1252
1253 if (tk < tm)
1254 {
1255 *p->p_threshold = s.size;
1256 print_define (p->threshold_name, *p->p_threshold);
1257 break;
1258 }
1259 }
1260 }
1261
1262 /* Compare mpn_mul_1 to whatever fast exact single-limb division we have. This
1263 is currently mpn_divexact_1, but will become mpn_bdiv_1_qr_pi2 or somesuch.
1264 This is used in get_str and set_str. */
1265 void
relspeed_div_1_vs_mul_1(void)1266 relspeed_div_1_vs_mul_1 (void)
1267 {
1268 #define max_opsize 100
1269 mp_size_t n;
1270 long j;
1271 mp_limb_t rp[max_opsize];
1272 mp_limb_t ap[max_opsize];
1273 double multime, divtime;
1274
1275 mpn_random (ap, max_opsize);
1276
1277 multime = 0;
1278 for (n = max_opsize; n > 1; n--)
1279 {
1280 mpn_mul_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
1281 speed_starttime ();
1282 for (j = speed_precision; j != 0 ; j--)
1283 mpn_mul_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
1284 multime += speed_endtime () / n;
1285 }
1286
1287 divtime = 0;
1288 for (n = max_opsize; n > 1; n--)
1289 {
1290 /* Make input divisible for good measure. */
1291 ap[n - 1] = mpn_mul_1 (ap, ap, n - 1, MP_BASES_BIG_BASE_10);
1292
1293 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 || ! HAVE_NATIVE_mpn_divexact_1
1294 mpn_pi1_bdiv_q_1 (rp, ap, n, MP_BASES_BIG_BASE_10,
1295 MP_BASES_BIG_BASE_BINVERTED_10,
1296 MP_BASES_BIG_BASE_CTZ_10);
1297 #else
1298 mpn_divexact_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
1299 #endif
1300 speed_starttime ();
1301 for (j = speed_precision; j != 0 ; j--)
1302 {
1303 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 || ! HAVE_NATIVE_mpn_divexact_1
1304 mpn_pi1_bdiv_q_1 (rp, ap, n, MP_BASES_BIG_BASE_10,
1305 MP_BASES_BIG_BASE_BINVERTED_10,
1306 MP_BASES_BIG_BASE_CTZ_10);
1307 #else
1308 mpn_divexact_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
1309 #endif
1310 }
1311 divtime += speed_endtime () / n;
1312 }
1313
1314 print_define ("DIV_1_VS_MUL_1_PERCENT", (int) (100 * divtime/multime));
1315 }
1316
1317
1318 /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
1319 giving wrong results. */
1320 void
tune_mul_n(void)1321 tune_mul_n (void)
1322 {
1323 static struct param_t param;
1324 mp_size_t next_toom_start;
1325 int something_changed;
1326
1327 param.function = speed_mpn_mul_n;
1328
1329 param.name = "MUL_TOOM22_THRESHOLD";
1330 param.min_size = MAX (4, MPN_TOOM22_MUL_MINSIZE);
1331 param.max_size = MUL_TOOM22_THRESHOLD_LIMIT-1;
1332 one (&mul_toom22_threshold, ¶m);
1333
1334 param.noprint = 1;
1335
1336 /* Threshold sequence loop. Disable functions that would be used in a very
1337 narrow range, re-measuring things when that happens. */
1338 something_changed = 1;
1339 while (something_changed)
1340 {
1341 something_changed = 0;
1342
1343 next_toom_start = mul_toom22_threshold;
1344
1345 if (mul_toom33_threshold != 0)
1346 {
1347 param.name = "MUL_TOOM33_THRESHOLD";
1348 param.min_size = MAX (next_toom_start, MPN_TOOM33_MUL_MINSIZE);
1349 param.max_size = MUL_TOOM33_THRESHOLD_LIMIT-1;
1350 one (&mul_toom33_threshold, ¶m);
1351
1352 if (next_toom_start * 1.05 >= mul_toom33_threshold)
1353 {
1354 mul_toom33_threshold = 0;
1355 something_changed = 1;
1356 }
1357 }
1358
1359 next_toom_start = MAX (next_toom_start, mul_toom33_threshold);
1360
1361 if (mul_toom44_threshold != 0)
1362 {
1363 param.name = "MUL_TOOM44_THRESHOLD";
1364 param.min_size = MAX (next_toom_start, MPN_TOOM44_MUL_MINSIZE);
1365 param.max_size = MUL_TOOM44_THRESHOLD_LIMIT-1;
1366 one (&mul_toom44_threshold, ¶m);
1367
1368 if (next_toom_start * 1.05 >= mul_toom44_threshold)
1369 {
1370 mul_toom44_threshold = 0;
1371 something_changed = 1;
1372 }
1373 }
1374
1375 next_toom_start = MAX (next_toom_start, mul_toom44_threshold);
1376
1377 if (mul_toom6h_threshold != 0)
1378 {
1379 param.name = "MUL_TOOM6H_THRESHOLD";
1380 param.min_size = MAX (next_toom_start, MPN_TOOM6H_MUL_MINSIZE);
1381 param.max_size = MUL_TOOM6H_THRESHOLD_LIMIT-1;
1382 one (&mul_toom6h_threshold, ¶m);
1383
1384 if (next_toom_start * 1.05 >= mul_toom6h_threshold)
1385 {
1386 mul_toom6h_threshold = 0;
1387 something_changed = 1;
1388 }
1389 }
1390
1391 next_toom_start = MAX (next_toom_start, mul_toom6h_threshold);
1392
1393 if (mul_toom8h_threshold != 0)
1394 {
1395 param.name = "MUL_TOOM8H_THRESHOLD";
1396 param.min_size = MAX (next_toom_start, MPN_TOOM8H_MUL_MINSIZE);
1397 param.max_size = MUL_TOOM8H_THRESHOLD_LIMIT-1;
1398 one (&mul_toom8h_threshold, ¶m);
1399
1400 if (next_toom_start * 1.05 >= mul_toom8h_threshold)
1401 {
1402 mul_toom8h_threshold = 0;
1403 something_changed = 1;
1404 }
1405 }
1406 }
1407
1408 print_define ("MUL_TOOM33_THRESHOLD", MUL_TOOM33_THRESHOLD);
1409 print_define ("MUL_TOOM44_THRESHOLD", MUL_TOOM44_THRESHOLD);
1410 print_define ("MUL_TOOM6H_THRESHOLD", MUL_TOOM6H_THRESHOLD);
1411 print_define ("MUL_TOOM8H_THRESHOLD", MUL_TOOM8H_THRESHOLD);
1412
1413 /* disabled until tuned */
1414 MUL_FFT_THRESHOLD = MP_SIZE_T_MAX;
1415 }
1416
1417 void
tune_mul(void)1418 tune_mul (void)
1419 {
1420 static struct param_t param;
1421 mp_size_t thres;
1422
1423 param.noprint = 1;
1424
1425 param.function = speed_mpn_toom32_for_toom43_mul;
1426 param.function2 = speed_mpn_toom43_for_toom32_mul;
1427 param.name = "MUL_TOOM32_TO_TOOM43_THRESHOLD";
1428 param.min_size = MPN_TOOM43_MUL_MINSIZE * 24 / 17;
1429 one (&thres, ¶m);
1430 mul_toom32_to_toom43_threshold = thres * 17 / 24;
1431 print_define ("MUL_TOOM32_TO_TOOM43_THRESHOLD", mul_toom32_to_toom43_threshold);
1432
1433 param.function = speed_mpn_toom32_for_toom53_mul;
1434 param.function2 = speed_mpn_toom53_for_toom32_mul;
1435 param.name = "MUL_TOOM32_TO_TOOM53_THRESHOLD";
1436 param.min_size = MPN_TOOM53_MUL_MINSIZE * 30 / 19;
1437 one (&thres, ¶m);
1438 mul_toom32_to_toom53_threshold = thres * 19 / 30;
1439 print_define ("MUL_TOOM32_TO_TOOM53_THRESHOLD", mul_toom32_to_toom53_threshold);
1440
1441 param.function = speed_mpn_toom42_for_toom53_mul;
1442 param.function2 = speed_mpn_toom53_for_toom42_mul;
1443 param.name = "MUL_TOOM42_TO_TOOM53_THRESHOLD";
1444 param.min_size = MPN_TOOM53_MUL_MINSIZE * 20 / 11;
1445 one (&thres, ¶m);
1446 mul_toom42_to_toom53_threshold = thres * 11 / 20;
1447 print_define ("MUL_TOOM42_TO_TOOM53_THRESHOLD", mul_toom42_to_toom53_threshold);
1448
1449 param.function = speed_mpn_toom42_mul;
1450 param.function2 = speed_mpn_toom63_mul;
1451 param.name = "MUL_TOOM42_TO_TOOM63_THRESHOLD";
1452 param.min_size = MPN_TOOM63_MUL_MINSIZE * 2;
1453 one (&thres, ¶m);
1454 mul_toom42_to_toom63_threshold = thres / 2;
1455 print_define ("MUL_TOOM42_TO_TOOM63_THRESHOLD", mul_toom42_to_toom63_threshold);
1456
1457 /* Use ratio 5/6 when measuring, the middle of the range 2/3 to 1. */
1458 param.function = speed_mpn_toom43_for_toom54_mul;
1459 param.function2 = speed_mpn_toom54_for_toom43_mul;
1460 param.name = "MUL_TOOM43_TO_TOOM54_THRESHOLD";
1461 param.min_size = MPN_TOOM54_MUL_MINSIZE * 6 / 5;
1462 one (&thres, ¶m);
1463 mul_toom43_to_toom54_threshold = thres * 5 / 6;
1464 print_define ("MUL_TOOM43_TO_TOOM54_THRESHOLD", mul_toom43_to_toom54_threshold);
1465 }
1466
1467
1468 void
tune_mullo(void)1469 tune_mullo (void)
1470 {
1471 static struct param_t param;
1472
1473 param.function = speed_mpn_mullo_n;
1474
1475 param.name = "MULLO_BASECASE_THRESHOLD";
1476 param.min_size = 2;
1477 param.min_is_always = 1;
1478 param.max_size = MULLO_BASECASE_THRESHOLD_LIMIT-1;
1479 param.stop_factor = 1.5;
1480 param.noprint = 1;
1481 one (&mullo_basecase_threshold, ¶m);
1482
1483 param.name = "MULLO_DC_THRESHOLD";
1484 param.min_size = 8;
1485 param.min_is_always = 0;
1486 param.max_size = 1000;
1487 one (&mullo_dc_threshold, ¶m);
1488
1489 if (mullo_basecase_threshold >= mullo_dc_threshold)
1490 {
1491 print_define ("MULLO_BASECASE_THRESHOLD", mullo_dc_threshold);
1492 print_define_remark ("MULLO_DC_THRESHOLD", 0, "never mpn_mullo_basecase");
1493 }
1494 else
1495 {
1496 print_define ("MULLO_BASECASE_THRESHOLD", mullo_basecase_threshold);
1497 print_define ("MULLO_DC_THRESHOLD", mullo_dc_threshold);
1498 }
1499
1500 if (WANT_FFT && mul_fft_threshold < MP_SIZE_T_MAX / 2)
1501 {
1502 param.name = "MULLO_MUL_N_THRESHOLD";
1503 param.min_size = mullo_dc_threshold;
1504 param.max_size = 2 * mul_fft_threshold;
1505 param.noprint = 0;
1506 param.step_factor = 0.03;
1507 one (&mullo_mul_n_threshold, ¶m);
1508 }
1509 else
1510 print_define_remark ("MULLO_MUL_N_THRESHOLD", MP_SIZE_T_MAX,
1511 "without FFT use mullo forever");
1512 }
1513
1514 void
tune_sqrlo(void)1515 tune_sqrlo (void)
1516 {
1517 static struct param_t param;
1518
1519 param.function = speed_mpn_sqrlo;
1520
1521 param.name = "SQRLO_BASECASE_THRESHOLD";
1522 param.min_size = 2;
1523 param.min_is_always = 1;
1524 param.max_size = SQRLO_BASECASE_THRESHOLD_LIMIT-1;
1525 param.stop_factor = 1.5;
1526 param.noprint = 1;
1527 one (&sqrlo_basecase_threshold, ¶m);
1528
1529 param.name = "SQRLO_DC_THRESHOLD";
1530 param.min_size = 8;
1531 param.min_is_always = 0;
1532 param.max_size = SQRLO_DC_THRESHOLD_LIMIT-1;
1533 one (&sqrlo_dc_threshold, ¶m);
1534
1535 if (sqrlo_basecase_threshold >= sqrlo_dc_threshold)
1536 {
1537 print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_dc_threshold);
1538 print_define_remark ("SQRLO_DC_THRESHOLD", 0, "never mpn_sqrlo_basecase");
1539 }
1540 else
1541 {
1542 print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_basecase_threshold);
1543 print_define ("SQRLO_DC_THRESHOLD", sqrlo_dc_threshold);
1544 }
1545
1546 if (WANT_FFT && sqr_fft_threshold < MP_SIZE_T_MAX / 2)
1547 {
1548 param.name = "SQRLO_SQR_THRESHOLD";
1549 param.min_size = sqrlo_dc_threshold;
1550 param.max_size = 2 * sqr_fft_threshold;
1551 param.noprint = 0;
1552 param.step_factor = 0.03;
1553 one (&sqrlo_sqr_threshold, ¶m);
1554 }
1555 else
1556 print_define_remark ("SQRLO_SQR_THRESHOLD", MP_SIZE_T_MAX,
1557 "without FFT use sqrlo forever");
1558 }
1559
1560 void
tune_mulmid(void)1561 tune_mulmid (void)
1562 {
1563 static struct param_t param;
1564
1565 param.name = "MULMID_TOOM42_THRESHOLD";
1566 param.function = speed_mpn_mulmid_n;
1567 param.min_size = 4;
1568 param.max_size = 100;
1569 one (&mulmid_toom42_threshold, ¶m);
1570 }
1571
1572 void
tune_mulmod_bnm1(void)1573 tune_mulmod_bnm1 (void)
1574 {
1575 static struct param_t param;
1576
1577 param.name = "MULMOD_BNM1_THRESHOLD";
1578 param.function = speed_mpn_mulmod_bnm1;
1579 param.min_size = 4;
1580 param.max_size = 100;
1581 one (&mulmod_bnm1_threshold, ¶m);
1582 }
1583
1584 void
tune_sqrmod_bnm1(void)1585 tune_sqrmod_bnm1 (void)
1586 {
1587 static struct param_t param;
1588
1589 param.name = "SQRMOD_BNM1_THRESHOLD";
1590 param.function = speed_mpn_sqrmod_bnm1;
1591 param.min_size = 4;
1592 param.max_size = 100;
1593 one (&sqrmod_bnm1_threshold, ¶m);
1594 }
1595
1596
1597 /* Start the basecase from 3, since 1 is a special case, and if mul_basecase
1598 is faster only at size==2 then we don't want to bother with extra code
1599 just for that. Start karatsuba from 4 same as MUL above. */
1600
1601 void
tune_sqr(void)1602 tune_sqr (void)
1603 {
1604 /* disabled until tuned */
1605 SQR_FFT_THRESHOLD = MP_SIZE_T_MAX;
1606
1607 if (HAVE_NATIVE_mpn_sqr_basecase)
1608 {
1609 print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)");
1610 sqr_basecase_threshold = 0;
1611 }
1612 else
1613 {
1614 static struct param_t param;
1615 param.name = "SQR_BASECASE_THRESHOLD";
1616 param.function = speed_mpn_sqr;
1617 param.min_size = 3;
1618 param.min_is_always = 1;
1619 param.max_size = TUNE_SQR_TOOM2_MAX;
1620 param.noprint = 1;
1621 one (&sqr_basecase_threshold, ¶m);
1622 }
1623
1624 {
1625 static struct param_t param;
1626 param.name = "SQR_TOOM2_THRESHOLD";
1627 param.function = speed_mpn_sqr;
1628 param.min_size = MAX (4, MPN_TOOM2_SQR_MINSIZE);
1629 param.max_size = TUNE_SQR_TOOM2_MAX;
1630 param.noprint = 1;
1631 one (&sqr_toom2_threshold, ¶m);
1632
1633 if (! HAVE_NATIVE_mpn_sqr_basecase
1634 && sqr_toom2_threshold < sqr_basecase_threshold)
1635 {
1636 /* Karatsuba becomes faster than mul_basecase before
1637 sqr_basecase does. Arrange for the expression
1638 "BELOW_THRESHOLD (un, SQR_TOOM2_THRESHOLD))" which
1639 selects mpn_sqr_basecase in mpn_sqr to be false, by setting
1640 SQR_TOOM2_THRESHOLD to zero, making
1641 SQR_BASECASE_THRESHOLD the toom2 threshold. */
1642
1643 sqr_basecase_threshold = SQR_TOOM2_THRESHOLD;
1644 SQR_TOOM2_THRESHOLD = 0;
1645
1646 print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold,
1647 "toom2");
1648 print_define_remark ("SQR_TOOM2_THRESHOLD",SQR_TOOM2_THRESHOLD,
1649 "never sqr_basecase");
1650 }
1651 else
1652 {
1653 if (! HAVE_NATIVE_mpn_sqr_basecase)
1654 print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold);
1655 print_define ("SQR_TOOM2_THRESHOLD", SQR_TOOM2_THRESHOLD);
1656 }
1657 }
1658
1659 {
1660 static struct param_t param;
1661 mp_size_t next_toom_start;
1662 int something_changed;
1663
1664 param.function = speed_mpn_sqr;
1665 param.noprint = 1;
1666
1667 /* Threshold sequence loop. Disable functions that would be used in a very
1668 narrow range, re-measuring things when that happens. */
1669 something_changed = 1;
1670 while (something_changed)
1671 {
1672 something_changed = 0;
1673
1674 next_toom_start = MAX (sqr_toom2_threshold, sqr_basecase_threshold);
1675
1676 sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT;
1677 param.name = "SQR_TOOM3_THRESHOLD";
1678 param.min_size = MAX (next_toom_start, MPN_TOOM3_SQR_MINSIZE);
1679 param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1;
1680 one (&sqr_toom3_threshold, ¶m);
1681
1682 next_toom_start = MAX (next_toom_start, sqr_toom3_threshold);
1683
1684 if (sqr_toom4_threshold != 0)
1685 {
1686 param.name = "SQR_TOOM4_THRESHOLD";
1687 sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT;
1688 param.min_size = MAX (next_toom_start, MPN_TOOM4_SQR_MINSIZE);
1689 param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1;
1690 one (&sqr_toom4_threshold, ¶m);
1691
1692 if (next_toom_start * 1.05 >= sqr_toom4_threshold)
1693 {
1694 sqr_toom4_threshold = 0;
1695 something_changed = 1;
1696 }
1697 }
1698
1699 next_toom_start = MAX (next_toom_start, sqr_toom4_threshold);
1700
1701 if (sqr_toom6_threshold != 0)
1702 {
1703 param.name = "SQR_TOOM6_THRESHOLD";
1704 sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT;
1705 param.min_size = MAX (next_toom_start, MPN_TOOM6_SQR_MINSIZE);
1706 param.max_size = SQR_TOOM6_THRESHOLD_LIMIT-1;
1707 one (&sqr_toom6_threshold, ¶m);
1708
1709 if (next_toom_start * 1.05 >= sqr_toom6_threshold)
1710 {
1711 sqr_toom6_threshold = 0;
1712 something_changed = 1;
1713 }
1714 }
1715
1716 next_toom_start = MAX (next_toom_start, sqr_toom6_threshold);
1717
1718 if (sqr_toom8_threshold != 0)
1719 {
1720 param.name = "SQR_TOOM8_THRESHOLD";
1721 sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT;
1722 param.min_size = MAX (next_toom_start, MPN_TOOM8_SQR_MINSIZE);
1723 param.max_size = SQR_TOOM8_THRESHOLD_LIMIT-1;
1724 one (&sqr_toom8_threshold, ¶m);
1725
1726 if (next_toom_start * 1.05 >= sqr_toom8_threshold)
1727 {
1728 sqr_toom8_threshold = 0;
1729 something_changed = 1;
1730 }
1731 }
1732 }
1733
1734 print_define ("SQR_TOOM3_THRESHOLD", SQR_TOOM3_THRESHOLD);
1735 print_define ("SQR_TOOM4_THRESHOLD", SQR_TOOM4_THRESHOLD);
1736 print_define ("SQR_TOOM6_THRESHOLD", SQR_TOOM6_THRESHOLD);
1737 print_define ("SQR_TOOM8_THRESHOLD", SQR_TOOM8_THRESHOLD);
1738 }
1739 }
1740
1741
1742 void
tune_dc_div(void)1743 tune_dc_div (void)
1744 {
1745 s.r = 0; /* clear to make speed function do 2n/n */
1746 {
1747 static struct param_t param;
1748 param.name = "DC_DIV_QR_THRESHOLD";
1749 param.function = speed_mpn_sbpi1_div_qr;
1750 param.function2 = speed_mpn_dcpi1_div_qr;
1751 param.min_size = 6;
1752 one (&dc_div_qr_threshold, ¶m);
1753 }
1754 {
1755 static struct param_t param;
1756 param.name = "DC_DIVAPPR_Q_THRESHOLD";
1757 param.function = speed_mpn_sbpi1_divappr_q;
1758 param.function2 = speed_mpn_dcpi1_divappr_q;
1759 param.min_size = 6;
1760 one (&dc_divappr_q_threshold, ¶m);
1761 }
1762 }
1763
1764 static double
speed_mpn_sbordcpi1_div_qr(struct speed_params * s)1765 speed_mpn_sbordcpi1_div_qr (struct speed_params *s)
1766 {
1767 if (s->size < DC_DIV_QR_THRESHOLD)
1768 return speed_mpn_sbpi1_div_qr (s);
1769 else
1770 return speed_mpn_dcpi1_div_qr (s);
1771 }
1772
1773 void
tune_mu_div(void)1774 tune_mu_div (void)
1775 {
1776 s.r = 0; /* clear to make speed function do 2n/n */
1777 {
1778 static struct param_t param;
1779 param.name = "MU_DIV_QR_THRESHOLD";
1780 param.function = speed_mpn_dcpi1_div_qr;
1781 param.function2 = speed_mpn_mu_div_qr;
1782 param.min_size = mul_toom22_threshold;
1783 param.max_size = 5000;
1784 param.step_factor = 0.02;
1785 one (&mu_div_qr_threshold, ¶m);
1786 }
1787 {
1788 static struct param_t param;
1789 param.name = "MU_DIVAPPR_Q_THRESHOLD";
1790 param.function = speed_mpn_dcpi1_divappr_q;
1791 param.function2 = speed_mpn_mu_divappr_q;
1792 param.min_size = mul_toom22_threshold;
1793 param.max_size = 5000;
1794 param.step_factor = 0.02;
1795 one (&mu_divappr_q_threshold, ¶m);
1796 }
1797 {
1798 static struct param_t param;
1799 param.name = "MUPI_DIV_QR_THRESHOLD";
1800 param.function = speed_mpn_sbordcpi1_div_qr;
1801 param.function2 = speed_mpn_mupi_div_qr;
1802 param.min_size = 6;
1803 param.min_is_always = 1;
1804 param.max_size = 1000;
1805 param.step_factor = 0.02;
1806 one (&mupi_div_qr_threshold, ¶m);
1807 }
1808 }
1809
1810 void
tune_dc_bdiv(void)1811 tune_dc_bdiv (void)
1812 {
1813 s.r = 0; /* clear to make speed function do 2n/n*/
1814 {
1815 static struct param_t param;
1816 param.name = "DC_BDIV_QR_THRESHOLD";
1817 param.function = speed_mpn_sbpi1_bdiv_qr;
1818 param.function2 = speed_mpn_dcpi1_bdiv_qr;
1819 param.min_size = 4;
1820 one (&dc_bdiv_qr_threshold, ¶m);
1821 }
1822 {
1823 static struct param_t param;
1824 param.name = "DC_BDIV_Q_THRESHOLD";
1825 param.function = speed_mpn_sbpi1_bdiv_q;
1826 param.function2 = speed_mpn_dcpi1_bdiv_q;
1827 param.min_size = 4;
1828 one (&dc_bdiv_q_threshold, ¶m);
1829 }
1830 }
1831
1832 void
tune_mu_bdiv(void)1833 tune_mu_bdiv (void)
1834 {
1835 s.r = 0; /* clear to make speed function do 2n/n*/
1836 {
1837 static struct param_t param;
1838 param.name = "MU_BDIV_QR_THRESHOLD";
1839 param.function = speed_mpn_dcpi1_bdiv_qr;
1840 param.function2 = speed_mpn_mu_bdiv_qr;
1841 param.min_size = dc_bdiv_qr_threshold;
1842 param.max_size = 5000;
1843 param.step_factor = 0.02;
1844 one (&mu_bdiv_qr_threshold, ¶m);
1845 }
1846 {
1847 static struct param_t param;
1848 param.name = "MU_BDIV_Q_THRESHOLD";
1849 param.function = speed_mpn_dcpi1_bdiv_q;
1850 param.function2 = speed_mpn_mu_bdiv_q;
1851 param.min_size = dc_bdiv_q_threshold;
1852 param.max_size = 5000;
1853 param.step_factor = 0.02;
1854 one (&mu_bdiv_q_threshold, ¶m);
1855 }
1856 }
1857
1858 void
tune_invertappr(void)1859 tune_invertappr (void)
1860 {
1861 static struct param_t param;
1862
1863 param.function = speed_mpn_ni_invertappr;
1864 param.name = "INV_MULMOD_BNM1_THRESHOLD";
1865 param.min_size = 5;
1866 one (&inv_mulmod_bnm1_threshold, ¶m);
1867
1868 param.function = speed_mpn_invertappr;
1869 param.name = "INV_NEWTON_THRESHOLD";
1870 param.min_size = 5;
1871 one (&inv_newton_threshold, ¶m);
1872 }
1873
1874 void
tune_invert(void)1875 tune_invert (void)
1876 {
1877 static struct param_t param;
1878
1879 param.function = speed_mpn_invert;
1880 param.name = "INV_APPR_THRESHOLD";
1881 param.min_size = 5;
1882 one (&inv_appr_threshold, ¶m);
1883 }
1884
1885 void
tune_binvert(void)1886 tune_binvert (void)
1887 {
1888 static struct param_t param;
1889
1890 param.function = speed_mpn_binvert;
1891 param.name = "BINV_NEWTON_THRESHOLD";
1892 param.min_size = 8; /* pointless with smaller operands */
1893 one (&binv_newton_threshold, ¶m);
1894 }
1895
1896 void
tune_redc(void)1897 tune_redc (void)
1898 {
1899 #define TUNE_REDC_2_MAX 100
1900 #if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
1901 #define WANT_REDC_2 1
1902 #endif
1903
1904 #if WANT_REDC_2
1905 {
1906 static struct param_t param;
1907 param.name = "REDC_1_TO_REDC_2_THRESHOLD";
1908 param.function = speed_mpn_redc_1;
1909 param.function2 = speed_mpn_redc_2;
1910 param.min_size = 1;
1911 param.min_is_always = 1;
1912 param.max_size = TUNE_REDC_2_MAX;
1913 param.noprint = 1;
1914 param.stop_factor = 1.5;
1915 one (&redc_1_to_redc_2_threshold, ¶m);
1916 }
1917 {
1918 static struct param_t param;
1919 param.name = "REDC_2_TO_REDC_N_THRESHOLD";
1920 param.function = speed_mpn_redc_2;
1921 param.function2 = speed_mpn_redc_n;
1922 param.min_size = 16;
1923 param.noprint = 1;
1924 one (&redc_2_to_redc_n_threshold, ¶m);
1925 }
1926 if (redc_1_to_redc_2_threshold >= redc_2_to_redc_n_threshold)
1927 {
1928 redc_2_to_redc_n_threshold = 0; /* disable redc_2 */
1929
1930 /* Never use redc2, measure redc_1 -> redc_n cutoff, store result as
1931 REDC_1_TO_REDC_2_THRESHOLD. */
1932 {
1933 static struct param_t param;
1934 param.name = "REDC_1_TO_REDC_2_THRESHOLD";
1935 param.function = speed_mpn_redc_1;
1936 param.function2 = speed_mpn_redc_n;
1937 param.min_size = 16;
1938 param.noprint = 1;
1939 one (&redc_1_to_redc_2_threshold, ¶m);
1940 }
1941 }
1942 print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_1_TO_REDC_2_THRESHOLD);
1943 print_define ("REDC_2_TO_REDC_N_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD);
1944 #else
1945 {
1946 static struct param_t param;
1947 param.name = "REDC_1_TO_REDC_N_THRESHOLD";
1948 param.function = speed_mpn_redc_1;
1949 param.function2 = speed_mpn_redc_n;
1950 param.min_size = 16;
1951 one (&redc_1_to_redc_n_threshold, ¶m);
1952 }
1953 #endif
1954 }
1955
1956 void
tune_matrix22_mul(void)1957 tune_matrix22_mul (void)
1958 {
1959 static struct param_t param;
1960 param.name = "MATRIX22_STRASSEN_THRESHOLD";
1961 param.function = speed_mpn_matrix22_mul;
1962 param.min_size = 2;
1963 one (&matrix22_strassen_threshold, ¶m);
1964 }
1965
1966 void
tune_hgcd2(void)1967 tune_hgcd2 (void)
1968 {
1969 static struct param_t param;
1970 hgcd2_func_t *f[5] =
1971 { mpn_hgcd2_1,
1972 mpn_hgcd2_2,
1973 mpn_hgcd2_3,
1974 mpn_hgcd2_4,
1975 mpn_hgcd2_5 };
1976 speed_function_t speed_f[5] =
1977 { speed_mpn_hgcd2_1,
1978 speed_mpn_hgcd2_2,
1979 speed_mpn_hgcd2_3,
1980 speed_mpn_hgcd2_4,
1981 speed_mpn_hgcd2_5 };
1982 int best;
1983
1984 s.size = 1;
1985 best = one_method (5, speed_f, "mpn_hgcd2", "HGCD2_DIV1_METHOD", ¶m);
1986
1987 /* Use selected function when tuning hgcd and gcd */
1988 hgcd2_func = f[best];
1989 }
1990
1991 void
tune_hgcd(void)1992 tune_hgcd (void)
1993 {
1994 static struct param_t param;
1995 param.name = "HGCD_THRESHOLD";
1996 param.function = speed_mpn_hgcd;
1997 /* We seem to get strange results for small sizes */
1998 param.min_size = 30;
1999 one (&hgcd_threshold, ¶m);
2000 }
2001
2002 void
tune_hgcd_appr(void)2003 tune_hgcd_appr (void)
2004 {
2005 static struct param_t param;
2006 param.name = "HGCD_APPR_THRESHOLD";
2007 param.function = speed_mpn_hgcd_appr;
2008 /* We seem to get strange results for small sizes */
2009 param.min_size = 50;
2010 param.stop_since_change = 150;
2011 one (&hgcd_appr_threshold, ¶m);
2012 }
2013
2014 void
tune_hgcd_reduce(void)2015 tune_hgcd_reduce (void)
2016 {
2017 static struct param_t param;
2018 param.name = "HGCD_REDUCE_THRESHOLD";
2019 param.function = speed_mpn_hgcd_reduce;
2020 param.min_size = 30;
2021 param.max_size = 7000;
2022 param.step_factor = 0.04;
2023 one (&hgcd_reduce_threshold, ¶m);
2024 }
2025
2026 void
tune_gcd_dc(void)2027 tune_gcd_dc (void)
2028 {
2029 static struct param_t param;
2030 param.name = "GCD_DC_THRESHOLD";
2031 param.function = speed_mpn_gcd;
2032 param.min_size = hgcd_threshold;
2033 param.max_size = 3000;
2034 param.step_factor = 0.02;
2035 one (&gcd_dc_threshold, ¶m);
2036 }
2037
2038 void
tune_gcdext_dc(void)2039 tune_gcdext_dc (void)
2040 {
2041 static struct param_t param;
2042 param.name = "GCDEXT_DC_THRESHOLD";
2043 param.function = speed_mpn_gcdext;
2044 param.min_size = hgcd_threshold;
2045 param.max_size = 3000;
2046 param.step_factor = 0.02;
2047 one (&gcdext_dc_threshold, ¶m);
2048 }
2049
2050 /* In tune_powm_sec we compute the table used by the win_size function. The
2051 cutoff points are in exponent bits, disregarding other operand sizes. It is
2052 not possible to use the one framework since it currently uses a granularity
2053 of full limbs.
2054 */
2055
2056 /* This win_size replaces the variant in the powm code, allowing us to
2057 control k in the k-ary algorithms. */
2058 int winsize;
2059 int
win_size(mp_bitcnt_t eb)2060 win_size (mp_bitcnt_t eb)
2061 {
2062 return winsize;
2063 }
2064
2065 void
tune_powm_sec(void)2066 tune_powm_sec (void)
2067 {
2068 mp_size_t n;
2069 int k, i;
2070 mp_size_t itch;
2071 mp_bitcnt_t nbits, nbits_next, possible_nbits_cutoff;
2072 const int n_max = 3000 / GMP_NUMB_BITS;
2073 #define n_measurements 5
2074 mp_ptr rp, bp, ep, mp, tp;
2075 double ttab[n_measurements], tk, tkp1;
2076 TMP_DECL;
2077 TMP_MARK;
2078
2079 possible_nbits_cutoff = 0;
2080
2081 k = 1;
2082
2083 winsize = 10; /* the itch function needs this */
2084 itch = mpn_sec_powm_itch (n_max, n_max * GMP_NUMB_BITS, n_max);
2085
2086 rp = TMP_ALLOC_LIMBS (n_max);
2087 bp = TMP_ALLOC_LIMBS (n_max);
2088 ep = TMP_ALLOC_LIMBS (n_max);
2089 mp = TMP_ALLOC_LIMBS (n_max);
2090 tp = TMP_ALLOC_LIMBS (itch);
2091
2092 mpn_random (bp, n_max);
2093 mpn_random (mp, n_max);
2094 mp[0] |= 1;
2095
2096 /* How about taking the M operand size into account?
2097
2098 An operation R=powm(B,E,N) will take time O(log(E)*M(log(N))) (assuming
2099 B = O(M)).
2100
2101 Using k-ary and no sliding window, the precomputation will need time
2102 O(2^(k-1)*M(log(N))) and the main computation will need O(log(E)*S(N)) +
2103 O(log(E)/k*M(N)), for the squarings, multiplications, respectively.
2104
2105 An operation R=powm_sec(B,E,N) will take time like powm.
2106
2107 Using k-ary, the precomputation will need time O(2^k*M(log(N))) and the
2108 main computation will need O(log(E)*S(N)) + O(log(E)/k*M(N)) +
2109 O(log(E)/k*2^k*log(N)), for the squarings, multiplications, and full
2110 table reads, respectively. */
2111
2112 printf ("#define POWM_SEC_TABLE ");
2113
2114 /* For nbits == 1, we should always use k == 1, so no need to tune
2115 that. Starting with nbits == 2 also ensure that nbits always is
2116 larger than the windowsize k+1. */
2117 for (nbits = 2; nbits <= n_max * GMP_NUMB_BITS; )
2118 {
2119 n = (nbits - 1) / GMP_NUMB_BITS + 1;
2120
2121 /* Generate E such that sliding-window for k and k+1 works equally
2122 well/poorly (but sliding is not used in powm_sec, of course). */
2123 for (i = 0; i < n; i++)
2124 ep[i] = ~CNST_LIMB(0);
2125
2126 winsize = k;
2127 for (i = 0; i < n_measurements; i++)
2128 {
2129 speed_starttime ();
2130 mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp);
2131 ttab[i] = speed_endtime ();
2132 }
2133 tk = median (ttab, n_measurements);
2134
2135 winsize = k + 1;
2136 speed_starttime ();
2137 for (i = 0; i < n_measurements; i++)
2138 {
2139 speed_starttime ();
2140 mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp);
2141 ttab[i] = speed_endtime ();
2142 }
2143 tkp1 = median (ttab, n_measurements);
2144 /*
2145 printf ("testing: %ld, %d", nbits, k, ep[n-1]);
2146 printf (" %10.5f %10.5f\n", tk, tkp1);
2147 */
2148 if (tkp1 < tk)
2149 {
2150 if (possible_nbits_cutoff)
2151 {
2152 /* Two consecutive sizes indicate k increase, obey. */
2153
2154 /* Must always have x[k] >= k */
2155 ASSERT_ALWAYS (possible_nbits_cutoff >= k);
2156
2157 if (k > 1)
2158 printf (",");
2159 printf ("%ld", (long) possible_nbits_cutoff);
2160 k++;
2161 possible_nbits_cutoff = 0;
2162 }
2163 else
2164 {
2165 /* One measurement indicate k increase, save nbits for further
2166 consideration. */
2167 /* The new larger k gets used for sizes > the cutoff
2168 value, hence the cutoff should be one less than the
2169 smallest size where it gives a speedup. */
2170 possible_nbits_cutoff = nbits - 1;
2171 }
2172 }
2173 else
2174 possible_nbits_cutoff = 0;
2175
2176 nbits_next = nbits * 65 / 64;
2177 nbits = nbits_next + (nbits_next == nbits);
2178 }
2179 printf ("\n");
2180 TMP_FREE;
2181 }
2182
2183
2184 /* size_extra==1 reflects the fact that with high<divisor one division is
2185 always skipped. Forcing high<divisor while testing ensures consistency
2186 while stepping through sizes, ie. that size-1 divides will be done each
2187 time.
2188
2189 min_size==2 and min_is_always are used so that if plain division is only
2190 better at size==1 then don't bother including that code just for that
2191 case, instead go with preinv always and get a size saving. */
2192
2193 #define DIV_1_PARAMS \
2194 param.check_size = 256; \
2195 param.min_size = 2; \
2196 param.min_is_always = 1; \
2197 param.data_high = DATA_HIGH_LT_R; \
2198 param.size_extra = 1; \
2199 param.stop_factor = 2.0;
2200
2201
2202 double (*tuned_speed_mpn_divrem_1) (struct speed_params *);
2203
2204 void
tune_divrem_1(void)2205 tune_divrem_1 (void)
2206 {
2207 /* plain version by default */
2208 tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1;
2209
2210 /* No support for tuning native assembler code, do that by hand and put
2211 the results in the .asm file, there's no need for such thresholds to
2212 appear in gmp-mparam.h. */
2213 if (HAVE_NATIVE_mpn_divrem_1)
2214 return;
2215
2216 if (GMP_NAIL_BITS != 0)
2217 {
2218 print_define_remark ("DIVREM_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
2219 "no preinv with nails");
2220 print_define_remark ("DIVREM_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
2221 "no preinv with nails");
2222 return;
2223 }
2224
2225 if (UDIV_PREINV_ALWAYS)
2226 {
2227 print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always");
2228 print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L);
2229 return;
2230 }
2231
2232 tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune;
2233
2234 /* Tune for the integer part of mpn_divrem_1. This will very possibly be
2235 a bit out for the fractional part, but that's too bad, the integer part
2236 is more important. */
2237 {
2238 static struct param_t param;
2239 param.name = "DIVREM_1_NORM_THRESHOLD";
2240 DIV_1_PARAMS;
2241 s.r = randlimb_norm ();
2242 param.function = speed_mpn_divrem_1_tune;
2243 one (&divrem_1_norm_threshold, ¶m);
2244 }
2245 {
2246 static struct param_t param;
2247 param.name = "DIVREM_1_UNNORM_THRESHOLD";
2248 DIV_1_PARAMS;
2249 s.r = randlimb_half ();
2250 param.function = speed_mpn_divrem_1_tune;
2251 one (&divrem_1_unnorm_threshold, ¶m);
2252 }
2253 }
2254
2255 void
tune_div_qr_1(void)2256 tune_div_qr_1 (void)
2257 {
2258 if (!HAVE_NATIVE_mpn_div_qr_1n_pi1)
2259 {
2260 static struct param_t param;
2261 speed_function_t f[2] =
2262 {
2263 speed_mpn_div_qr_1n_pi1_1,
2264 speed_mpn_div_qr_1n_pi1_2,
2265 };
2266
2267 s.size = 10;
2268 s.r = randlimb_norm ();
2269
2270 one_method (2, f, "mpn_div_qr_1n_pi1", "DIV_QR_1N_PI1_METHOD", ¶m);
2271 }
2272
2273 {
2274 static struct param_t param;
2275 param.name = "DIV_QR_1_NORM_THRESHOLD";
2276 DIV_1_PARAMS;
2277 param.min_size = 1;
2278 param.min_is_always = 0;
2279 s.r = randlimb_norm ();
2280 param.function = speed_mpn_div_qr_1_tune;
2281 one (&div_qr_1_norm_threshold, ¶m);
2282 }
2283 {
2284 static struct param_t param;
2285 param.name = "DIV_QR_1_UNNORM_THRESHOLD";
2286 DIV_1_PARAMS;
2287 param.min_size = 1;
2288 param.min_is_always = 0;
2289 s.r = randlimb_half();
2290 param.function = speed_mpn_div_qr_1_tune;
2291 one (&div_qr_1_unnorm_threshold, ¶m);
2292 }
2293 }
2294
2295
2296 void
tune_mod_1(void)2297 tune_mod_1 (void)
2298 {
2299 /* No support for tuning native assembler code, do that by hand and put
2300 the results in the .asm file, there's no need for such thresholds to
2301 appear in gmp-mparam.h. */
2302 if (HAVE_NATIVE_mpn_mod_1)
2303 return;
2304
2305 if (GMP_NAIL_BITS != 0)
2306 {
2307 print_define_remark ("MOD_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
2308 "no preinv with nails");
2309 print_define_remark ("MOD_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
2310 "no preinv with nails");
2311 return;
2312 }
2313
2314 if (!HAVE_NATIVE_mpn_mod_1_1p)
2315 {
2316 static struct param_t param;
2317 speed_function_t f[2] =
2318 {
2319 speed_mpn_mod_1_1_1,
2320 speed_mpn_mod_1_1_2,
2321 };
2322
2323 s.size = 10;
2324 s.r = randlimb_half ();
2325 one_method (2, f, "mpn_mod_1_1", "MOD_1_1P_METHOD", ¶m);
2326 }
2327
2328 if (UDIV_PREINV_ALWAYS)
2329 {
2330 print_define ("MOD_1_NORM_THRESHOLD", 0L);
2331 print_define ("MOD_1_UNNORM_THRESHOLD", 0L);
2332 }
2333 else
2334 {
2335 {
2336 static struct param_t param;
2337 param.name = "MOD_1_NORM_THRESHOLD";
2338 DIV_1_PARAMS;
2339 s.r = randlimb_norm ();
2340 param.function = speed_mpn_mod_1_tune;
2341 one (&mod_1_norm_threshold, ¶m);
2342 }
2343 {
2344 static struct param_t param;
2345 param.name = "MOD_1_UNNORM_THRESHOLD";
2346 DIV_1_PARAMS;
2347 s.r = randlimb_half ();
2348 param.function = speed_mpn_mod_1_tune;
2349 one (&mod_1_unnorm_threshold, ¶m);
2350 }
2351 }
2352 {
2353 static struct param_t param;
2354
2355 param.check_size = 256;
2356
2357 s.r = randlimb_norm ();
2358 param.function = speed_mpn_mod_1_tune;
2359
2360 param.name = "MOD_1N_TO_MOD_1_1_THRESHOLD";
2361 param.min_size = 2;
2362 one (&mod_1n_to_mod_1_1_threshold, ¶m);
2363 }
2364
2365 {
2366 static struct param_t param;
2367
2368 param.check_size = 256;
2369 s.r = randlimb_half ();
2370 param.noprint = 1;
2371
2372 param.function = speed_mpn_mod_1_1;
2373 param.function2 = speed_mpn_mod_1_2;
2374 param.min_is_always = 1;
2375 param.name = "MOD_1_1_TO_MOD_1_2_THRESHOLD";
2376 param.min_size = 2;
2377 one (&mod_1_1_to_mod_1_2_threshold, ¶m);
2378
2379 param.function = speed_mpn_mod_1_2;
2380 param.function2 = speed_mpn_mod_1_4;
2381 param.min_is_always = 1;
2382 param.name = "MOD_1_2_TO_MOD_1_4_THRESHOLD";
2383 param.min_size = 1;
2384 one (&mod_1_2_to_mod_1_4_threshold, ¶m);
2385
2386 if (mod_1_1_to_mod_1_2_threshold >= mod_1_2_to_mod_1_4_threshold)
2387 {
2388 /* Never use mod_1_2, measure mod_1_1 -> mod_1_4 */
2389 mod_1_2_to_mod_1_4_threshold = 0;
2390
2391 param.function = speed_mpn_mod_1_1;
2392 param.function2 = speed_mpn_mod_1_4;
2393 param.min_is_always = 1;
2394 param.name = "MOD_1_1_TO_MOD_1_4_THRESHOLD fake";
2395 param.min_size = 2;
2396 one (&mod_1_1_to_mod_1_2_threshold, ¶m);
2397 }
2398
2399 param.function = speed_mpn_mod_1_tune;
2400 param.function2 = NULL;
2401 param.name = "MOD_1U_TO_MOD_1_1_THRESHOLD";
2402 param.min_size = 2;
2403 param.min_is_always = 0;
2404 one (&mod_1u_to_mod_1_1_threshold, ¶m);
2405
2406 if (mod_1u_to_mod_1_1_threshold >= mod_1_1_to_mod_1_2_threshold)
2407 mod_1_1_to_mod_1_2_threshold = 0;
2408 if (mod_1u_to_mod_1_1_threshold >= mod_1_2_to_mod_1_4_threshold)
2409 mod_1_2_to_mod_1_4_threshold = 0;
2410
2411 print_define_remark ("MOD_1U_TO_MOD_1_1_THRESHOLD", mod_1u_to_mod_1_1_threshold, NULL);
2412 print_define_remark ("MOD_1_1_TO_MOD_1_2_THRESHOLD", mod_1_1_to_mod_1_2_threshold,
2413 mod_1_1_to_mod_1_2_threshold == 0 ? "never mpn_mod_1_1p" : NULL);
2414 print_define_remark ("MOD_1_2_TO_MOD_1_4_THRESHOLD", mod_1_2_to_mod_1_4_threshold,
2415 mod_1_2_to_mod_1_4_threshold == 0 ? "never mpn_mod_1s_2p" : NULL);
2416 }
2417
2418 {
2419 static struct param_t param;
2420
2421 param.check_size = 256;
2422
2423 param.name = "PREINV_MOD_1_TO_MOD_1_THRESHOLD";
2424 s.r = randlimb_norm ();
2425 param.function = speed_mpn_preinv_mod_1;
2426 param.function2 = speed_mpn_mod_1_tune;
2427 param.min_size = 1;
2428 one (&preinv_mod_1_to_mod_1_threshold, ¶m);
2429 }
2430 }
2431
2432
2433 /* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would
2434 imply that udiv_qrnnd_preinv is worth using, but it seems most
2435 straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div
2436 directly. */
2437
2438 void
tune_preinv_divrem_1(void)2439 tune_preinv_divrem_1 (void)
2440 {
2441 static struct param_t param;
2442 speed_function_t divrem_1;
2443 const char *divrem_1_name;
2444 double t1, t2;
2445
2446 if (GMP_NAIL_BITS != 0)
2447 {
2448 print_define_remark ("USE_PREINV_DIVREM_1", 0, "no preinv with nails");
2449 return;
2450 }
2451
2452 /* Any native version of mpn_preinv_divrem_1 is assumed to exist because
2453 it's faster than mpn_divrem_1. */
2454 if (HAVE_NATIVE_mpn_preinv_divrem_1)
2455 {
2456 print_define_remark ("USE_PREINV_DIVREM_1", 1, "native");
2457 return;
2458 }
2459
2460 /* If udiv_qrnnd_preinv is the only division method then of course
2461 mpn_preinv_divrem_1 should be used. */
2462 if (UDIV_PREINV_ALWAYS)
2463 {
2464 print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always");
2465 return;
2466 }
2467
2468 /* If we've got an assembler version of mpn_divrem_1, then compare against
2469 that, not the mpn_divrem_1_div generic C. */
2470 if (HAVE_NATIVE_mpn_divrem_1)
2471 {
2472 divrem_1 = speed_mpn_divrem_1;
2473 divrem_1_name = "mpn_divrem_1";
2474 }
2475 else
2476 {
2477 divrem_1 = speed_mpn_divrem_1_div;
2478 divrem_1_name = "mpn_divrem_1_div";
2479 }
2480
2481 param.data_high = DATA_HIGH_LT_R; /* allow skip one division */
2482 s.size = 200; /* generous but not too big */
2483 /* Divisor, nonzero. Unnormalized so as to exercise the shift!=0 case,
2484 since in general that's probably most common, though in fact for a
2485 64-bit limb mp_bases[10].big_base is normalized. */
2486 s.r = urandom() & (GMP_NUMB_MASK >> 4);
2487 if (s.r == 0) s.r = 123;
2488
2489 t1 = tuneup_measure (speed_mpn_preinv_divrem_1, ¶m, &s);
2490 t2 = tuneup_measure (divrem_1, ¶m, &s);
2491 if (t1 == -1.0 || t2 == -1.0)
2492 {
2493 printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n",
2494 divrem_1_name, (long) s.size);
2495 abort ();
2496 }
2497 if (option_trace >= 1)
2498 printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n",
2499 (long) s.size, t1, divrem_1_name, t2);
2500
2501 print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL);
2502 }
2503
2504
2505
2506 void
tune_divrem_2(void)2507 tune_divrem_2 (void)
2508 {
2509 static struct param_t param;
2510
2511 /* No support for tuning native assembler code, do that by hand and put
2512 the results in the .asm file, and there's no need for such thresholds
2513 to appear in gmp-mparam.h. */
2514 if (HAVE_NATIVE_mpn_divrem_2)
2515 return;
2516
2517 if (GMP_NAIL_BITS != 0)
2518 {
2519 print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX,
2520 "no preinv with nails");
2521 return;
2522 }
2523
2524 if (UDIV_PREINV_ALWAYS)
2525 {
2526 print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always");
2527 return;
2528 }
2529
2530 /* Tune for the integer part of mpn_divrem_2. This will very possibly be
2531 a bit out for the fractional part, but that's too bad, the integer part
2532 is more important.
2533
2534 min_size must be >=2 since nsize>=2 is required, but is set to 4 to save
2535 code space if plain division is better only at size==2 or size==3. */
2536 param.name = "DIVREM_2_THRESHOLD";
2537 param.check_size = 256;
2538 param.min_size = 4;
2539 param.min_is_always = 1;
2540 param.size_extra = 2; /* does qsize==nsize-2 divisions */
2541 param.stop_factor = 2.0;
2542
2543 s.r = randlimb_norm ();
2544 param.function = speed_mpn_divrem_2;
2545 one (&divrem_2_threshold, ¶m);
2546 }
2547
2548 void
tune_div_qr_2(void)2549 tune_div_qr_2 (void)
2550 {
2551 static struct param_t param;
2552 param.name = "DIV_QR_2_PI2_THRESHOLD";
2553 param.function = speed_mpn_div_qr_2n;
2554 param.check_size = 500;
2555 param.min_size = 4;
2556 one (&div_qr_2_pi2_threshold, ¶m);
2557 }
2558
2559 /* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so
2560 tune for that. Its speed can differ on odd or even divisor, so take an
2561 average threshold for the two.
2562
2563 mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1
2564 might not vary that way, but don't test this since high<divisor isn't
2565 expected to occur often with small divisors. */
2566
2567 void
tune_divexact_1(void)2568 tune_divexact_1 (void)
2569 {
2570 static struct param_t param;
2571 mp_size_t thresh[2], average;
2572 int low, i;
2573
2574 /* Any native mpn_divexact_1 is assumed to incorporate all the speed of a
2575 full mpn_divrem_1. */
2576 if (HAVE_NATIVE_mpn_divexact_1)
2577 {
2578 print_define_remark ("DIVEXACT_1_THRESHOLD", 0, "always (native)");
2579 return;
2580 }
2581
2582 ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL);
2583
2584 param.name = "DIVEXACT_1_THRESHOLD";
2585 param.data_high = DATA_HIGH_GE_R;
2586 param.check_size = 256;
2587 param.min_size = 2;
2588 param.stop_factor = 1.5;
2589 param.function = tuned_speed_mpn_divrem_1;
2590 param.function2 = speed_mpn_divexact_1;
2591 param.noprint = 1;
2592
2593 print_define_start (param.name);
2594
2595 for (low = 0; low <= 1; low++)
2596 {
2597 s.r = randlimb_half();
2598 if (low == 0)
2599 s.r |= 1;
2600 else
2601 s.r &= ~CNST_LIMB(7);
2602
2603 one (&thresh[low], ¶m);
2604 if (option_trace)
2605 printf ("low=%d thresh %ld\n", low, (long) thresh[low]);
2606
2607 if (thresh[low] == MP_SIZE_T_MAX)
2608 {
2609 average = MP_SIZE_T_MAX;
2610 goto divexact_1_done;
2611 }
2612 }
2613
2614 if (option_trace)
2615 {
2616 printf ("average of:");
2617 for (i = 0; i < numberof(thresh); i++)
2618 printf (" %ld", (long) thresh[i]);
2619 printf ("\n");
2620 }
2621
2622 average = 0;
2623 for (i = 0; i < numberof(thresh); i++)
2624 average += thresh[i];
2625 average /= numberof(thresh);
2626
2627 /* If divexact turns out to be better as early as 3 limbs, then use it
2628 always, so as to reduce code size and conditional jumps. */
2629 if (average <= 3)
2630 average = 0;
2631
2632 divexact_1_done:
2633 print_define_end (param.name, average);
2634 }
2635
2636
2637 /* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the
2638 same as mpn_mod_1, but this might not be true of an assembler
2639 implementation. The threshold used is an average based on data where a
2640 divide can be skipped and where it can't.
2641
2642 If modexact turns out to be better as early as 3 limbs, then use it
2643 always, so as to reduce code size and conditional jumps. */
2644
2645 void
tune_modexact_1_odd(void)2646 tune_modexact_1_odd (void)
2647 {
2648 static struct param_t param;
2649 mp_size_t thresh_lt, thresh_ge, average;
2650
2651 #if 0
2652 /* Any native mpn_modexact_1_odd is assumed to incorporate all the speed
2653 of a full mpn_mod_1. */
2654 if (HAVE_NATIVE_mpn_modexact_1_odd)
2655 {
2656 print_define_remark ("BMOD_1_TO_MOD_1_THRESHOLD", MP_SIZE_T_MAX, "always bmod_1");
2657 return;
2658 }
2659 #endif
2660
2661 param.name = "BMOD_1_TO_MOD_1_THRESHOLD";
2662 param.check_size = 256;
2663 param.min_size = 2;
2664 param.stop_factor = 1.5;
2665 param.function = speed_mpn_modexact_1c_odd;
2666 param.function2 = speed_mpn_mod_1_tune;
2667 param.noprint = 1;
2668 s.r = randlimb_half () | 1;
2669
2670 print_define_start (param.name);
2671
2672 param.data_high = DATA_HIGH_LT_R;
2673 one (&thresh_lt, ¶m);
2674 if (option_trace)
2675 printf ("lt thresh %ld\n", (long) thresh_lt);
2676
2677 average = thresh_lt;
2678 if (thresh_lt != MP_SIZE_T_MAX)
2679 {
2680 param.data_high = DATA_HIGH_GE_R;
2681 one (&thresh_ge, ¶m);
2682 if (option_trace)
2683 printf ("ge thresh %ld\n", (long) thresh_ge);
2684
2685 if (thresh_ge != MP_SIZE_T_MAX)
2686 {
2687 average = (thresh_ge + thresh_lt) / 2;
2688 if (thresh_ge <= 3)
2689 average = 0;
2690 }
2691 }
2692
2693 print_define_end (param.name, average);
2694 }
2695
2696
2697 void
tune_jacobi_base(void)2698 tune_jacobi_base (void)
2699 {
2700 static struct param_t param;
2701 speed_function_t f[4] =
2702 {
2703 speed_mpn_jacobi_base_1,
2704 speed_mpn_jacobi_base_2,
2705 speed_mpn_jacobi_base_3,
2706 speed_mpn_jacobi_base_4,
2707 };
2708
2709 s.size = GMP_LIMB_BITS * 3 / 4;
2710
2711 one_method (4, f, "mpn_jacobi_base", "JACOBI_BASE_METHOD", ¶m);
2712 }
2713
2714
2715 void
tune_get_str(void)2716 tune_get_str (void)
2717 {
2718 /* Tune for decimal, it being most common. Some rough testing suggests
2719 other bases are different, but not by very much. */
2720 s.r = 10;
2721 {
2722 static struct param_t param;
2723 GET_STR_PRECOMPUTE_THRESHOLD = 0;
2724 param.name = "GET_STR_DC_THRESHOLD";
2725 param.function = speed_mpn_get_str;
2726 param.min_size = 4;
2727 param.max_size = GET_STR_THRESHOLD_LIMIT;
2728 one (&get_str_dc_threshold, ¶m);
2729 }
2730 {
2731 static struct param_t param;
2732 param.name = "GET_STR_PRECOMPUTE_THRESHOLD";
2733 param.function = speed_mpn_get_str;
2734 param.min_size = GET_STR_DC_THRESHOLD;
2735 param.max_size = GET_STR_THRESHOLD_LIMIT;
2736 one (&get_str_precompute_threshold, ¶m);
2737 }
2738 }
2739
2740
2741 double
speed_mpn_pre_set_str(struct speed_params * s)2742 speed_mpn_pre_set_str (struct speed_params *s)
2743 {
2744 unsigned char *str;
2745 mp_ptr wp;
2746 mp_size_t wn;
2747 unsigned i;
2748 int base;
2749 double t;
2750 mp_ptr powtab_mem, tp;
2751 powers_t powtab[GMP_LIMB_BITS];
2752 mp_size_t un;
2753 int chars_per_limb;
2754 size_t n_pows;
2755 powers_t *pt;
2756 TMP_DECL;
2757
2758 SPEED_RESTRICT_COND (s->size >= 1);
2759
2760 base = s->r == 0 ? 10 : s->r;
2761 SPEED_RESTRICT_COND (base >= 2 && base <= 256);
2762
2763 TMP_MARK;
2764
2765 str = (unsigned char *) TMP_ALLOC (s->size);
2766 for (i = 0; i < s->size; i++)
2767 str[i] = s->xp[i] % base;
2768
2769 LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base);
2770 SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);
2771
2772 /* use this during development to check wn is big enough */
2773 /*
2774 ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn);
2775 */
2776
2777 speed_operand_src (s, (mp_ptr) str, s->size/GMP_LIMB_BYTES);
2778 speed_operand_dst (s, wp, wn);
2779 speed_cache_fill (s);
2780
2781 chars_per_limb = mp_bases[base].chars_per_limb;
2782 un = s->size / chars_per_limb + 1;
2783 powtab_mem = TMP_BALLOC_LIMBS (mpn_str_powtab_alloc (un));
2784 n_pows = mpn_compute_powtab (powtab, powtab_mem, un, base);
2785 pt = powtab + n_pows;
2786 tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un));
2787
2788 speed_starttime ();
2789 i = s->reps;
2790 do
2791 {
2792 mpn_pre_set_str (wp, str, s->size, pt, tp);
2793 }
2794 while (--i != 0);
2795 t = speed_endtime ();
2796
2797 TMP_FREE;
2798 return t;
2799 }
2800
2801 void
tune_set_str(void)2802 tune_set_str (void)
2803 {
2804 s.r = 10; /* decimal */
2805 {
2806 static struct param_t param;
2807 SET_STR_PRECOMPUTE_THRESHOLD = 0;
2808 param.step_factor = 0.01;
2809 param.name = "SET_STR_DC_THRESHOLD";
2810 param.function = speed_mpn_pre_set_str;
2811 param.min_size = 100;
2812 param.max_size = 50000;
2813 one (&set_str_dc_threshold, ¶m);
2814 }
2815 {
2816 static struct param_t param;
2817 param.step_factor = 0.02;
2818 param.name = "SET_STR_PRECOMPUTE_THRESHOLD";
2819 param.function = speed_mpn_set_str;
2820 param.min_size = SET_STR_DC_THRESHOLD;
2821 param.max_size = 100000;
2822 one (&set_str_precompute_threshold, ¶m);
2823 }
2824 }
2825
2826
2827 void
tune_fft_mul(void)2828 tune_fft_mul (void)
2829 {
2830 static struct fft_param_t param;
2831
2832 if (option_fft_max_size == 0)
2833 return;
2834
2835 param.table_name = "MUL_FFT_TABLE3";
2836 param.threshold_name = "MUL_FFT_THRESHOLD";
2837 param.p_threshold = &mul_fft_threshold;
2838 param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD";
2839 param.p_modf_threshold = &mul_fft_modf_threshold;
2840 param.first_size = MUL_TOOM33_THRESHOLD / 2;
2841 param.max_size = option_fft_max_size;
2842 param.function = speed_mpn_fft_mul;
2843 param.mul_modf_function = speed_mpn_mul_fft;
2844 param.mul_function = speed_mpn_mul_n;
2845 param.sqr = 0;
2846 fft (¶m);
2847 }
2848
2849
2850 void
tune_fft_sqr(void)2851 tune_fft_sqr (void)
2852 {
2853 static struct fft_param_t param;
2854
2855 if (option_fft_max_size == 0)
2856 return;
2857
2858 param.table_name = "SQR_FFT_TABLE3";
2859 param.threshold_name = "SQR_FFT_THRESHOLD";
2860 param.p_threshold = &sqr_fft_threshold;
2861 param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD";
2862 param.p_modf_threshold = &sqr_fft_modf_threshold;
2863 param.first_size = SQR_TOOM3_THRESHOLD / 2;
2864 param.max_size = option_fft_max_size;
2865 param.function = speed_mpn_fft_sqr;
2866 param.mul_modf_function = speed_mpn_mul_fft_sqr;
2867 param.mul_function = speed_mpn_sqr;
2868 param.sqr = 1;
2869 fft (¶m);
2870 }
2871
2872 void
tune_fac_ui(void)2873 tune_fac_ui (void)
2874 {
2875 static struct param_t param;
2876
2877 param.function = speed_mpz_fac_ui_tune;
2878
2879 param.name = "FAC_DSC_THRESHOLD";
2880 param.min_size = 70;
2881 param.max_size = FAC_DSC_THRESHOLD_LIMIT;
2882 one (&fac_dsc_threshold, ¶m);
2883
2884 param.name = "FAC_ODD_THRESHOLD";
2885 param.min_size = 22;
2886 param.stop_factor = 1.7;
2887 param.min_is_always = 1;
2888 one (&fac_odd_threshold, ¶m);
2889 }
2890
2891 void
all(void)2892 all (void)
2893 {
2894 time_t start_time, end_time;
2895 TMP_DECL;
2896
2897 TMP_MARK;
2898 SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0);
2899 SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0);
2900
2901 mpn_random (s.xp_block, SPEED_BLOCK_SIZE);
2902 mpn_random (s.yp_block, SPEED_BLOCK_SIZE);
2903
2904 fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST);
2905
2906 speed_time_init ();
2907 fprintf (stderr, "Using: %s\n", speed_time_string);
2908
2909 fprintf (stderr, "speed_precision %d", speed_precision);
2910 if (speed_unittime == 1.0)
2911 fprintf (stderr, ", speed_unittime 1 cycle");
2912 else
2913 fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime);
2914 if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
2915 fprintf (stderr, ", CPU freq unknown\n");
2916 else
2917 fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
2918
2919 fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n",
2920 DEFAULT_MAX_SIZE, (long) option_fft_max_size);
2921 fprintf (stderr, "\n");
2922
2923 time (&start_time);
2924 {
2925 struct tm *tp;
2926 tp = localtime (&start_time);
2927 printf ("/* Generated by tuneup.c, %d-%02d-%02d, ",
2928 tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
2929
2930 #ifdef __GNUC__
2931 /* gcc sub-minor version doesn't seem to come through as a define */
2932 printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__);
2933 #define PRINTED_COMPILER
2934 #endif
2935 #if defined (__SUNPRO_C)
2936 printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100);
2937 #define PRINTED_COMPILER
2938 #endif
2939 #if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION)
2940 /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */
2941 printf ("MIPSpro C %d.%d.%d */\n",
2942 _COMPILER_VERSION / 100,
2943 _COMPILER_VERSION / 10 % 10,
2944 _COMPILER_VERSION % 10);
2945 #define PRINTED_COMPILER
2946 #endif
2947 #if defined (__DECC) && defined (__DECC_VER)
2948 printf ("DEC C %d */\n", __DECC_VER);
2949 #define PRINTED_COMPILER
2950 #endif
2951 #if ! defined (PRINTED_COMPILER)
2952 printf ("system compiler */\n");
2953 #endif
2954 }
2955 printf ("\n");
2956
2957 tune_divrem_1 ();
2958 tune_mod_1 ();
2959 tune_preinv_divrem_1 ();
2960 tune_div_qr_1 ();
2961 #if 0
2962 tune_divrem_2 ();
2963 #endif
2964 tune_div_qr_2 ();
2965 tune_divexact_1 ();
2966 tune_modexact_1_odd ();
2967 printf("\n");
2968
2969 relspeed_div_1_vs_mul_1 ();
2970 printf("\n");
2971
2972 tune_mul_n ();
2973 printf("\n");
2974
2975 tune_mul ();
2976 printf("\n");
2977
2978 tune_sqr ();
2979 printf("\n");
2980
2981 tune_mulmid ();
2982 printf("\n");
2983
2984 tune_mulmod_bnm1 ();
2985 tune_sqrmod_bnm1 ();
2986 printf("\n");
2987
2988 tune_fft_mul ();
2989 printf("\n");
2990
2991 tune_fft_sqr ();
2992 printf ("\n");
2993
2994 tune_mullo ();
2995 tune_sqrlo ();
2996 printf("\n");
2997
2998 tune_dc_div ();
2999 tune_dc_bdiv ();
3000
3001 printf("\n");
3002 tune_invertappr ();
3003 tune_invert ();
3004 printf("\n");
3005
3006 tune_binvert ();
3007 tune_redc ();
3008 printf("\n");
3009
3010 tune_mu_div ();
3011 tune_mu_bdiv ();
3012 printf("\n");
3013
3014 tune_powm_sec ();
3015 printf("\n");
3016
3017 tune_get_str ();
3018 tune_set_str ();
3019 printf("\n");
3020
3021 tune_fac_ui ();
3022 printf("\n");
3023
3024 tune_matrix22_mul ();
3025 tune_hgcd2 ();
3026 tune_hgcd ();
3027 tune_hgcd_appr ();
3028 tune_hgcd_reduce();
3029 tune_gcd_dc ();
3030 tune_gcdext_dc ();
3031 tune_jacobi_base ();
3032 printf("\n");
3033
3034 time (&end_time);
3035 printf ("/* Tuneup completed successfully, took %ld seconds */\n",
3036 (long) (end_time - start_time));
3037
3038 TMP_FREE;
3039 }
3040
3041
3042 int
main(int argc,char * argv[])3043 main (int argc, char *argv[])
3044 {
3045 int opt;
3046
3047 /* Unbuffered so if output is redirected to a file it isn't lost if the
3048 program is killed part way through. */
3049 setbuf (stdout, NULL);
3050 setbuf (stderr, NULL);
3051
3052 while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF)
3053 {
3054 switch (opt) {
3055 case 'f':
3056 if (optarg[0] == 't')
3057 option_fft_trace = 2;
3058 else
3059 option_fft_max_size = atol (optarg);
3060 break;
3061 case 'o':
3062 speed_option_set (optarg);
3063 break;
3064 case 'p':
3065 speed_precision = atoi (optarg);
3066 break;
3067 case 't':
3068 option_trace++;
3069 break;
3070 case '?':
3071 exit(1);
3072 }
3073 }
3074
3075 all ();
3076 exit (0);
3077 }
3078