1 /* Create tuned thresholds for various algorithms.
2
3 Copyright 1999, 2000, 2001, 2002, 2003, 2005, 2006, 2008, 2009, 2010,
4 2011, 2012 Free Software Foundation, Inc.
5
6 This file is part of the GNU MP Library.
7
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
12
13 The GNU MP Library is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20
21 */
22
23
24 /* Usage: tuneup [-t] [-t] [-p precision]
25
26 -t turns on some diagnostic traces, a second -t turns on more traces.
27
28 Algorithm:
29
30 The thresholds are determined as follows: two algorithms A and B are
31 compared over a range of sizes. At each point, we define "badness" to
32 be the percentage time lost if algorithm B is chosen over algorithm A.
33 Then total badness is the sum of this over all sizes measured. The
34 threshold is set to minimize total badness.
35
36 In practice the thresholds tend to be chosen to bring on algorithm B
37 fairly quickly.
38
39 Implementation:
40
41 In a normal library build the thresholds are constants. To tune them
42 selected objects are recompiled with the thresholds as global variables
43 instead. #define TUNE_PROGRAM_BUILD does this, with help from code at
44 the end of gmp-impl.h, and rules in tune/Makefile.am.
45
46 MUL_KARATSUBA_THRESHOLD for example uses a recompiled mpn_mul_n. The
47 threshold is set to "size+1" to avoid karatsuba, or to "size" to use one
48 level, but recurse into the basecase.
49
50 MUL_TOOM3_THRESHOLD makes use of the tuned MUL_KARATSUBA_THRESHOLD value.
51 Other routines in turn will make use of both of those. Naturally the
52 dependants must be tuned first.
53
54 In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling,
55 just a threshold based on comparing two routines (mpn_divrem_1 and
56 mpn_divexact_1), and no further use of the value determined.
57
58 Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being
59 just comparisons between certain routines on representative data.
60
61 Shortcuts are applied when native (assembler) versions of routines exist.
62 For instance a native mpn_sqr_basecase is assumed to be always faster
63 than mpn_mul_basecase, with no measuring.
64
65 No attempt is made to tune within assembler routines, for instance
66 DIVREM_1_NORM_THRESHOLD. An assembler mpn_divrem_1 is expected to be
67 written and tuned all by hand. Assembler routines that might have hard
68 limits are recompiled though, to make them accept a bigger range of sizes
69 than normal, eg. mpn_sqr_basecase to compare against mpn_kara_sqr_n.
70
71 Code:
72 - main : checks for various command line options and calls all()
73 - all : prints the tuneup message, date and compiler, then calls
74 each of the individual tuning functions in turn, e.g.
75 tune_mul()
76 - tune_blah() : tunes function of type blah, e.g. tune_mul() tunes the
77 karatsuba and toom cutoffs. It sets up a param struct with the
78 following parameters:
79 a) name : the name of the threshold being tuned, e.g.
80 MUL_TOOM3_THRESHOLD
81 b) function : the first function being compared (this must be
82 of the form speed_blah and the function speed_blah will
83 exist in speed.h and speed.c
84 c) function2 : the second function being compared (if set to
85 NULL, this is automatically set to equal function
86 d) step_factor : the size of the step between sizes,
87 set to 0.01 by default, i.e. 1% increments
88 e) function_fudge : multiplier for the speed of function, used
89 to adjust for overheads, by default set to 1.0
90 f) stop_since_change is a stop condition. If the threshold
91 has not changed for this many iterations, then stop. This
92 is set to 80 iterations by default.
93 g) stop_factor : this is another stop factor. If method B
94 becomes faster by at least this factor, then stop. By
95 default this is set to 1.2, i.e. 20% faster.
96 h) min_size : the minimum size to start comparing from.
97 i) min_is_always : if this is set to 1, then if the threshold
98 just ends up being min_size, then the threshold is actually
99 set to 0, i.e. algorithm B is always used.
100 j) max_size : the maximum size to compare up to. By default this
101 is set to DEFAULT_MAX_SIZE which is 1000 limbs.
102 h) check_size : if set, will check that the given starting size
103 is valid for both algorithms and that algorithm B is at least
104 4% slower than algorithm A at that point.
105 i) size_extra : this is a bias added to each size when doing
106 measurements. It is subtracted off after each measurement.
107 It is basically used for shifting a threshold from the
108 measured value.
109 j) data_high : if set to 1, the high limb of xp and yp are set to
110 be less than s->r, if set to 2, the high limb of xp and yp are
111 set to be greater than or equal to s->r
112 k) noprint : if set, the threshold is computed but not printed.
113
114 After setting all the appropriate parameters, the function one() is
115 called. It takes a reference to a parameter, e.g. mul_toom3_threshold
116 which is defined in a table below. That threshold will have been given
117 some initial value (usually MP_SIZE_T_MAX) in the table. It also takes
118 a reference to the param struct.
119 - one() : does repeated timings over the given range of sizes always setting
120 the threshold to size+1 for function and size for function2.
121
122 N.B: the functions that need to be rebuilt to use variable thresholds must be
123 added to the Makefile.am file (and automake run) before tune can work.
124
125 */
126
127 #define TUNE_PROGRAM_BUILD 1 /* for gmp-impl.h */
128
129 #include "config.h"
130
131 #include <math.h>
132 #include <stdio.h>
133 #include <stdlib.h>
134 #include <time.h>
135 #if HAVE_UNISTD_H
136 #include <unistd.h>
137 #endif
138
139 #include "mpir.h"
140 #include "gmp-impl.h"
141 #include "longlong.h"
142
143 #include "tests.h"
144 #include "speed.h"
145
146 #if !HAVE_DECL_OPTARG
147 extern char *optarg;
148 extern int optind, opterr;
149 #endif
150
151
152 #define DEFAULT_MAX_SIZE 1000 /* limbs */
153
154 #if WANT_FFT
155 mp_size_t option_fft_max_size = 50000; /* limbs */
156 #else
157 mp_size_t option_fft_max_size = 0;
158 #endif
159 int option_trace = 0;
160 int option_fft_trace = 0;
161 struct speed_params s;
162
163 struct dat_t {
164 mp_size_t size;
165 double d;
166 } *dat = NULL;
167 int ndat = 0;
168 int allocdat = 0;
169
170 /* This is not defined if mpn_sqr_basecase doesn't declare a limit. In that
171 case use zero here, which for params.max_size means no limit. */
172 #ifndef TUNE_SQR_KARATSUBA_MAX
173 #define TUNE_SQR_KARATSUBA_MAX 0
174 #endif
175
176 mp_size_t mul_karatsuba_threshold = MP_SIZE_T_MAX;
177 mp_size_t mul_toom3_threshold = MUL_TOOM3_THRESHOLD_LIMIT;
178 mp_size_t mul_toom4_threshold = MUL_TOOM4_THRESHOLD_LIMIT;
179 mp_size_t mul_toom8h_threshold = MUL_TOOM8H_THRESHOLD_LIMIT;
180 mp_size_t mul_fft_full_threshold = MP_SIZE_T_MAX;
181 mp_size_t sqr_basecase_threshold = MP_SIZE_T_MAX;
182 mp_size_t sqr_karatsuba_threshold
183 = (TUNE_SQR_KARATSUBA_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_KARATSUBA_MAX);
184 mp_size_t sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT;
185 mp_size_t sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT;
186 mp_size_t sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT;
187 mp_size_t sqr_fft_full_threshold = MP_SIZE_T_MAX;
188 mp_size_t mulmod_2expm1_threshold = MP_SIZE_T_MAX;
189 mp_size_t mullow_basecase_threshold = MP_SIZE_T_MAX;
190 mp_size_t mullow_dc_threshold = MP_SIZE_T_MAX;
191 mp_size_t mullow_mul_threshold = MP_SIZE_T_MAX;
192 mp_size_t mulmid_toom42_threshold = MP_SIZE_T_MAX;
193 mp_size_t mulhigh_basecase_threshold = MP_SIZE_T_MAX;
194 mp_size_t mulhigh_dc_threshold = MP_SIZE_T_MAX;
195 mp_size_t mulhigh_mul_threshold = MP_SIZE_T_MAX;
196 mp_size_t div_sb_preinv_threshold = MP_SIZE_T_MAX;
197 mp_size_t dc_div_qr_threshold = MP_SIZE_T_MAX;
198 mp_size_t inv_div_qr_threshold = MP_SIZE_T_MAX;
199 mp_size_t inv_divappr_q_n_threshold = MP_SIZE_T_MAX;
200 mp_size_t dc_div_q_threshold = MP_SIZE_T_MAX;
201 mp_size_t inv_div_q_threshold = MP_SIZE_T_MAX;
202 mp_size_t dc_divappr_q_threshold = MP_SIZE_T_MAX;
203 mp_size_t inv_divappr_q_threshold = MP_SIZE_T_MAX;
204 mp_size_t dc_bdiv_qr_threshold = MP_SIZE_T_MAX;
205 mp_size_t dc_bdiv_q_threshold = MP_SIZE_T_MAX;
206 mp_size_t binv_newton_threshold = MP_SIZE_T_MAX;
207 mp_size_t redc_1_to_redc_2_threshold = MP_SIZE_T_MAX;
208 mp_size_t redc_1_to_redc_n_threshold = MP_SIZE_T_MAX;
209 mp_size_t redc_2_to_redc_n_threshold = MP_SIZE_T_MAX;
210 mp_size_t matrix22_strassen_threshold = MP_SIZE_T_MAX;
211 mp_size_t hgcd_threshold = MP_SIZE_T_MAX;
212 mp_size_t hgcd_appr_threshold = MP_SIZE_T_MAX;
213 mp_size_t hgcd_reduce_threshold = MP_SIZE_T_MAX;
214 mp_size_t gcd_dc_threshold = MP_SIZE_T_MAX;
215 mp_size_t gcdext_dc_threshold = MP_SIZE_T_MAX;
216 mp_size_t divrem_1_norm_threshold = MP_SIZE_T_MAX;
217 mp_size_t divrem_1_unnorm_threshold = MP_SIZE_T_MAX;
218 mp_size_t mod_1_norm_threshold = MP_SIZE_T_MAX;
219 mp_size_t mod_1_1_threshold = MP_SIZE_T_MAX;
220 mp_size_t mod_1_2_threshold = MP_SIZE_T_MAX;
221 mp_size_t mod_1_3_threshold = MP_SIZE_T_MAX;
222 mp_size_t mod_1_unnorm_threshold = MP_SIZE_T_MAX;
223 mp_size_t divrem_2_threshold = MP_SIZE_T_MAX;
224 mp_size_t get_str_dc_threshold = MP_SIZE_T_MAX;
225 mp_size_t get_str_precompute_threshold = MP_SIZE_T_MAX;
226 mp_size_t set_str_dc_threshold = MP_SIZE_T_MAX;
227 mp_size_t set_str_precompute_threshold = MP_SIZE_T_MAX;
228 mp_size_t rootrem_threshold = MP_SIZE_T_MAX;
229 mp_size_t divrem_hensel_qr_1_threshold = MP_SIZE_T_MAX;
230 mp_size_t rsh_divrem_hensel_qr_1_threshold = MP_SIZE_T_MAX;
231 mp_size_t divrem_euclid_hensel_threshold = MP_SIZE_T_MAX;
232 mp_size_t fac_odd_threshold = 0;
233 mp_size_t fac_dsc_threshold = FAC_DSC_THRESHOLD_LIMIT;
234
235 struct param_t {
236 const char *name;
237 speed_function_t function;
238 speed_function_t function2;
239 double step_factor; /* how much to step sizes (rounded down) */
240 double function_fudge; /* multiplier for "function" speeds */
241 int stop_since_change;
242 double stop_factor;
243 mp_size_t min_size;
244 int min_is_always;
245 mp_size_t max_size;
246 mp_size_t check_size;
247 mp_size_t size_extra;
248
249 #define DATA_HIGH_LT_R 1
250 #define DATA_HIGH_GE_R 2
251 int data_high;
252
253 int noprint;
254 };
255
256
257 /* These are normally undefined when false, which suits "#if" fine.
258 But give them zero values so they can be used in plain C "if"s. */
259 #ifndef UDIV_PREINV_ALWAYS
260 #define UDIV_PREINV_ALWAYS 0
261 #endif
262 #ifndef HAVE_NATIVE_mpn_divexact_1
263 #define HAVE_NATIVE_mpn_divexact_1 0
264 #endif
265 #ifndef HAVE_NATIVE_mpn_divrem_1
266 #define HAVE_NATIVE_mpn_divrem_1 0
267 #endif
268 #ifndef HAVE_NATIVE_mpn_divrem_2
269 #define HAVE_NATIVE_mpn_divrem_2 0
270 #endif
271 #ifndef HAVE_NATIVE_mpn_mod_1
272 #define HAVE_NATIVE_mpn_mod_1 0
273 #endif
274 #ifndef HAVE_NATIVE_mpn_modexact_1_odd
275 #define HAVE_NATIVE_mpn_modexact_1_odd 0
276 #endif
277 #ifndef HAVE_NATIVE_mpn_preinv_divrem_1
278 #define HAVE_NATIVE_mpn_preinv_divrem_1 0
279 #endif
280 #ifndef HAVE_NATIVE_mpn_preinv_mod_1
281 #define HAVE_NATIVE_mpn_preinv_mod_1 0
282 #endif
283 #ifndef HAVE_NATIVE_mpn_sqr_basecase
284 #define HAVE_NATIVE_mpn_sqr_basecase 0
285 #endif
286
287
288 #define MAX3(a,b,c) MAX (MAX (a, b), c)
289
290 mp_limb_t
randlimb_norm(gmp_randstate_t rands)291 randlimb_norm (gmp_randstate_t rands)
292 {
293 mp_limb_t n;
294 mpn_randomb (&n,rands, 1);
295 n |= GMP_NUMB_HIGHBIT;
296 return n;
297 }
298
299 #define GMP_NUMB_HALFMASK ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1)
300
301 mp_limb_t
randlimb_half(gmp_randstate_t rands)302 randlimb_half (gmp_randstate_t rands)
303 {
304 mp_limb_t n;
305 mpn_randomb (&n, rands,1);
306 n &= GMP_NUMB_HALFMASK;
307 n += (n==0);
308 return n;
309 }
310
311
312 /* Add an entry to the end of the dat[] array, reallocing to make it bigger
313 if necessary. */
314 void
add_dat(mp_size_t size,double d)315 add_dat (mp_size_t size, double d)
316 {
317 #define ALLOCDAT_STEP 500
318
319 ASSERT_ALWAYS (ndat <= allocdat);
320
321 if (ndat == allocdat)
322 {
323 dat = (struct dat_t *) __gmp_allocate_or_reallocate
324 (dat, allocdat * sizeof(dat[0]),
325 (allocdat+ALLOCDAT_STEP) * sizeof(dat[0]));
326 allocdat += ALLOCDAT_STEP;
327 }
328
329 dat[ndat].size = size;
330 dat[ndat].d = d;
331 ndat++;
332 }
333
334
335 /* Return the threshold size based on the data accumulated. */
336 mp_size_t
analyze_dat(int final)337 analyze_dat (int final)
338 {
339 double x, min_x;
340 int j, min_j;
341
342 /* If the threshold is set at dat[0].size, any positive values are bad. */
343 x = 0.0;
344 for (j = 0; j < ndat; j++)
345 if (dat[j].d > 0.0)
346 x += dat[j].d;
347
348 if (option_trace >= 2 && final)
349 {
350 printf ("\n");
351 printf ("x is the sum of the badness from setting thresh at given size\n");
352 printf (" (minimum x is sought)\n");
353 printf ("size=%ld first x=%.4f\n", (long) dat[j].size, x);
354 }
355
356 min_x = x;
357 min_j = 0;
358
359
360 /* When stepping to the next dat[j].size, positive values are no longer
361 bad (so subtracted), negative values become bad (so add the absolute
362 value, meaning subtract). */
363 for (j = 0; j < ndat; x -= dat[j].d, j++)
364 {
365 if (option_trace >= 2 && final)
366 printf ("size=%ld x=%.4f\n", (long) dat[j].size, x);
367
368 if (x < min_x)
369 {
370 min_x = x;
371 min_j = j;
372 }
373 }
374
375 return min_j;
376 }
377
378
379 /* Measuring for recompiled mpn/generic/divrem_1.c and mpn/generic/mod_1.c */
380
381 mp_limb_t mpn_divrem_1_tune _PROTO ((mp_ptr qp, mp_size_t xsize,
382 mp_srcptr ap, mp_size_t size,
383 mp_limb_t d));
384 mp_limb_t mpn_mod_1_tune _PROTO ((mp_srcptr ap, mp_size_t size, mp_limb_t d));
385
386 void mpz_fac_ui_tune _PROTO ((mpz_ptr, mpir_ui));
387
388 double
speed_mpn_mod_1_tune(struct speed_params * s)389 speed_mpn_mod_1_tune (struct speed_params *s)
390 {
391 SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune);
392 }
393 double
speed_mpn_divrem_1_tune(struct speed_params * s)394 speed_mpn_divrem_1_tune (struct speed_params *s)
395 {
396 SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune);
397 }
398
399 double
speed_mpz_fac_ui_tune(struct speed_params * s)400 speed_mpz_fac_ui_tune (struct speed_params *s)
401 {
402 SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui_tune);
403 }
404
405 double
tuneup_measure(speed_function_t fun,gmp_randstate_t rands,const struct param_t * param,struct speed_params * s)406 tuneup_measure (speed_function_t fun,gmp_randstate_t rands,
407 const struct param_t *param,
408 struct speed_params *s)
409 {
410 static struct param_t dummy;
411 double t;
412 TMP_DECL;
413
414 if (! param)
415 param = &dummy;
416
417 s->size += param->size_extra;
418
419 TMP_MARK;
420 SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0);
421 SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0);
422
423 mpn_randomb (s->xp, rands, s->size);
424 mpn_randomb (s->yp, rands, s->size);
425
426 switch (param->data_high) {
427 case DATA_HIGH_LT_R:
428 s->xp[s->size-1] %= s->r;
429 s->yp[s->size-1] %= s->r;
430 break;
431 case DATA_HIGH_GE_R:
432 s->xp[s->size-1] |= s->r;
433 s->yp[s->size-1] |= s->r;
434 break;
435 }
436
437 t = speed_measure (fun, s);
438
439 s->size -= param->size_extra;
440
441 TMP_FREE;
442 return t;
443 }
444
445
446 #define PRINT_WIDTH 28
447
448 void
print_define_start(const char * name)449 print_define_start (const char *name)
450 {
451 printf ("#define %-*s ", PRINT_WIDTH, name);
452 if (option_trace)
453 printf ("...\n");
454 }
455
456 void
print_define_end_remark(const char * name,mp_size_t value,const char * remark)457 print_define_end_remark (const char *name, mp_size_t value, const char *remark)
458 {
459 if (option_trace)
460 printf ("#define %-*s ", PRINT_WIDTH, name);
461
462 if (value == MP_SIZE_T_MAX)
463 printf ("MP_SIZE_T_MAX");
464 else
465 printf ("%5ld", (long) value);
466
467 if (remark != NULL)
468 printf (" /* %s */", remark);
469 printf ("\n");
470 }
471
472 void
print_define_end(const char * name,mp_size_t value)473 print_define_end (const char *name, mp_size_t value)
474 {
475 const char *remark;
476 if (value == MP_SIZE_T_MAX)
477 remark = "never";
478 else if (value == 0)
479 remark = "always";
480 else
481 remark = NULL;
482 print_define_end_remark (name, value, remark);
483 }
484
485 void
print_define(const char * name,mp_size_t value)486 print_define (const char *name, mp_size_t value)
487 {
488 print_define_start (name);
489 print_define_end (name, value);
490 }
491
492 void
print_define_remark(const char * name,mp_size_t value,const char * remark)493 print_define_remark (const char *name, mp_size_t value, const char *remark)
494 {
495 print_define_start (name);
496 print_define_end_remark (name, value, remark);
497 }
498
499
500 void
one(mp_size_t * threshold,gmp_randstate_t rands,struct param_t * param)501 one (mp_size_t *threshold, gmp_randstate_t rands,struct param_t *param)
502 {
503 int since_positive, since_thresh_change;
504 int thresh_idx, new_thresh_idx;
505
506 #define DEFAULT(x,n) do { if (! (x)) (x) = (n); } while (0)
507
508 DEFAULT (param->function_fudge, 1.0);
509 DEFAULT (param->function2, param->function);
510 DEFAULT (param->step_factor, 0.01); /* small steps by default */
511 DEFAULT (param->stop_since_change, 80);
512 DEFAULT (param->stop_factor, 1.2);
513 DEFAULT (param->min_size, 10);
514 DEFAULT (param->max_size, DEFAULT_MAX_SIZE);
515
516 if (param->check_size != 0)
517 {
518 double t1, t2;
519 s.size = param->check_size;
520
521 *threshold = s.size+1;
522 t1 = tuneup_measure (param->function, rands,param, &s);
523
524 *threshold = s.size;
525 t2 = tuneup_measure (param->function2, rands, param, &s);
526 if (t1 == -1.0 || t2 == -1.0)
527 {
528 printf ("Oops, can't run both functions at size %ld\n",
529 (long) s.size);
530 abort ();
531 }
532 t1 *= param->function_fudge;
533
534 /* ask that t2 is at least 4% below t1 */
535 if (t1 < t2*1.04)
536 {
537 if (option_trace)
538 printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2);
539 *threshold = MP_SIZE_T_MAX;
540 if (! param->noprint)
541 print_define (param->name, *threshold);
542 return;
543 }
544
545 if (option_trace >= 2)
546 printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n",
547 (long) s.size, t1, t2);
548 }
549
550 if (! param->noprint || option_trace)
551 print_define_start (param->name);
552
553 ndat = 0;
554 since_positive = 0;
555 since_thresh_change = 0;
556 thresh_idx = 0;
557
558 if (option_trace >= 2)
559 {
560 printf (" algorithm-A algorithm-B ratio possible\n");
561 printf (" (seconds) (seconds) diff thresh\n");
562 }
563
564 for (s.size = param->min_size;
565 s.size < param->max_size;
566 s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), 1))
567 {
568 double ti, tiplus1, d;
569
570 /* If there's a size limit and it's reached then it should still
571 be sensible to analyze the data since we want the threshold put
572 either at or near the limit. */
573 if (s.size >= param->max_size)
574 {
575 if (option_trace)
576 printf ("Reached maximum size (%ld) without otherwise stopping\n",
577 (long) param->max_size);
578 break;
579 }
580
581 /*
582 FIXME: check minimum size requirements are met, possibly by just
583 checking for the -1 returns from the speed functions.
584 */
585
586 /* using method A at this size */
587 *threshold = s.size+1;
588 ti = tuneup_measure (param->function, rands,param, &s);
589 if (ti == -1.0)
590 abort ();
591 ti *= param->function_fudge;
592
593 /* using method B at this size */
594 *threshold = s.size;
595 tiplus1 = tuneup_measure (param->function2, rands, param, &s);
596 if (tiplus1 == -1.0)
597 abort ();
598
599 /* Calculate the fraction by which the one or the other routine is
600 slower. */
601 if (tiplus1 >= ti)
602 d = (tiplus1 - ti) / tiplus1; /* negative */
603 else
604 d = (tiplus1 - ti) / ti; /* positive */
605
606 add_dat (s.size, d);
607
608 new_thresh_idx = analyze_dat (0);
609
610 if (option_trace >= 2)
611 printf ("size=%ld %.9f %.9f % .4f %c %ld\n",
612 (long) s.size, ti, tiplus1, d,
613 ti > tiplus1 ? '#' : ' ',
614 (long) dat[new_thresh_idx].size);
615
616 /* Stop if the last time method i was faster was more than a
617 certain number of measurements ago. */
618 #define STOP_SINCE_POSITIVE 200
619 if (d >= 0)
620 since_positive = 0;
621 else
622 if (++since_positive > STOP_SINCE_POSITIVE)
623 {
624 if (option_trace >= 1)
625 printf ("stopped due to since_positive (%d)\n",
626 STOP_SINCE_POSITIVE);
627 break;
628 }
629
630 /* Stop if method A has become slower by a certain factor. */
631 if (ti >= tiplus1 * param->stop_factor)
632 {
633 if (option_trace >= 1)
634 printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n",
635 param->stop_factor);
636 break;
637 }
638
639 /* Stop if the threshold implied hasn't changed in a certain
640 number of measurements. (It's this condition that ususally
641 stops the loop.) */
642 if (thresh_idx != new_thresh_idx)
643 since_thresh_change = 0, thresh_idx = new_thresh_idx;
644 else
645 if (++since_thresh_change > param->stop_since_change)
646 {
647 if (option_trace >= 1)
648 printf ("stopped due to since_thresh_change (%d)\n",
649 param->stop_since_change);
650 break;
651 }
652
653 /* Stop if the threshold implied is more than a certain number of
654 measurements ago. */
655 #define STOP_SINCE_AFTER 500
656 if (ndat - thresh_idx > STOP_SINCE_AFTER)
657 {
658 if (option_trace >= 1)
659 printf ("stopped due to ndat - thresh_idx > amount (%d)\n",
660 STOP_SINCE_AFTER);
661 break;
662 }
663
664 /* Stop when the size limit is reached before the end of the
665 crossover, but only show this as an error for >= the default max
666 size. FIXME: Maybe should make it a param choice whether this is
667 an error. */
668 if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE)
669 {
670 fprintf (stderr, "%s\n", param->name);
671 fprintf (stderr, "sizes %ld to %ld total %d measurements\n",
672 (long) dat[0].size, (long) dat[ndat-1].size, ndat);
673 fprintf (stderr, " max size reached before end of crossover\n");
674 break;
675 }
676 }
677
678 if (option_trace >= 1)
679 printf ("sizes %ld to %ld total %d measurements\n",
680 (long) dat[0].size, (long) dat[ndat-1].size, ndat);
681
682 *threshold = dat[analyze_dat (1)].size;
683
684 if (param->min_is_always)
685 {
686 if (*threshold == param->min_size)
687 *threshold = 0;
688 }
689
690 if (! param->noprint || option_trace)
691 print_define_end (param->name, *threshold);
692 }
693
694
695 /* Special probing for the fft thresholds. The size restrictions on the
696 FFTs mean the graph of time vs size has a step effect. See this for
697 example using
698
699 ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9
700 gnuplot foo.gnuplot
701
702 The current approach is to compare routines at the midpoint of relevant
703 steps. Arguably a more sophisticated system of threshold data is wanted
704 if this step effect remains. */
705
706 struct fft_param_t {
707 const char *threshold_name;
708 mp_size_t *p_threshold;
709 mp_size_t first_size;
710 mp_size_t max_size;
711 speed_function_t function;
712 speed_function_t mul_function;
713 mp_size_t sqr;
714 };
715
716 mp_size_t
fft_step_size(int size)717 fft_step_size (int size)
718 {
719 mp_size_t step;
720
721 step = mpir_fft_adjust_limbs(size + 1) - size;
722
723 if (step <= 0)
724 {
725 printf ("Can't handle size=%d\n", size);
726 abort ();
727 }
728
729 return step;
730 }
731
732 void
fft(struct fft_param_t * p,gmp_randstate_t rands)733 fft (struct fft_param_t *p,gmp_randstate_t rands)
734 {
735 mp_size_t size;
736 int i, k;
737
738 *p->p_threshold = MP_SIZE_T_MAX;
739
740 option_trace = MAX (option_trace, option_fft_trace);
741
742 size = p->first_size;
743
744 /* Declare an FFT faster than a plain toom3 etc multiplication found as
745 soon as one faster measurement obtained. A multiplication in the
746 middle of the FFT step is tested. */
747 for (;;)
748 {
749 double tk, tm;
750
751 size = mpir_fft_adjust_limbs (size+1);
752
753 if (size >= p->max_size)
754 break;
755
756 s.size = size + fft_step_size (size) / 2;
757
758 tk = tuneup_measure (p->function, rands, NULL, &s);
759 if (tk == -1.0)
760 abort ();
761
762 tm = tuneup_measure (p->mul_function, rands, NULL, &s);
763 if (tm == -1.0)
764 abort ();
765
766 if (option_trace >= 2)
767 printf ("at %ld size=%ld %.9f size=%ld %s mul %.9f\n",
768 (long) size,
769 (long) size + fft_step_size (size) / 2, tk,
770 (long) s.size, "full", tm);
771
772 if (tk < tm)
773 {
774 *p->p_threshold = s.size;
775 print_define (p->threshold_name, *p->p_threshold);
776 break;
777 }
778 }
779
780 }
781
782 /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
783 giving wrong results. */
784 void
tune_mul(gmp_randstate_t rands)785 tune_mul (gmp_randstate_t rands)
786 {
787 static struct param_t param;
788
789 param.function = speed_mpn_mul_n;
790
791 param.name = "MUL_KARATSUBA_THRESHOLD";
792 param.min_size = MAX (4, MPN_KARA_MUL_N_MINSIZE);
793 param.max_size = MUL_KARATSUBA_THRESHOLD_LIMIT-1;
794 one (&mul_karatsuba_threshold, rands,¶m);
795
796 param.name = "MUL_TOOM3_THRESHOLD";
797 param.min_size = MAX (mul_karatsuba_threshold, MPN_TOOM3_MUL_N_MINSIZE);
798 param.max_size = MUL_TOOM3_THRESHOLD_LIMIT-1;
799 one (&mul_toom3_threshold, rands, ¶m);
800
801 param.name = "MUL_TOOM4_THRESHOLD";
802 param.min_size = MAX (mul_toom3_threshold, MPN_TOOM4_MUL_N_MINSIZE);
803 param.max_size = MUL_TOOM4_THRESHOLD_LIMIT-1;
804 one (&mul_toom4_threshold, rands, ¶m);
805
806 param.name = "MUL_TOOM8H_THRESHOLD";
807 param.min_size = MAX (mul_toom4_threshold, MPN_TOOM8H_MUL_MINSIZE);
808 param.max_size = MUL_TOOM8H_THRESHOLD_LIMIT-1;
809 one (&mul_toom8h_threshold, rands, ¶m);
810
811 /* disabled until tuned */
812 MUL_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX;
813 }
814
815
816 /* This was written by the tuneup challenged tege. Kevin, please delete
817 this comment when you've reviewed/rewritten this. :-) */
818 void
tune_mullow(gmp_randstate_t rands)819 tune_mullow (gmp_randstate_t rands)
820 {
821 static struct param_t param;
822
823 param.function = speed_mpn_mullow_n;
824
825 param.name = "MULLOW_BASECASE_THRESHOLD";
826 param.min_size = 3;
827 param.min_is_always = 1;
828 //param.max_size = MULLOW_BASECASE_THRESHOLD_LIMIT-1;
829 one (&mullow_basecase_threshold, rands, ¶m);
830
831 param.min_is_always = 0; /* ??? */
832
833 param.name = "MULLOW_DC_THRESHOLD";
834 param.min_size = mullow_basecase_threshold;
835 param.max_size = 1000;
836 one (&mullow_dc_threshold, rands, ¶m);
837
838 param.name = "MULLOW_MUL_THRESHOLD";
839 param.min_size = mullow_dc_threshold;
840 param.max_size = 10000;
841 one (&mullow_mul_threshold, rands, ¶m);
842
843 /* disabled until tuned */
844 MUL_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX;
845 }
846
847 void
tune_mulmid(gmp_randstate_t rands)848 tune_mulmid (gmp_randstate_t rands)
849 {
850 static struct param_t param;
851
852 param.name = "MULMID_TOOM42_THRESHOLD";
853 param.function = speed_mpn_mulmid_n;
854 param.min_size = 4;
855 param.max_size = 100;
856 one (&mulmid_toom42_threshold, rands, ¶m);
857
858 /* disabled until tuned */
859 MUL_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX;
860 }
861
862 void
tune_mulmod_2expm1(gmp_randstate_t rands)863 tune_mulmod_2expm1 (gmp_randstate_t rands)
864 {
865 static struct param_t param;
866 param.function = speed_mpn_mulmod_2expm1;
867 param.name = "MULMOD_2EXPM1_THRESHOLD";
868 param.min_size = 1;
869 //param.max_size = ?? ;
870 one (&mulmod_2expm1_threshold, rands, ¶m);
871 /* disabled until tuned */
872 MUL_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX; // ??????????????
873 }
874
875 void
tune_mulhigh(gmp_randstate_t rands)876 tune_mulhigh (gmp_randstate_t rands)
877 {
878 static struct param_t param;
879
880 param.function = speed_mpn_mulhigh_n;
881
882 param.name = "MULHIGH_BASECASE_THRESHOLD";
883 param.min_size = 3;
884 param.min_is_always = 3;
885 //param.max_size = MULHIGH_BASECASE_THRESHOLD_LIMIT-1;
886 one (&mulhigh_basecase_threshold, rands, ¶m);
887
888 param.min_is_always = 0; /* ??? */
889
890 param.name = "MULHIGH_DC_THRESHOLD";
891 param.min_size = MAX(mulhigh_basecase_threshold,4);
892 param.max_size = 1000;
893 one (&mulhigh_dc_threshold, rands, ¶m);
894
895 param.name = "MULHIGH_MUL_THRESHOLD";
896 param.min_size = mulhigh_dc_threshold;
897 param.max_size = 10000;
898 one (&mulhigh_mul_threshold, rands, ¶m);
899
900 /* disabled until tuned */
901 MUL_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX;
902 }
903
904 void
tune_rootrem(gmp_randstate_t rands)905 tune_rootrem (gmp_randstate_t rands)
906 {
907
908 static struct param_t param;
909 s.r=5; // tune for 5th root
910 param.function = speed_mpn_rootrem;
911 param.name = "ROOTREM_THRESHOLD";
912 param.min_size = 1;
913 one (&rootrem_threshold, rands, ¶m);
914 }
915
916 void
tune_divrem_hensel_qr_1(gmp_randstate_t rands)917 tune_divrem_hensel_qr_1 (gmp_randstate_t rands)
918 {
919 static struct param_t param;
920 param.function = speed_mpn_divrem_hensel_qr_1;
921 param.name = "DIVREM_HENSEL_QR_1_THRESHOLD";
922 param.min_size = 2;
923 one (&divrem_hensel_qr_1_threshold, rands, ¶m);
924 }
925
926 void
tune_rsh_divrem_hensel_qr_1(gmp_randstate_t rands)927 tune_rsh_divrem_hensel_qr_1 (gmp_randstate_t rands)
928 {
929 static struct param_t param;
930 param.function = speed_mpn_rsh_divrem_hensel_qr_1;
931 param.name = "RSH_DIVREM_HENSEL_QR_1_THRESHOLD";
932 param.min_size = 3;
933 one (&rsh_divrem_hensel_qr_1_threshold, rands, ¶m);
934 }
935
936 void
tune_divrem_euclid_hensel(gmp_randstate_t rands)937 tune_divrem_euclid_hensel (gmp_randstate_t rands)
938 {
939 static struct param_t param;
940 param.function = speed_mpn_divrem_1;
941 param.name = "DIVREM_EUCLID_HENSEL_THRESHOLD";
942 param.min_size = 8;
943 s.r=0x81234567;// tune for this divisor
944 one (&divrem_euclid_hensel_threshold, rands, ¶m);
945 }
946
947 // for tuning we dont care if the divisors go out of range as it doesn't affect the runtime
tune_mod_1_k(gmp_randstate_t rands)948 void tune_mod_1_k (gmp_randstate_t rands)
949 {
950 static struct param_t param;
951
952 param.function = speed_mpn_divrem_euclidean_r_1;
953
954 param.name = "MOD_1_1_THRESHOLD";
955 param.min_size = 3;
956 one (&mod_1_1_threshold, rands, ¶m);
957
958 param.name = "MOD_1_2_THRESHOLD";
959 param.min_size = MAX(mod_1_1_threshold,4);
960 //param.max_size = 1000;
961 one (&mod_1_2_threshold, rands, ¶m);
962
963 param.name = "MOD_1_3_THRESHOLD";
964 param.min_size = MAX(mod_1_2_threshold,5);
965 //param.max_size = 10000;
966 one (&mod_1_3_threshold, rands, ¶m);
967
968 }
969
970
971
972 /* Start the basecase from 3, since 1 is a special case, and if mul_basecase
973 is faster only at size==2 then we don't want to bother with extra code
974 just for that. Start karatsuba from 4 same as MUL above. */
975
976 void
tune_sqr(gmp_randstate_t rands)977 tune_sqr (gmp_randstate_t rands)
978 {
979 /* disabled until tuned */
980 SQR_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX;
981
982 if (HAVE_NATIVE_mpn_sqr_basecase)
983 {
984 print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)");
985 sqr_basecase_threshold = 0;
986 }
987 else
988 {
989 static struct param_t param;
990 param.name = "SQR_BASECASE_THRESHOLD";
991 param.function = speed_mpn_sqr;
992 param.min_size = 3;
993 param.min_is_always = 1;
994 param.max_size = TUNE_SQR_KARATSUBA_MAX;
995 param.noprint = 1;
996 one (&sqr_basecase_threshold, rands, ¶m);
997 }
998
999 {
1000 static struct param_t param;
1001 param.name = "SQR_KARATSUBA_THRESHOLD";
1002 param.function = speed_mpn_sqr;
1003 param.min_size = MAX (4, MPN_KARA_SQR_N_MINSIZE);
1004 param.max_size = TUNE_SQR_KARATSUBA_MAX;
1005 param.noprint = 1;
1006 one (&sqr_karatsuba_threshold, rands, ¶m);
1007
1008 if (! HAVE_NATIVE_mpn_sqr_basecase
1009 && sqr_karatsuba_threshold < sqr_basecase_threshold)
1010 {
1011 /* Karatsuba becomes faster than mul_basecase before
1012 sqr_basecase does. Arrange for the expression
1013 "BELOW_THRESHOLD (un, SQR_KARATSUBA_THRESHOLD))" which
1014 selects mpn_sqr_basecase in mpn_sqr to be false, by setting
1015 SQR_KARATSUBA_THRESHOLD to zero, making
1016 SQR_BASECASE_THRESHOLD the karatsuba threshold. */
1017
1018 sqr_basecase_threshold = SQR_KARATSUBA_THRESHOLD;
1019 SQR_KARATSUBA_THRESHOLD = 0;
1020
1021 print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold,
1022 "karatsuba");
1023 print_define_remark ("SQR_KARATSUBA_THRESHOLD",SQR_KARATSUBA_THRESHOLD,
1024 "never sqr_basecase");
1025 }
1026 else
1027 {
1028 if (! HAVE_NATIVE_mpn_sqr_basecase)
1029 print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold);
1030 print_define ("SQR_KARATSUBA_THRESHOLD", SQR_KARATSUBA_THRESHOLD);
1031 }
1032 }
1033
1034 {
1035 static struct param_t param;
1036 param.function = speed_mpn_sqr;
1037
1038 {
1039 param.name = "SQR_TOOM3_THRESHOLD";
1040 param.min_size = MAX3 (MPN_TOOM3_SQR_N_MINSIZE,
1041 SQR_KARATSUBA_THRESHOLD, SQR_BASECASE_THRESHOLD);
1042 param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1;
1043 one (&sqr_toom3_threshold, rands, ¶m);
1044 }
1045
1046 {
1047 param.name = "SQR_TOOM4_THRESHOLD";
1048 param.min_size = MAX (MPN_TOOM4_SQR_N_MINSIZE, sqr_toom3_threshold);
1049 param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1;
1050 one (&sqr_toom4_threshold, rands, ¶m);
1051 }
1052
1053 {
1054 param.name = "SQR_TOOM8_THRESHOLD";
1055 param.min_size = MAX (MPN_TOOM8_SQR_N_MINSIZE, sqr_toom4_threshold);
1056 param.max_size = SQR_TOOM8_THRESHOLD_LIMIT-1;
1057 one (&sqr_toom8_threshold, rands, ¶m);
1058 }
1059 }
1060 }
1061
1062 void
tune_dc_div(gmp_randstate_t rands)1063 tune_dc_div (gmp_randstate_t rands)
1064 {
1065 {
1066 static struct param_t param;
1067 param.name = "DC_DIV_QR_THRESHOLD";
1068 param.function = speed_mpn_dc_div_qr_n;
1069 param.step_factor = 0.02;
1070 one (&dc_div_qr_threshold, rands, ¶m);
1071 }
1072
1073 {
1074 static struct param_t param;
1075 param.name = "INV_DIV_QR_THRESHOLD";
1076 param.max_size = 10000;
1077 param.function = speed_mpn_inv_div_qr;
1078 param.min_size = dc_div_qr_threshold;
1079 param.step_factor = 0.02;
1080 one (&inv_div_qr_threshold, rands, ¶m);
1081 }
1082
1083 {
1084 static struct param_t param;
1085 param.name = "INV_DIVAPPR_Q_N_THRESHOLD";
1086 param.function = speed_mpn_inv_divappr_q;
1087 param.max_size = 10000;
1088 param.min_size = dc_divappr_q_threshold;
1089 param.step_factor = 0.1;
1090 param.stop_factor = 0.2;
1091 one (&inv_divappr_q_n_threshold, rands, ¶m);
1092 }
1093 }
1094
1095 void
tune_tdiv_q(gmp_randstate_t rands)1096 tune_tdiv_q (gmp_randstate_t rands)
1097 {
1098 {
1099 static struct param_t param;
1100 param.name = "DC_DIV_Q_THRESHOLD";
1101 param.function = speed_mpn_tdiv_q1;
1102 param.step_factor = 0.02;
1103 one (&dc_div_q_threshold, rands, ¶m);
1104 }
1105
1106 {
1107 static struct param_t param;
1108 param.name = "INV_DIV_Q_THRESHOLD";
1109 param.function = speed_mpn_tdiv_q1;
1110 param.max_size = 10000;
1111 param.min_size = dc_div_q_threshold;
1112 param.step_factor = 0.02;
1113 one (&inv_div_q_threshold, rands, ¶m);
1114 }
1115
1116 {
1117 static struct param_t param;
1118 param.name = "DC_DIVAPPR_Q_THRESHOLD";
1119 param.function = speed_mpn_tdiv_q2;
1120 param.step_factor = 0.02;
1121 one (&dc_divappr_q_threshold, rands, ¶m);
1122 }
1123
1124 {
1125 static struct param_t param;
1126 param.name = "INV_DIVAPPR_Q_THRESHOLD";
1127 param.function = speed_mpn_tdiv_q2;
1128 param.max_size = 20000;
1129 param.min_size = dc_divappr_q_threshold;
1130 param.step_factor = 0.1;
1131 one (&inv_divappr_q_threshold, rands, ¶m);
1132 }
1133 }
1134
1135 void
tune_dc_bdiv(gmp_randstate_t rands)1136 tune_dc_bdiv (gmp_randstate_t rands)
1137 {
1138 {
1139 static struct param_t param;
1140 param.name = "DC_BDIV_QR_THRESHOLD";
1141 param.function = speed_mpn_dc_bdiv_qr_n;
1142 param.step_factor = 0.02;
1143 one (&dc_bdiv_qr_threshold, rands, ¶m);
1144 }
1145
1146 {
1147 static struct param_t param;
1148 param.name = "DC_BDIV_Q_THRESHOLD";
1149 param.function = speed_mpn_dc_bdiv_q;
1150 param.step_factor = 0.02;
1151 one (&dc_bdiv_q_threshold, rands, ¶m);
1152 }
1153 }
1154
1155 void
tune_binvert(gmp_randstate_t rands)1156 tune_binvert (gmp_randstate_t rands)
1157 {
1158 static struct param_t param;
1159
1160 param.function = speed_mpn_binvert;
1161 param.name = "BINV_NEWTON_THRESHOLD";
1162 param.min_size = 8; /* pointless with smaller operands */
1163 one (&binv_newton_threshold, rands, ¶m);
1164 }
1165
1166 void
tune_redc(gmp_randstate_t rands)1167 tune_redc (gmp_randstate_t rands)
1168 {
1169 #define TUNE_REDC_2_MAX 100
1170 #if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
1171 #define WANT_REDC_2 1
1172 #endif
1173
1174 #if WANT_REDC_2
1175 {
1176 static struct param_t param;
1177 param.name = "REDC_1_TO_REDC_2_THRESHOLD";
1178 param.function = speed_mpn_redc_1;
1179 param.function2 = speed_mpn_redc_2;
1180 param.min_size = 1;
1181 param.min_is_always = 1;
1182 param.max_size = TUNE_REDC_2_MAX;
1183 param.noprint = 1;
1184 param.stop_factor = 1.5;
1185 one (&redc_1_to_redc_2_threshold, rands, ¶m);
1186 }
1187 {
1188 static struct param_t param;
1189 param.name = "REDC_2_TO_REDC_N_THRESHOLD";
1190 param.function = speed_mpn_redc_2;
1191 param.function2 = speed_mpn_redc_n;
1192 param.min_size = 16;
1193 param.noprint = 1;
1194 one (&redc_2_to_redc_n_threshold, rands, ¶m);
1195 }
1196 if (redc_1_to_redc_2_threshold >= redc_2_to_redc_n_threshold)
1197 {
1198 redc_2_to_redc_n_threshold = 0; /* disable redc_2 */
1199
1200 /* Never use redc2, measure redc_1 -> redc_n cutoff, store result as
1201 REDC_1_TO_REDC_2_THRESHOLD. */
1202 {
1203 static struct param_t param;
1204 param.name = "REDC_1_TO_REDC_2_THRESHOLD";
1205 param.function = speed_mpn_redc_1;
1206 param.function2 = speed_mpn_redc_n;
1207 param.min_size = 16;
1208 param.noprint = 1;
1209 one (&redc_1_to_redc_2_threshold, rands, ¶m);
1210 }
1211 }
1212 print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_1_TO_REDC_2_THRESHOLD);
1213 print_define ("REDC_2_TO_REDC_N_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD);
1214 #else
1215 {
1216 static struct param_t param;
1217 param.name = "REDC_1_TO_REDC_N_THRESHOLD";
1218 param.function = speed_mpn_redc_1;
1219 param.function2 = speed_mpn_redc_n;
1220 param.min_size = 16;
1221 one (&redc_1_to_redc_n_threshold, rands, ¶m);
1222 }
1223 #endif
1224 }
1225
1226 void
tune_matrix22_mul(gmp_randstate_t rands)1227 tune_matrix22_mul (gmp_randstate_t rands)
1228 {
1229 static struct param_t param;
1230 param.name = "MATRIX22_STRASSEN_THRESHOLD";
1231 param.function = speed_mpn_matrix22_mul;
1232 param.min_size = 2;
1233 one (&matrix22_strassen_threshold, rands, ¶m);
1234 }
1235
1236 void
tune_hgcd(gmp_randstate_t rands)1237 tune_hgcd (gmp_randstate_t rands)
1238 {
1239 static struct param_t param;
1240 param.name = "HGCD_THRESHOLD";
1241 param.function = speed_mpn_hgcd;
1242 /* We seem to get strange results for small sizes */
1243 param.min_size = 30;
1244 one (&hgcd_threshold, rands, ¶m);
1245 }
1246
1247 void
tune_hgcd_appr(gmp_randstate_t rands)1248 tune_hgcd_appr (gmp_randstate_t rands)
1249 {
1250 static struct param_t param;
1251 param.name = "HGCD_APPR_THRESHOLD";
1252 param.function = speed_mpn_hgcd_appr;
1253 /* We seem to get strange results for small sizes */
1254 param.min_size = 50;
1255 param.stop_since_change = 150;
1256 one (&hgcd_appr_threshold, rands, ¶m);
1257 }
1258
1259 void
tune_hgcd_reduce(gmp_randstate_t rands)1260 tune_hgcd_reduce (gmp_randstate_t rands)
1261 {
1262 static struct param_t param;
1263 param.name = "HGCD_REDUCE_THRESHOLD";
1264 param.function = speed_mpn_hgcd_reduce;
1265 param.min_size = 30;
1266 param.max_size = 7000;
1267 param.step_factor = 0.04;
1268 one (&hgcd_reduce_threshold, rands, ¶m);
1269 }
1270
1271 void
tune_gcd_dc(gmp_randstate_t rands)1272 tune_gcd_dc (gmp_randstate_t rands)
1273 {
1274 static struct param_t param;
1275 param.name = "GCD_DC_THRESHOLD";
1276 param.function = speed_mpn_gcd;
1277 param.min_size = hgcd_threshold;
1278 param.max_size = 3000;
1279 param.step_factor = 0.02;
1280 one (&gcd_dc_threshold, rands, ¶m);
1281 }
1282
1283 void
tune_gcdext_dc(gmp_randstate_t rands)1284 tune_gcdext_dc (gmp_randstate_t rands)
1285 {
1286 static struct param_t param;
1287 param.name = "GCDEXT_DC_THRESHOLD";
1288 param.function = speed_mpn_gcdext;
1289 param.min_size = hgcd_threshold;
1290 param.max_size = 3000;
1291 param.step_factor = 0.02;
1292 one (&gcdext_dc_threshold, rands, ¶m);
1293 }
1294
1295
1296
1297 /* size_extra==1 reflects the fact that with high<divisor one division is
1298 always skipped. Forcing high<divisor while testing ensures consistency
1299 while stepping through sizes, ie. that size-1 divides will be done each
1300 time.
1301
1302 min_size==2 and min_is_always are used so that if plain division is only
1303 better at size==1 then don't bother including that code just for that
1304 case, instead go with preinv always and get a size saving. */
1305
1306 #define DIV_1_PARAMS \
1307 param.check_size = 256; \
1308 param.min_size = 2; \
1309 param.min_is_always = 1; \
1310 param.data_high = DATA_HIGH_LT_R; \
1311 param.size_extra = 1; \
1312 param.stop_factor = 2.0;
1313
1314
1315 double (*tuned_speed_mpn_divrem_1) _PROTO ((struct speed_params *s));
1316
1317 void
tune_divrem_1(gmp_randstate_t rands)1318 tune_divrem_1 (gmp_randstate_t rands)
1319 {
1320 /* plain version by default */
1321 tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1;
1322
1323 /* No support for tuning native assembler code, do that by hand and put
1324 the results in the .asm file, there's no need for such thresholds to
1325 appear in gmp-mparam.h. */
1326 if (HAVE_NATIVE_mpn_divrem_1)
1327 return;
1328
1329 if (GMP_NAIL_BITS != 0)
1330 {
1331 print_define_remark ("DIVREM_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
1332 "no preinv with nails");
1333 print_define_remark ("DIVREM_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
1334 "no preinv with nails");
1335 return;
1336 }
1337
1338 if (UDIV_PREINV_ALWAYS)
1339 {
1340 print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always");
1341 print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L);
1342 return;
1343 }
1344
1345 tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune;
1346
1347 /* Tune for the integer part of mpn_divrem_1. This will very possibly be
1348 a bit out for the fractional part, but that's too bad, the integer part
1349 is more important. */
1350 {
1351 static struct param_t param;
1352 param.name = "DIVREM_1_NORM_THRESHOLD";
1353 DIV_1_PARAMS;
1354 s.r = randlimb_norm (rands);
1355 param.function = speed_mpn_divrem_1_tune;
1356 one (&divrem_1_norm_threshold, rands, ¶m);
1357 }
1358 {
1359 static struct param_t param;
1360 param.name = "DIVREM_1_UNNORM_THRESHOLD";
1361 DIV_1_PARAMS;
1362 s.r = randlimb_half (rands);
1363 param.function = speed_mpn_divrem_1_tune;
1364 one (&divrem_1_unnorm_threshold, rands, ¶m);
1365 }
1366 }
1367
1368
1369 double (*tuned_speed_mpn_mod_1) _PROTO ((struct speed_params *s));
1370
1371 void
tune_mod_1(gmp_randstate_t rands)1372 tune_mod_1 (gmp_randstate_t rands)
1373 {
1374 /* plain version by default */
1375 tuned_speed_mpn_mod_1 = speed_mpn_mod_1;
1376
1377 /* No support for tuning native assembler code, do that by hand and put
1378 the results in the .asm file, there's no need for such thresholds to
1379 appear in gmp-mparam.h. */
1380 if (HAVE_NATIVE_mpn_mod_1)
1381 return;
1382
1383 if (GMP_NAIL_BITS != 0)
1384 {
1385 print_define_remark ("MOD_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
1386 "no preinv with nails");
1387 print_define_remark ("MOD_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
1388 "no preinv with nails");
1389 return;
1390 }
1391
1392 if (UDIV_PREINV_ALWAYS)
1393 {
1394 print_define ("MOD_1_NORM_THRESHOLD", 0L);
1395 print_define ("MOD_1_UNNORM_THRESHOLD", 0L);
1396 return;
1397 }
1398
1399 tuned_speed_mpn_mod_1 = speed_mpn_mod_1_tune;
1400
1401 {
1402 static struct param_t param;
1403 param.name = "MOD_1_NORM_THRESHOLD";
1404 DIV_1_PARAMS;
1405 s.r = randlimb_norm (rands);
1406 param.function = speed_mpn_mod_1_tune;
1407 one (&mod_1_norm_threshold, rands, ¶m);
1408 }
1409 {
1410 static struct param_t param;
1411 param.name = "MOD_1_UNNORM_THRESHOLD";
1412 DIV_1_PARAMS;
1413 s.r = randlimb_half (rands);
1414 param.function = speed_mpn_mod_1_tune;
1415 one (&mod_1_unnorm_threshold, rands, ¶m);
1416 }
1417 }
1418
1419
1420 /* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would
1421 imply that udiv_qrnnd_preinv is worth using, but it seems most
1422 straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div
1423 directly. */
1424
1425 void
tune_preinv_divrem_1(gmp_randstate_t rands)1426 tune_preinv_divrem_1 (gmp_randstate_t rands)
1427 {
1428 static struct param_t param;
1429 speed_function_t divrem_1;
1430 const char *divrem_1_name;
1431 double t1, t2;
1432
1433 if (GMP_NAIL_BITS != 0)
1434 {
1435 print_define_remark ("USE_PREINV_DIVREM_1", 0, "no preinv with nails");
1436 return;
1437 }
1438
1439 /* Any native version of mpn_preinv_divrem_1 is assumed to exist because
1440 it's faster than mpn_divrem_1. */
1441 if (HAVE_NATIVE_mpn_preinv_divrem_1)
1442 {
1443 print_define_remark ("USE_PREINV_DIVREM_1", 1, "native");
1444 return;
1445 }
1446
1447 /* If udiv_qrnnd_preinv is the only division method then of course
1448 mpn_preinv_divrem_1 should be used. */
1449 if (UDIV_PREINV_ALWAYS)
1450 {
1451 print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always");
1452 return;
1453 }
1454
1455 /* If we've got an assembler version of mpn_divrem_1, then compare against
1456 that, not the mpn_divrem_1_div generic C. */
1457 if (HAVE_NATIVE_mpn_divrem_1)
1458 {
1459 divrem_1 = speed_mpn_divrem_1;
1460 divrem_1_name = "mpn_divrem_1";
1461 }
1462 else
1463 {
1464 divrem_1 = speed_mpn_divrem_1_div;
1465 divrem_1_name = "mpn_divrem_1_div";
1466 }
1467
1468 param.data_high = DATA_HIGH_LT_R; /* allow skip one division */
1469 s.size = 200; /* generous but not too big */
1470 /* Divisor, nonzero. Unnormalized so as to exercise the shift!=0 case,
1471 since in general that's probably most common, though in fact for a
1472 64-bit limb mp_bases[10].big_base is normalized. */
1473 s.r = urandom(rands) & (GMP_NUMB_MASK >> 4);
1474 if (s.r == 0) s.r = 123;
1475
1476 t1 = tuneup_measure (speed_mpn_preinv_divrem_1, rands, ¶m, &s);
1477 t2 = tuneup_measure (divrem_1, rands, ¶m, &s);
1478 if (t1 == -1.0 || t2 == -1.0)
1479 {
1480 printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n",
1481 divrem_1_name, (long) s.size);
1482 abort ();
1483 }
1484 if (option_trace >= 1)
1485 printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n",
1486 (long) s.size, t1, divrem_1_name, t2);
1487
1488 print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL);
1489 }
1490
1491
1492 /* A non-zero MOD_1_UNNORM_THRESHOLD (or MOD_1_NORM_THRESHOLD) would imply
1493 that udiv_qrnnd_preinv is worth using, but it seems most straightforward
1494 to compare mpn_preinv_mod_1 and mpn_mod_1_div directly. */
1495
1496 void
tune_preinv_mod_1(gmp_randstate_t rands)1497 tune_preinv_mod_1 (gmp_randstate_t rands)
1498 {
1499 static struct param_t param;
1500 speed_function_t mod_1;
1501 const char *mod_1_name;
1502 double t1, t2;
1503
1504 /* Any native version of mpn_preinv_mod_1 is assumed to exist because it's
1505 faster than mpn_mod_1. */
1506 if (HAVE_NATIVE_mpn_preinv_mod_1)
1507 {
1508 print_define_remark ("USE_PREINV_MOD_1", 1, "native");
1509 return;
1510 }
1511
1512 if (GMP_NAIL_BITS != 0)
1513 {
1514 print_define_remark ("USE_PREINV_MOD_1", 0, "no preinv with nails");
1515 return;
1516 }
1517
1518 /* If udiv_qrnnd_preinv is the only division method then of course
1519 mpn_preinv_mod_1 should be used. */
1520 if (UDIV_PREINV_ALWAYS)
1521 {
1522 print_define_remark ("USE_PREINV_MOD_1", 1, "preinv always");
1523 return;
1524 }
1525
1526 /* If we've got an assembler version of mpn_mod_1, then compare against
1527 that, not the mpn_mod_1_div generic C. */
1528 if (HAVE_NATIVE_mpn_mod_1)
1529 {
1530 mod_1 = speed_mpn_mod_1;
1531 mod_1_name = "mpn_mod_1";
1532 }
1533 else
1534 {
1535 mod_1 = speed_mpn_mod_1_div;
1536 mod_1_name = "mpn_mod_1_div";
1537 }
1538
1539 param.data_high = DATA_HIGH_LT_R; /* let mpn_mod_1 skip one division */
1540 s.size = 200; /* generous but not too big */
1541 s.r = randlimb_norm(rands); /* divisor */
1542
1543 t1 = tuneup_measure (speed_mpn_preinv_mod_1, rands, ¶m, &s);
1544 t2 = tuneup_measure (mod_1, rands, ¶m, &s);
1545 if (t1 == -1.0 || t2 == -1.0)
1546 {
1547 printf ("Oops, can't measure mpn_preinv_mod_1 and %s at %ld\n",
1548 mod_1_name, (long) s.size);
1549 abort ();
1550 }
1551 if (option_trace >= 1)
1552 printf ("size=%ld, mpn_preinv_mod_1 %.9f, %s %.9f\n",
1553 (long) s.size, t1, mod_1_name, t2);
1554
1555 print_define_remark ("USE_PREINV_MOD_1", (mp_size_t) (t1 < t2), NULL);
1556 }
1557
1558
1559 void
tune_divrem_2(gmp_randstate_t rands)1560 tune_divrem_2 (gmp_randstate_t rands)
1561 {
1562 static struct param_t param;
1563
1564 /* No support for tuning native assembler code, do that by hand and put
1565 the results in the .asm file, and there's no need for such thresholds
1566 to appear in gmp-mparam.h. */
1567 if (HAVE_NATIVE_mpn_divrem_2)
1568 return;
1569
1570 if (GMP_NAIL_BITS != 0)
1571 {
1572 print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX,
1573 "no preinv with nails");
1574 return;
1575 }
1576
1577 if (UDIV_PREINV_ALWAYS)
1578 {
1579 print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always");
1580 return;
1581 }
1582
1583 /* Tune for the integer part of mpn_divrem_2. This will very possibly be
1584 a bit out for the fractional part, but that's too bad, the integer part
1585 is more important.
1586
1587 min_size must be >=2 since nsize>=2 is required, but is set to 4 to save
1588 code space if plain division is better only at size==2 or size==3. */
1589 param.name = "DIVREM_2_THRESHOLD";
1590 param.check_size = 256;
1591 param.min_size = 4;
1592 param.min_is_always = 1;
1593 param.size_extra = 2; /* does qsize==nsize-2 divisions */
1594 param.stop_factor = 2.0;
1595
1596 s.r = randlimb_norm (rands);
1597 param.function = speed_mpn_divrem_2;
1598 one (&divrem_2_threshold, rands, ¶m);
1599 }
1600
1601
1602 /* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so
1603 tune for that. Its speed can differ on odd or even divisor, so take an
1604 average threshold for the two.
1605
1606 mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1
1607 might not vary that way, but don't test this since high<divisor isn't
1608 expected to occur often with small divisors. */
1609
1610 void
tune_divexact_1(gmp_randstate_t rands)1611 tune_divexact_1 (gmp_randstate_t rands)
1612 {
1613 static struct param_t param;
1614 mp_size_t thresh[2], average;
1615 int low, i;
1616
1617 /* Any native mpn_divexact_1 is assumed to incorporate all the speed of a
1618 full mpn_divrem_1. */
1619 if (HAVE_NATIVE_mpn_divexact_1)
1620 {
1621 print_define_remark ("DIVEXACT_1_THRESHOLD", 0, "always (native)");
1622 return;
1623 }
1624
1625 ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL);
1626
1627 param.name = "DIVEXACT_1_THRESHOLD";
1628 param.data_high = DATA_HIGH_GE_R;
1629 param.check_size = 256;
1630 param.min_size = 2;
1631 param.stop_factor = 1.5;
1632 param.function = tuned_speed_mpn_divrem_1;
1633 param.function2 = speed_mpn_divexact_1;
1634 param.noprint = 1;
1635
1636 print_define_start (param.name);
1637
1638 for (low = 0; low <= 1; low++)
1639 {
1640 s.r = randlimb_half(rands);
1641 if (low == 0)
1642 s.r |= 1;
1643 else
1644 s.r &= ~CNST_LIMB(7);
1645
1646 one (&thresh[low], rands, ¶m);
1647 if (option_trace)
1648 printf ("low=%d thresh %ld\n", low, (long) thresh[low]);
1649
1650 if (thresh[low] == MP_SIZE_T_MAX)
1651 {
1652 average = MP_SIZE_T_MAX;
1653 goto divexact_1_done;
1654 }
1655 }
1656
1657 if (option_trace)
1658 {
1659 printf ("average of:");
1660 for (i = 0; i < numberof(thresh); i++)
1661 printf (" %ld", (long) thresh[i]);
1662 printf ("\n");
1663 }
1664
1665 average = 0;
1666 for (i = 0; i < numberof(thresh); i++)
1667 average += thresh[i];
1668 average /= numberof(thresh);
1669
1670 /* If divexact turns out to be better as early as 3 limbs, then use it
1671 always, so as to reduce code size and conditional jumps. */
1672 if (average <= 3)
1673 average = 0;
1674
1675 divexact_1_done:
1676 print_define_end (param.name, average);
1677 }
1678
1679
1680 /* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the
1681 same as mpn_mod_1, but this might not be true of an assembler
1682 implementation. The threshold used is an average based on data where a
1683 divide can be skipped and where it can't.
1684
1685 If modexact turns out to be better as early as 3 limbs, then use it
1686 always, so as to reduce code size and conditional jumps. */
1687
1688 void
tune_modexact_1_odd(gmp_randstate_t rands)1689 tune_modexact_1_odd (gmp_randstate_t rands)
1690 {
1691 static struct param_t param;
1692 mp_size_t thresh_lt, thresh_ge, average;
1693
1694 /* Any native mpn_modexact_1_odd is assumed to incorporate all the speed
1695 of a full mpn_mod_1. */
1696 if (HAVE_NATIVE_mpn_modexact_1_odd)
1697 {
1698 print_define_remark ("MODEXACT_1_ODD_THRESHOLD", 0, "always (native)");
1699 return;
1700 }
1701
1702 ASSERT_ALWAYS (tuned_speed_mpn_mod_1 != NULL);
1703
1704 param.name = "MODEXACT_1_ODD_THRESHOLD";
1705 param.check_size = 256;
1706 param.min_size = 2;
1707 param.stop_factor = 1.5;
1708 param.function = tuned_speed_mpn_mod_1;
1709 param.function2 = speed_mpn_modexact_1c_odd;
1710 param.noprint = 1;
1711 s.r = randlimb_half (rands) | 1;
1712
1713 print_define_start (param.name);
1714
1715 param.data_high = DATA_HIGH_LT_R;
1716 one (&thresh_lt, rands, ¶m);
1717 if (option_trace)
1718 printf ("lt thresh %ld\n", (long) thresh_lt);
1719
1720 average = thresh_lt;
1721 if (thresh_lt != MP_SIZE_T_MAX)
1722 {
1723 param.data_high = DATA_HIGH_GE_R;
1724 one (&thresh_ge, rands, ¶m);
1725 if (option_trace)
1726 printf ("ge thresh %ld\n", (long) thresh_ge);
1727
1728 if (thresh_ge != MP_SIZE_T_MAX)
1729 {
1730 average = (thresh_ge + thresh_lt) / 2;
1731 if (thresh_ge <= 3)
1732 average = 0;
1733 }
1734 }
1735
1736 print_define_end (param.name, average);
1737 }
1738
1739
1740 void
tune_jacobi_base(gmp_randstate_t rands)1741 tune_jacobi_base (gmp_randstate_t rands)
1742 {
1743 static struct param_t param;
1744 double t1, t2, t3, t4;
1745 int method;
1746
1747 s.size = GMP_LIMB_BITS * 3 / 4;
1748
1749 t1 = tuneup_measure (speed_mpn_jacobi_base_1, rands, ¶m, &s);
1750 if (option_trace >= 1)
1751 printf ("size=%ld, mpn_jacobi_base_1 %.9f\n", (long) s.size, t1);
1752
1753 t2 = tuneup_measure (speed_mpn_jacobi_base_2, rands, ¶m, &s);
1754 if (option_trace >= 1)
1755 printf ("size=%ld, mpn_jacobi_base_2 %.9f\n", (long) s.size, t2);
1756
1757 t3 = tuneup_measure (speed_mpn_jacobi_base_3, rands, ¶m, &s);
1758 if (option_trace >= 1)
1759 printf ("size=%ld, mpn_jacobi_base_3 %.9f\n", (long) s.size, t3);
1760
1761 t4 = tuneup_measure (speed_mpn_jacobi_base_4, rands, ¶m, &s);
1762 if (option_trace >= 1)
1763 printf ("size=%ld, mpn_jacobi_base_4 %.9f\n", (long) s.size, t4);
1764
1765 if (t1 == -1.0 || t2 == -1.0 || t3 == -1.0 || t4 == -1.0)
1766 {
1767 printf ("Oops, can't measure all mpn_jacobi_base methods at %ld\n",
1768 (long) s.size);
1769 abort ();
1770 }
1771
1772 if (t1 < t2 && t1 < t3 && t1 < t4)
1773 method = 1;
1774 else if (t2 < t3 && t2 < t4)
1775 method = 2;
1776 else if (t3 < t4)
1777 method = 3;
1778 else
1779 method = 4;
1780
1781 print_define ("JACOBI_BASE_METHOD", method);
1782 }
1783
1784
1785
1786 void
tune_get_str(gmp_randstate_t rands)1787 tune_get_str (gmp_randstate_t rands)
1788 {
1789 /* Tune for decimal, it being most common. Some rough testing suggests
1790 other bases are different, but not by very much. */
1791 s.r = 10;
1792 {
1793 static struct param_t param;
1794 GET_STR_PRECOMPUTE_THRESHOLD = 0;
1795 param.name = "GET_STR_DC_THRESHOLD";
1796 param.function = speed_mpn_get_str;
1797 param.min_size = 4;
1798 param.max_size = GET_STR_THRESHOLD_LIMIT;
1799 one (&get_str_dc_threshold, rands, ¶m);
1800 }
1801 {
1802 static struct param_t param;
1803 param.name = "GET_STR_PRECOMPUTE_THRESHOLD";
1804 param.function = speed_mpn_get_str;
1805 param.min_size = GET_STR_DC_THRESHOLD;
1806 param.max_size = GET_STR_THRESHOLD_LIMIT;
1807 one (&get_str_precompute_threshold, rands, ¶m);
1808 }
1809 }
1810
1811 double
speed_mpn_pre_set_str(struct speed_params * s)1812 speed_mpn_pre_set_str (struct speed_params *s)
1813 {
1814 unsigned char *str;
1815 mp_ptr wp;
1816 mp_size_t wn;
1817 unsigned i;
1818 int base;
1819 double t;
1820 mp_ptr powtab_mem, tp;
1821 powers_t powtab[GMP_LIMB_BITS];
1822 mp_size_t un;
1823 int chars_per_limb;
1824 TMP_DECL;
1825
1826 SPEED_RESTRICT_COND (s->size >= 1);
1827
1828 base = s->r == 0 ? 10 : s->r;
1829 SPEED_RESTRICT_COND (base >= 2 && base <= 256);
1830
1831 TMP_MARK;
1832
1833 str = TMP_ALLOC (s->size);
1834 for (i = 0; i < s->size; i++)
1835 str[i] = s->xp[i] % base;
1836
1837 wn = ((mp_size_t) (s->size / mp_bases[base].chars_per_bit_exactly))
1838 / GMP_LIMB_BITS + 2;
1839 SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);
1840
1841 /* use this during development to check wn is big enough */
1842 /*
1843 ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn);
1844 */
1845
1846 speed_operand_src (s, (mp_ptr) str, s->size/BYTES_PER_MP_LIMB);
1847 speed_operand_dst (s, wp, wn);
1848 speed_cache_fill (s);
1849
1850 chars_per_limb = mp_bases[base].chars_per_limb;
1851 un = s->size / chars_per_limb + 1;
1852 powtab_mem = TMP_BALLOC_LIMBS (mpn_dc_set_str_powtab_alloc (un));
1853 mpn_set_str_compute_powtab (powtab, powtab_mem, un, base);
1854 tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un));
1855
1856 speed_starttime ();
1857 i = s->reps;
1858 do
1859 {
1860 mpn_pre_set_str (wp, str, s->size, powtab, tp);
1861 }
1862 while (--i != 0);
1863 t = speed_endtime ();
1864
1865 TMP_FREE;
1866 return t;
1867 }
1868
1869 void
tune_set_str(gmp_randstate_t rands)1870 tune_set_str (gmp_randstate_t rands)
1871 {
1872 s.r = 10; /* decimal */
1873 {
1874 static struct param_t param;
1875 SET_STR_PRECOMPUTE_THRESHOLD = 0;
1876 param.step_factor = 0.01;
1877 param.name = "SET_STR_DC_THRESHOLD";
1878 param.function = speed_mpn_pre_set_str;
1879 param.min_size = 100;
1880 param.max_size = 50000;
1881 one (&set_str_dc_threshold, rands, ¶m);
1882 }
1883 {
1884 static struct param_t param;
1885 param.step_factor = 0.02;
1886 param.name = "SET_STR_PRECOMPUTE_THRESHOLD";
1887 param.function = speed_mpn_set_str;
1888 param.min_size = SET_STR_DC_THRESHOLD;
1889 param.max_size = 100000;
1890 one (&set_str_precompute_threshold, rands, ¶m);
1891 }
1892 }
1893
1894 void
tune_fft(gmp_randstate_t state)1895 tune_fft(gmp_randstate_t state)
1896 {
1897 mp_bitcnt_t depth, w, depth1, w1;
1898 clock_t start, end;
1899 double elapsed;
1900 double best = 0.0;
1901 mp_size_t best_off, off, best_d, best_w, num_twos, num_printed;
1902
1903 if (option_fft_max_size == 0)
1904 return;
1905
1906 printf("/* fft_tuning -- autogenerated by tune-fft */\n\n");
1907 printf("#define FFT_TAB \\\n");
1908 fflush(stdout);
1909
1910 printf(" { "); fflush(stdout);
1911 for (depth = 6; depth <= 10; depth++)
1912 {
1913 printf("{ "); fflush(stdout);
1914 for (w = 1; w <= 2; w++)
1915 {
1916 int iters = 100*((mp_size_t) 1 << (3*(10 - depth)/2)), i;
1917
1918 mp_size_t n = ((mp_limb_t)1<<depth);
1919 mp_bitcnt_t bits1 = (n*w - (depth + 1))/2;
1920 mp_size_t len1 = 2*n;
1921 mp_size_t len2 = 2*n;
1922
1923 mp_bitcnt_t b1 = len1*bits1, b2 = len2*bits1;
1924 mp_size_t n1, n2;
1925 mp_size_t j;
1926 mp_limb_t * i1, *i2, *r1;
1927
1928 n1 = (b1 - 1)/GMP_LIMB_BITS + 1;
1929 n2 = (b2 - 1)/GMP_LIMB_BITS + 1;
1930
1931 i1 = malloc(2*(n1 + n2)*sizeof(mp_limb_t));
1932 i2 = i1 + n1;
1933 r1 = i2 + n2;
1934
1935 mpn_urandomb(i1, state, b1);
1936 mpn_urandomb(i2, state, b2);
1937
1938 best_off = -1;
1939
1940 for (off = 0; off <= 4; off++)
1941 {
1942 start = clock();
1943 for (i = 0; i < iters; i++)
1944 mpn_mul_trunc_sqrt2(r1, i1, n1, i2, n2, depth - off, w*((mp_size_t)1 << (off*2)));
1945 end = clock();
1946
1947 elapsed = ((double) (end - start)) / CLOCKS_PER_SEC;
1948
1949 if (elapsed < best || best_off == -1)
1950 {
1951 best_off = off;
1952 best = elapsed;
1953 }
1954 }
1955
1956 printf("%ld", best_off);
1957 if (w != 2) printf(",");
1958 printf(" "); fflush(stdout);
1959
1960 free(i1);
1961 }
1962 printf("}");
1963 if (depth != 10) printf(",");
1964 printf(" "); fflush(stdout);
1965 }
1966
1967 printf("}\n\n");
1968
1969 best_d = 12;
1970 best_w = 1;
1971 best_off = -1;
1972 num_printed = 0;
1973 num_twos = 0;
1974
1975 printf("#define MULMOD_TAB \\\n");
1976 fflush(stdout);
1977 printf(" { "); fflush(stdout);
1978 for (depth = 12; best_off != 1 && !(num_printed >= 25 && best_off == 2 && num_twos >= 5) ; depth++)
1979 {
1980 for (w = 1; w <= 2; w++)
1981 {
1982 int iters = 100*((mp_size_t) 1 << (3*(18 - depth)/2)), i;
1983 mp_size_t n = ((mp_limb_t)1<<depth);
1984 mp_bitcnt_t bits = n*w;
1985 mp_size_t int_limbs = (bits - 1)/GMP_LIMB_BITS + 1;
1986 mp_size_t j;
1987 mp_limb_t c, * i1, * i2, * r1, * tt;
1988
1989 if (depth <= 21) iters = 32*((mp_size_t) 1 << (21 - depth));
1990 else iters = MAX(32/((mp_size_t) 1 << (depth - 21)), 1);
1991
1992 i1 = malloc(6*(int_limbs+1)*sizeof(mp_limb_t));
1993 i2 = i1 + int_limbs + 1;
1994 r1 = i2 + int_limbs + 1;
1995 tt = r1 + 2*(int_limbs + 1);
1996
1997 mpn_urandomb(i1, state, int_limbs*GMP_LIMB_BITS);
1998 mpn_urandomb(i2, state, int_limbs*GMP_LIMB_BITS);
1999 i1[int_limbs] = 0;
2000 i2[int_limbs] = 0;
2001
2002 depth1 = 1;
2003 while ((((mp_limb_t)1)<<depth1) < bits) depth1++;
2004 depth1 = depth1/2;
2005
2006 w1 = bits/(((mp_limb_t)1)<<(2*depth1));
2007
2008 best_off = -1;
2009
2010 for (off = 0; off <= 4; off++)
2011 {
2012 start = clock();
2013 for (i = 0; i < iters; i++)
2014 mpir_fft_mulmod_2expp1(r1, i1, i2, int_limbs, depth1 - off, w1*((mp_size_t)1 << (off*2)));
2015 end = clock();
2016
2017 elapsed = ((double) (end - start)) / CLOCKS_PER_SEC;
2018
2019 if (best_off == -1 || elapsed < best)
2020 {
2021 best_off = off;
2022 best = elapsed;
2023 }
2024 }
2025
2026 start = clock();
2027 for (i = 0; i < iters; i++)
2028 mpn_mulmod_2expp1_basecase(r1, i1, i2, 0, bits, tt);
2029 end = clock();
2030
2031 elapsed = ((double) (end - start)) / CLOCKS_PER_SEC;
2032 if (elapsed < best)
2033 {
2034 best_d = depth + (w == 2);
2035 best_w = w + 1 - 2*(w == 2);
2036 }
2037
2038 printf("%ld", best_off);
2039 if (best_off == 2)
2040 num_twos++;
2041 else
2042 num_twos = 0;
2043 num_printed++;
2044 if (w != 2) printf(", "); fflush(stdout);
2045
2046 free(i1);
2047 }
2048 printf(", "); fflush(stdout);
2049 }
2050 if (best_off == 2)
2051 {
2052 printf("2, 2, 2, 2, 2, 1, 1 }\n\n");
2053 num_printed += 6;
2054 } else
2055 printf("1 }\n\n");
2056
2057 printf("#define FFT_N_NUM %ld\n\n", num_printed + 1);
2058
2059 printf("#define FFT_MULMOD_2EXPP1_CUTOFF %ld\n\n", ((mp_limb_t) 1 << best_d)*best_w/(2*GMP_LIMB_BITS));
2060 }
2061
2062 void
tune_fac_ui(gmp_randstate_t rands)2063 tune_fac_ui (gmp_randstate_t rands)
2064 {
2065 static struct param_t param;
2066
2067 param.function = speed_mpz_fac_ui_tune;
2068
2069 param.name = "FAC_DSC_THRESHOLD";
2070 param.min_size = 70;
2071 param.max_size = FAC_DSC_THRESHOLD_LIMIT;
2072 one (&fac_dsc_threshold, rands, ¶m);
2073
2074 param.name = "FAC_ODD_THRESHOLD";
2075 param.min_size = 22;
2076 param.stop_factor = 1.7;
2077 param.min_is_always = 1;
2078 one (&fac_odd_threshold, rands, ¶m);
2079 }
2080
2081 void
tune_fft_mul(gmp_randstate_t rands)2082 tune_fft_mul (gmp_randstate_t rands)
2083 {
2084 static struct fft_param_t param;
2085
2086 if (option_fft_max_size == 0)
2087 return;
2088
2089 param.threshold_name = "MUL_FFT_FULL_THRESHOLD";
2090 param.p_threshold = &mul_fft_full_threshold;
2091 param.first_size = MUL_TOOM8H_THRESHOLD / 2;
2092 param.max_size = option_fft_max_size;
2093 param.function = speed_mpn_mul_fft_main;
2094 param.mul_function = speed_mpn_mul_n;
2095 param.sqr = 0;
2096 fft (¶m,rands);
2097 }
2098
2099
2100 void
tune_fft_sqr(gmp_randstate_t rands)2101 tune_fft_sqr (gmp_randstate_t rands)
2102 {
2103 static struct fft_param_t param;
2104
2105 if (option_fft_max_size == 0)
2106 return;
2107
2108 param.threshold_name = "SQR_FFT_FULL_THRESHOLD";
2109 param.p_threshold = &sqr_fft_full_threshold;
2110 param.first_size = SQR_TOOM8_THRESHOLD / 2;
2111 param.max_size = option_fft_max_size;
2112 param.function = speed_mpn_sqr_fft_main;
2113 param.mul_function = speed_mpn_sqr;
2114 param.sqr = 0;
2115 fft (¶m,rands);
2116 }
2117
2118 #ifdef _MSC_VER
2119 #define GMP_MPARAM_H_SUGGEST "vc_gmp_mparam.h"
2120 #endif
2121
2122 void
all(gmp_randstate_t rands)2123 all (gmp_randstate_t rands)
2124 {
2125 time_t start_time, end_time;
2126 TMP_DECL;
2127
2128 TMP_MARK;
2129 SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0);
2130 SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0);
2131
2132 mpn_randomb (s.xp_block, rands, SPEED_BLOCK_SIZE);
2133 mpn_randomb (s.yp_block, rands, SPEED_BLOCK_SIZE);
2134
2135 fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST);
2136
2137 speed_time_init ();
2138 fprintf (stderr, "Using: %s\n", speed_time_string);
2139
2140 fprintf (stderr, "speed_precision %d", speed_precision);
2141 if (speed_unittime == 1.0)
2142 fprintf (stderr, ", speed_unittime 1 cycle");
2143 else
2144 fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime);
2145 if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
2146 fprintf (stderr, ", CPU freq unknown\n");
2147 else
2148 fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
2149
2150 fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n",
2151 DEFAULT_MAX_SIZE, (long) option_fft_max_size);
2152 fprintf (stderr, "\n");
2153
2154 time (&start_time);
2155 {
2156 struct tm *tp;
2157 tp = localtime (&start_time);
2158 printf ("/* Generated by tuneup.c, %d-%02d-%02d, ",
2159 tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
2160
2161 #ifdef __GNUC__
2162 /* gcc sub-minor version doesn't seem to come through as a define */
2163 printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__);
2164 #define PRINTED_COMPILER
2165 #endif
2166 #if defined (__SUNPRO_C)
2167 printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100);
2168 #define PRINTED_COMPILER
2169 #endif
2170 #if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION)
2171 /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */
2172 printf ("MIPSpro C %d.%d.%d */\n",
2173 _COMPILER_VERSION / 100,
2174 _COMPILER_VERSION / 10 % 10,
2175 _COMPILER_VERSION % 10);
2176 #define PRINTED_COMPILER
2177 #endif
2178 #if defined (__DECC) && defined (__DECC_VER)
2179 printf ("DEC C %d */\n", __DECC_VER);
2180 #define PRINTED_COMPILER
2181 #endif
2182 #if ! defined (PRINTED_COMPILER)
2183 printf ("system compiler */\n");
2184 #endif
2185 }
2186 printf ("\n");
2187
2188 tune_mul (rands);
2189 printf("\n");
2190
2191 tune_sqr (rands);
2192 printf("\n");
2193
2194 tune_divrem_1 (rands);
2195 tune_mod_1 (rands);
2196 tune_preinv_divrem_1 (rands);
2197 tune_preinv_mod_1 (rands);
2198 tune_divrem_2 (rands);
2199 tune_divexact_1 (rands);
2200 tune_modexact_1_odd (rands);
2201 tune_mod_1_k(rands);
2202 tune_divrem_hensel_qr_1(rands);
2203 tune_rsh_divrem_hensel_qr_1(rands);
2204 tune_divrem_euclid_hensel(rands);
2205 printf("\n");
2206
2207 tune_fft_mul (rands);
2208 printf("\n");
2209
2210 tune_fft_sqr (rands);
2211 printf ("\n");
2212
2213 tune_mullow (rands);
2214 printf("\n");
2215 tune_mulmid (rands);
2216 printf("\n");
2217 tune_mulhigh (rands);
2218 printf("\n");
2219
2220 tune_mulmod_2expm1(rands);
2221 printf("\n");
2222
2223 /* dc_div_qr_n, dc_divappr_q, inv_div_qr, inv_divappr_q */
2224 tune_dc_div (rands);
2225
2226 /* mpn_tdiv_q : balanced */
2227 tune_tdiv_q (rands);
2228
2229 /* dc_bdiv_qr_n, dc_bdiv_q */
2230 tune_dc_bdiv (rands);
2231 printf("\n");
2232
2233 tune_binvert (rands);
2234 tune_redc (rands);
2235 printf("\n");
2236
2237 tune_rootrem(rands);
2238 printf("\n");
2239
2240 tune_matrix22_mul (rands);
2241 tune_hgcd (rands);
2242 tune_hgcd_appr (rands);
2243 tune_hgcd_reduce(rands);
2244 tune_gcd_dc (rands);
2245 tune_gcdext_dc (rands);
2246 tune_jacobi_base (rands);
2247 printf("\n");
2248
2249 tune_get_str (rands);
2250 tune_set_str (rands);
2251 printf("\n");
2252
2253 tune_fac_ui (rands);
2254 printf("\n");
2255
2256 tune_fft (rands);
2257 printf("\n");
2258
2259 time (&end_time);
2260 printf ("/* Tuneup completed successfully, took %ld seconds */\n",
2261 end_time - start_time);
2262
2263 TMP_FREE;
2264 }
2265
2266
2267 int
main(int argc,char * argv[])2268 main (int argc, char *argv[])
2269 {
2270 int opt;
2271 gmp_randstate_t rands;
2272
2273 gmp_randinit_default(rands);
2274 /* Unbuffered so if output is redirected to a file it isn't lost if the
2275 program is killed part way through. */
2276 setbuf (stdout, NULL);
2277 setbuf (stderr, NULL);
2278
2279 while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF)
2280 {
2281 switch (opt) {
2282 case 'f':
2283 if (optarg[0] == 't')
2284 option_fft_trace = 2;
2285 else
2286 option_fft_max_size = atol (optarg);
2287 break;
2288 case 'o':
2289 speed_option_set (optarg);
2290 break;
2291 case 'p':
2292 speed_precision = atoi (optarg);
2293 break;
2294 case 't':
2295 option_trace++;
2296 break;
2297 case '?':
2298 exit(1);
2299 }
2300 }
2301
2302 all (rands);
2303 exit (0);
2304 }
2305