1 /* Shared speed subroutines.
2 
3 Copyright 1999-2006, 2008-2015 Free Software Foundation, Inc.
4 
5 This file is part of the GNU MP Library.
6 
7 The GNU MP Library is free software; you can redistribute it and/or modify
8 it under the terms of either:
9 
10   * the GNU Lesser General Public License as published by the Free
11     Software Foundation; either version 3 of the License, or (at your
12     option) any later version.
13 
14 or
15 
16   * the GNU General Public License as published by the Free Software
17     Foundation; either version 2 of the License, or (at your option) any
18     later version.
19 
20 or both in parallel, as here.
21 
22 The GNU MP Library is distributed in the hope that it will be useful, but
23 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25 for more details.
26 
27 You should have received copies of the GNU General Public License and the
28 GNU Lesser General Public License along with the GNU MP Library.  If not,
29 see https://www.gnu.org/licenses/.  */
30 
31 #define __GMP_NO_ATTRIBUTE_CONST_PURE
32 
33 #include <errno.h>
34 #include <fcntl.h>
35 #include <math.h>
36 #include <stdio.h>
37 #include <stdlib.h> /* for qsort */
38 #include <string.h>
39 #include <unistd.h>
40 #if 0
41 #include <sys/ioctl.h>
42 #endif
43 
44 #include "gmp.h"
45 #include "gmp-impl.h"
46 #include "longlong.h"
47 
48 #include "tests.h"
49 #include "speed.h"
50 
51 
/* Set to 1 by the "addrs" option in speed_option_set().  When non-zero,
   speed_cache_fill() prints the operand addresses whenever they change.  */
int   speed_option_addrs = 0;
/* Verbosity level: incremented by the "verbose" option and set directly by
   "verbose=N" in speed_option_set().  speed_measure() prints every timing
   attempt when this is 3 or more.  */
int   speed_option_verbose = 0;
/* Set to 1 by the "cycles-broken" option in speed_option_set().  It is not
   read anywhere in this part of the file.  */
int   speed_option_cycles_broken = 0;
55 
56 
57 /* Provide __clz_tab even if it's not required, for the benefit of new code
58    being tested with many.pl. */
59 #ifndef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
60 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
61 #include "mp_clz_tab.c"
62 #undef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
63 #endif
64 
65 
/* Hook called by speed_cache_fill() when s->cache is 1.

   Both candidate implementations below are disabled with "#if 0", so as
   compiled this function has an empty body and does nothing.  */
void
pentium_wbinvd(void)
{
#if 0
  /* Disabled: open /dev/wbinvd once (fd starts at the sentinel -2; -1 means
     the open failed and is not retried) and issue an ioctl on every call.  */
  {
    static int  fd = -2;

    if (fd == -2)
      {
	fd = open ("/dev/wbinvd", O_RDWR);
	if (fd == -1)
	  perror ("open /dev/wbinvd");
      }

    if (fd != -1)
      ioctl (fd, 0, 0);
  }
#endif

#if 0
  /* Disabled: read through a WBINVDSIZE byte buffer, summing the bytes and
     passing the sum to mpn_cache_fill_dummy().  The buffer is allocated on
     first use and never freed; the malloc result is not checked.  */
#define WBINVDSIZE  1024*1024*2
  {
    static char  *p = NULL;
    int   i, sum;

    if (p == NULL)
      p = malloc (WBINVDSIZE);

#if 0
    for (i = 0; i < WBINVDSIZE; i++)
      p[i] = i & 0xFF;
#endif

    sum = 0;
    for (i = 0; i < WBINVDSIZE; i++)
      sum += p[i];

    mpn_cache_fill_dummy (sum);
  }
#endif
}
107 
108 
/* Three-way comparison of the doubles that P and Q point at, in the shape
   expected of a qsort() callback: 1 when *p is greater, -1 when it is
   smaller, 0 otherwise.  */
int
double_cmp_ptr (const double *p, const double *q)
{
  const double  lhs = *p;
  const double  rhs = *q;

  /* Each comparison yields 0 or 1, so the difference is -1, 0 or 1.  */
  return (lhs > rhs) - (lhs < rhs);
}
116 
117 
118 /* Measure the speed of a given routine.
119 
120    The routine is run with enough repetitions to make it take at least
121    speed_precision * speed_unittime.  This aims to minimize the effects of a
122    limited accuracy time base and the overhead of the measuring itself.
123 
124    Measurements are made looking for 4 results within TOLERANCE of each
125    other (or 3 for routines taking longer than 2 seconds).  This aims to get
126    an accurate reading even if some runs are bloated by interrupts or task
127    switches or whatever.
128 
129    The given (*fun)() is expected to run its function "s->reps" many times
130    and return the total elapsed time measured using speed_starttime() and
131    speed_endtime().  If the function doesn't support the given s->size or
132    s->r, -1.0 should be returned.  See the various base routines below.  */
133 
134 double
speed_measure(double (* fun)(struct speed_params * s),struct speed_params * s)135 speed_measure (double (*fun) (struct speed_params *s), struct speed_params *s)
136 {
137 #define TOLERANCE    1.01  /* 1% */
138   const int max_zeros = 10;
139 
140   struct speed_params  s_dummy;
141   int     i, j, e;
142   double  t[30];
143   double  t_unsorted[30];
144   double  reps_d;
145   int     zeros = 0;
146 
147   /* Use dummy parameters if caller doesn't provide any.  Only a few special
148      "fun"s will cope with this, speed_noop() is one.  */
149   if (s == NULL)
150     {
151       memset (&s_dummy, '\0', sizeof (s_dummy));
152       s = &s_dummy;
153     }
154 
155   s->reps = 1;
156   s->time_divisor = 1.0;
157   for (i = 0; i < numberof (t); i++)
158     {
159       for (;;)
160 	{
161 	  s->src_num = 0;
162 	  s->dst_num = 0;
163 
164 	  t[i] = (*fun) (s);
165 
166 	  if (speed_option_verbose >= 3)
167 	    gmp_printf("size=%ld reps=%u r=%Md attempt=%d  %.9f\n",
168 		       (long) s->size, s->reps, s->r, i, t[i]);
169 
170 	  if (t[i] == 0.0)
171 	    {
172 	      zeros++;
173 	      if (zeros > max_zeros)
174 		{
175 		  fprintf (stderr, "Fatal error: too many (%d) failed measurements (0.0)\n", zeros);
176 		  abort ();
177 		}
178 	     if (s->reps < 10000)
179 	       s->reps *= 2;
180 
181 	      continue;
182 	    }
183 
184 	  if (t[i] == -1.0)
185 	    return -1.0;
186 
187 	  if (t[i] >= speed_unittime * speed_precision)
188 	    break;
189 
190 	  /* go to a value of reps to make t[i] >= precision */
191 	  reps_d = ceil (1.1 * s->reps
192 			 * speed_unittime * speed_precision
193 			 / MAX (t[i], speed_unittime));
194 	  if (reps_d > 2e9 || reps_d < 1.0)
195 	    {
196 	      fprintf (stderr, "Fatal error: new reps bad: %.2f\n", reps_d);
197 	      fprintf (stderr, "  (old reps %u, unittime %.4g, precision %d, t[i] %.4g)\n",
198 		       s->reps, speed_unittime, speed_precision, t[i]);
199 	      abort ();
200 	    }
201 	  s->reps = (unsigned) reps_d;
202 	}
203       t[i] /= s->reps;
204       t_unsorted[i] = t[i];
205 
206       if (speed_precision == 0)
207 	return t[i];
208 
209       /* require 3 values within TOLERANCE when >= 2 secs, 4 when below */
210       if (t[0] >= 2.0)
211 	e = 3;
212       else
213 	e = 4;
214 
215       /* Look for e many t[]'s within TOLERANCE of each other to consider a
216 	 valid measurement.  Return smallest among them.  */
217       if (i >= e)
218 	{
219 	  qsort (t, i+1, sizeof(t[0]), (qsort_function_t) double_cmp_ptr);
220 	  for (j = e-1; j < i; j++)
221 	    if (t[j] <= t[j-e+1] * TOLERANCE)
222 	      return t[j-e+1] / s->time_divisor;
223 	}
224     }
225 
226   fprintf (stderr, "speed_measure() could not get %d results within %.1f%%\n",
227 	   e, (TOLERANCE-1.0)*100.0);
228   fprintf (stderr, "    unsorted         sorted\n");
229   fprintf (stderr, "  %.12f    %.12f    is about 0.5%%\n",
230 	   t_unsorted[0]*(TOLERANCE-1.0), t[0]*(TOLERANCE-1.0));
231   for (i = 0; i < numberof (t); i++)
232     fprintf (stderr, "  %.09f       %.09f\n", t_unsorted[i], t[i]);
233 
234   return -1.0;
235 }
236 
237 
238 /* Read all of ptr,size to get it into the CPU memory cache.
239 
240    A call to mpn_cache_fill_dummy() is used to make sure the compiler
241    doesn't optimize away the whole loop.  Using "volatile mp_limb_t sum"
242    would work too, but the function call means we don't rely on every
243    compiler actually implementing volatile properly.
244 
245    mpn_cache_fill_dummy() is in a separate source file to stop gcc thinking
246    it can inline it.  */
247 
248 void
mpn_cache_fill(mp_srcptr ptr,mp_size_t size)249 mpn_cache_fill (mp_srcptr ptr, mp_size_t size)
250 {
251   mp_limb_t  sum = 0;
252   mp_size_t  i;
253 
254   for (i = 0; i < size; i++)
255     sum += ptr[i];
256 
257   mpn_cache_fill_dummy(sum);
258 }
259 
260 
/* Cache priming for a destination area {ptr,size}.

   As compiled this only calls mpn_cache_fill(), so the limbs are read and
   nothing is stored.  The two alternatives that would actually write to the
   area (filling it with mpn_random, or storing the index into each limb)
   are disabled with "#if 0".  */
void
mpn_cache_fill_write (mp_ptr ptr, mp_size_t size)
{
  mpn_cache_fill (ptr, size);

#if 0
  mpn_random (ptr, size);
#endif

#if 0
  mp_size_t  i;

  for (i = 0; i < size; i++)
    ptr[i] = i;
#endif
}
277 
278 
279 void
speed_operand_src(struct speed_params * s,mp_ptr ptr,mp_size_t size)280 speed_operand_src (struct speed_params *s, mp_ptr ptr, mp_size_t size)
281 {
282   if (s->src_num >= numberof (s->src))
283     {
284       fprintf (stderr, "speed_operand_src: no room left in s->src[]\n");
285       abort ();
286     }
287   s->src[s->src_num].ptr = ptr;
288   s->src[s->src_num].size = size;
289   s->src_num++;
290 }
291 
292 
293 void
speed_operand_dst(struct speed_params * s,mp_ptr ptr,mp_size_t size)294 speed_operand_dst (struct speed_params *s, mp_ptr ptr, mp_size_t size)
295 {
296   if (s->dst_num >= numberof (s->dst))
297     {
298       fprintf (stderr, "speed_operand_dst: no room left in s->dst[]\n");
299       abort ();
300     }
301   s->dst[s->dst_num].ptr = ptr;
302   s->dst[s->dst_num].size = size;
303   s->dst_num++;
304 }
305 
306 
307 void
speed_cache_fill(struct speed_params * s)308 speed_cache_fill (struct speed_params *s)
309 {
310   static struct speed_params  prev;
311   int  i;
312 
313   /* FIXME: need a better way to get the format string for a pointer */
314 
315   if (speed_option_addrs)
316     {
317       int  different;
318 
319       different = (s->dst_num != prev.dst_num || s->src_num != prev.src_num);
320       for (i = 0; i < s->dst_num; i++)
321 	different |= (s->dst[i].ptr != prev.dst[i].ptr);
322       for (i = 0; i < s->src_num; i++)
323 	different |= (s->src[i].ptr != prev.src[i].ptr);
324 
325       if (different)
326 	{
327 	  if (s->dst_num != 0)
328 	    {
329 	      printf ("dst");
330 	      for (i = 0; i < s->dst_num; i++)
331 		printf (" %08lX", (unsigned long) s->dst[i].ptr);
332 	      printf (" ");
333 	    }
334 
335 	  if (s->src_num != 0)
336 	    {
337 	      printf ("src");
338 	      for (i = 0; i < s->src_num; i++)
339 		printf (" %08lX", (unsigned long) s->src[i].ptr);
340 	      printf (" ");
341 	    }
342 	  printf ("  (cf sp approx %08lX)\n", (unsigned long) &different);
343 
344 	}
345 
346       memcpy (&prev, s, sizeof(prev));
347     }
348 
349   switch (s->cache) {
350   case 0:
351     for (i = 0; i < s->dst_num; i++)
352       mpn_cache_fill_write (s->dst[i].ptr, s->dst[i].size);
353     for (i = 0; i < s->src_num; i++)
354       mpn_cache_fill (s->src[i].ptr, s->src[i].size);
355     break;
356   case 1:
357     pentium_wbinvd();
358     break;
359   }
360 }
361 
362 
363 /* Miscellaneous options accepted by tune and speed programs under -o. */
364 
365 void
speed_option_set(const char * s)366 speed_option_set (const char *s)
367 {
368   int  n;
369 
370   if (strcmp (s, "addrs") == 0)
371     {
372       speed_option_addrs = 1;
373     }
374   else if (strcmp (s, "verbose") == 0)
375     {
376       speed_option_verbose++;
377     }
378   else if (sscanf (s, "verbose=%d", &n) == 1)
379     {
380       speed_option_verbose = n;
381     }
382   else if (strcmp (s, "cycles-broken") == 0)
383     {
384       speed_option_cycles_broken = 1;
385     }
386   else
387     {
388       printf ("Unrecognised -o option: %s\n", s);
389       exit (1);
390     }
391 }
392 
393 
394 /* The following are basic speed running routines for various gmp functions.
395    Many are very similar and use speed.h macros.
396 
   Each routine allocates its own destination space for the result of the
398    function, because only it can know what the function needs.
399 
400    speed_starttime() and speed_endtime() are put tight around the code to be
401    measured.  Any setups are done outside the timed portion.
402 
403    Each routine is responsible for its own cache priming.
404    speed_cache_fill() is a good way to do this, see examples in speed.h.
405    One cache priming possibility, for CPUs with write-allocate cache, and
406    functions that don't take too long, is to do one dummy call before timing
407    so as to cache everything that gets used.  But speed_measure() runs a
408    routine at least twice and will take the smaller time, so this might not
409    be necessary.
410 
411    Data alignment will be important, for source, destination and temporary
412    workspace.  A routine can align its destination and workspace.  Programs
413    using the routines will ensure s->xp and s->yp are aligned.  Aligning
414    onto a CACHE_LINE_SIZE boundary is suggested.  s->align_wp and
415    s->align_wp2 should be respected where it makes sense to do so.
416    SPEED_TMP_ALLOC_LIMBS is a good way to do this.
417 
418    A loop of the following form can be expected to turn into good assembler
419    code on most CPUs, thereby minimizing overhead in the measurement.  It
420    can always be assumed s->reps >= 1.
421 
	  i = s->reps;
423 	  do
424 	    foo();
425 	  while (--i != 0);
426 
427    Additional parameters might be added to "struct speed_params" in the
428    future.  Routines should ignore anything they don't use.
429 
430    s->size can be used creatively, and s->xp and s->yp can be ignored.  For
431    example, speed_mpz_fac_ui() uses s->size as n for the factorial.  s->r is
432    just a user-supplied parameter.  speed_mpn_lshift() uses it as a shift,
433    speed_mpn_mul_1() uses it as a multiplier.  */
434 
435 
/* Copy-style routines.

   Each function below consists of a single speed.h macro invocation naming
   the routine to be timed.  The functions contain no return statement of
   their own, so the macro is expected to supply the whole body including
   the returned time (see speed.h for the definitions).

   MPN_COPY etc can be macros, so the _CALL forms are necessary */
double
speed_MPN_COPY (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY (MPN_COPY);
}
double
speed_MPN_COPY_INCR (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY (MPN_COPY_INCR);
}
double
speed_MPN_COPY_DECR (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY (MPN_COPY_DECR);
}
/* mpn_copyi and mpn_copyd are timed only when the corresponding
   HAVE_NATIVE_ symbol is set.  */
#if HAVE_NATIVE_mpn_copyi
double
speed_mpn_copyi (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY (mpn_copyi);
}
#endif
#if HAVE_NATIVE_mpn_copyd
double
speed_mpn_copyd (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY (mpn_copyd);
}
#endif
/* memcpy goes through the separate _COPY_BYTES macro rather than the limb
   based one used above.  */
double
speed_memcpy (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY_BYTES (memcpy);
}
/* mpn_com and mpn_neg are timed with the same macro as the copy routines.  */
double
speed_mpn_com (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY (mpn_com);
}
double
speed_mpn_neg (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY (mpn_neg);
}
double
speed_mpn_sec_tabselect (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_TABSELECT (mpn_sec_tabselect);
}
486 
487 
/* mpn_addmul_N and mpn_submul_1.  Each is timed with the
   SPEED_ROUTINE_MPN_UNARY_<N> macro whose number matches the routine's
   suffix.  The _2 to _8 variants are compiled only when the matching
   HAVE_NATIVE_mpn_addmul_<N> symbol is set.  */
double
speed_mpn_addmul_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1 (mpn_addmul_1);
}
double
speed_mpn_submul_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1 (mpn_submul_1);
}

#if HAVE_NATIVE_mpn_addmul_2
double
speed_mpn_addmul_2 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_2 (mpn_addmul_2);
}
#endif
#if HAVE_NATIVE_mpn_addmul_3
double
speed_mpn_addmul_3 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_3 (mpn_addmul_3);
}
#endif
#if HAVE_NATIVE_mpn_addmul_4
double
speed_mpn_addmul_4 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_4 (mpn_addmul_4);
}
#endif
#if HAVE_NATIVE_mpn_addmul_5
double
speed_mpn_addmul_5 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_5 (mpn_addmul_5);
}
#endif
#if HAVE_NATIVE_mpn_addmul_6
double
speed_mpn_addmul_6 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_6 (mpn_addmul_6);
}
#endif
#if HAVE_NATIVE_mpn_addmul_7
double
speed_mpn_addmul_7 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_7 (mpn_addmul_7);
}
#endif
#if HAVE_NATIVE_mpn_addmul_8
double
speed_mpn_addmul_8 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_8 (mpn_addmul_8);
}
#endif
548 
/* mpn_mul_N.  mpn_mul_1 is timed twice, through the plain _UNARY_1 macro and
   through the _UNARY_1_INPLACE one.  The _2 to _6 variants are compiled
   only when the matching HAVE_NATIVE_mpn_mul_<N> symbol is set.  */
double
speed_mpn_mul_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1 (mpn_mul_1);
}
double
speed_mpn_mul_1_inplace (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1_INPLACE (mpn_mul_1);
}

#if HAVE_NATIVE_mpn_mul_2
double
speed_mpn_mul_2 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_2 (mpn_mul_2);
}
#endif
#if HAVE_NATIVE_mpn_mul_3
double
speed_mpn_mul_3 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_3 (mpn_mul_3);
}
#endif
#if HAVE_NATIVE_mpn_mul_4
double
speed_mpn_mul_4 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_4 (mpn_mul_4);
}
#endif
#if HAVE_NATIVE_mpn_mul_5
double
speed_mpn_mul_5 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_5 (mpn_mul_5);
}
#endif
#if HAVE_NATIVE_mpn_mul_6
double
speed_mpn_mul_6 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_6 (mpn_mul_6);
}
#endif
595 
596 
/* Shifts, all timed with SPEED_ROUTINE_MPN_UNARY_1, the same macro used for
   mpn_mul_1 and mpn_addmul_1 in this file.  */
double
speed_mpn_lshift (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1 (mpn_lshift);
}
double
speed_mpn_lshiftc (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1 (mpn_lshiftc);
}
double
speed_mpn_rshift (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1 (mpn_rshift);
}
612 
613 
/* The carry-in variants (if available) are good for measuring because they
   won't skip a division if high<divisor.  Alternately, use -1 as a divisor
   with the plain _1 forms.

   The "f" functions time the same routine as their non-"f" counterpart but
   through the ..._1F / ..._1CF macro instead of ..._1 / ..._1C.  The two
   carry-in functions are compiled only when HAVE_NATIVE_mpn_divrem_1c is
   set.  */
double
speed_mpn_divrem_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1);
}
double
speed_mpn_divrem_1f (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1);
}
#if HAVE_NATIVE_mpn_divrem_1c
double
speed_mpn_divrem_1c (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVREM_1C (mpn_divrem_1c);
}
double
speed_mpn_divrem_1cf (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVREM_1C (mpn_divrem_1c);
}
#endif
639 
/* The _div and _inv variants of mpn_divrem_1 and mpn_mod_1, timed with the
   same macros as the plain routines.  The variant routines themselves are
   defined outside this part of the file.  */
double
speed_mpn_divrem_1_div (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_div);
}
double
speed_mpn_divrem_1f_div (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1_div);
}
double
speed_mpn_divrem_1_inv (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_inv);
}
double
speed_mpn_divrem_1f_inv (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1_inv);
}
double
speed_mpn_mod_1_div (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_div);
}
double
speed_mpn_mod_1_inv (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_inv);
}
670 
/* mpn_preinv_divrem_1, timed through both the ..._1 and ..._1F macros.  */
double
speed_mpn_preinv_divrem_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_PREINV_DIVREM_1 (mpn_preinv_divrem_1);
}
double
speed_mpn_preinv_divrem_1f (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_PREINV_DIVREM_1F (mpn_preinv_divrem_1);
}

/* mpn_mod_34lsub1 is timed only when GMP_NUMB_BITS is a multiple of 4.  */
#if GMP_NUMB_BITS % 4 == 0
double
speed_mpn_mod_34lsub1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MOD_34LSUB1 (mpn_mod_34lsub1);
}
#endif
689 
/* mpn_divrem_2 and its _div and _inv variants, all timed with
   SPEED_ROUTINE_MPN_DIVREM_2.  */
double
speed_mpn_divrem_2 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2);
}
double
speed_mpn_divrem_2_div (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2_div);
}
double
speed_mpn_divrem_2_inv (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2_inv);
}
705 
/* mpn_div_qr_1n_pi1 and its _1 and _2 variants, all timed with the same
   macro.  */
double
speed_mpn_div_qr_1n_pi1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIV_QR_1N_PI1 (mpn_div_qr_1n_pi1);
}
double
speed_mpn_div_qr_1n_pi1_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIV_QR_1N_PI1 (mpn_div_qr_1n_pi1_1);
}
double
speed_mpn_div_qr_1n_pi1_2 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIV_QR_1N_PI1 (mpn_div_qr_1n_pi1_2);
}

double
speed_mpn_div_qr_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIV_QR_1 (mpn_div_qr_1);
}

/* Both functions time mpn_div_qr_2; they differ only in the second macro
   argument, 1 for the "2n" form and 0 for the "2u" form.  Presumably this
   selects a normalized versus unnormalized divisor -- confirm in speed.h.  */
double
speed_mpn_div_qr_2n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIV_QR_2 (mpn_div_qr_2, 1);
}
double
speed_mpn_div_qr_2u (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIV_QR_2 (mpn_div_qr_2, 0);
}
738 
/* mpn_mod_1 and relatives.  The mpn_mod_1_1 and mpn_mod_1_N macros take the
   routine together with its matching _cps function; the _N macro also takes
   the number (2, 3 or 4) that appears in the routine name.  */
double
speed_mpn_mod_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1);
}
/* Compiled only when HAVE_NATIVE_mpn_mod_1c is set.  */
#if HAVE_NATIVE_mpn_mod_1c
double
speed_mpn_mod_1c (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MOD_1C (mpn_mod_1c);
}
#endif
double
speed_mpn_preinv_mod_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_PREINV_MOD_1 (mpn_preinv_mod_1);
}
double
speed_mpn_mod_1_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MOD_1_1 (mpn_mod_1_1p,mpn_mod_1_1p_cps);
}
double
speed_mpn_mod_1_1_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MOD_1_1 (mpn_mod_1_1p_1,mpn_mod_1_1p_cps_1);
}
double
speed_mpn_mod_1_1_2 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MOD_1_1 (mpn_mod_1_1p_2,mpn_mod_1_1p_cps_2);
}
double
speed_mpn_mod_1_2 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MOD_1_N (mpn_mod_1s_2p,mpn_mod_1s_2p_cps,2);
}
double
speed_mpn_mod_1_3 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MOD_1_N (mpn_mod_1s_3p,mpn_mod_1s_3p_cps,3);
}
double
speed_mpn_mod_1_4 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MOD_1_N (mpn_mod_1s_4p,mpn_mod_1s_4p_cps,4);
}
786 
/* Exact division, single-limb bdiv and modexact routines.  */
double
speed_mpn_divexact_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVEXACT_1 (mpn_divexact_1);
}

/* mpn_divexact_by3 is timed with the macro used for the copy routines.  */
double
speed_mpn_divexact_by3 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY (mpn_divexact_by3);
}

double
speed_mpn_bdiv_dbm1c (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BDIV_DBM1C (mpn_bdiv_dbm1c);
}

double
speed_mpn_bdiv_q_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BDIV_Q_1 (mpn_bdiv_q_1);
}

double
speed_mpn_pi1_bdiv_q_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_PI1_BDIV_Q_1 (mpn_pi1_bdiv_q_1);
}

/* mpn_modexact_1_odd is timed only when HAVE_NATIVE_mpn_modexact_1_odd is
   set; the carry-in form below is always built.  */
#if HAVE_NATIVE_mpn_modexact_1_odd
double
speed_mpn_modexact_1_odd (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MODEXACT_1_ODD (mpn_modexact_1_odd);
}
#endif

double
speed_mpn_modexact_1c_odd (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MODEXACT_1C_ODD (mpn_modexact_1c_odd);
}
830 
/* mpz_mod, followed by the multi-limb division routines.  */
double
speed_mpz_mod (struct speed_params *s)
{
  SPEED_ROUTINE_MPZ_MOD (mpz_mod);
}

/* SPEED_ROUTINE_MPN_PI1_DIV takes the routine, the inverse argument to pass
   to it ("inv.inv32" for the sbpi1 routines, "&inv" for the dcpi1 ones;
   "inv" is not declared here, so it presumably comes from the macro), and
   two numbers whose meaning is defined by the macro in speed.h.  */
double
speed_mpn_sbpi1_div_qr (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_PI1_DIV (mpn_sbpi1_div_qr, inv.inv32, 2,0);
}
double
speed_mpn_dcpi1_div_qr (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_PI1_DIV (mpn_dcpi1_div_qr, &inv, 6,3);
}
double
speed_mpn_sbpi1_divappr_q (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_PI1_DIV (mpn_sbpi1_divappr_q, inv.inv32, 2,0);
}
double
speed_mpn_dcpi1_divappr_q (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_PI1_DIV (mpn_dcpi1_divappr_q, &inv, 6,3);
}
/* The mu routines are passed together with their matching _itch function.  */
double
speed_mpn_mu_div_qr (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MU_DIV_QR (mpn_mu_div_qr, mpn_mu_div_qr_itch);
}
double
speed_mpn_mu_divappr_q (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MU_DIV_Q (mpn_mu_divappr_q, mpn_mu_divappr_q_itch);
}
double
speed_mpn_mu_div_q (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MU_DIV_Q (mpn_mu_div_q, mpn_mu_div_q_itch);
}
double
speed_mpn_mupi_div_qr (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MUPI_DIV_QR (mpn_preinv_mu_div_qr, mpn_preinv_mu_div_qr_itch);
}
877 
/* Multi-limb bdiv routines.  The sbpi1 and dcpi1 forms share a macro per
   result kind (_QR or _Q); the mu forms are passed together with their
   matching _itch function.  */
double
speed_mpn_sbpi1_bdiv_qr (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_PI1_BDIV_QR (mpn_sbpi1_bdiv_qr);
}
double
speed_mpn_dcpi1_bdiv_qr (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_PI1_BDIV_QR (mpn_dcpi1_bdiv_qr);
}
double
speed_mpn_sbpi1_bdiv_q (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_PI1_BDIV_Q (mpn_sbpi1_bdiv_q);
}
double
speed_mpn_dcpi1_bdiv_q (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_PI1_BDIV_Q (mpn_dcpi1_bdiv_q);
}
double
speed_mpn_mu_bdiv_q (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MU_BDIV_Q (mpn_mu_bdiv_q, mpn_mu_bdiv_q_itch);
}
double
speed_mpn_mu_bdiv_qr (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MU_BDIV_QR (mpn_mu_bdiv_qr, mpn_mu_bdiv_qr_itch);
}
908 
/* Roots and inverses.  */
double
speed_mpn_broot (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BROOT (mpn_broot);
}
double
speed_mpn_broot_invm1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BROOT (mpn_broot_invm1);
}
/* The second macro argument, 5*s->size, is presumably the scratch space in
   limbs -- confirm against SPEED_ROUTINE_MPN_BROOTINV in speed.h.  */
double
speed_mpn_brootinv (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BROOTINV (mpn_brootinv, 5*s->size);
}

/* The inversion routines are passed together with an _itch function.  */
double
speed_mpn_binvert (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINVERT (mpn_binvert, mpn_binvert_itch);
}

double
speed_mpn_invert (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_INVERT (mpn_invert, mpn_invert_itch);
}

double
speed_mpn_invertappr (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_INVERTAPPR (mpn_invertappr, mpn_invertappr_itch);
}

/* Note mpn_ni_invertappr is paired with mpn_invertappr_itch, the same itch
   function as used for mpn_invertappr above.  */
double
speed_mpn_ni_invertappr (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_INVERTAPPR (mpn_ni_invertappr, mpn_invertappr_itch);
}

double
speed_mpn_sec_invert (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_SEC_INVERT (mpn_sec_invert, mpn_sec_invert_itch);
}
954 
/* The mpn_redc_1, mpn_redc_2 and mpn_redc_n routines, each with its own
   SPEED_ROUTINE_REDC_* macro.  */
double
speed_mpn_redc_1 (struct speed_params *s)
{
  SPEED_ROUTINE_REDC_1 (mpn_redc_1);
}
double
speed_mpn_redc_2 (struct speed_params *s)
{
  SPEED_ROUTINE_REDC_2 (mpn_redc_2);
}
double
speed_mpn_redc_n (struct speed_params *s)
{
  SPEED_ROUTINE_REDC_N (mpn_redc_n);
}
970 
971 
/* mpn_popcount and mpn_hamdist, each timed with its own macro.  */
double
speed_mpn_popcount (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_POPCOUNT (mpn_popcount);
}
double
speed_mpn_hamdist (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_HAMDIST (mpn_hamdist);
}
982 
983 
/* Addition and subtraction.  The _n forms use SPEED_ROUTINE_MPN_BINARY_N;
   the _1 forms are timed both through SPEED_ROUTINE_MPN_UNARY_1 and through
   the _INPLACE variant of that macro.  */
double
speed_mpn_add_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_N (mpn_add_n);
}
double
speed_mpn_sub_n (struct speed_params *s)
{
SPEED_ROUTINE_MPN_BINARY_N (mpn_sub_n);
}
double
speed_mpn_add_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1 (mpn_add_1);
}
double
speed_mpn_add_1_inplace (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1_INPLACE (mpn_add_1);
}
double
speed_mpn_sub_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1 (mpn_sub_1);
}
double
speed_mpn_sub_1_inplace (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1_INPLACE (mpn_sub_1);
}
1014 
/* mpn_add_errK_n and mpn_sub_errK_n for K = 1, 2, 3.  The add and sub forms
   of each K share the SPEED_ROUTINE_MPN_BINARY_ERR<K>_N macro.  */
double
speed_mpn_add_err1_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_ERR1_N (mpn_add_err1_n);
}
double
speed_mpn_sub_err1_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_ERR1_N (mpn_sub_err1_n);
}
double
speed_mpn_add_err2_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_ERR2_N (mpn_add_err2_n);
}
double
speed_mpn_sub_err2_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_ERR2_N (mpn_sub_err2_n);
}
double
speed_mpn_add_err3_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_ERR3_N (mpn_add_err3_n);
}
double
speed_mpn_sub_err3_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_ERR3_N (mpn_sub_err3_n);
}
1045 
1046 
#if HAVE_NATIVE_mpn_add_n_sub_n
/* Time mpn_add_n_sub_n on s->xp and s->yp; only built when
   HAVE_NATIVE_mpn_add_n_sub_n is set.  The destinations ap and sp are not
   declared here, so they must come from the SPEED_ROUTINE_MPN_ADDSUB_N_CALL
   expansion (macro defined elsewhere), which is also expected to return.  */
double
speed_mpn_add_n_sub_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_ADDSUB_N_CALL (mpn_add_n_sub_n (ap, sp, s->xp, s->yp, s->size));
}
#endif
1054 
/* Timing routines for the "lsh1" add/sub/rsb family.  Each one is built
   only when its HAVE_NATIVE_* symbol allows it.  The three-operand forms
   use SPEED_ROUTINE_MPN_BINARY_N and the _ip1/_ip2 forms use
   SPEED_ROUTINE_MPN_COPY; both macros are defined elsewhere and are
   expected to supply the body and the return.

   NOTE(review): some guards test "== 1" and others plain truth.  That
   distinction is kept exactly as found -- presumably other values mark a
   non-native fallback; confirm in gmp-impl.h before unifying.  */
#if HAVE_NATIVE_mpn_addlsh1_n == 1
double
speed_mpn_addlsh1_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_N (mpn_addlsh1_n);
}
#endif
#if HAVE_NATIVE_mpn_sublsh1_n == 1
double
speed_mpn_sublsh1_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_N (mpn_sublsh1_n);
}
#endif
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
double
speed_mpn_addlsh1_n_ip1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY (mpn_addlsh1_n_ip1);
}
#endif
#if HAVE_NATIVE_mpn_addlsh1_n_ip2
double
speed_mpn_addlsh1_n_ip2 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY (mpn_addlsh1_n_ip2);
}
#endif
#if HAVE_NATIVE_mpn_sublsh1_n_ip1
double
speed_mpn_sublsh1_n_ip1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY (mpn_sublsh1_n_ip1);
}
#endif
#if HAVE_NATIVE_mpn_rsblsh1_n == 1
double
speed_mpn_rsblsh1_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_N (mpn_rsblsh1_n);
}
#endif
/* Timing routines for the "lsh2" add/sub/rsb family, laid out like their
   lsh1 counterparts: three-operand forms use SPEED_ROUTINE_MPN_BINARY_N,
   the _ip1/_ip2 forms use SPEED_ROUTINE_MPN_COPY (macros defined
   elsewhere, expected to supply the body and the return).  Each routine is
   compiled only under its own HAVE_NATIVE_* guard; the "== 1" versus plain
   truth tests are preserved exactly as found.  */
#if HAVE_NATIVE_mpn_addlsh2_n == 1
double
speed_mpn_addlsh2_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_N (mpn_addlsh2_n);
}
#endif
#if HAVE_NATIVE_mpn_sublsh2_n == 1
double
speed_mpn_sublsh2_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_N (mpn_sublsh2_n);
}
#endif
#if HAVE_NATIVE_mpn_addlsh2_n_ip1
double
speed_mpn_addlsh2_n_ip1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY (mpn_addlsh2_n_ip1);
}
#endif
#if HAVE_NATIVE_mpn_addlsh2_n_ip2
double
speed_mpn_addlsh2_n_ip2 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY (mpn_addlsh2_n_ip2);
}
#endif
#if HAVE_NATIVE_mpn_sublsh2_n_ip1
double
speed_mpn_sublsh2_n_ip1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY (mpn_sublsh2_n_ip1);
}
#endif
#if HAVE_NATIVE_mpn_rsblsh2_n == 1
double
speed_mpn_rsblsh2_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_N (mpn_rsblsh2_n);
}
#endif
/* Timing routines for the general-shift "lsh_n" family, each built only
   under its HAVE_NATIVE_* guard.  Every call passes a fixed trailing
   argument of 7 (presumably the shift count -- confirm against the mpn
   prototypes).  wp, xp and yp are not declared here; they must come from
   the SPEED_ROUTINE_MPN_BINARY_N_CALL / SPEED_ROUTINE_MPN_UNARY_1_CALL
   expansions (macros defined elsewhere), which are also expected to
   return.  */
#if HAVE_NATIVE_mpn_addlsh_n
double
speed_mpn_addlsh_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_addlsh_n (wp, xp, yp, s->size, 7));
}
#endif
#if HAVE_NATIVE_mpn_sublsh_n
double
speed_mpn_sublsh_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_sublsh_n (wp, xp, yp, s->size, 7));
}
#endif
#if HAVE_NATIVE_mpn_addlsh_n_ip1
double
speed_mpn_addlsh_n_ip1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1_CALL (mpn_addlsh_n_ip1 (wp, s->xp, s->size, 7));
}
#endif
#if HAVE_NATIVE_mpn_addlsh_n_ip2
double
speed_mpn_addlsh_n_ip2 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1_CALL (mpn_addlsh_n_ip2 (wp, s->xp, s->size, 7));
}
#endif
#if HAVE_NATIVE_mpn_sublsh_n_ip1
double
speed_mpn_sublsh_n_ip1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1_CALL (mpn_sublsh_n_ip1 (wp, s->xp, s->size, 7));
}
#endif
#if HAVE_NATIVE_mpn_rsblsh_n
double
speed_mpn_rsblsh_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_rsblsh_n (wp, xp, yp, s->size, 7));
}
#endif
/* Timing routines for mpn_rsh1add_n and mpn_rsh1sub_n, each compiled only
   when the matching HAVE_NATIVE_* symbol is set.  Both bodies come from
   SPEED_ROUTINE_MPN_BINARY_N (defined elsewhere), which is expected to
   supply the return.  */
#if HAVE_NATIVE_mpn_rsh1add_n
double
speed_mpn_rsh1add_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_N (mpn_rsh1add_n);
}
#endif
#if HAVE_NATIVE_mpn_rsh1sub_n
double
speed_mpn_rsh1sub_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_N (mpn_rsh1sub_n);
}
#endif
1195 
1196 double
speed_mpn_cnd_add_n(struct speed_params * s)1197 speed_mpn_cnd_add_n (struct speed_params *s)
1198 {
1199   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_cnd_add_n (1, wp, xp, yp, s->size));
1200 }
1201 double
speed_mpn_cnd_sub_n(struct speed_params * s)1202 speed_mpn_cnd_sub_n (struct speed_params *s)
1203 {
1204   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_cnd_sub_n (1, wp, xp, yp, s->size));
1205 }
1206 
1207 /* mpn_and_n etc can be macros and so have to be handled with
1208    SPEED_ROUTINE_MPN_BINARY_N_CALL forms */
1209 double
speed_mpn_and_n(struct speed_params * s)1210 speed_mpn_and_n (struct speed_params *s)
1211 {
1212   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_and_n (wp, xp, yp, s->size));
1213 }
1214 double
speed_mpn_andn_n(struct speed_params * s)1215 speed_mpn_andn_n (struct speed_params *s)
1216 {
1217   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_andn_n (wp, xp, yp, s->size));
1218 }
1219 double
speed_mpn_nand_n(struct speed_params * s)1220 speed_mpn_nand_n (struct speed_params *s)
1221 {
1222   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nand_n (wp, xp, yp, s->size));
1223 }
1224 double
speed_mpn_ior_n(struct speed_params * s)1225 speed_mpn_ior_n (struct speed_params *s)
1226 {
1227   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_ior_n (wp, xp, yp, s->size));
1228 }
1229 double
speed_mpn_iorn_n(struct speed_params * s)1230 speed_mpn_iorn_n (struct speed_params *s)
1231 {
1232   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_iorn_n (wp, xp, yp, s->size));
1233 }
1234 double
speed_mpn_nior_n(struct speed_params * s)1235 speed_mpn_nior_n (struct speed_params *s)
1236 {
1237   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nior_n (wp, xp, yp, s->size));
1238 }
1239 double
speed_mpn_xor_n(struct speed_params * s)1240 speed_mpn_xor_n (struct speed_params *s)
1241 {
1242   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xor_n (wp, xp, yp, s->size));
1243 }
1244 double
speed_mpn_xnor_n(struct speed_params * s)1245 speed_mpn_xnor_n (struct speed_params *s)
1246 {
1247   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xnor_n (wp, xp, yp, s->size));
1248 }
1249 
1250 
1251 double
speed_mpn_mul_n(struct speed_params * s)1252 speed_mpn_mul_n (struct speed_params *s)
1253 {
1254   SPEED_ROUTINE_MPN_MUL_N (mpn_mul_n);
1255 }
1256 double
speed_mpn_sqr(struct speed_params * s)1257 speed_mpn_sqr (struct speed_params *s)
1258 {
1259   SPEED_ROUTINE_MPN_SQR (mpn_sqr);
1260 }
1261 double
speed_mpn_mul_n_sqr(struct speed_params * s)1262 speed_mpn_mul_n_sqr (struct speed_params *s)
1263 {
1264   SPEED_ROUTINE_MPN_SQR_CALL (mpn_mul_n (wp, s->xp, s->xp, s->size));
1265 }
1266 
1267 double
speed_mpn_mul_basecase(struct speed_params * s)1268 speed_mpn_mul_basecase (struct speed_params *s)
1269 {
1270   SPEED_ROUTINE_MPN_MUL(mpn_mul_basecase);
1271 }
1272 double
speed_mpn_mul(struct speed_params * s)1273 speed_mpn_mul (struct speed_params *s)
1274 {
1275   SPEED_ROUTINE_MPN_MUL(mpn_mul);
1276 }
1277 double
speed_mpn_sqr_basecase(struct speed_params * s)1278 speed_mpn_sqr_basecase (struct speed_params *s)
1279 {
1280   /* FIXME: size restrictions on some versions of sqr_basecase */
1281   SPEED_ROUTINE_MPN_SQR (mpn_sqr_basecase);
1282 }
1283 
/* Timing routines for the squaring helpers mpn_sqr_diagonal and
   mpn_sqr_diag_addlsh1, each compiled only when its HAVE_NATIVE_* symbol
   is set.  For the second, wp and tp are not declared here and must come
   from the SPEED_ROUTINE_MPN_SQR_DIAG_ADDLSH1_CALL expansion.  The macros
   are defined elsewhere and are expected to supply the return.  */
#if HAVE_NATIVE_mpn_sqr_diagonal
double
speed_mpn_sqr_diagonal (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_SQR (mpn_sqr_diagonal);
}
#endif

#if HAVE_NATIVE_mpn_sqr_diag_addlsh1
double
speed_mpn_sqr_diag_addlsh1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_SQR_DIAG_ADDLSH1_CALL (mpn_sqr_diag_addlsh1 (wp, tp, s->xp, s->size));
}
#endif
1299 
1300 double
speed_mpn_toom2_sqr(struct speed_params * s)1301 speed_mpn_toom2_sqr (struct speed_params *s)
1302 {
1303   SPEED_ROUTINE_MPN_TOOM2_SQR (mpn_toom2_sqr);
1304 }
1305 double
speed_mpn_toom3_sqr(struct speed_params * s)1306 speed_mpn_toom3_sqr (struct speed_params *s)
1307 {
1308   SPEED_ROUTINE_MPN_TOOM3_SQR (mpn_toom3_sqr);
1309 }
1310 double
speed_mpn_toom4_sqr(struct speed_params * s)1311 speed_mpn_toom4_sqr (struct speed_params *s)
1312 {
1313   SPEED_ROUTINE_MPN_TOOM4_SQR (mpn_toom4_sqr);
1314 }
1315 double
speed_mpn_toom6_sqr(struct speed_params * s)1316 speed_mpn_toom6_sqr (struct speed_params *s)
1317 {
1318   SPEED_ROUTINE_MPN_TOOM6_SQR (mpn_toom6_sqr);
1319 }
1320 double
speed_mpn_toom8_sqr(struct speed_params * s)1321 speed_mpn_toom8_sqr (struct speed_params *s)
1322 {
1323   SPEED_ROUTINE_MPN_TOOM8_SQR (mpn_toom8_sqr);
1324 }
1325 double
speed_mpn_toom22_mul(struct speed_params * s)1326 speed_mpn_toom22_mul (struct speed_params *s)
1327 {
1328   SPEED_ROUTINE_MPN_TOOM22_MUL_N (mpn_toom22_mul);
1329 }
1330 double
speed_mpn_toom33_mul(struct speed_params * s)1331 speed_mpn_toom33_mul (struct speed_params *s)
1332 {
1333   SPEED_ROUTINE_MPN_TOOM33_MUL_N (mpn_toom33_mul);
1334 }
1335 double
speed_mpn_toom44_mul(struct speed_params * s)1336 speed_mpn_toom44_mul (struct speed_params *s)
1337 {
1338   SPEED_ROUTINE_MPN_TOOM44_MUL_N (mpn_toom44_mul);
1339 }
1340 double
speed_mpn_toom6h_mul(struct speed_params * s)1341 speed_mpn_toom6h_mul (struct speed_params *s)
1342 {
1343   SPEED_ROUTINE_MPN_TOOM6H_MUL_N (mpn_toom6h_mul);
1344 }
1345 double
speed_mpn_toom8h_mul(struct speed_params * s)1346 speed_mpn_toom8h_mul (struct speed_params *s)
1347 {
1348   SPEED_ROUTINE_MPN_TOOM8H_MUL_N (mpn_toom8h_mul);
1349 }
1350 
1351 double
speed_mpn_toom32_mul(struct speed_params * s)1352 speed_mpn_toom32_mul (struct speed_params *s)
1353 {
1354   SPEED_ROUTINE_MPN_TOOM32_MUL (mpn_toom32_mul);
1355 }
1356 double
speed_mpn_toom42_mul(struct speed_params * s)1357 speed_mpn_toom42_mul (struct speed_params *s)
1358 {
1359   SPEED_ROUTINE_MPN_TOOM42_MUL (mpn_toom42_mul);
1360 }
1361 double
speed_mpn_toom43_mul(struct speed_params * s)1362 speed_mpn_toom43_mul (struct speed_params *s)
1363 {
1364   SPEED_ROUTINE_MPN_TOOM43_MUL (mpn_toom43_mul);
1365 }
1366 double
speed_mpn_toom63_mul(struct speed_params * s)1367 speed_mpn_toom63_mul (struct speed_params *s)
1368 {
1369   SPEED_ROUTINE_MPN_TOOM63_MUL (mpn_toom63_mul);
1370 }
1371 double
speed_mpn_toom32_for_toom43_mul(struct speed_params * s)1372 speed_mpn_toom32_for_toom43_mul (struct speed_params *s)
1373 {
1374   SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL (mpn_toom32_mul);
1375 }
1376 double
speed_mpn_toom43_for_toom32_mul(struct speed_params * s)1377 speed_mpn_toom43_for_toom32_mul (struct speed_params *s)
1378 {
1379   SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL (mpn_toom43_mul);
1380 }
1381 double
speed_mpn_toom32_for_toom53_mul(struct speed_params * s)1382 speed_mpn_toom32_for_toom53_mul (struct speed_params *s)
1383 {
1384   SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL (mpn_toom32_mul);
1385 }
1386 double
speed_mpn_toom53_for_toom32_mul(struct speed_params * s)1387 speed_mpn_toom53_for_toom32_mul (struct speed_params *s)
1388 {
1389   SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL (mpn_toom53_mul);
1390 }
1391 double
speed_mpn_toom42_for_toom53_mul(struct speed_params * s)1392 speed_mpn_toom42_for_toom53_mul (struct speed_params *s)
1393 {
1394   SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL (mpn_toom42_mul);
1395 }
1396 double
speed_mpn_toom53_for_toom42_mul(struct speed_params * s)1397 speed_mpn_toom53_for_toom42_mul (struct speed_params *s)
1398 {
1399   SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL (mpn_toom53_mul);
1400 }
1401 double
speed_mpn_toom43_for_toom54_mul(struct speed_params * s)1402 speed_mpn_toom43_for_toom54_mul (struct speed_params *s)
1403 {
1404   SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM54_MUL (mpn_toom43_mul);
1405 }
1406 double
speed_mpn_toom54_for_toom43_mul(struct speed_params * s)1407 speed_mpn_toom54_for_toom43_mul (struct speed_params *s)
1408 {
1409   SPEED_ROUTINE_MPN_TOOM54_FOR_TOOM43_MUL (mpn_toom54_mul);
1410 }
1411 
1412 double
speed_mpn_nussbaumer_mul(struct speed_params * s)1413 speed_mpn_nussbaumer_mul (struct speed_params *s)
1414 {
1415   SPEED_ROUTINE_MPN_MUL_N_CALL
1416     (mpn_nussbaumer_mul (wp, s->xp, s->size, s->yp, s->size));
1417 }
1418 double
speed_mpn_nussbaumer_mul_sqr(struct speed_params * s)1419 speed_mpn_nussbaumer_mul_sqr (struct speed_params *s)
1420 {
1421   SPEED_ROUTINE_MPN_SQR_CALL
1422     (mpn_nussbaumer_mul (wp, s->xp, s->size, s->xp, s->size));
1423 }
1424 
#if WANT_OLD_FFT_FULL
/* Timing routines for mpn_mul_fft_full, compiled only when
   WANT_OLD_FFT_FULL is set.  The _sqr variant passes s->xp as both
   operands.  wp comes from the harness macro expansions (defined
   elsewhere), which are also expected to return.  */
double
speed_mpn_mul_fft_full (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MUL_N_CALL
    (mpn_mul_fft_full (wp, s->xp, s->size, s->yp, s->size));
}
double
speed_mpn_mul_fft_full_sqr (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_SQR_CALL
    (mpn_mul_fft_full (wp, s->xp, s->size, s->xp, s->size));
}
#endif
1439 
/* These are mod 2^N+1 multiplies and squares.  If s->r is supplied it's
   used as k, otherwise the best k for the size is used.  If s->size isn't a
   multiple of 2^k it's rounded up to make the effective operation size.

   CALL is the expression to be timed; it may use the locals declared here:
   wp (destination, pl+1 limbs), pl (the result of mpn_fft_next_size) and
   k.  SQR is nonzero for a squaring: it is forwarded to mpn_fft_best_k and
   stops s->yp being registered as a source operand.  Sizes below 1 are
   rejected through SPEED_RESTRICT_COND (defined elsewhere).  The expansion
   is a complete function body that runs CALL s->reps times and returns the
   value of speed_endtime ().  */

#define SPEED_ROUTINE_MPN_MUL_FFT_CALL(call, sqr)       \
  {                                                     \
    mp_ptr     wp;                                      \
    mp_size_t  pl;                                      \
    int        k;                                       \
    unsigned   i;                                       \
    double     t;                                       \
    TMP_DECL;                                           \
                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                 \
                                                        \
    if (s->r != 0)                                      \
      k = s->r;                                         \
    else                                                \
      k = mpn_fft_best_k (s->size, sqr);                \
                                                        \
    TMP_MARK;                                           \
    pl = mpn_fft_next_size (s->size, k);                \
    SPEED_TMP_ALLOC_LIMBS (wp, pl+1, s->align_wp);      \
                                                        \
    speed_operand_src (s, s->xp, s->size);              \
    if (!(sqr))                                         \
      speed_operand_src (s, s->yp, s->size);            \
    speed_operand_dst (s, wp, pl+1);                    \
    speed_cache_fill (s);                               \
                                                        \
    speed_starttime ();                                 \
    i = s->reps;                                        \
    do                                                  \
      call;                                             \
    while (--i != 0);                                   \
    t = speed_endtime ();                               \
                                                        \
    TMP_FREE;                                           \
    return t;                                           \
  }
1480 
1481 double
speed_mpn_mul_fft(struct speed_params * s)1482 speed_mpn_mul_fft (struct speed_params *s)
1483 {
1484   SPEED_ROUTINE_MPN_MUL_FFT_CALL
1485     (mpn_mul_fft (wp, pl, s->xp, s->size, s->yp, s->size, k), 0);
1486 }
1487 
1488 double
speed_mpn_mul_fft_sqr(struct speed_params * s)1489 speed_mpn_mul_fft_sqr (struct speed_params *s)
1490 {
1491   SPEED_ROUTINE_MPN_MUL_FFT_CALL
1492     (mpn_mul_fft (wp, pl, s->xp, s->size, s->xp, s->size, k), 1);
1493 }
1494 
1495 double
speed_mpn_fft_mul(struct speed_params * s)1496 speed_mpn_fft_mul (struct speed_params *s)
1497 {
1498   SPEED_ROUTINE_MPN_MUL_N_CALL (mpn_fft_mul (wp, s->xp, s->size, s->yp, s->size));
1499 }
1500 
1501 double
speed_mpn_fft_sqr(struct speed_params * s)1502 speed_mpn_fft_sqr (struct speed_params *s)
1503 {
1504   SPEED_ROUTINE_MPN_SQR_CALL (mpn_fft_mul (wp, s->xp, s->size, s->xp, s->size));
1505 }
1506 
1507 double
speed_mpn_sqrlo(struct speed_params * s)1508 speed_mpn_sqrlo (struct speed_params *s)
1509 {
1510   SPEED_ROUTINE_MPN_SQRLO (mpn_sqrlo);
1511 }
1512 double
speed_mpn_sqrlo_basecase(struct speed_params * s)1513 speed_mpn_sqrlo_basecase (struct speed_params *s)
1514 {
1515   SPEED_ROUTINE_MPN_SQRLO (mpn_sqrlo_basecase);
1516 }
1517 double
speed_mpn_mullo_n(struct speed_params * s)1518 speed_mpn_mullo_n (struct speed_params *s)
1519 {
1520   SPEED_ROUTINE_MPN_MULLO_N (mpn_mullo_n);
1521 }
1522 double
speed_mpn_mullo_basecase(struct speed_params * s)1523 speed_mpn_mullo_basecase (struct speed_params *s)
1524 {
1525   SPEED_ROUTINE_MPN_MULLO_BASECASE (mpn_mullo_basecase);
1526 }
1527 
1528 double
speed_mpn_mulmid_basecase(struct speed_params * s)1529 speed_mpn_mulmid_basecase (struct speed_params *s)
1530 {
1531   SPEED_ROUTINE_MPN_MULMID (mpn_mulmid_basecase);
1532 }
1533 
1534 double
speed_mpn_mulmid(struct speed_params * s)1535 speed_mpn_mulmid (struct speed_params *s)
1536 {
1537   SPEED_ROUTINE_MPN_MULMID (mpn_mulmid);
1538 }
1539 
1540 double
speed_mpn_mulmid_n(struct speed_params * s)1541 speed_mpn_mulmid_n (struct speed_params *s)
1542 {
1543   SPEED_ROUTINE_MPN_MULMID_N (mpn_mulmid_n);
1544 }
1545 
1546 double
speed_mpn_toom42_mulmid(struct speed_params * s)1547 speed_mpn_toom42_mulmid (struct speed_params *s)
1548 {
1549   SPEED_ROUTINE_MPN_TOOM42_MULMID (mpn_toom42_mulmid);
1550 }
1551 
1552 double
speed_mpn_mulmod_bnm1(struct speed_params * s)1553 speed_mpn_mulmod_bnm1 (struct speed_params *s)
1554 {
1555   SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL (mpn_mulmod_bnm1 (wp, s->size, s->xp, s->size, s->yp, s->size, tp));
1556 }
1557 
1558 double
speed_mpn_bc_mulmod_bnm1(struct speed_params * s)1559 speed_mpn_bc_mulmod_bnm1 (struct speed_params *s)
1560 {
1561   SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL (mpn_bc_mulmod_bnm1 (wp, s->xp, s->yp, s->size, tp));
1562 }
1563 
1564 double
speed_mpn_mulmod_bnm1_rounded(struct speed_params * s)1565 speed_mpn_mulmod_bnm1_rounded (struct speed_params *s)
1566 {
1567   SPEED_ROUTINE_MPN_MULMOD_BNM1_ROUNDED (mpn_mulmod_bnm1);
1568 }
1569 
1570 double
speed_mpn_sqrmod_bnm1(struct speed_params * s)1571 speed_mpn_sqrmod_bnm1 (struct speed_params *s)
1572 {
1573   SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL (mpn_sqrmod_bnm1 (wp, s->size, s->xp, s->size, tp));
1574 }
1575 
1576 double
speed_mpn_matrix22_mul(struct speed_params * s)1577 speed_mpn_matrix22_mul (struct speed_params *s)
1578 {
1579   /* Speed params only includes 2 inputs, so we have to invent the
1580      other 6. */
1581 
1582   mp_ptr a;
1583   mp_ptr r;
1584   mp_ptr b;
1585   mp_ptr tp;
1586   mp_size_t itch;
1587   unsigned i;
1588   double t;
1589   TMP_DECL;
1590 
1591   TMP_MARK;
1592   SPEED_TMP_ALLOC_LIMBS (a, 4 * s->size, s->align_xp);
1593   SPEED_TMP_ALLOC_LIMBS (b, 4 * s->size, s->align_yp);
1594   SPEED_TMP_ALLOC_LIMBS (r, 8 * s->size + 4, s->align_wp);
1595 
1596   MPN_COPY (a, s->xp, s->size);
1597   mpn_random (a + s->size, 3 * s->size);
1598   MPN_COPY (b, s->yp, s->size);
1599   mpn_random (b + s->size, 3 * s->size);
1600 
1601   itch = mpn_matrix22_mul_itch (s->size, s->size);
1602   SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2);
1603 
1604   speed_operand_src (s, a, 4 * s->size);
1605   speed_operand_src (s, b, 4 * s->size);
1606   speed_operand_dst (s, r, 8 * s->size + 4);
1607   speed_operand_dst (s, tp, itch);
1608   speed_cache_fill (s);
1609 
1610   speed_starttime ();
1611   i = s->reps;
1612   do
1613     {
1614       mp_size_t sz = s->size;
1615       MPN_COPY (r + 0 * sz + 0, a + 0 * sz, sz);
1616       MPN_COPY (r + 2 * sz + 1, a + 1 * sz, sz);
1617       MPN_COPY (r + 4 * sz + 2, a + 2 * sz, sz);
1618       MPN_COPY (r + 6 * sz + 3, a + 3 * sz, sz);
1619       mpn_matrix22_mul (r, r + 2 * sz + 1, r + 4 * sz + 2, r + 6 * sz + 3, sz,
1620 			b, b + 1 * sz,     b + 2 * sz,     b + 3 * sz,     sz,
1621 			tp);
1622     }
1623   while (--i != 0);
1624   t = speed_endtime();
1625   TMP_FREE;
1626   return t;
1627 }
1628 
1629 double
speed_mpn_hgcd(struct speed_params * s)1630 speed_mpn_hgcd (struct speed_params *s)
1631 {
1632   SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd, mpn_hgcd_itch);
1633 }
1634 
1635 double
speed_mpn_hgcd_lehmer(struct speed_params * s)1636 speed_mpn_hgcd_lehmer (struct speed_params *s)
1637 {
1638   SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_lehmer, mpn_hgcd_lehmer_itch);
1639 }
1640 
1641 double
speed_mpn_hgcd_appr(struct speed_params * s)1642 speed_mpn_hgcd_appr (struct speed_params *s)
1643 {
1644   SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_appr, mpn_hgcd_appr_itch);
1645 }
1646 
1647 double
speed_mpn_hgcd_appr_lehmer(struct speed_params * s)1648 speed_mpn_hgcd_appr_lehmer (struct speed_params *s)
1649 {
1650   SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_appr_lehmer, mpn_hgcd_appr_lehmer_itch);
1651 }
1652 
1653 double
speed_mpn_hgcd_reduce(struct speed_params * s)1654 speed_mpn_hgcd_reduce (struct speed_params *s)
1655 {
1656   SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL (mpn_hgcd_reduce, mpn_hgcd_reduce_itch);
1657 }
1658 double
speed_mpn_hgcd_reduce_1(struct speed_params * s)1659 speed_mpn_hgcd_reduce_1 (struct speed_params *s)
1660 {
1661   SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL (mpn_hgcd_reduce_1, mpn_hgcd_reduce_1_itch);
1662 }
1663 double
speed_mpn_hgcd_reduce_2(struct speed_params * s)1664 speed_mpn_hgcd_reduce_2 (struct speed_params *s)
1665 {
1666   SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL (mpn_hgcd_reduce_2, mpn_hgcd_reduce_2_itch);
1667 }
1668 
1669 double
speed_mpn_gcd(struct speed_params * s)1670 speed_mpn_gcd (struct speed_params *s)
1671 {
1672   SPEED_ROUTINE_MPN_GCD (mpn_gcd);
1673 }
1674 
1675 double
speed_mpn_gcdext(struct speed_params * s)1676 speed_mpn_gcdext (struct speed_params *s)
1677 {
1678   SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext);
1679 }
1680 #if 0
1681 double
1682 speed_mpn_gcdext_lehmer (struct speed_params *s)
1683 {
1684   SPEED_ROUTINE_MPN_GCDEXT (__gmpn_gcdext_lehmer);
1685 }
1686 #endif
1687 double
speed_mpn_gcdext_single(struct speed_params * s)1688 speed_mpn_gcdext_single (struct speed_params *s)
1689 {
1690   SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext_single);
1691 }
1692 double
speed_mpn_gcdext_double(struct speed_params * s)1693 speed_mpn_gcdext_double (struct speed_params *s)
1694 {
1695   SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext_double);
1696 }
1697 double
speed_mpn_gcdext_one_single(struct speed_params * s)1698 speed_mpn_gcdext_one_single (struct speed_params *s)
1699 {
1700   SPEED_ROUTINE_MPN_GCDEXT_ONE (mpn_gcdext_one_single);
1701 }
1702 double
speed_mpn_gcdext_one_double(struct speed_params * s)1703 speed_mpn_gcdext_one_double (struct speed_params *s)
1704 {
1705   SPEED_ROUTINE_MPN_GCDEXT_ONE (mpn_gcdext_one_double);
1706 }
1707 double
speed_mpn_gcd_1(struct speed_params * s)1708 speed_mpn_gcd_1 (struct speed_params *s)
1709 {
1710   SPEED_ROUTINE_MPN_GCD_1 (mpn_gcd_1);
1711 }
1712 double
speed_mpn_gcd_1N(struct speed_params * s)1713 speed_mpn_gcd_1N (struct speed_params *s)
1714 {
1715   SPEED_ROUTINE_MPN_GCD_1N (mpn_gcd_1);
1716 }
1717 
1718 
1719 double
speed_mpz_jacobi(struct speed_params * s)1720 speed_mpz_jacobi (struct speed_params *s)
1721 {
1722   SPEED_ROUTINE_MPZ_JACOBI (mpz_jacobi);
1723 }
1724 double
speed_mpn_jacobi_base(struct speed_params * s)1725 speed_mpn_jacobi_base (struct speed_params *s)
1726 {
1727   SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base);
1728 }
1729 double
speed_mpn_jacobi_base_1(struct speed_params * s)1730 speed_mpn_jacobi_base_1 (struct speed_params *s)
1731 {
1732   SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_1);
1733 }
1734 double
speed_mpn_jacobi_base_2(struct speed_params * s)1735 speed_mpn_jacobi_base_2 (struct speed_params *s)
1736 {
1737   SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_2);
1738 }
1739 double
speed_mpn_jacobi_base_3(struct speed_params * s)1740 speed_mpn_jacobi_base_3 (struct speed_params *s)
1741 {
1742   SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_3);
1743 }
1744 double
speed_mpn_jacobi_base_4(struct speed_params * s)1745 speed_mpn_jacobi_base_4 (struct speed_params *s)
1746 {
1747   SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_4);
1748 }
1749 
1750 
1751 double
speed_mpn_sqrtrem(struct speed_params * s)1752 speed_mpn_sqrtrem (struct speed_params *s)
1753 {
1754   SPEED_ROUTINE_MPN_SQRTROOT_CALL (mpn_sqrtrem (wp, wp2, s->xp, s->size));
1755 }
1756 
1757 double
speed_mpn_sqrt(struct speed_params * s)1758 speed_mpn_sqrt (struct speed_params *s)
1759 {
1760   SPEED_ROUTINE_MPN_SQRTROOT_CALL (mpn_sqrtrem (wp, NULL, s->xp, s->size));
1761 }
1762 
1763 double
speed_mpn_rootrem(struct speed_params * s)1764 speed_mpn_rootrem (struct speed_params *s)
1765 {
1766   SPEED_ROUTINE_MPN_SQRTROOT_CALL (mpn_rootrem (wp, wp2, s->xp, s->size, s->r));
1767 }
1768 
1769 double
speed_mpn_root(struct speed_params * s)1770 speed_mpn_root (struct speed_params *s)
1771 {
1772   SPEED_ROUTINE_MPN_SQRTROOT_CALL (mpn_rootrem (wp, NULL, s->xp, s->size, s->r));
1773 }
1774 
1775 
1776 double
speed_mpz_fac_ui(struct speed_params * s)1777 speed_mpz_fac_ui (struct speed_params *s)
1778 {
1779   SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui);
1780 }
1781 
1782 double
speed_mpz_2fac_ui(struct speed_params * s)1783 speed_mpz_2fac_ui (struct speed_params *s)
1784 {
1785   SPEED_ROUTINE_MPZ_UI (mpz_2fac_ui);
1786 }
1787 
1788 
1789 double
speed_mpn_fib2_ui(struct speed_params * s)1790 speed_mpn_fib2_ui (struct speed_params *s)
1791 {
1792   SPEED_ROUTINE_MPN_FIB2_UI (mpn_fib2_ui);
1793 }
1794 double
speed_mpz_fib_ui(struct speed_params * s)1795 speed_mpz_fib_ui (struct speed_params *s)
1796 {
1797   SPEED_ROUTINE_MPZ_FIB_UI (mpz_fib_ui);
1798 }
1799 double
speed_mpz_fib2_ui(struct speed_params * s)1800 speed_mpz_fib2_ui (struct speed_params *s)
1801 {
1802   SPEED_ROUTINE_MPZ_FIB2_UI (mpz_fib2_ui);
1803 }
1804 double
speed_mpz_lucnum_ui(struct speed_params * s)1805 speed_mpz_lucnum_ui (struct speed_params *s)
1806 {
1807   SPEED_ROUTINE_MPZ_LUCNUM_UI (mpz_lucnum_ui);
1808 }
1809 double
speed_mpz_lucnum2_ui(struct speed_params * s)1810 speed_mpz_lucnum2_ui (struct speed_params *s)
1811 {
1812   SPEED_ROUTINE_MPZ_LUCNUM2_UI (mpz_lucnum2_ui);
1813 }
1814 
1815 
1816 double
speed_mpz_powm(struct speed_params * s)1817 speed_mpz_powm (struct speed_params *s)
1818 {
1819   SPEED_ROUTINE_MPZ_POWM (mpz_powm);
1820 }
1821 double
speed_mpz_powm_mod(struct speed_params * s)1822 speed_mpz_powm_mod (struct speed_params *s)
1823 {
1824   SPEED_ROUTINE_MPZ_POWM (mpz_powm_mod);
1825 }
1826 double
speed_mpz_powm_redc(struct speed_params * s)1827 speed_mpz_powm_redc (struct speed_params *s)
1828 {
1829   SPEED_ROUTINE_MPZ_POWM (mpz_powm_redc);
1830 }
1831 double
speed_mpz_powm_sec(struct speed_params * s)1832 speed_mpz_powm_sec (struct speed_params *s)
1833 {
1834   SPEED_ROUTINE_MPZ_POWM (mpz_powm_sec);
1835 }
1836 double
speed_mpz_powm_ui(struct speed_params * s)1837 speed_mpz_powm_ui (struct speed_params *s)
1838 {
1839   SPEED_ROUTINE_MPZ_POWM_UI (mpz_powm_ui);
1840 }
1841 
1842 
1843 double
speed_binvert_limb(struct speed_params * s)1844 speed_binvert_limb (struct speed_params *s)
1845 {
1846   SPEED_ROUTINE_MODLIMB_INVERT (binvert_limb);
1847 }
1848 
1849 
1850 double
speed_noop(struct speed_params * s)1851 speed_noop (struct speed_params *s)
1852 {
1853   unsigned  i;
1854 
1855   speed_starttime ();
1856   i = s->reps;
1857   do
1858     noop ();
1859   while (--i != 0);
1860   return speed_endtime ();
1861 }
1862 
1863 double
speed_noop_wxs(struct speed_params * s)1864 speed_noop_wxs (struct speed_params *s)
1865 {
1866   mp_ptr   wp;
1867   unsigned i;
1868   double   t;
1869   TMP_DECL;
1870 
1871   TMP_MARK;
1872   wp = TMP_ALLOC_LIMBS (1);
1873 
1874   speed_starttime ();
1875   i = s->reps;
1876   do
1877     noop_wxs (wp, s->xp, s->size);
1878   while (--i != 0);
1879   t = speed_endtime ();
1880 
1881   TMP_FREE;
1882   return t;
1883 }
1884 
1885 double
speed_noop_wxys(struct speed_params * s)1886 speed_noop_wxys (struct speed_params *s)
1887 {
1888   mp_ptr   wp;
1889   unsigned i;
1890   double   t;
1891   TMP_DECL;
1892 
1893   TMP_MARK;
1894   wp = TMP_ALLOC_LIMBS (1);
1895 
1896   speed_starttime ();
1897   i = s->reps;
1898   do
1899     noop_wxys (wp, s->xp, s->yp, s->size);
1900   while (--i != 0);
1901   t = speed_endtime ();
1902 
1903   TMP_FREE;
1904   return t;
1905 }
1906 
1907 
/* Function-body template for allocate/free timings.  VARIABLES is a
   declaration (without its trailing semicolon) placed at the top of the
   block; CALLS is one or more statements (the last without its semicolon)
   executed s->reps times between speed_starttime and speed_endtime.  The
   expansion returns the value of speed_endtime (), and relies on a
   parameter named s being in scope.  */
#define SPEED_ROUTINE_ALLOC_FREE(variables, calls)      \
  {                                                     \
    unsigned  i;                                        \
    variables;                                          \
                                                        \
    speed_starttime ();                                 \
    i = s->reps;                                        \
    do                                                  \
      {                                                 \
        calls;                                          \
      }                                                 \
    while (--i != 0);                                   \
    return speed_endtime ();                            \
  }
1922 
1923 
1924 /* Compare these to see how much malloc/free costs and then how much
1925    __gmp_default_allocate/free and mpz_init/clear add.  mpz_init/clear or
1926    mpq_init/clear will be doing a 1 limb allocate, so use that as the size
1927    when including them in comparisons.  */
1928 
1929 double
speed_malloc_free(struct speed_params * s)1930 speed_malloc_free (struct speed_params *s)
1931 {
1932   size_t  bytes = s->size * GMP_LIMB_BYTES;
1933   SPEED_ROUTINE_ALLOC_FREE (void *p,
1934 			    p = malloc (bytes);
1935 			    free (p));
1936 }
1937 
1938 double
speed_malloc_realloc_free(struct speed_params * s)1939 speed_malloc_realloc_free (struct speed_params *s)
1940 {
1941   size_t  bytes = s->size * GMP_LIMB_BYTES;
1942   SPEED_ROUTINE_ALLOC_FREE (void *p,
1943 			    p = malloc (GMP_LIMB_BYTES);
1944 			    p = realloc (p, bytes);
1945 			    free (p));
1946 }
1947 
1948 double
speed_gmp_allocate_free(struct speed_params * s)1949 speed_gmp_allocate_free (struct speed_params *s)
1950 {
1951   size_t  bytes = s->size * GMP_LIMB_BYTES;
1952   SPEED_ROUTINE_ALLOC_FREE (void *p,
1953 			    p = (*__gmp_allocate_func) (bytes);
1954 			    (*__gmp_free_func) (p, bytes));
1955 }
1956 
1957 double
speed_gmp_allocate_reallocate_free(struct speed_params * s)1958 speed_gmp_allocate_reallocate_free (struct speed_params *s)
1959 {
1960   size_t  bytes = s->size * GMP_LIMB_BYTES;
1961   SPEED_ROUTINE_ALLOC_FREE
1962     (void *p,
1963      p = (*__gmp_allocate_func) (GMP_LIMB_BYTES);
1964      p = (*__gmp_reallocate_func) (p, bytes, GMP_LIMB_BYTES);
1965      (*__gmp_free_func) (p, bytes));
1966 }
1967 
1968 double
speed_mpz_init_clear(struct speed_params * s)1969 speed_mpz_init_clear (struct speed_params *s)
1970 {
1971   SPEED_ROUTINE_ALLOC_FREE (mpz_t z,
1972 			    mpz_init (z);
1973 			    mpz_clear (z));
1974 }
1975 
1976 double
speed_mpz_init_realloc_clear(struct speed_params * s)1977 speed_mpz_init_realloc_clear (struct speed_params *s)
1978 {
1979   SPEED_ROUTINE_ALLOC_FREE (mpz_t z,
1980 			    mpz_init (z);
1981 			    _mpz_realloc (z, s->size);
1982 			    mpz_clear (z));
1983 }
1984 
1985 double
speed_mpq_init_clear(struct speed_params * s)1986 speed_mpq_init_clear (struct speed_params *s)
1987 {
1988   SPEED_ROUTINE_ALLOC_FREE (mpq_t q,
1989 			    mpq_init (q);
1990 			    mpq_clear (q));
1991 }
1992 
1993 double
speed_mpf_init_clear(struct speed_params * s)1994 speed_mpf_init_clear (struct speed_params *s)
1995 {
1996   SPEED_ROUTINE_ALLOC_FREE (mpf_t f,
1997 			    mpf_init (f);
1998 			    mpf_clear (f));
1999 }
2000 
2001 
2002 /* Compare this to mpn_add_n to see how much overhead mpz_add adds.  Note
2003    that repeatedly calling mpz_add with the same data gives branch prediction
2004    in it an advantage.  */
2005 
2006 double
speed_mpz_add(struct speed_params * s)2007 speed_mpz_add (struct speed_params *s)
2008 {
2009   mpz_t     w, x, y;
2010   unsigned  i;
2011   double    t;
2012 
2013   mpz_init (w);
2014   mpz_init (x);
2015   mpz_init (y);
2016 
2017   mpz_set_n (x, s->xp, s->size);
2018   mpz_set_n (y, s->yp, s->size);
2019   mpz_add (w, x, y);
2020 
2021   speed_starttime ();
2022   i = s->reps;
2023   do
2024     {
2025       mpz_add (w, x, y);
2026     }
2027   while (--i != 0);
2028   t = speed_endtime ();
2029 
2030   mpz_clear (w);
2031   mpz_clear (x);
2032   mpz_clear (y);
2033   return t;
2034 }
2035 
2036 
/* If r==0, calculate binomial(size,size/2),
   otherwise calculate binomial(size,r). */
2039 
2040 double
speed_mpz_bin_uiui(struct speed_params * s)2041 speed_mpz_bin_uiui (struct speed_params *s)
2042 {
2043   mpz_t          w;
2044   unsigned long  k;
2045   unsigned  i;
2046   double    t;
2047 
2048   mpz_init (w);
2049   if (s->r != 0)
2050     k = s->r;
2051   else
2052     k = s->size/2;
2053 
2054   speed_starttime ();
2055   i = s->reps;
2056   do
2057     {
2058       mpz_bin_uiui (w, s->size, k);
2059     }
2060   while (--i != 0);
2061   t = speed_endtime ();
2062 
2063   mpz_clear (w);
2064   return t;
2065 }
2066 
2067 /* If r==0, calculate binomial(2^size,size),
2068    otherwise calculate binomial(2^size,r). */
2069 
2070 double
speed_mpz_bin_ui(struct speed_params * s)2071 speed_mpz_bin_ui (struct speed_params *s)
2072 {
2073   mpz_t          w, x;
2074   unsigned long  k;
2075   unsigned  i;
2076   double    t;
2077 
2078   mpz_init (w);
2079   mpz_init_set_ui (x, 0);
2080 
2081   mpz_setbit (x, s->size);
2082 
2083   if (s->r != 0)
2084     k = s->r;
2085   else
2086     k = s->size;
2087 
2088   speed_starttime ();
2089   i = s->reps;
2090   do
2091     {
2092       mpz_bin_ui (w, x, k);
2093     }
2094   while (--i != 0);
2095   t = speed_endtime ();
2096 
2097   mpz_clear (w);
2098   mpz_clear (x);
2099   return t;
2100 }
2101 
2102 /* The multiplies are successively dependent so the latency is measured, not
2103    the issue rate.  There's only 10 per loop so the code doesn't get too big
2104    since umul_ppmm is several instructions on some cpus.
2105 
2106    Putting the arguments as "h,l,l,h" gets slightly better code from gcc
2107    2.95.2 on x86, it puts only one mov between each mul, not two.  That mov
   will probably show up as a bogus extra cycle, though.
2109 
   The measuring function macros are split into three parts to avoid overflowing
2111    preprocessor expansion space if umul_ppmm is big.
2112 
2113    Limitations:
2114 
2115    Don't blindly use this to set UMUL_TIME in gmp-mparam.h, check the code
2116    generated first, especially on CPUs with low latency multipliers.
2117 
2118    The default umul_ppmm doing h*l will be getting increasing numbers of
2119    high zero bits in the calculation.  CPUs with data-dependent multipliers
2120    will want to use umul_ppmm.1 to get some randomization into the
2121    calculation.  The extra xors and fetches will be a slowdown of course.  */
2122 
/* Part A: opens the function body.  Declares h, l, i and t, sets
   s->time_divisor to 10 (the callers place ten multiplies in each loop
   body), loads h and l from s->xp[0] and s->yp[0], and opens the timed
   do-loop of the s->r == 1 branch.  The caller supplies that loop's body
   and must follow it with SPEED_MACRO_UMUL_PPMM_B.  */
#define SPEED_MACRO_UMUL_PPMM_A \
  {                             \
    mp_limb_t  h, l;            \
    unsigned   i;               \
    double     t;               \
				\
    s->time_divisor = 10;       \
				\
    h = s->xp[0];               \
    l = s->yp[0];               \
				\
    if (s->r == 1)              \
      {                         \
	speed_starttime ();     \
	i = s->reps;            \
	do                      \
	  {

/* Part B: closes the s->r == 1 loop, stores its time in t, and opens the
   timed do-loop of the else branch.  The caller supplies that loop's body
   and must follow it with SPEED_MACRO_UMUL_PPMM_C.  */
#define SPEED_MACRO_UMUL_PPMM_B \
	  }                     \
	while (--i != 0);       \
	t = speed_endtime ();   \
      }                         \
    else                        \
      {                         \
	speed_starttime ();     \
	i = s->reps;            \
	do                      \
	  {

/* Part C: closes the else-branch loop, stores its time in t, passes h and
   l to noop_1 so the results count as used, and returns t.  */
#define SPEED_MACRO_UMUL_PPMM_C                                         \
	  }                                                             \
	while (--i != 0);                                               \
	t = speed_endtime ();                                           \
      }                                                                 \
									\
    /* stop the compiler optimizing away the whole calculation! */      \
    noop_1 (h);                                                         \
    noop_1 (l);                                                         \
									\
    return t;                                                           \
  }
2165 
2166 
/* Time the umul_ppmm macro as a chain of ten multiplies per loop
   iteration, each one consuming the h and l produced by the previous one.
   The variables h and l, the loop control and the return value come from
   the SPEED_MACRO_UMUL_PPMM_A/B/C fragments.  */
double
speed_umul_ppmm (struct speed_params *s)
{
  SPEED_MACRO_UMUL_PPMM_A;
  /* s->r == 1: xor entries of s->xp_block and s->yp_block into h and l
     after every multiply */
  {
    umul_ppmm (h, l, l, h);  h ^= s->xp_block[0]; l ^= s->yp_block[0];
     umul_ppmm (h, l, l, h); h ^= s->xp_block[1]; l ^= s->yp_block[1];
     umul_ppmm (h, l, l, h); h ^= s->xp_block[2]; l ^= s->yp_block[2];
    umul_ppmm (h, l, l, h);  h ^= s->xp_block[3]; l ^= s->yp_block[3];
     umul_ppmm (h, l, l, h); h ^= s->xp_block[4]; l ^= s->yp_block[4];
     umul_ppmm (h, l, l, h); h ^= s->xp_block[5]; l ^= s->yp_block[5];
    umul_ppmm (h, l, l, h);  h ^= s->xp_block[6]; l ^= s->yp_block[6];
     umul_ppmm (h, l, l, h); h ^= s->xp_block[7]; l ^= s->yp_block[7];
     umul_ppmm (h, l, l, h); h ^= s->xp_block[8]; l ^= s->yp_block[8];
    umul_ppmm (h, l, l, h);  h ^= s->xp_block[9]; l ^= s->yp_block[9];
  }
  SPEED_MACRO_UMUL_PPMM_B;
  /* any other s->r: the bare chain of multiplies, no extra data mixed in */
  {
    umul_ppmm (h, l, l, h);
     umul_ppmm (h, l, l, h);
     umul_ppmm (h, l, l, h);
    umul_ppmm (h, l, l, h);
     umul_ppmm (h, l, l, h);
     umul_ppmm (h, l, l, h);
    umul_ppmm (h, l, l, h);
     umul_ppmm (h, l, l, h);
     umul_ppmm (h, l, l, h);
    umul_ppmm (h, l, l, h);
  }
  SPEED_MACRO_UMUL_PPMM_C;
}
2198 
2199 
#if HAVE_NATIVE_mpn_umul_ppmm
/* Same measurement as speed_umul_ppmm, but for the mpn_umul_ppmm function:
   ten chained calls per loop iteration, with h taken from the return value
   and l written through the pointer argument.  Only compiled when
   HAVE_NATIVE_mpn_umul_ppmm is set.  */
double
speed_mpn_umul_ppmm (struct speed_params *s)
{
  SPEED_MACRO_UMUL_PPMM_A;
  /* s->r == 1: xor entries of s->xp_block and s->yp_block into h and l
     after every call */
  {
    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[0]; l ^= s->yp_block[0];
     h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[1]; l ^= s->yp_block[1];
     h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[2]; l ^= s->yp_block[2];
    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[3]; l ^= s->yp_block[3];
     h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[4]; l ^= s->yp_block[4];
     h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[5]; l ^= s->yp_block[5];
    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[6]; l ^= s->yp_block[6];
     h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[7]; l ^= s->yp_block[7];
     h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[8]; l ^= s->yp_block[8];
    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[9]; l ^= s->yp_block[9];
  }
  SPEED_MACRO_UMUL_PPMM_B;
  /* any other s->r: the bare chain of calls */
  {
    h = mpn_umul_ppmm (&l, h, l);
     h = mpn_umul_ppmm (&l, h, l);
     h = mpn_umul_ppmm (&l, h, l);
    h = mpn_umul_ppmm (&l, h, l);
     h = mpn_umul_ppmm (&l, h, l);
     h = mpn_umul_ppmm (&l, h, l);
    h = mpn_umul_ppmm (&l, h, l);
     h = mpn_umul_ppmm (&l, h, l);
     h = mpn_umul_ppmm (&l, h, l);
    h = mpn_umul_ppmm (&l, h, l);
  }
  SPEED_MACRO_UMUL_PPMM_C;
}
#endif
2233 
#if HAVE_NATIVE_mpn_umul_ppmm_r
/* Same measurement as speed_umul_ppmm, but for the mpn_umul_ppmm_r
   function, which here takes the pointer to l as its last argument.  Ten
   chained calls are made per loop iteration.  Only compiled when
   HAVE_NATIVE_mpn_umul_ppmm_r is set.  */
double
speed_mpn_umul_ppmm_r (struct speed_params *s)
{
  SPEED_MACRO_UMUL_PPMM_A;
  /* s->r == 1: xor entries of s->xp_block and s->yp_block into h and l
     after every call */
  {
    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[0]; l ^= s->yp_block[0];
     h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[1]; l ^= s->yp_block[1];
     h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[2]; l ^= s->yp_block[2];
    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[3]; l ^= s->yp_block[3];
     h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[4]; l ^= s->yp_block[4];
     h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[5]; l ^= s->yp_block[5];
    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[6]; l ^= s->yp_block[6];
     h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[7]; l ^= s->yp_block[7];
     h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[8]; l ^= s->yp_block[8];
    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[9]; l ^= s->yp_block[9];
  }
  SPEED_MACRO_UMUL_PPMM_B;
  /* any other s->r: the bare chain of calls */
  {
    h = mpn_umul_ppmm_r (h, l, &l);
     h = mpn_umul_ppmm_r (h, l, &l);
     h = mpn_umul_ppmm_r (h, l, &l);
    h = mpn_umul_ppmm_r (h, l, &l);
     h = mpn_umul_ppmm_r (h, l, &l);
     h = mpn_umul_ppmm_r (h, l, &l);
    h = mpn_umul_ppmm_r (h, l, &l);
     h = mpn_umul_ppmm_r (h, l, &l);
     h = mpn_umul_ppmm_r (h, l, &l);
    h = mpn_umul_ppmm_r (h, l, &l);
  }
  SPEED_MACRO_UMUL_PPMM_C;
}
#endif
2267 
2268 
2269 /* The divisions are successively dependent so latency is measured, not
2270    issue rate.  There's only 10 per loop so the code doesn't get too big,
2271    especially for udiv_qrnnd_preinv and preinv2norm, which are several
2272    instructions each.
2273 
2274    Note that it's only the division which is measured here, there's no data
2275    fetching and no shifting if the divisor gets normalized.
2276 
2277    In speed_udiv_qrnnd with gcc 2.95.2 on x86 the parameters "q,r,r,q,d"
2278    generate x86 div instructions with nothing in between.
2279 
2280    The measuring function macros are in two parts to avoid overflowing
2281    preprocessor expansion space if udiv_qrnnd etc are big.
2282 
2283    Limitations:
2284 
2285    Don't blindly use this to set UDIV_TIME in gmp-mparam.h, check the code
2286    generated first.
2287 
2288    CPUs with data-dependent divisions may want more attention paid to the
2289    randomness of the data used.  Probably the measurement wanted is over
2290    uniformly distributed numbers, but what's here might not be giving that.  */
2291 
/* Part A: opens the function body and the timed do-loop.  Declares t, i,
   q, r, d and dinv, and sets s->time_divisor to 10 (the callers place ten
   divisions in the loop body).  The divisor d is s->r, or
   mp_bases[10].big_base when s->r is zero.  When "normalize" is non-zero d
   is shifted left by its leading zero count and dinv is set with
   invert_limb; otherwise dinv is left unset.  q starts as s->xp[0] and r
   as s->yp[0] % d.  The caller supplies the loop body and must follow it
   with SPEED_ROUTINE_UDIV_QRNND_B.  */
#define SPEED_ROUTINE_UDIV_QRNND_A(normalize)           \
  {                                                     \
    double     t;                                       \
    unsigned   i;                                       \
    mp_limb_t  q, r, d;                                 \
    mp_limb_t  dinv;                                    \
							\
    s->time_divisor = 10;                               \
							\
    /* divisor from "r" parameter, or a default */      \
    d = s->r;                                           \
    if (d == 0)                                         \
      d = mp_bases[10].big_base;                        \
							\
    if (normalize)                                      \
      {                                                 \
	unsigned  norm;                                 \
	count_leading_zeros (norm, d);                  \
	d <<= norm;                                     \
	invert_limb (dinv, d);                          \
      }                                                 \
							\
    q = s->xp[0];                                       \
    r = s->yp[0] % d;                                   \
							\
    speed_starttime ();                                 \
    i = s->reps;                                        \
    do                                                  \
      {

/* Part B: closes the timed loop, stores the time in t, passes q and r to
   noop_1 so the results count as used, and returns t.  */
#define SPEED_ROUTINE_UDIV_QRNND_B                                      \
      }                                                                 \
    while (--i != 0);                                                   \
    t = speed_endtime ();                                               \
									\
    /* stop the compiler optimizing away the whole calculation! */      \
    noop_1 (q);                                                         \
    noop_1 (r);                                                         \
									\
    return t;                                                           \
  }
2333 
/* Time the udiv_qrnnd macro as a chain of ten divisions per loop
   iteration; the q and r written by each call are passed back in as
   inputs to the next.  The divisor is normalized in the setup only when
   UDIV_NEEDS_NORMALIZATION is non-zero.  Variables, loop control and the
   return value come from SPEED_ROUTINE_UDIV_QRNND_A/B.  */
double
speed_udiv_qrnnd (struct speed_params *s)
{
  SPEED_ROUTINE_UDIV_QRNND_A (UDIV_NEEDS_NORMALIZATION);
  {
    udiv_qrnnd (q, r, r, q, d);
     udiv_qrnnd (q, r, r, q, d);
     udiv_qrnnd (q, r, r, q, d);
    udiv_qrnnd (q, r, r, q, d);
     udiv_qrnnd (q, r, r, q, d);
     udiv_qrnnd (q, r, r, q, d);
    udiv_qrnnd (q, r, r, q, d);
     udiv_qrnnd (q, r, r, q, d);
     udiv_qrnnd (q, r, r, q, d);
    udiv_qrnnd (q, r, r, q, d);
  }
  SPEED_ROUTINE_UDIV_QRNND_B;
}
2352 
/* Time __udiv_qrnnd_c as a chain of ten divisions per loop iteration; the
   q and r written by each call are passed back in as inputs to the next.
   The setup always normalizes the divisor (normalize argument 1).  */
double
speed_udiv_qrnnd_c (struct speed_params *s)
{
  SPEED_ROUTINE_UDIV_QRNND_A (1);
  {
    __udiv_qrnnd_c (q, r, r, q, d);
     __udiv_qrnnd_c (q, r, r, q, d);
     __udiv_qrnnd_c (q, r, r, q, d);
    __udiv_qrnnd_c (q, r, r, q, d);
     __udiv_qrnnd_c (q, r, r, q, d);
     __udiv_qrnnd_c (q, r, r, q, d);
    __udiv_qrnnd_c (q, r, r, q, d);
     __udiv_qrnnd_c (q, r, r, q, d);
     __udiv_qrnnd_c (q, r, r, q, d);
    __udiv_qrnnd_c (q, r, r, q, d);
  }
  SPEED_ROUTINE_UDIV_QRNND_B;
}
2371 
#if HAVE_NATIVE_mpn_udiv_qrnnd
/* Time the mpn_udiv_qrnnd function as a chain of ten calls per loop
   iteration; q is taken from the return value and r is written through the
   pointer argument, and both feed the next call.  The setup always
   normalizes the divisor.  Only compiled when HAVE_NATIVE_mpn_udiv_qrnnd
   is set.  */
double
speed_mpn_udiv_qrnnd (struct speed_params *s)
{
  SPEED_ROUTINE_UDIV_QRNND_A (1);
  {
    q = mpn_udiv_qrnnd (&r, r, q, d);
     q = mpn_udiv_qrnnd (&r, r, q, d);
     q = mpn_udiv_qrnnd (&r, r, q, d);
    q = mpn_udiv_qrnnd (&r, r, q, d);
     q = mpn_udiv_qrnnd (&r, r, q, d);
     q = mpn_udiv_qrnnd (&r, r, q, d);
    q = mpn_udiv_qrnnd (&r, r, q, d);
     q = mpn_udiv_qrnnd (&r, r, q, d);
     q = mpn_udiv_qrnnd (&r, r, q, d);
    q = mpn_udiv_qrnnd (&r, r, q, d);
  }
  SPEED_ROUTINE_UDIV_QRNND_B;
}
#endif
2392 
#if HAVE_NATIVE_mpn_udiv_qrnnd_r
/* Time the mpn_udiv_qrnnd_r function, which here takes the pointer to r as
   its last argument, as a chain of ten calls per loop iteration.  The
   setup always normalizes the divisor.  Only compiled when
   HAVE_NATIVE_mpn_udiv_qrnnd_r is set.  */
double
speed_mpn_udiv_qrnnd_r (struct speed_params *s)
{
  SPEED_ROUTINE_UDIV_QRNND_A (1);
  {
    q = mpn_udiv_qrnnd_r (r, q, d, &r);
     q = mpn_udiv_qrnnd_r (r, q, d, &r);
     q = mpn_udiv_qrnnd_r (r, q, d, &r);
    q = mpn_udiv_qrnnd_r (r, q, d, &r);
     q = mpn_udiv_qrnnd_r (r, q, d, &r);
     q = mpn_udiv_qrnnd_r (r, q, d, &r);
    q = mpn_udiv_qrnnd_r (r, q, d, &r);
     q = mpn_udiv_qrnnd_r (r, q, d, &r);
     q = mpn_udiv_qrnnd_r (r, q, d, &r);
    q = mpn_udiv_qrnnd_r (r, q, d, &r);
  }
  SPEED_ROUTINE_UDIV_QRNND_B;
}
#endif
2413 
2414 
/* Time the invert_limb macro.  The whole function body comes from
   SPEED_ROUTINE_INVERT_LIMB_CALL, which is defined outside this file
   section; dinv and d are not declared here and are presumably provided by
   that macro -- confirm against its definition.  */
double
speed_invert_limb (struct speed_params *s)
{
  SPEED_ROUTINE_INVERT_LIMB_CALL (invert_limb (dinv, d));
}
2420 
2421 
2422 /* xp[0] might not be particularly random, but should give an indication how
2423    "/" runs.  Same for speed_operator_mod below.  */
2424 double
speed_operator_div(struct speed_params * s)2425 speed_operator_div (struct speed_params *s)
2426 {
2427   double     t;
2428   unsigned   i;
2429   mp_limb_t  x, q, d;
2430 
2431   s->time_divisor = 10;
2432 
2433   /* divisor from "r" parameter, or a default */
2434   d = s->r;
2435   if (d == 0)
2436     d = mp_bases[10].big_base;
2437 
2438   x = s->xp[0];
2439   q = 0;
2440 
2441   speed_starttime ();
2442   i = s->reps;
2443   do
2444     {
2445       q ^= x; q /= d;
2446        q ^= x; q /= d;
2447        q ^= x; q /= d;
2448       q ^= x; q /= d;
2449        q ^= x; q /= d;
2450        q ^= x; q /= d;
2451       q ^= x; q /= d;
2452        q ^= x; q /= d;
2453        q ^= x; q /= d;
2454       q ^= x; q /= d;
2455     }
2456   while (--i != 0);
2457   t = speed_endtime ();
2458 
2459   /* stop the compiler optimizing away the whole calculation! */
2460   noop_1 (q);
2461 
2462   return t;
2463 }
2464 
2465 double
speed_operator_mod(struct speed_params * s)2466 speed_operator_mod (struct speed_params *s)
2467 {
2468   double     t;
2469   unsigned   i;
2470   mp_limb_t  x, r, d;
2471 
2472   s->time_divisor = 10;
2473 
2474   /* divisor from "r" parameter, or a default */
2475   d = s->r;
2476   if (d == 0)
2477     d = mp_bases[10].big_base;
2478 
2479   x = s->xp[0];
2480   r = 0;
2481 
2482   speed_starttime ();
2483   i = s->reps;
2484   do
2485     {
2486       r ^= x; r %= d;
2487        r ^= x; r %= d;
2488        r ^= x; r %= d;
2489       r ^= x; r %= d;
2490        r ^= x; r %= d;
2491        r ^= x; r %= d;
2492       r ^= x; r %= d;
2493        r ^= x; r %= d;
2494        r ^= x; r %= d;
2495       r ^= x; r %= d;
2496     }
2497   while (--i != 0);
2498   t = speed_endtime ();
2499 
2500   /* stop the compiler optimizing away the whole calculation! */
2501   noop_1 (r);
2502 
2503   return t;
2504 }
2505 
2506 
2507 /* r==0 measures on data with the values uniformly distributed.  This will
2508    be typical for count_trailing_zeros in a GCD etc.
2509 
2510    r==1 measures on data with the resultant count uniformly distributed
2511    between 0 and GMP_LIMB_BITS-1.  This is probably sensible for
2512    count_leading_zeros on the high limbs of divisors.  */
2513 
2514 int
speed_routine_count_zeros_setup(struct speed_params * s,mp_ptr xp,int leading,int zero)2515 speed_routine_count_zeros_setup (struct speed_params *s,
2516 				 mp_ptr xp, int leading, int zero)
2517 {
2518   int        i, c;
2519   mp_limb_t  n;
2520 
2521   if (s->r == 0)
2522     {
2523       /* Make uniformly distributed data.  If zero isn't allowed then change
2524 	 it to 1 for leading, or 0x800..00 for trailing.  */
2525       MPN_COPY (xp, s->xp_block, SPEED_BLOCK_SIZE);
2526       if (! zero)
2527 	for (i = 0; i < SPEED_BLOCK_SIZE; i++)
2528 	  if (xp[i] == 0)
2529 	    xp[i] = leading ? 1 : GMP_LIMB_HIGHBIT;
2530     }
2531   else if (s->r == 1)
2532     {
2533       /* Make counts uniformly distributed.  A randomly chosen bit is set, and
2534 	 for leading the rest above it are cleared, or for trailing then the
2535 	 rest below.  */
2536       for (i = 0; i < SPEED_BLOCK_SIZE; i++)
2537 	{
2538 	  mp_limb_t  set = CNST_LIMB(1) << (s->yp_block[i] % GMP_LIMB_BITS);
2539 	  mp_limb_t  keep_below = set-1;
2540 	  mp_limb_t  keep_above = MP_LIMB_T_MAX ^ keep_below;
2541 	  mp_limb_t  keep = (leading ? keep_below : keep_above);
2542 	  xp[i] = (s->xp_block[i] & keep) | set;
2543 	}
2544     }
2545   else
2546     {
2547       return 0;
2548     }
2549 
2550   /* Account for the effect of n^=c. */
2551   c = 0;
2552   for (i = 0; i < SPEED_BLOCK_SIZE; i++)
2553     {
2554       n = xp[i];
2555       xp[i] ^= c;
2556 
2557       if (leading)
2558 	count_leading_zeros (c, n);
2559       else
2560 	count_trailing_zeros (c, n);
2561     }
2562 
2563   return 1;
2564 }
2565 
/* Time the count_leading_zeros macro.  The surrounding code comes from the
   SPEED_ROUTINE_COUNT_ZEROS_A/B macros, defined outside this file section;
   c and n are presumably declared by them -- confirm against their
   definitions.  */
double
speed_count_leading_zeros (struct speed_params *s)
{
  /* A zero operand is flagged as allowed only when COUNT_LEADING_ZEROS_0
     is defined.  */
#ifdef COUNT_LEADING_ZEROS_0
#define COUNT_LEADING_ZEROS_0_ALLOWED   1
#else
#define COUNT_LEADING_ZEROS_0_ALLOWED   0
#endif

  /* First argument 1 presumably means "leading" and the second is the
     zero-allowed flag, matching the (leading, zero) parameters of
     speed_routine_count_zeros_setup -- confirm against the macro.  */
  SPEED_ROUTINE_COUNT_ZEROS_A (1, COUNT_LEADING_ZEROS_0_ALLOWED);
  count_leading_zeros (c, n);
  SPEED_ROUTINE_COUNT_ZEROS_B ();
}
/* Time the count_trailing_zeros macro.  The surrounding code comes from
   the SPEED_ROUTINE_COUNT_ZEROS_A/B macros, defined outside this file
   section; c and n are presumably declared by them.  The (0, 0) arguments
   presumably mean "trailing" and "zero operand not allowed" -- confirm
   against the macro definition.  */
double
speed_count_trailing_zeros (struct speed_params *s)
{
  SPEED_ROUTINE_COUNT_ZEROS_A (0, 0);
  count_trailing_zeros (c, n);
  SPEED_ROUTINE_COUNT_ZEROS_B ();
}
2586 
2587 
/* Time mpn_get_str.  The whole measurement is supplied by the
   SPEED_ROUTINE_MPN_GET_STR macro, defined outside this file section.  */
double
speed_mpn_get_str (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_GET_STR (mpn_get_str);
}
2593 
/* Time mpn_set_str on s->size input digits.  wp, xp and base are not
   declared here; presumably SPEED_ROUTINE_MPN_SET_STR_CALL (defined
   outside this file section) provides them -- confirm against its
   definition.  */
double
speed_mpn_set_str (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_SET_STR_CALL (mpn_set_str (wp, xp, s->size, base));
}
/* Time mpn_bc_set_str on s->size input digits, using the same measuring
   macro as speed_mpn_set_str.  wp, xp and base are not declared here;
   presumably SPEED_ROUTINE_MPN_SET_STR_CALL provides them.  */
double
speed_mpn_bc_set_str (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_SET_STR_CALL (mpn_bc_set_str (wp, xp, s->size, base));
}
2604 
/* Time the MPN_ZERO macro on s->size limbs.  wp is not declared here;
   presumably SPEED_ROUTINE_MPN_ZERO_CALL (defined outside this file
   section) provides it -- confirm against its definition.  */
double
speed_MPN_ZERO (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_ZERO_CALL (MPN_ZERO (wp, s->size));
}
2610 
2611 
2612 int
speed_randinit(struct speed_params * s,gmp_randstate_ptr rstate)2613 speed_randinit (struct speed_params *s, gmp_randstate_ptr rstate)
2614 {
2615   if (s->r == 0)
2616     gmp_randinit_default (rstate);
2617   else if (s->r == 1)
2618     gmp_randinit_mt (rstate);
2619   else
2620     {
2621       return gmp_randinit_lc_2exp_size (rstate, s->r);
2622     }
2623   return 1;
2624 }
2625 
2626 double
speed_gmp_randseed(struct speed_params * s)2627 speed_gmp_randseed (struct speed_params *s)
2628 {
2629   gmp_randstate_t  rstate;
2630   unsigned  i;
2631   double    t;
2632   mpz_t     x;
2633 
2634   SPEED_RESTRICT_COND (s->size >= 1);
2635   SPEED_RESTRICT_COND (speed_randinit (s, rstate));
2636 
2637   /* s->size bits of seed */
2638   mpz_init_set_n (x, s->xp, s->size);
2639   mpz_fdiv_r_2exp (x, x, (unsigned long) s->size);
2640 
2641   /* cache priming */
2642   gmp_randseed (rstate, x);
2643 
2644   speed_starttime ();
2645   i = s->reps;
2646   do
2647     gmp_randseed (rstate, x);
2648   while (--i != 0);
2649   t = speed_endtime ();
2650 
2651   gmp_randclear (rstate);
2652   mpz_clear (x);
2653   return t;
2654 }
2655 
2656 double
speed_gmp_randseed_ui(struct speed_params * s)2657 speed_gmp_randseed_ui (struct speed_params *s)
2658 {
2659   gmp_randstate_t  rstate;
2660   unsigned  i, j;
2661   double    t;
2662 
2663   SPEED_RESTRICT_COND (speed_randinit (s, rstate));
2664 
2665   /* cache priming */
2666   gmp_randseed_ui (rstate, 123L);
2667 
2668   speed_starttime ();
2669   i = s->reps;
2670   j = 0;
2671   do
2672     {
2673       gmp_randseed_ui (rstate, (unsigned long) s->xp_block[j]);
2674       j++;
2675       if (j >= SPEED_BLOCK_SIZE)
2676 	j = 0;
2677     }
2678   while (--i != 0);
2679   t = speed_endtime ();
2680 
2681   gmp_randclear (rstate);
2682   return t;
2683 }
2684 
2685 double
speed_mpz_urandomb(struct speed_params * s)2686 speed_mpz_urandomb (struct speed_params *s)
2687 {
2688   gmp_randstate_t  rstate;
2689   mpz_t     z;
2690   unsigned  i;
2691   double    t;
2692 
2693   SPEED_RESTRICT_COND (s->size >= 0);
2694   SPEED_RESTRICT_COND (speed_randinit (s, rstate));
2695 
2696   mpz_init (z);
2697 
2698   /* cache priming */
2699   mpz_urandomb (z, rstate, (unsigned long) s->size);
2700   mpz_urandomb (z, rstate, (unsigned long) s->size);
2701 
2702   speed_starttime ();
2703   i = s->reps;
2704   do
2705     mpz_urandomb (z, rstate, (unsigned long) s->size);
2706   while (--i != 0);
2707   t = speed_endtime ();
2708 
2709   mpz_clear (z);
2710   gmp_randclear (rstate);
2711   return t;
2712 }
2713