xref: /netbsd/external/lgpl3/gmp/dist/tune/speed.h (revision 122966f8)
1 /* Header for speed and threshold things.
2 
3 Copyright 1999-2003, 2005, 2006, 2008-2017, 2019 Free Software
4 Foundation, Inc.
5 
6 This file is part of the GNU MP Library.
7 
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of either:
10 
11   * the GNU Lesser General Public License as published by the Free
12     Software Foundation; either version 3 of the License, or (at your
13     option) any later version.
14 
15 or
16 
17   * the GNU General Public License as published by the Free Software
18     Foundation; either version 2 of the License, or (at your option) any
19     later version.
20 
21 or both in parallel, as here.
22 
23 The GNU MP Library is distributed in the hope that it will be useful, but
24 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26 for more details.
27 
28 You should have received copies of the GNU General Public License and the
29 GNU Lesser General Public License along with the GNU MP Library.  If not,
30 see https://www.gnu.org/licenses/.  */
31 
32 #ifndef __SPEED_H__
33 #define __SPEED_H__
34 
35 
/* Grow the limb vector at ptr from oldsize to newsize limbs by padding it
   with zero limbs at the most significant end, ie. from ptr+oldsize
   onwards.  newsize must be at least oldsize.  */
#define MPN_ZERO_EXTEND(ptr, oldsize, newsize)				\
  do {									\
    ASSERT ((oldsize) <= (newsize));					\
    MPN_ZERO ((ptr) + (oldsize), (newsize) - (oldsize));		\
  } while (0)
43 
/* A limb with the n least significant bits set.  n == GMP_LIMB_BITS is
   handled as a separate case because a shift by the full limb width (eg.
   1<<32) doesn't give zero on x86 family CPUs.  */
#define MP_LIMB_T_LOWBITMASK(n)						\
  ((n) != GMP_LIMB_BITS ? ((mp_limb_t) 1 << (n)) - 1 : MP_LIMB_T_MAX)
48 
49 
/* Aligned forms of TMP_ALLOC.  align must be a power of 2 here, and
   CACHE_LINE_SIZE is usually a good choice.  The request is enlarged by
   align-1 bytes, presumably so align_pointer has room to step the block up
   to the wanted alignment.  */

#define TMP_ALLOC_ALIGNED(bytes, align)					\
  align_pointer (TMP_ALLOC ((bytes) + (align) - 1), (align))

#define TMP_ALLOC_LIMBS_ALIGNED(limbs, align)				\
  ((mp_ptr) TMP_ALLOC_ALIGNED ((limbs) * sizeof (mp_limb_t), align))
56 
/* CACHE_LINE_SIZE is our default alignment for speed operands, and the
   limit on what s->align_xp etc can request for off-alignment (the
   adjustment in SPEED_TMP_ALLOC_LIMBS is masked down to one cache line).
   Maybe this should be an option of some sort, but in any case here are
   some line sizes,

       bytes
	 32   pentium
	 64   athlon
	 64   itanium-2 L1
	128   itanium-2 L2
*/
#undef CACHE_LINE_SIZE
#define CACHE_LINE_SIZE  64	/* bytes */

/* Bit mask for a limb offset within one cache line.  SPEED_TMP_ALLOC_LIMBS
   asserts that CACHE_LINE_SIZE is a multiple of GMP_LIMB_BYTES.  */
#define SPEED_TMP_ALLOC_ADJUST_MASK  (CACHE_LINE_SIZE / GMP_LIMB_BYTES - 1)
72 
/* Set ptr to a TMP_ALLOC block of the given limbs, with the given limb
   alignment.

   ptr must be an lvalue of type mp_ptr.  SPEED_TMP_ALLOC_ADJUST_MASK extra
   limbs are allocated, and the start is then moved up by between 0 and
   SPEED_TMP_ALLOC_ADJUST_MASK limbs so that the address of ptr, counted in
   limbs, is congruent to align modulo the number of limbs in a cache line.

   The address is turned into a limb count through an integer cast.
   Subtracting a null pointer from __ptr would be undefined behaviour (and
   is diagnosed by recent compilers); only the low bits of the count are
   used, so the cast gives the same adjustment.  */
#define SPEED_TMP_ALLOC_LIMBS(ptr, limbs, align)			\
  do {									\
    mp_ptr     __ptr;							\
    mp_size_t  __ptr_align, __ptr_add;					\
									\
    ASSERT ((CACHE_LINE_SIZE % GMP_LIMB_BYTES) == 0);		\
    __ptr = TMP_ALLOC_LIMBS ((limbs) + SPEED_TMP_ALLOC_ADJUST_MASK);	\
    /* address of the block measured in limbs */			\
    __ptr_align = (mp_size_t) ((size_t) __ptr / GMP_LIMB_BYTES);	\
    /* limbs to skip to reach the requested alignment */		\
    __ptr_add = ((align) - __ptr_align) & SPEED_TMP_ALLOC_ADJUST_MASK;	\
    (ptr) = __ptr + __ptr_add;						\
  } while (0)
86 
87 
/* This is the size for s->xp_block and s->yp_block, used in certain
   routines that want to run across many different data values and use
   s->size for a different purpose, eg. SPEED_ROUTINE_MPN_GCD_1.

   With 4-byte limbs, 512 means 2kbytes of data for each of xp_block and
   yp_block, making 4k total (8-byte limbs double that), which should fit
   easily in an L1 data cache. */

#define SPEED_BLOCK_SIZE  512	/* limbs */
96 
97 
/* Timing state, defined outside this header.  */
extern double speed_unittime;
extern double speed_cycletime;
extern int    speed_precision;
extern char   speed_time_string[];

/* Timing setup.  */
void speed_time_init (void);
void speed_cycletime_fail (const char *str);
void speed_cycletime_init (void);
void speed_cycletime_need_cycles (void);
void speed_cycletime_need_seconds (void);

/* A measurement is bracketed by speed_starttime and speed_endtime, the
   latter giving the result as a double.  */
void   speed_starttime (void);
double speed_endtime (void);
109 
110 
111 struct speed_params {
112   unsigned   reps;	/* how many times to run the routine */
113   mp_ptr     xp;	/* first argument */
114   mp_ptr     yp;	/* second argument */
115   mp_size_t  size;	/* size of both arguments */
116   mp_limb_t  r;		/* user supplied parameter */
117   mp_size_t  align_xp;	/* alignment of xp */
118   mp_size_t  align_yp;	/* alignment of yp */
119   mp_size_t  align_wp;	/* intended alignment of wp */
120   mp_size_t  align_wp2; /* intended alignment of wp2 */
121   mp_ptr     xp_block;	/* first special SPEED_BLOCK_SIZE block */
122   mp_ptr     yp_block;	/* second special SPEED_BLOCK_SIZE block */
123 
124   double     time_divisor; /* optionally set by the speed routine */
125 
126   /* used by the cache priming things */
127   int	     cache;
128   unsigned   src_num, dst_num;
129   struct {
130     mp_ptr    ptr;
131     mp_size_t size;
132   } src[5], dst[4];
133 };
134 
/* The form of a measuring routine.  Routines built from the macros in
   this file return the speed_endtime value, or -1.0 when
   SPEED_RESTRICT_COND rejects the parameters.  */
typedef double (*speed_function_t) (struct speed_params *s);

double speed_measure (speed_function_t fun, struct speed_params *s);
138 
/* Prototypes for speed measuring routines.  Each one takes the parameters
   in s and has the speed_function_t form.  */

double speed_back_to_back (struct speed_params *s);
double speed_count_leading_zeros (struct speed_params *s);
double speed_count_trailing_zeros (struct speed_params *s);
double speed_find_a (struct speed_params *s);
double speed_gmp_allocate_free (struct speed_params *s);
double speed_gmp_allocate_reallocate_free (struct speed_params *s);
double speed_invert_limb (struct speed_params *s);
double speed_malloc_free (struct speed_params *s);
double speed_malloc_realloc_free (struct speed_params *s);
double speed_memcpy (struct speed_params *s);
double speed_binvert_limb (struct speed_params *s);
double speed_binvert_limb_mul1 (struct speed_params *s);
double speed_binvert_limb_loop (struct speed_params *s);
double speed_binvert_limb_cond (struct speed_params *s);
double speed_binvert_limb_arith (struct speed_params *s);

double speed_mpf_init_clear (struct speed_params *s);
158 
/* mpn measuring routines, speed_mpn_add_n to the old FFT forms.  */
double speed_mpn_add_n (struct speed_params *s);
double speed_mpn_add_1 (struct speed_params *s);
double speed_mpn_add_1_inplace (struct speed_params *s);
double speed_mpn_add_err1_n (struct speed_params *s);
double speed_mpn_add_err2_n (struct speed_params *s);
double speed_mpn_add_err3_n (struct speed_params *s);
double speed_mpn_addlsh_n (struct speed_params *s);
double speed_mpn_addlsh1_n (struct speed_params *s);
double speed_mpn_addlsh2_n (struct speed_params *s);
double speed_mpn_addlsh_n_ip1 (struct speed_params *s);
double speed_mpn_addlsh1_n_ip1 (struct speed_params *s);
double speed_mpn_addlsh2_n_ip1 (struct speed_params *s);
double speed_mpn_addlsh_n_ip2 (struct speed_params *s);
double speed_mpn_addlsh1_n_ip2 (struct speed_params *s);
double speed_mpn_addlsh2_n_ip2 (struct speed_params *s);
double speed_mpn_add_n_sub_n (struct speed_params *s);
double speed_mpn_and_n (struct speed_params *s);
double speed_mpn_andn_n (struct speed_params *s);
double speed_mpn_addmul_1 (struct speed_params *s);
double speed_mpn_addmul_2 (struct speed_params *s);
double speed_mpn_addmul_3 (struct speed_params *s);
double speed_mpn_addmul_4 (struct speed_params *s);
double speed_mpn_addmul_5 (struct speed_params *s);
double speed_mpn_addmul_6 (struct speed_params *s);
double speed_mpn_addmul_7 (struct speed_params *s);
double speed_mpn_addmul_8 (struct speed_params *s);
double speed_mpn_cnd_add_n (struct speed_params *s);
double speed_mpn_cnd_sub_n (struct speed_params *s);
double speed_mpn_com (struct speed_params *s);
double speed_mpn_neg (struct speed_params *s);
double speed_mpn_copyd (struct speed_params *s);
double speed_mpn_copyi (struct speed_params *s);
double speed_MPN_COPY (struct speed_params *s);
double speed_MPN_COPY_DECR (struct speed_params *s);
double speed_MPN_COPY_INCR (struct speed_params *s);
double speed_mpn_sec_tabselect (struct speed_params *s);
double speed_mpn_divexact_1 (struct speed_params *s);
double speed_mpn_divexact_by3 (struct speed_params *s);
double speed_mpn_bdiv_q_1 (struct speed_params *s);
double speed_mpn_pi1_bdiv_q_1 (struct speed_params *s);
double speed_mpn_bdiv_dbm1c (struct speed_params *s);
double speed_mpn_divrem_1 (struct speed_params *s);
double speed_mpn_divrem_1f (struct speed_params *s);
double speed_mpn_divrem_1c (struct speed_params *s);
double speed_mpn_divrem_1cf (struct speed_params *s);
double speed_mpn_divrem_1_div (struct speed_params *s);
double speed_mpn_divrem_1f_div (struct speed_params *s);
double speed_mpn_divrem_1_inv (struct speed_params *s);
double speed_mpn_divrem_1f_inv (struct speed_params *s);
double speed_mpn_divrem_2 (struct speed_params *s);
double speed_mpn_divrem_2_div (struct speed_params *s);
double speed_mpn_divrem_2_inv (struct speed_params *s);
double speed_mpn_div_qr_1n_pi1 (struct speed_params *s);
double speed_mpn_div_qr_1n_pi1_1 (struct speed_params *s);
double speed_mpn_div_qr_1n_pi1_2 (struct speed_params *s);
double speed_mpn_div_qr_1 (struct speed_params *s);
double speed_mpn_div_qr_2n (struct speed_params *s);
double speed_mpn_div_qr_2u (struct speed_params *s);
double speed_mpn_fib2_ui (struct speed_params *s);
double speed_mpn_matrix22_mul (struct speed_params *s);
double speed_mpn_hgcd2 (struct speed_params *s);
double speed_mpn_hgcd2_1 (struct speed_params *s);
double speed_mpn_hgcd2_2 (struct speed_params *s);
double speed_mpn_hgcd2_3 (struct speed_params *s);
double speed_mpn_hgcd2_4 (struct speed_params *s);
double speed_mpn_hgcd2_5 (struct speed_params *s);
double speed_mpn_hgcd (struct speed_params *s);
double speed_mpn_hgcd_lehmer (struct speed_params *s);
double speed_mpn_hgcd_appr (struct speed_params *s);
double speed_mpn_hgcd_appr_lehmer (struct speed_params *s);
double speed_mpn_hgcd_reduce (struct speed_params *s);
double speed_mpn_hgcd_reduce_1 (struct speed_params *s);
double speed_mpn_hgcd_reduce_2 (struct speed_params *s);
double speed_mpn_gcd (struct speed_params *s);
double speed_mpn_gcd_1 (struct speed_params *s);
double speed_mpn_gcd_11 (struct speed_params *s);
double speed_mpn_gcd_1N (struct speed_params *s);
double speed_mpn_gcd_22 (struct speed_params *s);
double speed_mpn_gcdext (struct speed_params *s);
double speed_mpn_gcdext_double (struct speed_params *s);
double speed_mpn_gcdext_one_double (struct speed_params *s);
double speed_mpn_gcdext_one_single (struct speed_params *s);
double speed_mpn_gcdext_single (struct speed_params *s);
double speed_mpn_get_str (struct speed_params *s);
double speed_mpn_hamdist (struct speed_params *s);
double speed_mpn_ior_n (struct speed_params *s);
double speed_mpn_iorn_n (struct speed_params *s);
double speed_mpn_jacobi_base (struct speed_params *s);
double speed_mpn_jacobi_base_1 (struct speed_params *s);
double speed_mpn_jacobi_base_2 (struct speed_params *s);
double speed_mpn_jacobi_base_3 (struct speed_params *s);
double speed_mpn_jacobi_base_4 (struct speed_params *s);
double speed_mpn_lshift (struct speed_params *s);
double speed_mpn_lshiftc (struct speed_params *s);
double speed_mpn_mod_1 (struct speed_params *s);
double speed_mpn_mod_1c (struct speed_params *s);
double speed_mpn_mod_1_div (struct speed_params *s);
double speed_mpn_mod_1_inv (struct speed_params *s);
double speed_mpn_mod_1_1 (struct speed_params *s);
double speed_mpn_mod_1_1_1 (struct speed_params *s);
double speed_mpn_mod_1_1_2 (struct speed_params *s);
double speed_mpn_mod_1_2 (struct speed_params *s);
double speed_mpn_mod_1_3 (struct speed_params *s);
double speed_mpn_mod_1_4 (struct speed_params *s);
double speed_mpn_mod_34lsub1 (struct speed_params *s);
double speed_mpn_modexact_1_odd (struct speed_params *s);
double speed_mpn_modexact_1c_odd (struct speed_params *s);
double speed_mpn_mul_1 (struct speed_params *s);
double speed_mpn_mul_1_inplace (struct speed_params *s);
double speed_mpn_mul_2 (struct speed_params *s);
double speed_mpn_mul_3 (struct speed_params *s);
double speed_mpn_mul_4 (struct speed_params *s);
double speed_mpn_mul_5 (struct speed_params *s);
double speed_mpn_mul_6 (struct speed_params *s);
double speed_mpn_mul (struct speed_params *s);
double speed_mpn_mul_basecase (struct speed_params *s);
double speed_mpn_mulmid (struct speed_params *s);
double speed_mpn_mulmid_basecase (struct speed_params *s);
double speed_mpn_mul_fft (struct speed_params *s);
double speed_mpn_mul_fft_sqr (struct speed_params *s);
double speed_mpn_fft_mul (struct speed_params *s);
double speed_mpn_fft_sqr (struct speed_params *s);
#if WANT_OLD_FFT_FULL
double speed_mpn_mul_fft_full (struct speed_params *s);
double speed_mpn_mul_fft_full_sqr (struct speed_params *s);
#endif
/* mpn measuring routines, speed_mpn_nussbaumer_mul to speed_MPN_ZERO.  */
double speed_mpn_nussbaumer_mul (struct speed_params *s);
double speed_mpn_nussbaumer_mul_sqr (struct speed_params *s);
double speed_mpn_mul_n (struct speed_params *s);
double speed_mpn_mul_n_sqr (struct speed_params *s);
double speed_mpn_mulmid_n (struct speed_params *s);
double speed_mpn_sqrlo (struct speed_params *s);
double speed_mpn_sqrlo_basecase (struct speed_params *s);
double speed_mpn_mullo_n (struct speed_params *s);
double speed_mpn_mullo_basecase (struct speed_params *s);
double speed_mpn_nand_n (struct speed_params *s);
double speed_mpn_nior_n (struct speed_params *s);
double speed_mpn_popcount (struct speed_params *s);
double speed_mpn_preinv_divrem_1 (struct speed_params *s);
double speed_mpn_preinv_divrem_1f (struct speed_params *s);
double speed_mpn_preinv_mod_1 (struct speed_params *s);
double speed_mpn_sbpi1_div_qr (struct speed_params *s);
double speed_mpn_dcpi1_div_qr (struct speed_params *s);
double speed_mpn_sbpi1_divappr_q (struct speed_params *s);
double speed_mpn_dcpi1_divappr_q (struct speed_params *s);
double speed_mpn_mu_div_qr (struct speed_params *s);
double speed_mpn_mu_divappr_q (struct speed_params *s);
double speed_mpn_mupi_div_qr (struct speed_params *s);
double speed_mpn_mu_div_q (struct speed_params *s);
double speed_mpn_sbpi1_bdiv_qr (struct speed_params *s);
double speed_mpn_dcpi1_bdiv_qr (struct speed_params *s);
double speed_mpn_sbpi1_bdiv_q (struct speed_params *s);
double speed_mpn_dcpi1_bdiv_q (struct speed_params *s);
double speed_mpn_sbpi1_bdiv_r (struct speed_params *s);
double speed_mpn_mu_bdiv_q (struct speed_params *s);
double speed_mpn_mu_bdiv_qr (struct speed_params *s);
double speed_mpn_broot (struct speed_params *s);
double speed_mpn_broot_invm1 (struct speed_params *s);
double speed_mpn_brootinv (struct speed_params *s);
double speed_mpn_invert (struct speed_params *s);
double speed_mpn_invertappr (struct speed_params *s);
double speed_mpn_ni_invertappr (struct speed_params *s);
double speed_mpn_sec_invert (struct speed_params *s);
double speed_mpn_binvert (struct speed_params *s);
double speed_mpn_redc_1 (struct speed_params *s);
double speed_mpn_redc_2 (struct speed_params *s);
double speed_mpn_redc_n (struct speed_params *s);
double speed_mpn_rsblsh_n (struct speed_params *s);
double speed_mpn_rsblsh1_n (struct speed_params *s);
double speed_mpn_rsblsh2_n (struct speed_params *s);
double speed_mpn_rsh1add_n (struct speed_params *s);
double speed_mpn_rsh1sub_n (struct speed_params *s);
double speed_mpn_rshift (struct speed_params *s);
double speed_mpn_sb_divrem_m3 (struct speed_params *s);
double speed_mpn_sb_divrem_m3_div (struct speed_params *s);
double speed_mpn_sb_divrem_m3_inv (struct speed_params *s);
double speed_mpn_set_str (struct speed_params *s);
double speed_mpn_bc_set_str (struct speed_params *s);
double speed_mpn_dc_set_str (struct speed_params *s);
double speed_mpn_set_str_pre (struct speed_params *s);
double speed_mpn_sqr_basecase (struct speed_params *s);
double speed_mpn_sqr_diag_addlsh1 (struct speed_params *s);
double speed_mpn_sqr_diagonal (struct speed_params *s);
double speed_mpn_sqr (struct speed_params *s);
double speed_mpn_sqrtrem (struct speed_params *s);
double speed_mpn_rootrem (struct speed_params *s);
double speed_mpn_sqrt (struct speed_params *s);
double speed_mpn_root (struct speed_params *s);
double speed_mpn_perfect_power_p (struct speed_params *s);
double speed_mpn_perfect_square_p (struct speed_params *s);
double speed_mpn_sub_n (struct speed_params *s);
double speed_mpn_sub_1 (struct speed_params *s);
double speed_mpn_sub_1_inplace (struct speed_params *s);
double speed_mpn_sub_err1_n (struct speed_params *s);
double speed_mpn_sub_err2_n (struct speed_params *s);
double speed_mpn_sub_err3_n (struct speed_params *s);
double speed_mpn_sublsh_n (struct speed_params *s);
double speed_mpn_sublsh1_n (struct speed_params *s);
double speed_mpn_sublsh2_n (struct speed_params *s);
double speed_mpn_sublsh_n_ip1 (struct speed_params *s);
double speed_mpn_sublsh1_n_ip1 (struct speed_params *s);
double speed_mpn_sublsh2_n_ip1 (struct speed_params *s);
double speed_mpn_submul_1 (struct speed_params *s);
double speed_mpn_toom2_sqr (struct speed_params *s);
double speed_mpn_toom3_sqr (struct speed_params *s);
double speed_mpn_toom4_sqr (struct speed_params *s);
double speed_mpn_toom6_sqr (struct speed_params *s);
double speed_mpn_toom8_sqr (struct speed_params *s);
double speed_mpn_toom22_mul (struct speed_params *s);
double speed_mpn_toom33_mul (struct speed_params *s);
double speed_mpn_toom44_mul (struct speed_params *s);
double speed_mpn_toom6h_mul (struct speed_params *s);
double speed_mpn_toom8h_mul (struct speed_params *s);
double speed_mpn_toom32_mul (struct speed_params *s);
double speed_mpn_toom42_mul (struct speed_params *s);
double speed_mpn_toom43_mul (struct speed_params *s);
double speed_mpn_toom63_mul (struct speed_params *s);
double speed_mpn_toom32_for_toom43_mul (struct speed_params *s);
double speed_mpn_toom43_for_toom32_mul (struct speed_params *s);
double speed_mpn_toom32_for_toom53_mul (struct speed_params *s);
double speed_mpn_toom53_for_toom32_mul (struct speed_params *s);
double speed_mpn_toom42_for_toom53_mul (struct speed_params *s);
double speed_mpn_toom53_for_toom42_mul (struct speed_params *s);
double speed_mpn_toom43_for_toom54_mul (struct speed_params *s);
double speed_mpn_toom54_for_toom43_mul (struct speed_params *s);
double speed_mpn_toom42_mulmid (struct speed_params *s);
double speed_mpn_mulmod_bnm1 (struct speed_params *s);
double speed_mpn_bc_mulmod_bnm1 (struct speed_params *s);
double speed_mpn_mulmod_bnm1_rounded (struct speed_params *s);
double speed_mpn_sqrmod_bnm1 (struct speed_params *s);
double speed_mpn_udiv_qrnnd (struct speed_params *s);
double speed_mpn_udiv_qrnnd_r (struct speed_params *s);
double speed_mpn_umul_ppmm (struct speed_params *s);
double speed_mpn_umul_ppmm_r (struct speed_params *s);
double speed_mpn_xnor_n (struct speed_params *s);
double speed_mpn_xor_n (struct speed_params *s);
double speed_MPN_ZERO (struct speed_params *s);
396 
/* mpq, mpz, random number, no-op, operator and longlong.h style measuring
   routines.  */
double speed_mpq_init_clear (struct speed_params *s);

double speed_mpz_add (struct speed_params *s);
double speed_mpz_invert (struct speed_params *s);
double speed_mpz_bin_uiui (struct speed_params *s);
double speed_mpz_bin_ui (struct speed_params *s);
double speed_mpz_fac_ui (struct speed_params *s);
double speed_mpz_2fac_ui (struct speed_params *s);
double speed_mpz_mfac_uiui (struct speed_params *s);
double speed_mpz_primorial_ui (struct speed_params *s);
double speed_mpz_fib_ui (struct speed_params *s);
double speed_mpz_fib2_ui (struct speed_params *s);
double speed_mpz_init_clear (struct speed_params *s);
double speed_mpz_init_realloc_clear (struct speed_params *s);
double speed_mpz_nextprime (struct speed_params *s);
double speed_mpz_jacobi (struct speed_params *s);
double speed_mpz_lucnum_ui (struct speed_params *s);
double speed_mpz_lucnum2_ui (struct speed_params *s);
double speed_mpz_mod (struct speed_params *s);
double speed_mpz_powm (struct speed_params *s);
double speed_mpz_powm_mod (struct speed_params *s);
double speed_mpz_powm_redc (struct speed_params *s);
double speed_mpz_powm_sec (struct speed_params *s);
double speed_mpz_powm_ui (struct speed_params *s);
double speed_mpz_urandomb (struct speed_params *s);

double speed_gmp_randseed (struct speed_params *s);
double speed_gmp_randseed_ui (struct speed_params *s);

double speed_noop (struct speed_params *s);
double speed_noop_wxs (struct speed_params *s);
double speed_noop_wxys (struct speed_params *s);

double speed_operator_div (struct speed_params *s);
double speed_operator_mod (struct speed_params *s);

double speed_udiv_qrnnd (struct speed_params *s);
double speed_udiv_qrnnd_preinv1 (struct speed_params *s);
double speed_udiv_qrnnd_preinv2 (struct speed_params *s);
double speed_udiv_qrnnd_preinv3 (struct speed_params *s);
double speed_udiv_qrnnd_c (struct speed_params *s);
double speed_umul_ppmm (struct speed_params *s);
439 
440 /* Prototypes for other routines */
441 
442 #if defined (__cplusplus)
443 extern "C" {
444 #endif
445 
446 /* low 32-bits in p[0], high 32-bits in p[1] */
447 void speed_cyclecounter (unsigned p[2]);
448 
449 #if defined (__cplusplus)
450 }
451 #endif
452 
453 void mftb_function (unsigned p[2]);
454 
455 double speed_cyclecounter_diff (const unsigned [2], const unsigned [2]);
456 int gettimeofday_microseconds_p (void);
457 int getrusage_microseconds_p (void);
458 int cycles_works_p (void);
459 long clk_tck (void);
460 double freq_measure (const char *, double (*)(void));
461 
462 int double_cmp_ptr (const double *, const double *);
463 void pentium_wbinvd (void);
464 typedef int (*qsort_function_t) (const void *, const void *);
465 
466 void noop (void);
467 void noop_1 (mp_limb_t);
468 void noop_wxs (mp_ptr, mp_srcptr, mp_size_t);
469 void noop_wxys (mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
470 void mpn_cache_fill (mp_srcptr, mp_size_t);
471 void mpn_cache_fill_dummy (mp_limb_t);
472 void speed_cache_fill (struct speed_params *);
473 void speed_operand_src (struct speed_params *, mp_ptr, mp_size_t);
474 void speed_operand_dst (struct speed_params *, mp_ptr, mp_size_t);
475 
476 extern int  speed_option_addrs;
477 extern int  speed_option_verbose;
478 extern int  speed_option_cycles_broken;
479 void speed_option_set (const char *);
480 
481 mp_limb_t mpn_div_qr_1n_pi1_1 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
482 mp_limb_t mpn_div_qr_1n_pi1_2 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
483 
484 mp_limb_t mpn_divrem_1_div (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
485 mp_limb_t mpn_divrem_1_inv (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
486 mp_limb_t mpn_divrem_2_div (mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr);
487 mp_limb_t mpn_divrem_2_inv (mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr);
488 
489 int mpn_jacobi_base_1 (mp_limb_t, mp_limb_t, int);
490 int mpn_jacobi_base_2 (mp_limb_t, mp_limb_t, int);
491 int mpn_jacobi_base_3 (mp_limb_t, mp_limb_t, int);
492 int mpn_jacobi_base_4 (mp_limb_t, mp_limb_t, int);
493 
494 int mpn_hgcd2_1 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
495 int mpn_hgcd2_2 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
496 int mpn_hgcd2_3 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
497 int mpn_hgcd2_4 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
498 int mpn_hgcd2_5 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
499 
500 mp_limb_t mpn_mod_1_div (mp_srcptr, mp_size_t, mp_limb_t);
501 mp_limb_t mpn_mod_1_inv (mp_srcptr, mp_size_t, mp_limb_t);
502 
503 mp_limb_t mpn_mod_1_1p_1 (mp_srcptr, mp_size_t, mp_limb_t, const mp_limb_t [4]);
504 mp_limb_t mpn_mod_1_1p_2 (mp_srcptr, mp_size_t, mp_limb_t, const mp_limb_t [4]);
505 
506 void mpn_mod_1_1p_cps_1 (mp_limb_t [4], mp_limb_t);
507 void mpn_mod_1_1p_cps_2 (mp_limb_t [4], mp_limb_t);
508 
509 mp_size_t mpn_gcdext_one_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
510 mp_size_t mpn_gcdext_one_single (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
511 mp_size_t mpn_gcdext_single (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
512 mp_size_t mpn_gcdext_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
513 mp_size_t mpn_hgcd_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr);
514 mp_size_t mpn_hgcd_lehmer_itch (mp_size_t);
515 
516 mp_size_t mpn_hgcd_appr_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr);
517 mp_size_t mpn_hgcd_appr_lehmer_itch (mp_size_t);
518 
519 mp_size_t mpn_hgcd_reduce_1 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr);
520 mp_size_t mpn_hgcd_reduce_1_itch (mp_size_t, mp_size_t);
521 
522 mp_size_t mpn_hgcd_reduce_2 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr);
523 mp_size_t mpn_hgcd_reduce_2_itch (mp_size_t, mp_size_t);
524 
525 mp_limb_t mpn_sb_divrem_mn_div (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t);
526 mp_limb_t mpn_sb_divrem_mn_inv (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t);
527 
528 mp_size_t mpn_set_str_basecase (mp_ptr, const unsigned char *, size_t, int);
529 void mpn_pre_set_str (mp_ptr, unsigned char *, size_t, powers_t *, mp_ptr);
530 
531 void mpz_powm_mod (mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr);
532 void mpz_powm_redc (mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr);
533 
534 int speed_routine_count_zeros_setup (struct speed_params *, mp_ptr, int, int);
535 
536 
537 /* "get" is called repeatedly until it ticks over, just in case on a fast
538    processor it takes less than a microsecond, though this is probably
539    unlikely if it's a system call.
540 
541    speed_cyclecounter is called on the same side of the "get" for the start
542    and end measurements.  It doesn't matter how long it takes from the "get"
543    sample to the cycles sample, since that period will cancel out in the
544    difference calculation (assuming it's the same each time).
545 
546    Letting the test run for more than a process time slice is probably only
547    going to reduce accuracy, especially for getrusage when the cycle counter
548    is real time, or for gettimeofday if the cycle counter is in fact process
549    time.  Use CLK_TCK/2 as a reasonable stop.
550 
551    It'd be desirable to be quite accurate here.  The default speed_precision
552    for a cycle counter is 10000 cycles, so to mix that with getrusage or
553    gettimeofday the frequency should be at least that accurate.  But running
554    measurements for 10000 microseconds (or more) is too long.  Be satisfied
555    with just a half clock tick (5000 microseconds usually).  */
556 
557 #define FREQ_MEASURE_ONE(name, type, get, getc, sec, usec)		\
558   do {									\
559     type      st1, st, et1, et;						\
560     unsigned  sc[2], ec[2];						\
561     long      dt, half_tick;						\
562     double    dc, cyc;							\
563 									\
564     half_tick = (1000000L / clk_tck()) / 2;				\
565 									\
566     get (st1);								\
567     do {								\
568       get (st);								\
569     } while (usec(st) == usec(st1) && sec(st) == sec(st1));		\
570 									\
571     getc (sc);								\
572 									\
573     for (;;)								\
574       {									\
575 	get (et1);							\
576 	do {								\
577 	  get (et);							\
578 	} while (usec(et) == usec(et1) && sec(et) == sec(et1));		\
579 									\
580 	getc (ec);							\
581 									\
582 	dc = speed_cyclecounter_diff (ec, sc);				\
583 									\
584 	/* allow secs to cancel before multiplying */			\
585 	dt = sec(et) - sec(st);						\
586 	dt = dt * 1000000L + (usec(et) - usec(st));			\
587 									\
588 	if (dt >= half_tick)						\
589 	  break;							\
590       }									\
591 									\
592     cyc = dt * 1e-6 / dc;						\
593 									\
594     if (speed_option_verbose >= 2)					\
595       printf ("freq_measure_%s_one() dc=%.6g dt=%ld cyc=%.6g\n",	\
596 	      name, dc, dt, cyc);					\
597 									\
598     return dt * 1e-6 / dc;						\
599 									\
600   } while (0)
601 
602 
603 
604 
605 /* The measuring routines use these big macros to save duplication for
606    similar forms.  They also get used for some automatically generated
607    measuring of new implementations of functions.
608 
609    Having something like SPEED_ROUTINE_BINARY_N as a subroutine accepting a
610    function pointer is considered undesirable since it's not the way a
611    normal application will be calling, and some processors might do
612    different things with an indirect call, like not branch predicting, or
613    doing a full pipe flush.  At least some of the "functions" measured are
614    actually macros too.
615 
616    The net effect is to bloat the object code, possibly in a big way, but
617    only what's being measured is being run, so that doesn't matter.
618 
619    The loop forms don't try to cope with __GMP_ATTRIBUTE_PURE or
620    ATTRIBUTE_CONST on the called functions.  Adding a cast to a non-pure
621    function pointer doesn't work in gcc 3.2.  Using an actual non-pure
622    function pointer variable works, but stands a real risk of a
623    non-optimizing compiler generating unnecessary overheads in the call.
624    Currently the best idea is not to use those attributes for a timing
625    program build.  __GMP_NO_ATTRIBUTE_CONST_PURE will tell gmp.h and
626    gmp-impl.h to omit them from routines there.  */
627 
/* Make the enclosing measuring routine return -1.0 unless cond holds.  The
   expansion is already a complete statement; a ";" written after a use is
   just an empty statement.  */
#define SPEED_RESTRICT_COND(cond)   if (!(cond)) { return -1.0; }
629 
/* For mpn_copy or similar.  "call" is the statement to be timed; it's
   expanded inside the block, so it can refer to s and to wp, the
   destination of s->size limbs allocated here.  Sizes below zero are
   rejected, and the result is the time for s->reps runs of "call".  */
#define SPEED_ROUTINE_MPN_COPY_CALL(call)				\
  {									\
    double    t;							\
    unsigned  i;							\
    mp_ptr    wp;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 0);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
									\
    /* operands for speed_cache_fill */					\
    speed_operand_src (s, s->xp, s->size);				\
    speed_operand_dst (s, wp, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      {									\
	call;								\
      }									\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }

/* The usual case, function (dst, src, size).  */
#define SPEED_ROUTINE_MPN_COPY(function)				\
  SPEED_ROUTINE_MPN_COPY_CALL (function (wp, s->xp, s->size))
659 
/* For mpn_sec_tabselect or similar, called as
   function (wp, xp, s->size, s->r, s->r/2) with xp a block of
   s->size * s->r limbs allocated here.  A zero s->r is replaced by
   s->size (stored back into s).  The time returned is divided by s->r.  */
#define SPEED_ROUTINE_MPN_TABSELECT(function)				\
  {									\
    double    t;							\
    unsigned  i;							\
    mp_ptr    xp, wp;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 0);					\
									\
    if (s->r == 0)							\
      s->r = s->size;	/* default to a quadratic shape */		\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (xp, s->size * s->r, s->align_xp);		\
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
									\
    /* operands for speed_cache_fill */					\
    speed_operand_src (s, xp, s->size * s->r);				\
    speed_operand_dst (s, wp, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      {									\
	function (wp, xp, s->size, s->r, (s->r) / 2);			\
      }									\
    while (--i != 0);							\
    t = speed_endtime () / s->r;					\
									\
    TMP_FREE;								\
    return t;								\
  }
690 
691 
/* Like SPEED_ROUTINE_MPN_COPY, but "function" takes a fourth argument,
   which is passed as 0: function (wp, s->xp, s->size, 0).  Returns -1.0
   when s->size is negative, otherwise the value of speed_endtime after
   s->reps calls.  */
#define SPEED_ROUTINE_MPN_COPYC(function) \
  { \
    double    t; \
    unsigned  i; \
    mp_ptr    wp; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    \
    /* destination buffer */ \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    /* timed region */ \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (wp, s->xp, s->size, 0); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
718 
/* Copy routine whose length argument is a byte count.  s->size remains a
   limb count and the buffers are s->size limbs long; "function" is called
   as (wp, s->xp, s->size * GMP_LIMB_BYTES).  Returns -1.0 when s->size is
   negative.  */
#define SPEED_ROUTINE_MPN_COPY_BYTES(function) \
  { \
    double    t; \
    unsigned  i; \
    mp_ptr    wp; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    \
    /* destination buffer, sized in limbs */ \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    /* timed region; the length is converted to bytes here */ \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (wp, s->xp, s->size * GMP_LIMB_BYTES); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
747 
748 
/* Timing skeleton for mpn_add_n, mpn_sub_n, or similar.  "call" may use
   wp (destination, s->size limbs) and the sources xp and yp.

   s->r selects operand overlap:
     0  xp = s->xp, yp = s->yp (no overlap)
     1  xp = wp
     2  yp = wp
     3  xp = wp and yp = wp
     4  yp = xp
   Any other s->r gives -1.0, as does s->size < 1.  When a source is wp,
   wp is first filled from s->xp.  */
#define SPEED_ROUTINE_MPN_BINARY_N_CALL(call) \
  { \
    double     t; \
    unsigned   i; \
    mp_ptr     xp, yp; \
    mp_ptr     wp; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
    \
    xp = s->xp; \
    yp = s->yp; \
    \
    switch (s->r) \
      { \
      case 0:                    break; \
      case 1:  xp = wp;          break; \
      case 2:  yp = wp;          break; \
      case 3:  xp = wp; yp = wp; break; \
      case 4:  yp = xp;          break; \
      default: \
        TMP_FREE; \
        return -1.0; \
      } \
    \
    /* a source that aliases wp needs defined contents */ \
    if (xp == wp || yp == wp) \
      MPN_COPY (wp, s->xp, s->size); \
    \
    speed_operand_src (s, xp, s->size); \
    speed_operand_src (s, yp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    /* timed region */ \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
795 
796 
/* Timing skeleton for mpn_aors_errK_n, where 1 <= K <= 3.  "call" may use
   wp (destination, s->size limbs), the sources xp and yp, the K extra
   blocks zp[0..K-1] of s->size limbs each, and the array ep of 2*K limbs.

   s->r selects overlap between xp, yp and wp exactly as in
   SPEED_ROUTINE_MPN_BINARY_N_CALL (0 none, 1 xp = wp, 2 yp = wp,
   3 both = wp, 4 yp = xp); other values give -1.0, as does s->size < 1.  */
#define SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL(call, K) \
  { \
    double     t; \
    unsigned   i; \
    mp_limb_t  ep[2*K]; \
    mp_ptr     zp[K]; \
    mp_ptr     xp, yp; \
    mp_ptr     wp; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
    \
    /* (don't have a mechanism to specify zp alignments) */ \
    for (i = 0; i < K; i++) \
      { \
        SPEED_TMP_ALLOC_LIMBS (zp[i], s->size, 0); \
      } \
    \
    xp = s->xp; \
    yp = s->yp; \
    \
    switch (s->r) \
      { \
      case 0:                    break; \
      case 1:  xp = wp;          break; \
      case 2:  yp = wp;          break; \
      case 3:  xp = wp; yp = wp; break; \
      case 4:  yp = xp;          break; \
      default: \
        TMP_FREE; \
        return -1.0; \
      } \
    \
    /* a source that aliases wp needs defined contents */ \
    if (xp == wp || yp == wp) \
      MPN_COPY (wp, s->xp, s->size); \
    \
    speed_operand_src (s, xp, s->size); \
    speed_operand_src (s, yp, s->size); \
    for (i = 0; i < K; i++) \
      { \
        speed_operand_src (s, zp[i], s->size); \
      } \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    /* timed region */ \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
851 
/* K = 1, 2, 3 instances of SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL.
   "function" receives (wp, xp, yp, ep, zp[0] ... zp[K-1], s->size, 0).  */
#define SPEED_ROUTINE_MPN_BINARY_ERR1_N(function) \
  SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ( \
    (*function) (wp, xp, yp, ep, zp[0], s->size, 0), \
    1)

#define SPEED_ROUTINE_MPN_BINARY_ERR2_N(function) \
  SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ( \
    (*function) (wp, xp, yp, ep, zp[0], zp[1], s->size, 0), \
    2)

#define SPEED_ROUTINE_MPN_BINARY_ERR3_N(function) \
  SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ( \
    (*function) (wp, xp, yp, ep, zp[0], zp[1], zp[2], s->size, 0), \
    3)
860 
861 
/* Timing skeleton for a routine with two destinations, ap and sp, each
   s->size limbs (presumably a combined add-and-subtract; the call itself
   is supplied by the user of the macro).  "call" may use ap, sp, xp, yp.

   The bits of s->r select overlap between sources and destinations:
     1  xp = ap      2  yp = ap
     4  xp = sp      8  yp = sp
   Having both sources on ap (1 and 2 together) or both on sp (4 and 8
   together) gives -1.0, as does s->size < 1.  A destination that doubles
   as a source is first filled from s->xp.  */
#define SPEED_ROUTINE_MPN_ADDSUB_N_CALL(call) \
  { \
    double     t; \
    unsigned   i; \
    mp_ptr     xp, yp; \
    mp_ptr     ap, sp; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ap, s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (sp, s->size, s->align_wp); \
    \
    /* reject both sources on the same destination */ \
    if ((s->r & 3) == 3  ||  (s->r & 12) == 12) \
      { \
        TMP_FREE; \
        return -1.0; \
      } \
    \
    xp = s->xp; \
    yp = s->yp; \
    if ((s->r & 1) != 0) \
      xp = ap; \
    if ((s->r & 2) != 0) \
      yp = ap; \
    if ((s->r & 4) != 0) \
      xp = sp; \
    if ((s->r & 8) != 0) \
      yp = sp; \
    \
    /* a destination used as a source needs defined contents */ \
    if (xp == ap || yp == ap) \
      MPN_COPY (ap, s->xp, s->size); \
    if (xp == sp || yp == sp) \
      MPN_COPY (sp, s->xp, s->size); \
    \
    speed_operand_src (s, xp, s->size); \
    speed_operand_src (s, yp, s->size); \
    speed_operand_dst (s, ap, s->size); \
    speed_operand_dst (s, sp, s->size); \
    speed_cache_fill (s); \
    \
    /* timed region */ \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
913 
/* "function" is called as (wp, xp, yp, s->size).  */
#define SPEED_ROUTINE_MPN_BINARY_N(function) \
  SPEED_ROUTINE_MPN_BINARY_N_CALL ( \
    (*function) (wp, xp, yp, s->size))

/* As above with a fifth argument, passed as 0.  */
#define SPEED_ROUTINE_MPN_BINARY_NC(function) \
  SPEED_ROUTINE_MPN_BINARY_N_CALL ( \
    (*function) (wp, xp, yp, s->size, 0))
919 
920 
/* Timing skeleton for mpn_lshift, mpn_rshift, mpn_mul_1 (with s->r as the
   extra parameter), or similar.  "call" may use the destination wp
   (s->size limbs) and the source s->xp.  Returns -1.0 when s->size < 1,
   otherwise the value of speed_endtime after s->reps executions of
   "call".  */
#define SPEED_ROUTINE_MPN_UNARY_1_CALL(call) \
  { \
    double    t; \
    unsigned  i; \
    mp_ptr    wp; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    /* destination buffer */ \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    /* timed region */ \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
948 
/* "function" is called as (wp, s->xp, s->size, s->r).  */
#define SPEED_ROUTINE_MPN_UNARY_1(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ( \
    (*function) (wp, s->xp, s->size, s->r))

/* As above with a fifth argument, passed as 0.  */
#define SPEED_ROUTINE_MPN_UNARY_1C(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ( \
    (*function) (wp, s->xp, s->size, s->r, 0))
954 
/* In-place variant of SPEED_ROUTINE_MPN_UNARY_1: "function" is called as
   (wp, wp, s->size, s->r), so wp is both source and destination.  wp is
   seeded from s->xp before the timed loop so that the routine never reads
   uninitialized limbs.  Operand registration is the same as in
   SPEED_ROUTINE_MPN_UNARY_1_CALL (s->xp as source, wp as destination).
   Returns -1.0 when s->size < 1.  */
#define SPEED_ROUTINE_MPN_UNARY_1_INPLACE(function) \
  { \
    mp_ptr    wp; \
    unsigned  i; \
    double    t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
    \
    /* start wp off from xp */ \
    MPN_COPY (wp, s->xp, s->size); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      (*function) (wp, wp, s->size, s->r); \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
958 
/* Both of these call "function" as (wp, s->xp, s->size, s->r), i.e. with
   s->r as the single-limb operand.  */
#define SPEED_ROUTINE_MPN_DIVEXACT_1(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ( \
    (*function) (wp, s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_BDIV_Q_1(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ( \
    (*function) (wp, s->xp, s->size, s->r))
964 
/* Wrapper round SPEED_ROUTINE_MPN_UNARY_1_CALL which first computes, from
   s->r, the values "shift" (via count_trailing_zeros) and "dinv" (via
   binvert_limb of s->r >> shift).  "call" may use both, along with wp.
   Returns -1.0 unless s->size > 0 and s->r != 0.  */
#define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL(call) \
  { \
    mp_limb_t  dinv; \
    unsigned   shift; \
    \
    SPEED_RESTRICT_COND (s->size > 0); \
    SPEED_RESTRICT_COND (s->r != 0); \
    \
    /* precomputation, outside the timed region */ \
    count_trailing_zeros (shift, s->r); \
    binvert_limb (dinv, s->r >> shift); \
    \
    SPEED_ROUTINE_MPN_UNARY_1_CALL (call); \
  }

/* "function" is called as (wp, s->xp, s->size, s->r, dinv, shift).  */
#define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1(function) \
  SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL ( \
    (*function) (wp, s->xp, s->size, s->r, dinv, shift))
981 
/* "function" is called as (wp, s->xp, s->size, s->r, 0).  */
#define SPEED_ROUTINE_MPN_BDIV_DBM1C(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ( \
    (*function) (wp, s->xp, s->size, s->r, 0))

/* The DIVREM_1 family.  The plain forms pass 0 as the second argument and
   (s->xp, s->size) as the operand; the "F" forms pass s->size as the
   second argument and (s->xp, 0) as the operand.  The "C" forms append a
   final argument of 0.  s->r is the divisor argument throughout.  */
#define SPEED_ROUTINE_MPN_DIVREM_1(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ( \
    (*function) (wp, 0, s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_DIVREM_1C(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ( \
    (*function) (wp, 0, s->xp, s->size, s->r, 0))

#define SPEED_ROUTINE_MPN_DIVREM_1F(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ( \
    (*function) (wp, s->size, s->xp, 0, s->r))

#define SPEED_ROUTINE_MPN_DIVREM_1CF(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ( \
    (*function) (wp, s->size, s->xp, 0, s->r, 0))
996 
997 
/* Wrapper round SPEED_ROUTINE_MPN_UNARY_1_CALL which first computes, from
   s->r, the values "shift" (via count_leading_zeros) and "dinv" (via
   invert_limb of s->r << shift).  "call" may use both, along with wp.

   Returns -1.0 unless s->size >= 1 and s->r != 0.  The size bound matches
   the one applied by SPEED_ROUTINE_MPN_UNARY_1_CALL itself, so a zero
   size is rejected before the inverse is computed.

   The definition ends at the closing brace; there is deliberately no
   line continuation after it, so the following line is never spliced
   into the macro.  */
#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL(call) \
  { \
    unsigned   shift; \
    mp_limb_t  dinv; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    SPEED_RESTRICT_COND (s->r != 0); \
    \
    count_leading_zeros (shift, s->r); \
    invert_limb (dinv, s->r << shift); \
    \
    SPEED_ROUTINE_MPN_UNARY_1_CALL (call); \
  }
1011 
/* "function" is called as (wp, 0, s->xp, s->size, s->r, dinv, shift).  */
#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1(function) \
  SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL ( \
    (*function) (wp, 0, s->xp, s->size, s->r, dinv, shift))

/* s->size limbs worth of fraction part: "function" is called as
   (wp, s->size, s->xp, 0, s->r, dinv, shift).  */
#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1F(function) \
  SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL ( \
    (*function) (wp, s->size, s->xp, 0, s->r, dinv, shift))
1020 
1021 
/* Routine taking an N-limb multiplier: "function" is called as
   (wp, s->xp, s->size, yp), where every one of the N limbs of yp is s->r,
   or MP_BASES_BIG_BASE_10 when s->r is zero.  Not sure if duplicating s->r
   like that is particularly useful, but at least it provides some control.
   wp has s->size + N-1 limbs.  Returns -1.0 when s->size < N.  */
#define SPEED_ROUTINE_MPN_UNARY_N(function,N) \
  { \
    mp_limb_t  yp[N]; \
    double     t; \
    unsigned   i; \
    mp_size_t  wn; \
    mp_ptr     wp; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= N); \
    \
    /* destination buffer */ \
    TMP_MARK; \
    wn = s->size + N-1; \
    SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp); \
    \
    /* build the multiplier */ \
    for (i = 0; i < N; i++) \
      { \
        yp[i] = (s->r != 0 ? s->r : MP_BASES_BIG_BASE_10); \
      } \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, yp, (mp_size_t) N); \
    speed_operand_dst (s, wp, wn); \
    speed_cache_fill (s); \
    \
    /* timed region */ \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (wp, s->xp, s->size, yp); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
1057 
/* Fixed-N instances of SPEED_ROUTINE_MPN_UNARY_N, for N = 2 to 8.  */
#define SPEED_ROUTINE_MPN_UNARY_2(function)  SPEED_ROUTINE_MPN_UNARY_N (function, 2)
#define SPEED_ROUTINE_MPN_UNARY_3(function)  SPEED_ROUTINE_MPN_UNARY_N (function, 3)
#define SPEED_ROUTINE_MPN_UNARY_4(function)  SPEED_ROUTINE_MPN_UNARY_N (function, 4)
#define SPEED_ROUTINE_MPN_UNARY_5(function)  SPEED_ROUTINE_MPN_UNARY_N (function, 5)
#define SPEED_ROUTINE_MPN_UNARY_6(function)  SPEED_ROUTINE_MPN_UNARY_N (function, 6)
#define SPEED_ROUTINE_MPN_UNARY_7(function)  SPEED_ROUTINE_MPN_UNARY_N (function, 7)
#define SPEED_ROUTINE_MPN_UNARY_8(function)  SPEED_ROUTINE_MPN_UNARY_N (function, 8)
1072 
1073 
/* For mpn_mul, mpn_mul_basecase.  "function" is called as
   (wp, s->xp, s->size, s->yp, size1): the first operand has s->size limbs
   and the second has size1 limbs.

   size1 is s->size when s->r is zero, otherwise s->r; if the result is
   negative as an mp_size_t it is replaced by -size1 - s->size.  Returns
   -1.0 unless 1 <= size1 <= s->size.  wp has size1 + s->size limbs.  */
#define SPEED_ROUTINE_MPN_MUL(function) \
  { \
    double    t; \
    unsigned  i; \
    mp_size_t size1; \
    mp_ptr    wp; \
    TMP_DECL; \
    \
    /* size of the second operand */ \
    size1 = (s->r == 0 ? s->size : s->r); \
    if (size1 < 0) \
      size1 = -size1 - s->size; \
    \
    SPEED_RESTRICT_COND (size1 >= 1); \
    SPEED_RESTRICT_COND (s->size >= size1); \
    \
    /* product buffer */ \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, size1 + s->size, s->align_wp); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, size1); \
    speed_operand_dst (s, wp, size1 + s->size); \
    speed_cache_fill (s); \
    \
    /* timed region */ \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (wp, s->xp, s->size, s->yp, size1); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
1107 
1108 
/* Timing skeleton for an N by N multiplication.  "call" may use the
   destination wp (2*s->size limbs) and the sources s->xp and s->yp
   (s->size limbs each).  Returns -1.0 when s->size < 1.  */
#define SPEED_ROUTINE_MPN_MUL_N_CALL(call) \
  { \
    double    t; \
    unsigned  i; \
    mp_ptr    wp; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    /* product buffer */ \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, wp, 2*s->size); \
    speed_cache_fill (s); \
    \
    /* timed region */ \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
1136 
/* "function" is called as (wp, s->xp, s->yp, s->size).  The expansion is
   a braced block with no trailing semicolon, like the other
   SPEED_ROUTINE wrappers.  */
#define SPEED_ROUTINE_MPN_MUL_N(function)				\
  SPEED_ROUTINE_MPN_MUL_N_CALL (function (wp, s->xp, s->yp, s->size))
1139 
/* Timing skeleton for a low-half multiplication.  "call" may use the
   destination wp (s->size limbs) and the sources s->xp and s->yp
   (s->size limbs each).  Returns -1.0 when s->size < 1.  */
#define SPEED_ROUTINE_MPN_MULLO_N_CALL(call) \
  { \
    double    t; \
    unsigned  i; \
    mp_ptr    wp; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    /* result buffer, same length as the inputs */ \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    /* timed region */ \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
1167 
/* Both of these call "function" as (wp, s->xp, s->yp, s->size).  The
   expansion is a braced block with no trailing semicolon, like the other
   SPEED_ROUTINE wrappers.  */
#define SPEED_ROUTINE_MPN_MULLO_N(function)				\
  SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size))

#define SPEED_ROUTINE_MPN_MULLO_BASECASE(function)			\
  SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size))
1173 
/* "function" is called as (wp, s->xp, s->size), with wp being s->size
   limbs long.  Returns -1.0 when s->size < 1.  */
#define SPEED_ROUTINE_MPN_SQRLO(function) \
  { \
    double    t; \
    unsigned  i; \
    mp_ptr    wp; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    /* result buffer, same length as the input */ \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    /* timed region */ \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (wp, s->xp, s->size); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
1200 
/* For mpn_mulmid, mpn_mulmid_basecase.  "function" is called as
   (wp, xp, size1, s->yp, s->size), where xp is a freshly allocated block
   of size1 limbs.  size1 is s->r, or 2*s->size - 1 when s->r is zero.
   wp has size1 - s->size + 3 limbs.  Returns -1.0 unless s->size >= 1 and
   size1 >= s->size.  */
#define SPEED_ROUTINE_MPN_MULMID(function) \
  { \
    double    t; \
    unsigned  i; \
    mp_size_t size1; \
    mp_ptr    wp, xp; \
    TMP_DECL; \
    \
    /* length of the first operand */ \
    size1 = (s->r == 0 ? (2 * s->size - 1) : s->r); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    SPEED_RESTRICT_COND (size1 >= s->size); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp); \
    \
    speed_operand_src (s, xp, size1); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, wp, size1 - s->size + 3); \
    speed_cache_fill (s); \
    \
    /* timed region */ \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (wp, xp, size1, s->yp, s->size); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
1234 
/* "function" is called as (wp, xp, s->yp, s->size), where xp is a freshly
   allocated block of size1 = 2*s->size - 1 limbs and wp has
   size1 - s->size + 3 limbs.  Returns -1.0 when s->size < 1.  */
#define SPEED_ROUTINE_MPN_MULMID_N(function) \
  { \
    double    t; \
    unsigned  i; \
    mp_size_t size1; \
    mp_ptr    wp, xp; \
    TMP_DECL; \
    \
    /* length of the first operand */ \
    size1 = 2 * s->size - 1; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp); \
    \
    speed_operand_src (s, xp, size1); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, wp, size1 - s->size + 3); \
    speed_cache_fill (s); \
    \
    /* timed region */ \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (wp, xp, s->yp, s->size); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
1266 
/* "function" is called as (wp, xp, s->yp, s->size, scratch).  xp is a
   freshly allocated block of size1 = 2*s->size - 1 limbs, wp has
   size1 - s->size + 3 limbs, and scratch has
   mpn_toom42_mulmid_itch (s->size) limbs.  Returns -1.0 when
   s->size < 1.  */
#define SPEED_ROUTINE_MPN_TOOM42_MULMID(function) \
  { \
    double    t; \
    unsigned  i; \
    mp_size_t size1, scratch_size; \
    mp_ptr    wp, xp, scratch; \
    TMP_DECL; \
    \
    /* length of the first operand */ \
    size1 = 2 * s->size - 1; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp); \
    \
    /* scratch space, not registered as an operand */ \
    scratch_size = mpn_toom42_mulmid_itch (s->size); \
    SPEED_TMP_ALLOC_LIMBS (scratch, scratch_size, 0); \
    \
    speed_operand_src (s, xp, size1); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, wp, size1 - s->size + 3); \
    speed_cache_fill (s); \
    \
    /* timed region */ \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (wp, xp, s->yp, s->size, scratch); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
1300 
/* Timing skeleton for mulmod_bnm1-style routines.  "call" may use the
   destination wp (2*s->size limbs), the scratch block tp of
   mpn_mulmod_bnm1_itch (s->size, s->size, s->size) limbs, and the sources
   s->xp and s->yp.  Both wp and tp are registered as destinations.
   Returns -1.0 when s->size < 1.  */
#define SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL(call) \
  { \
    mp_size_t itch; \
    double    t; \
    unsigned  i; \
    mp_ptr    wp, tp; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    /* scratch requirement */ \
    itch = mpn_mulmod_bnm1_itch (s->size, s->size, s->size); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, wp, 2 * s->size); \
    speed_operand_dst (s, tp, itch); \
    speed_cache_fill (s); \
    \
    /* timed region */ \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
/* "function" is called as
   (wp, size, s->xp, s->size, s->yp, s->size, tp), where
   size = mpn_mulmod_bnm1_next_size (s->size).  wp has size limbs and tp
   has mpn_mulmod_bnm1_itch (size, size, size) limbs; both are registered
   as destinations.  Returns -1.0 when s->size < 1.  */
#define SPEED_ROUTINE_MPN_MULMOD_BNM1_ROUNDED(function) \
  { \
    mp_size_t size, itch; \
    double    t; \
    unsigned  i; \
    mp_ptr    wp, tp; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    /* rounded size and its scratch requirement */ \
    size = mpn_mulmod_bnm1_next_size (s->size); \
    itch = mpn_mulmod_bnm1_itch (size, size, size); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, wp, size); \
    speed_operand_dst (s, tp, itch); \
    speed_cache_fill (s); \
    \
    /* timed region */ \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (wp, size, s->xp, s->size, s->yp, s->size, tp); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
1366 
/* Timing skeleton for a multiplication needing scratch space.  "call" may
   use the destination wp (2*s->size limbs), the scratch block tspace of
   "tsize" limbs, and the sources s->xp and s->yp.  Both wp and tspace are
   registered as destinations.  Returns -1.0 when s->size < minsize.

   Note "tsize" is expanded twice and "minsize" once, both as written.  */
#define SPEED_ROUTINE_MPN_MUL_N_TSPACE(call, tsize, minsize) \
  { \
    double    t; \
    unsigned  i; \
    mp_ptr    wp, tspace; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= minsize); \
    \
    /* product and scratch buffers */ \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, wp, 2*s->size); \
    speed_operand_dst (s, tspace, tsize); \
    speed_cache_fill (s); \
    \
    /* timed region */ \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
1396 
/* Balanced Toom multiplications: both operands have s->size limbs.  Each
   passes its own itch function for the scratch size and its own MINSIZE
   constant as the smallest accepted s->size.  */
#define SPEED_ROUTINE_MPN_TOOM22_MUL_N(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE ( \
    function (wp, s->xp, s->size, s->yp, s->size, tspace), \
    mpn_toom22_mul_itch (s->size, s->size), \
    MPN_TOOM22_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM33_MUL_N(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE ( \
    function (wp, s->xp, s->size, s->yp, s->size, tspace), \
    mpn_toom33_mul_itch (s->size, s->size), \
    MPN_TOOM33_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM44_MUL_N(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE ( \
    function (wp, s->xp, s->size, s->yp, s->size, tspace), \
    mpn_toom44_mul_itch (s->size, s->size), \
    MPN_TOOM44_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM6H_MUL_N(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE ( \
    function (wp, s->xp, s->size, s->yp, s->size, tspace), \
    mpn_toom6h_mul_itch (s->size, s->size), \
    MPN_TOOM6H_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM8H_MUL_N(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE ( \
    function (wp, s->xp, s->size, s->yp, s->size, tspace), \
    mpn_toom8h_mul_itch (s->size, s->size), \
    MPN_TOOM8H_MUL_MINSIZE)
1426 
/* Unbalanced Toom multiplications: the first operand has s->size limbs
   and the second a fixed fraction of that (2/3, 1/2, 3/4 and 1/2
   respectively, using integer division).  The same second size is given
   to the itch function.  */
#define SPEED_ROUTINE_MPN_TOOM32_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE ( \
    function (wp, s->xp, s->size, s->yp, 2*s->size/3, tspace), \
    mpn_toom32_mul_itch (s->size, 2*s->size/3), \
    MPN_TOOM32_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM42_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE ( \
    function (wp, s->xp, s->size, s->yp, s->size/2, tspace), \
    mpn_toom42_mul_itch (s->size, s->size/2), \
    MPN_TOOM42_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM43_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE ( \
    function (wp, s->xp, s->size, s->yp, s->size*3/4, tspace), \
    mpn_toom43_mul_itch (s->size, s->size*3/4), \
    MPN_TOOM43_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM63_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE ( \
    function (wp, s->xp, s->size, s->yp, s->size/2, tspace), \
    mpn_toom63_mul_itch (s->size, s->size/2), \
    MPN_TOOM63_MUL_MINSIZE)
1450 
/* Paired "A_FOR_B" macros: the two members of each pair use the same
   operand shape (second operand 17/24, 19/30 or 11/20 of s->size, using
   integer division), and each uses the itch function and MINSIZE of the
   routine named first in its own name.  */
#define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE ( \
    function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace), \
    mpn_toom32_mul_itch (s->size, 17*s->size/24), \
    MPN_TOOM32_MUL_MINSIZE)
#define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE ( \
    function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace), \
    mpn_toom43_mul_itch (s->size, 17*s->size/24), \
    MPN_TOOM43_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE ( \
    function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace), \
    mpn_toom32_mul_itch (s->size, 19*s->size/30), \
    MPN_TOOM32_MUL_MINSIZE)
#define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE ( \
    function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace), \
    mpn_toom53_mul_itch (s->size, 19*s->size/30), \
    MPN_TOOM53_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE ( \
    function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace), \
    mpn_toom42_mul_itch (s->size, 11*s->size/20), \
    MPN_TOOM42_MUL_MINSIZE)
#define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE ( \
    function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace), \
    mpn_toom53_mul_itch (s->size, 11*s->size/20), \
    MPN_TOOM53_MUL_MINSIZE)
1483 
/* Second operand is 5/6 of s->size (integer division).  The scratch size
   comes from mpn_toom43_mul_itch, i.e. the itch function of the routine
   named first in the macro name, as in the other "A_FOR_B" macros.

   NOTE(review): the minimum size is MPN_TOOM54_MUL_MINSIZE rather than
   the toom43 one; it is left unchanged here -- presumably so that both
   members of the pair accept the same sizes, confirm against the
   callers.  */
#define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM54_MUL(function)		\
  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
    (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace),	\
     mpn_toom43_mul_itch (s->size, 5*s->size/6),			\
     MPN_TOOM54_MUL_MINSIZE)
/* Second operand is 5/6 of s->size (integer division); scratch size from
   mpn_toom54_mul_itch, minimum size MPN_TOOM54_MUL_MINSIZE.  */
#define SPEED_ROUTINE_MPN_TOOM54_FOR_TOOM43_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE ( \
    function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace), \
    mpn_toom54_mul_itch (s->size, 5*s->size/6), \
    MPN_TOOM54_MUL_MINSIZE)
1494 
1495 
1496 
/* Timing skeleton for squaring.  "call" may use the destination wp
   (2*s->size limbs) and the source s->xp (s->size limbs).  Returns -1.0
   when s->size < 1.  */
#define SPEED_ROUTINE_MPN_SQR_CALL(call) \
  { \
    double    t; \
    unsigned  i; \
    mp_ptr    wp; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    /* result buffer, twice the input length */ \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, 2*s->size); \
    speed_cache_fill (s); \
    \
    /* timed region */ \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }

/* "function" is called as (wp, s->xp, s->size).  */
#define SPEED_ROUTINE_MPN_SQR(function) \
  SPEED_ROUTINE_MPN_SQR_CALL ( \
    function (wp, s->xp, s->size))
1526 
/* Timing skeleton where "call" may use wp and tp (each 2*s->size limbs,
   freshly allocated) and s->xp.  tp is registered as a source and wp as
   the destination.  The measured time is halved before being returned.
   Returns -1.0 when s->size < 2.  */
#define SPEED_ROUTINE_MPN_SQR_DIAG_ADDLSH1_CALL(call) \
  { \
    double    t; \
    unsigned  i; \
    mp_ptr    wp, tp; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 2); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, tp, 2 * s->size); \
    speed_operand_dst (s, wp, 2 * s->size); \
    speed_cache_fill (s); \
    \
    /* timed region */ \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
      } \
    while (--i != 0); \
    t = speed_endtime () / 2; \
    \
    TMP_FREE; \
    return t; \
  }
1555 
/* Timing skeleton for a squaring needing scratch space.  "call" may use
   the destination wp (2*s->size limbs), the scratch block tspace of
   "tsize" limbs, and the source s->xp.  Both wp and tspace are registered
   as destinations.  Returns -1.0 when s->size < minsize.

   Note "tsize" is expanded twice and "minsize" once, both as written.  */
#define SPEED_ROUTINE_MPN_SQR_TSPACE(call, tsize, minsize) \
  { \
    double    t; \
    unsigned  i; \
    mp_ptr    wp, tspace; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= minsize); \
    \
    /* result and scratch buffers */ \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, 2*s->size); \
    speed_operand_dst (s, tspace, tsize); \
    speed_cache_fill (s); \
    \
    /* timed region */ \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
1584 
/* Toom-2 squaring: time function (wp, s->xp, s->size, tspace) through
   SPEED_ROUTINE_MPN_SQR_TSPACE, with mpn_toom2_sqr_itch (s->size) limbs of
   scratch and MPN_TOOM2_SQR_MINSIZE as the smallest accepted size.  */
1585 #define SPEED_ROUTINE_MPN_TOOM2_SQR(function)				\
1586   SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
1587 				mpn_toom2_sqr_itch (s->size),		\
1588 				MPN_TOOM2_SQR_MINSIZE)
1589 
/* Toom-3 squaring: time function (wp, s->xp, s->size, tspace) through
   SPEED_ROUTINE_MPN_SQR_TSPACE, with mpn_toom3_sqr_itch (s->size) limbs of
   scratch and MPN_TOOM3_SQR_MINSIZE as the smallest accepted size.  */
1590 #define SPEED_ROUTINE_MPN_TOOM3_SQR(function)				\
1591   SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
1592 				mpn_toom3_sqr_itch (s->size),		\
1593 				MPN_TOOM3_SQR_MINSIZE)
1594 
1595 
/* Toom-4 squaring: time function (wp, s->xp, s->size, tspace) through
   SPEED_ROUTINE_MPN_SQR_TSPACE, with mpn_toom4_sqr_itch (s->size) limbs of
   scratch and MPN_TOOM4_SQR_MINSIZE as the smallest accepted size.  */
1596 #define SPEED_ROUTINE_MPN_TOOM4_SQR(function)				\
1597   SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
1598 				mpn_toom4_sqr_itch (s->size),		\
1599 				MPN_TOOM4_SQR_MINSIZE)
1600 
/* Toom-6 squaring: time function (wp, s->xp, s->size, tspace) through
   SPEED_ROUTINE_MPN_SQR_TSPACE, with mpn_toom6_sqr_itch (s->size) limbs of
   scratch and MPN_TOOM6_SQR_MINSIZE as the smallest accepted size.  */
1601 #define SPEED_ROUTINE_MPN_TOOM6_SQR(function)				\
1602   SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
1603 				mpn_toom6_sqr_itch (s->size),		\
1604 				MPN_TOOM6_SQR_MINSIZE)
1605 
/* Toom-8 squaring: time function (wp, s->xp, s->size, tspace) through
   SPEED_ROUTINE_MPN_SQR_TSPACE, with mpn_toom8_sqr_itch (s->size) limbs of
   scratch and MPN_TOOM8_SQR_MINSIZE as the smallest accepted size.  */
1606 #define SPEED_ROUTINE_MPN_TOOM8_SQR(function)				\
1607   SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
1608 				mpn_toom8_sqr_itch (s->size),		\
1609 				MPN_TOOM8_SQR_MINSIZE)
1610 
/* Function body timing the expression `call', which reads s->xp,s->size
   (and whatever else the caller's expression names, e.g. s->r).

   No temporary memory is allocated.  Only s->xp is registered as an operand
   before speed_cache_fill.  `call' is evaluated s->reps times and its value
   is discarded; the result of speed_endtime is returned directly.  The size
   restriction is s->size >= 0.  */
1611 #define SPEED_ROUTINE_MPN_MOD_CALL(call)				\
1612   {									\
1613     unsigned   i;							\
1614 									\
1615     SPEED_RESTRICT_COND (s->size >= 0);					\
1616 									\
1617     speed_operand_src (s, s->xp, s->size);				\
1618     speed_cache_fill (s);						\
1619 									\
1620     speed_starttime ();							\
1621     i = s->reps;							\
1622     do									\
1623       call;								\
1624     while (--i != 0);							\
1625 									\
1626     return speed_endtime ();						\
1627   }
1628 
/* Time (*function) (s->xp, s->size, s->r) via SPEED_ROUTINE_MPN_MOD_CALL;
   s->r is passed as the third argument.  */
1629 #define SPEED_ROUTINE_MPN_MOD_1(function)				\
1630    SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size, s->r))
1631 
/* As SPEED_ROUTINE_MPN_MOD_1, but for a function taking an extra fourth
   argument, which is always passed as CNST_LIMB(0).  */
1632 #define SPEED_ROUTINE_MPN_MOD_1C(function)				\
1633    SPEED_ROUTINE_MPN_MOD_CALL ((*function)(s->xp, s->size, s->r, CNST_LIMB(0)))
1634 
/* Time function (s->xp, s->size, s->r) via SPEED_ROUTINE_MPN_MOD_CALL.

   The expansion is a bare compound statement with no trailing semicolon,
   matching SPEED_ROUTINE_MPN_MOD_1 and the other wrappers here.  A
   semicolon baked into the macro would leave a stray empty statement after
   the closing brace at every use.  */
1635 #define SPEED_ROUTINE_MPN_MODEXACT_1_ODD(function)			\
1636   SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r))
1637 
/* Time function (s->xp, s->size, s->r, CNST_LIMB(0)) via
   SPEED_ROUTINE_MPN_MOD_CALL; the fourth argument is always zero.

   The expansion is a bare compound statement with no trailing semicolon,
   matching SPEED_ROUTINE_MPN_MOD_1C and the other wrappers here.  A
   semicolon baked into the macro would leave a stray empty statement after
   the closing brace at every use.  */
1638 #define SPEED_ROUTINE_MPN_MODEXACT_1C_ODD(function)			\
1639   SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r, CNST_LIMB(0)))
1640 
/* Time (*function) (s->xp, s->size) via SPEED_ROUTINE_MPN_MOD_CALL; s->r
   is not used by this call.  */
1641 #define SPEED_ROUTINE_MPN_MOD_34LSUB1(function)				\
1642    SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size))
1643 
/* Function body timing (*function) (s->xp, s->size, s->r, inv), where inv
   is computed once, outside the timed region, by invert_limb (inv, s->r).

   The routine is restricted to s->size >= 0 and to divisors s->r having
   GMP_LIMB_HIGHBIT set.  Only s->xp is registered as an operand.  The call
   is repeated s->reps times and the speed_endtime result is returned.  */
1644 #define SPEED_ROUTINE_MPN_PREINV_MOD_1(function)			\
1645   {									\
1646     unsigned   i;							\
1647     mp_limb_t  inv;							\
1648 									\
1649     SPEED_RESTRICT_COND (s->size >= 0);					\
1650     SPEED_RESTRICT_COND (s->r & GMP_LIMB_HIGHBIT);			\
1651 									\
1652     invert_limb (inv, s->r);						\
1653     speed_operand_src (s, s->xp, s->size);				\
1654     speed_cache_fill (s);						\
1655 									\
1656     speed_starttime ();							\
1657     i = s->reps;							\
1658     do									\
1659       (*function) (s->xp, s->size, s->r, inv);				\
1660     while (--i != 0);							\
1661 									\
1662     return speed_endtime ();						\
1663   }
1664 
/* Function body timing a precomputation/reduction pair.

   function  called as function (s->xp, s->size, s->r << inv[1], inv).
   pfunc     called as pfunc (inv, s->r) to fill the 4-limb inv array.

   inv is first filled by mpn_mod_1_1p_cps before the timer starts, so that
   inv[1] is defined.  Inside the timed loop both pfunc and function are
   called on every repetition, so the measured time includes the
   precomputation.  Requires s->size >= 2.
   NOTE(review): inv[1] is used as a shift count for the divisor, presumably
   the normalisation shift -- confirm against mpn_mod_1_1p_cps.  */
1665 #define SPEED_ROUTINE_MPN_MOD_1_1(function,pfunc)			\
1666   {									\
1667     unsigned   i;							\
1668     mp_limb_t  inv[4];							\
1669 									\
1670     SPEED_RESTRICT_COND (s->size >= 2);					\
1671 									\
1672     mpn_mod_1_1p_cps (inv, s->r);					\
1673     speed_operand_src (s, s->xp, s->size);				\
1674     speed_cache_fill (s);						\
1675 									\
1676     speed_starttime ();							\
1677     i = s->reps;							\
1678     do {								\
1679       pfunc (inv, s->r);						\
1680       function (s->xp, s->size, s->r << inv[1], inv);				\
1681     } while (--i != 0);							\
1682 									\
1683     return speed_endtime ();						\
1684   }
/* Function body timing a precomputation/reduction pair working N limbs at
   a time.

   function  called as function (s->xp, s->size, s->r, inv).
   pfunc     called as pfunc (inv, s->r) to fill the (N)+3 limb inv array.
   N         integer constant expression; it sizes inv and bounds the
	     divisor, which must satisfy s->r <= ~(mp_limb_t)0 / (N).

   N is parenthesised wherever it is used, so an argument such as `2+1'
   cannot be torn apart by the surrounding `+' and `/' operators.

   Both pfunc and function run on every repetition of the timed loop, so the
   measured time includes the precomputation.  Requires s->size >= 1.  */
1685 #define SPEED_ROUTINE_MPN_MOD_1_N(function,pfunc,N)			\
1686   {									\
1687     unsigned   i;							\
1688     mp_limb_t  inv[(N)+3];						\
1689 									\
1690     SPEED_RESTRICT_COND (s->size >= 1);					\
1691     SPEED_RESTRICT_COND (s->r <= ~(mp_limb_t)0 / (N));			\
1692 									\
1693     speed_operand_src (s, s->xp, s->size);				\
1694     speed_cache_fill (s);						\
1695 									\
1696     speed_starttime ();							\
1697     i = s->reps;							\
1698     do {								\
1699       pfunc (inv, s->r);						\
1700       function (s->xp, s->size, s->r, inv);				\
1701     } while (--i != 0);							\
1702 									\
1703     return speed_endtime ();						\
1704   }
1705 
1706 
1707 /* A division of 2*s->size by s->size limbs */
1708 
/* Function body timing the expression `call', which may refer to these
   locals:

     a     dividend, 2*s->size limbs: two copies of s->xp back to back
     d     divisor, s->size limbs, copied from s->yp
     q     quotient space, s->size+1 limbs
     r     remainder space, s->size limbs
     dinv  gmp_pi1_t filled in by invert_pi1 from the two high limbs of d

   The data is normalised: the high limb of d gets GMP_NUMB_HIGHBIT set and
   the high limb of a is set to one less than the high limb of d.

   s->size must be at least 2, because invert_pi1 is handed d[s->size-2];
   with a single-limb divisor that would read d[-1], outside the
   allocation.  */
1709 #define SPEED_ROUTINE_MPN_DC_DIVREM_CALL(call)				\
1710   {									\
1711     unsigned  i;							\
1712     mp_ptr    a, d, q, r;						\
1713     double    t;							\
1714     gmp_pi1_t dinv;							\
1715     TMP_DECL;								\
1716 									\
1717     SPEED_RESTRICT_COND (s->size >= 2);					\
1718 									\
1719     TMP_MARK;								\
1720     SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp);			\
1721     SPEED_TMP_ALLOC_LIMBS (d, s->size,   s->align_yp);			\
1722     SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp);			\
1723     SPEED_TMP_ALLOC_LIMBS (r, s->size,   s->align_wp2);			\
1724 									\
1725     MPN_COPY (a, s->xp, s->size);					\
1726     MPN_COPY (a+s->size, s->xp, s->size);				\
1727 									\
1728     MPN_COPY (d, s->yp, s->size);					\
1729 									\
1730     /* normalize the data */						\
1731     d[s->size-1] |= GMP_NUMB_HIGHBIT;					\
1732     a[2*s->size-1] = d[s->size-1] - 1;					\
1733 									\
1734     invert_pi1 (dinv, d[s->size-1], d[s->size-2]);			\
1735 									\
1736     speed_operand_src (s, a, 2*s->size);				\
1737     speed_operand_src (s, d, s->size);					\
1738     speed_operand_dst (s, q, s->size+1);				\
1739     speed_operand_dst (s, r, s->size);					\
1740     speed_cache_fill (s);						\
1741 									\
1742     speed_starttime ();							\
1743     i = s->reps;							\
1744     do									\
1745       call;								\
1746     while (--i != 0);							\
1747     t = speed_endtime ();						\
1748 									\
1749     TMP_FREE;								\
1750     return t;								\
1751   }
1752 
1753 
1754 /* A remainder 2*s->size by s->size limbs */
1755 
/* Function body timing function (r, a, d) on mpz_t operands.

   d is built from s->yp,s->size.  a is built so that its high half is
   s->xp reduced modulo d and its low half is a copy of s->xp: the reduced
   value is shifted up by GMP_LIMB_BITS * s->size bits and the low limbs are
   then overwritten with MPN_COPY.  r receives the result.

   The three mpz_t variables are released with mpz_clear once the timing is
   taken, so repeated invocations do not leak their limb storage.  Requires
   s->size >= 1.  */
1756 #define SPEED_ROUTINE_MPZ_MOD(function)					\
1757   {									\
1758     unsigned   i;							\
    double     t;							\
1759     mpz_t      a, d, r;							\
1760 									\
1761     SPEED_RESTRICT_COND (s->size >= 1);					\
1762 									\
1763     mpz_init_set_n (d, s->yp, s->size);					\
1764 									\
1765     /* high part less than d, low part a duplicate copied in */		\
1766     mpz_init_set_n (a, s->xp, s->size);					\
1767     mpz_mod (a, a, d);							\
1768     mpz_mul_2exp (a, a, GMP_LIMB_BITS * s->size);			\
1769     MPN_COPY (PTR(a), s->xp, s->size);					\
1770 									\
1771     mpz_init (r);							\
1772 									\
1773     speed_operand_src (s, PTR(a), SIZ(a));				\
1774     speed_operand_src (s, PTR(d), SIZ(d));				\
1775     speed_cache_fill (s);						\
1776 									\
1777     speed_starttime ();							\
1778     i = s->reps;							\
1779     do									\
1780       function (r, a, d);						\
1781     while (--i != 0);							\
1782     t = speed_endtime ();						\
									\
    /* free the operands only after the clock has been read */		\
    mpz_clear (r);							\
    mpz_clear (a);							\
    mpz_clear (d);							\
    return t;								\
1783   }
1784 
/* Function body timing function (qp, tp, size1, dp, s->size, INV).

   function  division routine under test.
   INV       expression passed as the last argument; it is expanded inside
	     this body, so it may refer to the local `inv', a gmp_pi1_t
	     filled in by invert_pi1 from the two high limbs of dp.
   DMIN      smallest divisor size s->size accepted.
   QMIN      smallest quotient size size1 - s->size accepted.

   The dividend size size1 is s->r, or 2*s->size when s->r is zero.  ap is
   the master dividend (s->xp copied to its low and high ends; the middle is
   left unfilled when size1 > 2*s->size), dp is the divisor copied from
   s->yp and normalised with GMP_NUMB_HIGHBIT, and the top limb of ap is set
   to one less than the top limb of dp.

   Each repetition first copies ap into tp with MPN_COPY and then calls
   function on tp, so the copy is part of the measured time.
   NOTE(review): invert_pi1 reads dp[s->size-2], so callers presumably pass
   DMIN >= 2 -- confirm at the call sites.  */
1785 #define SPEED_ROUTINE_MPN_PI1_DIV(function, INV, DMIN, QMIN)		\
1786   {									\
1787     unsigned   i;							\
1788     mp_ptr     dp, tp, ap, qp;						\
1789     gmp_pi1_t  inv;							\
1790     double     t;							\
1791     mp_size_t size1;							\
1792     TMP_DECL;								\
1793 									\
1794     size1 = (s->r == 0 ? 2 * s->size : s->r);				\
1795 									\
1796     SPEED_RESTRICT_COND (s->size >= DMIN);				\
1797     SPEED_RESTRICT_COND (size1 - s->size >= QMIN);			\
1798 									\
1799     TMP_MARK;								\
1800     SPEED_TMP_ALLOC_LIMBS (ap, size1, s->align_xp);			\
1801     SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
1802     SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp);		\
1803     SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_wp2);			\
1804 									\
1805     /* we don't fill in dividend completely when size1 > s->size */	\
1806     MPN_COPY (ap,         s->xp, s->size);				\
1807     MPN_COPY (ap + size1 - s->size, s->xp, s->size);			\
1808 									\
1809     MPN_COPY (dp,         s->yp, s->size);				\
1810 									\
1811     /* normalize the data */						\
1812     dp[s->size-1] |= GMP_NUMB_HIGHBIT;					\
1813     ap[size1 - 1] = dp[s->size - 1] - 1;				\
1814 									\
1815     invert_pi1 (inv, dp[s->size-1], dp[s->size-2]);			\
1816 									\
1817     speed_operand_src (s, ap, size1);					\
1818     speed_operand_dst (s, tp, size1);					\
1819     speed_operand_src (s, dp, s->size);					\
1820     speed_operand_dst (s, qp, size1 - s->size);				\
1821     speed_cache_fill (s);						\
1822 									\
1823     speed_starttime ();							\
1824     i = s->reps;							\
1825     do {								\
1826       MPN_COPY (tp, ap, size1);						\
1827       function (qp, tp, size1, dp, s->size, INV);			\
1828     } while (--i != 0);							\
1829     t = speed_endtime ();						\
1830 									\
1831     TMP_FREE;								\
1832     return t;								\
1833   }
/* Function body timing function (qp, tp, 2*s->size, dp, s->size, scratch),
   a quotient-only division of 2*s->size limbs by s->size limbs.

   function  division routine under test.
   itchfn    called as itchfn (2*s->size, s->size, 0) to obtain the number
	     of scratch limbs to allocate.

   tp is the dividend (two copies of s->xp back to back).  dp is the divisor
   and is copied from s->yp before being normalised, as in
   SPEED_ROUTINE_MPN_MU_DIV_QR; without that copy the divisor would be
   whatever the fresh allocation happened to contain.  The top limb of tp is
   set to one less than the top limb of dp.  Requires s->size >= 2.  */
1834 #define SPEED_ROUTINE_MPN_MU_DIV_Q(function,itchfn)			\
1835   {									\
1836     unsigned   i;							\
1837     mp_ptr     dp, tp, qp, scratch;					\
1838     double     t;							\
1839     mp_size_t itch;							\
1840     TMP_DECL;								\
1841 									\
1842     SPEED_RESTRICT_COND (s->size >= 2);					\
1843 									\
1844     itch = itchfn (2 * s->size, s->size, 0);				\
1845     TMP_MARK;								\
1846     SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
1847     SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
1848     SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp);		\
1849     SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
1850 									\
1851     MPN_COPY (tp,         s->xp, s->size);				\
1852     MPN_COPY (tp+s->size, s->xp, s->size);				\
1853 									\
    MPN_COPY (dp,         s->yp, s->size);				\
									\
1854     /* normalize the data */						\
1855     dp[s->size-1] |= GMP_NUMB_HIGHBIT;					\
1856     tp[2*s->size-1] = dp[s->size-1] - 1;				\
1857 									\
1858     speed_operand_dst (s, qp, s->size);					\
1859     speed_operand_src (s, tp, 2 * s->size);				\
1860     speed_operand_src (s, dp, s->size);					\
1861     speed_operand_dst (s, scratch, itch);				\
1862     speed_cache_fill (s);						\
1863 									\
1864     speed_starttime ();							\
1865     i = s->reps;							\
1866     do {								\
1867       function (qp, tp, 2 * s->size, dp, s->size, scratch);		\
1868     } while (--i != 0);							\
1869     t = speed_endtime ();						\
1870 									\
1871     TMP_FREE;								\
1872     return t;								\
1873   }
/* Function body timing function (qp, rp, tp, size1, dp, s->size, scratch),
   a division producing both quotient and remainder.

   function  division routine under test.
   itchfn    called as itchfn (size1, s->size, 0) to obtain the number of
	     scratch limbs to allocate.

   The dividend size size1 is s->r, or 2*s->size when s->r is zero; it must
   be at least s->size, and s->size must be at least 2.  tp is the dividend
   (s->xp copied to its low and high ends), dp the divisor copied from s->yp
   and normalised with GMP_NUMB_HIGHBIT, and the top limb of tp is set to
   one less than the top limb of dp.  qp has size1 - s->size limbs and rp
   has s->size limbs.  The dividend is not refreshed between repetitions.  */
1874 #define SPEED_ROUTINE_MPN_MU_DIV_QR(function,itchfn)			\
1875   {									\
1876     unsigned   i;							\
1877     mp_ptr     dp, tp, qp, rp, scratch;					\
1878     double     t;							\
1879     mp_size_t size1, itch;						\
1880     TMP_DECL;								\
1881 									\
1882     size1 = (s->r == 0 ? 2 * s->size : s->r);				\
1883 									\
1884     SPEED_RESTRICT_COND (s->size >= 2);					\
1885     SPEED_RESTRICT_COND (size1 >= s->size);				\
1886 									\
1887     itch = itchfn (size1, s->size, 0);					\
1888     TMP_MARK;								\
1889     SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
1890     SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp);		\
1891     SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp);			\
1892     SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
1893     SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */	\
1894 									\
1895     /* we don't fill in dividend completely when size1 > s->size */	\
1896     MPN_COPY (tp,         s->xp, s->size);				\
1897     MPN_COPY (tp + size1 - s->size, s->xp, s->size);			\
1898 									\
1899     MPN_COPY (dp,         s->yp, s->size);				\
1900 									\
1901     /* normalize the data */						\
1902     dp[s->size-1] |= GMP_NUMB_HIGHBIT;					\
1903     tp[size1 - 1] = dp[s->size - 1] - 1;				\
1904 									\
1905     speed_operand_dst (s, qp, size1 - s->size);				\
1906     speed_operand_dst (s, rp, s->size);					\
1907     speed_operand_src (s, tp, size1);					\
1908     speed_operand_src (s, dp, s->size);					\
1909     speed_operand_dst (s, scratch, itch);				\
1910     speed_cache_fill (s);						\
1911 									\
1912     speed_starttime ();							\
1913     i = s->reps;							\
1914     do {								\
1915       function (qp, rp, tp, size1, dp, s->size, scratch);		\
1916     } while (--i != 0);							\
1917     t = speed_endtime ();						\
1918 									\
1919     TMP_FREE;								\
1920     return t;								\
1921   }
/* Function body timing
     function (qp, rp, tp, size1, dp, s->size, ip, s->size, scratch),
   a quotient-and-remainder division that is given a precomputed inverse.

   function  division routine under test.
   itchfn    called as itchfn (size1, s->size, s->size) to obtain the
	     number of scratch limbs to allocate.

   Operands are set up as in SPEED_ROUTINE_MPN_MU_DIV_QR: size1 is s->r, or
   2*s->size when s->r is zero; tp is the dividend, dp the normalised
   divisor copied from s->yp.  In addition ip (s->size limbs) is filled by
   mpn_invert (ip, dp, s->size, tmp) before the timer starts, using a
   separate tmp area of mpn_invert_itch (s->size) limbs, so computing the
   inverse is not part of the measured time.  Requires s->size >= 2 and
   size1 >= s->size.  */
1922 #define SPEED_ROUTINE_MPN_MUPI_DIV_QR(function,itchfn)			\
1923   {									\
1924     unsigned   i;							\
1925     mp_ptr     dp, tp, qp, rp, ip, scratch, tmp;			\
1926     double     t;							\
1927     mp_size_t  size1, itch;						\
1928     TMP_DECL;								\
1929 									\
1930     size1 = (s->r == 0 ? 2 * s->size : s->r);				\
1931 									\
1932     SPEED_RESTRICT_COND (s->size >= 2);					\
1933     SPEED_RESTRICT_COND (size1 >= s->size);				\
1934 									\
1935     itch = itchfn (size1, s->size, s->size);				\
1936     TMP_MARK;								\
1937     SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
1938     SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp);		\
1939     SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp);			\
1940     SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
1941     SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */	\
1942     SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_wp2); /* alignment? */	\
1943 									\
1944     /* we don't fill in dividend completely when size1 > s->size */	\
1945     MPN_COPY (tp,         s->xp, s->size);				\
1946     MPN_COPY (tp + size1 - s->size, s->xp, s->size);			\
1947 									\
1948     MPN_COPY (dp,         s->yp, s->size);				\
1949 									\
1950     /* normalize the data */						\
1951     dp[s->size-1] |= GMP_NUMB_HIGHBIT;					\
1952     tp[size1 - 1] = dp[s->size-1] - 1;					\
1953 									\
1954     tmp = TMP_ALLOC_LIMBS (mpn_invert_itch (s->size));			\
1955     mpn_invert (ip, dp, s->size, tmp);					\
1956 									\
1957     speed_operand_dst (s, qp, size1 - s->size);				\
1958     speed_operand_dst (s, rp, s->size);					\
1959     speed_operand_src (s, tp, size1);					\
1960     speed_operand_src (s, dp, s->size);					\
1961     speed_operand_src (s, ip, s->size);					\
1962     speed_operand_dst (s, scratch, itch);				\
1963     speed_cache_fill (s);						\
1964 									\
1965     speed_starttime ();							\
1966     i = s->reps;							\
1967     do {								\
1968       function (qp, rp, tp, size1, dp, s->size, ip, s->size, scratch);	\
1969     } while (--i != 0);							\
1970     t = speed_endtime ();						\
1971 									\
1972     TMP_FREE;								\
1973     return t;								\
1974   }
1975 
/* Function body timing function (qp, tp, 2*s->size, dp, s->size, inv).

   ap is the master dividend, 2*s->size limbs made of two copies of s->xp.
   dp is the divisor copied from s->yp with its lowest bit forced to 1 (the
   divisor must be odd).  inv is the negation of the value produced by
   binvert_limb (inv, dp[0]).

   Each repetition first copies ap into tp with MPN_COPY and then calls
   function on tp, so the copy is part of the measured time.  Requires
   s->size >= 1.  */
1976 #define SPEED_ROUTINE_MPN_PI1_BDIV_QR(function)				\
1977   {									\
1978     unsigned   i;							\
1979     mp_ptr     dp, tp, ap, qp;						\
1980     mp_limb_t  inv;							\
1981     double     t;							\
1982     TMP_DECL;								\
1983 									\
1984     SPEED_RESTRICT_COND (s->size >= 1);					\
1985 									\
1986     TMP_MARK;								\
1987     SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size, s->align_xp);			\
1988     SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
1989     SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
1990     SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size, s->align_wp2);		\
1991 									\
1992     MPN_COPY (ap,         s->xp, s->size);				\
1993     MPN_COPY (ap+s->size, s->xp, s->size);				\
1994 									\
1995     /* divisor must be odd */						\
1996     MPN_COPY (dp, s->yp, s->size);					\
1997     dp[0] |= 1;								\
1998     binvert_limb (inv, dp[0]);						\
1999     inv = -inv;								\
2000 									\
2001     speed_operand_src (s, ap, 2*s->size);				\
2002     speed_operand_dst (s, tp, 2*s->size);				\
2003     speed_operand_src (s, dp, s->size);					\
2004     speed_operand_dst (s, qp, s->size);					\
2005     speed_cache_fill (s);						\
2006 									\
2007     speed_starttime ();							\
2008     i = s->reps;							\
2009     do {								\
2010       MPN_COPY (tp, ap, 2*s->size);					\
2011       function (qp, tp, 2*s->size, dp, s->size, inv);			\
2012     } while (--i != 0);							\
2013     t = speed_endtime ();						\
2014 									\
2015     TMP_FREE;								\
2016     return t;								\
2017   }
/* Function body timing function (qp, tp, s->size, dp, s->size, inv), where
   dividend and divisor both have s->size limbs.

   dp is the divisor copied from s->yp with its lowest bit forced to 1 (the
   divisor must be odd).  inv is the negation of the value produced by
   binvert_limb (inv, dp[0]).

   Each repetition first copies s->xp into tp with MPN_COPY and then calls
   function on tp, so the copy is part of the measured time.  Requires
   s->size >= 1.  */
2018 #define SPEED_ROUTINE_MPN_PI1_BDIV_Q(function)				\
2019   {									\
2020     unsigned   i;							\
2021     mp_ptr     dp, tp, qp;						\
2022     mp_limb_t  inv;							\
2023     double     t;							\
2024     TMP_DECL;								\
2025 									\
2026     SPEED_RESTRICT_COND (s->size >= 1);					\
2027 									\
2028     TMP_MARK;								\
2029     SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
2030     SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
2031     SPEED_TMP_ALLOC_LIMBS (tp, s->size, s->align_wp2);			\
2032 									\
2033     /* divisor must be odd */						\
2034     MPN_COPY (dp, s->yp, s->size);					\
2035     dp[0] |= 1;								\
2036     binvert_limb (inv, dp[0]);						\
2037     inv = -inv;								\
2038 									\
2039     speed_operand_src (s, s->xp, s->size);				\
2040     speed_operand_dst (s, tp, s->size);					\
2041     speed_operand_src (s, dp, s->size);					\
2042     speed_operand_dst (s, qp, s->size);					\
2043     speed_cache_fill (s);						\
2044 									\
2045     speed_starttime ();							\
2046     i = s->reps;							\
2047     do {								\
2048       MPN_COPY (tp, s->xp, s->size);					\
2049       function (qp, tp, s->size, dp, s->size, inv);			\
2050     } while (--i != 0);							\
2051     t = speed_endtime ();						\
2052 									\
2053     TMP_FREE;								\
2054     return t;								\
2055   }
/* Function body timing function (tp, 2*s->size, dp, s->size, inv); unlike
   the _QR and _Q variants there is no separate quotient operand.

   ap is the master dividend, 2*s->size limbs made of two copies of s->xp.
   dp is the divisor copied from s->yp with its lowest bit forced to 1 (the
   divisor must be odd).  inv is the negation of the value produced by
   binvert_limb (inv, dp[0]).

   Each repetition first copies ap into tp with MPN_COPY and then calls
   function on tp, so the copy is part of the measured time.  Requires
   s->size >= 1.  */
2056 #define SPEED_ROUTINE_MPN_PI1_BDIV_R(function)				\
2057   {									\
2058     unsigned   i;							\
2059     mp_ptr     dp, tp, ap;						\
2060     mp_limb_t  inv;							\
2061     double     t;							\
2062     TMP_DECL;								\
2063 									\
2064     SPEED_RESTRICT_COND (s->size >= 1);					\
2065 									\
2066     TMP_MARK;								\
2067     SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size, s->align_xp);			\
2068     SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
2069     SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size, s->align_wp2);		\
2070 									\
2071     MPN_COPY (ap,         s->xp, s->size);				\
2072     MPN_COPY (ap+s->size, s->xp, s->size);				\
2073 									\
2074     /* divisor must be odd */						\
2075     MPN_COPY (dp, s->yp, s->size);					\
2076     dp[0] |= 1;								\
2077     binvert_limb (inv, dp[0]);						\
2078     inv = -inv;								\
2079 									\
2080     speed_operand_src (s, ap, 2*s->size);				\
2081     speed_operand_dst (s, tp, 2*s->size);				\
2082     speed_operand_src (s, dp, s->size);					\
2083     speed_cache_fill (s);						\
2084 									\
2085     speed_starttime ();							\
2086     i = s->reps;							\
2087     do {								\
2088       MPN_COPY (tp, ap, 2*s->size);					\
2089       function (tp, 2*s->size, dp, s->size, inv);			\
2090     } while (--i != 0);							\
2091     t = speed_endtime ();						\
2092 									\
2093     TMP_FREE;								\
2094     return t;								\
2095   }
/* Function body timing
     function (qp, s->xp, s->size, dp, s->size, scratch),
   with dividend and divisor both s->size limbs.

   function  division routine under test.
   itchfn    called as itchfn (s->size, s->size) to obtain the number of
	     scratch limbs to allocate.

   dp is the divisor copied from s->yp with its lowest bit forced to 1 (the
   divisor must be odd).  The dividend is s->xp itself, passed directly with
   no per-repetition copy.  Requires s->size >= 2.  */
2096 #define SPEED_ROUTINE_MPN_MU_BDIV_Q(function,itchfn)			\
2097   {									\
2098     unsigned   i;							\
2099     mp_ptr     dp, qp, scratch;						\
2100     double     t;							\
2101     mp_size_t itch;							\
2102     TMP_DECL;								\
2103 									\
2104     SPEED_RESTRICT_COND (s->size >= 2);					\
2105 									\
2106     itch = itchfn (s->size, s->size);					\
2107     TMP_MARK;								\
2108     SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
2109     SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
2110     SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
2111 									\
2112     /* divisor must be odd */						\
2113     MPN_COPY (dp, s->yp, s->size);					\
2114     dp[0] |= 1;								\
2115 									\
2116     speed_operand_dst (s, qp, s->size);					\
2117     speed_operand_src (s, s->xp, s->size);				\
2118     speed_operand_src (s, dp, s->size);					\
2119     speed_operand_dst (s, scratch, itch);				\
2120     speed_cache_fill (s);						\
2121 									\
2122     speed_starttime ();							\
2123     i = s->reps;							\
2124     do {								\
2125       function (qp, s->xp, s->size, dp, s->size, scratch);		\
2126     } while (--i != 0);							\
2127     t = speed_endtime ();						\
2128 									\
2129     TMP_FREE;								\
2130     return t;								\
2131   }
/* Function body timing
     function (qp, rp, tp, 2*s->size, dp, s->size, scratch),
   producing both quotient (qp) and remainder (rp), each s->size limbs.

   function  division routine under test.
   itchfn    called as itchfn (2*s->size, s->size) to obtain the number of
	     scratch limbs to allocate.

   tp is the dividend, 2*s->size limbs made of two copies of s->xp.  dp is
   the divisor copied from s->yp with its lowest bit forced to 1 (the
   divisor must be odd).  The dividend is not refreshed between
   repetitions.  Requires s->size >= 2.  */
2132 #define SPEED_ROUTINE_MPN_MU_BDIV_QR(function,itchfn)			\
2133   {									\
2134     unsigned   i;							\
2135     mp_ptr     dp, tp, qp, rp, scratch;					\
2136     double     t;							\
2137     mp_size_t itch;							\
2138     TMP_DECL;								\
2139 									\
2140     SPEED_RESTRICT_COND (s->size >= 2);					\
2141 									\
2142     itch = itchfn (2 * s->size, s->size);				\
2143     TMP_MARK;								\
2144     SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
2145     SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
2146     SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp);		\
2147     SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
2148     SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */	\
2149 									\
2150     MPN_COPY (tp,         s->xp, s->size);				\
2151     MPN_COPY (tp+s->size, s->xp, s->size);				\
2152 									\
2153     /* divisor must be odd */						\
2154     MPN_COPY (dp, s->yp, s->size);					\
2155     dp[0] |= 1;								\
2156 									\
2157     speed_operand_dst (s, qp, s->size);					\
2158     speed_operand_dst (s, rp, s->size);					\
2159     speed_operand_src (s, tp, 2 * s->size);				\
2160     speed_operand_src (s, dp, s->size);					\
2161     speed_operand_dst (s, scratch, itch);				\
2162     speed_cache_fill (s);						\
2163 									\
2164     speed_starttime ();							\
2165     i = s->reps;							\
2166     do {								\
2167       function (qp, rp, tp, 2 * s->size, dp, s->size, scratch);		\
2168     } while (--i != 0);							\
2169     t = speed_endtime ();						\
2170 									\
2171     TMP_FREE;								\
2172     return t;								\
2173   }
2174 
/* Function body timing (*function) (wp, s->xp, s->size, s->r) by way of
   SPEED_ROUTINE_MPN_UNARY_1_CALL.

   Only odd s->r is accepted.  Note that the lowest bit of s->xp[0] is set
   in place, i.e. the caller's source operand is modified, before the inner
   macro runs.  wp is not declared here; it is expected to come from the
   SPEED_ROUTINE_MPN_UNARY_1_CALL expansion.  */
2175 #define SPEED_ROUTINE_MPN_BROOT(function)	\
2176   {						\
2177     SPEED_RESTRICT_COND (s->r & 1);		\
2178     s->xp[0] |= 1;				\
2179     SPEED_ROUTINE_MPN_UNARY_1_CALL		\
2180       ((*function) (wp, s->xp, s->size, s->r));	\
2181   }
2182 
/* Function body timing (*function) (wp, s->xp, s->size, s->r, tp).

   function  routine under test.
   itch      number of scratch limbs to allocate for tp.

   Requires s->size >= 1 and odd s->r.  wp receives s->size limbs.  The
   lowest bit of s->xp[0] is set in place, i.e. the caller's source operand
   is modified.  Only s->xp and wp are registered as operands; tp is not.

   The restrictions are checked before TMP_MARK, as in the other routines
   in this file, so that a rejected size never leaves a mark without a
   matching TMP_FREE.  */
2183 #define SPEED_ROUTINE_MPN_BROOTINV(function, itch)	\
2184   {							\
2185     mp_ptr    wp, tp;					\
2186     unsigned  i;					\
2187     double    t;					\
2188     TMP_DECL;						\
							\
2189     SPEED_RESTRICT_COND (s->size >= 1);			\
2190     SPEED_RESTRICT_COND (s->r & 1);			\
							\
2191     TMP_MARK;						\
2192     wp = TMP_ALLOC_LIMBS (s->size);			\
2193     tp = TMP_ALLOC_LIMBS ( (itch));			\
2194     s->xp[0] |= 1;					\
2195 							\
2196     speed_operand_src (s, s->xp, s->size);		\
2197     speed_operand_dst (s, wp, s->size);			\
2198     speed_cache_fill (s);				\
2199 							\
2200     speed_starttime ();					\
2201     i = s->reps;					\
2202     do							\
2203       (*function) (wp, s->xp, s->size, s->r, tp);	\
2204     while (--i != 0);					\
2205     t = speed_endtime ();				\
2206 							\
2207     TMP_FREE;						\
2208     return t;						\
2209   }
2210 
/* Function body timing function (ip, up, s->size, tp).

   function  inversion routine under test.
   itchfn    called as itchfn (s->size) to size the scratch area tp.

   up is a copy of s->xp whose high limb has GMP_NUMB_HIGHBIT set
   (normalised input); ip receives s->size limbs of result.  Only the first
   s->size limbs of tp are registered with speed_operand_dst, whatever
   itchfn returns.  The repetition counter here is a long rather than the
   unsigned used by most routines in this file.  Requires s->size >= 1.  */
2211 #define SPEED_ROUTINE_MPN_INVERT(function,itchfn)			\
2212   {									\
2213     long  i;								\
2214     mp_ptr    up, tp, ip;						\
2215     double    t;							\
2216     TMP_DECL;								\
2217 									\
2218     SPEED_RESTRICT_COND (s->size >= 1);					\
2219 									\
2220     TMP_MARK;								\
2221     SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
2222     SPEED_TMP_ALLOC_LIMBS (up, s->size,   s->align_yp);			\
2223     SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
2224 									\
2225     MPN_COPY (up, s->xp, s->size);					\
2226 									\
2227     /* normalize the data */						\
2228     up[s->size-1] |= GMP_NUMB_HIGHBIT;					\
2229 									\
2230     speed_operand_src (s, up, s->size);					\
2231     speed_operand_dst (s, tp, s->size);					\
2232     speed_operand_dst (s, ip, s->size);					\
2233     speed_cache_fill (s);						\
2234 									\
2235     speed_starttime ();							\
2236     i = s->reps;							\
2237     do									\
2238       function (ip, up, s->size, tp);					\
2239     while (--i != 0);							\
2240     t = speed_endtime ();						\
2241 									\
2242     TMP_FREE;								\
2243     return t;								\
2244   }
2245 
/* Function body timing function (ip, up, s->size, tp) for an approximate
   inversion routine.

   function  routine under test.
   itchfn    called as itchfn (s->size) to size the scratch area tp.

   The setup is the same as in SPEED_ROUTINE_MPN_INVERT: up is a copy of
   s->xp with GMP_NUMB_HIGHBIT set in its high limb, ip receives s->size
   limbs, and only the first s->size limbs of tp are registered as a
   destination operand.  Requires s->size >= 1.  */
2246 #define SPEED_ROUTINE_MPN_INVERTAPPR(function,itchfn)			\
2247   {									\
2248     long  i;								\
2249     mp_ptr    up, tp, ip;						\
2250     double    t;							\
2251     TMP_DECL;								\
2252 									\
2253     SPEED_RESTRICT_COND (s->size >= 1);					\
2254 									\
2255     TMP_MARK;								\
2256     SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
2257     SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp);			\
2258     SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
2259 									\
2260     MPN_COPY (up, s->xp, s->size);					\
2261 									\
2262     /* normalize the data */						\
2263     up[s->size-1] |= GMP_NUMB_HIGHBIT;					\
2264 									\
2265     speed_operand_src (s, up, s->size);					\
2266     speed_operand_dst (s, tp, s->size);					\
2267     speed_operand_dst (s, ip, s->size);					\
2268     speed_cache_fill (s);						\
2269 									\
2270     speed_starttime ();							\
2271     i = s->reps;							\
2272     do									\
2273       function (ip, up, s->size, tp);					\
2274     while (--i != 0);							\
2275     t = speed_endtime ();						\
2276 									\
2277     TMP_FREE;								\
2278     return t;								\
2279   }
2280 
/* Function body timing function (ip, up, s->size, tp); identical to
   SPEED_ROUTINE_MPN_INVERTAPPR except that sizes below 3 are rejected.

   function  routine under test.
   itchfn    called as itchfn (s->size) to size the scratch area tp.

   up is a copy of s->xp with GMP_NUMB_HIGHBIT set in its high limb, ip
   receives s->size limbs, and only the first s->size limbs of tp are
   registered as a destination operand.  */
2281 #define SPEED_ROUTINE_MPN_NI_INVERTAPPR(function,itchfn)		\
2282   {									\
2283     long  i;								\
2284     mp_ptr    up, tp, ip;						\
2285     double    t;							\
2286     TMP_DECL;								\
2287 									\
2288     SPEED_RESTRICT_COND (s->size >= 3);					\
2289 									\
2290     TMP_MARK;								\
2291     SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
2292     SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp);			\
2293     SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
2294 									\
2295     MPN_COPY (up, s->xp, s->size);					\
2296 									\
2297     /* normalize the data */						\
2298     up[s->size-1] |= GMP_NUMB_HIGHBIT;					\
2299 									\
2300     speed_operand_src (s, up, s->size);					\
2301     speed_operand_dst (s, tp, s->size);					\
2302     speed_operand_dst (s, ip, s->size);					\
2303     speed_cache_fill (s);						\
2304 									\
2305     speed_starttime ();							\
2306     i = s->reps;							\
2307     do									\
2308       function (ip, up, s->size, tp);					\
2309     while (--i != 0);							\
2310     t = speed_endtime ();						\
2311 									\
2312     TMP_FREE;								\
2313     return t;								\
2314   }
2315 
/* Function body timing function (ip, up, s->size, tp).

   function  routine under test.
   itchfn    called as itchfn (s->size) to size the scratch area tp.

   up is a copy of s->xp; here "normalising" means forcing the lowest bit of
   up[0] to 1 (an odd input), not setting the high bit as the other invert
   routines do.  ip receives s->size limbs, and only the first s->size limbs
   of tp are registered as a destination operand.  Requires
   s->size >= 1.  */
2316 #define SPEED_ROUTINE_MPN_BINVERT(function,itchfn)			\
2317   {									\
2318     long  i;								\
2319     mp_ptr    up, tp, ip;						\
2320     double    t;							\
2321     TMP_DECL;								\
2322 									\
2323     SPEED_RESTRICT_COND (s->size >= 1);					\
2324 									\
2325     TMP_MARK;								\
2326     SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
2327     SPEED_TMP_ALLOC_LIMBS (up, s->size,   s->align_yp);			\
2328     SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
2329 									\
2330     MPN_COPY (up, s->xp, s->size);					\
2331 									\
2332     /* normalize the data */						\
2333     up[0] |= 1;								\
2334 									\
2335     speed_operand_src (s, up, s->size);					\
2336     speed_operand_dst (s, tp, s->size);					\
2337     speed_operand_dst (s, ip, s->size);					\
2338     speed_cache_fill (s);						\
2339 									\
2340     speed_starttime ();							\
2341     i = s->reps;							\
2342     do									\
2343       function (ip, up, s->size, tp);					\
2344     while (--i != 0);							\
2345     t = speed_endtime ();						\
2346 									\
2347     TMP_FREE;								\
2348     return t;								\
2349   }
2350 
/* Function body timing
     function (ip, up, mp, s->size, 2*s->size*GMP_NUMB_BITS, tp).

   function  routine under test.
   itchfn    called as itchfn (s->size) to size the scratch area tp.

   mp is a copy of s->yp with its lowest bit forced to 1 (it must be odd).
   up is refreshed from s->xp with MPN_COPY on every repetition, inside the
   timed loop, so that copy is part of the measured time.  The fifth
   argument is always 2*s->size*GMP_NUMB_BITS.  Requires s->size >= 1.

   up is registered with speed_operand_src, and the cache is filled, before
   anything has been written to it; mp and s->xp are not registered at all.
   NOTE(review): up is presumably re-copied each time because function
   overwrites it -- confirm against the routine's documentation.  */
2351 #define SPEED_ROUTINE_MPN_SEC_INVERT(function,itchfn)			\
2352   {									\
2353     long  i;								\
2354     mp_ptr    up, mp, tp, ip;						\
2355     double    t;							\
2356     TMP_DECL;								\
2357 									\
2358     SPEED_RESTRICT_COND (s->size >= 1);					\
2359 									\
2360     TMP_MARK;								\
2361     SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
2362     SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp);			\
2363     SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp);			\
2364     SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
2365 									\
2366     speed_operand_src (s, up, s->size);					\
2367     speed_operand_dst (s, tp, s->size);					\
2368     speed_operand_dst (s, ip, s->size);					\
2369     speed_cache_fill (s);						\
2370 									\
2371     MPN_COPY (mp, s->yp, s->size);					\
2372     /* Must be odd */							\
2373     mp[0] |= 1;								\
2374     speed_starttime ();							\
2375     i = s->reps;							\
2376     do									\
2377       {									\
2378 	MPN_COPY (up, s->xp, s->size);					\
2379 	function (ip, up, mp, s->size, 2*s->size*GMP_NUMB_BITS, tp);	\
2380       }									\
2381     while (--i != 0);							\
2382     t = speed_endtime ();						\
2383 									\
2384     TMP_FREE;								\
2385     return t;								\
2386   }
2387 
/* Function body timing function (cp, tp, mp, s->size, inv).

   ap is the master input: 2*s->size+1 limbs are allocated, of which the low
   2*s->size are filled with two copies of s->xp.  mp is the modulus copied
   from s->yp with its lowest bit forced to 1 (it must be odd).  inv is the
   negation of the value produced by binvert_limb (inv, mp[0]).  cp receives
   s->size limbs.

   Each repetition first copies 2*s->size limbs of ap into tp with MPN_COPY
   and then calls function on tp, so the copy is part of the measured time.
   The extra top limb of ap and tp is allocated and registered as an operand
   but is not written by this macro.  Requires s->size >= 1.  */
2388 #define SPEED_ROUTINE_REDC_1(function)					\
2389   {									\
2390     unsigned   i;							\
2391     mp_ptr     cp, mp, tp, ap;						\
2392     mp_limb_t  inv;							\
2393     double     t;							\
2394     TMP_DECL;								\
2395 									\
2396     SPEED_RESTRICT_COND (s->size >= 1);					\
2397 									\
2398     TMP_MARK;								\
2399     SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp);		\
2400     SPEED_TMP_ALLOC_LIMBS (mp, s->size,     s->align_yp);		\
2401     SPEED_TMP_ALLOC_LIMBS (cp, s->size,     s->align_wp);		\
2402     SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2);		\
2403 									\
2404     MPN_COPY (ap,         s->xp, s->size);				\
2405     MPN_COPY (ap+s->size, s->xp, s->size);				\
2406 									\
2407     /* modulus must be odd */						\
2408     MPN_COPY (mp, s->yp, s->size);					\
2409     mp[0] |= 1;								\
2410     binvert_limb (inv, mp[0]);						\
2411     inv = -inv;								\
2412 									\
2413     speed_operand_src (s, ap, 2*s->size+1);				\
2414     speed_operand_dst (s, tp, 2*s->size+1);				\
2415     speed_operand_src (s, mp, s->size);					\
2416     speed_operand_dst (s, cp, s->size);					\
2417     speed_cache_fill (s);						\
2418 									\
2419     speed_starttime ();							\
2420     i = s->reps;							\
2421     do {								\
2422       MPN_COPY (tp, ap, 2*s->size);					\
2423       function (cp, tp, mp, s->size, inv);				\
2424     } while (--i != 0);							\
2425     t = speed_endtime ();						\
2426 									\
2427     TMP_FREE;								\
2428     return t;								\
2429   }
/* Function body timing function (cp, tp, mp, s->size, invp), where invp is
   a two-limb inverse.

   Operands are set up as in SPEED_ROUTINE_REDC_1: ap holds two copies of
   s->xp in its low 2*s->size limbs, mp is s->yp made odd, cp receives
   s->size limbs.  invp is obtained from mpn_binvert (invp, mp, 2, tp),
   using tp as scratch before the timing starts, and is then adjusted by
   negating invp[0] and complementing invp[1].

   Each repetition first copies 2*s->size limbs of ap into tp with MPN_COPY
   and then calls function on tp, so the copy is part of the measured time.
   NOTE(review): mpn_binvert is asked for 2 limbs of mp while the
   restriction admits s->size == 1, in which case mp was allocated with a
   single limb -- confirm whether size 1 is ever measured here.  */
2430 #define SPEED_ROUTINE_REDC_2(function)					\
2431   {									\
2432     unsigned   i;							\
2433     mp_ptr     cp, mp, tp, ap;						\
2434     mp_limb_t  invp[2];							\
2435     double     t;							\
2436     TMP_DECL;								\
2437 									\
2438     SPEED_RESTRICT_COND (s->size >= 1);					\
2439 									\
2440     TMP_MARK;								\
2441     SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp);		\
2442     SPEED_TMP_ALLOC_LIMBS (mp, s->size,     s->align_yp);		\
2443     SPEED_TMP_ALLOC_LIMBS (cp, s->size,     s->align_wp);		\
2444     SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2);		\
2445 									\
2446     MPN_COPY (ap,         s->xp, s->size);				\
2447     MPN_COPY (ap+s->size, s->xp, s->size);				\
2448 									\
2449     /* modulus must be odd */						\
2450     MPN_COPY (mp, s->yp, s->size);					\
2451     mp[0] |= 1;								\
2452     mpn_binvert (invp, mp, 2, tp);					\
2453     invp[0] = -invp[0]; invp[1] = ~invp[1];				\
2454 									\
2455     speed_operand_src (s, ap, 2*s->size+1);				\
2456     speed_operand_dst (s, tp, 2*s->size+1);				\
2457     speed_operand_src (s, mp, s->size);					\
2458     speed_operand_dst (s, cp, s->size);					\
2459     speed_cache_fill (s);						\
2460 									\
2461     speed_starttime ();							\
2462     i = s->reps;							\
2463     do {								\
2464       MPN_COPY (tp, ap, 2*s->size);					\
2465       function (cp, tp, mp, s->size, invp);				\
2466     } while (--i != 0);							\
2467     t = speed_endtime ();						\
2468 									\
2469     TMP_FREE;								\
2470     return t;								\
2471   }
/* Measure function (cp, tp, mp, s->size, invp) with an s->size limb invp
   computed by mpn_binvert from the modulus (a copy of s->yp with its low
   bit forced to 1).  Here the mpn_binvert result is passed on as it is,
   without any negation.  Only sizes above 8 are accepted.  The per-call
   MPN_COPY that reloads tp from ap is included in the measured time.  */
2472 #define SPEED_ROUTINE_REDC_N(function)					\
2473   {									\
2474     unsigned   i;							\
2475     mp_ptr     cp, mp, tp, ap, invp;					\
2476     double     t;							\
2477     TMP_DECL;								\
2478 									\
2479     SPEED_RESTRICT_COND (s->size > 8);					\
2480 									\
2481     TMP_MARK;								\
2482     SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp);		\
2483     SPEED_TMP_ALLOC_LIMBS (mp, s->size,     s->align_yp);		\
2484     SPEED_TMP_ALLOC_LIMBS (cp, s->size,     s->align_wp);		\
2485     SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2);		\
2486     SPEED_TMP_ALLOC_LIMBS (invp, s->size,   s->align_wp2); /* align? */	\
2487 									\
2488     MPN_COPY (ap,         s->xp, s->size);				\
2489     MPN_COPY (ap+s->size, s->xp, s->size);				\
2490 									\
2491     /* modulus must be odd */						\
2492     MPN_COPY (mp, s->yp, s->size);					\
2493     mp[0] |= 1;								\
2494     mpn_binvert (invp, mp, s->size, tp);				\
2495 									\
2496     speed_operand_src (s, ap, 2*s->size+1);				\
2497     speed_operand_dst (s, tp, 2*s->size+1);				\
2498     speed_operand_src (s, mp, s->size);					\
2499     speed_operand_dst (s, cp, s->size);					\
2500     speed_cache_fill (s);						\
2501 									\
2502     speed_starttime ();							\
2503     i = s->reps;							\
2504     do {								\
2505       MPN_COPY (tp, ap, 2*s->size);					\
2506       function (cp, tp, mp, s->size, invp);				\
2507     } while (--i != 0);							\
2508     t = speed_endtime ();						\
2509 									\
2510     TMP_FREE;								\
2511     return t;								\
2512   }
2513 
2514 
/* Measure function (s->xp, s->size), called s->reps times.  The value
   returned by function is discarded.  Sizes below 1 are rejected.  */
2515 #define SPEED_ROUTINE_MPN_POPCOUNT(function)				\
2516   {									\
2517     unsigned i;								\
2518 									\
2519     SPEED_RESTRICT_COND (s->size >= 1);					\
2520 									\
2521     speed_operand_src (s, s->xp, s->size);				\
2522     speed_cache_fill (s);						\
2523 									\
2524     speed_starttime ();							\
2525     i = s->reps;							\
2526     do									\
2527       function (s->xp, s->size);					\
2528     while (--i != 0);							\
2529 									\
2530     return speed_endtime ();						\
2531   }
2532 
/* Measure function (s->xp, s->yp, s->size), called s->reps times.  The
   value returned by function is discarded.  Sizes below 1 are rejected.  */
2533 #define SPEED_ROUTINE_MPN_HAMDIST(function)				\
2534   {									\
2535     unsigned i;								\
2536 									\
2537     SPEED_RESTRICT_COND (s->size >= 1);					\
2538 									\
2539     speed_operand_src (s, s->xp, s->size);				\
2540     speed_operand_src (s, s->yp, s->size);				\
2541     speed_cache_fill (s);						\
2542 									\
2543     speed_starttime ();							\
2544     i = s->reps;							\
2545     do									\
2546       function (s->xp, s->yp, s->size);					\
2547     while (--i != 0);							\
2548 									\
2549     return speed_endtime ();						\
2550   }
2551 
2552 
/* Measure function (z, s->size), where z is a freshly initialised mpz_t.
   s->size is handed to function as its integer argument, not used as a
   limb count.  No operands are registered and no cache fill is done.  */
2553 #define SPEED_ROUTINE_MPZ_UI(function)					\
2554   {									\
2555     mpz_t     z;							\
2556     unsigned  i;							\
2557     double    t;							\
2558 									\
2559     SPEED_RESTRICT_COND (s->size >= 0);					\
2560 									\
2561     mpz_init (z);							\
2562 									\
2563     speed_starttime ();							\
2564     i = s->reps;							\
2565     do									\
2566       function (z, s->size);						\
2567     while (--i != 0);							\
2568     t = speed_endtime ();						\
2569 									\
2570     mpz_clear (z);							\
2571     return t;								\
2572   }
2573 
/* The factorial, Fibonacci and Lucas number routines are all measured with
   the generic SPEED_ROUTINE_MPZ_UI body, unchanged.  */
2574 #define SPEED_ROUTINE_MPZ_FAC_UI(function)    SPEED_ROUTINE_MPZ_UI(function)
2575 #define SPEED_ROUTINE_MPZ_FIB_UI(function)    SPEED_ROUTINE_MPZ_UI(function)
2576 #define SPEED_ROUTINE_MPZ_LUCNUM_UI(function) SPEED_ROUTINE_MPZ_UI(function)
2577 
2578 
/* Measure function (z, z2, s->size), where z and z2 are freshly
   initialised mpz_t variables and s->size is handed over as the integer
   argument.  No operands are registered and no cache fill is done.  */
2579 #define SPEED_ROUTINE_MPZ_2_UI(function)				\
2580   {									\
2581     mpz_t     z, z2;							\
2582     unsigned  i;							\
2583     double    t;							\
2584 									\
2585     SPEED_RESTRICT_COND (s->size >= 0);					\
2586 									\
2587     mpz_init (z);							\
2588     mpz_init (z2);							\
2589 									\
2590     speed_starttime ();							\
2591     i = s->reps;							\
2592     do									\
2593       function (z, z2, s->size);					\
2594     while (--i != 0);							\
2595     t = speed_endtime ();						\
2596 									\
2597     mpz_clear (z);							\
2598     mpz_clear (z2);							\
2599     return t;								\
2600   }
2601 
/* The two-result Fibonacci and Lucas number routines are both measured with
   the generic SPEED_ROUTINE_MPZ_2_UI body, unchanged.  */
2602 #define SPEED_ROUTINE_MPZ_FIB2_UI(function)    SPEED_ROUTINE_MPZ_2_UI(function)
2603 #define SPEED_ROUTINE_MPZ_LUCNUM2_UI(function) SPEED_ROUTINE_MPZ_2_UI(function)
2604 
2605 
/* Measure function (fp, f1p, s->size), with s->size handed over as the
   integer argument.  Each of the two destinations gets
   MPN_FIB2_SIZE (s->size) limbs of temporary space.  */
2606 #define SPEED_ROUTINE_MPN_FIB2_UI(function)				\
2607   {									\
2608     mp_ptr     fp, f1p;							\
2609     mp_size_t  alloc;							\
2610     unsigned   i;							\
2611     double     t;							\
2612     TMP_DECL;								\
2613 									\
2614     SPEED_RESTRICT_COND (s->size >= 0);					\
2615 									\
2616     TMP_MARK;								\
2617     alloc = MPN_FIB2_SIZE (s->size);					\
2618     SPEED_TMP_ALLOC_LIMBS (fp,	alloc, s->align_xp);			\
2619     SPEED_TMP_ALLOC_LIMBS (f1p, alloc, s->align_yp);			\
2620 									\
2621     speed_starttime ();							\
2622     i = s->reps;							\
2623     do									\
2624       function (fp, f1p, s->size);					\
2625     while (--i != 0);							\
2626     t = speed_endtime ();						\
2627 									\
2628     TMP_FREE;								\
2629     return t;								\
2630   }
2631 
2632 
2633 
2634 /* Calculate b^e mod m for random b and m of s->size limbs and random e of 6
2635    limbs.  m is forced to odd so that redc can be used.  e is limited in
2636    size so the calculation doesn't take too long. */
/* When s->r is 2 or more, b is set to the single value s->r instead of the
   s->size limbs at s->xp.  m comes from s->yp and e from the first 6 limbs
   of s->xp_block.  Sizes below 1 are rejected.  */
2637 #define SPEED_ROUTINE_MPZ_POWM(function)				\
2638   {									\
2639     mpz_t     r, b, e, m;						\
2640     unsigned  i;							\
2641     double    t;							\
2642 									\
2643     SPEED_RESTRICT_COND (s->size >= 1);					\
2644 									\
2645     mpz_init (r);							\
2646     if (s->r < 2)							\
2647       mpz_init_set_n (b, s->xp, s->size);				\
2648     else								\
2649       mpz_init_set_ui (b, s->r);					\
2650     mpz_init_set_n (m, s->yp, s->size);					\
2651     mpz_setbit (m, 0);	/* force m to odd */				\
2652     mpz_init_set_n (e, s->xp_block, 6);					\
2653 									\
2654     speed_starttime ();							\
2655     i = s->reps;							\
2656     do									\
2657       function (r, b, e, m);						\
2658     while (--i != 0);							\
2659     t = speed_endtime ();						\
2660 									\
2661     mpz_clear (r);							\
2662     mpz_clear (b);							\
2663     mpz_clear (e);							\
2664     mpz_clear (m);							\
2665     return t;								\
2666   }
2667 
2668 /* (m-2)^e mod m, measured as function (r, b, e, m).
   m is s->xp (s->size limbs) forced odd and b is m-2.  The exponent e is
   s->r when that is non-zero, otherwise (~0UL)/3, which is the alternating
   bit pattern 0x5555...5.  Sizes below 1 are rejected.  */
2669 #define SPEED_ROUTINE_MPZ_POWM_UI(function)				\
2670   {									\
2671     mpz_t     r, b, m;							\
2672     unsigned  long  e;							\
2673     unsigned  i;							\
2674     double    t;							\
2675 									\
2676     SPEED_RESTRICT_COND (s->size >= 1);					\
2677 									\
2678     mpz_init (r);							\
2679 									\
2680     /* force m to odd; mpz_setbit also copes with a zero m */		\
2681     mpz_init (m);							\
2682     mpz_set_n (m, s->xp, s->size);					\
2683     mpz_setbit (m, 0);							\
2684 									\
2685     e = (~ (unsigned long) 0) / 3;					\
2686     if (s->r != 0)							\
2687       e = s->r;								\
2688 									\
2689     mpz_init_set (b, m);						\
2690     mpz_sub_ui (b, b, 2);						\
2691 									\
2692     i = s->reps;							\
2693     speed_starttime ();							\
2694     do									\
2695       function (r, b, e, m);						\
2696     while (--i != 0);							\
2697     t = speed_endtime ();						\
2698 									\
2699     mpz_clear (r);							\
2700     mpz_clear (b);							\
2701     mpz_clear (m);							\
2702     return t;								\
2703   }
2704 
2705 
/* Common body for the add-and-subtract routines: measures `call', which is
   expected to use the locals wp, wp2, xp, yp and s->size.  s->r selects how
   the sources overlap the destinations:
     0  xp = s->xp, yp = s->yp (no overlap)
     1  xp = wp
     2  yp = wp2
     3  xp = wp,  yp = wp2
     4  xp = wp2, yp = wp
   Any other s->r makes the routine free its temporaries and return -1.0.
   When a source is redirected onto a destination, the data from s->xp or
   s->yp is copied into it once, before timing starts.  */
2706 #define SPEED_ROUTINE_MPN_ADDSUB_CALL(call)				\
2707   {									\
2708     mp_ptr    wp, wp2, xp, yp;						\
2709     unsigned  i;							\
2710     double    t;							\
2711     TMP_DECL;								\
2712 									\
2713     SPEED_RESTRICT_COND (s->size >= 0);					\
2714 									\
2715     TMP_MARK;								\
2716     SPEED_TMP_ALLOC_LIMBS (wp,	s->size, s->align_wp);			\
2717     SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2);			\
2718     xp = s->xp;								\
2719     yp = s->yp;								\
2720 									\
2721     if (s->r == 0)	;						\
2722     else if (s->r == 1) { xp = wp;	      }				\
2723     else if (s->r == 2) {	    yp = wp2; }				\
2724     else if (s->r == 3) { xp = wp;  yp = wp2; }				\
2725     else if (s->r == 4) { xp = wp2; yp = wp;  }				\
2726     else {								\
2727       TMP_FREE;								\
2728       return -1.0;							\
2729     }									\
2730     if (xp != s->xp) MPN_COPY (xp, s->xp, s->size);			\
2731     if (yp != s->yp) MPN_COPY (yp, s->yp, s->size);			\
2732 									\
2733     speed_operand_src (s, xp, s->size);					\
2734     speed_operand_src (s, yp, s->size);					\
2735     speed_operand_dst (s, wp, s->size);					\
2736     speed_operand_dst (s, wp2, s->size);				\
2737     speed_cache_fill (s);						\
2738 									\
2739     speed_starttime ();							\
2740     i = s->reps;							\
2741     do									\
2742       call;								\
2743     while (--i != 0);							\
2744     t = speed_endtime ();						\
2745 									\
2746     TMP_FREE;								\
2747     return t;								\
2748   }
2749 
/* Measure function (wp, wp2, xp, yp, s->size) using the
   SPEED_ROUTINE_MPN_ADDSUB_CALL body.  The expansion is a bare { ... }
   block; no semicolon is supplied by the macro itself.  */
2750 #define SPEED_ROUTINE_MPN_ADDSUB_N(function)				\
2751   SPEED_ROUTINE_MPN_ADDSUB_CALL						\
2752     (function (wp, wp2, xp, yp, s->size))
2753 
/* Measure function (wp, wp2, xp, yp, s->size, 0), i.e. the variant taking
   a trailing extra argument, here always 0, using the
   SPEED_ROUTINE_MPN_ADDSUB_CALL body.  The expansion is a bare { ... }
   block; no semicolon is supplied by the macro itself.  */
2754 #define SPEED_ROUTINE_MPN_ADDSUB_NC(function)				\
2755   SPEED_ROUTINE_MPN_ADDSUB_CALL						\
2756     (function (wp, wp2, xp, yp, s->size, 0))
2757 
2758 
2759 /* Doing an Nx1 gcd with the given r. */
/* Measures function (xp, s->size, s->r), where xp is a private copy of
   s->xp whose low limb gets bit 0 set if the whole copy is zero.  Requires
   s->size >= 1 and s->r != 0.  The copy is what gets registered for the
   cache fill, since it is the buffer the timed call actually reads.  */
2760 #define SPEED_ROUTINE_MPN_GCD_1N(function)				\
2761   {									\
2762     mp_ptr    xp;							\
2763     unsigned  i;							\
2764     double    t;							\
2765     TMP_DECL;								\
2766 									\
2767     SPEED_RESTRICT_COND (s->size >= 1);					\
2768     SPEED_RESTRICT_COND (s->r != 0);					\
2769 									\
2770     TMP_MARK;								\
2771     SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp);			\
2772     MPN_COPY (xp, s->xp, s->size);					\
2773     xp[0] |= refmpn_zero_p (xp, s->size);				\
2774 									\
2775     speed_operand_src (s, xp, s->size);					\
2776     speed_cache_fill (s);						\
2777 									\
2778     speed_starttime ();							\
2779     i = s->reps;							\
2780     do									\
2781       function (xp, s->size, s->r);					\
2782     while (--i != 0);							\
2783     t = speed_endtime ();						\
2784 									\
2785     TMP_FREE;								\
2786     return t;								\
2787   }
2788 
2789 
2790 /* SPEED_BLOCK_SIZE many one-limb GCDs of s->size bits each. */
2791 
/* Common body for the single-limb gcd style routines.  px and py are
   copies of s->xp_block and s->yp_block; every px limb is masked to
   s->size bits and every py limb to s->r bits (or s->size bits when s->r
   is 0), and a limb that ends up zero is replaced by 1.  `setup' is run
   once per index i after the masking, and `call' is timed for j running
   from SPEED_BLOCK_SIZE down to 1 (so it should index with j-1).  Both
   s->size and s->r must fit in a limb, because they are used as shift
   counts by MP_LIMB_T_LOWBITMASK.  s->time_divisor is set to
   SPEED_BLOCK_SIZE.  */
2792 #define SPEED_ROUTINE_MPN_GCD_1_CALL(setup, call)			\
2793   {									\
2794     unsigned  i, j;							\
2795     mp_ptr    px, py;							\
2796     mp_limb_t x_mask, y_mask;						\
2797     double    t;							\
2798     TMP_DECL;								\
2799 									\
2800     SPEED_RESTRICT_COND (s->size >= 1 && s->size <= mp_bits_per_limb);	\
2801     SPEED_RESTRICT_COND (s->r <= mp_bits_per_limb);			\
2802 									\
2803     TMP_MARK;								\
2804     SPEED_TMP_ALLOC_LIMBS (px, SPEED_BLOCK_SIZE, s->align_xp);		\
2805     SPEED_TMP_ALLOC_LIMBS (py, SPEED_BLOCK_SIZE, s->align_yp);		\
2806     MPN_COPY (px, s->xp_block, SPEED_BLOCK_SIZE);			\
2807     MPN_COPY (py, s->yp_block, SPEED_BLOCK_SIZE);			\
2808 									\
2809     x_mask = MP_LIMB_T_LOWBITMASK (s->size);				\
2810     y_mask = MP_LIMB_T_LOWBITMASK (s->r != 0 ? s->r : s->size);		\
2811     for (i = 0; i < SPEED_BLOCK_SIZE; i++)				\
2812       {									\
2813 	px[i] &= x_mask; px[i] += (px[i] == 0);				\
2814 	py[i] &= y_mask; py[i] += (py[i] == 0);				\
2815 	setup;								\
2816       }									\
2817 									\
2818     speed_operand_src (s, px, SPEED_BLOCK_SIZE);			\
2819     speed_operand_src (s, py, SPEED_BLOCK_SIZE);			\
2820     speed_cache_fill (s);						\
2821 									\
2822     speed_starttime ();							\
2823     i = s->reps;							\
2824     do									\
2825       {									\
2826 	j = SPEED_BLOCK_SIZE;						\
2827 	do								\
2828 	  {								\
2829 	    call;							\
2830 	  }								\
2831 	while (--j != 0);						\
2832       }									\
2833     while (--i != 0);							\
2834     t = speed_endtime ();						\
2835 									\
2836     TMP_FREE;								\
2837 									\
2838     s->time_divisor = SPEED_BLOCK_SIZE;					\
2839     return t;								\
2840   }
2841 
/* Measure function (&px[j-1], 1, py[j-1]) over the prepared block, with an
   empty per-element setup step.  */
2842 #define SPEED_ROUTINE_MPN_GCD_1(function)				\
2843   SPEED_ROUTINE_MPN_GCD_1_CALL(do{}while(0) , function (&px[j-1], 1, py[j-1]))
2844 
/* Measure function (px[j-1], py[j-1]) over the prepared block; the setup
   step sets bit 0 of every px and py limb, so both operands are odd.  */
2845 #define SPEED_ROUTINE_MPN_GCD_11(function)				\
2846   SPEED_ROUTINE_MPN_GCD_1_CALL((px[i] |= 1, py[i] |= 1),		\
2847 			       function (px[j-1], py[j-1]))
2848 
2849 /* Multiply limbs by (B+1). Then we get a gcd exceeding one limb, so
2850    we can measure gcd_22 loop only, without gcd_11. */
/* The multiplication is done by passing the same limb twice for each
   operand: function (px[j-1], px[j-1], py[j-1], py[j-1]).  The setup step
   sets bit 0 of every px and py limb.  */
2851 #define SPEED_ROUTINE_MPN_GCD_22(function)				\
2852   SPEED_ROUTINE_MPN_GCD_1_CALL((px[i] |= 1, py[i] |= 1),		\
2853 			       function (px[j-1], px[j-1], py[j-1], py[j-1]))
2854 
/* Measure function (px[j-1], py[j-1], 0) over the prepared block.  The
   setup step reduces px[i] modulo py[i], sets bit 0 of both limbs and
   replaces a py[i] of 1 by 3.  The setup argument is a ({ ... }) statement
   expression, so this relies on that compiler extension.  */
2855 #define SPEED_ROUTINE_MPN_JACBASE(function)				\
2856   SPEED_ROUTINE_MPN_GCD_1_CALL						\
2857     ({									\
2858        /* require x<y, y odd, y!=1 */					\
2859        px[i] %= py[i];							\
2860        px[i] |= 1;							\
2861        py[i] |= 1;							\
2862        if (py[i]==1) py[i]=3;						\
2863      },									\
2864      function (px[j-1], py[j-1], 0))
2865 
/* Measure function (ah, al, bh, bl, &m) on pairs of limbs taken from
   s->xp_block and s->yp_block.  The first limb of each pair gets
   GMP_NUMB_HIGHBIT set.  m.u[0][0] from each call is accumulated into
   `chain' and added to the next call's second argument, so successive
   calls depend on each other.  SPEED_BLOCK_SIZE / 2 calls are made per
   repetition and s->time_divisor is set accordingly.  s->size is not
   used.  */
2866 #define SPEED_ROUTINE_MPN_HGCD2(function)				\
2867   {									\
2868     unsigned   i, j;							\
2869     struct hgcd_matrix1 m = {{{0,0},{0,0}}};				\
2870     double     t;							\
2871     mp_limb_t chain;							\
2872 									\
2873     speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE);		\
2874     speed_operand_src (s, s->yp_block, SPEED_BLOCK_SIZE);		\
2875     speed_cache_fill (s);						\
2876 									\
2877     speed_starttime ();							\
2878     i = s->reps;							\
2879     chain = 0;								\
2880     do									\
2881       {									\
2882 	for (j = 0; j < SPEED_BLOCK_SIZE; j+= 2)			\
2883 	  {								\
2884 	    /* randomized but successively dependent */			\
2885 	    function (s->xp_block[j] | GMP_NUMB_HIGHBIT,		\
2886 		      s->xp_block[j+1] + chain,				\
2887 		      s->yp_block[j] | GMP_NUMB_HIGHBIT,		\
2888 		      s->yp_block[j+1], &m);				\
2889 	    chain += m.u[0][0];						\
2890 	  }								\
2891       }									\
2892     while (--i != 0);							\
2893     t = speed_endtime ();						\
2894 									\
2895     /* make sure the compiler won't optimize away chain */		\
2896     noop_1 (chain);							\
2897 									\
2898     s->time_divisor = SPEED_BLOCK_SIZE / 2;				\
2899     return t;								\
2900   }
2901 
/* Measure func (ap, bp, s->size, &hgcd, wp), with workspace of
   itchfunc (s->size) limbs.  ap and bp are reloaded from s->xp and s->yp,
   and the matrix is re-initialised over tmp1, before every call; all of
   that is inside the timed loop.  Sizes below 2 return -1.
   Note that bit 0 of the top limb of s->xp and of s->yp is set in place,
   i.e. the caller's operands are modified.  The value stored in `res' is
   not examined.  */
2902 #define SPEED_ROUTINE_MPN_HGCD_CALL(func, itchfunc)			\
2903   {									\
2904     mp_size_t hgcd_init_itch, hgcd_itch;				\
2905     mp_ptr ap, bp, wp, tmp1;						\
2906     struct hgcd_matrix hgcd;						\
2907     int res;								\
2908     unsigned i;								\
2909     double t;								\
2910     TMP_DECL;								\
2911 									\
2912     if (s->size < 2)							\
2913       return -1;							\
2914 									\
2915     TMP_MARK;								\
2916 									\
2917     SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp);		\
2918     SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp);		\
2919 									\
2920     s->xp[s->size - 1] |= 1;						\
2921     s->yp[s->size - 1] |= 1;						\
2922 									\
2923     hgcd_init_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size);		\
2924     hgcd_itch = itchfunc (s->size);					\
2925 									\
2926     SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_itch, s->align_wp);		\
2927     SPEED_TMP_ALLOC_LIMBS (wp, hgcd_itch, s->align_wp);			\
2928 									\
2929     speed_operand_src (s, s->xp, s->size);				\
2930     speed_operand_src (s, s->yp, s->size);				\
2931     speed_operand_dst (s, ap, s->size + 1);				\
2932     speed_operand_dst (s, bp, s->size + 1);				\
2933     speed_operand_dst (s, wp, hgcd_itch);				\
2934     speed_operand_dst (s, tmp1, hgcd_init_itch);			\
2935     speed_cache_fill (s);						\
2936 									\
2937     speed_starttime ();							\
2938     i = s->reps;							\
2939     do									\
2940       {									\
2941 	MPN_COPY (ap, s->xp, s->size);					\
2942 	MPN_COPY (bp, s->yp, s->size);					\
2943 	mpn_hgcd_matrix_init (&hgcd, s->size, tmp1);			\
2944 	res = func (ap, bp, s->size, &hgcd, wp);			\
2945       }									\
2946     while (--i != 0);							\
2947     t = speed_endtime ();						\
2948     TMP_FREE;								\
2949     return t;								\
2950   }
2951 
/* Measure func (&hgcd, ap, bp, s->size, p, wp) with p = s->size/2 and
   workspace of itchfunc (s->size, p) limbs.  ap and bp are reloaded from
   s->xp and s->yp, and the matrix is re-initialised over tmp1, before
   every call; all of that is inside the timed loop.  Sizes below 2 return
   -1.  Note that bit 0 of the top limb of s->xp and of s->yp is set in
   place, i.e. the caller's operands are modified.  The value stored in
   `res' is not examined.  */
2952 #define SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL(func, itchfunc)		\
2953   {									\
2954     mp_size_t hgcd_init_itch, hgcd_step_itch;				\
2955     mp_ptr ap, bp, wp, tmp1;						\
2956     struct hgcd_matrix hgcd;						\
2957     mp_size_t p = s->size/2;						\
2958     int res;								\
2959     unsigned i;								\
2960     double t;								\
2961     TMP_DECL;								\
2962 									\
2963     if (s->size < 2)							\
2964       return -1;							\
2965 									\
2966     TMP_MARK;								\
2967 									\
2968     SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp);		\
2969     SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp);		\
2970 									\
2971     s->xp[s->size - 1] |= 1;						\
2972     s->yp[s->size - 1] |= 1;						\
2973 									\
2974     hgcd_init_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size);		\
2975     hgcd_step_itch = itchfunc (s->size, p);				\
2976 									\
2977     SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_itch, s->align_wp);		\
2978     SPEED_TMP_ALLOC_LIMBS (wp, hgcd_step_itch, s->align_wp);			\
2979 									\
2980     speed_operand_src (s, s->xp, s->size);				\
2981     speed_operand_src (s, s->yp, s->size);				\
2982     speed_operand_dst (s, ap, s->size + 1);				\
2983     speed_operand_dst (s, bp, s->size + 1);				\
2984     speed_operand_dst (s, wp, hgcd_step_itch);				\
2985     speed_operand_dst (s, tmp1, hgcd_init_itch);			\
2986     speed_cache_fill (s);						\
2987 									\
2988     speed_starttime ();							\
2989     i = s->reps;							\
2990     do									\
2991       {									\
2992 	MPN_COPY (ap, s->xp, s->size);					\
2993 	MPN_COPY (bp, s->yp, s->size);					\
2994 	mpn_hgcd_matrix_init (&hgcd, s->size, tmp1);			\
2995 	res = func (&hgcd, ap, bp, s->size, p, wp);			\
2996       }									\
2997     while (--i != 0);							\
2998     t = speed_endtime ();						\
2999     TMP_FREE;								\
3000     return t;								\
3001   }
3002 
3003 /* Run some GCDs of s->size limbs each.  The number of different data values
3004    is decreased as s->size**2, since GCD is a quadratic algorithm.
3005    SPEED_ROUTINE_MPN_GCD runs more times than SPEED_ROUTINE_MPN_GCDEXT
3006    though, because the plain gcd is about twice as fast as gcdext.  */
3007 
/* Common body: `call' is timed once per piece and is expected to use the
   locals wp, wp2, xtmp, ytmp and s->size.
   pieces = SPEED_BLOCK_SIZE * datafactor / s->size^2, limited to between 1
   and SPEED_BLOCK_SIZE / s->size.  With a single piece the data comes from
   s->xp / s->yp, otherwise from s->xp_block / s->yp_block.  Each piece is
   copied into xtmp / ytmp before every call, and those copies are inside
   the timed loop.  s->time_divisor is set to the number of pieces.  */
3008 #define SPEED_ROUTINE_MPN_GCD_CALL(datafactor, call)			\
3009   {									\
3010     unsigned  i;							\
3011     mp_size_t j, pieces, psize;						\
3012     mp_ptr    wp, wp2, xtmp, ytmp, px, py;				\
3013     double    t;							\
3014     TMP_DECL;								\
3015 									\
3016     SPEED_RESTRICT_COND (s->size >= 1);					\
3017 									\
3018     TMP_MARK;								\
3019     SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp);		\
3020     SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp);		\
3021     SPEED_TMP_ALLOC_LIMBS (wp,   s->size+1, s->align_wp);		\
3022     SPEED_TMP_ALLOC_LIMBS (wp2,  s->size+1, s->align_wp2);		\
3023 									\
3024     pieces = SPEED_BLOCK_SIZE * datafactor / s->size / s->size;		\
3025     pieces = MIN (pieces, SPEED_BLOCK_SIZE / s->size);			\
3026     pieces = MAX (pieces, 1);						\
3027 									\
3028     psize = pieces * s->size;						\
3029     px = TMP_ALLOC_LIMBS (psize);					\
3030     py = TMP_ALLOC_LIMBS (psize);					\
3031     MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize);		\
3032     MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize);		\
3033 									\
3034     /* Requirements: x >= y, y must be odd, high limbs != 0.		\
3035        No need to ensure random numbers are really great.  */		\
3036     for (j = 0; j < pieces; j++)					\
3037       {									\
3038 	mp_ptr	x = px + j * s->size;					\
3039 	mp_ptr	y = py + j * s->size;					\
3040 	if (x[s->size - 1] == 0) x[s->size - 1] = 1;			\
3041 	if (y[s->size - 1] == 0) y[s->size - 1] = 1;			\
3042 									\
3043 	if (x[s->size - 1] < y[s->size - 1])				\
3044 	  MP_LIMB_T_SWAP (x[s->size - 1], y[s->size - 1]);		\
3045 	else if (x[s->size - 1] == y[s->size - 1])			\
3046 	  {								\
3047 	    x[s->size - 1] = 2;						\
3048 	    y[s->size - 1] = 1;						\
3049 	  }								\
3050 	y[0] |= 1;							\
3051       }									\
3052 									\
3053     speed_operand_src (s, px, psize);					\
3054     speed_operand_src (s, py, psize);					\
3055     speed_operand_dst (s, xtmp, s->size);				\
3056     speed_operand_dst (s, ytmp, s->size);				\
3057     speed_operand_dst (s, wp, s->size);					\
3058     speed_cache_fill (s);						\
3059 									\
3060     speed_starttime ();							\
3061     i = s->reps;							\
3062     do									\
3063       {									\
3064 	j = pieces;							\
3065 	do								\
3066 	  {								\
3067 	    MPN_COPY (xtmp, px+(j - 1)*s->size, s->size);		\
3068 	    MPN_COPY (ytmp, py+(j - 1)*s->size, s->size);		\
3069 	    call;							\
3070 	  }								\
3071 	while (--j != 0);						\
3072       }									\
3073     while (--i != 0);							\
3074     t = speed_endtime ();						\
3075 									\
3076     TMP_FREE;								\
3077 									\
3078     s->time_divisor = pieces;						\
3079     return t;								\
3080   }
3081 
/* Measure function (wp, xtmp, s->size, ytmp, s->size) with a data factor
   of 8.  */
3082 #define SPEED_ROUTINE_MPN_GCD(function)	\
3083   SPEED_ROUTINE_MPN_GCD_CALL (8, function (wp, xtmp, s->size, ytmp, s->size))
3084 
/* Measure function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size) with
   a data factor of 4.  wp2size is a throwaway local that only receives the
   size written by function.  */
3085 #define SPEED_ROUTINE_MPN_GCDEXT(function)				\
3086   SPEED_ROUTINE_MPN_GCD_CALL						\
3087     (4, { mp_size_t  wp2size;						\
3088 	  function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); })
3089 
3090 
/* Measure function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size) on
   operands that differ from call to call only in their top three limbs.
   xtmp / ytmp start as copies of s->xp / s->yp; before every call the top
   three limbs are overwritten with the next 3-limb piece from px / py
   (copies of s->xp_block / s->yp_block) and bit 0 of ytmp is set.
   SPEED_BLOCK_SIZE / 3 pieces are used and s->time_divisor is set to that
   count.  Because three limbs are stored at index s->size-3 and upwards,
   the size must be at least 3.  */
3091 #define SPEED_ROUTINE_MPN_GCDEXT_ONE(function)				\
3092   {									\
3093     unsigned  i;							\
3094     mp_size_t j, pieces, psize, wp2size;				\
3095     mp_ptr    wp, wp2, xtmp, ytmp, px, py;				\
3096     double    t;							\
3097     TMP_DECL;								\
3098 									\
3099     SPEED_RESTRICT_COND (s->size >= 3);					\
3100 									\
3101     TMP_MARK;								\
3102 									\
3103     SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp);		\
3104     SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp);		\
3105     MPN_COPY (xtmp, s->xp, s->size);					\
3106     MPN_COPY (ytmp, s->yp, s->size);					\
3107 									\
3108     SPEED_TMP_ALLOC_LIMBS (wp,	s->size+1, s->align_wp);		\
3109     SPEED_TMP_ALLOC_LIMBS (wp2, s->size+1, s->align_wp2);		\
3110 									\
3111     pieces = SPEED_BLOCK_SIZE / 3;					\
3112     psize = 3 * pieces;							\
3113     px = TMP_ALLOC_LIMBS (psize);					\
3114     py = TMP_ALLOC_LIMBS (psize);					\
3115     MPN_COPY (px, s->xp_block, psize);					\
3116     MPN_COPY (py, s->yp_block, psize);					\
3117 									\
3118     /* x must have at least as many bits as y,				\
3119        high limbs must be non-zero */					\
3120     for (j = 0; j < pieces; j++)					\
3121       {									\
3122 	mp_ptr	x = px+3*j;						\
3123 	mp_ptr	y = py+3*j;						\
3124 	x[2] += (x[2] == 0);						\
3125 	y[2] += (y[2] == 0);						\
3126 	if (x[2] < y[2])						\
3127 	  MP_LIMB_T_SWAP (x[2], y[2]);					\
3128       }									\
3129 									\
3130     speed_operand_src (s, px, psize);					\
3131     speed_operand_src (s, py, psize);					\
3132     speed_operand_dst (s, xtmp, s->size);				\
3133     speed_operand_dst (s, ytmp, s->size);				\
3134     speed_operand_dst (s, wp, s->size);					\
3135     speed_cache_fill (s);						\
3136 									\
3137     speed_starttime ();							\
3138     i = s->reps;							\
3139     do									\
3140       {									\
3141 	mp_ptr	x = px;							\
3142 	mp_ptr	y = py;							\
3143 	mp_ptr	xth = &xtmp[s->size-3];					\
3144 	mp_ptr	yth = &ytmp[s->size-3];					\
3145 	j = pieces;							\
3146 	do								\
3147 	  {								\
3148 	    xth[0] = x[0], xth[1] = x[1], xth[2] = x[2];		\
3149 	    yth[0] = y[0], yth[1] = y[1], yth[2] = y[2];		\
3150 									\
3151 	    ytmp[0] |= 1; /* y must be odd, */				\
3152 									\
3153 	    function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size);	\
3154 									\
3155 	    x += 3;							\
3156 	    y += 3;							\
3157 	  }								\
3158 	while (--j != 0);						\
3159       }									\
3160     while (--i != 0);							\
3161     t = speed_endtime ();						\
3162 									\
3163     TMP_FREE;								\
3164 									\
3165     s->time_divisor = pieces;						\
3166     return t;								\
3167   }
3168 
3169 /* Calculate nextprime(n) for random n of s->size bits (not limbs). */
/* n is built from s->xp, reduced to s->size bits, then has bit s->size-1
   set and bit s->size-2 cleared.  Each repetition makes SPEED_BLOCK_SIZE
   calls: one function (wp, n) followed by SPEED_BLOCK_SIZE-1 calls of
   function (wp, wp), and s->time_divisor is set to SPEED_BLOCK_SIZE.
   Sizes below 10 are rejected.  */
3170 #define SPEED_ROUTINE_MPZ_NEXTPRIME(function)				\
3171   {									\
3172     unsigned  i, j;							\
3173     mpz_t     wp, n;							\
3174     double    t;							\
3175 									\
3176     SPEED_RESTRICT_COND (s->size >= 10);				\
3177 									\
3178     mpz_init (wp);							\
3179     mpz_init_set_n (n, s->xp, s->size);					\
3180     /* limit to s->size bits, as this function is very slow */		\
3181     mpz_tdiv_r_2exp (n, n, s->size);					\
3182     /* set high bits so operand and result are general s->size bits */	\
3183     mpz_setbit (n, s->size - 1);					\
3184     mpz_clrbit (n, s->size - 2);					\
3185 									\
3186     speed_starttime ();							\
3187     i = s->reps;							\
3188     do									\
3189       {									\
3190         /* nextprime timing is variable, so average over many calls */	\
3191         j = SPEED_BLOCK_SIZE - 1;					\
3192         /* starts on random, after measures prime to next prime */	\
3193         function (wp, n);						\
3194         do								\
3195           {								\
3196             function (wp, wp);						\
3197           }								\
3198         while (--j != 0);						\
3199       }									\
3200     while (--i != 0);							\
3201     t = speed_endtime ();						\
3202 									\
3203     mpz_clear (wp);							\
3204     mpz_clear (n);							\
3205 									\
3206     s->time_divisor = SPEED_BLOCK_SIZE;					\
3207     return t;								\
3208   }
3209 
/* Measure function (a, b) on s->size limb operands.
   pieces = SPEED_BLOCK_SIZE / s->size (at least 1) operand pairs are
   prepared; with a single piece the data comes from s->xp / s->yp,
   otherwise from s->xp_block / s->yp_block.  Every y is forced odd and
   zero top limbs are replaced by 1.  a and b are never mpz_init'ed: only
   their size field is set (to s->size) and their limb pointer is aimed at
   the current piece before each call.  s->time_divisor is set to the
   number of pieces.  Sizes below 1 are rejected, since the preparation
   loop writes limbs 0 and s->size-1 of every piece.  */
3210 #define SPEED_ROUTINE_MPZ_JACOBI(function)				\
3211   {									\
3212     mpz_t     a, b;							\
3213     unsigned  i;							\
3214     mp_size_t j, pieces, psize;						\
3215     mp_ptr    px, py;							\
3216     double    t;							\
3217     TMP_DECL;								\
3218     SPEED_RESTRICT_COND (s->size >= 1);					\
3219     TMP_MARK;								\
3220     pieces = SPEED_BLOCK_SIZE / MAX (s->size, 1);			\
3221     pieces = MAX (pieces, 1);						\
3222     s->time_divisor = pieces;						\
3223 									\
3224     psize = pieces * s->size;						\
3225     px = TMP_ALLOC_LIMBS (psize);					\
3226     py = TMP_ALLOC_LIMBS (psize);					\
3227     MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize);		\
3228     MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize);		\
3229 									\
3230     for (j = 0; j < pieces; j++)					\
3231       {									\
3232 	mp_ptr	x = px+j*s->size;					\
3233 	mp_ptr	y = py+j*s->size;					\
3234 									\
3235 	/* y odd */							\
3236 	y[0] |= 1;							\
3237 									\
3238 	/* high limbs non-zero */					\
3239 	if (x[s->size-1] == 0) x[s->size-1] = 1;			\
3240 	if (y[s->size-1] == 0) y[s->size-1] = 1;			\
3241       }									\
3242 									\
3243     SIZ(a) = s->size;							\
3244     SIZ(b) = s->size;							\
3245 									\
3246     speed_operand_src (s, px, psize);					\
3247     speed_operand_src (s, py, psize);					\
3248     speed_cache_fill (s);						\
3249 									\
3250     speed_starttime ();							\
3251     i = s->reps;							\
3252     do									\
3253       {									\
3254 	j = pieces;							\
3255 	do								\
3256 	  {								\
3257 	    PTR(a) = px+(j-1)*s->size;					\
3258 	    PTR(b) = py+(j-1)*s->size;					\
3259 	    function (a, b);						\
3260 	  }								\
3261 	while (--j != 0);						\
3262       }									\
3263     while (--i != 0);							\
3264     t = speed_endtime ();						\
3265 									\
3266     TMP_FREE;								\
3267     return t;								\
3268   }
3269 
/* Measure function (wp, 0, xp, s->size, yp) with a two-limb divisor yp
   taken from the first two limbs of s->yp_block, its top limb having
   GMP_NUMB_HIGHBIT set.  xp is a copy of s->xp made once before timing;
   it is not restored between calls.  Sizes below 2 are rejected.  */
3270 #define SPEED_ROUTINE_MPN_DIVREM_2(function)				\
3271   {									\
3272     mp_ptr    wp, xp;							\
3273     mp_limb_t yp[2];							\
3274     unsigned  i;							\
3275     double    t;							\
3276     TMP_DECL;								\
3277 									\
3278     SPEED_RESTRICT_COND (s->size >= 2);					\
3279 									\
3280     TMP_MARK;								\
3281     SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp);			\
3282     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
3283 									\
3284     /* source is destroyed */						\
3285     MPN_COPY (xp, s->xp, s->size);					\
3286 									\
3287     /* divisor must be normalized */					\
3288     MPN_COPY (yp, s->yp_block, 2);					\
3289     yp[1] |= GMP_NUMB_HIGHBIT;						\
3290 									\
3291     speed_operand_src (s, xp, s->size);					\
3292     speed_operand_src (s, yp, 2);					\
3293     speed_operand_dst (s, wp, s->size);					\
3294     speed_cache_fill (s);						\
3295 									\
3296     speed_starttime ();							\
3297     i = s->reps;							\
3298     do									\
3299       function (wp, 0, xp, s->size, yp);				\
3300     while (--i != 0);							\
3301     t = speed_endtime ();						\
3302 									\
3303     TMP_FREE;								\
3304     return t;								\
3305   }
3306 
/* Measure function (wp, wp+s->size-1, s->xp, s->size, d), where the
   single-limb divisor d is s->r, or 1 when s->r is 0.  The second argument
   points at the last limb of the wp area.  The value returned by function
   is stored in r but not examined.  Sizes below 1 are rejected.  */
3307 #define SPEED_ROUTINE_MPN_DIV_QR_1(function)				\
3308   {									\
3309     mp_ptr    wp;							\
3310     mp_limb_t d;							\
3311     mp_limb_t r;							\
3312     unsigned  i;							\
3313     double    t;							\
3314     TMP_DECL;								\
3315 									\
3316     SPEED_RESTRICT_COND (s->size >= 1);					\
3317 									\
3318     TMP_MARK;								\
3319     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
3320 									\
3321     d = s->r;								\
3322     if (d == 0)								\
3323       d = 1;								\
3324     speed_operand_src (s, s->xp, s->size);				\
3325     speed_operand_dst (s, wp, s->size);					\
3326     speed_cache_fill (s);						\
3327 									\
3328     speed_starttime ();							\
3329     i = s->reps;							\
3330     do									\
3331       r = function (wp, wp+s->size-1, s->xp, s->size, d);		\
3332     while (--i != 0);							\
3333     t = speed_endtime ();						\
3334 									\
3335     TMP_FREE;								\
3336     return t;								\
3337   }
3338 
/* Measure function (wp, s->xp, s->size, 0, d, dinv), where d is s->r and
   dinv is computed from it by invert_limb.  d must have GMP_NUMB_HIGHBIT
   set and s->size must be at least 1; both conditions are checked before
   TMP_MARK so that a rejected call leaves no temporary allocation behind.
   The value returned by function is stored in r but not examined.  */
3339 #define SPEED_ROUTINE_MPN_DIV_QR_1N_PI1(function)			\
3340   {									\
3341     mp_ptr    wp;							\
3342     mp_limb_t d, dinv;							\
3343     mp_limb_t r;							\
3344     unsigned  i;							\
3345     double    t;							\
3346     TMP_DECL;								\
3347 									\
3348     SPEED_RESTRICT_COND (s->size >= 1);					\
3349     /* divisor must be normalized */					\
3350     d = s->r;								\
3351     SPEED_RESTRICT_COND (d & GMP_NUMB_HIGHBIT);				\
3352 									\
3353     TMP_MARK;								\
3354     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
3355 									\
3356     invert_limb (dinv, d);						\
3357     speed_operand_src (s, s->xp, s->size);				\
3358     speed_operand_dst (s, wp, s->size);					\
3359     speed_cache_fill (s);						\
3360 									\
3361     speed_starttime ();							\
3362     i = s->reps;							\
3363     do									\
3364       r = function (wp, s->xp, s->size, 0, d, dinv);			\
3365     while (--i != 0);							\
3366     t = speed_endtime ();						\
3367 									\
3368     TMP_FREE;								\
3369     return t;								\
3370   }
3371 
/* Measure function (wp, rp, s->xp, s->size, yp) with a two-limb divisor yp
   taken from the first two limbs of s->yp_block.  When `norm' is non-zero
   the top limb of yp gets GMP_NUMB_HIGHBIT set; otherwise that bit is
   cleared and a resulting zero top limb is replaced by 1.  Sizes below 2
   are rejected.  */
3372 #define SPEED_ROUTINE_MPN_DIV_QR_2(function, norm)			\
3373   {									\
3374     mp_ptr    wp;							\
3375     mp_limb_t yp[2];							\
3376     mp_limb_t rp[2];							\
3377     unsigned  i;							\
3378     double    t;							\
3379     TMP_DECL;								\
3380 									\
3381     SPEED_RESTRICT_COND (s->size >= 2);					\
3382 									\
3383     TMP_MARK;								\
3384     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
3385 									\
3386     /* divisor normalized or unnormalized, as selected by norm */	\
3387     MPN_COPY (yp, s->yp_block, 2);					\
3388     if (norm)								\
3389       yp[1] |= GMP_NUMB_HIGHBIT;					\
3390     else								\
3391       {									\
3392 	yp[1] &= ~GMP_NUMB_HIGHBIT;					\
3393 	if (yp[1] == 0)							\
3394 	  yp[1] = 1;							\
3395       }									\
3396     speed_operand_src (s, s->xp, s->size);				\
3397     speed_operand_src (s, yp, 2);					\
3398     speed_operand_dst (s, wp, s->size);					\
3399     speed_operand_dst (s, rp, 2);					\
3400     speed_cache_fill (s);						\
3401 									\
3402     speed_starttime ();							\
3403     i = s->reps;							\
3404     do									\
3405       function (wp, rp, s->xp, s->size, yp);				\
3406     while (--i != 0);							\
3407     t = speed_endtime ();						\
3408 									\
3409     TMP_FREE;								\
3410     return t;								\
3411   }
3412 
/* Time FUNCTION, invoked as function (n, n), over SPEED_BLOCK_SIZE values
   per repetition.  Each input is the previous n plus twice a limb from
   s->xp_block, so n starts at 1 and stays odd, and every call depends on
   the one before it.  s->time_divisor is set to SPEED_BLOCK_SIZE to
   account for the inner loop.  */
#define SPEED_ROUTINE_MODLIMB_INVERT(function) \
  { \
    unsigned   i, j; \
    mp_ptr     xp; \
    mp_limb_t  n = 1; \
    double     t; \
    \
    /* Index with j-1 rather than biasing the pointer by -1, so no \
       pointer before the start of the block is ever formed. */ \
    xp = s->xp_block; \
    \
    speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
	j = SPEED_BLOCK_SIZE; \
	do \
	  { \
	    /* randomized but successively dependent */ \
	    n += (xp[j-1] << 1); \
	    \
	    function (n, n); \
	  } \
	while (--j != 0); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    /* make sure the compiler won't optimize away n */ \
    noop_1 (n); \
    \
    s->time_divisor = SPEED_BLOCK_SIZE; \
    return t; \
  }
3448 
3449 
/* Time the statement CALL, repeated s->reps times.  CALL may refer to the
   locals wp and wp2, two destination areas of s->size limbs each, and to
   s->xp, registered as an s->size limb source.  Requires s->size >= 1.  */
#define SPEED_ROUTINE_MPN_SQRTROOT_CALL(call) \
  { \
    mp_ptr    wp; \
    mp_ptr    wp2; \
    unsigned  i; \
    double    t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    /* two result areas, each with its own requested alignment */ \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp,  s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_operand_dst (s, wp2, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
	call; \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
3478 
3479 
/* Time FUNCTION, called as function (ptr, size), on an input intended as a
   worst case for perfect power detection: several prime factors, all above
   the trial division limit.  The operand is (2^17-1)^(power-1) * (2^16+1)
   with power = s->size * GMP_NUMB_BITS / 17.  Requires s->size >= 10.  */
#define SPEED_ROUTINE_MPN_PERFECT_POWER(function) \
  { \
    mpz_t     worst; \
    unsigned  exponent; \
    unsigned  rep; \
    double    elapsed; \
    \
    SPEED_RESTRICT_COND (s->size >= 10); \
    \
    mpz_init (worst); \
    exponent = s->size * GMP_NUMB_BITS / 17; \
    mpz_ui_pow_ui (worst, (1 << 17) - 1, exponent - 1); \
    /* extra factor, larger than 1000th prime */ \
    mpz_mul_ui (worst, worst, (1 << 16) + 1); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
	function (PTR(worst), SIZ(worst)); \
      } \
    while (--rep != 0); \
    elapsed = speed_endtime (); \
    \
    mpz_clear (worst); \
    return elapsed; \
  }
3505 
/* Time FUNCTION, called as function (ptr, size), on an operand that really
   is a square (meant as the worst case for perfect_square).  The root is
   taken from the low s->size/2 limbs of s->xp with bit number
   s->size * GMP_NUMB_BITS / 2 - 1 set, and is then squared.  Requires
   s->size >= 2.  */
#define SPEED_ROUTINE_MPN_PERFECT_SQUARE(function) \
  { \
    mpz_t     square; \
    unsigned  rep; \
    double    elapsed; \
    \
    SPEED_RESTRICT_COND (s->size >= 2); \
    \
    /* root first, squared in place afterwards */ \
    mpz_init_set_n (square, s->xp, s->size / 2); \
    mpz_setbit (square, s->size * GMP_NUMB_BITS / 2 - 1); \
    mpz_mul (square, square, square); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
	function (PTR(square), SIZ(square)); \
      } \
    while (--rep != 0); \
    elapsed = speed_endtime (); \
    \
    mpz_clear (square); \
    return elapsed; \
  }
3528 
3529 
/* Time FUNCTION, called as function (str, base, limbs, s->size).  s->size
   is the number of limbs in the input; s->r selects the base, with 0
   meaning decimal.  Only bases 2 to 256 are accepted.  The input is copied
   afresh from s->xp inside the timed loop before every call.  */
#define SPEED_ROUTINE_MPN_GET_STR(function) \
  { \
    unsigned char *str; \
    mp_size_t str_len; \
    mp_ptr limbs; \
    int radix; \
    unsigned rep; \
    double elapsed; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    if (s->r == 0) \
      radix = 10; \
    else \
      radix = s->r; \
    SPEED_RESTRICT_COND (radix >= 2 && radix <= 256); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (limbs, s->size + 1, s->align_xp); \
    \
    /* output buffer sized from the digit count MPN_SIZEINBASE gives */ \
    MPN_SIZEINBASE (str_len, s->xp, s->size, radix); \
    str = (unsigned char *) TMP_ALLOC (str_len); \
    \
    /* During development, overflow of str can be guarded against with \
         MPN_COPY (limbs, s->xp, s->size); \
         ASSERT_ALWAYS (mpn_get_str (str, radix, limbs, s->size) \
                        <= str_len); \
    */ \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, limbs, s->size); \
    speed_operand_dst (s, (mp_ptr) str, str_len/GMP_LIMB_BYTES); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
	MPN_COPY (limbs, s->xp, s->size); \
	function (str, radix, limbs, s->size); \
      } \
    while (--rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
3577 
/* Time the statement CALL, repeated s->reps times.  s->size is the number
   of digits in the input; s->r selects the base, with 0 meaning decimal,
   and only bases 2 to 256 are accepted.  CALL may refer to these locals:
   xp, the s->size digit bytes (each s->xp[i] % base); wp, a destination
   of wn limbs; and base.  */
#define SPEED_ROUTINE_MPN_SET_STR_CALL(call) \
  { \
    unsigned char *xp; \
    mp_ptr     wp; \
    mp_size_t  wn; \
    unsigned   i; \
    int        base; \
    double     t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    if (s->r == 0) \
      base = 10; \
    else \
      base = s->r; \
    SPEED_RESTRICT_COND (base >= 2 && base <= 256); \
    \
    TMP_MARK; \
    \
    /* digit string: one byte per digit, each reduced below base */ \
    xp = (unsigned char *) TMP_ALLOC (s->size); \
    for (i = 0; i < s->size; i++) \
      { \
	xp[i] = s->xp[i] % base; \
      } \
    \
    /* result space, as estimated by LIMBS_PER_DIGIT_IN_BASE */ \
    LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base); \
    SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp); \
    \
    /* During development, wn can be checked to be big enough with \
         ASSERT_ALWAYS (mpn_set_str (wp, xp, s->size, base) <= wn); \
    */ \
    \
    speed_operand_src (s, (mp_ptr) xp, s->size/GMP_LIMB_BYTES); \
    speed_operand_dst (s, wp, wn); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
	call; \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
3623 
3624 
/* Time an accel gcd find_a() style FUNCTION, called as function (pair)
   with pair pointing at two limbs.  A block of SPEED_BLOCK_SIZE different
   pairs is run through on every repetition, in case some values are
   particularly fast or slow.  s->size is not used; the amount of data is
   fixed, and s->time_divisor is set to SPEED_BLOCK_SIZE.  */

#define SPEED_ROUTINE_MPN_GCD_FINDA(function) \
  { \
    mp_limb_t pairs[SPEED_BLOCK_SIZE][2]; \
    unsigned  rep; \
    unsigned  k; \
    double    elapsed; \
    TMP_DECL; \
    \
    TMP_MARK; \
    \
    /* low must be odd, high must be non-zero */ \
    for (k = 0; k < SPEED_BLOCK_SIZE; k++) \
      { \
	mp_limb_t high = s->yp_block[k]; \
	\
	pairs[k][0] = s->xp_block[k] | 1; \
	pairs[k][1] = (high == 0 ? 1 : high); \
      } \
    \
    speed_operand_src (s, &pairs[0][0], 2*SPEED_BLOCK_SIZE); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
	/* walk the block from its last pair down to its first */ \
	for (k = SPEED_BLOCK_SIZE; k != 0; k--) \
	  function (pairs[k-1]); \
      } \
    while (--rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    \
    s->time_divisor = SPEED_BLOCK_SIZE; \
    return elapsed; \
  }
3667 
3668 
/* SPEED_ROUTINE_COUNT_ZEROS_A and SPEED_ROUTINE_COUNT_ZEROS_B are the two
   halves of one routine body: A opens the body and the timed loops, B
   closes them.  The statement placed between them is the "call", which
   should do "count_foo_zeros(c,n)" on the locals c (int) and n (mp_limb_t).
   Give leading=1 if foo is leading zeros, leading=0 for trailing.
   Give zero=1 if n=0 is allowed in the call, zero=0 if not.

   The data block is prepared by speed_routine_count_zeros_setup; when that
   reports failure the routine gives up with -1.0.  */

#define SPEED_ROUTINE_COUNT_ZEROS_A(leading, zero) \
  { \
    mp_ptr     xp; \
    int        i, c; \
    unsigned   j; \
    mp_limb_t  n; \
    double     t; \
    TMP_DECL; \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (xp, SPEED_BLOCK_SIZE, s->align_xp); \
    \
    if (! speed_routine_count_zeros_setup (s, xp, leading, zero)) \
      { \
	/* release the block allocated above before bailing out */ \
	TMP_FREE; \
	return -1.0; \
      } \
    speed_operand_src (s, xp, SPEED_BLOCK_SIZE); \
    speed_cache_fill (s); \
    \
    c = 0; \
    speed_starttime (); \
    j = s->reps; \
    do { \
      for (i = 0; i < SPEED_BLOCK_SIZE; i++) \
	{ \
	  n = xp[i]; \
	  /* mix in the previous result, so each call depends on the \
	     one before it */ \
	  n ^= c;

#define SPEED_ROUTINE_COUNT_ZEROS_B() \
	} \
    } while (--j != 0); \
    t = speed_endtime (); \
    \
    /* don't let c go dead */ \
    noop_1 (c); \
    \
    s->time_divisor = SPEED_BLOCK_SIZE; \
    \
    TMP_FREE; \
    return t; \
  }
3712 
/* Complete count-zeros routine body: the opening half, then the statement
   CALL (which works on the locals c and n), then the closing half.
   LEADING and ZERO are handed on unchanged to the opening half.  */
#define SPEED_ROUTINE_COUNT_ZEROS_C(call, leading, zero) \
  do \
    { \
      SPEED_ROUTINE_COUNT_ZEROS_A (leading, zero); \
      call; \
      SPEED_ROUTINE_COUNT_ZEROS_B (); \
    } \
  while (0)
3719 
/* Leading zeros, timing an arbitrary statement CALL; ZERO says whether
   n=0 is allowed in the call.  */
#define SPEED_ROUTINE_COUNT_LEADING_ZEROS_C(call, zero) \
  SPEED_ROUTINE_COUNT_ZEROS_C (call, 1, zero)

/* Leading zeros, timing FUN invoked as fun (c, n), with n=0 not allowed.  */
#define SPEED_ROUTINE_COUNT_LEADING_ZEROS(fun) \
  SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 1, 0)
3724 
/* Trailing zeros, timing an arbitrary statement CALL; ZERO says whether
   n=0 is allowed in the call.  */
#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS_C(call,zero)			\
  SPEED_ROUTINE_COUNT_ZEROS_C (call, 0, zero)

/* Trailing zeros, timing FUN invoked as fun (c, n), with n=0 not allowed.
   The parameter has to be named fun, matching its use in the expansion;
   otherwise the argument is ignored and a stray "fun" is left behind.  */
#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS(fun)				\
  SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 0, 0)
3729 
3730 
/* Time the statement CALL over SPEED_BLOCK_SIZE divisors per repetition.
   CALL works on the locals d and dinv.  Each d is the previous dinv xored
   with a limb from s->xp_block, with GMP_LIMB_HIGHBIT forced on, so the
   calls form a dependent chain (given CALL stores its result in dinv).
   s->time_divisor is set to SPEED_BLOCK_SIZE.  */
#define SPEED_ROUTINE_INVERT_LIMB_CALL(call) \
  { \
    unsigned   i; \
    unsigned   j; \
    mp_limb_t  d; \
    mp_limb_t  dinv; \
    mp_ptr     xp; \
    \
    dinv = 0; \
    /* biased by one limb since j runs SPEED_BLOCK_SIZE down to 1 */ \
    xp = s->xp_block - 1; \
    \
    s->time_divisor = SPEED_BLOCK_SIZE; \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
	j = SPEED_BLOCK_SIZE; \
	do \
	  { \
	    d = (dinv ^ xp[j]) | GMP_LIMB_HIGHBIT; \
	    do { call; } while (0); \
	  } \
	while (--j != 0); \
      } \
    while (--i != 0); \
    \
    /* don't let the compiler optimize everything away */ \
    noop_1 (dinv); \
    \
    return speed_endtime(); \
  }
3759 
3760 
/* Time s->reps consecutive calls of FUNCTION, which takes no arguments;
   no operands are registered and no cache priming is done.  */
#define SPEED_ROUTINE_MPN_BACK_TO_BACK(function) \
  { \
    unsigned  rep; \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
	function (); \
      } \
    while (--rep != 0); \
    return speed_endtime (); \
  }
3771 
3772 
/* Time the statement CALL, repeated s->reps times.  CALL may refer to the
   local wp, a destination area of s->size limbs.  Any s->size >= 0 is
   accepted.  */
#define SPEED_ROUTINE_MPN_ZERO_CALL(call) \
  { \
    mp_ptr    wp; \
    unsigned  i; \
    double    t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
    \
    /* the destination is the only operand */ \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
	call; \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
3797 
/* Time FUNCTION invoked as function (wp, s->size) on the destination area
   set up by SPEED_ROUTINE_MPN_ZERO_CALL.  */
#define SPEED_ROUTINE_MPN_ZERO(function) \
  SPEED_ROUTINE_MPN_ZERO_CALL (function (wp, s->size))
3800 
3801 
3802 #endif
3803