1 /* Header for speed and threshold things. 2 3 Copyright 1999-2003, 2005, 2006, 2008-2017, 2019 Free Software 4 Foundation, Inc. 5 6 This file is part of the GNU MP Library. 7 8 The GNU MP Library is free software; you can redistribute it and/or modify 9 it under the terms of either: 10 11 * the GNU Lesser General Public License as published by the Free 12 Software Foundation; either version 3 of the License, or (at your 13 option) any later version. 14 15 or 16 17 * the GNU General Public License as published by the Free Software 18 Foundation; either version 2 of the License, or (at your option) any 19 later version. 20 21 or both in parallel, as here. 22 23 The GNU MP Library is distributed in the hope that it will be useful, but 24 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 25 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 26 for more details. 27 28 You should have received copies of the GNU General Public License and the 29 GNU Lesser General Public License along with the GNU MP Library. If not, 30 see https://www.gnu.org/licenses/. */ 31 32 #ifndef __SPEED_H__ 33 #define __SPEED_H__ 34 35 36 /* Pad ptr,oldsize with zero limbs (at the most significant end) to make it 37 newsize long. */ 38 #define MPN_ZERO_EXTEND(ptr, oldsize, newsize) \ 39 do { \ 40 ASSERT ((newsize) >= (oldsize)); \ 41 MPN_ZERO ((ptr)+(oldsize), (newsize)-(oldsize)); \ 42 } while (0) 43 44 /* A mask of the least significant n bits. Note 1<<32 doesn't give zero on 45 x86 family CPUs, hence the separate case for GMP_LIMB_BITS. */ 46 #define MP_LIMB_T_LOWBITMASK(n) \ 47 ((n) == GMP_LIMB_BITS ? MP_LIMB_T_MAX : ((mp_limb_t) 1 << (n)) - 1) 48 49 50 /* align must be a power of 2 here, usually CACHE_LINE_SIZE is a good choice */ 51 52 #define TMP_ALLOC_ALIGNED(bytes, align) \ 53 align_pointer (TMP_ALLOC ((bytes) + (align)-1), (align)) 54 #define TMP_ALLOC_LIMBS_ALIGNED(limbs, align) \ 55 ((mp_ptr) TMP_ALLOC_ALIGNED ((limbs)*sizeof(mp_limb_t), align)) 56 57 /* CACHE_LINE_SIZE is our default alignment for speed operands, and the 58 limit on what s->align_xp etc and then request for off-alignment. Maybe 59 this should be an option of some sort, but in any case here are some line 60 sizes, 61 62 bytes 63 32 pentium 64 64 athlon 65 64 itanium-2 L1 66 128 itanium-2 L2 67 */ 68 #undef CACHE_LINE_SIZE 69 #define CACHE_LINE_SIZE 64 /* bytes */ 70 71 #define SPEED_TMP_ALLOC_ADJUST_MASK (CACHE_LINE_SIZE/GMP_LIMB_BYTES - 1) 72 73 /* Set ptr to a TMP_ALLOC block of the given limbs, with the given limb 74 alignment. */ 75 #define SPEED_TMP_ALLOC_LIMBS(ptr, limbs, align) \ 76 do { \ 77 mp_ptr __ptr; \ 78 mp_size_t __ptr_align, __ptr_add; \ 79 \ 80 ASSERT ((CACHE_LINE_SIZE % GMP_LIMB_BYTES) == 0); \ 81 __ptr = TMP_ALLOC_LIMBS ((limbs) + SPEED_TMP_ALLOC_ADJUST_MASK); \ 82 __ptr_align = (__ptr - (mp_ptr) NULL); \ 83 __ptr_add = ((align) - __ptr_align) & SPEED_TMP_ALLOC_ADJUST_MASK; \ 84 (ptr) = __ptr + __ptr_add; \ 85 } while (0) 86 87 88 /* This is the size for s->xp_block and s->yp_block, used in certain 89 routines that want to run across many different data values and use 90 s->size for a different purpose, eg. SPEED_ROUTINE_MPN_GCD_1. 91 92 512 means 2kbytes of data for each of xp_block and yp_block, making 4k 93 total, which should fit easily in any L1 data cache. */ 94 95 #define SPEED_BLOCK_SIZE 512 /* limbs */ 96 97 98 extern double speed_unittime; 99 extern double speed_cycletime; 100 extern int speed_precision; 101 extern char speed_time_string[]; 102 void speed_time_init (void); 103 void speed_cycletime_fail (const char *str); 104 void speed_cycletime_init (void); 105 void speed_cycletime_need_cycles (void); 106 void speed_cycletime_need_seconds (void); 107 void speed_starttime (void); 108 double speed_endtime (void); 109 110 111 struct speed_params { 112 unsigned reps; /* how many times to run the routine */ 113 mp_ptr xp; /* first argument */ 114 mp_ptr yp; /* second argument */ 115 mp_size_t size; /* size of both arguments */ 116 mp_limb_t r; /* user supplied parameter */ 117 mp_size_t align_xp; /* alignment of xp */ 118 mp_size_t align_yp; /* alignment of yp */ 119 mp_size_t align_wp; /* intended alignment of wp */ 120 mp_size_t align_wp2; /* intended alignment of wp2 */ 121 mp_ptr xp_block; /* first special SPEED_BLOCK_SIZE block */ 122 mp_ptr yp_block; /* second special SPEED_BLOCK_SIZE block */ 123 124 double time_divisor; /* optionally set by the speed routine */ 125 126 /* used by the cache priming things */ 127 int cache; 128 unsigned src_num, dst_num; 129 struct { 130 mp_ptr ptr; 131 mp_size_t size; 132 } src[5], dst[4]; 133 }; 134 135 typedef double (*speed_function_t) (struct speed_params *); 136 137 double speed_measure (speed_function_t fun, struct speed_params *); 138 139 /* Prototypes for speed measuring routines */ 140 141 double speed_back_to_back (struct speed_params *); 142 double speed_count_leading_zeros (struct speed_params *); 143 double speed_count_trailing_zeros (struct speed_params *); 144 double speed_find_a (struct speed_params *); 145 double speed_gmp_allocate_free (struct speed_params *); 146 double speed_gmp_allocate_reallocate_free (struct speed_params *); 147 double speed_invert_limb (struct speed_params *); 148 double speed_malloc_free (struct speed_params *); 149 double speed_malloc_realloc_free (struct speed_params *); 150 double speed_memcpy (struct speed_params *); 151 double speed_binvert_limb (struct speed_params *); 152 double speed_binvert_limb_mul1 (struct speed_params *); 153 double speed_binvert_limb_loop (struct speed_params *); 154 double speed_binvert_limb_cond (struct speed_params *); 155 double speed_binvert_limb_arith (struct speed_params *); 156 157 double speed_mpf_init_clear (struct speed_params *); 158 159 double speed_mpn_add_n (struct speed_params *); 160 double speed_mpn_add_1 (struct speed_params *); 161 double speed_mpn_add_1_inplace (struct speed_params *); 162 double speed_mpn_add_err1_n (struct speed_params *); 163 double speed_mpn_add_err2_n (struct speed_params *); 164 double speed_mpn_add_err3_n (struct speed_params *); 165 double speed_mpn_addlsh_n (struct speed_params *); 166 double speed_mpn_addlsh1_n (struct speed_params *); 167 double speed_mpn_addlsh2_n (struct speed_params *); 168 double speed_mpn_addlsh_n_ip1 (struct speed_params *); 169 double speed_mpn_addlsh1_n_ip1 (struct speed_params *); 170 double speed_mpn_addlsh2_n_ip1 (struct speed_params *); 171 double speed_mpn_addlsh_n_ip2 (struct speed_params *); 172 double speed_mpn_addlsh1_n_ip2 (struct speed_params *); 173 double speed_mpn_addlsh2_n_ip2 (struct speed_params *); 174 double speed_mpn_add_n_sub_n (struct speed_params *); 175 double speed_mpn_and_n (struct speed_params *); 176 double speed_mpn_andn_n (struct speed_params *); 177 double speed_mpn_addmul_1 (struct speed_params *); 178 double speed_mpn_addmul_2 (struct speed_params *); 179 double speed_mpn_addmul_3 (struct speed_params *); 180 double speed_mpn_addmul_4 (struct speed_params *); 181 double speed_mpn_addmul_5 (struct speed_params *); 182 double speed_mpn_addmul_6 (struct speed_params *); 183 double speed_mpn_addmul_7 (struct speed_params *); 184 double speed_mpn_addmul_8 (struct speed_params *); 185 double speed_mpn_cnd_add_n (struct speed_params *); 186 double speed_mpn_cnd_sub_n (struct speed_params *); 187 double speed_mpn_com (struct speed_params *); 188 double speed_mpn_neg (struct speed_params *); 189 double speed_mpn_copyd (struct speed_params *); 190 double speed_mpn_copyi (struct speed_params *); 191 double speed_MPN_COPY (struct speed_params *); 192 double speed_MPN_COPY_DECR (struct speed_params *); 193 double speed_MPN_COPY_INCR (struct speed_params *); 194 double speed_mpn_sec_tabselect (struct speed_params *); 195 double speed_mpn_divexact_1 (struct speed_params *); 196 double speed_mpn_divexact_by3 (struct speed_params *); 197 double speed_mpn_bdiv_q_1 (struct speed_params *); 198 double speed_mpn_pi1_bdiv_q_1 (struct speed_params *); 199 double speed_mpn_bdiv_dbm1c (struct speed_params *); 200 double speed_mpn_divrem_1 (struct speed_params *); 201 double speed_mpn_divrem_1f (struct speed_params *); 202 double speed_mpn_divrem_1c (struct speed_params *); 203 double speed_mpn_divrem_1cf (struct speed_params *); 204 double speed_mpn_divrem_1_div (struct speed_params *); 205 double speed_mpn_divrem_1f_div (struct speed_params *); 206 double speed_mpn_divrem_1_inv (struct speed_params *); 207 double speed_mpn_divrem_1f_inv (struct speed_params *); 208 double speed_mpn_divrem_2 (struct speed_params *); 209 double speed_mpn_divrem_2_div (struct speed_params *); 210 double speed_mpn_divrem_2_inv (struct speed_params *); 211 double speed_mpn_div_qr_1n_pi1 (struct speed_params *); 212 double speed_mpn_div_qr_1n_pi1_1 (struct speed_params *); 213 double speed_mpn_div_qr_1n_pi1_2 (struct speed_params *); 214 double speed_mpn_div_qr_1 (struct speed_params *); 215 double speed_mpn_div_qr_2n (struct speed_params *); 216 double speed_mpn_div_qr_2u (struct speed_params *); 217 double speed_mpn_fib2_ui (struct speed_params *); 218 double speed_mpn_matrix22_mul (struct speed_params *); 219 double speed_mpn_hgcd2 (struct speed_params *); 220 double speed_mpn_hgcd2_1 (struct speed_params *); 221 double speed_mpn_hgcd2_2 (struct speed_params *); 222 double speed_mpn_hgcd2_3 (struct speed_params *); 223 double speed_mpn_hgcd2_4 (struct speed_params *); 224 double speed_mpn_hgcd2_5 (struct speed_params *); 225 double speed_mpn_hgcd (struct speed_params *); 226 double speed_mpn_hgcd_lehmer (struct speed_params *); 227 double speed_mpn_hgcd_appr (struct speed_params *); 228 double speed_mpn_hgcd_appr_lehmer (struct speed_params *); 229 double speed_mpn_hgcd_reduce (struct speed_params *); 230 double speed_mpn_hgcd_reduce_1 (struct speed_params *); 231 double speed_mpn_hgcd_reduce_2 (struct speed_params *); 232 double speed_mpn_gcd (struct speed_params *); 233 double speed_mpn_gcd_1 (struct speed_params *); 234 double speed_mpn_gcd_11 (struct speed_params *); 235 double speed_mpn_gcd_1N (struct speed_params *); 236 double speed_mpn_gcd_22 (struct speed_params *); 237 double speed_mpn_gcdext (struct speed_params *); 238 double speed_mpn_gcdext_double (struct speed_params *); 239 double speed_mpn_gcdext_one_double (struct speed_params *); 240 double speed_mpn_gcdext_one_single (struct speed_params *); 241 double speed_mpn_gcdext_single (struct speed_params *); 242 double speed_mpn_get_str (struct speed_params *); 243 double speed_mpn_hamdist (struct speed_params *); 244 double speed_mpn_ior_n (struct speed_params *); 245 double speed_mpn_iorn_n (struct speed_params *); 246 double speed_mpn_jacobi_base (struct speed_params *); 247 double speed_mpn_jacobi_base_1 (struct speed_params *); 248 double speed_mpn_jacobi_base_2 (struct speed_params *); 249 double speed_mpn_jacobi_base_3 (struct speed_params *); 250 double speed_mpn_jacobi_base_4 (struct speed_params *); 251 double speed_mpn_lshift (struct speed_params *); 252 double speed_mpn_lshiftc (struct speed_params *); 253 double speed_mpn_mod_1 (struct speed_params *); 254 double speed_mpn_mod_1c (struct speed_params *); 255 double speed_mpn_mod_1_div (struct speed_params *); 256 double speed_mpn_mod_1_inv (struct speed_params *); 257 double speed_mpn_mod_1_1 (struct speed_params *); 258 double speed_mpn_mod_1_1_1 (struct speed_params *); 259 double speed_mpn_mod_1_1_2 (struct speed_params *); 260 double speed_mpn_mod_1_2 (struct speed_params *); 261 double speed_mpn_mod_1_3 (struct speed_params *); 262 double speed_mpn_mod_1_4 (struct speed_params *); 263 double speed_mpn_mod_34lsub1 (struct speed_params *); 264 double speed_mpn_modexact_1_odd (struct speed_params *); 265 double speed_mpn_modexact_1c_odd (struct speed_params *); 266 double speed_mpn_mul_1 (struct speed_params *); 267 double speed_mpn_mul_1_inplace (struct speed_params *); 268 double speed_mpn_mul_2 (struct speed_params *); 269 double speed_mpn_mul_3 (struct speed_params *); 270 double speed_mpn_mul_4 (struct speed_params *); 271 double speed_mpn_mul_5 (struct speed_params *); 272 double speed_mpn_mul_6 (struct speed_params *); 273 double speed_mpn_mul (struct speed_params *); 274 double speed_mpn_mul_basecase (struct speed_params *); 275 double speed_mpn_mulmid (struct speed_params *); 276 double speed_mpn_mulmid_basecase (struct speed_params *); 277 double speed_mpn_mul_fft (struct speed_params *); 278 double speed_mpn_mul_fft_sqr (struct speed_params *); 279 double speed_mpn_fft_mul (struct speed_params *); 280 double speed_mpn_fft_sqr (struct speed_params *); 281 #if WANT_OLD_FFT_FULL 282 double speed_mpn_mul_fft_full (struct speed_params *); 283 double speed_mpn_mul_fft_full_sqr (struct speed_params *); 284 #endif 285 double speed_mpn_nussbaumer_mul (struct speed_params *); 286 double speed_mpn_nussbaumer_mul_sqr (struct speed_params *); 287 double speed_mpn_mul_n (struct speed_params *); 288 double speed_mpn_mul_n_sqr (struct speed_params *); 289 double speed_mpn_mulmid_n (struct speed_params *); 290 double speed_mpn_sqrlo (struct speed_params *); 291 double speed_mpn_sqrlo_basecase (struct speed_params *); 292 double speed_mpn_mullo_n (struct speed_params *); 293 double speed_mpn_mullo_basecase (struct speed_params *); 294 double speed_mpn_nand_n (struct speed_params *); 295 double speed_mpn_nior_n (struct speed_params *); 296 double speed_mpn_popcount (struct speed_params *); 297 double speed_mpn_preinv_divrem_1 (struct speed_params *); 298 double speed_mpn_preinv_divrem_1f (struct speed_params *); 299 double speed_mpn_preinv_mod_1 (struct speed_params *); 300 double speed_mpn_sbpi1_div_qr (struct speed_params *); 301 double speed_mpn_dcpi1_div_qr (struct speed_params *); 302 double speed_mpn_sbpi1_divappr_q (struct speed_params *); 303 double speed_mpn_dcpi1_divappr_q (struct speed_params *); 304 double speed_mpn_mu_div_qr (struct speed_params *); 305 double speed_mpn_mu_divappr_q (struct speed_params *); 306 double speed_mpn_mupi_div_qr (struct speed_params *); 307 double speed_mpn_mu_div_q (struct speed_params *); 308 double speed_mpn_sbpi1_bdiv_qr (struct speed_params *); 309 double speed_mpn_dcpi1_bdiv_qr (struct speed_params *); 310 double speed_mpn_sbpi1_bdiv_q (struct speed_params *); 311 double speed_mpn_dcpi1_bdiv_q (struct speed_params *); 312 double speed_mpn_sbpi1_bdiv_r (struct speed_params *); 313 double speed_mpn_mu_bdiv_q (struct speed_params *); 314 double speed_mpn_mu_bdiv_qr (struct speed_params *); 315 double speed_mpn_broot (struct speed_params *); 316 double speed_mpn_broot_invm1 (struct speed_params *); 317 double speed_mpn_brootinv (struct speed_params *); 318 double speed_mpn_invert (struct speed_params *); 319 double speed_mpn_invertappr (struct speed_params *); 320 double speed_mpn_ni_invertappr (struct speed_params *); 321 double speed_mpn_sec_invert (struct speed_params *s); 322 double speed_mpn_binvert (struct speed_params *); 323 double speed_mpn_redc_1 (struct speed_params *); 324 double speed_mpn_redc_2 (struct speed_params *); 325 double speed_mpn_redc_n (struct speed_params *); 326 double speed_mpn_rsblsh_n (struct speed_params *); 327 double speed_mpn_rsblsh1_n (struct speed_params *); 328 double speed_mpn_rsblsh2_n (struct speed_params *); 329 double speed_mpn_rsh1add_n (struct speed_params *); 330 double speed_mpn_rsh1sub_n (struct speed_params *); 331 double speed_mpn_rshift (struct speed_params *); 332 double speed_mpn_sb_divrem_m3 (struct speed_params *); 333 double speed_mpn_sb_divrem_m3_div (struct speed_params *); 334 double speed_mpn_sb_divrem_m3_inv (struct speed_params *); 335 double speed_mpn_set_str (struct speed_params *); 336 double speed_mpn_bc_set_str (struct speed_params *); 337 double speed_mpn_dc_set_str (struct speed_params *); 338 double speed_mpn_set_str_pre (struct speed_params *); 339 double speed_mpn_sqr_basecase (struct speed_params *); 340 double speed_mpn_sqr_diag_addlsh1 (struct speed_params *); 341 double speed_mpn_sqr_diagonal (struct speed_params *); 342 double speed_mpn_sqr (struct speed_params *); 343 double speed_mpn_sqrtrem (struct speed_params *); 344 double speed_mpn_rootrem (struct speed_params *); 345 double speed_mpn_sqrt (struct speed_params *); 346 double speed_mpn_root (struct speed_params *); 347 double speed_mpn_perfect_power_p (struct speed_params *); 348 double speed_mpn_perfect_square_p (struct speed_params *); 349 double speed_mpn_sub_n (struct speed_params *); 350 double speed_mpn_sub_1 (struct speed_params *); 351 double speed_mpn_sub_1_inplace (struct speed_params *); 352 double speed_mpn_sub_err1_n (struct speed_params *); 353 double speed_mpn_sub_err2_n (struct speed_params *); 354 double speed_mpn_sub_err3_n (struct speed_params *); 355 double speed_mpn_sublsh_n (struct speed_params *); 356 double speed_mpn_sublsh1_n (struct speed_params *); 357 double speed_mpn_sublsh2_n (struct speed_params *); 358 double speed_mpn_sublsh_n_ip1 (struct speed_params *); 359 double speed_mpn_sublsh1_n_ip1 (struct speed_params *); 360 double speed_mpn_sublsh2_n_ip1 (struct speed_params *); 361 double speed_mpn_submul_1 (struct speed_params *); 362 double speed_mpn_toom2_sqr (struct speed_params *); 363 double speed_mpn_toom3_sqr (struct speed_params *); 364 double speed_mpn_toom4_sqr (struct speed_params *); 365 double speed_mpn_toom6_sqr (struct speed_params *); 366 double speed_mpn_toom8_sqr (struct speed_params *); 367 double speed_mpn_toom22_mul (struct speed_params *); 368 double speed_mpn_toom33_mul (struct speed_params *); 369 double speed_mpn_toom44_mul (struct speed_params *); 370 double speed_mpn_toom6h_mul (struct speed_params *); 371 double speed_mpn_toom8h_mul (struct speed_params *); 372 double speed_mpn_toom32_mul (struct speed_params *); 373 double speed_mpn_toom42_mul (struct speed_params *); 374 double speed_mpn_toom43_mul (struct speed_params *); 375 double speed_mpn_toom63_mul (struct speed_params *); 376 double speed_mpn_toom32_for_toom43_mul (struct speed_params *); 377 double speed_mpn_toom43_for_toom32_mul (struct speed_params *); 378 double speed_mpn_toom32_for_toom53_mul (struct speed_params *); 379 double speed_mpn_toom53_for_toom32_mul (struct speed_params *); 380 double speed_mpn_toom42_for_toom53_mul (struct speed_params *); 381 double speed_mpn_toom53_for_toom42_mul (struct speed_params *); 382 double speed_mpn_toom43_for_toom54_mul (struct speed_params *); 383 double speed_mpn_toom54_for_toom43_mul (struct speed_params *); 384 double speed_mpn_toom42_mulmid (struct speed_params *); 385 double speed_mpn_mulmod_bnm1 (struct speed_params *); 386 double speed_mpn_bc_mulmod_bnm1 (struct speed_params *); 387 double speed_mpn_mulmod_bnm1_rounded (struct speed_params *); 388 double speed_mpn_sqrmod_bnm1 (struct speed_params *); 389 double speed_mpn_udiv_qrnnd (struct speed_params *); 390 double speed_mpn_udiv_qrnnd_r (struct speed_params *); 391 double speed_mpn_umul_ppmm (struct speed_params *); 392 double speed_mpn_umul_ppmm_r (struct speed_params *); 393 double speed_mpn_xnor_n (struct speed_params *); 394 double speed_mpn_xor_n (struct speed_params *); 395 double speed_MPN_ZERO (struct speed_params *); 396 397 double speed_mpq_init_clear (struct speed_params *); 398 399 double speed_mpz_add (struct speed_params *); 400 double speed_mpz_invert (struct speed_params *); 401 double speed_mpz_bin_uiui (struct speed_params *); 402 double speed_mpz_bin_ui (struct speed_params *); 403 double speed_mpz_fac_ui (struct speed_params *); 404 double speed_mpz_2fac_ui (struct speed_params *); 405 double speed_mpz_mfac_uiui (struct speed_params *); 406 double speed_mpz_primorial_ui (struct speed_params *); 407 double speed_mpz_fib_ui (struct speed_params *); 408 double speed_mpz_fib2_ui (struct speed_params *); 409 double speed_mpz_init_clear (struct speed_params *); 410 double speed_mpz_init_realloc_clear (struct speed_params *); 411 double speed_mpz_nextprime (struct speed_params *); 412 double speed_mpz_jacobi (struct speed_params *); 413 double speed_mpz_lucnum_ui (struct speed_params *); 414 double speed_mpz_lucnum2_ui (struct speed_params *); 415 double speed_mpz_mod (struct speed_params *); 416 double speed_mpz_powm (struct speed_params *); 417 double speed_mpz_powm_mod (struct speed_params *); 418 double speed_mpz_powm_redc (struct speed_params *); 419 double speed_mpz_powm_sec (struct speed_params *); 420 double speed_mpz_powm_ui (struct speed_params *); 421 double speed_mpz_urandomb (struct speed_params *); 422 423 double speed_gmp_randseed (struct speed_params *); 424 double speed_gmp_randseed_ui (struct speed_params *); 425 426 double speed_noop (struct speed_params *); 427 double speed_noop_wxs (struct speed_params *); 428 double speed_noop_wxys (struct speed_params *); 429 430 double speed_operator_div (struct speed_params *); 431 double speed_operator_mod (struct speed_params *); 432 433 double speed_udiv_qrnnd (struct speed_params *); 434 double speed_udiv_qrnnd_preinv1 (struct speed_params *); 435 double speed_udiv_qrnnd_preinv2 (struct speed_params *); 436 double speed_udiv_qrnnd_preinv3 (struct speed_params *); 437 double speed_udiv_qrnnd_c (struct speed_params *); 438 double speed_umul_ppmm (struct speed_params *); 439 440 /* Prototypes for other routines */ 441 442 #if defined (__cplusplus) 443 extern "C" { 444 #endif 445 446 /* low 32-bits in p[0], high 32-bits in p[1] */ 447 void speed_cyclecounter (unsigned p[2]); 448 449 #if defined (__cplusplus) 450 } 451 #endif 452 453 void mftb_function (unsigned p[2]); 454 455 double speed_cyclecounter_diff (const unsigned [2], const unsigned [2]); 456 int gettimeofday_microseconds_p (void); 457 int getrusage_microseconds_p (void); 458 int cycles_works_p (void); 459 long clk_tck (void); 460 double freq_measure (const char *, double (*)(void)); 461 462 int double_cmp_ptr (const double *, const double *); 463 void pentium_wbinvd (void); 464 typedef int (*qsort_function_t) (const void *, const void *); 465 466 void noop (void); 467 void noop_1 (mp_limb_t); 468 void noop_wxs (mp_ptr, mp_srcptr, mp_size_t); 469 void noop_wxys (mp_ptr, mp_srcptr, mp_srcptr, mp_size_t); 470 void mpn_cache_fill (mp_srcptr, mp_size_t); 471 void mpn_cache_fill_dummy (mp_limb_t); 472 void speed_cache_fill (struct speed_params *); 473 void speed_operand_src (struct speed_params *, mp_ptr, mp_size_t); 474 void speed_operand_dst (struct speed_params *, mp_ptr, mp_size_t); 475 476 extern int speed_option_addrs; 477 extern int speed_option_verbose; 478 extern int speed_option_cycles_broken; 479 void speed_option_set (const char *); 480 481 mp_limb_t mpn_div_qr_1n_pi1_1 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t); 482 mp_limb_t mpn_div_qr_1n_pi1_2 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t); 483 484 mp_limb_t mpn_divrem_1_div (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t); 485 mp_limb_t mpn_divrem_1_inv (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t); 486 mp_limb_t mpn_divrem_2_div (mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr); 487 mp_limb_t mpn_divrem_2_inv (mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr); 488 489 int mpn_jacobi_base_1 (mp_limb_t, mp_limb_t, int); 490 int mpn_jacobi_base_2 (mp_limb_t, mp_limb_t, int); 491 int mpn_jacobi_base_3 (mp_limb_t, mp_limb_t, int); 492 int mpn_jacobi_base_4 (mp_limb_t, mp_limb_t, int); 493 494 int mpn_hgcd2_1 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*); 495 int mpn_hgcd2_2 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*); 496 int mpn_hgcd2_3 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*); 497 int mpn_hgcd2_4 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*); 498 int mpn_hgcd2_5 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*); 499 500 mp_limb_t mpn_mod_1_div (mp_srcptr, mp_size_t, mp_limb_t); 501 mp_limb_t mpn_mod_1_inv (mp_srcptr, mp_size_t, mp_limb_t); 502 503 mp_limb_t mpn_mod_1_1p_1 (mp_srcptr, mp_size_t, mp_limb_t, const mp_limb_t [4]); 504 mp_limb_t mpn_mod_1_1p_2 (mp_srcptr, mp_size_t, mp_limb_t, const mp_limb_t [4]); 505 506 void mpn_mod_1_1p_cps_1 (mp_limb_t [4], mp_limb_t); 507 void mpn_mod_1_1p_cps_2 (mp_limb_t [4], mp_limb_t); 508 509 mp_size_t mpn_gcdext_one_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t); 510 mp_size_t mpn_gcdext_one_single (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t); 511 mp_size_t mpn_gcdext_single (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t); 512 mp_size_t mpn_gcdext_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t); 513 mp_size_t mpn_hgcd_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr); 514 mp_size_t mpn_hgcd_lehmer_itch (mp_size_t); 515 516 mp_size_t mpn_hgcd_appr_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr); 517 mp_size_t mpn_hgcd_appr_lehmer_itch (mp_size_t); 518 519 mp_size_t mpn_hgcd_reduce_1 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr); 520 mp_size_t mpn_hgcd_reduce_1_itch (mp_size_t, mp_size_t); 521 522 mp_size_t mpn_hgcd_reduce_2 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr); 523 mp_size_t mpn_hgcd_reduce_2_itch (mp_size_t, mp_size_t); 524 525 mp_limb_t mpn_sb_divrem_mn_div (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t); 526 mp_limb_t mpn_sb_divrem_mn_inv (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t); 527 528 mp_size_t mpn_set_str_basecase (mp_ptr, const unsigned char *, size_t, int); 529 void mpn_pre_set_str (mp_ptr, unsigned char *, size_t, powers_t *, mp_ptr); 530 531 void mpz_powm_mod (mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr); 532 void mpz_powm_redc (mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr); 533 534 int speed_routine_count_zeros_setup (struct speed_params *, mp_ptr, int, int); 535 536 537 /* "get" is called repeatedly until it ticks over, just in case on a fast 538 processor it takes less than a microsecond, though this is probably 539 unlikely if it's a system call. 540 541 speed_cyclecounter is called on the same side of the "get" for the start 542 and end measurements. It doesn't matter how long it takes from the "get" 543 sample to the cycles sample, since that period will cancel out in the 544 difference calculation (assuming it's the same each time). 545 546 Letting the test run for more than a process time slice is probably only 547 going to reduce accuracy, especially for getrusage when the cycle counter 548 is real time, or for gettimeofday if the cycle counter is in fact process 549 time. Use CLK_TCK/2 as a reasonable stop. 550 551 It'd be desirable to be quite accurate here. The default speed_precision 552 for a cycle counter is 10000 cycles, so to mix that with getrusage or 553 gettimeofday the frequency should be at least that accurate. But running 554 measurements for 10000 microseconds (or more) is too long. Be satisfied 555 with just a half clock tick (5000 microseconds usually). */ 556 557 #define FREQ_MEASURE_ONE(name, type, get, getc, sec, usec) \ 558 do { \ 559 type st1, st, et1, et; \ 560 unsigned sc[2], ec[2]; \ 561 long dt, half_tick; \ 562 double dc, cyc; \ 563 \ 564 half_tick = (1000000L / clk_tck()) / 2; \ 565 \ 566 get (st1); \ 567 do { \ 568 get (st); \ 569 } while (usec(st) == usec(st1) && sec(st) == sec(st1)); \ 570 \ 571 getc (sc); \ 572 \ 573 for (;;) \ 574 { \ 575 get (et1); \ 576 do { \ 577 get (et); \ 578 } while (usec(et) == usec(et1) && sec(et) == sec(et1)); \ 579 \ 580 getc (ec); \ 581 \ 582 dc = speed_cyclecounter_diff (ec, sc); \ 583 \ 584 /* allow secs to cancel before multiplying */ \ 585 dt = sec(et) - sec(st); \ 586 dt = dt * 1000000L + (usec(et) - usec(st)); \ 587 \ 588 if (dt >= half_tick) \ 589 break; \ 590 } \ 591 \ 592 cyc = dt * 1e-6 / dc; \ 593 \ 594 if (speed_option_verbose >= 2) \ 595 printf ("freq_measure_%s_one() dc=%.6g dt=%ld cyc=%.6g\n", \ 596 name, dc, dt, cyc); \ 597 \ 598 return dt * 1e-6 / dc; \ 599 \ 600 } while (0) 601 602 603 604 605 /* The measuring routines use these big macros to save duplication for 606 similar forms. They also get used for some automatically generated 607 measuring of new implementations of functions. 608 609 Having something like SPEED_ROUTINE_BINARY_N as a subroutine accepting a 610 function pointer is considered undesirable since it's not the way a 611 normal application will be calling, and some processors might do 612 different things with an indirect call, like not branch predicting, or 613 doing a full pipe flush. At least some of the "functions" measured are 614 actually macros too. 615 616 The net effect is to bloat the object code, possibly in a big way, but 617 only what's being measured is being run, so that doesn't matter. 618 619 The loop forms don't try to cope with __GMP_ATTRIBUTE_PURE or 620 ATTRIBUTE_CONST on the called functions. Adding a cast to a non-pure 621 function pointer doesn't work in gcc 3.2. Using an actual non-pure 622 function pointer variable works, but stands a real risk of a 623 non-optimizing compiler generating unnecessary overheads in the call. 624 Currently the best idea is not to use those attributes for a timing 625 program build. __GMP_NO_ATTRIBUTE_CONST_PURE will tell gmp.h and 626 gmp-impl.h to omit them from routines there. */ 627 628 #define SPEED_RESTRICT_COND(cond) if (!(cond)) return -1.0; 629 630 /* For mpn_copy or similar. */ 631 #define SPEED_ROUTINE_MPN_COPY_CALL(call) \ 632 { \ 633 mp_ptr wp; \ 634 unsigned i; \ 635 double t; \ 636 TMP_DECL; \ 637 \ 638 SPEED_RESTRICT_COND (s->size >= 0); \ 639 \ 640 TMP_MARK; \ 641 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 642 \ 643 speed_operand_src (s, s->xp, s->size); \ 644 speed_operand_dst (s, wp, s->size); \ 645 speed_cache_fill (s); \ 646 \ 647 speed_starttime (); \ 648 i = s->reps; \ 649 do \ 650 call; \ 651 while (--i != 0); \ 652 t = speed_endtime (); \ 653 \ 654 TMP_FREE; \ 655 return t; \ 656 } 657 #define SPEED_ROUTINE_MPN_COPY(function) \ 658 SPEED_ROUTINE_MPN_COPY_CALL (function (wp, s->xp, s->size)) 659 660 #define SPEED_ROUTINE_MPN_TABSELECT(function) \ 661 { \ 662 mp_ptr xp, wp; \ 663 unsigned i; \ 664 double t; \ 665 TMP_DECL; \ 666 \ 667 SPEED_RESTRICT_COND (s->size >= 0); \ 668 \ 669 if (s->r == 0) \ 670 s->r = s->size; /* default to a quadratic shape */ \ 671 \ 672 TMP_MARK; \ 673 SPEED_TMP_ALLOC_LIMBS (xp, s->size * s->r, s->align_xp); \ 674 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 675 \ 676 speed_operand_src (s, xp, s->size * s->r); \ 677 speed_operand_dst (s, wp, s->size); \ 678 speed_cache_fill (s); \ 679 \ 680 speed_starttime (); \ 681 i = s->reps; \ 682 do \ 683 function (wp, xp, s->size, s->r, (s->r) / 2); \ 684 while (--i != 0); \ 685 t = speed_endtime () / s->r; \ 686 \ 687 TMP_FREE; \ 688 return t; \ 689 } 690 691 692 #define SPEED_ROUTINE_MPN_COPYC(function) \ 693 { \ 694 mp_ptr wp; \ 695 unsigned i; \ 696 double t; \ 697 TMP_DECL; \ 698 \ 699 SPEED_RESTRICT_COND (s->size >= 0); \ 700 \ 701 TMP_MARK; \ 702 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 703 \ 704 speed_operand_src (s, s->xp, s->size); \ 705 speed_operand_dst (s, wp, s->size); \ 706 speed_cache_fill (s); \ 707 \ 708 speed_starttime (); \ 709 i = s->reps; \ 710 do \ 711 function (wp, s->xp, s->size, 0); \ 712 while (--i != 0); \ 713 t = speed_endtime (); \ 714 \ 715 TMP_FREE; \ 716 return t; \ 717 } 718 719 /* s->size is still in limbs, and it's limbs which are copied, but 720 "function" takes a size in bytes not limbs. */ 721 #define SPEED_ROUTINE_MPN_COPY_BYTES(function) \ 722 { \ 723 mp_ptr wp; \ 724 unsigned i; \ 725 double t; \ 726 TMP_DECL; \ 727 \ 728 SPEED_RESTRICT_COND (s->size >= 0); \ 729 \ 730 TMP_MARK; \ 731 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 732 \ 733 speed_operand_src (s, s->xp, s->size); \ 734 speed_operand_dst (s, wp, s->size); \ 735 speed_cache_fill (s); \ 736 \ 737 speed_starttime (); \ 738 i = s->reps; \ 739 do \ 740 function (wp, s->xp, s->size * GMP_LIMB_BYTES); \ 741 while (--i != 0); \ 742 t = speed_endtime (); \ 743 \ 744 TMP_FREE; \ 745 return t; \ 746 } 747 748 749 /* For mpn_add_n, mpn_sub_n, or similar. */ 750 #define SPEED_ROUTINE_MPN_BINARY_N_CALL(call) \ 751 { \ 752 mp_ptr wp; \ 753 mp_ptr xp, yp; \ 754 unsigned i; \ 755 double t; \ 756 TMP_DECL; \ 757 \ 758 SPEED_RESTRICT_COND (s->size >= 1); \ 759 \ 760 TMP_MARK; \ 761 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 762 \ 763 xp = s->xp; \ 764 yp = s->yp; \ 765 \ 766 if (s->r == 0) ; \ 767 else if (s->r == 1) { xp = wp; } \ 768 else if (s->r == 2) { yp = wp; } \ 769 else if (s->r == 3) { xp = wp; yp = wp; } \ 770 else if (s->r == 4) { yp = xp; } \ 771 else { \ 772 TMP_FREE; \ 773 return -1.0; \ 774 } \ 775 \ 776 /* initialize wp if operand overlap */ \ 777 if (xp == wp || yp == wp) \ 778 MPN_COPY (wp, s->xp, s->size); \ 779 \ 780 speed_operand_src (s, xp, s->size); \ 781 speed_operand_src (s, yp, s->size); \ 782 speed_operand_dst (s, wp, s->size); \ 783 speed_cache_fill (s); \ 784 \ 785 speed_starttime (); \ 786 i = s->reps; \ 787 do \ 788 call; \ 789 while (--i != 0); \ 790 t = speed_endtime (); \ 791 \ 792 TMP_FREE; \ 793 return t; \ 794 } 795 796 797 /* For mpn_aors_errK_n, where 1 <= K <= 3. */ 798 #define SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL(call, K) \ 799 { \ 800 mp_ptr wp; \ 801 mp_ptr xp, yp; \ 802 mp_ptr zp[K]; \ 803 mp_limb_t ep[2*K]; \ 804 unsigned i; \ 805 double t; \ 806 TMP_DECL; \ 807 \ 808 SPEED_RESTRICT_COND (s->size >= 1); \ 809 \ 810 TMP_MARK; \ 811 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 812 \ 813 /* (don't have a mechanism to specify zp alignments) */ \ 814 for (i = 0; i < K; i++) \ 815 SPEED_TMP_ALLOC_LIMBS (zp[i], s->size, 0); \ 816 \ 817 xp = s->xp; \ 818 yp = s->yp; \ 819 \ 820 if (s->r == 0) ; \ 821 else if (s->r == 1) { xp = wp; } \ 822 else if (s->r == 2) { yp = wp; } \ 823 else if (s->r == 3) { xp = wp; yp = wp; } \ 824 else if (s->r == 4) { yp = xp; } \ 825 else { \ 826 TMP_FREE; \ 827 return -1.0; \ 828 } \ 829 \ 830 /* initialize wp if operand overlap */ \ 831 if (xp == wp || yp == wp) \ 832 MPN_COPY (wp, s->xp, s->size); \ 833 \ 834 speed_operand_src (s, xp, s->size); \ 835 speed_operand_src (s, yp, s->size); \ 836 for (i = 0; i < K; i++) \ 837 speed_operand_src (s, zp[i], s->size); \ 838 speed_operand_dst (s, wp, s->size); \ 839 speed_cache_fill (s); \ 840 \ 841 speed_starttime (); \ 842 i = s->reps; \ 843 do \ 844 call; \ 845 while (--i != 0); \ 846 t = speed_endtime (); \ 847 \ 848 TMP_FREE; \ 849 return t; \ 850 } 851 852 #define SPEED_ROUTINE_MPN_BINARY_ERR1_N(function) \ 853 SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], s->size, 0), 1) 854 855 #define SPEED_ROUTINE_MPN_BINARY_ERR2_N(function) \ 856 SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], zp[1], s->size, 0), 2) 857 858 #define SPEED_ROUTINE_MPN_BINARY_ERR3_N(function) \ 859 SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], zp[1], zp[2], s->size, 0), 3) 860 861 862 /* For mpn_add_n, mpn_sub_n, or similar. */ 863 #define SPEED_ROUTINE_MPN_ADDSUB_N_CALL(call) \ 864 { \ 865 mp_ptr ap, sp; \ 866 mp_ptr xp, yp; \ 867 unsigned i; \ 868 double t; \ 869 TMP_DECL; \ 870 \ 871 SPEED_RESTRICT_COND (s->size >= 1); \ 872 \ 873 TMP_MARK; \ 874 SPEED_TMP_ALLOC_LIMBS (ap, s->size, s->align_wp); \ 875 SPEED_TMP_ALLOC_LIMBS (sp, s->size, s->align_wp); \ 876 \ 877 xp = s->xp; \ 878 yp = s->yp; \ 879 \ 880 if ((s->r & 1) != 0) { xp = ap; } \ 881 if ((s->r & 2) != 0) { yp = ap; } \ 882 if ((s->r & 4) != 0) { xp = sp; } \ 883 if ((s->r & 8) != 0) { yp = sp; } \ 884 if ((s->r & 3) == 3 || (s->r & 12) == 12) \ 885 { \ 886 TMP_FREE; \ 887 return -1.0; \ 888 } \ 889 \ 890 /* initialize ap if operand overlap */ \ 891 if (xp == ap || yp == ap) \ 892 MPN_COPY (ap, s->xp, s->size); \ 893 /* initialize sp if operand overlap */ \ 894 if (xp == sp || yp == sp) \ 895 MPN_COPY (sp, s->xp, s->size); \ 896 \ 897 speed_operand_src (s, xp, s->size); \ 898 speed_operand_src (s, yp, s->size); \ 899 speed_operand_dst (s, ap, s->size); \ 900 speed_operand_dst (s, sp, s->size); \ 901 speed_cache_fill (s); \ 902 \ 903 speed_starttime (); \ 904 i = s->reps; \ 905 do \ 906 call; \ 907 while (--i != 0); \ 908 t = speed_endtime (); \ 909 \ 910 TMP_FREE; \ 911 return t; \ 912 } 913 914 #define SPEED_ROUTINE_MPN_BINARY_N(function) \ 915 SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size)) 916 917 #define SPEED_ROUTINE_MPN_BINARY_NC(function) \ 918 SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size, 0)) 919 920 921 /* For mpn_lshift, mpn_rshift, mpn_mul_1, with r, or similar. */ 922 #define SPEED_ROUTINE_MPN_UNARY_1_CALL(call) \ 923 { \ 924 mp_ptr wp; \ 925 unsigned i; \ 926 double t; \ 927 TMP_DECL; \ 928 \ 929 SPEED_RESTRICT_COND (s->size >= 1); \ 930 \ 931 TMP_MARK; \ 932 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 933 \ 934 speed_operand_src (s, s->xp, s->size); \ 935 speed_operand_dst (s, wp, s->size); \ 936 speed_cache_fill (s); \ 937 \ 938 speed_starttime (); \ 939 i = s->reps; \ 940 do \ 941 call; \ 942 while (--i != 0); \ 943 t = speed_endtime (); \ 944 \ 945 TMP_FREE; \ 946 return t; \ 947 } 948 949 #define SPEED_ROUTINE_MPN_UNARY_1(function) \ 950 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r)) 951 952 #define SPEED_ROUTINE_MPN_UNARY_1C(function) \ 953 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0)) 954 955 /* FIXME: wp is uninitialized here, should start it off from xp */ 956 #define SPEED_ROUTINE_MPN_UNARY_1_INPLACE(function) \ 957 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, wp, s->size, s->r)) 958 959 #define SPEED_ROUTINE_MPN_DIVEXACT_1(function) \ 960 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r)) 961 962 #define SPEED_ROUTINE_MPN_BDIV_Q_1(function) \ 963 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r)) 964 965 #define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL(call) \ 966 { \ 967 unsigned shift; \ 968 mp_limb_t dinv; \ 969 \ 970 SPEED_RESTRICT_COND (s->size > 0); \ 971 SPEED_RESTRICT_COND (s->r != 0); \ 972 \ 973 count_trailing_zeros (shift, s->r); \ 974 binvert_limb (dinv, s->r >> shift); \ 975 \ 976 SPEED_ROUTINE_MPN_UNARY_1_CALL (call); \ 977 } 978 #define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1(function) \ 979 SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL \ 980 ((*function) (wp, s->xp, s->size, s->r, dinv, shift)) 981 982 #define SPEED_ROUTINE_MPN_BDIV_DBM1C(function) \ 983 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0)) 984 985 #define SPEED_ROUTINE_MPN_DIVREM_1(function) \ 986 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r)) 987 988 #define SPEED_ROUTINE_MPN_DIVREM_1C(function) \ 989 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r, 0)) 990 991 #define SPEED_ROUTINE_MPN_DIVREM_1F(function) \ 992 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r)) 993 994 #define SPEED_ROUTINE_MPN_DIVREM_1CF(function) \ 995 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r, 0)) 996 997 998 #define SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL(call) \ 999 { \ 1000 unsigned shift; \ 1001 mp_limb_t dinv; \ 1002 \ 1003 SPEED_RESTRICT_COND (s->size >= 0); \ 1004 SPEED_RESTRICT_COND (s->r != 0); \ 1005 \ 1006 count_leading_zeros (shift, s->r); \ 1007 invert_limb (dinv, s->r << shift); \ 1008 \ 1009 SPEED_ROUTINE_MPN_UNARY_1_CALL (call); \ 1010 } \ 1011 1012 #define SPEED_ROUTINE_MPN_PREINV_DIVREM_1(function) \ 1013 SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL \ 1014 ((*function) (wp, 0, s->xp, s->size, s->r, dinv, shift)) 1015 1016 /* s->size limbs worth of fraction part */ 1017 #define SPEED_ROUTINE_MPN_PREINV_DIVREM_1F(function) \ 1018 SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL \ 1019 ((*function) (wp, s->size, s->xp, 0, s->r, dinv, shift)) 1020 1021 1022 /* s->r is duplicated to form the multiplier, defaulting to 1023 MP_BASES_BIG_BASE_10. Not sure if that's particularly useful, but at 1024 least it provides some control. */ 1025 #define SPEED_ROUTINE_MPN_UNARY_N(function,N) \ 1026 { \ 1027 mp_ptr wp; \ 1028 mp_size_t wn; \ 1029 unsigned i; \ 1030 double t; \ 1031 mp_limb_t yp[N]; \ 1032 TMP_DECL; \ 1033 \ 1034 SPEED_RESTRICT_COND (s->size >= N); \ 1035 \ 1036 TMP_MARK; \ 1037 wn = s->size + N-1; \ 1038 SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp); \ 1039 for (i = 0; i < N; i++) \ 1040 yp[i] = (s->r != 0 ? s->r : MP_BASES_BIG_BASE_10); \ 1041 \ 1042 speed_operand_src (s, s->xp, s->size); \ 1043 speed_operand_src (s, yp, (mp_size_t) N); \ 1044 speed_operand_dst (s, wp, wn); \ 1045 speed_cache_fill (s); \ 1046 \ 1047 speed_starttime (); \ 1048 i = s->reps; \ 1049 do \ 1050 function (wp, s->xp, s->size, yp); \ 1051 while (--i != 0); \ 1052 t = speed_endtime (); \ 1053 \ 1054 TMP_FREE; \ 1055 return t; \ 1056 } 1057 1058 #define SPEED_ROUTINE_MPN_UNARY_2(function) \ 1059 SPEED_ROUTINE_MPN_UNARY_N (function, 2) 1060 #define SPEED_ROUTINE_MPN_UNARY_3(function) \ 1061 SPEED_ROUTINE_MPN_UNARY_N (function, 3) 1062 #define SPEED_ROUTINE_MPN_UNARY_4(function) \ 1063 SPEED_ROUTINE_MPN_UNARY_N (function, 4) 1064 #define SPEED_ROUTINE_MPN_UNARY_5(function) \ 1065 SPEED_ROUTINE_MPN_UNARY_N (function, 5) 1066 #define SPEED_ROUTINE_MPN_UNARY_6(function) \ 1067 SPEED_ROUTINE_MPN_UNARY_N (function, 6) 1068 #define SPEED_ROUTINE_MPN_UNARY_7(function) \ 1069 SPEED_ROUTINE_MPN_UNARY_N (function, 7) 1070 #define SPEED_ROUTINE_MPN_UNARY_8(function) \ 1071 SPEED_ROUTINE_MPN_UNARY_N (function, 8) 1072 1073 1074 /* For mpn_mul, mpn_mul_basecase, xsize=r, ysize=s->size. */ 1075 #define SPEED_ROUTINE_MPN_MUL(function) \ 1076 { \ 1077 mp_ptr wp; \ 1078 mp_size_t size1; \ 1079 unsigned i; \ 1080 double t; \ 1081 TMP_DECL; \ 1082 \ 1083 size1 = (s->r == 0 ? s->size : s->r); \ 1084 if (size1 < 0) size1 = -size1 - s->size; \ 1085 \ 1086 SPEED_RESTRICT_COND (size1 >= 1); \ 1087 SPEED_RESTRICT_COND (s->size >= size1); \ 1088 \ 1089 TMP_MARK; \ 1090 SPEED_TMP_ALLOC_LIMBS (wp, size1 + s->size, s->align_wp); \ 1091 \ 1092 speed_operand_src (s, s->xp, s->size); \ 1093 speed_operand_src (s, s->yp, size1); \ 1094 speed_operand_dst (s, wp, size1 + s->size); \ 1095 speed_cache_fill (s); \ 1096 \ 1097 speed_starttime (); \ 1098 i = s->reps; \ 1099 do \ 1100 function (wp, s->xp, s->size, s->yp, size1); \ 1101 while (--i != 0); \ 1102 t = speed_endtime (); \ 1103 \ 1104 TMP_FREE; \ 1105 return t; \ 1106 } 1107 1108 1109 #define SPEED_ROUTINE_MPN_MUL_N_CALL(call) \ 1110 { \ 1111 mp_ptr wp; \ 1112 unsigned i; \ 1113 double t; \ 1114 TMP_DECL; \ 1115 \ 1116 SPEED_RESTRICT_COND (s->size >= 1); \ 1117 \ 1118 TMP_MARK; \ 1119 SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \ 1120 \ 1121 speed_operand_src (s, s->xp, s->size); \ 1122 speed_operand_src (s, s->yp, s->size); \ 1123 speed_operand_dst (s, wp, 2*s->size); \ 1124 speed_cache_fill (s); \ 1125 \ 1126 speed_starttime (); \ 1127 i = s->reps; \ 1128 do \ 1129 call; \ 1130 while (--i != 0); \ 1131 t = speed_endtime (); \ 1132 \ 1133 TMP_FREE; \ 1134 return t; \ 1135 } 1136 1137 #define SPEED_ROUTINE_MPN_MUL_N(function) \ 1138 SPEED_ROUTINE_MPN_MUL_N_CALL (function (wp, s->xp, s->yp, s->size)); 1139 1140 #define SPEED_ROUTINE_MPN_MULLO_N_CALL(call) \ 1141 { \ 1142 mp_ptr wp; \ 1143 unsigned i; \ 1144 double t; \ 1145 TMP_DECL; \ 1146 \ 1147 SPEED_RESTRICT_COND (s->size >= 1); \ 1148 \ 1149 TMP_MARK; \ 1150 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 1151 \ 1152 speed_operand_src (s, s->xp, s->size); \ 1153 speed_operand_src (s, s->yp, s->size); \ 1154 speed_operand_dst (s, wp, s->size); \ 1155 speed_cache_fill (s); \ 1156 \ 1157 speed_starttime (); \ 1158 i = s->reps; \ 1159 do \ 1160 call; \ 1161 while (--i != 0); \ 1162 t = speed_endtime (); \ 1163 \ 1164 TMP_FREE; \ 1165 return t; \ 1166 } 1167 1168 #define SPEED_ROUTINE_MPN_MULLO_N(function) \ 1169 SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size)); 1170 1171 #define SPEED_ROUTINE_MPN_MULLO_BASECASE(function) \ 1172 SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size)); 1173 1174 #define SPEED_ROUTINE_MPN_SQRLO(function) \ 1175 { \ 1176 mp_ptr wp; \ 1177 unsigned i; \ 1178 double t; \ 1179 TMP_DECL; \ 1180 \ 1181 SPEED_RESTRICT_COND (s->size >= 1); \ 1182 \ 1183 TMP_MARK; \ 1184 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 1185 \ 1186 speed_operand_src (s, s->xp, s->size); \ 1187 speed_operand_dst (s, wp, s->size); \ 1188 speed_cache_fill (s); \ 1189 \ 1190 speed_starttime (); \ 1191 i = s->reps; \ 1192 do \ 1193 function (wp, s->xp, s->size); \ 1194 while (--i != 0); \ 1195 t = speed_endtime (); \ 1196 \ 1197 TMP_FREE; \ 1198 return t; \ 1199 } 1200 1201 /* For mpn_mulmid, mpn_mulmid_basecase, xsize=r, ysize=s->size. */ 1202 #define SPEED_ROUTINE_MPN_MULMID(function) \ 1203 { \ 1204 mp_ptr wp, xp; \ 1205 mp_size_t size1; \ 1206 unsigned i; \ 1207 double t; \ 1208 TMP_DECL; \ 1209 \ 1210 size1 = (s->r == 0 ? (2 * s->size - 1) : s->r); \ 1211 \ 1212 SPEED_RESTRICT_COND (s->size >= 1); \ 1213 SPEED_RESTRICT_COND (size1 >= s->size); \ 1214 \ 1215 TMP_MARK; \ 1216 SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp); \ 1217 SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp); \ 1218 \ 1219 speed_operand_src (s, xp, size1); \ 1220 speed_operand_src (s, s->yp, s->size); \ 1221 speed_operand_dst (s, wp, size1 - s->size + 3); \ 1222 speed_cache_fill (s); \ 1223 \ 1224 speed_starttime (); \ 1225 i = s->reps; \ 1226 do \ 1227 function (wp, xp, size1, s->yp, s->size); \ 1228 while (--i != 0); \ 1229 t = speed_endtime (); \ 1230 \ 1231 TMP_FREE; \ 1232 return t; \ 1233 } 1234 1235 #define SPEED_ROUTINE_MPN_MULMID_N(function) \ 1236 { \ 1237 mp_ptr wp, xp; \ 1238 mp_size_t size1; \ 1239 unsigned i; \ 1240 double t; \ 1241 TMP_DECL; \ 1242 \ 1243 size1 = 2 * s->size - 1; \ 1244 \ 1245 SPEED_RESTRICT_COND (s->size >= 1); \ 1246 \ 1247 TMP_MARK; \ 1248 SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp); \ 1249 SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp); \ 1250 \ 1251 speed_operand_src (s, xp, size1); \ 1252 speed_operand_src (s, s->yp, s->size); \ 1253 speed_operand_dst (s, wp, size1 - s->size + 3); \ 1254 speed_cache_fill (s); \ 1255 \ 1256 speed_starttime (); \ 1257 i = s->reps; \ 1258 do \ 1259 function (wp, xp, s->yp, s->size); \ 1260 while (--i != 0); \ 1261 t = speed_endtime (); \ 1262 \ 1263 TMP_FREE; \ 1264 return t; \ 1265 } 1266 1267 #define SPEED_ROUTINE_MPN_TOOM42_MULMID(function) \ 1268 { \ 1269 mp_ptr wp, xp, scratch; \ 1270 mp_size_t size1, scratch_size; \ 1271 unsigned i; \ 1272 double t; \ 1273 TMP_DECL; \ 1274 \ 1275 size1 = 2 * s->size - 1; \ 1276 \ 1277 SPEED_RESTRICT_COND (s->size >= 1); \ 1278 \ 1279 TMP_MARK; \ 1280 SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp); \ 1281 SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp); \ 1282 scratch_size = mpn_toom42_mulmid_itch (s->size); \ 1283 SPEED_TMP_ALLOC_LIMBS (scratch, scratch_size, 0); \ 1284 \ 1285 speed_operand_src (s, xp, size1); \ 1286 speed_operand_src (s, s->yp, s->size); \ 1287 speed_operand_dst (s, wp, size1 - s->size + 3); \ 1288 speed_cache_fill (s); \ 1289 \ 1290 speed_starttime (); \ 1291 i = s->reps; \ 1292 do \ 1293 function (wp, xp, s->yp, s->size, scratch); \ 1294 while (--i != 0); \ 1295 t = speed_endtime (); \ 1296 \ 1297 TMP_FREE; \ 1298 return t; \ 1299 } 1300 1301 #define SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL(call) \ 1302 { \ 1303 mp_ptr wp, tp; \ 1304 unsigned i; \ 1305 double t; \ 1306 mp_size_t itch; \ 1307 TMP_DECL; \ 1308 \ 1309 SPEED_RESTRICT_COND (s->size >= 1); \ 1310 \ 1311 itch = mpn_mulmod_bnm1_itch (s->size, s->size, s->size); \ 1312 \ 1313 TMP_MARK; \ 1314 SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp); \ 1315 SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2); \ 1316 \ 1317 speed_operand_src (s, s->xp, s->size); \ 1318 speed_operand_src (s, s->yp, s->size); \ 1319 speed_operand_dst (s, wp, 2 * s->size); \ 1320 speed_operand_dst (s, tp, itch); \ 1321 speed_cache_fill (s); \ 1322 \ 1323 speed_starttime (); \ 1324 i = s->reps; \ 1325 do \ 1326 call; \ 1327 while (--i != 0); \ 1328 t = speed_endtime (); \ 1329 \ 1330 TMP_FREE; \ 1331 return t; \ 1332 } 1333 #define SPEED_ROUTINE_MPN_MULMOD_BNM1_ROUNDED(function) \ 1334 { \ 1335 mp_ptr wp, tp; \ 1336 unsigned i; \ 1337 double t; \ 1338 mp_size_t size, itch; \ 1339 TMP_DECL; \ 1340 \ 1341 SPEED_RESTRICT_COND (s->size >= 1); \ 1342 \ 1343 size = mpn_mulmod_bnm1_next_size (s->size); \ 1344 itch = mpn_mulmod_bnm1_itch (size, size, size); \ 1345 \ 1346 TMP_MARK; \ 1347 SPEED_TMP_ALLOC_LIMBS (wp, size, s->align_wp); \ 1348 SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2); \ 1349 \ 1350 speed_operand_src (s, s->xp, s->size); \ 1351 speed_operand_src (s, s->yp, s->size); \ 1352 speed_operand_dst (s, wp, size); \ 1353 speed_operand_dst (s, tp, itch); \ 1354 speed_cache_fill (s); \ 1355 \ 1356 speed_starttime (); \ 1357 i = s->reps; \ 1358 do \ 1359 function (wp, size, s->xp, s->size, s->yp, s->size, tp); \ 1360 while (--i != 0); \ 1361 t = speed_endtime (); \ 1362 \ 1363 TMP_FREE; \ 1364 return t; \ 1365 } 1366 1367 #define SPEED_ROUTINE_MPN_MUL_N_TSPACE(call, tsize, minsize) \ 1368 { \ 1369 mp_ptr wp, tspace; \ 1370 unsigned i; \ 1371 double t; \ 1372 TMP_DECL; \ 1373 \ 1374 SPEED_RESTRICT_COND (s->size >= minsize); \ 1375 \ 1376 TMP_MARK; \ 1377 SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \ 1378 SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2); \ 1379 \ 1380 speed_operand_src (s, s->xp, s->size); \ 1381 speed_operand_src (s, s->yp, s->size); \ 1382 speed_operand_dst (s, wp, 2*s->size); \ 1383 speed_operand_dst (s, tspace, tsize); \ 1384 speed_cache_fill (s); \ 1385 \ 1386 speed_starttime (); \ 1387 i = s->reps; \ 1388 do \ 1389 call; \ 1390 while (--i != 0); \ 1391 t = speed_endtime (); \ 1392 \ 1393 TMP_FREE; \ 1394 return t; \ 1395 } 1396 1397 #define SPEED_ROUTINE_MPN_TOOM22_MUL_N(function) \ 1398 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1399 (function (wp, s->xp, s->size, s->yp, s->size, tspace), \ 1400 mpn_toom22_mul_itch (s->size, s->size), \ 1401 MPN_TOOM22_MUL_MINSIZE) 1402 1403 #define SPEED_ROUTINE_MPN_TOOM33_MUL_N(function) \ 1404 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1405 (function (wp, s->xp, s->size, s->yp, s->size, tspace), \ 1406 mpn_toom33_mul_itch (s->size, s->size), \ 1407 MPN_TOOM33_MUL_MINSIZE) 1408 1409 #define SPEED_ROUTINE_MPN_TOOM44_MUL_N(function) \ 1410 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1411 (function (wp, s->xp, s->size, s->yp, s->size, tspace), \ 1412 mpn_toom44_mul_itch (s->size, s->size), \ 1413 MPN_TOOM44_MUL_MINSIZE) 1414 1415 #define SPEED_ROUTINE_MPN_TOOM6H_MUL_N(function) \ 1416 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1417 (function (wp, s->xp, s->size, s->yp, s->size, tspace), \ 1418 mpn_toom6h_mul_itch (s->size, s->size), \ 1419 MPN_TOOM6H_MUL_MINSIZE) 1420 1421 #define SPEED_ROUTINE_MPN_TOOM8H_MUL_N(function) \ 1422 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1423 (function (wp, s->xp, s->size, s->yp, s->size, tspace), \ 1424 mpn_toom8h_mul_itch (s->size, s->size), \ 1425 MPN_TOOM8H_MUL_MINSIZE) 1426 1427 #define SPEED_ROUTINE_MPN_TOOM32_MUL(function) \ 1428 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1429 (function (wp, s->xp, s->size, s->yp, 2*s->size/3, tspace), \ 1430 mpn_toom32_mul_itch (s->size, 2*s->size/3), \ 1431 MPN_TOOM32_MUL_MINSIZE) 1432 1433 #define SPEED_ROUTINE_MPN_TOOM42_MUL(function) \ 1434 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1435 (function (wp, s->xp, s->size, s->yp, s->size/2, tspace), \ 1436 mpn_toom42_mul_itch (s->size, s->size/2), \ 1437 MPN_TOOM42_MUL_MINSIZE) 1438 1439 #define SPEED_ROUTINE_MPN_TOOM43_MUL(function) \ 1440 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1441 (function (wp, s->xp, s->size, s->yp, s->size*3/4, tspace), \ 1442 mpn_toom43_mul_itch (s->size, s->size*3/4), \ 1443 MPN_TOOM43_MUL_MINSIZE) 1444 1445 #define SPEED_ROUTINE_MPN_TOOM63_MUL(function) \ 1446 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1447 (function (wp, s->xp, s->size, s->yp, s->size/2, tspace), \ 1448 mpn_toom63_mul_itch (s->size, s->size/2), \ 1449 MPN_TOOM63_MUL_MINSIZE) 1450 1451 #define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL(function) \ 1452 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1453 (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace), \ 1454 mpn_toom32_mul_itch (s->size, 17*s->size/24), \ 1455 MPN_TOOM32_MUL_MINSIZE) 1456 #define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL(function) \ 1457 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1458 (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace), \ 1459 mpn_toom43_mul_itch (s->size, 17*s->size/24), \ 1460 MPN_TOOM43_MUL_MINSIZE) 1461 1462 #define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL(function) \ 1463 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1464 (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace), \ 1465 mpn_toom32_mul_itch (s->size, 19*s->size/30), \ 1466 MPN_TOOM32_MUL_MINSIZE) 1467 #define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL(function) \ 1468 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1469 (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace), \ 1470 mpn_toom53_mul_itch (s->size, 19*s->size/30), \ 1471 MPN_TOOM53_MUL_MINSIZE) 1472 1473 #define SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL(function) \ 1474 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1475 (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace), \ 1476 mpn_toom42_mul_itch (s->size, 11*s->size/20), \ 1477 MPN_TOOM42_MUL_MINSIZE) 1478 #define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL(function) \ 1479 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1480 (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace), \ 1481 mpn_toom53_mul_itch (s->size, 11*s->size/20), \ 1482 MPN_TOOM53_MUL_MINSIZE) 1483 1484 #define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM54_MUL(function) \ 1485 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1486 (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace), \ 1487 mpn_toom42_mul_itch (s->size, 5*s->size/6), \ 1488 MPN_TOOM54_MUL_MINSIZE) 1489 #define SPEED_ROUTINE_MPN_TOOM54_FOR_TOOM43_MUL(function) \ 1490 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1491 (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace), \ 1492 mpn_toom54_mul_itch (s->size, 5*s->size/6), \ 1493 MPN_TOOM54_MUL_MINSIZE) 1494 1495 1496 1497 #define SPEED_ROUTINE_MPN_SQR_CALL(call) \ 1498 { \ 1499 mp_ptr wp; \ 1500 unsigned i; \ 1501 double t; \ 1502 TMP_DECL; \ 1503 \ 1504 SPEED_RESTRICT_COND (s->size >= 1); \ 1505 \ 1506 TMP_MARK; \ 1507 SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \ 1508 \ 1509 speed_operand_src (s, s->xp, s->size); \ 1510 speed_operand_dst (s, wp, 2*s->size); \ 1511 speed_cache_fill (s); \ 1512 \ 1513 speed_starttime (); \ 1514 i = s->reps; \ 1515 do \ 1516 call; \ 1517 while (--i != 0); \ 1518 t = speed_endtime (); \ 1519 \ 1520 TMP_FREE; \ 1521 return t; \ 1522 } 1523 1524 #define SPEED_ROUTINE_MPN_SQR(function) \ 1525 SPEED_ROUTINE_MPN_SQR_CALL (function (wp, s->xp, s->size)) 1526 1527 #define SPEED_ROUTINE_MPN_SQR_DIAG_ADDLSH1_CALL(call) \ 1528 { \ 1529 mp_ptr wp, tp; \ 1530 unsigned i; \ 1531 double t; \ 1532 TMP_DECL; \ 1533 \ 1534 SPEED_RESTRICT_COND (s->size >= 2); \ 1535 \ 1536 TMP_MARK; \ 1537 SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_wp); \ 1538 SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp); \ 1539 \ 1540 speed_operand_src (s, s->xp, s->size); \ 1541 speed_operand_src (s, tp, 2 * s->size); \ 1542 speed_operand_dst (s, wp, 2 * s->size); \ 1543 speed_cache_fill (s); \ 1544 \ 1545 speed_starttime (); \ 1546 i = s->reps; \ 1547 do \ 1548 call; \ 1549 while (--i != 0); \ 1550 t = speed_endtime () / 2; \ 1551 \ 1552 TMP_FREE; \ 1553 return t; \ 1554 } 1555 1556 #define SPEED_ROUTINE_MPN_SQR_TSPACE(call, tsize, minsize) \ 1557 { \ 1558 mp_ptr wp, tspace; \ 1559 unsigned i; \ 1560 double t; \ 1561 TMP_DECL; \ 1562 \ 1563 SPEED_RESTRICT_COND (s->size >= minsize); \ 1564 \ 1565 TMP_MARK; \ 1566 SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \ 1567 SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2); \ 1568 \ 1569 speed_operand_src (s, s->xp, s->size); \ 1570 speed_operand_dst (s, wp, 2*s->size); \ 1571 speed_operand_dst (s, tspace, tsize); \ 1572 speed_cache_fill (s); \ 1573 \ 1574 speed_starttime (); \ 1575 i = s->reps; \ 1576 do \ 1577 call; \ 1578 while (--i != 0); \ 1579 t = speed_endtime (); \ 1580 \ 1581 TMP_FREE; \ 1582 return t; \ 1583 } 1584 1585 #define SPEED_ROUTINE_MPN_TOOM2_SQR(function) \ 1586 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \ 1587 mpn_toom2_sqr_itch (s->size), \ 1588 MPN_TOOM2_SQR_MINSIZE) 1589 1590 #define SPEED_ROUTINE_MPN_TOOM3_SQR(function) \ 1591 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \ 1592 mpn_toom3_sqr_itch (s->size), \ 1593 MPN_TOOM3_SQR_MINSIZE) 1594 1595 1596 #define SPEED_ROUTINE_MPN_TOOM4_SQR(function) \ 1597 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \ 1598 mpn_toom4_sqr_itch (s->size), \ 1599 MPN_TOOM4_SQR_MINSIZE) 1600 1601 #define SPEED_ROUTINE_MPN_TOOM6_SQR(function) \ 1602 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \ 1603 mpn_toom6_sqr_itch (s->size), \ 1604 MPN_TOOM6_SQR_MINSIZE) 1605 1606 #define SPEED_ROUTINE_MPN_TOOM8_SQR(function) \ 1607 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \ 1608 mpn_toom8_sqr_itch (s->size), \ 1609 MPN_TOOM8_SQR_MINSIZE) 1610 1611 #define SPEED_ROUTINE_MPN_MOD_CALL(call) \ 1612 { \ 1613 unsigned i; \ 1614 \ 1615 SPEED_RESTRICT_COND (s->size >= 0); \ 1616 \ 1617 speed_operand_src (s, s->xp, s->size); \ 1618 speed_cache_fill (s); \ 1619 \ 1620 speed_starttime (); \ 1621 i = s->reps; \ 1622 do \ 1623 call; \ 1624 while (--i != 0); \ 1625 \ 1626 return speed_endtime (); \ 1627 } 1628 1629 #define SPEED_ROUTINE_MPN_MOD_1(function) \ 1630 SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size, s->r)) 1631 1632 #define SPEED_ROUTINE_MPN_MOD_1C(function) \ 1633 SPEED_ROUTINE_MPN_MOD_CALL ((*function)(s->xp, s->size, s->r, CNST_LIMB(0))) 1634 1635 #define SPEED_ROUTINE_MPN_MODEXACT_1_ODD(function) \ 1636 SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r)); 1637 1638 #define SPEED_ROUTINE_MPN_MODEXACT_1C_ODD(function) \ 1639 SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r, CNST_LIMB(0))); 1640 1641 #define SPEED_ROUTINE_MPN_MOD_34LSUB1(function) \ 1642 SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size)) 1643 1644 #define SPEED_ROUTINE_MPN_PREINV_MOD_1(function) \ 1645 { \ 1646 unsigned i; \ 1647 mp_limb_t inv; \ 1648 \ 1649 SPEED_RESTRICT_COND (s->size >= 0); \ 1650 SPEED_RESTRICT_COND (s->r & GMP_LIMB_HIGHBIT); \ 1651 \ 1652 invert_limb (inv, s->r); \ 1653 speed_operand_src (s, s->xp, s->size); \ 1654 speed_cache_fill (s); \ 1655 \ 1656 speed_starttime (); \ 1657 i = s->reps; \ 1658 do \ 1659 (*function) (s->xp, s->size, s->r, inv); \ 1660 while (--i != 0); \ 1661 \ 1662 return speed_endtime (); \ 1663 } 1664 1665 #define SPEED_ROUTINE_MPN_MOD_1_1(function,pfunc) \ 1666 { \ 1667 unsigned i; \ 1668 mp_limb_t inv[4]; \ 1669 \ 1670 SPEED_RESTRICT_COND (s->size >= 2); \ 1671 \ 1672 mpn_mod_1_1p_cps (inv, s->r); \ 1673 speed_operand_src (s, s->xp, s->size); \ 1674 speed_cache_fill (s); \ 1675 \ 1676 speed_starttime (); \ 1677 i = s->reps; \ 1678 do { \ 1679 pfunc (inv, s->r); \ 1680 function (s->xp, s->size, s->r << inv[1], inv); \ 1681 } while (--i != 0); \ 1682 \ 1683 return speed_endtime (); \ 1684 } 1685 #define SPEED_ROUTINE_MPN_MOD_1_N(function,pfunc,N) \ 1686 { \ 1687 unsigned i; \ 1688 mp_limb_t inv[N+3]; \ 1689 \ 1690 SPEED_RESTRICT_COND (s->size >= 1); \ 1691 SPEED_RESTRICT_COND (s->r <= ~(mp_limb_t)0 / N); \ 1692 \ 1693 speed_operand_src (s, s->xp, s->size); \ 1694 speed_cache_fill (s); \ 1695 \ 1696 speed_starttime (); \ 1697 i = s->reps; \ 1698 do { \ 1699 pfunc (inv, s->r); \ 1700 function (s->xp, s->size, s->r, inv); \ 1701 } while (--i != 0); \ 1702 \ 1703 return speed_endtime (); \ 1704 } 1705 1706 1707 /* A division of 2*s->size by s->size limbs */ 1708 1709 #define SPEED_ROUTINE_MPN_DC_DIVREM_CALL(call) \ 1710 { \ 1711 unsigned i; \ 1712 mp_ptr a, d, q, r; \ 1713 double t; \ 1714 gmp_pi1_t dinv; \ 1715 TMP_DECL; \ 1716 \ 1717 SPEED_RESTRICT_COND (s->size >= 1); \ 1718 \ 1719 TMP_MARK; \ 1720 SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp); \ 1721 SPEED_TMP_ALLOC_LIMBS (d, s->size, s->align_yp); \ 1722 SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp); \ 1723 SPEED_TMP_ALLOC_LIMBS (r, s->size, s->align_wp2); \ 1724 \ 1725 MPN_COPY (a, s->xp, s->size); \ 1726 MPN_COPY (a+s->size, s->xp, s->size); \ 1727 \ 1728 MPN_COPY (d, s->yp, s->size); \ 1729 \ 1730 /* normalize the data */ \ 1731 d[s->size-1] |= GMP_NUMB_HIGHBIT; \ 1732 a[2*s->size-1] = d[s->size-1] - 1; \ 1733 \ 1734 invert_pi1 (dinv, d[s->size-1], d[s->size-2]); \ 1735 \ 1736 speed_operand_src (s, a, 2*s->size); \ 1737 speed_operand_src (s, d, s->size); \ 1738 speed_operand_dst (s, q, s->size+1); \ 1739 speed_operand_dst (s, r, s->size); \ 1740 speed_cache_fill (s); \ 1741 \ 1742 speed_starttime (); \ 1743 i = s->reps; \ 1744 do \ 1745 call; \ 1746 while (--i != 0); \ 1747 t = speed_endtime (); \ 1748 \ 1749 TMP_FREE; \ 1750 return t; \ 1751 } 1752 1753 1754 /* A remainder 2*s->size by s->size limbs */ 1755 1756 #define SPEED_ROUTINE_MPZ_MOD(function) \ 1757 { \ 1758 unsigned i; \ 1759 mpz_t a, d, r; \ 1760 \ 1761 SPEED_RESTRICT_COND (s->size >= 1); \ 1762 \ 1763 mpz_init_set_n (d, s->yp, s->size); \ 1764 \ 1765 /* high part less than d, low part a duplicate copied in */ \ 1766 mpz_init_set_n (a, s->xp, s->size); \ 1767 mpz_mod (a, a, d); \ 1768 mpz_mul_2exp (a, a, GMP_LIMB_BITS * s->size); \ 1769 MPN_COPY (PTR(a), s->xp, s->size); \ 1770 \ 1771 mpz_init (r); \ 1772 \ 1773 speed_operand_src (s, PTR(a), SIZ(a)); \ 1774 speed_operand_src (s, PTR(d), SIZ(d)); \ 1775 speed_cache_fill (s); \ 1776 \ 1777 speed_starttime (); \ 1778 i = s->reps; \ 1779 do \ 1780 function (r, a, d); \ 1781 while (--i != 0); \ 1782 return speed_endtime (); \ 1783 } 1784 1785 #define SPEED_ROUTINE_MPN_PI1_DIV(function, INV, DMIN, QMIN) \ 1786 { \ 1787 unsigned i; \ 1788 mp_ptr dp, tp, ap, qp; \ 1789 gmp_pi1_t inv; \ 1790 double t; \ 1791 mp_size_t size1; \ 1792 TMP_DECL; \ 1793 \ 1794 size1 = (s->r == 0 ? 2 * s->size : s->r); \ 1795 \ 1796 SPEED_RESTRICT_COND (s->size >= DMIN); \ 1797 SPEED_RESTRICT_COND (size1 - s->size >= QMIN); \ 1798 \ 1799 TMP_MARK; \ 1800 SPEED_TMP_ALLOC_LIMBS (ap, size1, s->align_xp); \ 1801 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 1802 SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \ 1803 SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_wp2); \ 1804 \ 1805 /* we don't fill in dividend completely when size1 > s->size */ \ 1806 MPN_COPY (ap, s->xp, s->size); \ 1807 MPN_COPY (ap + size1 - s->size, s->xp, s->size); \ 1808 \ 1809 MPN_COPY (dp, s->yp, s->size); \ 1810 \ 1811 /* normalize the data */ \ 1812 dp[s->size-1] |= GMP_NUMB_HIGHBIT; \ 1813 ap[size1 - 1] = dp[s->size - 1] - 1; \ 1814 \ 1815 invert_pi1 (inv, dp[s->size-1], dp[s->size-2]); \ 1816 \ 1817 speed_operand_src (s, ap, size1); \ 1818 speed_operand_dst (s, tp, size1); \ 1819 speed_operand_src (s, dp, s->size); \ 1820 speed_operand_dst (s, qp, size1 - s->size); \ 1821 speed_cache_fill (s); \ 1822 \ 1823 speed_starttime (); \ 1824 i = s->reps; \ 1825 do { \ 1826 MPN_COPY (tp, ap, size1); \ 1827 function (qp, tp, size1, dp, s->size, INV); \ 1828 } while (--i != 0); \ 1829 t = speed_endtime (); \ 1830 \ 1831 TMP_FREE; \ 1832 return t; \ 1833 } 1834 #define SPEED_ROUTINE_MPN_MU_DIV_Q(function,itchfn) \ 1835 { \ 1836 unsigned i; \ 1837 mp_ptr dp, tp, qp, scratch; \ 1838 double t; \ 1839 mp_size_t itch; \ 1840 TMP_DECL; \ 1841 \ 1842 SPEED_RESTRICT_COND (s->size >= 2); \ 1843 \ 1844 itch = itchfn (2 * s->size, s->size, 0); \ 1845 TMP_MARK; \ 1846 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 1847 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \ 1848 SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp); \ 1849 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \ 1850 \ 1851 MPN_COPY (tp, s->xp, s->size); \ 1852 MPN_COPY (tp+s->size, s->xp, s->size); \ 1853 \ 1854 /* normalize the data */ \ 1855 dp[s->size-1] |= GMP_NUMB_HIGHBIT; \ 1856 tp[2*s->size-1] = dp[s->size-1] - 1; \ 1857 \ 1858 speed_operand_dst (s, qp, s->size); \ 1859 speed_operand_src (s, tp, 2 * s->size); \ 1860 speed_operand_src (s, dp, s->size); \ 1861 speed_operand_dst (s, scratch, itch); \ 1862 speed_cache_fill (s); \ 1863 \ 1864 speed_starttime (); \ 1865 i = s->reps; \ 1866 do { \ 1867 function (qp, tp, 2 * s->size, dp, s->size, scratch); \ 1868 } while (--i != 0); \ 1869 t = speed_endtime (); \ 1870 \ 1871 TMP_FREE; \ 1872 return t; \ 1873 } 1874 #define SPEED_ROUTINE_MPN_MU_DIV_QR(function,itchfn) \ 1875 { \ 1876 unsigned i; \ 1877 mp_ptr dp, tp, qp, rp, scratch; \ 1878 double t; \ 1879 mp_size_t size1, itch; \ 1880 TMP_DECL; \ 1881 \ 1882 size1 = (s->r == 0 ? 2 * s->size : s->r); \ 1883 \ 1884 SPEED_RESTRICT_COND (s->size >= 2); \ 1885 SPEED_RESTRICT_COND (size1 >= s->size); \ 1886 \ 1887 itch = itchfn (size1, s->size, 0); \ 1888 TMP_MARK; \ 1889 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 1890 SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \ 1891 SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp); \ 1892 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \ 1893 SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */ \ 1894 \ 1895 /* we don't fill in dividend completely when size1 > s->size */ \ 1896 MPN_COPY (tp, s->xp, s->size); \ 1897 MPN_COPY (tp + size1 - s->size, s->xp, s->size); \ 1898 \ 1899 MPN_COPY (dp, s->yp, s->size); \ 1900 \ 1901 /* normalize the data */ \ 1902 dp[s->size-1] |= GMP_NUMB_HIGHBIT; \ 1903 tp[size1 - 1] = dp[s->size - 1] - 1; \ 1904 \ 1905 speed_operand_dst (s, qp, size1 - s->size); \ 1906 speed_operand_dst (s, rp, s->size); \ 1907 speed_operand_src (s, tp, size1); \ 1908 speed_operand_src (s, dp, s->size); \ 1909 speed_operand_dst (s, scratch, itch); \ 1910 speed_cache_fill (s); \ 1911 \ 1912 speed_starttime (); \ 1913 i = s->reps; \ 1914 do { \ 1915 function (qp, rp, tp, size1, dp, s->size, scratch); \ 1916 } while (--i != 0); \ 1917 t = speed_endtime (); \ 1918 \ 1919 TMP_FREE; \ 1920 return t; \ 1921 } 1922 #define SPEED_ROUTINE_MPN_MUPI_DIV_QR(function,itchfn) \ 1923 { \ 1924 unsigned i; \ 1925 mp_ptr dp, tp, qp, rp, ip, scratch, tmp; \ 1926 double t; \ 1927 mp_size_t size1, itch; \ 1928 TMP_DECL; \ 1929 \ 1930 size1 = (s->r == 0 ? 2 * s->size : s->r); \ 1931 \ 1932 SPEED_RESTRICT_COND (s->size >= 2); \ 1933 SPEED_RESTRICT_COND (size1 >= s->size); \ 1934 \ 1935 itch = itchfn (size1, s->size, s->size); \ 1936 TMP_MARK; \ 1937 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 1938 SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \ 1939 SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp); \ 1940 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \ 1941 SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */ \ 1942 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_wp2); /* alignment? */ \ 1943 \ 1944 /* we don't fill in dividend completely when size1 > s->size */ \ 1945 MPN_COPY (tp, s->xp, s->size); \ 1946 MPN_COPY (tp + size1 - s->size, s->xp, s->size); \ 1947 \ 1948 MPN_COPY (dp, s->yp, s->size); \ 1949 \ 1950 /* normalize the data */ \ 1951 dp[s->size-1] |= GMP_NUMB_HIGHBIT; \ 1952 tp[size1 - 1] = dp[s->size-1] - 1; \ 1953 \ 1954 tmp = TMP_ALLOC_LIMBS (mpn_invert_itch (s->size)); \ 1955 mpn_invert (ip, dp, s->size, tmp); \ 1956 \ 1957 speed_operand_dst (s, qp, size1 - s->size); \ 1958 speed_operand_dst (s, rp, s->size); \ 1959 speed_operand_src (s, tp, size1); \ 1960 speed_operand_src (s, dp, s->size); \ 1961 speed_operand_src (s, ip, s->size); \ 1962 speed_operand_dst (s, scratch, itch); \ 1963 speed_cache_fill (s); \ 1964 \ 1965 speed_starttime (); \ 1966 i = s->reps; \ 1967 do { \ 1968 function (qp, rp, tp, size1, dp, s->size, ip, s->size, scratch); \ 1969 } while (--i != 0); \ 1970 t = speed_endtime (); \ 1971 \ 1972 TMP_FREE; \ 1973 return t; \ 1974 } 1975 1976 #define SPEED_ROUTINE_MPN_PI1_BDIV_QR(function) \ 1977 { \ 1978 unsigned i; \ 1979 mp_ptr dp, tp, ap, qp; \ 1980 mp_limb_t inv; \ 1981 double t; \ 1982 TMP_DECL; \ 1983 \ 1984 SPEED_RESTRICT_COND (s->size >= 1); \ 1985 \ 1986 TMP_MARK; \ 1987 SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size, s->align_xp); \ 1988 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 1989 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \ 1990 SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size, s->align_wp2); \ 1991 \ 1992 MPN_COPY (ap, s->xp, s->size); \ 1993 MPN_COPY (ap+s->size, s->xp, s->size); \ 1994 \ 1995 /* divisor must be odd */ \ 1996 MPN_COPY (dp, s->yp, s->size); \ 1997 dp[0] |= 1; \ 1998 binvert_limb (inv, dp[0]); \ 1999 inv = -inv; \ 2000 \ 2001 speed_operand_src (s, ap, 2*s->size); \ 2002 speed_operand_dst (s, tp, 2*s->size); \ 2003 speed_operand_src (s, dp, s->size); \ 2004 speed_operand_dst (s, qp, s->size); \ 2005 speed_cache_fill (s); \ 2006 \ 2007 speed_starttime (); \ 2008 i = s->reps; \ 2009 do { \ 2010 MPN_COPY (tp, ap, 2*s->size); \ 2011 function (qp, tp, 2*s->size, dp, s->size, inv); \ 2012 } while (--i != 0); \ 2013 t = speed_endtime (); \ 2014 \ 2015 TMP_FREE; \ 2016 return t; \ 2017 } 2018 #define SPEED_ROUTINE_MPN_PI1_BDIV_Q(function) \ 2019 { \ 2020 unsigned i; \ 2021 mp_ptr dp, tp, qp; \ 2022 mp_limb_t inv; \ 2023 double t; \ 2024 TMP_DECL; \ 2025 \ 2026 SPEED_RESTRICT_COND (s->size >= 1); \ 2027 \ 2028 TMP_MARK; \ 2029 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 2030 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \ 2031 SPEED_TMP_ALLOC_LIMBS (tp, s->size, s->align_wp2); \ 2032 \ 2033 /* divisor must be odd */ \ 2034 MPN_COPY (dp, s->yp, s->size); \ 2035 dp[0] |= 1; \ 2036 binvert_limb (inv, dp[0]); \ 2037 inv = -inv; \ 2038 \ 2039 speed_operand_src (s, s->xp, s->size); \ 2040 speed_operand_dst (s, tp, s->size); \ 2041 speed_operand_src (s, dp, s->size); \ 2042 speed_operand_dst (s, qp, s->size); \ 2043 speed_cache_fill (s); \ 2044 \ 2045 speed_starttime (); \ 2046 i = s->reps; \ 2047 do { \ 2048 MPN_COPY (tp, s->xp, s->size); \ 2049 function (qp, tp, s->size, dp, s->size, inv); \ 2050 } while (--i != 0); \ 2051 t = speed_endtime (); \ 2052 \ 2053 TMP_FREE; \ 2054 return t; \ 2055 } 2056 #define SPEED_ROUTINE_MPN_PI1_BDIV_R(function) \ 2057 { \ 2058 unsigned i; \ 2059 mp_ptr dp, tp, ap; \ 2060 mp_limb_t inv; \ 2061 double t; \ 2062 TMP_DECL; \ 2063 \ 2064 SPEED_RESTRICT_COND (s->size >= 1); \ 2065 \ 2066 TMP_MARK; \ 2067 SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size, s->align_xp); \ 2068 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 2069 SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size, s->align_wp2); \ 2070 \ 2071 MPN_COPY (ap, s->xp, s->size); \ 2072 MPN_COPY (ap+s->size, s->xp, s->size); \ 2073 \ 2074 /* divisor must be odd */ \ 2075 MPN_COPY (dp, s->yp, s->size); \ 2076 dp[0] |= 1; \ 2077 binvert_limb (inv, dp[0]); \ 2078 inv = -inv; \ 2079 \ 2080 speed_operand_src (s, ap, 2*s->size); \ 2081 speed_operand_dst (s, tp, 2*s->size); \ 2082 speed_operand_src (s, dp, s->size); \ 2083 speed_cache_fill (s); \ 2084 \ 2085 speed_starttime (); \ 2086 i = s->reps; \ 2087 do { \ 2088 MPN_COPY (tp, ap, 2*s->size); \ 2089 function (tp, 2*s->size, dp, s->size, inv); \ 2090 } while (--i != 0); \ 2091 t = speed_endtime (); \ 2092 \ 2093 TMP_FREE; \ 2094 return t; \ 2095 } 2096 #define SPEED_ROUTINE_MPN_MU_BDIV_Q(function,itchfn) \ 2097 { \ 2098 unsigned i; \ 2099 mp_ptr dp, qp, scratch; \ 2100 double t; \ 2101 mp_size_t itch; \ 2102 TMP_DECL; \ 2103 \ 2104 SPEED_RESTRICT_COND (s->size >= 2); \ 2105 \ 2106 itch = itchfn (s->size, s->size); \ 2107 TMP_MARK; \ 2108 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 2109 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \ 2110 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \ 2111 \ 2112 /* divisor must be odd */ \ 2113 MPN_COPY (dp, s->yp, s->size); \ 2114 dp[0] |= 1; \ 2115 \ 2116 speed_operand_dst (s, qp, s->size); \ 2117 speed_operand_src (s, s->xp, s->size); \ 2118 speed_operand_src (s, dp, s->size); \ 2119 speed_operand_dst (s, scratch, itch); \ 2120 speed_cache_fill (s); \ 2121 \ 2122 speed_starttime (); \ 2123 i = s->reps; \ 2124 do { \ 2125 function (qp, s->xp, s->size, dp, s->size, scratch); \ 2126 } while (--i != 0); \ 2127 t = speed_endtime (); \ 2128 \ 2129 TMP_FREE; \ 2130 return t; \ 2131 } 2132 #define SPEED_ROUTINE_MPN_MU_BDIV_QR(function,itchfn) \ 2133 { \ 2134 unsigned i; \ 2135 mp_ptr dp, tp, qp, rp, scratch; \ 2136 double t; \ 2137 mp_size_t itch; \ 2138 TMP_DECL; \ 2139 \ 2140 SPEED_RESTRICT_COND (s->size >= 2); \ 2141 \ 2142 itch = itchfn (2 * s->size, s->size); \ 2143 TMP_MARK; \ 2144 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 2145 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \ 2146 SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp); \ 2147 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \ 2148 SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */ \ 2149 \ 2150 MPN_COPY (tp, s->xp, s->size); \ 2151 MPN_COPY (tp+s->size, s->xp, s->size); \ 2152 \ 2153 /* divisor must be odd */ \ 2154 MPN_COPY (dp, s->yp, s->size); \ 2155 dp[0] |= 1; \ 2156 \ 2157 speed_operand_dst (s, qp, s->size); \ 2158 speed_operand_dst (s, rp, s->size); \ 2159 speed_operand_src (s, tp, 2 * s->size); \ 2160 speed_operand_src (s, dp, s->size); \ 2161 speed_operand_dst (s, scratch, itch); \ 2162 speed_cache_fill (s); \ 2163 \ 2164 speed_starttime (); \ 2165 i = s->reps; \ 2166 do { \ 2167 function (qp, rp, tp, 2 * s->size, dp, s->size, scratch); \ 2168 } while (--i != 0); \ 2169 t = speed_endtime (); \ 2170 \ 2171 TMP_FREE; \ 2172 return t; \ 2173 } 2174 2175 #define SPEED_ROUTINE_MPN_BROOT(function) \ 2176 { \ 2177 SPEED_RESTRICT_COND (s->r & 1); \ 2178 s->xp[0] |= 1; \ 2179 SPEED_ROUTINE_MPN_UNARY_1_CALL \ 2180 ((*function) (wp, s->xp, s->size, s->r)); \ 2181 } 2182 2183 #define SPEED_ROUTINE_MPN_BROOTINV(function, itch) \ 2184 { \ 2185 mp_ptr wp, tp; \ 2186 unsigned i; \ 2187 double t; \ 2188 TMP_DECL; \ 2189 TMP_MARK; \ 2190 SPEED_RESTRICT_COND (s->size >= 1); \ 2191 SPEED_RESTRICT_COND (s->r & 1); \ 2192 wp = TMP_ALLOC_LIMBS (s->size); \ 2193 tp = TMP_ALLOC_LIMBS ( (itch)); \ 2194 s->xp[0] |= 1; \ 2195 \ 2196 speed_operand_src (s, s->xp, s->size); \ 2197 speed_operand_dst (s, wp, s->size); \ 2198 speed_cache_fill (s); \ 2199 \ 2200 speed_starttime (); \ 2201 i = s->reps; \ 2202 do \ 2203 (*function) (wp, s->xp, s->size, s->r, tp); \ 2204 while (--i != 0); \ 2205 t = speed_endtime (); \ 2206 \ 2207 TMP_FREE; \ 2208 return t; \ 2209 } 2210 2211 #define SPEED_ROUTINE_MPN_INVERT(function,itchfn) \ 2212 { \ 2213 long i; \ 2214 mp_ptr up, tp, ip; \ 2215 double t; \ 2216 TMP_DECL; \ 2217 \ 2218 SPEED_RESTRICT_COND (s->size >= 1); \ 2219 \ 2220 TMP_MARK; \ 2221 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \ 2222 SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \ 2223 SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \ 2224 \ 2225 MPN_COPY (up, s->xp, s->size); \ 2226 \ 2227 /* normalize the data */ \ 2228 up[s->size-1] |= GMP_NUMB_HIGHBIT; \ 2229 \ 2230 speed_operand_src (s, up, s->size); \ 2231 speed_operand_dst (s, tp, s->size); \ 2232 speed_operand_dst (s, ip, s->size); \ 2233 speed_cache_fill (s); \ 2234 \ 2235 speed_starttime (); \ 2236 i = s->reps; \ 2237 do \ 2238 function (ip, up, s->size, tp); \ 2239 while (--i != 0); \ 2240 t = speed_endtime (); \ 2241 \ 2242 TMP_FREE; \ 2243 return t; \ 2244 } 2245 2246 #define SPEED_ROUTINE_MPN_INVERTAPPR(function,itchfn) \ 2247 { \ 2248 long i; \ 2249 mp_ptr up, tp, ip; \ 2250 double t; \ 2251 TMP_DECL; \ 2252 \ 2253 SPEED_RESTRICT_COND (s->size >= 1); \ 2254 \ 2255 TMP_MARK; \ 2256 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \ 2257 SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \ 2258 SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \ 2259 \ 2260 MPN_COPY (up, s->xp, s->size); \ 2261 \ 2262 /* normalize the data */ \ 2263 up[s->size-1] |= GMP_NUMB_HIGHBIT; \ 2264 \ 2265 speed_operand_src (s, up, s->size); \ 2266 speed_operand_dst (s, tp, s->size); \ 2267 speed_operand_dst (s, ip, s->size); \ 2268 speed_cache_fill (s); \ 2269 \ 2270 speed_starttime (); \ 2271 i = s->reps; \ 2272 do \ 2273 function (ip, up, s->size, tp); \ 2274 while (--i != 0); \ 2275 t = speed_endtime (); \ 2276 \ 2277 TMP_FREE; \ 2278 return t; \ 2279 } 2280 2281 #define SPEED_ROUTINE_MPN_NI_INVERTAPPR(function,itchfn) \ 2282 { \ 2283 long i; \ 2284 mp_ptr up, tp, ip; \ 2285 double t; \ 2286 TMP_DECL; \ 2287 \ 2288 SPEED_RESTRICT_COND (s->size >= 3); \ 2289 \ 2290 TMP_MARK; \ 2291 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \ 2292 SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \ 2293 SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \ 2294 \ 2295 MPN_COPY (up, s->xp, s->size); \ 2296 \ 2297 /* normalize the data */ \ 2298 up[s->size-1] |= GMP_NUMB_HIGHBIT; \ 2299 \ 2300 speed_operand_src (s, up, s->size); \ 2301 speed_operand_dst (s, tp, s->size); \ 2302 speed_operand_dst (s, ip, s->size); \ 2303 speed_cache_fill (s); \ 2304 \ 2305 speed_starttime (); \ 2306 i = s->reps; \ 2307 do \ 2308 function (ip, up, s->size, tp); \ 2309 while (--i != 0); \ 2310 t = speed_endtime (); \ 2311 \ 2312 TMP_FREE; \ 2313 return t; \ 2314 } 2315 2316 #define SPEED_ROUTINE_MPN_BINVERT(function,itchfn) \ 2317 { \ 2318 long i; \ 2319 mp_ptr up, tp, ip; \ 2320 double t; \ 2321 TMP_DECL; \ 2322 \ 2323 SPEED_RESTRICT_COND (s->size >= 1); \ 2324 \ 2325 TMP_MARK; \ 2326 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \ 2327 SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \ 2328 SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \ 2329 \ 2330 MPN_COPY (up, s->xp, s->size); \ 2331 \ 2332 /* normalize the data */ \ 2333 up[0] |= 1; \ 2334 \ 2335 speed_operand_src (s, up, s->size); \ 2336 speed_operand_dst (s, tp, s->size); \ 2337 speed_operand_dst (s, ip, s->size); \ 2338 speed_cache_fill (s); \ 2339 \ 2340 speed_starttime (); \ 2341 i = s->reps; \ 2342 do \ 2343 function (ip, up, s->size, tp); \ 2344 while (--i != 0); \ 2345 t = speed_endtime (); \ 2346 \ 2347 TMP_FREE; \ 2348 return t; \ 2349 } 2350 2351 #define SPEED_ROUTINE_MPN_SEC_INVERT(function,itchfn) \ 2352 { \ 2353 long i; \ 2354 mp_ptr up, mp, tp, ip; \ 2355 double t; \ 2356 TMP_DECL; \ 2357 \ 2358 SPEED_RESTRICT_COND (s->size >= 1); \ 2359 \ 2360 TMP_MARK; \ 2361 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \ 2362 SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \ 2363 SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \ 2364 SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \ 2365 \ 2366 speed_operand_src (s, up, s->size); \ 2367 speed_operand_dst (s, tp, s->size); \ 2368 speed_operand_dst (s, ip, s->size); \ 2369 speed_cache_fill (s); \ 2370 \ 2371 MPN_COPY (mp, s->yp, s->size); \ 2372 /* Must be odd */ \ 2373 mp[0] |= 1; \ 2374 speed_starttime (); \ 2375 i = s->reps; \ 2376 do \ 2377 { \ 2378 MPN_COPY (up, s->xp, s->size); \ 2379 function (ip, up, mp, s->size, 2*s->size*GMP_NUMB_BITS, tp); \ 2380 } \ 2381 while (--i != 0); \ 2382 t = speed_endtime (); \ 2383 \ 2384 TMP_FREE; \ 2385 return t; \ 2386 } 2387 2388 #define SPEED_ROUTINE_REDC_1(function) \ 2389 { \ 2390 unsigned i; \ 2391 mp_ptr cp, mp, tp, ap; \ 2392 mp_limb_t inv; \ 2393 double t; \ 2394 TMP_DECL; \ 2395 \ 2396 SPEED_RESTRICT_COND (s->size >= 1); \ 2397 \ 2398 TMP_MARK; \ 2399 SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \ 2400 SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \ 2401 SPEED_TMP_ALLOC_LIMBS (cp, s->size, s->align_wp); \ 2402 SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \ 2403 \ 2404 MPN_COPY (ap, s->xp, s->size); \ 2405 MPN_COPY (ap+s->size, s->xp, s->size); \ 2406 \ 2407 /* modulus must be odd */ \ 2408 MPN_COPY (mp, s->yp, s->size); \ 2409 mp[0] |= 1; \ 2410 binvert_limb (inv, mp[0]); \ 2411 inv = -inv; \ 2412 \ 2413 speed_operand_src (s, ap, 2*s->size+1); \ 2414 speed_operand_dst (s, tp, 2*s->size+1); \ 2415 speed_operand_src (s, mp, s->size); \ 2416 speed_operand_dst (s, cp, s->size); \ 2417 speed_cache_fill (s); \ 2418 \ 2419 speed_starttime (); \ 2420 i = s->reps; \ 2421 do { \ 2422 MPN_COPY (tp, ap, 2*s->size); \ 2423 function (cp, tp, mp, s->size, inv); \ 2424 } while (--i != 0); \ 2425 t = speed_endtime (); \ 2426 \ 2427 TMP_FREE; \ 2428 return t; \ 2429 } 2430 #define SPEED_ROUTINE_REDC_2(function) \ 2431 { \ 2432 unsigned i; \ 2433 mp_ptr cp, mp, tp, ap; \ 2434 mp_limb_t invp[2]; \ 2435 double t; \ 2436 TMP_DECL; \ 2437 \ 2438 SPEED_RESTRICT_COND (s->size >= 1); \ 2439 \ 2440 TMP_MARK; \ 2441 SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \ 2442 SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \ 2443 SPEED_TMP_ALLOC_LIMBS (cp, s->size, s->align_wp); \ 2444 SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \ 2445 \ 2446 MPN_COPY (ap, s->xp, s->size); \ 2447 MPN_COPY (ap+s->size, s->xp, s->size); \ 2448 \ 2449 /* modulus must be odd */ \ 2450 MPN_COPY (mp, s->yp, s->size); \ 2451 mp[0] |= 1; \ 2452 mpn_binvert (invp, mp, 2, tp); \ 2453 invp[0] = -invp[0]; invp[1] = ~invp[1]; \ 2454 \ 2455 speed_operand_src (s, ap, 2*s->size+1); \ 2456 speed_operand_dst (s, tp, 2*s->size+1); \ 2457 speed_operand_src (s, mp, s->size); \ 2458 speed_operand_dst (s, cp, s->size); \ 2459 speed_cache_fill (s); \ 2460 \ 2461 speed_starttime (); \ 2462 i = s->reps; \ 2463 do { \ 2464 MPN_COPY (tp, ap, 2*s->size); \ 2465 function (cp, tp, mp, s->size, invp); \ 2466 } while (--i != 0); \ 2467 t = speed_endtime (); \ 2468 \ 2469 TMP_FREE; \ 2470 return t; \ 2471 } 2472 #define SPEED_ROUTINE_REDC_N(function) \ 2473 { \ 2474 unsigned i; \ 2475 mp_ptr cp, mp, tp, ap, invp; \ 2476 double t; \ 2477 TMP_DECL; \ 2478 \ 2479 SPEED_RESTRICT_COND (s->size > 8); \ 2480 \ 2481 TMP_MARK; \ 2482 SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \ 2483 SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \ 2484 SPEED_TMP_ALLOC_LIMBS (cp, s->size, s->align_wp); \ 2485 SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \ 2486 SPEED_TMP_ALLOC_LIMBS (invp, s->size, s->align_wp2); /* align? */ \ 2487 \ 2488 MPN_COPY (ap, s->xp, s->size); \ 2489 MPN_COPY (ap+s->size, s->xp, s->size); \ 2490 \ 2491 /* modulus must be odd */ \ 2492 MPN_COPY (mp, s->yp, s->size); \ 2493 mp[0] |= 1; \ 2494 mpn_binvert (invp, mp, s->size, tp); \ 2495 \ 2496 speed_operand_src (s, ap, 2*s->size+1); \ 2497 speed_operand_dst (s, tp, 2*s->size+1); \ 2498 speed_operand_src (s, mp, s->size); \ 2499 speed_operand_dst (s, cp, s->size); \ 2500 speed_cache_fill (s); \ 2501 \ 2502 speed_starttime (); \ 2503 i = s->reps; \ 2504 do { \ 2505 MPN_COPY (tp, ap, 2*s->size); \ 2506 function (cp, tp, mp, s->size, invp); \ 2507 } while (--i != 0); \ 2508 t = speed_endtime (); \ 2509 \ 2510 TMP_FREE; \ 2511 return t; \ 2512 } 2513 2514 2515 #define SPEED_ROUTINE_MPN_POPCOUNT(function) \ 2516 { \ 2517 unsigned i; \ 2518 \ 2519 SPEED_RESTRICT_COND (s->size >= 1); \ 2520 \ 2521 speed_operand_src (s, s->xp, s->size); \ 2522 speed_cache_fill (s); \ 2523 \ 2524 speed_starttime (); \ 2525 i = s->reps; \ 2526 do \ 2527 function (s->xp, s->size); \ 2528 while (--i != 0); \ 2529 \ 2530 return speed_endtime (); \ 2531 } 2532 2533 #define SPEED_ROUTINE_MPN_HAMDIST(function) \ 2534 { \ 2535 unsigned i; \ 2536 \ 2537 SPEED_RESTRICT_COND (s->size >= 1); \ 2538 \ 2539 speed_operand_src (s, s->xp, s->size); \ 2540 speed_operand_src (s, s->yp, s->size); \ 2541 speed_cache_fill (s); \ 2542 \ 2543 speed_starttime (); \ 2544 i = s->reps; \ 2545 do \ 2546 function (s->xp, s->yp, s->size); \ 2547 while (--i != 0); \ 2548 \ 2549 return speed_endtime (); \ 2550 } 2551 2552 2553 #define SPEED_ROUTINE_MPZ_UI(function) \ 2554 { \ 2555 mpz_t z; \ 2556 unsigned i; \ 2557 double t; \ 2558 \ 2559 SPEED_RESTRICT_COND (s->size >= 0); \ 2560 \ 2561 mpz_init (z); \ 2562 \ 2563 speed_starttime (); \ 2564 i = s->reps; \ 2565 do \ 2566 function (z, s->size); \ 2567 while (--i != 0); \ 2568 t = speed_endtime (); \ 2569 \ 2570 mpz_clear (z); \ 2571 return t; \ 2572 } 2573 2574 #define SPEED_ROUTINE_MPZ_FAC_UI(function) SPEED_ROUTINE_MPZ_UI(function) 2575 #define SPEED_ROUTINE_MPZ_FIB_UI(function) SPEED_ROUTINE_MPZ_UI(function) 2576 #define SPEED_ROUTINE_MPZ_LUCNUM_UI(function) SPEED_ROUTINE_MPZ_UI(function) 2577 2578 2579 #define SPEED_ROUTINE_MPZ_2_UI(function) \ 2580 { \ 2581 mpz_t z, z2; \ 2582 unsigned i; \ 2583 double t; \ 2584 \ 2585 SPEED_RESTRICT_COND (s->size >= 0); \ 2586 \ 2587 mpz_init (z); \ 2588 mpz_init (z2); \ 2589 \ 2590 speed_starttime (); \ 2591 i = s->reps; \ 2592 do \ 2593 function (z, z2, s->size); \ 2594 while (--i != 0); \ 2595 t = speed_endtime (); \ 2596 \ 2597 mpz_clear (z); \ 2598 mpz_clear (z2); \ 2599 return t; \ 2600 } 2601 2602 #define SPEED_ROUTINE_MPZ_FIB2_UI(function) SPEED_ROUTINE_MPZ_2_UI(function) 2603 #define SPEED_ROUTINE_MPZ_LUCNUM2_UI(function) SPEED_ROUTINE_MPZ_2_UI(function) 2604 2605 2606 #define SPEED_ROUTINE_MPN_FIB2_UI(function) \ 2607 { \ 2608 mp_ptr fp, f1p; \ 2609 mp_size_t alloc; \ 2610 unsigned i; \ 2611 double t; \ 2612 TMP_DECL; \ 2613 \ 2614 SPEED_RESTRICT_COND (s->size >= 0); \ 2615 \ 2616 TMP_MARK; \ 2617 alloc = MPN_FIB2_SIZE (s->size); \ 2618 SPEED_TMP_ALLOC_LIMBS (fp, alloc, s->align_xp); \ 2619 SPEED_TMP_ALLOC_LIMBS (f1p, alloc, s->align_yp); \ 2620 \ 2621 speed_starttime (); \ 2622 i = s->reps; \ 2623 do \ 2624 function (fp, f1p, s->size); \ 2625 while (--i != 0); \ 2626 t = speed_endtime (); \ 2627 \ 2628 TMP_FREE; \ 2629 return t; \ 2630 } 2631 2632 2633 2634 /* Calculate b^e mod m for random b and m of s->size limbs and random e of 6 2635 limbs. m is forced to odd so that redc can be used. e is limited in 2636 size so the calculation doesn't take too long. */ 2637 #define SPEED_ROUTINE_MPZ_POWM(function) \ 2638 { \ 2639 mpz_t r, b, e, m; \ 2640 unsigned i; \ 2641 double t; \ 2642 \ 2643 SPEED_RESTRICT_COND (s->size >= 1); \ 2644 \ 2645 mpz_init (r); \ 2646 if (s->r < 2) \ 2647 mpz_init_set_n (b, s->xp, s->size); \ 2648 else \ 2649 mpz_init_set_ui (b, s->r); \ 2650 mpz_init_set_n (m, s->yp, s->size); \ 2651 mpz_setbit (m, 0); /* force m to odd */ \ 2652 mpz_init_set_n (e, s->xp_block, 6); \ 2653 \ 2654 speed_starttime (); \ 2655 i = s->reps; \ 2656 do \ 2657 function (r, b, e, m); \ 2658 while (--i != 0); \ 2659 t = speed_endtime (); \ 2660 \ 2661 mpz_clear (r); \ 2662 mpz_clear (b); \ 2663 mpz_clear (e); \ 2664 mpz_clear (m); \ 2665 return t; \ 2666 } 2667 2668 /* (m-2)^0xAAAAAAAA mod m */ 2669 #define SPEED_ROUTINE_MPZ_POWM_UI(function) \ 2670 { \ 2671 mpz_t r, b, m; \ 2672 unsigned long e; \ 2673 unsigned i; \ 2674 double t; \ 2675 \ 2676 SPEED_RESTRICT_COND (s->size >= 1); \ 2677 \ 2678 mpz_init (r); \ 2679 \ 2680 /* force m to odd */ \ 2681 mpz_init (m); \ 2682 mpz_set_n (m, s->xp, s->size); \ 2683 PTR(m)[0] |= 1; \ 2684 \ 2685 e = (~ (unsigned long) 0) / 3; \ 2686 if (s->r != 0) \ 2687 e = s->r; \ 2688 \ 2689 mpz_init_set (b, m); \ 2690 mpz_sub_ui (b, b, 2); \ 2691 /* printf ("%X\n", mpz_get_ui(m)); */ \ 2692 i = s->reps; \ 2693 speed_starttime (); \ 2694 do \ 2695 function (r, b, e, m); \ 2696 while (--i != 0); \ 2697 t = speed_endtime (); \ 2698 \ 2699 mpz_clear (r); \ 2700 mpz_clear (b); \ 2701 mpz_clear (m); \ 2702 return t; \ 2703 } 2704 2705 2706 #define SPEED_ROUTINE_MPN_ADDSUB_CALL(call) \ 2707 { \ 2708 mp_ptr wp, wp2, xp, yp; \ 2709 unsigned i; \ 2710 double t; \ 2711 TMP_DECL; \ 2712 \ 2713 SPEED_RESTRICT_COND (s->size >= 0); \ 2714 \ 2715 TMP_MARK; \ 2716 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 2717 SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2); \ 2718 xp = s->xp; \ 2719 yp = s->yp; \ 2720 \ 2721 if (s->r == 0) ; \ 2722 else if (s->r == 1) { xp = wp; } \ 2723 else if (s->r == 2) { yp = wp2; } \ 2724 else if (s->r == 3) { xp = wp; yp = wp2; } \ 2725 else if (s->r == 4) { xp = wp2; yp = wp; } \ 2726 else { \ 2727 TMP_FREE; \ 2728 return -1.0; \ 2729 } \ 2730 if (xp != s->xp) MPN_COPY (xp, s->xp, s->size); \ 2731 if (yp != s->yp) MPN_COPY (yp, s->yp, s->size); \ 2732 \ 2733 speed_operand_src (s, xp, s->size); \ 2734 speed_operand_src (s, yp, s->size); \ 2735 speed_operand_dst (s, wp, s->size); \ 2736 speed_operand_dst (s, wp2, s->size); \ 2737 speed_cache_fill (s); \ 2738 \ 2739 speed_starttime (); \ 2740 i = s->reps; \ 2741 do \ 2742 call; \ 2743 while (--i != 0); \ 2744 t = speed_endtime (); \ 2745 \ 2746 TMP_FREE; \ 2747 return t; \ 2748 } 2749 2750 #define SPEED_ROUTINE_MPN_ADDSUB_N(function) \ 2751 SPEED_ROUTINE_MPN_ADDSUB_CALL \ 2752 (function (wp, wp2, xp, yp, s->size)); 2753 2754 #define SPEED_ROUTINE_MPN_ADDSUB_NC(function) \ 2755 SPEED_ROUTINE_MPN_ADDSUB_CALL \ 2756 (function (wp, wp2, xp, yp, s->size, 0)); 2757 2758 2759 /* Doing an Nx1 gcd with the given r. */ 2760 #define SPEED_ROUTINE_MPN_GCD_1N(function) \ 2761 { \ 2762 mp_ptr xp; \ 2763 unsigned i; \ 2764 double t; \ 2765 TMP_DECL; \ 2766 \ 2767 SPEED_RESTRICT_COND (s->size >= 1); \ 2768 SPEED_RESTRICT_COND (s->r != 0); \ 2769 \ 2770 TMP_MARK; \ 2771 SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp); \ 2772 MPN_COPY (xp, s->xp, s->size); \ 2773 xp[0] |= refmpn_zero_p (xp, s->size); \ 2774 \ 2775 speed_operand_src (s, s->xp, s->size); \ 2776 speed_cache_fill (s); \ 2777 \ 2778 speed_starttime (); \ 2779 i = s->reps; \ 2780 do \ 2781 function (xp, s->size, s->r); \ 2782 while (--i != 0); \ 2783 t = speed_endtime (); \ 2784 \ 2785 TMP_FREE; \ 2786 return t; \ 2787 } 2788 2789 2790 /* SPEED_BLOCK_SIZE many one GCDs of s->size bits each. */ 2791 2792 #define SPEED_ROUTINE_MPN_GCD_1_CALL(setup, call) \ 2793 { \ 2794 unsigned i, j; \ 2795 mp_ptr px, py; \ 2796 mp_limb_t x_mask, y_mask; \ 2797 double t; \ 2798 TMP_DECL; \ 2799 \ 2800 SPEED_RESTRICT_COND (s->size >= 1); \ 2801 SPEED_RESTRICT_COND (s->size <= mp_bits_per_limb); \ 2802 \ 2803 TMP_MARK; \ 2804 SPEED_TMP_ALLOC_LIMBS (px, SPEED_BLOCK_SIZE, s->align_xp); \ 2805 SPEED_TMP_ALLOC_LIMBS (py, SPEED_BLOCK_SIZE, s->align_yp); \ 2806 MPN_COPY (px, s->xp_block, SPEED_BLOCK_SIZE); \ 2807 MPN_COPY (py, s->yp_block, SPEED_BLOCK_SIZE); \ 2808 \ 2809 x_mask = MP_LIMB_T_LOWBITMASK (s->size); \ 2810 y_mask = MP_LIMB_T_LOWBITMASK (s->r != 0 ? s->r : s->size); \ 2811 for (i = 0; i < SPEED_BLOCK_SIZE; i++) \ 2812 { \ 2813 px[i] &= x_mask; px[i] += (px[i] == 0); \ 2814 py[i] &= y_mask; py[i] += (py[i] == 0); \ 2815 setup; \ 2816 } \ 2817 \ 2818 speed_operand_src (s, px, SPEED_BLOCK_SIZE); \ 2819 speed_operand_src (s, py, SPEED_BLOCK_SIZE); \ 2820 speed_cache_fill (s); \ 2821 \ 2822 speed_starttime (); \ 2823 i = s->reps; \ 2824 do \ 2825 { \ 2826 j = SPEED_BLOCK_SIZE; \ 2827 do \ 2828 { \ 2829 call; \ 2830 } \ 2831 while (--j != 0); \ 2832 } \ 2833 while (--i != 0); \ 2834 t = speed_endtime (); \ 2835 \ 2836 TMP_FREE; \ 2837 \ 2838 s->time_divisor = SPEED_BLOCK_SIZE; \ 2839 return t; \ 2840 } 2841 2842 #define SPEED_ROUTINE_MPN_GCD_1(function) \ 2843 SPEED_ROUTINE_MPN_GCD_1_CALL(do{}while(0) , function (&px[j-1], 1, py[j-1])) 2844 2845 #define SPEED_ROUTINE_MPN_GCD_11(function) \ 2846 SPEED_ROUTINE_MPN_GCD_1_CALL((px[i] |= 1, py[i] |= 1), \ 2847 function (px[j-1], py[j-1])) 2848 2849 /* Multiply limbs by (B+1). Then we get a gcd exceeding one limb, so 2850 we can measure gcd_22 loop only, without gcd_11. */ 2851 #define SPEED_ROUTINE_MPN_GCD_22(function) \ 2852 SPEED_ROUTINE_MPN_GCD_1_CALL((px[i] |= 1, py[i] |= 1), \ 2853 function (px[j-1], px[j-1], py[j-1], py[j-1])) 2854 2855 #define SPEED_ROUTINE_MPN_JACBASE(function) \ 2856 SPEED_ROUTINE_MPN_GCD_1_CALL \ 2857 ({ \ 2858 /* require x<y, y odd, y!=1 */ \ 2859 px[i] %= py[i]; \ 2860 px[i] |= 1; \ 2861 py[i] |= 1; \ 2862 if (py[i]==1) py[i]=3; \ 2863 }, \ 2864 function (px[j-1], py[j-1], 0)) 2865 2866 #define SPEED_ROUTINE_MPN_HGCD2(function) \ 2867 { \ 2868 unsigned i, j; \ 2869 struct hgcd_matrix1 m = {{{0,0},{0,0}}}; \ 2870 double t; \ 2871 mp_limb_t chain; \ 2872 \ 2873 speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE); \ 2874 speed_operand_src (s, s->yp_block, SPEED_BLOCK_SIZE); \ 2875 speed_cache_fill (s); \ 2876 \ 2877 speed_starttime (); \ 2878 i = s->reps; \ 2879 chain = 0; \ 2880 do \ 2881 { \ 2882 for (j = 0; j < SPEED_BLOCK_SIZE; j+= 2) \ 2883 { \ 2884 /* randomized but successively dependent */ \ 2885 function (s->xp_block[j] | GMP_NUMB_HIGHBIT, \ 2886 s->xp_block[j+1] + chain, \ 2887 s->yp_block[j] | GMP_NUMB_HIGHBIT, \ 2888 s->yp_block[j+1], &m); \ 2889 chain += m.u[0][0]; \ 2890 } \ 2891 } \ 2892 while (--i != 0); \ 2893 t = speed_endtime (); \ 2894 \ 2895 /* make sure the compiler won't optimize away chain */ \ 2896 noop_1 (chain); \ 2897 \ 2898 s->time_divisor = SPEED_BLOCK_SIZE / 2; \ 2899 return t; \ 2900 } 2901 2902 #define SPEED_ROUTINE_MPN_HGCD_CALL(func, itchfunc) \ 2903 { \ 2904 mp_size_t hgcd_init_itch, hgcd_itch; \ 2905 mp_ptr ap, bp, wp, tmp1; \ 2906 struct hgcd_matrix hgcd; \ 2907 int res; \ 2908 unsigned i; \ 2909 double t; \ 2910 TMP_DECL; \ 2911 \ 2912 if (s->size < 2) \ 2913 return -1; \ 2914 \ 2915 TMP_MARK; \ 2916 \ 2917 SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp); \ 2918 SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp); \ 2919 \ 2920 s->xp[s->size - 1] |= 1; \ 2921 s->yp[s->size - 1] |= 1; \ 2922 \ 2923 hgcd_init_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size); \ 2924 hgcd_itch = itchfunc (s->size); \ 2925 \ 2926 SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_itch, s->align_wp); \ 2927 SPEED_TMP_ALLOC_LIMBS (wp, hgcd_itch, s->align_wp); \ 2928 \ 2929 speed_operand_src (s, s->xp, s->size); \ 2930 speed_operand_src (s, s->yp, s->size); \ 2931 speed_operand_dst (s, ap, s->size + 1); \ 2932 speed_operand_dst (s, bp, s->size + 1); \ 2933 speed_operand_dst (s, wp, hgcd_itch); \ 2934 speed_operand_dst (s, tmp1, hgcd_init_itch); \ 2935 speed_cache_fill (s); \ 2936 \ 2937 speed_starttime (); \ 2938 i = s->reps; \ 2939 do \ 2940 { \ 2941 MPN_COPY (ap, s->xp, s->size); \ 2942 MPN_COPY (bp, s->yp, s->size); \ 2943 mpn_hgcd_matrix_init (&hgcd, s->size, tmp1); \ 2944 res = func (ap, bp, s->size, &hgcd, wp); \ 2945 } \ 2946 while (--i != 0); \ 2947 t = speed_endtime (); \ 2948 TMP_FREE; \ 2949 return t; \ 2950 } 2951 2952 #define SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL(func, itchfunc) \ 2953 { \ 2954 mp_size_t hgcd_init_itch, hgcd_step_itch; \ 2955 mp_ptr ap, bp, wp, tmp1; \ 2956 struct hgcd_matrix hgcd; \ 2957 mp_size_t p = s->size/2; \ 2958 int res; \ 2959 unsigned i; \ 2960 double t; \ 2961 TMP_DECL; \ 2962 \ 2963 if (s->size < 2) \ 2964 return -1; \ 2965 \ 2966 TMP_MARK; \ 2967 \ 2968 SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp); \ 2969 SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp); \ 2970 \ 2971 s->xp[s->size - 1] |= 1; \ 2972 s->yp[s->size - 1] |= 1; \ 2973 \ 2974 hgcd_init_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size); \ 2975 hgcd_step_itch = itchfunc (s->size, p); \ 2976 \ 2977 SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_itch, s->align_wp); \ 2978 SPEED_TMP_ALLOC_LIMBS (wp, hgcd_step_itch, s->align_wp); \ 2979 \ 2980 speed_operand_src (s, s->xp, s->size); \ 2981 speed_operand_src (s, s->yp, s->size); \ 2982 speed_operand_dst (s, ap, s->size + 1); \ 2983 speed_operand_dst (s, bp, s->size + 1); \ 2984 speed_operand_dst (s, wp, hgcd_step_itch); \ 2985 speed_operand_dst (s, tmp1, hgcd_init_itch); \ 2986 speed_cache_fill (s); \ 2987 \ 2988 speed_starttime (); \ 2989 i = s->reps; \ 2990 do \ 2991 { \ 2992 MPN_COPY (ap, s->xp, s->size); \ 2993 MPN_COPY (bp, s->yp, s->size); \ 2994 mpn_hgcd_matrix_init (&hgcd, s->size, tmp1); \ 2995 res = func (&hgcd, ap, bp, s->size, p, wp); \ 2996 } \ 2997 while (--i != 0); \ 2998 t = speed_endtime (); \ 2999 TMP_FREE; \ 3000 return t; \ 3001 } 3002 3003 /* Run some GCDs of s->size limbs each. The number of different data values 3004 is decreased as s->size**2, since GCD is a quadratic algorithm. 3005 SPEED_ROUTINE_MPN_GCD runs more times than SPEED_ROUTINE_MPN_GCDEXT 3006 though, because the plain gcd is about twice as fast as gcdext. */ 3007 3008 #define SPEED_ROUTINE_MPN_GCD_CALL(datafactor, call) \ 3009 { \ 3010 unsigned i; \ 3011 mp_size_t j, pieces, psize; \ 3012 mp_ptr wp, wp2, xtmp, ytmp, px, py; \ 3013 double t; \ 3014 TMP_DECL; \ 3015 \ 3016 SPEED_RESTRICT_COND (s->size >= 1); \ 3017 \ 3018 TMP_MARK; \ 3019 SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp); \ 3020 SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp); \ 3021 SPEED_TMP_ALLOC_LIMBS (wp, s->size+1, s->align_wp); \ 3022 SPEED_TMP_ALLOC_LIMBS (wp2, s->size+1, s->align_wp2); \ 3023 \ 3024 pieces = SPEED_BLOCK_SIZE * datafactor / s->size / s->size; \ 3025 pieces = MIN (pieces, SPEED_BLOCK_SIZE / s->size); \ 3026 pieces = MAX (pieces, 1); \ 3027 \ 3028 psize = pieces * s->size; \ 3029 px = TMP_ALLOC_LIMBS (psize); \ 3030 py = TMP_ALLOC_LIMBS (psize); \ 3031 MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize); \ 3032 MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize); \ 3033 \ 3034 /* Requirements: x >= y, y must be odd, high limbs != 0. \ 3035 No need to ensure random numbers are really great. */ \ 3036 for (j = 0; j < pieces; j++) \ 3037 { \ 3038 mp_ptr x = px + j * s->size; \ 3039 mp_ptr y = py + j * s->size; \ 3040 if (x[s->size - 1] == 0) x[s->size - 1] = 1; \ 3041 if (y[s->size - 1] == 0) y[s->size - 1] = 1; \ 3042 \ 3043 if (x[s->size - 1] < y[s->size - 1]) \ 3044 MP_LIMB_T_SWAP (x[s->size - 1], y[s->size - 1]); \ 3045 else if (x[s->size - 1] == y[s->size - 1]) \ 3046 { \ 3047 x[s->size - 1] = 2; \ 3048 y[s->size - 1] = 1; \ 3049 } \ 3050 y[0] |= 1; \ 3051 } \ 3052 \ 3053 speed_operand_src (s, px, psize); \ 3054 speed_operand_src (s, py, psize); \ 3055 speed_operand_dst (s, xtmp, s->size); \ 3056 speed_operand_dst (s, ytmp, s->size); \ 3057 speed_operand_dst (s, wp, s->size); \ 3058 speed_cache_fill (s); \ 3059 \ 3060 speed_starttime (); \ 3061 i = s->reps; \ 3062 do \ 3063 { \ 3064 j = pieces; \ 3065 do \ 3066 { \ 3067 MPN_COPY (xtmp, px+(j - 1)*s->size, s->size); \ 3068 MPN_COPY (ytmp, py+(j - 1)*s->size, s->size); \ 3069 call; \ 3070 } \ 3071 while (--j != 0); \ 3072 } \ 3073 while (--i != 0); \ 3074 t = speed_endtime (); \ 3075 \ 3076 TMP_FREE; \ 3077 \ 3078 s->time_divisor = pieces; \ 3079 return t; \ 3080 } 3081 3082 #define SPEED_ROUTINE_MPN_GCD(function) \ 3083 SPEED_ROUTINE_MPN_GCD_CALL (8, function (wp, xtmp, s->size, ytmp, s->size)) 3084 3085 #define SPEED_ROUTINE_MPN_GCDEXT(function) \ 3086 SPEED_ROUTINE_MPN_GCD_CALL \ 3087 (4, { mp_size_t wp2size; \ 3088 function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); }) 3089 3090 3091 #define SPEED_ROUTINE_MPN_GCDEXT_ONE(function) \ 3092 { \ 3093 unsigned i; \ 3094 mp_size_t j, pieces, psize, wp2size; \ 3095 mp_ptr wp, wp2, xtmp, ytmp, px, py; \ 3096 double t; \ 3097 TMP_DECL; \ 3098 \ 3099 SPEED_RESTRICT_COND (s->size >= 1); \ 3100 \ 3101 TMP_MARK; \ 3102 \ 3103 SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp); \ 3104 SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp); \ 3105 MPN_COPY (xtmp, s->xp, s->size); \ 3106 MPN_COPY (ytmp, s->yp, s->size); \ 3107 \ 3108 SPEED_TMP_ALLOC_LIMBS (wp, s->size+1, s->align_wp); \ 3109 SPEED_TMP_ALLOC_LIMBS (wp2, s->size+1, s->align_wp2); \ 3110 \ 3111 pieces = SPEED_BLOCK_SIZE / 3; \ 3112 psize = 3 * pieces; \ 3113 px = TMP_ALLOC_LIMBS (psize); \ 3114 py = TMP_ALLOC_LIMBS (psize); \ 3115 MPN_COPY (px, s->xp_block, psize); \ 3116 MPN_COPY (py, s->yp_block, psize); \ 3117 \ 3118 /* x must have at least as many bits as y, \ 3119 high limbs must be non-zero */ \ 3120 for (j = 0; j < pieces; j++) \ 3121 { \ 3122 mp_ptr x = px+3*j; \ 3123 mp_ptr y = py+3*j; \ 3124 x[2] += (x[2] == 0); \ 3125 y[2] += (y[2] == 0); \ 3126 if (x[2] < y[2]) \ 3127 MP_LIMB_T_SWAP (x[2], y[2]); \ 3128 } \ 3129 \ 3130 speed_operand_src (s, px, psize); \ 3131 speed_operand_src (s, py, psize); \ 3132 speed_operand_dst (s, xtmp, s->size); \ 3133 speed_operand_dst (s, ytmp, s->size); \ 3134 speed_operand_dst (s, wp, s->size); \ 3135 speed_cache_fill (s); \ 3136 \ 3137 speed_starttime (); \ 3138 i = s->reps; \ 3139 do \ 3140 { \ 3141 mp_ptr x = px; \ 3142 mp_ptr y = py; \ 3143 mp_ptr xth = &xtmp[s->size-3]; \ 3144 mp_ptr yth = &ytmp[s->size-3]; \ 3145 j = pieces; \ 3146 do \ 3147 { \ 3148 xth[0] = x[0], xth[1] = x[1], xth[2] = x[2]; \ 3149 yth[0] = y[0], yth[1] = y[1], yth[2] = y[2]; \ 3150 \ 3151 ytmp[0] |= 1; /* y must be odd, */ \ 3152 \ 3153 function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); \ 3154 \ 3155 x += 3; \ 3156 y += 3; \ 3157 } \ 3158 while (--j != 0); \ 3159 } \ 3160 while (--i != 0); \ 3161 t = speed_endtime (); \ 3162 \ 3163 TMP_FREE; \ 3164 \ 3165 s->time_divisor = pieces; \ 3166 return t; \ 3167 } 3168 3169 /* Calculate nextprime(n) for random n of s->size bits (not limbs). */ 3170 #define SPEED_ROUTINE_MPZ_NEXTPRIME(function) \ 3171 { \ 3172 unsigned i, j; \ 3173 mpz_t wp, n; \ 3174 double t; \ 3175 \ 3176 SPEED_RESTRICT_COND (s->size >= 10); \ 3177 \ 3178 mpz_init (wp); \ 3179 mpz_init_set_n (n, s->xp, s->size); \ 3180 /* limit to s->size bits, as this function is very slow */ \ 3181 mpz_tdiv_r_2exp (n, n, s->size); \ 3182 /* set high bits so operand and result are genaral s->size bits */ \ 3183 mpz_setbit (n, s->size - 1); \ 3184 mpz_clrbit (n, s->size - 2); \ 3185 \ 3186 speed_starttime (); \ 3187 i = s->reps; \ 3188 do \ 3189 { \ 3190 /* nextprime timing is variable, so average over many calls */ \ 3191 j = SPEED_BLOCK_SIZE - 1; \ 3192 /* starts on random, after measures prime to next prime */ \ 3193 function (wp, n); \ 3194 do \ 3195 { \ 3196 function (wp, wp); \ 3197 } \ 3198 while (--j != 0); \ 3199 } \ 3200 while (--i != 0); \ 3201 t = speed_endtime (); \ 3202 \ 3203 mpz_clear (wp); \ 3204 mpz_clear (n); \ 3205 \ 3206 s->time_divisor = SPEED_BLOCK_SIZE; \ 3207 return t; \ 3208 } 3209 3210 #define SPEED_ROUTINE_MPZ_JACOBI(function) \ 3211 { \ 3212 mpz_t a, b; \ 3213 unsigned i; \ 3214 mp_size_t j, pieces, psize; \ 3215 mp_ptr px, py; \ 3216 double t; \ 3217 TMP_DECL; \ 3218 \ 3219 TMP_MARK; \ 3220 pieces = SPEED_BLOCK_SIZE / MAX (s->size, 1); \ 3221 pieces = MAX (pieces, 1); \ 3222 s->time_divisor = pieces; \ 3223 \ 3224 psize = pieces * s->size; \ 3225 px = TMP_ALLOC_LIMBS (psize); \ 3226 py = TMP_ALLOC_LIMBS (psize); \ 3227 MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize); \ 3228 MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize); \ 3229 \ 3230 for (j = 0; j < pieces; j++) \ 3231 { \ 3232 mp_ptr x = px+j*s->size; \ 3233 mp_ptr y = py+j*s->size; \ 3234 \ 3235 /* y odd */ \ 3236 y[0] |= 1; \ 3237 \ 3238 /* high limbs non-zero */ \ 3239 if (x[s->size-1] == 0) x[s->size-1] = 1; \ 3240 if (y[s->size-1] == 0) y[s->size-1] = 1; \ 3241 } \ 3242 \ 3243 SIZ(a) = s->size; \ 3244 SIZ(b) = s->size; \ 3245 \ 3246 speed_operand_src (s, px, psize); \ 3247 speed_operand_src (s, py, psize); \ 3248 speed_cache_fill (s); \ 3249 \ 3250 speed_starttime (); \ 3251 i = s->reps; \ 3252 do \ 3253 { \ 3254 j = pieces; \ 3255 do \ 3256 { \ 3257 PTR(a) = px+(j-1)*s->size; \ 3258 PTR(b) = py+(j-1)*s->size; \ 3259 function (a, b); \ 3260 } \ 3261 while (--j != 0); \ 3262 } \ 3263 while (--i != 0); \ 3264 t = speed_endtime (); \ 3265 \ 3266 TMP_FREE; \ 3267 return t; \ 3268 } 3269 3270 #define SPEED_ROUTINE_MPN_DIVREM_2(function) \ 3271 { \ 3272 mp_ptr wp, xp; \ 3273 mp_limb_t yp[2]; \ 3274 unsigned i; \ 3275 double t; \ 3276 TMP_DECL; \ 3277 \ 3278 SPEED_RESTRICT_COND (s->size >= 2); \ 3279 \ 3280 TMP_MARK; \ 3281 SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp); \ 3282 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 3283 \ 3284 /* source is destroyed */ \ 3285 MPN_COPY (xp, s->xp, s->size); \ 3286 \ 3287 /* divisor must be normalized */ \ 3288 MPN_COPY (yp, s->yp_block, 2); \ 3289 yp[1] |= GMP_NUMB_HIGHBIT; \ 3290 \ 3291 speed_operand_src (s, xp, s->size); \ 3292 speed_operand_src (s, yp, 2); \ 3293 speed_operand_dst (s, wp, s->size); \ 3294 speed_cache_fill (s); \ 3295 \ 3296 speed_starttime (); \ 3297 i = s->reps; \ 3298 do \ 3299 function (wp, 0, xp, s->size, yp); \ 3300 while (--i != 0); \ 3301 t = speed_endtime (); \ 3302 \ 3303 TMP_FREE; \ 3304 return t; \ 3305 } 3306 3307 #define SPEED_ROUTINE_MPN_DIV_QR_1(function) \ 3308 { \ 3309 mp_ptr wp, xp; \ 3310 mp_limb_t d; \ 3311 mp_limb_t r; \ 3312 unsigned i; \ 3313 double t; \ 3314 TMP_DECL; \ 3315 \ 3316 SPEED_RESTRICT_COND (s->size >= 1); \ 3317 \ 3318 TMP_MARK; \ 3319 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 3320 \ 3321 d = s->r; \ 3322 if (d == 0) \ 3323 d = 1; \ 3324 speed_operand_src (s, s->xp, s->size); \ 3325 speed_operand_dst (s, wp, s->size); \ 3326 speed_cache_fill (s); \ 3327 \ 3328 speed_starttime (); \ 3329 i = s->reps; \ 3330 do \ 3331 r = function (wp, wp+s->size-1, s->xp, s->size, d); \ 3332 while (--i != 0); \ 3333 t = speed_endtime (); \ 3334 \ 3335 TMP_FREE; \ 3336 return t; \ 3337 } 3338 3339 #define SPEED_ROUTINE_MPN_DIV_QR_1N_PI1(function) \ 3340 { \ 3341 mp_ptr wp, xp; \ 3342 mp_limb_t d, dinv; \ 3343 mp_limb_t r; \ 3344 unsigned i; \ 3345 double t; \ 3346 TMP_DECL; \ 3347 \ 3348 SPEED_RESTRICT_COND (s->size >= 1); \ 3349 \ 3350 TMP_MARK; \ 3351 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 3352 \ 3353 d = s->r; \ 3354 /* divisor must be normalized */ \ 3355 SPEED_RESTRICT_COND (d & GMP_NUMB_HIGHBIT); \ 3356 invert_limb (dinv, d); \ 3357 speed_operand_src (s, s->xp, s->size); \ 3358 speed_operand_dst (s, wp, s->size); \ 3359 speed_cache_fill (s); \ 3360 \ 3361 speed_starttime (); \ 3362 i = s->reps; \ 3363 do \ 3364 r = function (wp, s->xp, s->size, 0, d, dinv); \ 3365 while (--i != 0); \ 3366 t = speed_endtime (); \ 3367 \ 3368 TMP_FREE; \ 3369 return t; \ 3370 } 3371 3372 #define SPEED_ROUTINE_MPN_DIV_QR_2(function, norm) \ 3373 { \ 3374 mp_ptr wp, xp; \ 3375 mp_limb_t yp[2]; \ 3376 mp_limb_t rp[2]; \ 3377 unsigned i; \ 3378 double t; \ 3379 TMP_DECL; \ 3380 \ 3381 SPEED_RESTRICT_COND (s->size >= 2); \ 3382 \ 3383 TMP_MARK; \ 3384 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 3385 \ 3386 /* divisor must be normalized */ \ 3387 MPN_COPY (yp, s->yp_block, 2); \ 3388 if (norm) \ 3389 yp[1] |= GMP_NUMB_HIGHBIT; \ 3390 else \ 3391 { \ 3392 yp[1] &= ~GMP_NUMB_HIGHBIT; \ 3393 if (yp[1] == 0) \ 3394 yp[1] = 1; \ 3395 } \ 3396 speed_operand_src (s, s->xp, s->size); \ 3397 speed_operand_src (s, yp, 2); \ 3398 speed_operand_dst (s, wp, s->size); \ 3399 speed_operand_dst (s, rp, 2); \ 3400 speed_cache_fill (s); \ 3401 \ 3402 speed_starttime (); \ 3403 i = s->reps; \ 3404 do \ 3405 function (wp, rp, s->xp, s->size, yp); \ 3406 while (--i != 0); \ 3407 t = speed_endtime (); \ 3408 \ 3409 TMP_FREE; \ 3410 return t; \ 3411 } 3412 3413 #define SPEED_ROUTINE_MODLIMB_INVERT(function) \ 3414 { \ 3415 unsigned i, j; \ 3416 mp_ptr xp; \ 3417 mp_limb_t n = 1; \ 3418 double t; \ 3419 \ 3420 xp = s->xp_block-1; \ 3421 \ 3422 speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE); \ 3423 speed_cache_fill (s); \ 3424 \ 3425 speed_starttime (); \ 3426 i = s->reps; \ 3427 do \ 3428 { \ 3429 j = SPEED_BLOCK_SIZE; \ 3430 do \ 3431 { \ 3432 /* randomized but successively dependent */ \ 3433 n += (xp[j] << 1); \ 3434 \ 3435 function (n, n); \ 3436 } \ 3437 while (--j != 0); \ 3438 } \ 3439 while (--i != 0); \ 3440 t = speed_endtime (); \ 3441 \ 3442 /* make sure the compiler won't optimize away n */ \ 3443 noop_1 (n); \ 3444 \ 3445 s->time_divisor = SPEED_BLOCK_SIZE; \ 3446 return t; \ 3447 } 3448 3449 3450 #define SPEED_ROUTINE_MPN_SQRTROOT_CALL(call) \ 3451 { \ 3452 mp_ptr wp, wp2; \ 3453 unsigned i; \ 3454 double t; \ 3455 TMP_DECL; \ 3456 \ 3457 SPEED_RESTRICT_COND (s->size >= 1); \ 3458 \ 3459 TMP_MARK; \ 3460 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 3461 SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2); \ 3462 \ 3463 speed_operand_src (s, s->xp, s->size); \ 3464 speed_operand_dst (s, wp, s->size); \ 3465 speed_operand_dst (s, wp2, s->size); \ 3466 speed_cache_fill (s); \ 3467 \ 3468 speed_starttime (); \ 3469 i = s->reps; \ 3470 do \ 3471 call; \ 3472 while (--i != 0); \ 3473 t = speed_endtime (); \ 3474 \ 3475 TMP_FREE; \ 3476 return t; \ 3477 } 3478 3479 3480 /* Calculate worst case for perfect_power 3481 Worst case is multiple prime factors larger than trial div limit. */ 3482 #define SPEED_ROUTINE_MPN_PERFECT_POWER(function) \ 3483 { \ 3484 mpz_t r; \ 3485 unsigned i, power; \ 3486 double t; \ 3487 \ 3488 SPEED_RESTRICT_COND (s->size >= 10); \ 3489 \ 3490 mpz_init (r); \ 3491 power = s->size * GMP_NUMB_BITS / 17; \ 3492 mpz_ui_pow_ui(r, (1 << 17) - 1, power - 1); \ 3493 mpz_mul_ui(r, r, (1 << 16) + 1); /* larger than 1000th prime */ \ 3494 \ 3495 speed_starttime (); \ 3496 i = s->reps; \ 3497 do \ 3498 function (PTR(r), SIZ(r)); \ 3499 while (--i != 0); \ 3500 t = speed_endtime (); \ 3501 \ 3502 mpz_clear (r); \ 3503 return t; \ 3504 } 3505 3506 /* Calculate worst case (larger prime) for perfect_square */ 3507 #define SPEED_ROUTINE_MPN_PERFECT_SQUARE(function) \ 3508 { \ 3509 mpz_t r; \ 3510 unsigned i; \ 3511 double t; \ 3512 \ 3513 SPEED_RESTRICT_COND (s->size >= 2); \ 3514 mpz_init_set_n (r, s->xp, s->size / 2); \ 3515 mpz_setbit (r, s->size * GMP_NUMB_BITS / 2 - 1); \ 3516 mpz_mul (r, r, r); \ 3517 \ 3518 speed_starttime (); \ 3519 i = s->reps; \ 3520 do \ 3521 function (PTR(r), SIZ(r)); \ 3522 while (--i != 0); \ 3523 t = speed_endtime (); \ 3524 \ 3525 mpz_clear (r); \ 3526 return t; \ 3527 } 3528 3529 3530 /* s->size controls the number of limbs in the input, s->r is the base, or 3531 decimal by default. */ 3532 #define SPEED_ROUTINE_MPN_GET_STR(function) \ 3533 { \ 3534 unsigned char *wp; \ 3535 mp_size_t wn; \ 3536 mp_ptr xp; \ 3537 int base; \ 3538 unsigned i; \ 3539 double t; \ 3540 TMP_DECL; \ 3541 \ 3542 SPEED_RESTRICT_COND (s->size >= 1); \ 3543 \ 3544 base = s->r == 0 ? 10 : s->r; \ 3545 SPEED_RESTRICT_COND (base >= 2 && base <= 256); \ 3546 \ 3547 TMP_MARK; \ 3548 SPEED_TMP_ALLOC_LIMBS (xp, s->size + 1, s->align_xp); \ 3549 \ 3550 MPN_SIZEINBASE (wn, s->xp, s->size, base); \ 3551 wp = (unsigned char *) TMP_ALLOC (wn); \ 3552 \ 3553 /* use this during development to guard against overflowing wp */ \ 3554 /* \ 3555 MPN_COPY (xp, s->xp, s->size); \ 3556 ASSERT_ALWAYS (mpn_get_str (wp, base, xp, s->size) <= wn); \ 3557 */ \ 3558 \ 3559 speed_operand_src (s, s->xp, s->size); \ 3560 speed_operand_dst (s, xp, s->size); \ 3561 speed_operand_dst (s, (mp_ptr) wp, wn/GMP_LIMB_BYTES); \ 3562 speed_cache_fill (s); \ 3563 \ 3564 speed_starttime (); \ 3565 i = s->reps; \ 3566 do \ 3567 { \ 3568 MPN_COPY (xp, s->xp, s->size); \ 3569 function (wp, base, xp, s->size); \ 3570 } \ 3571 while (--i != 0); \ 3572 t = speed_endtime (); \ 3573 \ 3574 TMP_FREE; \ 3575 return t; \ 3576 } 3577 3578 /* s->size controls the number of digits in the input, s->r is the base, or 3579 decimal by default. */ 3580 #define SPEED_ROUTINE_MPN_SET_STR_CALL(call) \ 3581 { \ 3582 unsigned char *xp; \ 3583 mp_ptr wp; \ 3584 mp_size_t wn; \ 3585 unsigned i; \ 3586 int base; \ 3587 double t; \ 3588 TMP_DECL; \ 3589 \ 3590 SPEED_RESTRICT_COND (s->size >= 1); \ 3591 \ 3592 base = s->r == 0 ? 10 : s->r; \ 3593 SPEED_RESTRICT_COND (base >= 2 && base <= 256); \ 3594 \ 3595 TMP_MARK; \ 3596 \ 3597 xp = (unsigned char *) TMP_ALLOC (s->size); \ 3598 for (i = 0; i < s->size; i++) \ 3599 xp[i] = s->xp[i] % base; \ 3600 \ 3601 LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base); \ 3602 SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp); \ 3603 \ 3604 /* use this during development to check wn is big enough */ \ 3605 /* \ 3606 ASSERT_ALWAYS (mpn_set_str (wp, xp, s->size, base) <= wn); \ 3607 */ \ 3608 \ 3609 speed_operand_src (s, (mp_ptr) xp, s->size/GMP_LIMB_BYTES); \ 3610 speed_operand_dst (s, wp, wn); \ 3611 speed_cache_fill (s); \ 3612 \ 3613 speed_starttime (); \ 3614 i = s->reps; \ 3615 do \ 3616 call; \ 3617 while (--i != 0); \ 3618 t = speed_endtime (); \ 3619 \ 3620 TMP_FREE; \ 3621 return t; \ 3622 } 3623 3624 3625 /* Run an accel gcd find_a() function over various data values. A set of 3626 values is used in case some run particularly fast or slow. The size 3627 parameter is ignored, the amount of data tested is fixed. */ 3628 3629 #define SPEED_ROUTINE_MPN_GCD_FINDA(function) \ 3630 { \ 3631 unsigned i, j; \ 3632 mp_limb_t cp[SPEED_BLOCK_SIZE][2]; \ 3633 double t; \ 3634 TMP_DECL; \ 3635 \ 3636 TMP_MARK; \ 3637 \ 3638 /* low must be odd, high must be non-zero */ \ 3639 for (i = 0; i < SPEED_BLOCK_SIZE; i++) \ 3640 { \ 3641 cp[i][0] = s->xp_block[i] | 1; \ 3642 cp[i][1] = s->yp_block[i] + (s->yp_block[i] == 0); \ 3643 } \ 3644 \ 3645 speed_operand_src (s, &cp[0][0], 2*SPEED_BLOCK_SIZE); \ 3646 speed_cache_fill (s); \ 3647 \ 3648 speed_starttime (); \ 3649 i = s->reps; \ 3650 do \ 3651 { \ 3652 j = SPEED_BLOCK_SIZE; \ 3653 do \ 3654 { \ 3655 function (cp[j-1]); \ 3656 } \ 3657 while (--j != 0); \ 3658 } \ 3659 while (--i != 0); \ 3660 t = speed_endtime (); \ 3661 \ 3662 TMP_FREE; \ 3663 \ 3664 s->time_divisor = SPEED_BLOCK_SIZE; \ 3665 return t; \ 3666 } 3667 3668 3669 /* "call" should do "count_foo_zeros(c,n)". 3670 Give leading=1 if foo is leading zeros, leading=0 for trailing. 3671 Give zero=1 if n=0 is allowed in the call, zero=0 if not. */ 3672 3673 #define SPEED_ROUTINE_COUNT_ZEROS_A(leading, zero) \ 3674 { \ 3675 mp_ptr xp; \ 3676 int i, c; \ 3677 unsigned j; \ 3678 mp_limb_t n; \ 3679 double t; \ 3680 TMP_DECL; \ 3681 \ 3682 TMP_MARK; \ 3683 SPEED_TMP_ALLOC_LIMBS (xp, SPEED_BLOCK_SIZE, s->align_xp); \ 3684 \ 3685 if (! speed_routine_count_zeros_setup (s, xp, leading, zero)) \ 3686 return -1.0; \ 3687 speed_operand_src (s, xp, SPEED_BLOCK_SIZE); \ 3688 speed_cache_fill (s); \ 3689 \ 3690 c = 0; \ 3691 speed_starttime (); \ 3692 j = s->reps; \ 3693 do { \ 3694 for (i = 0; i < SPEED_BLOCK_SIZE; i++) \ 3695 { \ 3696 n = xp[i]; \ 3697 n ^= c; \ 3698 3699 #define SPEED_ROUTINE_COUNT_ZEROS_B() \ 3700 } \ 3701 } while (--j != 0); \ 3702 t = speed_endtime (); \ 3703 \ 3704 /* don't let c go dead */ \ 3705 noop_1 (c); \ 3706 \ 3707 s->time_divisor = SPEED_BLOCK_SIZE; \ 3708 \ 3709 TMP_FREE; \ 3710 return t; \ 3711 } \ 3712 3713 #define SPEED_ROUTINE_COUNT_ZEROS_C(call, leading, zero) \ 3714 do { \ 3715 SPEED_ROUTINE_COUNT_ZEROS_A (leading, zero); \ 3716 call; \ 3717 SPEED_ROUTINE_COUNT_ZEROS_B (); \ 3718 } while (0) \ 3719 3720 #define SPEED_ROUTINE_COUNT_LEADING_ZEROS_C(call,zero) \ 3721 SPEED_ROUTINE_COUNT_ZEROS_C (call, 1, zero) 3722 #define SPEED_ROUTINE_COUNT_LEADING_ZEROS(fun) \ 3723 SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 1, 0) 3724 3725 #define SPEED_ROUTINE_COUNT_TRAILING_ZEROS_C(call,zero) \ 3726 SPEED_ROUTINE_COUNT_ZEROS_C (call, 0, zero) 3727 #define SPEED_ROUTINE_COUNT_TRAILING_ZEROS(call) \ 3728 SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 0, 0) 3729 3730 3731 #define SPEED_ROUTINE_INVERT_LIMB_CALL(call) \ 3732 { \ 3733 unsigned i, j; \ 3734 mp_limb_t d, dinv=0; \ 3735 mp_ptr xp = s->xp_block - 1; \ 3736 \ 3737 s->time_divisor = SPEED_BLOCK_SIZE; \ 3738 \ 3739 speed_starttime (); \ 3740 i = s->reps; \ 3741 do \ 3742 { \ 3743 j = SPEED_BLOCK_SIZE; \ 3744 do \ 3745 { \ 3746 d = dinv ^ xp[j]; \ 3747 d |= GMP_LIMB_HIGHBIT; \ 3748 do { call; } while (0); \ 3749 } \ 3750 while (--j != 0); \ 3751 } \ 3752 while (--i != 0); \ 3753 \ 3754 /* don't let the compiler optimize everything away */ \ 3755 noop_1 (dinv); \ 3756 \ 3757 return speed_endtime(); \ 3758 } 3759 3760 3761 #define SPEED_ROUTINE_MPN_BACK_TO_BACK(function) \ 3762 { \ 3763 unsigned i; \ 3764 speed_starttime (); \ 3765 i = s->reps; \ 3766 do \ 3767 function (); \ 3768 while (--i != 0); \ 3769 return speed_endtime (); \ 3770 } 3771 3772 3773 #define SPEED_ROUTINE_MPN_ZERO_CALL(call) \ 3774 { \ 3775 mp_ptr wp; \ 3776 unsigned i; \ 3777 double t; \ 3778 TMP_DECL; \ 3779 \ 3780 SPEED_RESTRICT_COND (s->size >= 0); \ 3781 \ 3782 TMP_MARK; \ 3783 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 3784 speed_operand_dst (s, wp, s->size); \ 3785 speed_cache_fill (s); \ 3786 \ 3787 speed_starttime (); \ 3788 i = s->reps; \ 3789 do \ 3790 call; \ 3791 while (--i != 0); \ 3792 t = speed_endtime (); \ 3793 \ 3794 TMP_FREE; \ 3795 return t; \ 3796 } 3797 3798 #define SPEED_ROUTINE_MPN_ZERO(function) \ 3799 SPEED_ROUTINE_MPN_ZERO_CALL (function (wp, s->size)) 3800 3801 3802 #endif 3803