/* longlong.h -- definitions for mixed size 32/64 bit arithmetic.

Copyright 1991-1994, 1996, 1997, 1999-2005, 2007-2009, 2011-2015 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.
(Copied from GMP 6.1.0.)

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

/* The following was added for use within GMP-ECM. */
#ifndef HAVE_HOST_CPU_FAMILY_power
#define HAVE_HOST_CPU_FAMILY_power 0
#endif

#ifndef HAVE_HOST_CPU_FAMILY_powerpc
#define HAVE_HOST_CPU_FAMILY_powerpc 0
#endif

#ifndef HAVE_HOST_CPU_FAMILY_x86
#define HAVE_HOST_CPU_FAMILY_x86 0
#endif

#ifndef HAVE_NATIVE_mpn_umul_ppmm
#define HAVE_NATIVE_mpn_umul_ppmm 0
#endif

#ifndef HAVE_NATIVE_mpn_umul_ppmm_r
#define HAVE_NATIVE_mpn_umul_ppmm_r 0
#endif

#ifndef HAVE_NATIVE_mpn_udiv_qrnnd
#define HAVE_NATIVE_mpn_udiv_qrnnd 0
#endif

#ifndef HAVE_NATIVE_mpn_udiv_qrnnd_r
#define HAVE_NATIVE_mpn_udiv_qrnnd_r 0
#endif

#ifndef HAVE_HOST_CPU_i586
#define HAVE_HOST_CPU_i586 0
#endif

#ifndef HAVE_HOST_CPU_pentium
#define HAVE_HOST_CPU_pentium 0
#endif

#ifndef HAVE_HOST_CPU_pentiummmx
#define HAVE_HOST_CPU_pentiummmx 0
#endif

/* End of stuff added for GMP-ECM. */

/* You have to define the following before including this file:

   UWtype -- An unsigned type, default type for operations (typically a "word")
   UHWtype -- An unsigned type, at least half the size of UWtype
   UDWtype -- An unsigned type, at least twice as large as UWtype
   W_TYPE_SIZE -- size in bits of UWtype

   SItype, USItype -- Signed and unsigned 32 bit types
   DItype, UDItype -- Signed and unsigned 64 bit types

   On a 32 bit machine UWtype should typically be USItype;
   on a 64 bit machine, UWtype should typically be UDItype.

   Optionally, define:

   LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
   NO_ASM -- Disable inline asm


   CAUTION!  Using this version of longlong.h outside of GMP is not safe.  You
   need to include gmp.h and gmp-impl.h, or certain things might not work as
   expected.
*/

#define __BITS4 (W_TYPE_SIZE / 4)
#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))

/* This is used to make sure no undesirable sharing between different libraries
   that use this file takes place.  */
#ifndef __MPN
#define __MPN(x) __##x
#endif
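/* Illustrative only (an editor's addition, not part of GMP): the kind of
   definitions a 64-bit user of this file might supply before including it.
   The exact choices are platform-dependent; this is a sketch assuming a
   64-bit limb and a gcc-compatible compiler.  Guarded with #if 0, as the
   disabled alternatives elsewhere in this file are, so it cannot affect
   compilation.  */
#if 0
typedef int SItype __attribute__ ((mode (SI)));
typedef unsigned int USItype __attribute__ ((mode (SI)));
typedef int DItype __attribute__ ((mode (DI)));
typedef unsigned int UDItype __attribute__ ((mode (DI)));
#define UWtype UDItype
#define UHWtype USItype
#define W_TYPE_SIZE 64
/* UDWtype (128 bits here) is only needed by some generic C fallbacks; on
   gcc targets that support it, it could be
   typedef unsigned int UDWtype __attribute__ ((mode (TI)));  */
#endif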
/* Define auxiliary asm macros.

   1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
   UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
   word product in HIGH_PROD and LOW_PROD.

   2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
   UDWtype product.  This is just a variant of umul_ppmm.

   3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
   denominator) divides a UDWtype, composed of the UWtype integers
   HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
   in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
   than DENOMINATOR for correct operation.  If the macro additionally
   requires the most significant bit of DENOMINATOR to be 1, then the
   pre-processor symbol UDIV_NEEDS_NORMALIZATION is defined to 1.

   4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
   denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
   is rounded towards 0.

   5) count_leading_zeros(count, x) counts the number of zero-bits from the
   msb to the first non-zero bit in the UWtype X.  This is the number of
   steps X needs to be shifted left to set the msb.  Undefined for X == 0,
   unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.

   6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
   from the least significant end.

   7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
   high_addend_2, low_addend_2) adds two two-word UWtype integers, composed
   of HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
   respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
   (i.e. carry out) is not stored anywhere, and is lost.

   8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
   high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
   composed of HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
   LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
   and LOW_DIFFERENCE.  Overflow (i.e. carry out) is not stored anywhere,
   and is lost.

   If any of these macros are left undefined for a particular CPU,
   C macros are used.


   Notes:

   For add_ssaaaa the two high and two low addends can both commute, but
   unfortunately gcc only supports one "%" commutative in each asm block.
   This has always been so but is only documented in recent versions
   (eg. pre-release 3.3).  Having two or more "%"s can cause an internal
   compiler error in certain rare circumstances.

   Apparently it was only the last "%" that was ever actually respected, so
   the code has been updated to leave just that.  Clearly there's a free
   choice whether high or low should get it, if there's a reason to favour
   one over the other.  Also obviously when the constraints on the two
   operands are identical there's no benefit to the reloader in any "%" at
   all.

*/
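/* Illustrative only (an editor's addition, not part of GMP): plain C
   renditions of add_ssaaaa, sub_ddmmss and umul_ppmm in the spirit of the
   generic fallbacks mentioned above, using the __ll_* helpers.  The
   example_* names are hypothetical; the #if 0 keeps them out of
   compilation.  */
#if 0
#define example_add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do { \
    UWtype __x = (al) + (bl); \
    (sh) = (ah) + (bh) + (__x < (al)); /* unsigned wrap detects the carry */ \
    (sl) = __x; \
  } while (0)

#define example_sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
    UWtype __x = (al) - (bl); \
    (sh) = (ah) - (bh) - ((al) < (bl)); /* borrow from the low words */ \
    (sl) = __x; \
  } while (0)

/* Full 2-word product from four half-word products.  */
#define example_umul_ppmm(w1, w0, u, v) \
  do { \
    UWtype __ul = __ll_lowpart (u), __uh = __ll_highpart (u); \
    UWtype __vl = __ll_lowpart (v), __vh = __ll_highpart (v); \
    UWtype __x0 = __ul * __vl, __x1 = __ul * __vh; \
    UWtype __x2 = __uh * __vl, __x3 = __uh * __vh; \
    __x1 += __ll_highpart (__x0); /* never overflows */ \
    __x1 += __x2;                 /* but this can */ \
    if (__x1 < __x2) \
      __x3 += __ll_B;             /* propagate the carry */ \
    (w1) = __x3 + __ll_highpart (__x1); \
    (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0); \
  } while (0)
#endif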
/* The CPUs come in alphabetical order below.

   Please add support for more CPUs here, or improve the current support
   for the CPUs below!  */


/* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
   3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
   Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
   __builtin_ctzll.

   These builtins are only used after checking what code comes out: on some
   chips they're merely libgcc calls, in which case we instead want an
   inline (either asm or generic C).

   These builtins are better than an asm block of the same insn, since an
   asm block doesn't give gcc any information about scheduling or resource
   usage.  We keep an asm block for use on prior versions of gcc though.

   For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
   it's not used (for count_leading_zeros) because it generally gives extra
   code to ensure the result is 0 when the input is 0, which we don't need
   or want.  */

#ifdef _LONG_LONG_LIMB
#define count_leading_zeros_gcc_clz(count,x) \
  do { \
    ASSERT ((x) != 0); \
    (count) = __builtin_clzll (x); \
  } while (0)
#else
#define count_leading_zeros_gcc_clz(count,x) \
  do { \
    ASSERT ((x) != 0); \
    (count) = __builtin_clzl (x); \
  } while (0)
#endif

#ifdef _LONG_LONG_LIMB
#define count_trailing_zeros_gcc_ctz(count,x) \
  do { \
    ASSERT ((x) != 0); \
    (count) = __builtin_ctzll (x); \
  } while (0)
#else
#define count_trailing_zeros_gcc_ctz(count,x) \
  do { \
    ASSERT ((x) != 0); \
    (count) = __builtin_ctzl (x); \
  } while (0)
#endif
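/* Illustrative only (an editor's addition, not part of GMP): how a caller
   typically pairs count_leading_zeros with udiv_qrnnd when the selected
   udiv_qrnnd has UDIV_NEEDS_NORMALIZATION set to 1.  The divisor is scaled
   so its msb is set, the numerator is scaled to match, and the remainder is
   scaled back.  example_div_normalized is a hypothetical name; #if 0 keeps
   it out of compilation.  */
#if 0
static void
example_div_normalized (UWtype *q, UWtype *r,
                        UWtype n1, UWtype n0, UWtype d)
{
  int cnt;
  count_leading_zeros (cnt, d);   /* requires d != 0 */
  if (cnt != 0)
    {
      d <<= cnt;
      n1 = (n1 << cnt) | (n0 >> (W_TYPE_SIZE - cnt));
      n0 <<= cnt;
    }
  udiv_qrnnd (*q, *r, n1, n0, d); /* still requires n1 < d */
  *r >>= cnt;                     /* undo the scaling on the remainder */
}
#endif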
/* FIXME: The macros using external routines like __MPN(count_leading_zeros)
   don't need to be under !NO_ASM */
#if ! defined (NO_ASM)

#if defined (__alpha) && W_TYPE_SIZE == 64
/* Most alpha-based machines, except Cray systems. */
#if defined (__GNUC__)
#if __GMP_GNUC_PREREQ (3,3)
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UDItype __m0 = (m0), __m1 = (m1); \
    (ph) = __builtin_alpha_umulh (__m0, __m1); \
    (pl) = __m0 * __m1; \
  } while (0)
#else
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UDItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("umulh %r1,%2,%0" \
             : "=r" (ph) \
             : "%rJ" (__m0), "rI" (__m1)); \
    (pl) = __m0 * __m1; \
  } while (0)
#endif
#define UMUL_TIME 18
#else /* ! __GNUC__ */
#include <machine/builtins.h>
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UDItype __m0 = (m0), __m1 = (m1); \
    (ph) = __UMULH (__m0, __m1); \
    (pl) = __m0 * __m1; \
  } while (0)
#endif
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di; \
    __di = __MPN(invert_limb) (d); \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
  } while (0)
#define UDIV_PREINV_ALWAYS 1
#define UDIV_NEEDS_NORMALIZATION 1
#define UDIV_TIME 220
#endif /* LONGLONG_STANDALONE */

/* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
   always goes into libgmp.so, even when not actually used.  */
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB

#if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
#define count_leading_zeros(COUNT,X) \
  __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
#define count_trailing_zeros(COUNT,X) \
  __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
#endif /* clz/ctz using cix */

#if ! defined (count_leading_zeros) \
  && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
/* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
   "$31" is written explicitly in the asm, since an "r" constraint won't
   select reg 31.  There seems no need to worry about "r31" syntax for cray,
   since gcc itself (pre-release 3.4) emits just $31 in various places.  */
#define ALPHA_CMPBGE_0(dst, src) \
  do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
/* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
   them, locating the highest non-zero byte.  A second __clz_tab lookup
   counts the leading zero bits in that byte, giving the result.  */
#define count_leading_zeros(count, x) \
  do { \
    UWtype __clz__b, __clz__c, __clz__x = (x); \
    ALPHA_CMPBGE_0 (__clz__b, __clz__x);           /* zero bytes */ \
    __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F]; /* 8 to 1 byte */ \
    __clz__b = __clz__b * 8 - 7;                   /* 57 to 1 shift */ \
    __clz__x >>= __clz__b; \
    __clz__c = __clz_tab [__clz__x];               /* 8 to 1 bit */ \
    __clz__b = 65 - __clz__b; \
    (count) = __clz__b - __clz__c; \
  } while (0)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif /* clz using cmpbge */

#if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
#if HAVE_ATTRIBUTE_CONST
long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const));
#else
long __MPN(count_leading_zeros) (UDItype);
#endif
#define count_leading_zeros(count, x) \
  ((count) = __MPN(count_leading_zeros) (x))
#endif /* clz using mpn */
#endif /* __alpha */

#if defined (__AVR) && W_TYPE_SIZE == 8
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    unsigned short __p = (unsigned short) (m0) * (m1); \
    (ph) = __p >> 8; \
    (pl) = __p; \
  } while (0)
#endif /* AVR */

#if defined (_CRAY) && W_TYPE_SIZE == 64
#include <intrinsics.h>
#define UDIV_PREINV_ALWAYS 1
#define UDIV_NEEDS_NORMALIZATION 1
#define UDIV_TIME 220
long __MPN(count_leading_zeros) (UDItype);
#define count_leading_zeros(count, x) \
  ((count) = _leadz ((UWtype) (x)))
#if defined (_CRAYIEEE) /* I.e., Cray T90/ieee, T3D, and T3E */
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UDItype __m0 = (m0), __m1 = (m1); \
    (ph) = _int_mult_upper (__m0, __m1); \
    (pl) = __m0 * __m1; \
  } while (0)
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di; \
    __di = __MPN(invert_limb) (d); \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
  } while (0)
#endif /* LONGLONG_STANDALONE */
#endif /* _CRAYIEEE */
#endif /* _CRAY */
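/* Illustrative only (an editor's addition, not part of GMP): a sketch of
   the division-by-invariant-divisor scheme behind the __MPN(invert_limb) /
   udiv_qrnnd_preinv pairs used above.  The real definitions live in
   gmp-impl.h; this paraphrases the well-known algorithm for a normalized
   divisor d (msb set), n1 < d, and a precomputed
   v = floor((B^2 - 1)/d) - B with B = 2^W_TYPE_SIZE.
   example_udiv_qrnnd_preinv is a hypothetical name; #if 0 keeps it out of
   compilation.  */
#if 0
static void
example_udiv_qrnnd_preinv (UWtype *q, UWtype *r,
                           UWtype n1, UWtype n0, UWtype d, UWtype v)
{
  UWtype q1, q0, t;
  umul_ppmm (q1, q0, n1, v);                /* (q1,q0) = n1 * v */
  add_ssaaaa (q1, q0, q1, q0, n1 + 1, n0);  /* candidate quotient in q1 */
  t = n0 - q1 * d;                          /* candidate remainder, mod B */
  if (t > q0)                               /* q1 was one too large */
    {
      q1--;
      t += d;
    }
  if (t >= d)                               /* at most one more fix-up */
    {
      q1++;
      t -= d;
    }
  *q = q1;
  *r = t;
}
#endif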
#if defined (__ia64) && W_TYPE_SIZE == 64
/* This form encourages gcc (pre-release 3.4 at least) to emit predicated
   "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
   code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
   register, which takes an extra cycle.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
    UWtype __x; \
    __x = (al) - (bl); \
    if ((al) < (bl)) \
      (sh) = (ah) - (bh) - 1; \
    else \
      (sh) = (ah) - (bh); \
    (sl) = __x; \
  } while (0)
#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
/* Do both product parts in assembly, since that gives better code with
   all gcc versions.  Some callers will just use the upper part, and in
   that situation we waste an instruction, but not any cycles.  */
#define umul_ppmm(ph, pl, m0, m1) \
  __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0" \
           : "=&f" (ph), "=f" (pl) \
           : "f" (m0), "f" (m1))
#define UMUL_TIME 14
#define count_leading_zeros(count, x) \
  do { \
    UWtype _x = (x), _y, _a, _c; \
    __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x)); \
    __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y)); \
    _c = (_a - 1) << 3; \
    _x >>= _c; \
    if (_x >= 1 << 4) \
      _x >>= 4, _c += 4; \
    if (_x >= 1 << 2) \
      _x >>= 2, _c += 2; \
    _c += _x >> 1; \
    (count) = W_TYPE_SIZE - 1 - _c; \
  } while (0)
/* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
   based, and we don't need a special case for x==0 here */
#define count_trailing_zeros(count, x) \
  do { \
    UWtype __ctz_x = (x); \
    __asm__ ("popcnt %0 = %1" \
             : "=r" (count) \
             : "r" ((__ctz_x-1) & ~__ctz_x)); \
  } while (0)
#endif
#if defined (__INTEL_COMPILER)
#include <ia64intrin.h>
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UWtype __m0 = (m0), __m1 = (m1); \
    ph = _m64_xmahu (__m0, __m1, 0); \
    pl = __m0 * __m1; \
  } while (0)
#endif
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di; \
    __di = __MPN(invert_limb) (d); \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
  } while (0)
#define UDIV_PREINV_ALWAYS 1
#define UDIV_NEEDS_NORMALIZATION 1
#endif
#define UDIV_TIME 220
#endif


#if defined (__GNUC__)

/* We sometimes need to clobber "cc" with gcc2, but that would not be
   understood by gcc1.  Use cpp to avoid major code duplication.  */
#if __GNUC__ < 2
#define __CLOBBER_CC
#define __AND_CLOBBER_CC
#else /* __GNUC__ >= 2 */
#define __CLOBBER_CC : "cc"
#define __AND_CLOBBER_CC , "cc"
#endif /* __GNUC__ < 2 */

#if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3" \
           : "=r" (sh), "=&r" (sl) \
           : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3" \
           : "=r" (sh), "=&r" (sl) \
           : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
#define umul_ppmm(xh, xl, m0, m1) \
  do { \
    USItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("multiplu %0,%1,%2" \
             : "=r" (xl) \
             : "r" (__m0), "r" (__m1)); \
    __asm__ ("multmu %0,%1,%2" \
             : "=r" (xh) \
             : "r" (__m0), "r" (__m1)); \
  } while (0)
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("dividu %0,%3,%4" \
           : "=r" (q), "=q" (r) \
           : "1" (n1), "r" (n0), "r" (d))
#define count_leading_zeros(count, x) \
  __asm__ ("clz %0,%1" \
           : "=r" (count) \
           : "r" (x))
#define COUNT_LEADING_ZEROS_0 32
#endif /* __a29k__ */

#if defined (__arc__)
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3" \
           : "=r" (sh), \
             "=&r" (sl) \
           : "r" ((USItype) (ah)), \
             "rIJ" ((USItype) (bh)), \
             "%r" ((USItype) (al)), \
             "rIJ" ((USItype) (bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
           : "=r" (sh), \
             "=&r" (sl) \
           : "r" ((USItype) (ah)), \
             "rIJ" ((USItype) (bh)), \
             "r" ((USItype) (al)), \
             "rIJ" ((USItype) (bl)))
#endif

#if defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) \
    && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3" \
           : "=r" (sh), "=&r" (sl) \
           : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
    if (__builtin_constant_p (al)) \
      { \
        if (__builtin_constant_p (ah)) \
          __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \
                   : "=r" (sh), "=&r" (sl) \
                   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
        else \
          __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \
                   : "=r" (sh), "=&r" (sl) \
                   : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
      } \
    else if (__builtin_constant_p (ah)) \
      { \
        if (__builtin_constant_p (bl)) \
          __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \
                   : "=r" (sh), "=&r" (sl) \
                   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
        else \
          __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \
                   : "=r" (sh), "=&r" (sl) \
                   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
      } \
    else if (__builtin_constant_p (bl)) \
      { \
        if (__builtin_constant_p (bh)) \
          __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
                   : "=r" (sh), "=&r" (sl) \
                   : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
        else \
          __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \
                   : "=r" (sh), "=&r" (sl) \
                   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
      } \
    else /* only bh might be a constant */ \
      __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
               : "=r" (sh), "=&r" (sl) \
               : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
  } while (0)
#if defined (__ARM_ARCH_2__) || defined (__ARM_ARCH_2A__) \
    || defined (__ARM_ARCH_3__)
#define umul_ppmm(xh, xl, a, b) \
  do { \
    register USItype __t0, __t1, __t2; \
    __asm__ ("%@ Inlined umul_ppmm\n" \
             " mov %2, %5, lsr #16\n" \
             " mov %0, %6, lsr #16\n" \
             " bic %3, %5, %2, lsl #16\n" \
             " bic %4, %6, %0, lsl #16\n" \
             " mul %1, %3, %4\n" \
             " mul %4, %2, %4\n" \
             " mul %3, %0, %3\n" \
             " mul %0, %2, %0\n" \
             " adds %3, %4, %3\n" \
             " addcs %0, %0, #65536\n" \
             " adds %1, %1, %3, lsl #16\n" \
             " adc %0, %0, %3, lsr #16" \
             : "=&r" ((USItype) (xh)), "=r" ((USItype) (xl)), \
               "=&r" (__t0), "=&r" (__t1), "=r" (__t2) \
             : "r" ((USItype) (a)), "r" ((USItype) (b)) __CLOBBER_CC); \
  } while (0)
#define UMUL_TIME 20
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __r; \
    (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
    (r) = __r; \
  } while (0)
extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
#define UDIV_TIME 200
#else /* ARMv4 or newer */
#define umul_ppmm(xh, xl, a, b) \
  __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
#define UMUL_TIME 5
#define smul_ppmm(xh, xl, a, b) \
  __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di; \
    __di = __MPN(invert_limb) (d); \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
  } while (0)
#define UDIV_PREINV_ALWAYS 1
#define UDIV_NEEDS_NORMALIZATION 1
#define UDIV_TIME 70
#endif /* LONGLONG_STANDALONE */
#endif /* defined(__ARM_ARCH_2__) ... */
#define count_leading_zeros(count, x)  count_leading_zeros_gcc_clz(count, x)
#define count_trailing_zeros(count, x)  count_trailing_zeros_gcc_ctz(count, x)
#define COUNT_LEADING_ZEROS_0 32
#endif /* __arm__ */

#if defined (__aarch64__) && W_TYPE_SIZE == 64
/* FIXME: Extend the immediate range for the low word by using both
   ADDS and SUBS, since they set carry in the same way.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("adds\t%1, %x4, %5\n\tadc\t%0, %x2, %x3" \
           : "=r" (sh), "=&r" (sl) \
           : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)), \
             "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subs\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3" \
           : "=r,r" (sh), "=&r,&r" (sl) \
           : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)), \
             "r,Z" ((UDItype)(al)), "rI,r" ((UDItype)(bl)) __CLOBBER_CC)
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UDItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (__m0), "r" (__m1)); \
    (pl) = __m0 * __m1; \
  } while (0)
#define count_leading_zeros(count, x)  count_leading_zeros_gcc_clz(count, x)
#define count_trailing_zeros(count, x)  count_trailing_zeros_gcc_ctz(count, x)
#define COUNT_LEADING_ZEROS_0 64
#endif /* __aarch64__ */

#if defined (__clipper__) && W_TYPE_SIZE == 32
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll; \
           struct {USItype __l, __h;} __i; \
          } __x; \
  __asm__ ("mulwux %2,%0" \
           : "=r" (__x.__ll) \
           : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define smul_ppmm(w1, w0, u, v) \
  ({union {DItype __ll; \
           struct {SItype __l, __h;} __i; \
          } __x; \
  __asm__ ("mulwx %2,%0" \
           : "=r" (__x.__ll) \
           : "%0" ((SItype)(u)), "r" ((SItype)(v))); \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define __umulsidi3(u, v) \
  ({UDItype __w; \
    __asm__ ("mulwux %2,%0" \
             : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
    __w; })
#endif /* __clipper__ */

/* Fujitsu vector computers.  */
#if defined (__uxp__) && W_TYPE_SIZE == 32
#define umul_ppmm(ph, pl, u, v) \
  do { \
    union {UDItype __ll; \
           struct {USItype __h, __l;} __i; \
          } __x; \
    __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \
    (ph) = __x.__i.__h; \
    (pl) = __x.__i.__l; \
  } while (0)
#define smul_ppmm(ph, pl, u, v) \
  do { \
    union {UDItype __ll; \
           struct {USItype __h, __l;} __i; \
          } __x; \
    __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \
    (ph) = __x.__i.__h; \
    (pl) = __x.__i.__l; \
  } while (0)
#endif

#if defined (__gmicro__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add.w %5,%1\n\taddx %3,%0" \
           : "=g" (sh), "=&g" (sl) \
           : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
             "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub.w %5,%1\n\tsubx %3,%0" \
           : "=g" (sh), "=&g" (sl) \
           : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
             "1" ((USItype)(al)), "g" ((USItype)(bl)))
#define umul_ppmm(ph, pl, m0, m1) \
  __asm__ ("mulx %3,%0,%1" \
           : "=g" (ph), "=r" (pl) \
           : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
#define udiv_qrnnd(q, r, nh, nl, d) \
  __asm__ ("divx %4,%0,%1" \
           : "=g" (q), "=r" (r) \
           : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
#define count_leading_zeros(count, x) \
  __asm__ ("bsch/1 %1,%0" \
           : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
#endif

#if defined (__hppa) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0" \
           : "=r" (sh), "=&r" (sl) \
           : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0" \
           : "=r" (sh), "=&r" (sl) \
           : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
#if defined (_PA_RISC1_1)
#define umul_ppmm(wh, wl, u, v) \
  do { \
    union {UDItype __ll; \
           struct {USItype __h, __l;} __i; \
          } __x; \
    __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \
    (wh) = __x.__i.__h; \
    (wl) = __x.__i.__l; \
  } while (0)
#define UMUL_TIME 8
#define UDIV_TIME 60
#else
#define UMUL_TIME 40
#define UDIV_TIME 80
#endif
#define count_leading_zeros(count, x) \
  do { \
    USItype __tmp; \
    __asm__ ( \
      "ldi      1,%0\n" \
      " extru,=  %1,15,16,%%r0 ; Bits 31..16 zero?\n" \
      " extru,tr %1,15,16,%1   ; No.  Shift down, skip add.\n" \
      " ldo      16(%0),%0     ; Yes.  Perform add.\n" \
      " extru,=  %1,23,8,%%r0  ; Bits 15..8 zero?\n" \
      " extru,tr %1,23,8,%1    ; No.  Shift down, skip add.\n" \
      " ldo      8(%0),%0      ; Yes.  Perform add.\n" \
      " extru,=  %1,27,4,%%r0  ; Bits 7..4 zero?\n" \
      " extru,tr %1,27,4,%1    ; No.  Shift down, skip add.\n" \
      " ldo      4(%0),%0      ; Yes.  Perform add.\n" \
      " extru,=  %1,29,2,%%r0  ; Bits 3..2 zero?\n" \
      " extru,tr %1,29,2,%1    ; No.  Shift down, skip add.\n" \
      " ldo      2(%0),%0      ; Yes.  Perform add.\n" \
      " extru    %1,30,1,%1    ; Extract bit 1.\n" \
      " sub      %0,%1,%0      ; Subtract it.\n" \
      : "=r" (count), "=r" (__tmp) : "1" (x)); \
  } while (0)
#endif /* hppa */

/* These macros are for ABI=2.0w.  In ABI=2.0n they can't be used, since GCC
   (3.2) puts longlong into two adjacent 32-bit registers.  Presumably this
   is just a case of no direct support for 2.0n but treating it like 1.0.  */
#if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0" \
           : "=r" (sh), "=&r" (sl) \
           : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0" \
           : "=r" (sh), "=&r" (sl) \
           : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
#endif /* hppa */

#if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
#if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch)
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do { \
/*  if (__builtin_constant_p (bl)) \
      __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3" \
               : "=r" (sh), "=&r" (sl) \
               : "0" (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC); \
    else \
*/  __asm__ ("alr\t%1,%5\n\talcr\t%0,%3" \
             : "=r" (sh), "=&r" (sl) \
             : "0" (ah), "r" (bh), "%1" (al), "r" (bl) __CLOBBER_CC); \
  } while (0)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
/*  if (__builtin_constant_p (bl)) \
      __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3" \
               : "=r" (sh), "=&r" (sl) \
               : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC); \
    else \
*/  __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3" \
             : "=r" (sh), "=&r" (sl) \
             : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC); \
  } while (0)
#if __GMP_GNUC_PREREQ (4,5)
#define umul_ppmm(xh, xl, m0, m1) \
  do { \
    union {UDItype __ll; \
           struct {USItype __h, __l;} __i; \
          } __x; \
    __x.__ll = (UDItype) (m0) * (UDItype) (m1); \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
  } while (0)
#else
#if 0
/* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
   with a new enough processor pretending we have 32-bit registers.  */
#define umul_ppmm(xh, xl, m0, m1) \
  do { \
    union {UDItype __ll; \
           struct {USItype __h, __l;} __i; \
          } __x; \
    __asm__ ("mlr\t%0,%2" \
             : "=r" (__x.__ll) \
             : "%0" (m0), "r" (m1)); \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
  } while (0)
#else
#define umul_ppmm(xh, xl, m0, m1) \
  do { \
  /* When we have 64-bit regs and gcc is aware of that, we cannot simply use
     DImode for the product, since that would be allocated to a single 64-bit
     register, whereas mlr uses the low 32-bits of an even-odd register pair.
  */ \
    register USItype __r0 __asm__ ("0"); \
    register USItype __r1 __asm__ ("1") = (m0); \
    __asm__ ("mlr\t%0,%3" \
             : "=r" (__r0), "=r" (__r1) \
             : "r" (__r1), "r" (m1)); \
    (xh) = __r0; (xl) = __r1; \
  } while (0)
#endif /* if 0 */
#endif
#if 0
/* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
   with a new enough processor pretending we have 32-bit registers.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    union {UDItype __ll; \
           struct {USItype __h, __l;} __i; \
          } __x; \
    __x.__i.__h = n1; __x.__i.__l = n0; \
    __asm__ ("dlr\t%0,%2" \
             : "=r" (__x.__ll) \
             : "0" (__x.__ll), "r" (d)); \
    (q) = __x.__i.__l; (r) = __x.__i.__h; \
  } while (0)
#else
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    register USItype __r0 __asm__ ("0") = (n1); \
    register USItype __r1 __asm__ ("1") = (n0); \
    __asm__ ("dlr\t%0,%4" \
             : "=r" (__r0), "=r" (__r1) \
             : "r" (__r0), "r" (__r1), "r" (d)); \
    (q) = __r1; (r) = __r0; \
  } while (0)
#endif /* if 0 */
#else /* if __zarch__ */
/* FIXME: this fails if gcc knows about the 64-bit registers.  */
#define smul_ppmm(xh, xl, m0, m1) \
  do { \
    union {DItype __ll; \
           struct {USItype __h, __l;} __i; \
          } __x; \
    __asm__ ("mr\t%0,%2" \
             : "=r" (__x.__ll) \
             : "%0" (m0), "r" (m1)); \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
  } while (0)
/* FIXME: this fails if gcc knows about the 64-bit registers.  */
#define sdiv_qrnnd(q, r, n1, n0, d) \
  do { \
    union {DItype __ll; \
           struct {USItype __h, __l;} __i; \
          } __x; \
    __x.__i.__h = n1; __x.__i.__l = n0; \
    __asm__ ("dr\t%0,%2" \
             : "=r" (__x.__ll) \
             : "0" (__x.__ll), "r" (d)); \
    (q) = __x.__i.__l; (r) = __x.__i.__h; \
  } while (0)
#endif /* if __zarch__ */
#endif

#if defined (__s390x__) && W_TYPE_SIZE == 64
/* We need to cast operands with register constraints, otherwise their types
   will be assumed to be SImode by gcc.  For these machines, such operations
   will insert a value into the low 32 bits, and leave the high 32 bits with
   garbage.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do { \
    __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3" \
             : "=r" (sh), "=&r" (sl) \
             : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
               "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
  } while (0)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
    __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3" \
             : "=r" (sh), "=&r" (sl) \
             : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
               "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
  } while (0)
#define umul_ppmm(xh, xl, m0, m1) \
  do { \
    union {unsigned int __attribute__ ((mode(TI))) __ll; \
           struct {UDItype __h, __l;} __i; \
          } __x; \
    __asm__ ("mlgr\t%0,%2" \
             : "=r" (__x.__ll) \
             : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1))); \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
  } while (0)
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    union {unsigned int __attribute__ ((mode(TI))) __ll; \
           struct {UDItype __h, __l;} __i; \
          } __x; \
    __x.__i.__h = n1; __x.__i.__l = n0; \
    __asm__ ("dlgr\t%0,%2" \
             : "=r" (__x.__ll) \
             : "0" (__x.__ll), "r" ((UDItype)(d))); \
    (q) = __x.__i.__l; (r) = __x.__i.__h; \
  } while (0)
#if 0 /* FIXME: Enable for z10 (?) */
#define count_leading_zeros(cnt, x) \
  do { \
    union {unsigned int __attribute__ ((mode(TI))) __ll; \
           struct {UDItype __h, __l;} __i; \
          } __clr_cnt; \
    __asm__ ("flogr\t%0,%1" \
             : "=r" (__clr_cnt.__ll) \
             : "r" (x) __CLOBBER_CC); \
    (cnt) = __clr_cnt.__i.__h; \
  } while (0)
#endif
#endif

/* On x86 and x86_64, every asm implicitly clobbers "flags" and "fpsr",
   so we don't need __CLOBBER_CC.  */
#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addl %5,%k1\n\tadcl %3,%k0" \
           : "=r" (sh), "=&r" (sl) \
           : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
             "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subl %5,%k1\n\tsbbl %3,%k0" \
           : "=r" (sh), "=&r" (sl) \
           : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
             "1" ((USItype)(al)), "g" ((USItype)(bl)))
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mull %3" \
           : "=a" (w0), "=d" (w1) \
           : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */ \
  __asm__ ("divl %4"                 /* stringification in K&R C */ \
           : "=a" (q), "=d" (r) \
           : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))

#if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
/* Pentium bsrl takes between 10 and 72 cycles depending where the most
   significant 1 bit is, hence the use of the following alternatives.  bsfl
   is slow too, between 18 and 42 depending where the least significant 1
   bit is, so let the generic count_trailing_zeros below make use of the
   count_leading_zeros here too.  */

#if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
/* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
   cache miss reading from __clz_tab.  For P55 it's favoured over the float
   below so as to avoid mixing MMX and x87, since the penalty for switching
   between the two is about 100 cycles.

   The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
   16, -1 for 8, or 0 otherwise.  This could be written equivalently as
   follows, but as of gcc 2.95.2 it results in conditional jumps.

       __shift = -(__n < 0x1000000);
       __shift -= (__n < 0x10000);
       __shift -= (__n < 0x100);

   The middle two sbbl and cmpl's pair, and with luck something gcc
   generates might pair with the first cmpl and the last sbbl.  The "32+1"
   constant could be folded into __clz_tab[], but it doesn't seem worth
   making a different table just for that.  */

#define count_leading_zeros(c,n) \
  do { \
    USItype __n = (n); \
    USItype __shift; \
    __asm__ ("cmpl  $0x1000000, %1\n" \
             "sbbl  %0, %0\n" \
             "cmpl  $0x10000, %1\n" \
             "sbbl  $0, %0\n" \
             "cmpl  $0x100, %1\n" \
             "sbbl  $0, %0\n" \
             : "=&r" (__shift) : "r"  (__n)); \
    __shift = __shift*8 + 24 + 1; \
    (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift]; \
  } while (0)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#define COUNT_LEADING_ZEROS_0 31 /* n==0 indistinguishable from n==1 */

#else /* ! pentiummmx || LONGLONG_STANDALONE */
/* The following should be a fixed 14 cycles or so.  Some scheduling
   opportunities should be available between the float load/store too.  This
   sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
   apparently suggested by the Intel optimizing manual (don't know exactly
   where).  gcc 2.95 or up will be best for this, so the "double" is
   correctly aligned on the stack.  */
#define count_leading_zeros(c,n) \
  do { \
    union { \
      double d; \
      unsigned a[2]; \
    } __u; \
    ASSERT ((n) != 0); \
    __u.d = (UWtype) (n); \
    (c) = 0x3FF + 31 - (__u.a[1] >> 20); \
  } while (0)
#define COUNT_LEADING_ZEROS_0 (0x3FF + 31)
#endif /* pentiummmx */

#else /* ! pentium */
#if __GMP_GNUC_PREREQ (3,4)  /* using bsrl */
#define count_leading_zeros(count,x)  count_leading_zeros_gcc_clz(count,x)
#endif /* gcc clz */

/* On P6, gcc prior to 3.0 generates a partial register stall for
   __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
   being 1 code byte smaller.  "31-__cbtmp" is a workaround, probably at the
   cost of one extra instruction.  Do this for "i386" too, since that means
   generic x86.  */
#if ! defined (count_leading_zeros) && __GNUC__ < 3 \
  && (HAVE_HOST_CPU_i386 \
      || HAVE_HOST_CPU_i686 \
      || HAVE_HOST_CPU_pentiumpro \
      || HAVE_HOST_CPU_pentium2 \
      || HAVE_HOST_CPU_pentium3)
#define count_leading_zeros(count, x) \
  do { \
    USItype __cbtmp; \
    ASSERT ((x) != 0); \
    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
    (count) = 31 - __cbtmp; \
  } while (0)
#endif /* gcc<3 asm bsrl */

#ifndef count_leading_zeros
#define count_leading_zeros(count, x) \
  do { \
    USItype __cbtmp; \
    ASSERT ((x) != 0); \
    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
    (count) = __cbtmp ^ 31; \
  } while (0)
#endif /* asm bsrl */

#if __GMP_GNUC_PREREQ (3,4)  /* using bsfl */
#define count_trailing_zeros(count,x)  count_trailing_zeros_gcc_ctz(count,x)
#endif /* gcc ctz */

#ifndef count_trailing_zeros
#define count_trailing_zeros(count, x) \
  do { \
    ASSERT ((x) != 0); \
    __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x))); \
  } while (0)
#endif /* asm bsfl */

#endif /* ! pentium */

#ifndef UMUL_TIME
#define UMUL_TIME 10
#endif
#ifndef UDIV_TIME
#define UDIV_TIME 40
#endif
#endif /* 80x86 */

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addq %5,%q1\n\tadcq %3,%q0" \
           : "=r" (sh), "=&r" (sl) \
           : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
             "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subq %5,%q1\n\tsbbq %3,%q0" \
           : "=r" (sh), "=&r" (sl) \
           : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
             "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulq %3" \
           : "=a" (w0), "=d" (w1) \
           : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */ \
  __asm__ ("divq %4"                 /* stringification in K&R C */ \
           : "=a" (q), "=d" (r) \
           : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
/* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp.  */
#define count_leading_zeros(count, x) \
  do { \
    UDItype __cbtmp; \
    ASSERT ((x) != 0); \
    __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \
    (count) = __cbtmp ^ 63; \
  } while (0)
/* bsfq destination must be a 64-bit register, "%q0" forces this in case
   count is only an int.  */
#define count_trailing_zeros(count, x) \
  do { \
    ASSERT ((x) != 0); \
    __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x))); \
  } while (0)
#endif /* __amd64__ */

#if defined (__i860__) && W_TYPE_SIZE == 32
#define rshift_rhlc(r,h,l,c) \
  __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0" \
           : "=r" (r) : "r" (h), "r" (l), "rn" (c))
#endif /* i860 */

#if defined (__i960__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0" \
           : "=r" (sh), "=&r" (sl) \
           : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0" \
           : "=r" (sh), "=&r" (sl) \
           : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll; \
           struct {USItype __l, __h;} __i; \
          } __x; \
  __asm__ ("emul %2,%1,%0" \
           : "=d" (__x.__ll) : "%dI" (u), "dI" (v)); \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define __umulsidi3(u, v) \
  ({UDItype __w; \
    __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v)); \
    __w; })
#define udiv_qrnnd(q, r, nh, nl, d) \
  do { \
    union {UDItype __ll; \
           struct {USItype __l, __h;} __i; \
          } __nn, __rq; \
    __nn.__i.__h = (nh); __nn.__i.__l = (nl); \
    __asm__ ("ediv %d,%n,%0" \
             : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d)); \
    (r) = __rq.__i.__l; (q) = __rq.__i.__h; \
  } while (0)
#define count_leading_zeros(count, x) \
  do { \
    USItype __cbtmp; \
    __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x)); \
    (count) = __cbtmp ^ 31; \
  } while (0)
#define COUNT_LEADING_ZEROS_0 (-32) /* sic */
#if defined (__i960mx) /* what is the proper symbol to test??? */
#define rshift_rhlc(r,h,l,c) \
  do { \
    union {UDItype __ll; \
           struct {USItype __l, __h;} __i; \
          } __nn; \
    __nn.__i.__h = (h); __nn.__i.__l = (l); \
    __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c)); \
  } while (0)
#endif /* i960mx */
#endif /* i960 */

#if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
     || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
     || defined (__mc5307__)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0" \
           : "=d" (sh), "=&d" (sl) \
           : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
             "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0" \
           : "=d" (sh), "=&d" (sl) \
           : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
             "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r.  */
#if defined (__mc68020__) || defined(mc68020) \
     || defined (__mc68030__) || defined (mc68030) \
     || defined (__mc68040__) || defined (mc68040) \
     || defined (__mcpu32__) || defined (mcpu32) \
     || defined (__NeXT__)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulu%.l %3,%1:%0" \
           : "=d" (w0), "=d" (w1) \
           : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
#define UMUL_TIME 45
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divu%.l %4,%1:%0" \
           : "=d" (q), "=d" (r) \
           : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
#define UDIV_TIME 90
#define sdiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divs%.l %4,%1:%0" \
           : "=d" (q), "=d" (r) \
           : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
#else /* for other 68k family members use 16x16->32 multiplication */
#define umul_ppmm(xh, xl, a, b) \
  do { USItype __umul_tmp1, __umul_tmp2; \
    __asm__ ("| Inlined umul_ppmm\n" \
             " move%.l %5,%3\n" \
             " move%.l %2,%0\n" \
             " move%.w %3,%1\n" \
             " swap    %3\n" \
             " swap    %0\n" \
             " mulu%.w %2,%1\n" \
             " mulu%.w %3,%0\n" \
             " mulu%.w %2,%3\n" \
             " swap    %2\n" \
             " mulu%.w %5,%2\n" \
             " add%.l  %3,%2\n" \
             " jcc     1f\n" \
             " add%.l  %#0x10000,%0\n" \
             "1: move%.l %2,%3\n" \
             " clr%.w  %2\n" \
             " swap    %2\n" \
             " swap    %3\n" \
             " clr%.w  %3\n" \
             " add%.l  %3,%1\n" \
             " addx%.l %2,%0\n" \
             " | End inlined umul_ppmm" \
             : "=&d" (xh), "=&d" (xl), \
               "=d" (__umul_tmp1), "=&d" (__umul_tmp2) \
             : "%2" ((USItype)(a)), "d" ((USItype)(b))); \
  } while (0)
#define UMUL_TIME 100
#define UDIV_TIME 400
#endif /* not mc68020 */
/* The '020, '030, '040 and '060 have bitfield insns.
   GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to
   exclude bfffo on that chip (bitfield insns not available).  */
#if (defined (__mc68020__) || defined (mc68020) \
     || defined (__mc68030__) || defined (mc68030) \
     || defined (__mc68040__) || defined (mc68040) \
     || defined (__mc68060__) || defined (mc68060) \
     || defined (__NeXT__)) \
  && ! defined (__mcpu32__)
#define count_leading_zeros(count, x) \
  __asm__ ("bfffo %1{%b2:%b2},%0" \
           : "=d" (count) \
           : "od" ((USItype) (x)), "n" (0))
#define COUNT_LEADING_ZEROS_0 32
#endif
#endif /* mc68000 */

#if defined (__m88000__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3" \
           : "=r" (sh), "=&r" (sl) \
           : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3" \
           : "=r" (sh), "=&r" (sl) \
           : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
#define count_leading_zeros(count, x) \
  do { \
    USItype __cbtmp; \
    __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x)); \
    (count) = __cbtmp ^ 31; \
  } while (0)
#define COUNT_LEADING_ZEROS_0 63 /* sic */
#if defined (__m88110__)
#define umul_ppmm(wh, wl, u, v) \
  do { \
    union {UDItype __ll; \
           struct {USItype __h, __l;} __i; \
          } __x; \
    __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v)); \
    (wh) = __x.__i.__h; \
    (wl) = __x.__i.__l; \
  } while (0)
#define udiv_qrnnd(q, r, n1, n0, d) \
  ({union {UDItype __ll; \
           struct {USItype __h, __l;} __i; \
          } __x, __q; \
  __x.__i.__h = (n1); __x.__i.__l = (n0); \
  __asm__ ("divu.d %0,%1,%2" \
           : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d)); \
  (r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; })
#define UMUL_TIME 5
#define UDIV_TIME 25
#else
#define UMUL_TIME 17
#define UDIV_TIME 150
#endif /* __m88110__ */
#endif /* __m88000__ */

#if defined (__mips) && W_TYPE_SIZE == 32
#if __GMP_GNUC_PREREQ (4,4)
#define umul_ppmm(w1, w0, u, v) \
  do { \
    UDItype __ll = (UDItype)(u) * (v); \
    w1 = __ll >> 32; \
    w0 = __ll; \
  } while (0)
#endif
#if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
#endif
#if !defined (umul_ppmm)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1" \
           : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
#endif
#define UMUL_TIME 10
#define UDIV_TIME 100
#endif /* __mips */

#if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
#if __GMP_GNUC_PREREQ (4,4)
#define umul_ppmm(w1, w0, u, v) \
  do { \
    typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
    __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
    w1 = __ll >> 64; \
    w0 = __ll; \
  } while (0)
#endif
#if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("dmultu %2,%3" \
           : "=l" (w0), "=h" (w1) \
           : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
#endif
#if !defined (umul_ppmm)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1" \
           : "=d" (w0), "=d" (w1) \
           : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
#endif
#define UMUL_TIME 20
#define UDIV_TIME 140
#endif /* __mips */

#if defined (__mmix__) && W_TYPE_SIZE == 64
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
#endif

#if defined (__ns32000__) && W_TYPE_SIZE == 32
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll; \
           struct {USItype __l, __h;} __i; \
          } __x; \
  __asm__ ("meid %2,%0" \
           : "=g" (__x.__ll) \
           : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define __umulsidi3(u, v) \
  ({UDItype __w; \
    __asm__ ("meid %2,%0" \
             : "=g" (__w) \
             : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
    __w; })
#define udiv_qrnnd(q, r, n1, n0, d) \
  ({union {UDItype __ll; \
           struct {USItype __l, __h;} __i; \
          } __x; \
  __x.__i.__h = (n1); __x.__i.__l = (n0); \
  __asm__ ("deid %2,%0" \
           : "=g" (__x.__ll) \
           : "0" (__x.__ll), "g" ((USItype)(d))); \
  (r) = __x.__i.__l; (q) = __x.__i.__h; })
#define count_trailing_zeros(count,x) \
  do { \
    __asm__ ("ffsd %2,%0" \
             : "=r" (count) \
             : "0" ((USItype) 0), "r" ((USItype) (x))); \
  } while (0)
#endif /* __ns32000__ */

/* In the past we had a block of various #defines tested
       _ARCH_PPC    - AIX
       _ARCH_PWR    - AIX
       __powerpc__  - gcc
       __POWERPC__  - BEOS
       __ppc__      - Darwin
       PPC          - old gcc, GNU/Linux, SysV
   The plain PPC test was not good for vxWorks, since PPC is defined on all
   CPUs there (eg. m68k too), as a constant one is expected to compare
   CPU_FAMILY against.

   At any rate, this was pretty unattractive and a bit fragile.  The use of
   HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
   getting the desired effect.

   ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
   the system vendor compilers.  (Is that vendor compilers with inline asm,
   or what?)  */

#if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc) \
  && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do { \
    if (__builtin_constant_p (bh) && (bh) == 0) \
      __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \
               : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)); \
    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
      __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \
               : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)); \
    else \
      __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \
               : "=r" (sh), "=&r" (sl) \
               : "r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \
  } while (0)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
    if (__builtin_constant_p (ah) && (ah) == 0) \
      __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \
               : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)); \
    else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0) \
      __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \
               : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)); \
    else if (__builtin_constant_p (bh) && (bh) == 0) \
      __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \
               : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)); \
    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
      __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \
               : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)); \
    else \
      __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \
               : "=r" (sh), "=&r" (sl) \
               : "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \
  } while (0)
#define count_leading_zeros(count, x) \
  __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x))
#define COUNT_LEADING_ZEROS_0 32
#if HAVE_HOST_CPU_FAMILY_powerpc
#if __GMP_GNUC_PREREQ (4,4)
#define umul_ppmm(w1, w0, u, v) \
  do { \
    UDItype __ll = (UDItype)(u) * (v); \
    w1 = __ll >> 32; \
    w0 = __ll; \
  } while (0)
#endif
#if !defined (umul_ppmm)
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    USItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
    (pl) = __m0 * __m1; \
  } while (0)
#endif
#define UMUL_TIME 15
#define smul_ppmm(ph, pl, m0, m1) \
  do { \
    SItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
    (pl) = __m0 * __m1; \
  } while (0)
#define SMUL_TIME 14
#define UDIV_TIME 120
#else
#define UMUL_TIME 8
#define smul_ppmm(xh, xl, m0, m1) \
  __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
#define SMUL_TIME 4
#define sdiv_qrnnd(q, r, nh, nl, d) \
  __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
#define UDIV_TIME 100
#endif
#endif /* 32-bit POWER architecture variants.  */

/* We should test _IBMR2 here when we add assembly support for the system
   vendor compilers.  */
#if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
#if !defined (_LONG_LONG_LIMB)
/* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values.  So
   use adde etc only when not _LONG_LONG_LIMB.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do { \
    if (__builtin_constant_p (bh) && (bh) == 0) \
      __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \
               : "=r" (sh), "=&r" (sl) \
               : "r" ((UDItype)(ah)), \
                 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl))); \
    else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
      __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \
               : "=r" (sh), "=&r" (sl) \
               : "r" ((UDItype)(ah)), \
                 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl))); \
    else \
      __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \
               : "=r" (sh), "=&r" (sl) \
               : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
                 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl))); \
  } while (0)
/* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
   This might seem strange, but gcc folds away the dead code late.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
    if (__builtin_constant_p (bl) && bl > -0x8000 && bl <= 0x8000) { \
        if (__builtin_constant_p (ah) && (ah) == 0) \
          __asm__ ("addic %1,%3,%4\n\tsubfze %0,%2" \
                   : "=r" (sh), "=&r" (sl) \
                   : "r" ((UDItype)(bh)), \
                     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))); \
        else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \
          __asm__ ("addic %1,%3,%4\n\tsubfme %0,%2" \
                   : "=r" (sh), "=&r" (sl) \
                   : "r" ((UDItype)(bh)), \
                     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))); \
        else if (__builtin_constant_p (bh) && (bh) == 0) \
          __asm__ ("addic %1,%3,%4\n\taddme %0,%2" \
                   : "=r" (sh), "=&r" (sl) \
                   : "r" ((UDItype)(ah)), \
                     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))); \
        else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
          __asm__ ("addic %1,%3,%4\n\taddze %0,%2" \
                   : "=r" (sh), "=&r" (sl) \
                   : "r" ((UDItype)(ah)), \
                     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))); \
        else \
          __asm__ ("addic %1,%4,%5\n\tsubfe %0,%3,%2" \
                   : "=r" (sh), "=&r" (sl) \
                   : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
                     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))); \
    } else { \
        if (__builtin_constant_p (ah) && (ah) == 0) \
          __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \
                   : "=r" (sh), "=&r" (sl) \
                   : "r" ((UDItype)(bh)), \
                     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))); \
        else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \
          __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \
                   : "=r" (sh), "=&r" (sl) \
                   : "r" ((UDItype)(bh)), \
                     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))); \
        else if (__builtin_constant_p (bh) && (bh) == 0) \
          __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \
                   : "=r" (sh), "=&r" (sl) \
                   : "r" ((UDItype)(ah)), \
                     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))); \
        else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
          __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \
                   : "=r" (sh), "=&r" (sl) \
                   : "r" ((UDItype)(ah)), \
                     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))); \
        else \
          __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \
                   : "=r" (sh), "=&r" (sl) \
                   : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
                     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))); \
    } \
  } while (0)
#endif /* ! _LONG_LONG_LIMB */
#define count_leading_zeros(count, x) \
  __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
#define COUNT_LEADING_ZEROS_0 64
#if 0 && __GMP_GNUC_PREREQ (4,4) /* Disable, this results in libcalls! */
#define umul_ppmm(w1, w0, u, v) \
  do { \
    typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
    __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
    w1 = __ll >> 64; \
    w0 = __ll; \
  } while (0)
#endif
#if !defined (umul_ppmm)
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UDItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
    (pl) = __m0 * __m1; \
  } while (0)
#endif
#define UMUL_TIME 15
#define smul_ppmm(ph, pl, m0, m1) \
  do { \
    DItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
    (pl) = __m0 * __m1; \
  } while (0)
#define SMUL_TIME 14  /* ??? */
#define UDIV_TIME 120 /* ??? */
#endif /* 64-bit PowerPC.  */

#if defined (__pyr__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addw %5,%1\n\taddwc %3,%0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subw %5,%1\n\tsubwb %3,%0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP.  */
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll; \
	   struct {USItype __h, __l;} __i; \
	  } __x; \
  __asm__ ("movw %1,%R0\n\tuemul %2,%0" \
	   : "=&r" (__x.__ll) \
	   : "g" ((USItype) (u)), "g" ((USItype)(v))); \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#endif /* __pyr__ */

#if defined (__ibm032__) /* RT/ROMP */ && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("a %1,%5\n\tae %0,%3" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
	     "%1" ((USItype)(al)), "r" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("s %1,%5\n\tse %0,%3" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
	     "1" ((USItype)(al)), "r" ((USItype)(bl)))
#define smul_ppmm(ph, pl, m0, m1) \
  __asm__ ( \
       "s r2,r2\n" \
"	mts r10,%2\n" \
"	m r2,%3\n" \
"	m r2,%3\n" \
"	m r2,%3\n" \
"	m r2,%3\n" \
"	m r2,%3\n" \
"	m r2,%3\n" \
"	m r2,%3\n" \
"	m r2,%3\n" \
"	m r2,%3\n" \
"	m r2,%3\n" \
"	m r2,%3\n" \
"	m r2,%3\n" \
"	m r2,%3\n" \
"	m r2,%3\n" \
"	m r2,%3\n" \
"	m r2,%3\n" \
"	cas %0,r2,r0\n" \
"	mfs r10,%1" \
	   : "=r" (ph), "=r" (pl) \
	   : "%r" ((USItype)(m0)), "r" ((USItype)(m1)) \
	   : "r2")
#define UMUL_TIME 20
#define UDIV_TIME 200
#define count_leading_zeros(count, x) \
  do { \
    if ((x) >= 0x10000) \
      __asm__ ("clz %0,%1" \
	       : "=r" (count) : "r" ((USItype)(x) >> 16)); \
    else \
      { \
	__asm__ ("clz %0,%1" \
		 : "=r" (count) : "r" ((USItype)(x))); \
	(count) += 16; \
      } \
  } while (0)
#endif /* RT/ROMP */
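/* Editorial sketch, not from GMP: the RT/ROMP count_leading_zeros above
   builds a 32-bit count from a halfword primitive: for x >= 2^16 the
   leading zeros of x are those of its high halfword, otherwise they are
   16 plus those of its low halfword.  Hypothetical model, with __clz16
   standing in for a 16-bit clz, never compiled: */
#if 0
static int
__clz16 (USItype y)       /* assumes 0 < y < 0x10000 */
{
  int c = 0;
  while ((y & 0x8000) == 0)
    {
      y <<= 1;
      c++;
    }
  return c;
}
static int
__clz32_model (USItype x) /* assumes x != 0 */
{
  return x >= 0x10000 ? __clz16 (x >> 16) : 16 + __clz16 (x & 0xffff);
}
#endif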

#if (defined (__SH2__) || defined (__SH3__) || defined (__SH4__)) && W_TYPE_SIZE == 32
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0" \
	   : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
#define UMUL_TIME 5
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \
	   __CLOBBER_CC)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \
	   __CLOBBER_CC)
/* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
   doesn't define anything to indicate that to us, it only sets __sparcv8.  */
#if defined (__sparc_v9__) || defined (__sparcv9)
/* Perhaps we should use floating-point operations here?  */
#if 0
/* Triggers a bug making mpz/tests/t-gcd.c fail.
   Perhaps we simply need to explicitly zero-extend the inputs?  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" : \
	   "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
#else
/* Use v8 umul until above bug is fixed.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#endif
/* Use a plain v8 divide for v9.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    USItype __q; \
    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
    (r) = (n0) - __q * (d); \
    (q) = __q; \
  } while (0)
#else
#if defined (__sparc_v8__)   /* gcc normal */ \
  || defined (__sparcv8)     /* gcc solaris */ \
  || HAVE_HOST_CPU_supersparc
/* Don't match immediate range because, 1) it is not often useful,
   2) the 'I' flag thinks of the range as a 13 bit signed interval,
   while we want to match a 13 bit interval, sign extended to 32 bits,
   but INTERPRETED AS UNSIGNED.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#define UMUL_TIME 5

#if HAVE_HOST_CPU_supersparc
#define UDIV_TIME 60 /* SuperSPARC timing */
#else
/* Don't use this on SuperSPARC because its udiv only handles 53 bit
   dividends and will trap to the kernel for the rest.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    USItype __q; \
    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
    (r) = (n0) - __q * (d); \
    (q) = __q; \
  } while (0)
#define UDIV_TIME 25
#endif /* HAVE_HOST_CPU_supersparc */

#else /* ! __sparc_v8__ */
#if defined (__sparclite__)
/* This has hardware multiply but not divide.  It also has two additional
   instructions scan (ffs from high bit) and divscc.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#define UMUL_TIME 5
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("! Inlined udiv_qrnnd\n" \
"	wr %%g0,%2,%%y ! Not a delayed write for sparclite\n" \
"	tst %%g0\n" \
"	divscc %3,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%0\n" \
"	rd %%y,%1\n" \
"	bl,a 1f\n" \
"	add %1,%4,%1\n" \
"1:	! End of inline udiv_qrnnd" \
	   : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d) \
	   : "%g1" __AND_CLOBBER_CC)
#define UDIV_TIME 37
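/* Editorial sketch, not from GMP: the v8/v9 udiv_qrnnd variants above
   recover the remainder with one-word arithmetic as (n0) - __q * (d).
   That is exact because the precondition n1 < d makes the true quotient
   and remainder fit in one word, and the expression is congruent to
   n - q*d modulo 2^32.  Hypothetical check in double-word arithmetic,
   never compiled: */
#if 0
static int
__check_low_word_remainder (UDItype n, USItype d, USItype q)
{
  /* q is assumed to be the true quotient n / d, with n / d < 2^32.  */
  USItype r_low = (USItype) n - q * d;              /* one-word computation */
  return r_low == (USItype) (n - (UDItype) q * d);  /* true remainder */
}
#endif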
#define count_leading_zeros(count, x) \
  __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
/* Early sparclites return 63 for an argument of 0, but they warn that future
   implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
   undefined.  */
#endif /* __sparclite__ */
#endif /* __sparc_v8__ */
#endif /* __sparc_v9__ */
/* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd.  */
#ifndef umul_ppmm
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("! Inlined umul_ppmm\n" \
"	wr %%g0,%2,%%y ! SPARC has 0-3 delay insn after a wr\n" \
"	sra %3,31,%%g2 ! Don't move this insn\n" \
"	and %2,%%g2,%%g2 ! Don't move this insn\n" \
"	andcc %%g0,0,%%g1 ! Don't move this insn\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,0,%%g1\n" \
"	add %%g1,%%g2,%0\n" \
"	rd %%y,%1" \
	   : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v) \
	   : "%g1", "%g2" __AND_CLOBBER_CC)
#define UMUL_TIME 39 /* 39 instructions */
#endif
#ifndef udiv_qrnnd
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __r; \
    (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
    (r) = __r; \
  } while (0)
extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
#ifndef UDIV_TIME
#define UDIV_TIME 140
#endif
#endif /* LONGLONG_STANDALONE */
#endif /* udiv_qrnnd */
#endif /* __sparc__ */

#if defined (__sparc__) && W_TYPE_SIZE == 64
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ( \
       "addcc %r4,%5,%1\n" \
      "	addccc %r6,%7,%%g0\n" \
      "	addc %r2,%3,%0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)), \
	     "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)), \
	     "%rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32) \
	   __CLOBBER_CC)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ( \
       "subcc %r4,%5,%1\n" \
      "	subccc %r6,%7,%%g0\n" \
      "	subc %r2,%3,%0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)), \
	     "rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)), \
	     "rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32) \
	   __CLOBBER_CC)
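/* Editorial sketch, not from GMP: the three-instruction sequences above
   reconstruct the carry (or borrow) of the full 64-bit low-word addition
   from 32-bit condition codes by rippling through the halfwords:
   the carry out of al + bl equals the carry out of
   (al >> 32) + (bl >> 32) + carry(low halves).  Hypothetical check,
   never compiled: */
#if 0
static int
__check_ripple_carry (UDItype al, UDItype bl)
{
  UDItype lo = (al & 0xffffffff) + (bl & 0xffffffff);
  UDItype hi = (al >> 32) + (bl >> 32) + (lo >> 32);
  int carry64 = (UDItype) (al + bl) < al;
  return carry64 == (int) (hi >> 32);
}
#endif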
%r2, %r3, %0" \ 1848 : "=r" (sh), "=&r" (sl) \ 1849 : "rJ" ((UDItype)(ah)), "rJ" ((UDItype)(bh)), \ 1850 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC) 1851 #define umul_ppmm(ph, pl, m0, m1) \ 1852 do { \ 1853 UDItype __m0 = (m0), __m1 = (m1); \ 1854 (pl) = __m0 * __m1; \ 1855 __asm__ ("umulxhi\t%2, %1, %0" \ 1856 : "=r" (ph) \ 1857 : "%r" (__m0), "r" (__m1)); \ 1858 } while (0) 1859 #define count_leading_zeros(count, x) \ 1860 __asm__ ("lzd\t%1,%0" : "=r" (count) : "r" (x)) 1861 /* Needed by count_leading_zeros_32 in sparc64.h. */ 1862 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB 1863 #endif 1864 #endif 1865 1866 #if (defined (__vax) || defined (__vax__)) && W_TYPE_SIZE == 32 1867 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1868 __asm__ ("addl2 %5,%1\n\tadwc %3,%0" \ 1869 : "=g" (sh), "=&g" (sl) \ 1870 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 1871 "%1" ((USItype)(al)), "g" ((USItype)(bl))) 1872 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1873 __asm__ ("subl2 %5,%1\n\tsbwc %3,%0" \ 1874 : "=g" (sh), "=&g" (sl) \ 1875 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 1876 "1" ((USItype)(al)), "g" ((USItype)(bl))) 1877 #define smul_ppmm(xh, xl, m0, m1) \ 1878 do { \ 1879 union {UDItype __ll; \ 1880 struct {USItype __l, __h;} __i; \ 1881 } __x; \ 1882 USItype __m0 = (m0), __m1 = (m1); \ 1883 __asm__ ("emul %1,%2,$0,%0" \ 1884 : "=g" (__x.__ll) : "g" (__m0), "g" (__m1)); \ 1885 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ 1886 } while (0) 1887 #define sdiv_qrnnd(q, r, n1, n0, d) \ 1888 do { \ 1889 union {DItype __ll; \ 1890 struct {SItype __l, __h;} __i; \ 1891 } __x; \ 1892 __x.__i.__h = n1; __x.__i.__l = n0; \ 1893 __asm__ ("ediv %3,%2,%0,%1" \ 1894 : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d)); \ 1895 } while (0) 1896 #if 0 1897 /* FIXME: This instruction appears to be unimplemented on some systems (vax 1898 8800 maybe). */ 1899 #define count_trailing_zeros(count,x) \ 1900 do { \ 1901 __asm__ ("ffs 0, 31, %1, %0" \ 1902 : "=g" (count) \ 1903 : "g" ((USItype) (x))); \ 1904 } while (0) 1905 #endif 1906 #endif /* vax */ 1907 1908 #if defined (__z8000__) && W_TYPE_SIZE == 16 1909 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1910 __asm__ ("add %H1,%H5\n\tadc %H0,%H3" \ 1911 : "=r" (sh), "=&r" (sl) \ 1912 : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \ 1913 "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl))) 1914 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1915 __asm__ ("sub %H1,%H5\n\tsbc %H0,%H3" \ 1916 : "=r" (sh), "=&r" (sl) \ 1917 : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \ 1918 "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl))) 1919 #define umul_ppmm(xh, xl, m0, m1) \ 1920 do { \ 1921 union {long int __ll; \ 1922 struct {unsigned int __h, __l;} __i; \ 1923 } __x; \ 1924 unsigned int __m0 = (m0), __m1 = (m1); \ 1925 __asm__ ("mult %S0,%H3" \ 1926 : "=r" (__x.__i.__h), "=r" (__x.__i.__l) \ 1927 : "%1" (m0), "rQR" (m1)); \ 1928 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ 1929 (xh) += ((((signed int) __m0 >> 15) & __m1) \ 1930 + (((signed int) __m1 >> 15) & __m0)); \ 1931 } while (0) 1932 #endif /* __z8000__ */ 1933 1934 #endif /* __GNUC__ */ 1935 1936 #endif /* NO_ASM */ 1937 1938 1939 /* FIXME: "sidi" here is highly doubtful, should sometimes be "diti". 

/* FIXME: "sidi" here is highly doubtful, should sometimes be "diti".  */
#if !defined (umul_ppmm) && defined (__umulsidi3)
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UDWtype __ll = __umulsidi3 (m0, m1); \
    ph = (UWtype) (__ll >> W_TYPE_SIZE); \
    pl = (UWtype) __ll; \
  } while (0)
#endif

#if !defined (__umulsidi3)
#define __umulsidi3(u, v) \
  ({UWtype __hi, __lo; \
    umul_ppmm (__hi, __lo, u, v); \
    ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
#endif


#if defined (__cplusplus)
#define __longlong_h_C "C"
#else
#define __longlong_h_C
#endif

/* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist.  The "_r"
   forms have "reversed" arguments, meaning the pointer is last, which
   sometimes allows better parameter passing, in particular on 64-bit
   hppa.  */

#define mpn_umul_ppmm  __MPN(umul_ppmm)
extern __longlong_h_C UWtype mpn_umul_ppmm (UWtype *, UWtype, UWtype);

#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm \
  && ! defined (LONGLONG_STANDALONE)
#define umul_ppmm(wh, wl, u, v) \
  do { \
    UWtype __umul_ppmm__p0; \
    (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v)); \
    (wl) = __umul_ppmm__p0; \
  } while (0)
#endif

#define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
extern __longlong_h_C UWtype mpn_umul_ppmm_r (UWtype, UWtype, UWtype *);

#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r \
  && ! defined (LONGLONG_STANDALONE)
#define umul_ppmm(wh, wl, u, v) \
  do { \
    UWtype __umul_p0; \
    (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_p0); \
    (wl) = __umul_p0; \
  } while (0)
#endif

#define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
extern __longlong_h_C UWtype mpn_udiv_qrnnd (UWtype *, UWtype, UWtype, UWtype);

#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd \
  && ! defined (LONGLONG_STANDALONE)
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    UWtype __udiv_qrnnd_r; \
    (q) = mpn_udiv_qrnnd (&__udiv_qrnnd_r, \
			  (UWtype) (n1), (UWtype) (n0), (UWtype) (d)); \
    (r) = __udiv_qrnnd_r; \
  } while (0)
#endif

#define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
extern __longlong_h_C UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, UWtype *);

#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r \
  && ! defined (LONGLONG_STANDALONE)
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    UWtype __udiv_qrnnd_r; \
    (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) (d), \
			    &__udiv_qrnnd_r); \
    (r) = __udiv_qrnnd_r; \
  } while (0)
#endif


/* If this machine has no inline assembler, use C macros.  */

#if !defined (add_ssaaaa)
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do { \
    UWtype __x; \
    __x = (al) + (bl); \
    (sh) = (ah) + (bh) + (__x < (al)); \
    (sl) = __x; \
  } while (0)
#endif

#if !defined (sub_ddmmss)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
    UWtype __x; \
    __x = (al) - (bl); \
    (sh) = (ah) - (bh) - ((al) < (bl)); \
    (sl) = __x; \
  } while (0)
#endif
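/* Editorial sketch, not from GMP: the generic add_ssaaaa above detects the
   carry out of the low words with (__x < (al)): in W-bit unsigned
   arithmetic, al + bl wraps to a sum smaller than either addend exactly
   when the true sum is >= 2^W.  Hypothetical check against double-word
   arithmetic, never compiled: */
#if 0
static int
__check_carry_detect (UWtype al, UWtype bl)
{
  UWtype x = al + bl;   /* wraps modulo 2^W */
  int carry = x < al;   /* the (__x < (al)) test above */
  return carry == (int) (((UDWtype) al + bl) >> W_TYPE_SIZE);
}
#endif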

/* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
   smul_ppmm.  */
#if !defined (umul_ppmm) && defined (smul_ppmm)
#define umul_ppmm(w1, w0, u, v) \
  do { \
    UWtype __w1; \
    UWtype __xm0 = (u), __xm1 = (v); \
    smul_ppmm (__w1, w0, __xm0, __xm1); \
    (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
	 + (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
  } while (0)
#endif

/* If we still don't have umul_ppmm, define it using plain C.

   For reference, when this code is used for squaring (ie. u and v identical
   expressions), gcc recognises __x1 and __x2 are the same and generates 3
   multiplies, not 4.  The subsequent additions could be optimized a bit,
   but the only place GMP currently uses such a square is mpn_sqr_basecase,
   and chips obliged to use this generic C umul will have plenty of worse
   performance problems than a couple of extra instructions on the diagonal
   of sqr_basecase.  */

#if !defined (umul_ppmm)
#define umul_ppmm(w1, w0, u, v) \
  do { \
    UWtype __x0, __x1, __x2, __x3; \
    UHWtype __ul, __vl, __uh, __vh; \
    UWtype __u = (u), __v = (v); \
 \
    __ul = __ll_lowpart (__u); \
    __uh = __ll_highpart (__u); \
    __vl = __ll_lowpart (__v); \
    __vh = __ll_highpart (__v); \
 \
    __x0 = (UWtype) __ul * __vl; \
    __x1 = (UWtype) __ul * __vh; \
    __x2 = (UWtype) __uh * __vl; \
    __x3 = (UWtype) __uh * __vh; \
 \
    __x1 += __ll_highpart (__x0); /* this can't give carry */ \
    __x1 += __x2;                 /* but this indeed can */ \
    if (__x1 < __x2)              /* did we get it? */ \
      __x3 += __ll_B;             /* yes, add it in the proper pos.  */ \
 \
    (w1) = __x3 + __ll_highpart (__x1); \
    (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0); \
  } while (0)
#endif

/* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
   exist in one form or another).  */
#if !defined (smul_ppmm)
#define smul_ppmm(w1, w0, u, v) \
  do { \
    UWtype __w1; \
    UWtype __xm0 = (u), __xm1 = (v); \
    umul_ppmm (__w1, w0, __xm0, __xm1); \
    (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
	 - (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
  } while (0)
#endif

/* Define this unconditionally, so it can be used for debugging.  */
#define __udiv_qrnnd_c(q, r, n1, n0, d) \
  do { \
    UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \
 \
    ASSERT ((d) != 0); \
    ASSERT ((n1) < (d)); \
 \
    __d1 = __ll_highpart (d); \
    __d0 = __ll_lowpart (d); \
 \
    __q1 = (n1) / __d1; \
    __r1 = (n1) - __q1 * __d1; \
    __m = __q1 * __d0; \
    __r1 = __r1 * __ll_B | __ll_highpart (n0); \
    if (__r1 < __m) \
      { \
	__q1--, __r1 += (d); \
	if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */ \
	  if (__r1 < __m) \
	    __q1--, __r1 += (d); \
      } \
    __r1 -= __m; \
 \
    __q0 = __r1 / __d1; \
    __r0 = __r1 - __q0 * __d1; \
    __m = __q0 * __d0; \
    __r0 = __r0 * __ll_B | __ll_lowpart (n0); \
    if (__r0 < __m) \
      { \
	__q0--, __r0 += (d); \
	if (__r0 >= (d)) \
	  if (__r0 < __m) \
	    __q0--, __r0 += (d); \
      } \
    __r0 -= __m; \
 \
    (q) = __q1 * __ll_B | __q0; \
    (r) = __r0; \
  } while (0)
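/* Editorial sketch, not from GMP: the contract of __udiv_qrnnd_c is that,
   given n1 < d, it produces q and r with n1*2^W + n0 == q*d + r and
   r < d; when it is installed as udiv_qrnnd below, UDIV_NEEDS_NORMALIZATION
   is set to 1, so callers must additionally arrange for the high bit of d
   to be set.  Hypothetical check using a double word, never compiled: */
#if 0
static int
__check_udiv_qrnnd_c (UWtype n1, UWtype n0, UWtype d)
{
  UWtype q, r;
  UDWtype n = ((UDWtype) n1 << W_TYPE_SIZE) | n0;
  __udiv_qrnnd_c (q, r, n1, n0, d);
  return r < d && n == (UDWtype) q * d + r;
}
#endif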

/* If the processor has no udiv_qrnnd but has sdiv_qrnnd, go through
   __udiv_w_sdiv (defined in libgcc or elsewhere).  */
#if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
#define udiv_qrnnd(q, r, nh, nl, d) \
  do { \
    UWtype __r; \
    (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d); \
    (r) = __r; \
  } while (0)
__GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
#endif

/* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
#if !defined (udiv_qrnnd)
#define UDIV_NEEDS_NORMALIZATION 1
#define udiv_qrnnd __udiv_qrnnd_c
#endif

#if !defined (count_leading_zeros)
#define count_leading_zeros(count, x) \
  do { \
    UWtype __xr = (x); \
    UWtype __a; \
 \
    if (W_TYPE_SIZE == 32) \
      { \
	__a = __xr < ((UWtype) 1 << 2*__BITS4) \
	  ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1) \
	  : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1 \
	     : 3*__BITS4 + 1); \
      } \
    else \
      { \
	for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8) \
	  if (((__xr >> __a) & 0xff) != 0) \
	    break; \
	++__a; \
      } \
 \
    (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a]; \
  } while (0)
/* This version gives a well-defined value for zero.  */
#define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#define COUNT_LEADING_ZEROS_SLOW
#endif

/* clz_tab is needed by mpn/x86/pentium/mod_1.asm in a fat binary.  */
#if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif

#ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
extern const unsigned char __GMP_DECLSPEC __clz_tab[129];
#endif

#if !defined (count_trailing_zeros)
#if !defined (COUNT_LEADING_ZEROS_SLOW)
/* Define count_trailing_zeros using an asm count_leading_zeros.  */
#define count_trailing_zeros(count, x) \
  do { \
    UWtype __ctz_x = (x); \
    UWtype __ctz_c; \
    ASSERT (__ctz_x != 0); \
    count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x); \
    (count) = W_TYPE_SIZE - 1 - __ctz_c; \
  } while (0)
#else
/* Define count_trailing_zeros in plain C, assuming small counts are common.
   We use clz_tab without ado, since the C count_leading_zeros above will have
   pulled it in.  */
#define count_trailing_zeros(count, x) \
  do { \
    UWtype __ctz_x = (x); \
    int __ctz_c; \
 \
    if (LIKELY ((__ctz_x & 0xff) != 0)) \
      (count) = __clz_tab[__ctz_x & -__ctz_x] - 2; \
    else \
      { \
	for (__ctz_c = 8 - 2; __ctz_c < W_TYPE_SIZE - 2; __ctz_c += 8) \
	  { \
	    __ctz_x >>= 8; \
	    if (LIKELY ((__ctz_x & 0xff) != 0)) \
	      break; \
	  } \
 \
	(count) = __ctz_c + __clz_tab[__ctz_x & -__ctz_x]; \
      } \
  } while (0)
#endif
#endif

#ifndef UDIV_NEEDS_NORMALIZATION
#define UDIV_NEEDS_NORMALIZATION 0
#endif

/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and
   hence that the latter should always be used.  */
#ifndef UDIV_PREINV_ALWAYS
#define UDIV_PREINV_ALWAYS 0
#endif

/* Give defaults for UMUL_TIME and UDIV_TIME.  */
#ifndef UMUL_TIME
#define UMUL_TIME 1
#endif

#ifndef UDIV_TIME
#define UDIV_TIME UMUL_TIME
#endif
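/* Editorial sketch, not from GMP: both count_trailing_zeros fallbacks above
   rely on x & -x isolating the lowest set bit of x, so that for x != 0,
   ctz (x) == W_TYPE_SIZE - 1 - clz (x & -x).  Hypothetical check, never
   compiled: */
#if 0
static int
__check_ctz_identity (UWtype x)
{
  UWtype c1, c2;
  if (x == 0)
    return 1;
  count_trailing_zeros (c1, x);
  count_leading_zeros (c2, x & -x);
  return c1 == W_TYPE_SIZE - 1 - c2;
}
#endif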