1 /* longlong.h -- definitions for mixed size 32/64 bit arithmetic. 2 3 Copyright 1991-1994, 1996, 1997, 1999-2005, 2007-2009, 2011-2023 Free Software Foundation, Inc. 4 5 This file is part of the GNU MPFR Library and has been copied from 6 GNU MP 18339:32dc4af70f95, with the following changes: 7 * the copyright notice (note: only LGPL 3+ is used in MPFR); 8 * the code declared as added for MPFR just below these comments; 9 * __GMP_DECLSPEC renamed to __MPFR_DECLSPEC. 10 11 The GNU MPFR Library is free software; you can redistribute it and/or modify 12 it under the terms of the GNU Lesser General Public License as published by 13 the Free Software Foundation; either version 3 of the License, or (at your 14 option) any later version. 15 16 The GNU MPFR Library is distributed in the hope that it will be useful, but 17 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 18 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 19 License for more details. 20 21 You should have received a copy of the GNU Lesser General Public License 22 along with the GNU MPFR Library; see the file COPYING.LESSER. If not, see 23 https://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 24 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ 25 26 /* You have to define the following before including this file: 27 28 UWtype -- An unsigned type, default type for operations (typically a "word") 29 UHWtype -- An unsigned type, at least half the size of UWtype 30 UDWtype -- An unsigned type, at least twice as large as UWtype 31 W_TYPE_SIZE -- size in bits of UWtype 32 33 SItype, USItype -- Signed and unsigned 32 bit types 34 DItype, UDItype -- Signed and unsigned 64 bit types 35 36 On a 32 bit machine UWtype should typically be USItype; 37 on a 64 bit machine, UWtype should typically be UDItype. 38 39 Optionally, define: 40 41 LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files 42 NO_ASM -- Disable inline asm 43 44 45 CAUTION! Using this version of longlong.h outside of GMP is not safe. You 46 need to include gmp.h and gmp-impl.h, or certain things might not work as 47 expected. 48 */ 49 50 /* Code added for MPFR */ 51 52 #ifndef MPFR_NEED_LONGLONG_H 53 # error "Never include mpfr-longlong.h directly; define MPFR_NEED_LONGLONG_H instead." 54 #endif 55 56 #ifndef __GMP_GNUC_PREREQ 57 # define __GMP_GNUC_PREREQ(X,Y) __MPFR_GNUC(X,Y) 58 #endif 59 60 /* End of code added for MPFR */ 61 62 #define __BITS4 (W_TYPE_SIZE / 4) 63 #define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2)) 64 #define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1)) 65 #define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2)) 66 67 /* This is used to make sure no undesirable sharing between different libraries 68 that use this file takes place. */ 69 #ifndef __MPN 70 #define __MPN(x) __##x 71 #endif 72 73 /* Define auxiliary asm macros. 74 75 1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two 76 UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype 77 word product in HIGH_PROD and LOW_PROD. 78 79 2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a 80 UDWtype product. This is just a variant of umul_ppmm. 81 82 3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator, 83 denominator) divides a UDWtype, composed by the UWtype integers 84 HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient 85 in QUOTIENT and the remainder in REMAINDER.
HIGH_NUMERATOR must be less 86 than DENOMINATOR for correct operation. If, in addition, the macro requires 87 the most significant bit of DENOMINATOR to be 1, then the pre-processor symbol 88 UDIV_NEEDS_NORMALIZATION is defined to 1. 89 90 4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator, 91 denominator). Like udiv_qrnnd but the numbers are signed. The quotient 92 is rounded towards 0. 93 94 5) count_leading_zeros(count, x) counts the number of zero-bits from the 95 msb to the first non-zero bit in the UWtype X. This is the number of 96 steps X needs to be shifted left to set the msb. Undefined for X == 0, 97 unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value. 98 99 6) count_trailing_zeros(count, x) like count_leading_zeros, but counts 100 from the least significant end. 101 102 7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1, 103 high_addend_2, low_addend_2) adds two UWtype integers, composed by 104 HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2 105 respectively. The result is placed in HIGH_SUM and LOW_SUM. Overflow 106 (i.e. carry out) is not stored anywhere, and is lost. 107 108 8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend, 109 high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers, 110 composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and 111 LOW_SUBTRAHEND respectively. The result is placed in HIGH_DIFFERENCE 112 and LOW_DIFFERENCE. Overflow (i.e. borrow out) is not stored anywhere, 113 and is lost. 114 115 If any of these macros are left undefined for a particular CPU, 116 C macros are used. 117 118 119 Notes: 120 121 For add_ssaaaa the two high and two low addends can both commute, but 122 unfortunately gcc only supports one "%" commutative in each asm block. 123 This has always been so but is only documented in recent versions 124 (eg. pre-release 3.3). Having two or more "%"s can cause an internal 125 compiler error in certain rare circumstances. 126 127 Apparently it was only the last "%" that was ever actually respected, so 128 the code has been updated to leave just that. Clearly there's a free 129 choice whether high or low should get it, if there's a reason to favour 130 one over the other. Also obviously when the constraints on the two 131 operands are identical there's no benefit to the reloader in any "%" at 132 all. 133 134 */ 135 136 /* The CPUs come in alphabetical order below. 137 138 Please add support for more CPUs here, or improve the current support 139 for the CPUs below! */ 140 141 142 /* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc 143 3.4 __builtin_clzl or __builtin_clzll, according to our limb size. 144 Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or 145 __builtin_ctzll. 146 147 These builtins are only used when we have checked what code comes out; on some 148 chips they're merely libgcc calls, and in that case we will instead want an 149 inline version (either asm or generic C). 150 151 These builtins are better than an asm block of the same insn, since an 152 asm block doesn't give gcc any information about scheduling or resource 153 usage. We keep an asm block for use on prior versions of gcc though. 154 155 For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but 156 it's not used (for count_leading_zeros) because it generally gives extra 157 code to ensure the result is 0 when the input is 0, which we don't need 158 or want.
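As an informal illustration of the intended semantics (not part of the real code): for any nonzero UWtype x the builtin-based macros below behave like c = 0; while (((x >> (W_TYPE_SIZE - 1 - c)) & 1) == 0) c++; for count_leading_zeros, and like c = 0; while (((x >> c) & 1) == 0) c++; for count_trailing_zeros, so e.g. count_leading_zeros gives W_TYPE_SIZE - 1 for x == 1 and 0 when the msb of x is set.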
*/ 159 160 #ifdef _LONG_LONG_LIMB 161 #define count_leading_zeros_gcc_clz(count,x) \ 162 do { \ 163 ASSERT ((x) != 0); \ 164 (count) = __builtin_clzll (x); \ 165 } while (0) 166 #else 167 #define count_leading_zeros_gcc_clz(count,x) \ 168 do { \ 169 ASSERT ((x) != 0); \ 170 (count) = __builtin_clzl (x); \ 171 } while (0) 172 #endif 173 174 #ifdef _LONG_LONG_LIMB 175 #define count_trailing_zeros_gcc_ctz(count,x) \ 176 do { \ 177 ASSERT ((x) != 0); \ 178 (count) = __builtin_ctzll (x); \ 179 } while (0) 180 #else 181 #define count_trailing_zeros_gcc_ctz(count,x) \ 182 do { \ 183 ASSERT ((x) != 0); \ 184 (count) = __builtin_ctzl (x); \ 185 } while (0) 186 #endif 187 188 189 /* FIXME: The macros using external routines like __MPN(count_leading_zeros) 190 don't need to be under !NO_ASM */ 191 #if ! defined (NO_ASM) 192 193 #if defined (__alpha) && W_TYPE_SIZE == 64 194 /* Most alpha-based machines, except Cray systems. */ 195 #if defined (__GNUC__) 196 #if __GMP_GNUC_PREREQ (3,3) 197 #define umul_ppmm(ph, pl, m0, m1) \ 198 do { \ 199 UDItype __m0 = (m0), __m1 = (m1); \ 200 (ph) = __builtin_alpha_umulh (__m0, __m1); \ 201 (pl) = __m0 * __m1; \ 202 } while (0) 203 #else 204 #define umul_ppmm(ph, pl, m0, m1) \ 205 do { \ 206 UDItype __m0 = (m0), __m1 = (m1); \ 207 __asm__ ("umulh %r1,%2,%0" \ 208 : "=r" (ph) \ 209 : "%rJ" (__m0), "rI" (__m1)); \ 210 (pl) = __m0 * __m1; \ 211 } while (0) 212 #endif 213 #else /* ! __GNUC__ */ 214 #include <machine/builtins.h> 215 #define umul_ppmm(ph, pl, m0, m1) \ 216 do { \ 217 UDItype __m0 = (m0), __m1 = (m1); \ 218 (ph) = __UMULH (__m0, __m1); \ 219 (pl) = __m0 * __m1; \ 220 } while (0) 221 #endif 222 #ifndef LONGLONG_STANDALONE 223 #define udiv_qrnnd(q, r, n1, n0, d) \ 224 do { UWtype __di; \ 225 __di = __MPN(invert_limb) (d); \ 226 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ 227 } while (0) 228 #define UDIV_PREINV_ALWAYS 1 229 #define UDIV_NEEDS_NORMALIZATION 1 230 #endif /* LONGLONG_STANDALONE */ 231 232 /* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm 233 always goes into libgmp.so, even when not actually used. */ 234 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB 235 236 #if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX 237 #define count_leading_zeros(COUNT,X) \ 238 __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X)) 239 #define count_trailing_zeros(COUNT,X) \ 240 __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X)) 241 #endif /* clz/ctz using cix */ 242 243 #if ! defined (count_leading_zeros) \ 244 && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE) 245 /* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0. 246 "$31" is written explicitly in the asm, since an "r" constraint won't 247 select reg 31. There seems no need to worry about "r31" syntax for cray, 248 since gcc itself (pre-release 3.4) emits just $31 in various places. */ 249 #define ALPHA_CMPBGE_0(dst, src) \ 250 do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0) 251 /* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts 252 them, locating the highest non-zero byte. A second __clz_tab lookup 253 counts the leading zero bits in that byte, giving the result. 
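Worked example, for illustration: with x == 1, cmpbge flags bytes 1..7 as zero, giving 0xfe; the first lookup uses index (0xfe >> 1) ^ 0x7f == 0 and yields 1, so the shift is 1*8-7 == 1; the second lookup on x >> 1 == 0 also yields 1, and the count comes out as (65 - 1) - 1 == 63, as expected for a value whose only set bit is bit 0.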
*/ 254 #define count_leading_zeros(count, x) \ 255 do { \ 256 UWtype __clz__b, __clz__c, __clz__x = (x); \ 257 ALPHA_CMPBGE_0 (__clz__b, __clz__x); /* zero bytes */ \ 258 __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F]; /* 8 to 1 byte */ \ 259 __clz__b = __clz__b * 8 - 7; /* 57 to 1 shift */ \ 260 __clz__x >>= __clz__b; \ 261 __clz__c = __clz_tab [__clz__x]; /* 8 to 1 bit */ \ 262 __clz__b = 65 - __clz__b; \ 263 (count) = __clz__b - __clz__c; \ 264 } while (0) 265 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB 266 #endif /* clz using cmpbge */ 267 268 #if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE) 269 #if HAVE_ATTRIBUTE_CONST 270 long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const)); 271 #else 272 long __MPN(count_leading_zeros) (UDItype); 273 #endif 274 #define count_leading_zeros(count, x) \ 275 ((count) = __MPN(count_leading_zeros) (x)) 276 #endif /* clz using mpn */ 277 #endif /* __alpha */ 278 279 #if defined (__AVR) && W_TYPE_SIZE == 8 280 #define umul_ppmm(ph, pl, m0, m1) \ 281 do { \ 282 unsigned short __p = (unsigned short) (m0) * (m1); \ 283 (ph) = __p >> 8; \ 284 (pl) = __p; \ 285 } while (0) 286 #endif /* AVR */ 287 288 #if defined (_CRAY) && W_TYPE_SIZE == 64 289 #include <intrinsics.h> 290 #define UDIV_PREINV_ALWAYS 1 291 #define UDIV_NEEDS_NORMALIZATION 1 292 long __MPN(count_leading_zeros) (UDItype); 293 #define count_leading_zeros(count, x) \ 294 ((count) = _leadz ((UWtype) (x))) 295 #if defined (_CRAYIEEE) /* I.e., Cray T90/ieee, T3D, and T3E */ 296 #define umul_ppmm(ph, pl, m0, m1) \ 297 do { \ 298 UDItype __m0 = (m0), __m1 = (m1); \ 299 (ph) = _int_mult_upper (__m0, __m1); \ 300 (pl) = __m0 * __m1; \ 301 } while (0) 302 #ifndef LONGLONG_STANDALONE 303 #define udiv_qrnnd(q, r, n1, n0, d) \ 304 do { UWtype __di; \ 305 __di = __MPN(invert_limb) (d); \ 306 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ 307 } while (0) 308 #endif /* LONGLONG_STANDALONE */ 309 #endif /* _CRAYIEEE */ 310 #endif /* _CRAY */ 311 312 #if defined (__ia64) && W_TYPE_SIZE == 64 313 /* This form encourages gcc (pre-release 3.4 at least) to emit predicated 314 "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency. The generic 315 code using "al<bl" arithmetically comes out making an actual 0 or 1 in a 316 register, which takes an extra cycle. */ 317 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 318 do { \ 319 UWtype __x; \ 320 __x = (al) - (bl); \ 321 if ((al) < (bl)) \ 322 (sh) = (ah) - (bh) - 1; \ 323 else \ 324 (sh) = (ah) - (bh); \ 325 (sl) = __x; \ 326 } while (0) 327 #if defined (__GNUC__) && ! defined (__INTEL_COMPILER) 328 /* Do both product parts in assembly, since that gives better code with 329 all gcc versions. Some callers will just use the upper part, and in 330 that situation we waste an instruction, but not any cycles. 
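(Background: xma.hu and xma.l are the fixed-point multiply-add instruction forms, executed in the floating-point unit; with f0, which always reads as zero, as the addend they produce the high and the low 64 bits of the full 128-bit product respectively.)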
*/ 331 #define umul_ppmm(ph, pl, m0, m1) \ 332 __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0" \ 333 : "=&f" (ph), "=f" (pl) \ 334 : "f" (m0), "f" (m1)) 335 #define count_leading_zeros(count, x) \ 336 do { \ 337 UWtype _x = (x), _y, _a, _c; \ 338 __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x)); \ 339 __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y)); \ 340 _c = (_a - 1) << 3; \ 341 _x >>= _c; \ 342 if (_x >= 1 << 4) \ 343 _x >>= 4, _c += 4; \ 344 if (_x >= 1 << 2) \ 345 _x >>= 2, _c += 2; \ 346 _c += _x >> 1; \ 347 (count) = W_TYPE_SIZE - 1 - _c; \ 348 } while (0) 349 /* similar to what gcc does for __builtin_ffs, but 0 based rather than 1 350 based, and we don't need a special case for x==0 here */ 351 #define count_trailing_zeros(count, x) \ 352 do { \ 353 UWtype __ctz_x = (x); \ 354 __asm__ ("popcnt %0 = %1" \ 355 : "=r" (count) \ 356 : "r" ((__ctz_x-1) & ~__ctz_x)); \ 357 } while (0) 358 #endif 359 #if defined (__INTEL_COMPILER) 360 #include <ia64intrin.h> 361 #define umul_ppmm(ph, pl, m0, m1) \ 362 do { \ 363 UWtype __m0 = (m0), __m1 = (m1); \ 364 ph = _m64_xmahu (__m0, __m1, 0); \ 365 pl = __m0 * __m1; \ 366 } while (0) 367 #endif 368 #ifndef LONGLONG_STANDALONE 369 #define udiv_qrnnd(q, r, n1, n0, d) \ 370 do { UWtype __di; \ 371 __di = __MPN(invert_limb) (d); \ 372 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ 373 } while (0) 374 #define UDIV_PREINV_ALWAYS 1 375 #define UDIV_NEEDS_NORMALIZATION 1 376 #endif 377 #endif 378 379 380 #if defined (__GNUC__) 381 382 /* We sometimes need to clobber "cc" with gcc2, but that would not be 383 understood by gcc1. Use cpp to avoid major code duplication. */ 384 #if __GNUC__ < 2 385 #define __CLOBBER_CC 386 #define __AND_CLOBBER_CC 387 #else /* __GNUC__ >= 2 */ 388 #define __CLOBBER_CC : "cc" 389 #define __AND_CLOBBER_CC , "cc" 390 #endif /* __GNUC__ < 2 */ 391 392 #if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32 393 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 394 __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3" \ 395 : "=r" (sh), "=&r" (sl) \ 396 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl)) 397 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 398 __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3" \ 399 : "=r" (sh), "=&r" (sl) \ 400 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl)) 401 #define umul_ppmm(xh, xl, m0, m1) \ 402 do { \ 403 USItype __m0 = (m0), __m1 = (m1); \ 404 __asm__ ("multiplu %0,%1,%2" \ 405 : "=r" (xl) \ 406 : "r" (__m0), "r" (__m1)); \ 407 __asm__ ("multmu %0,%1,%2" \ 408 : "=r" (xh) \ 409 : "r" (__m0), "r" (__m1)); \ 410 } while (0) 411 #define udiv_qrnnd(q, r, n1, n0, d) \ 412 __asm__ ("dividu %0,%3,%4" \ 413 : "=r" (q), "=q" (r) \ 414 : "1" (n1), "r" (n0), "r" (d)) 415 #define count_leading_zeros(count, x) \ 416 __asm__ ("clz %0,%1" \ 417 : "=r" (count) \ 418 : "r" (x)) 419 #define COUNT_LEADING_ZEROS_0 32 420 #endif /* __a29k__ */ 421 422 #if defined (__arc__) 423 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 424 __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3" \ 425 : "=r" (sh), \ 426 "=&r" (sl) \ 427 : "r" ((USItype) (ah)), \ 428 "rICal" ((USItype) (bh)), \ 429 "%r" ((USItype) (al)), \ 430 "rICal" ((USItype) (bl))) 431 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 432 __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ 433 : "=r" (sh), \ 434 "=&r" (sl) \ 435 : "r" ((USItype) (ah)), \ 436 "rICal" ((USItype) (bh)), \ 437 "r" ((USItype) (al)), \ 438 "rICal" ((USItype) (bl))) 439 #endif 440 441 #if defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) \ 442 && W_TYPE_SIZE == 32 443 #define 
add_ssaaaa(sh, sl, ah, al, bh, bl) \ 444 do { \ 445 if (__builtin_constant_p (bl) && -(USItype)(bl) < (USItype)(bl)) \ 446 __asm__ ("subs\t%1, %4, %5\n\tadc\t%0, %2, %3" \ 447 : "=r" (sh), "=&r" (sl) \ 448 : "r" (ah), "rI" (bh), \ 449 "%r" (al), "rI" (-(USItype)(bl)) __CLOBBER_CC); \ 450 else \ 451 __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3" \ 452 : "=r" (sh), "=&r" (sl) \ 453 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC); \ 454 } while (0) 455 /* FIXME: Extend the immediate range for the low word by using both ADDS and 456 SUBS, since they set carry in the same way. We need separate definitions 457 for thumb and non-thumb since thumb lacks RSC. */ 458 #if defined (__thumb__) 459 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 460 do { \ 461 if (__builtin_constant_p (ah) && __builtin_constant_p (bh) \ 462 && (ah) == (bh)) \ 463 __asm__ ("subs\t%1, %2, %3\n\tsbc\t%0, %0, %0" \ 464 : "=r" (sh), "=r" (sl) \ 465 : "r" (al), "rI" (bl) __CLOBBER_CC); \ 466 else if (__builtin_constant_p (al)) \ 467 __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \ 468 : "=r" (sh), "=&r" (sl) \ 469 : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \ 470 else \ 471 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ 472 : "=r" (sh), "=&r" (sl) \ 473 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ 474 } while (0) 475 #else 476 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 477 do { \ 478 if (__builtin_constant_p (ah) && __builtin_constant_p (bh) \ 479 && (ah) == (bh)) \ 480 __asm__ ("subs\t%1, %2, %3\n\tsbc\t%0, %0, %0" \ 481 : "=r" (sh), "=r" (sl) \ 482 : "r" (al), "rI" (bl) __CLOBBER_CC); \ 483 else if (__builtin_constant_p (al)) \ 484 { \ 485 if (__builtin_constant_p (ah)) \ 486 __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \ 487 : "=r" (sh), "=&r" (sl) \ 488 : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \ 489 else \ 490 __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \ 491 : "=r" (sh), "=&r" (sl) \ 492 : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \ 493 } \ 494 else if (__builtin_constant_p (ah)) \ 495 { \ 496 if (__builtin_constant_p (bl)) \ 497 __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \ 498 : "=r" (sh), "=&r" (sl) \ 499 : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ 500 else \ 501 __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \ 502 : "=r" (sh), "=&r" (sl) \ 503 : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \ 504 } \ 505 else \ 506 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ 507 : "=r" (sh), "=&r" (sl) \ 508 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ 509 } while (0) 510 #endif 511 #if defined (__ARM_ARCH_2__) || defined (__ARM_ARCH_2A__) \ 512 || defined (__ARM_ARCH_3__) 513 #define umul_ppmm(xh, xl, a, b) \ 514 do { \ 515 register USItype __t0, __t1, __t2; \ 516 __asm__ ("%@ Inlined umul_ppmm\n" \ 517 " mov %2, %5, lsr #16\n" \ 518 " mov %0, %6, lsr #16\n" \ 519 " bic %3, %5, %2, lsl #16\n" \ 520 " bic %4, %6, %0, lsl #16\n" \ 521 " mul %1, %3, %4\n" \ 522 " mul %4, %2, %4\n" \ 523 " mul %3, %0, %3\n" \ 524 " mul %0, %2, %0\n" \ 525 " adds %3, %4, %3\n" \ 526 " addcs %0, %0, #65536\n" \ 527 " adds %1, %1, %3, lsl #16\n" \ 528 " adc %0, %0, %3, lsr #16" \ 529 : "=&r" ((USItype) (xh)), "=r" ((USItype) (xl)), \ 530 "=&r" (__t0), "=&r" (__t1), "=r" (__t2) \ 531 : "r" ((USItype) (a)), "r" ((USItype) (b)) __CLOBBER_CC); \ 532 } while (0) 533 #ifndef LONGLONG_STANDALONE 534 #define udiv_qrnnd(q, r, n1, n0, d) \ 535 do { UWtype __r; \ 536 (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \ 537 (r) = __r; \ 538 } 
while (0) 539 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype); 540 #endif /* LONGLONG_STANDALONE */ 541 #else /* ARMv4 or newer */ 542 #define umul_ppmm(xh, xl, a, b) \ 543 __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b)) 544 #define smul_ppmm(xh, xl, a, b) \ 545 __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b)) 546 #ifndef LONGLONG_STANDALONE 547 #define udiv_qrnnd(q, r, n1, n0, d) \ 548 do { UWtype __di; \ 549 __di = __MPN(invert_limb) (d); \ 550 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ 551 } while (0) 552 #define UDIV_PREINV_ALWAYS 1 553 #define UDIV_NEEDS_NORMALIZATION 1 554 #endif /* LONGLONG_STANDALONE */ 555 #endif /* defined(__ARM_ARCH_2__) ... */ 556 #define count_leading_zeros(count, x) count_leading_zeros_gcc_clz(count, x) 557 #define count_trailing_zeros(count, x) count_trailing_zeros_gcc_ctz(count, x) 558 #endif /* __arm__ */ 559 560 #if defined (__aarch64__) && W_TYPE_SIZE == 64 561 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 562 do { \ 563 if (__builtin_constant_p (bl) && ~(UDItype)(bl) <= (UDItype)(bl)) \ 564 __asm__ ("subs\t%1, %x4, %5\n\tadc\t%0, %x2, %x3" \ 565 : "=r" (sh), "=&r" (sl) \ 566 : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)), \ 567 "%r" ((UDItype)(al)), "rI" (-(UDItype)(bl)) __CLOBBER_CC);\ 568 else \ 569 __asm__ ("adds\t%1, %x4, %5\n\tadc\t%0, %x2, %x3" \ 570 : "=r" (sh), "=&r" (sl) \ 571 : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)), \ 572 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC);\ 573 } while (0) 574 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 575 do { \ 576 if (__builtin_constant_p (bl) && ~(UDItype)(bl) <= (UDItype)(bl)) \ 577 __asm__ ("adds\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3" \ 578 : "=r,r" (sh), "=&r,&r" (sl) \ 579 : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)), \ 580 "r,Z" ((UDItype)(al)), "rI,r" (-(UDItype)(bl)) __CLOBBER_CC);\ 581 else \ 582 __asm__ ("subs\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3" \ 583 : "=r,r" (sh), "=&r,&r" (sl) \ 584 : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)), \ 585 "r,Z" ((UDItype)(al)), "rI,r" ((UDItype)(bl)) __CLOBBER_CC);\ 586 } while (0) 587 #if __GMP_GNUC_PREREQ (4,9) 588 #define umul_ppmm(w1, w0, u, v) \ 589 do { \ 590 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \ 591 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \ 592 w1 = __ll >> 64; \ 593 w0 = __ll; \ 594 } while (0) 595 #endif 596 #if !defined (umul_ppmm) 597 #define umul_ppmm(ph, pl, m0, m1) \ 598 do { \ 599 UDItype __m0 = (m0), __m1 = (m1); \ 600 __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (__m0), "r" (__m1)); \ 601 (pl) = __m0 * __m1; \ 602 } while (0) 603 #endif 604 #define count_leading_zeros(count, x) count_leading_zeros_gcc_clz(count, x) 605 #define count_trailing_zeros(count, x) count_trailing_zeros_gcc_ctz(count, x) 606 #endif /* __aarch64__ */ 607 608 #if defined (__clipper__) && W_TYPE_SIZE == 32 609 #define umul_ppmm(w1, w0, u, v) \ 610 ({union {UDItype __ll; \ 611 struct {USItype __l, __h;} __i; \ 612 } __x; \ 613 __asm__ ("mulwux %2,%0" \ 614 : "=r" (__x.__ll) \ 615 : "%0" ((USItype)(u)), "r" ((USItype)(v))); \ 616 (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) 617 #define smul_ppmm(w1, w0, u, v) \ 618 ({union {DItype __ll; \ 619 struct {SItype __l, __h;} __i; \ 620 } __x; \ 621 __asm__ ("mulwx %2,%0" \ 622 : "=r" (__x.__ll) \ 623 : "%0" ((SItype)(u)), "r" ((SItype)(v))); \ 624 (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) 625 #define __umulsidi3(u, v) \ 626 ({UDItype __w; \ 627 __asm__ ("mulwux %2,%0" \ 628 : "=r" (__w) : "%0" ((USItype)(u)), "r"
((USItype)(v))); \ 629 __w; }) 630 #endif /* __clipper__ */ 631 632 /* Fujitsu vector computers. */ 633 #if defined (__uxp__) && W_TYPE_SIZE == 32 634 #define umul_ppmm(ph, pl, u, v) \ 635 do { \ 636 union {UDItype __ll; \ 637 struct {USItype __h, __l;} __i; \ 638 } __x; \ 639 __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\ 640 (ph) = __x.__i.__h; \ 641 (pl) = __x.__i.__l; \ 642 } while (0) 643 #define smul_ppmm(ph, pl, u, v) \ 644 do { \ 645 union {UDItype __ll; \ 646 struct {USItype __h, __l;} __i; \ 647 } __x; \ 648 __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \ 649 (ph) = __x.__i.__h; \ 650 (pl) = __x.__i.__l; \ 651 } while (0) 652 #endif 653 654 #if defined (__gmicro__) && W_TYPE_SIZE == 32 655 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 656 __asm__ ("add.w %5,%1\n\taddx %3,%0" \ 657 : "=g" (sh), "=&g" (sl) \ 658 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 659 "%1" ((USItype)(al)), "g" ((USItype)(bl))) 660 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 661 __asm__ ("sub.w %5,%1\n\tsubx %3,%0" \ 662 : "=g" (sh), "=&g" (sl) \ 663 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 664 "1" ((USItype)(al)), "g" ((USItype)(bl))) 665 #define umul_ppmm(ph, pl, m0, m1) \ 666 __asm__ ("mulx %3,%0,%1" \ 667 : "=g" (ph), "=r" (pl) \ 668 : "%0" ((USItype)(m0)), "g" ((USItype)(m1))) 669 #define udiv_qrnnd(q, r, nh, nl, d) \ 670 __asm__ ("divx %4,%0,%1" \ 671 : "=g" (q), "=r" (r) \ 672 : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d))) 673 #define count_leading_zeros(count, x) \ 674 __asm__ ("bsch/1 %1,%0" \ 675 : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0)) 676 #endif 677 678 #if defined (__hppa) && W_TYPE_SIZE == 32 679 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 680 __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0" \ 681 : "=r" (sh), "=&r" (sl) \ 682 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl)) 683 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 684 __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0" \ 685 : "=r" (sh), "=&r" (sl) \ 686 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl)) 687 #if defined (_PA_RISC1_1) 688 #define umul_ppmm(wh, wl, u, v) \ 689 do { \ 690 union {UDItype __ll; \ 691 struct {USItype __h, __l;} __i; \ 692 } __x; \ 693 __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \ 694 (wh) = __x.__i.__h; \ 695 (wl) = __x.__i.__l; \ 696 } while (0) 697 #endif 698 #define count_leading_zeros(count, x) \ 699 do { \ 700 USItype __tmp; \ 701 __asm__ ( \ 702 "ldi 1,%0\n" \ 703 " extru,= %1,15,16,%%r0 ; Bits 31..16 zero?\n" \ 704 " extru,tr %1,15,16,%1 ; No. Shift down, skip add.\n" \ 705 " ldo 16(%0),%0 ; Yes. Perform add.\n" \ 706 " extru,= %1,23,8,%%r0 ; Bits 15..8 zero?\n" \ 707 " extru,tr %1,23,8,%1 ; No. Shift down, skip add.\n" \ 708 " ldo 8(%0),%0 ; Yes. Perform add.\n" \ 709 " extru,= %1,27,4,%%r0 ; Bits 7..4 zero?\n" \ 710 " extru,tr %1,27,4,%1 ; No. Shift down, skip add.\n" \ 711 " ldo 4(%0),%0 ; Yes. Perform add.\n" \ 712 " extru,= %1,29,2,%%r0 ; Bits 3..2 zero?\n" \ 713 " extru,tr %1,29,2,%1 ; No. Shift down, skip add.\n" \ 714 " ldo 2(%0),%0 ; Yes. Perform add.\n" \ 715 " extru %1,30,1,%1 ; Extract bit 1.\n" \ 716 " sub %0,%1,%0 ; Subtract it.\n" \ 717 : "=r" (count), "=r" (__tmp) : "1" (x)); \ 718 } while (0) 719 #endif /* hppa */ 720 721 /* These macros are for ABI=2.0w. In ABI=2.0n they can't be used, since GCC 722 (3.2) puts longlong into two adjacent 32-bit registers. Presumably this 723 is just a case of no direct support for 2.0n but treating it like 1.0. 
*/ 724 #if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB) 725 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 726 __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0" \ 727 : "=r" (sh), "=&r" (sl) \ 728 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl)) 729 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 730 __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0" \ 731 : "=r" (sh), "=&r" (sl) \ 732 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl)) 733 #endif /* hppa */ 734 735 #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32 736 #if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch) 737 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 738 do { \ 739 /* if (__builtin_constant_p (bl)) \ 740 __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3" \ 741 : "=r" (sh), "=&r" (sl) \ 742 : "0" (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\ 743 else \ 744 */ __asm__ ("alr\t%1,%5\n\talcr\t%0,%3" \ 745 : "=r" (sh), "=&r" (sl) \ 746 : "0" (ah), "r" (bh), "%1" (al), "r" (bl)__CLOBBER_CC); \ 747 } while (0) 748 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 749 do { \ 750 /* if (__builtin_constant_p (bl)) \ 751 __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3" \ 752 : "=r" (sh), "=&r" (sl) \ 753 : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC); \ 754 else \ 755 */ __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3" \ 756 : "=r" (sh), "=&r" (sl) \ 757 : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC); \ 758 } while (0) 759 #if __GMP_GNUC_PREREQ (4,5) 760 #define umul_ppmm(xh, xl, m0, m1) \ 761 do { \ 762 union {UDItype __ll; \ 763 struct {USItype __h, __l;} __i; \ 764 } __x; \ 765 __x.__ll = (UDItype) (m0) * (UDItype) (m1); \ 766 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ 767 } while (0) 768 #else 769 #if 0 770 /* FIXME: this fails if gcc knows about the 64-bit registers. Use only 771 with a new enough processor pretending we have 32-bit registers. */ 772 #define umul_ppmm(xh, xl, m0, m1) \ 773 do { \ 774 union {UDItype __ll; \ 775 struct {USItype __h, __l;} __i; \ 776 } __x; \ 777 __asm__ ("mlr\t%0,%2" \ 778 : "=r" (__x.__ll) \ 779 : "%0" (m0), "r" (m1)); \ 780 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ 781 } while (0) 782 #else 783 #define umul_ppmm(xh, xl, m0, m1) \ 784 do { \ 785 /* When we have 64-bit regs and gcc is aware of that, we cannot simply use 786 DImode for the product, since that would be allocated to a single 64-bit 787 register, whereas mlr uses the low 32-bits of an even-odd register pair. 788 */ \ 789 register USItype __r0 __asm__ ("0"); \ 790 register USItype __r1 __asm__ ("1") = (m0); \ 791 __asm__ ("mlr\t%0,%3" \ 792 : "=r" (__r0), "=r" (__r1) \ 793 : "r" (__r1), "r" (m1)); \ 794 (xh) = __r0; (xl) = __r1; \ 795 } while (0) 796 #endif /* if 0 */ 797 #endif 798 #if 0 799 /* FIXME: this fails if gcc knows about the 64-bit registers. Use only 800 with a new enough processor pretending we have 32-bit registers. 
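(For context: dlr divides a 64-bit dividend held in an even-odd pair of 32-bit registers, leaving the remainder in the even register and the quotient in the odd one; this is why the live definition below pins its operands to registers 0 and 1 explicitly instead of going through a union.)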
*/ 801 #define udiv_qrnnd(q, r, n1, n0, d) \ 802 do { \ 803 union {UDItype __ll; \ 804 struct {USItype __h, __l;} __i; \ 805 } __x; \ 806 __x.__i.__h = n1; __x.__i.__l = n0; \ 807 __asm__ ("dlr\t%0,%2" \ 808 : "=r" (__x.__ll) \ 809 : "0" (__x.__ll), "r" (d)); \ 810 (q) = __x.__i.__l; (r) = __x.__i.__h; \ 811 } while (0) 812 #else 813 #define udiv_qrnnd(q, r, n1, n0, d) \ 814 do { \ 815 register USItype __r0 __asm__ ("0") = (n1); \ 816 register USItype __r1 __asm__ ("1") = (n0); \ 817 __asm__ ("dlr\t%0,%4" \ 818 : "=r" (__r0), "=r" (__r1) \ 819 : "r" (__r0), "r" (__r1), "r" (d)); \ 820 (q) = __r1; (r) = __r0; \ 821 } while (0) 822 #endif /* if 0 */ 823 #else /* if __zarch__ */ 824 /* FIXME: this fails if gcc knows about the 64-bit registers. */ 825 #define smul_ppmm(xh, xl, m0, m1) \ 826 do { \ 827 union {DItype __ll; \ 828 struct {USItype __h, __l;} __i; \ 829 } __x; \ 830 __asm__ ("mr\t%0,%2" \ 831 : "=r" (__x.__ll) \ 832 : "%0" (m0), "r" (m1)); \ 833 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ 834 } while (0) 835 /* FIXME: this fails if gcc knows about the 64-bit registers. */ 836 #define sdiv_qrnnd(q, r, n1, n0, d) \ 837 do { \ 838 union {DItype __ll; \ 839 struct {USItype __h, __l;} __i; \ 840 } __x; \ 841 __x.__i.__h = n1; __x.__i.__l = n0; \ 842 __asm__ ("dr\t%0,%2" \ 843 : "=r" (__x.__ll) \ 844 : "0" (__x.__ll), "r" (d)); \ 845 (q) = __x.__i.__l; (r) = __x.__i.__h; \ 846 } while (0) 847 #endif /* if __zarch__ */ 848 #endif 849 850 #if defined (__s390x__) && W_TYPE_SIZE == 64 851 /* We need to cast operands with register constraints, otherwise their types 852 will be assumed to be SImode by gcc. For these machines, such operations 853 will insert a value into the low 32 bits, and leave the high 32 bits with 854 garbage. */ 855 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 856 do { \ 857 __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3" \ 858 : "=r" (sh), "=&r" (sl) \ 859 : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \ 860 "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \ 861 } while (0) 862 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 863 do { \ 864 __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3" \ 865 : "=r" (sh), "=&r" (sl) \ 866 : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \ 867 "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \ 868 } while (0) 869 #if !defined (__clang__) 870 #define umul_ppmm(xh, xl, m0, m1) \ 871 do { \ 872 union {unsigned int __attribute__ ((mode(TI))) __ll; \ 873 struct {UDItype __h, __l;} __i; \ 874 } __x; \ 875 __asm__ ("mlgr\t%0,%2" \ 876 : "=r" (__x.__ll) \ 877 : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1))); \ 878 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ 879 } while (0) 880 #define udiv_qrnnd(q, r, n1, n0, d) \ 881 do { \ 882 union {unsigned int __attribute__ ((mode(TI))) __ll; \ 883 struct {UDItype __h, __l;} __i; \ 884 } __x; \ 885 __x.__i.__h = n1; __x.__i.__l = n0; \ 886 __asm__ ("dlgr\t%0,%2" \ 887 : "=r" (__x.__ll) \ 888 : "0" (__x.__ll), "r" ((UDItype)(d))); \ 889 (q) = __x.__i.__l; (r) = __x.__i.__h; \ 890 } while (0) 891 #endif 892 #if 0 /* FIXME: Enable for z10 (?) */ 893 #define count_leading_zeros(cnt, x) \ 894 do { \ 895 union {unsigned int __attribute__ ((mode(TI))) __ll; \ 896 struct {UDItype __h, __l;} __i; \ 897 } __clr_cnt; \ 898 __asm__ ("flogr\t%0,%1" \ 899 : "=r" (__clr_cnt.__ll) \ 900 : "r" (x) __CLOBBER_CC); \ 901 (cnt) = __clr_cnt.__i.__h; \ 902 } while (0) 903 #endif 904 #endif 905 906 /* On x86 and x86_64, every asm implicitly clobbers "flags" and "fpsr", 907 so we don't need __CLOBBER_CC. 
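As a concrete reading of the macros below: add_ssaaaa computes the two-word sum (sh,sl) = (ah,al) + (bh,bl) with an addl/adcl pair, the carry flag propagating the low-word carry into the high word, and sub_ddmmss is the analogous subl/sbbl borrow chain.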
*/ 908 #if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32 909 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 910 __asm__ ("addl %5,%k1\n\tadcl %3,%k0" \ 911 : "=r" (sh), "=&r" (sl) \ 912 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 913 "%1" ((USItype)(al)), "g" ((USItype)(bl))) 914 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 915 __asm__ ("subl %5,%k1\n\tsbbl %3,%k0" \ 916 : "=r" (sh), "=&r" (sl) \ 917 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 918 "1" ((USItype)(al)), "g" ((USItype)(bl))) 919 #define umul_ppmm(w1, w0, u, v) \ 920 __asm__ ("mull %3" \ 921 : "=a" (w0), "=d" (w1) \ 922 : "%0" ((USItype)(u)), "rm" ((USItype)(v))) 923 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\ 924 __asm__ ("divl %4" /* stringification in K&R C */ \ 925 : "=a" (q), "=d" (r) \ 926 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx))) 927 928 #if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx 929 /* Pentium bsrl takes between 10 and 72 cycles depending where the most 930 significant 1 bit is, hence the use of the following alternatives. bsfl 931 is slow too, between 18 and 42 depending where the least significant 1 932 bit is, so let the generic count_trailing_zeros below make use of the 933 count_leading_zeros here too. */ 934 935 #if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE) 936 /* The following should be a fixed 14 or 15 cycles, but possibly plus an L1 937 cache miss reading from __clz_tab. For P55 it's favoured over the float 938 below so as to avoid mixing MMX and x87, since the penalty for switching 939 between the two is about 100 cycles. 940 941 The asm block sets __shift to -3 if the high 24 bits are clear, -2 for 942 16, -1 for 8, or 0 otherwise. This could be written equivalently as 943 follows, but as of gcc 2.95.2 it results in conditional jumps. 944 945 __shift = -(__n < 0x1000000); 946 __shift -= (__n < 0x10000); 947 __shift -= (__n < 0x100); 948 949 The middle two sbbl and cmpl's pair, and with luck something gcc 950 generates might pair with the first cmpl and the last sbbl. The "32+1" 951 constant could be folded into __clz_tab[], but it doesn't seem worth 952 making a different table just for that. */ 953 954 #define count_leading_zeros(c,n) \ 955 do { \ 956 USItype __n = (n); \ 957 USItype __shift; \ 958 __asm__ ("cmpl $0x1000000, %1\n" \ 959 "sbbl %0, %0\n" \ 960 "cmpl $0x10000, %1\n" \ 961 "sbbl $0, %0\n" \ 962 "cmpl $0x100, %1\n" \ 963 "sbbl $0, %0\n" \ 964 : "=&r" (__shift) : "r" (__n)); \ 965 __shift = __shift*8 + 24 + 1; \ 966 (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift]; \ 967 } while (0) 968 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB 969 #define COUNT_LEADING_ZEROS_0 31 /* n==0 indistinguishable from n==1 */ 970 971 #else /* ! pentiummmx || LONGLONG_STANDALONE */ 972 /* The following should be a fixed 14 cycles or so. Some scheduling 973 opportunities should be available between the float load/store too. This 974 sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is 975 apparently suggested by the Intel optimizing manual (don't know exactly 976 where). gcc 2.95 or up will be best for this, so the "double" is 977 correctly aligned on the stack. */ 978 #define count_leading_zeros(c,n) \ 979 do { \ 980 union { \ 981 double d; \ 982 unsigned a[2]; \ 983 } __u; \ 984 __u.d = (UWtype) (n); \ 985 (c) = 0x3FF + 31 - (__u.a[1] >> 20); \ 986 } while (0) 987 #define COUNT_LEADING_ZEROS_0 (0x3FF + 31) 988 #endif /* pentiummx */ 989 990 #else /* ! 
pentium */ 991 992 #if __GMP_GNUC_PREREQ (3,4) /* using bsrl */ 993 #define count_leading_zeros(count,x) count_leading_zeros_gcc_clz(count,x) 994 #endif /* gcc clz */ 995 996 /* On P6, gcc prior to 3.0 generates a partial register stall for 997 __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former 998 being 1 code byte smaller. "31-__cbtmp" is a workaround, probably at the 999 cost of one extra instruction. Do this for "i386" too, since that means 1000 generic x86. */ 1001 #if ! defined (count_leading_zeros) && __GNUC__ < 3 \ 1002 && (HAVE_HOST_CPU_i386 \ 1003 || HAVE_HOST_CPU_i686 \ 1004 || HAVE_HOST_CPU_pentiumpro \ 1005 || HAVE_HOST_CPU_pentium2 \ 1006 || HAVE_HOST_CPU_pentium3) 1007 #define count_leading_zeros(count, x) \ 1008 do { \ 1009 USItype __cbtmp; \ 1010 ASSERT ((x) != 0); \ 1011 __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \ 1012 (count) = 31 - __cbtmp; \ 1013 } while (0) 1014 #endif /* gcc<3 asm bsrl */ 1015 1016 #ifndef count_leading_zeros 1017 #define count_leading_zeros(count, x) \ 1018 do { \ 1019 USItype __cbtmp; \ 1020 ASSERT ((x) != 0); \ 1021 __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \ 1022 (count) = __cbtmp ^ 31; \ 1023 } while (0) 1024 #endif /* asm bsrl */ 1025 1026 #if __GMP_GNUC_PREREQ (3,4) /* using bsfl */ 1027 #define count_trailing_zeros(count,x) count_trailing_zeros_gcc_ctz(count,x) 1028 #endif /* gcc ctz */ 1029 1030 #ifndef count_trailing_zeros 1031 #define count_trailing_zeros(count, x) \ 1032 do { \ 1033 ASSERT ((x) != 0); \ 1034 __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x))); \ 1035 } while (0) 1036 #endif /* asm bsfl */ 1037 1038 #endif /* ! pentium */ 1039 1040 #endif /* 80x86 */ 1041 1042 #if defined (__amd64__) && W_TYPE_SIZE == 64 1043 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1044 __asm__ ("addq %5,%q1\n\tadcq %3,%q0" \ 1045 : "=r" (sh), "=&r" (sl) \ 1046 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \ 1047 "%1" ((UDItype)(al)), "rme" ((UDItype)(bl))) 1048 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1049 __asm__ ("subq %5,%q1\n\tsbbq %3,%q0" \ 1050 : "=r" (sh), "=&r" (sl) \ 1051 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \ 1052 "1" ((UDItype)(al)), "rme" ((UDItype)(bl))) 1053 #if X86_ASM_MULX \ 1054 && (HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell \ 1055 || HAVE_HOST_CPU_skylake || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen) 1056 #define umul_ppmm(w1, w0, u, v) \ 1057 __asm__ ("mulx\t%3, %q0, %q1" \ 1058 : "=r" (w0), "=r" (w1) \ 1059 : "%d" ((UDItype)(u)), "rm" ((UDItype)(v))) 1060 #else 1061 #define umul_ppmm(w1, w0, u, v) \ 1062 __asm__ ("mulq\t%3" \ 1063 : "=a" (w0), "=d" (w1) \ 1064 : "%0" ((UDItype)(u)), "rm" ((UDItype)(v))) 1065 #endif 1066 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\ 1067 __asm__ ("divq %4" /* stringification in K&R C */ \ 1068 : "=a" (q), "=d" (r) \ 1069 : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx))) 1070 1071 #if HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell || HAVE_HOST_CPU_skylake \ 1072 || HAVE_HOST_CPU_k10 || HAVE_HOST_CPU_bd1 || HAVE_HOST_CPU_bd2 \ 1073 || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen \ 1074 || HAVE_HOST_CPU_bobcat || HAVE_HOST_CPU_jaguar 1075 #define count_leading_zeros(count, x) \ 1076 do { \ 1077 /* This is lzcnt, spelled for older assemblers. Destination and */ \ 1078 /* source must be 64-bit registers, hence cast and %q.
*/ \ 1079 __asm__ ("rep;bsr\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \ 1080 } while (0) 1081 #define COUNT_LEADING_ZEROS_0 64 1082 #else 1083 #define count_leading_zeros(count, x) \ 1084 do { \ 1085 UDItype __cbtmp; \ 1086 ASSERT ((x) != 0); \ 1087 __asm__ ("bsr\t%1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \ 1088 (count) = __cbtmp ^ 63; \ 1089 } while (0) 1090 #endif 1091 1092 #if HAVE_HOST_CPU_bd2 || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 \ 1093 || HAVE_HOST_CPU_zen || HAVE_HOST_CPU_jaguar 1094 #define count_trailing_zeros(count, x) \ 1095 do { \ 1096 /* This is tzcnt, spelled for older assemblers. Destination and */ \ 1097 /* source must be 64-bit registers, hence cast and %q. */ \ 1098 __asm__ ("rep;bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \ 1099 } while (0) 1100 #define COUNT_TRAILING_ZEROS_0 64 1101 #else 1102 #define count_trailing_zeros(count, x) \ 1103 do { \ 1104 ASSERT ((x) != 0); \ 1105 __asm__ ("bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \ 1106 } while (0) 1107 #endif 1108 #endif /* __amd64__ */ 1109 1110 #if defined (__i860__) && W_TYPE_SIZE == 32 1111 #define rshift_rhlc(r,h,l,c) \ 1112 __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0" \ 1113 : "=r" (r) : "r" (h), "r" (l), "rn" (c)) 1114 #endif /* i860 */ 1115 1116 #if defined (__i960__) && W_TYPE_SIZE == 32 1117 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1118 __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0" \ 1119 : "=r" (sh), "=&r" (sl) \ 1120 : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl)) 1121 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1122 __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0" \ 1123 : "=r" (sh), "=&r" (sl) \ 1124 : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl)) 1125 #define umul_ppmm(w1, w0, u, v) \ 1126 ({union {UDItype __ll; \ 1127 struct {USItype __l, __h;} __i; \ 1128 } __x; \ 1129 __asm__ ("emul %2,%1,%0" \ 1130 : "=d" (__x.__ll) : "%dI" (u), "dI" (v)); \ 1131 (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) 1132 #define __umulsidi3(u, v) \ 1133 ({UDItype __w; \ 1134 __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v)); \ 1135 __w; }) 1136 #define udiv_qrnnd(q, r, nh, nl, d) \ 1137 do { \ 1138 union {UDItype __ll; \ 1139 struct {USItype __l, __h;} __i; \ 1140 } __nn, __rq; \ 1141 __nn.__i.__h = (nh); __nn.__i.__l = (nl); \ 1142 __asm__ ("ediv %d,%n,%0" \ 1143 : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d)); \ 1144 (r) = __rq.__i.__l; (q) = __rq.__i.__h; \ 1145 } while (0) 1146 #define count_leading_zeros(count, x) \ 1147 do { \ 1148 USItype __cbtmp; \ 1149 __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x)); \ 1150 (count) = __cbtmp ^ 31; \ 1151 } while (0) 1152 #define COUNT_LEADING_ZEROS_0 (-32) /* sic */ 1153 #if defined (__i960mx) /* what is the proper symbol to test???
*/ 1154 #define rshift_rhlc(r,h,l,c) \ 1155 do { \ 1156 union {UDItype __ll; \ 1157 struct {USItype __l, __h;} __i; \ 1158 } __nn; \ 1159 __nn.__i.__h = (h); __nn.__i.__l = (l); \ 1160 __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c)); \ 1161 } while (0) 1162 #endif /* i960mx */ 1163 #endif /* i960 */ 1164 1165 1166 #if defined (__loongarch64) && W_TYPE_SIZE == 64 1167 #define umul_ppmm(w1, w0, u, v) \ 1168 do { \ 1169 UDItype __u = (u), __v = (v); \ 1170 (w0) = __u * __v; \ 1171 (w1) = (unsigned __int128) __u * __v >> 64; \ 1172 } while (0) 1173 #endif 1174 1175 1176 #if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \ 1177 || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \ 1178 || defined (__mc5307__)) && W_TYPE_SIZE == 32 1179 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1180 __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0" \ 1181 : "=d" (sh), "=&d" (sl) \ 1182 : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \ 1183 "%1" ((USItype)(al)), "g" ((USItype)(bl))) 1184 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1185 __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0" \ 1186 : "=d" (sh), "=&d" (sl) \ 1187 : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \ 1188 "1" ((USItype)(al)), "g" ((USItype)(bl))) 1189 /* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r. */ 1190 #if defined (__mc68020__) || defined(mc68020) \ 1191 || defined (__mc68030__) || defined (mc68030) \ 1192 || defined (__mc68040__) || defined (mc68040) \ 1193 || defined (__mcpu32__) || defined (mcpu32) \ 1194 || defined (__NeXT__) 1195 #define umul_ppmm(w1, w0, u, v) \ 1196 __asm__ ("mulu%.l %3,%1:%0" \ 1197 : "=d" (w0), "=d" (w1) \ 1198 : "%0" ((USItype)(u)), "dmi" ((USItype)(v))) 1199 #define udiv_qrnnd(q, r, n1, n0, d) \ 1200 __asm__ ("divu%.l %4,%1:%0" \ 1201 : "=d" (q), "=d" (r) \ 1202 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d))) 1203 #define sdiv_qrnnd(q, r, n1, n0, d) \ 1204 __asm__ ("divs%.l %4,%1:%0" \ 1205 : "=d" (q), "=d" (r) \ 1206 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d))) 1207 #else /* for other 68k family members use 16x16->32 multiplication */ 1208 #define umul_ppmm(xh, xl, a, b) \ 1209 do { USItype __umul_tmp1, __umul_tmp2; \ 1210 __asm__ ("| Inlined umul_ppmm\n" \ 1211 " move%.l %5,%3\n" \ 1212 " move%.l %2,%0\n" \ 1213 " move%.w %3,%1\n" \ 1214 " swap %3\n" \ 1215 " swap %0\n" \ 1216 " mulu%.w %2,%1\n" \ 1217 " mulu%.w %3,%0\n" \ 1218 " mulu%.w %2,%3\n" \ 1219 " swap %2\n" \ 1220 " mulu%.w %5,%2\n" \ 1221 " add%.l %3,%2\n" \ 1222 " jcc 1f\n" \ 1223 " add%.l %#0x10000,%0\n" \ 1224 "1: move%.l %2,%3\n" \ 1225 " clr%.w %2\n" \ 1226 " swap %2\n" \ 1227 " swap %3\n" \ 1228 " clr%.w %3\n" \ 1229 " add%.l %3,%1\n" \ 1230 " addx%.l %2,%0\n" \ 1231 " | End inlined umul_ppmm" \ 1232 : "=&d" (xh), "=&d" (xl), \ 1233 "=&d" (__umul_tmp1), "=&d" (__umul_tmp2) \ 1234 : "%2" ((USItype)(a)), "d" ((USItype)(b))); \ 1235 } while (0) 1236 #endif /* not mc68020 */ 1237 /* The '020, '030, '040 and '060 have bitfield insns. 1238 GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to 1239 exclude bfffo on that chip (bitfield insns not available). */ 1240 #if (defined (__mc68020__) || defined (mc68020) \ 1241 || defined (__mc68030__) || defined (mc68030) \ 1242 || defined (__mc68040__) || defined (mc68040) \ 1243 || defined (__mc68060__) || defined (mc68060) \ 1244 || defined (__NeXT__)) \ 1245 && !
defined (__mcpu32__) 1246 #define count_leading_zeros(count, x) \ 1247 __asm__ ("bfffo %1{%b2:%b2},%0" \ 1248 : "=d" (count) \ 1249 : "od" ((USItype) (x)), "n" (0)) 1250 #define COUNT_LEADING_ZEROS_0 32 1251 #endif 1252 #endif /* mc68000 */ 1253 1254 #if defined (__m88000__) && W_TYPE_SIZE == 32 1255 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1256 __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3" \ 1257 : "=r" (sh), "=&r" (sl) \ 1258 : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl)) 1259 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1260 __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3" \ 1261 : "=r" (sh), "=&r" (sl) \ 1262 : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl)) 1263 #define count_leading_zeros(count, x) \ 1264 do { \ 1265 USItype __cbtmp; \ 1266 __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x)); \ 1267 (count) = __cbtmp ^ 31; \ 1268 } while (0) 1269 #define COUNT_LEADING_ZEROS_0 63 /* sic */ 1270 #if defined (__m88110__) 1271 #define umul_ppmm(wh, wl, u, v) \ 1272 do { \ 1273 union {UDItype __ll; \ 1274 struct {USItype __h, __l;} __i; \ 1275 } __x; \ 1276 __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v)); \ 1277 (wh) = __x.__i.__h; \ 1278 (wl) = __x.__i.__l; \ 1279 } while (0) 1280 #define udiv_qrnnd(q, r, n1, n0, d) \ 1281 ({union {UDItype __ll; \ 1282 struct {USItype __h, __l;} __i; \ 1283 } __x, __q; \ 1284 __x.__i.__h = (n1); __x.__i.__l = (n0); \ 1285 __asm__ ("divu.d %0,%1,%2" \ 1286 : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d)); \ 1287 (r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; }) 1288 #endif /* __m88110__ */ 1289 #endif /* __m88000__ */ 1290 1291 #if defined (__mips) && W_TYPE_SIZE == 32 1292 #if __GMP_GNUC_PREREQ (4,4) || defined(__clang__) 1293 #define umul_ppmm(w1, w0, u, v) \ 1294 do { \ 1295 UDItype __ll = (UDItype)(u) * (v); \ 1296 w1 = __ll >> 32; \ 1297 w0 = __ll; \ 1298 } while (0) 1299 #endif 1300 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__) 1301 #define umul_ppmm(w1, w0, u, v) \ 1302 __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v)) 1303 #endif 1304 #if !defined (umul_ppmm) 1305 #define umul_ppmm(w1, w0, u, v) \ 1306 __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1" \ 1307 : "=d" (w0), "=d" (w1) : "d" (u), "d" (v)) 1308 #endif 1309 #endif /* __mips */ 1310 1311 #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64 1312 #if defined (_MIPS_ARCH_MIPS64R6) 1313 #define umul_ppmm(w1, w0, u, v) \ 1314 do { \ 1315 UDItype __m0 = (u), __m1 = (v); \ 1316 (w0) = __m0 * __m1; \ 1317 __asm__ ("dmuhu\t%0, %1, %2" : "=d" (w1) : "d" (__m0), "d" (__m1)); \ 1318 } while (0) 1319 #endif 1320 #if !defined (umul_ppmm) && (__GMP_GNUC_PREREQ (4,4) || defined(__clang__)) 1321 #define umul_ppmm(w1, w0, u, v) \ 1322 do { \ 1323 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \ 1324 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \ 1325 w1 = __ll >> 64; \ 1326 w0 = __ll; \ 1327 } while (0) 1328 #endif 1329 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__) 1330 #define umul_ppmm(w1, w0, u, v) \ 1331 __asm__ ("dmultu %2,%3" \ 1332 : "=l" (w0), "=h" (w1) \ 1333 : "d" ((UDItype)(u)), "d" ((UDItype)(v))) 1334 #endif 1335 #if !defined (umul_ppmm) 1336 #define umul_ppmm(w1, w0, u, v) \ 1337 __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1" \ 1338 : "=d" (w0), "=d" (w1) \ 1339 : "d" ((UDItype)(u)), "d" ((UDItype)(v))) 1340 #endif 1341 #endif /* __mips */ 1342 1343 #if defined (__mmix__) && W_TYPE_SIZE == 64 1344 #define umul_ppmm(w1, w0, u, v) \ 1345 __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r"
(u), "r" (v)) 1346 #endif 1347 1348 #if defined (__ns32000__) && W_TYPE_SIZE == 32 1349 #define umul_ppmm(w1, w0, u, v) \ 1350 ({union {UDItype __ll; \ 1351 struct {USItype __l, __h;} __i; \ 1352 } __x; \ 1353 __asm__ ("meid %2,%0" \ 1354 : "=g" (__x.__ll) \ 1355 : "%0" ((USItype)(u)), "g" ((USItype)(v))); \ 1356 (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) 1357 #define __umulsidi3(u, v) \ 1358 ({UDItype __w; \ 1359 __asm__ ("meid %2,%0" \ 1360 : "=g" (__w) \ 1361 : "%0" ((USItype)(u)), "g" ((USItype)(v))); \ 1362 __w; }) 1363 #define udiv_qrnnd(q, r, n1, n0, d) \ 1364 ({union {UDItype __ll; \ 1365 struct {USItype __l, __h;} __i; \ 1366 } __x; \ 1367 __x.__i.__h = (n1); __x.__i.__l = (n0); \ 1368 __asm__ ("deid %2,%0" \ 1369 : "=g" (__x.__ll) \ 1370 : "0" (__x.__ll), "g" ((USItype)(d))); \ 1371 (r) = __x.__i.__l; (q) = __x.__i.__h; }) 1372 #define count_trailing_zeros(count,x) \ 1373 do { \ 1374 __asm__ ("ffsd %2,%0" \ 1375 : "=r" (count) \ 1376 : "0" ((USItype) 0), "r" ((USItype) (x))); \ 1377 } while (0) 1378 #endif /* __ns32000__ */ 1379 1380 /* In the past we had a block of various #defines tested 1381 _ARCH_PPC - AIX 1382 _ARCH_PWR - AIX 1383 __powerpc__ - gcc 1384 __POWERPC__ - BEOS 1385 __ppc__ - Darwin 1386 PPC - old gcc, GNU/Linux, SysV 1387 The plain PPC test was not good for vxWorks, since PPC is defined on all 1388 CPUs there (eg. m68k too), as a constant one is expected to compare 1389 CPU_FAMILY against. 1390 1391 At any rate, this was pretty unattractive and a bit fragile. The use of 1392 HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of 1393 getting the desired effect. 1394 1395 ENHANCE-ME: We should test _IBMR2 here when we add assembly support for 1396 the system vendor compilers. (Is that vendor compilers with inline asm, 1397 or what?) 
*/ 1398 1399 #if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc) \ 1400 && W_TYPE_SIZE == 32 1401 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1402 do { \ 1403 if (__builtin_constant_p (bh) && (bh) == 0) \ 1404 __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \ 1405 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \ 1406 __CLOBBER_CC); \ 1407 else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \ 1408 __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \ 1409 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \ 1410 __CLOBBER_CC); \ 1411 else \ 1412 __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \ 1413 : "=r" (sh), "=&r" (sl) \ 1414 : "r" (ah), "r" (bh), "%r" (al), "rI" (bl) \ 1415 __CLOBBER_CC); \ 1416 } while (0) 1417 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1418 do { \ 1419 if (__builtin_constant_p (ah) && (ah) == 0) \ 1420 __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \ 1421 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl) \ 1422 __CLOBBER_CC); \ 1423 else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0) \ 1424 __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \ 1425 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl) \ 1426 __CLOBBER_CC); \ 1427 else if (__builtin_constant_p (bh) && (bh) == 0) \ 1428 __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \ 1429 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl) \ 1430 __CLOBBER_CC); \ 1431 else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \ 1432 __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \ 1433 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl) \ 1434 __CLOBBER_CC); \ 1435 else \ 1436 __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \ 1437 : "=r" (sh), "=&r" (sl) \ 1438 : "r" (ah), "r" (bh), "rI" (al), "r" (bl) \ 1439 __CLOBBER_CC); \ 1440 } while (0) 1441 #define count_leading_zeros(count, x) \ 1442 __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x)) 1443 #define COUNT_LEADING_ZEROS_0 32 1444 #if HAVE_HOST_CPU_FAMILY_powerpc 1445 #if __GMP_GNUC_PREREQ (4,4) || defined(__clang__) 1446 #define umul_ppmm(w1, w0, u, v) \ 1447 do { \ 1448 UDItype __ll = (UDItype)(u) * (v); \ 1449 w1 = __ll >> 32; \ 1450 w0 = __ll; \ 1451 } while (0) 1452 #endif 1453 #if !defined (umul_ppmm) 1454 #define umul_ppmm(ph, pl, m0, m1) \ 1455 do { \ 1456 USItype __m0 = (m0), __m1 = (m1); \ 1457 __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \ 1458 (pl) = __m0 * __m1; \ 1459 } while (0) 1460 #endif 1461 #define smul_ppmm(ph, pl, m0, m1) \ 1462 do { \ 1463 SItype __m0 = (m0), __m1 = (m1); \ 1464 __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \ 1465 (pl) = __m0 * __m1; \ 1466 } while (0) 1467 #else 1468 #define smul_ppmm(xh, xl, m0, m1) \ 1469 __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1)) 1470 #define sdiv_qrnnd(q, r, nh, nl, d) \ 1471 __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d)) 1472 #endif 1473 #endif /* 32-bit POWER architecture variants. */ 1474 1475 /* We should test _IBMR2 here when we add assembly support for the system 1476 vendor compilers. */ 1477 #if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64 1478 #if !defined (_LONG_LONG_LIMB) 1479 /* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values. So 1480 use adde etc only when not _LONG_LONG_LIMB.
*/ 1481 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1482 do { \ 1483 if (__builtin_constant_p (bh) && (bh) == 0) \ 1484 __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \ 1485 : "=r" (sh), "=&r" (sl) \ 1486 : "r" ((UDItype)(ah)), \ 1487 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) \ 1488 __CLOBBER_CC); \ 1489 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \ 1490 __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \ 1491 : "=r" (sh), "=&r" (sl) \ 1492 : "r" ((UDItype)(ah)), \ 1493 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) \ 1494 __CLOBBER_CC); \ 1495 else \ 1496 __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \ 1497 : "=r" (sh), "=&r" (sl) \ 1498 : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \ 1499 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) \ 1500 __CLOBBER_CC); \ 1501 } while (0) 1502 /* We use "*rI" for the constant operand here, since with just "I", gcc barfs. 1503 This might seem strange, but gcc folds away the dead code late. */ 1504 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1505 do { \ 1506 if (__builtin_constant_p (bl) \ 1507 && (bl) > -0x8000 && (bl) <= 0x8000 && (bl) != 0) { \ 1508 if (__builtin_constant_p (ah) && (ah) == 0) \ 1509 __asm__ ("addic %1,%3,%4\n\tsubfze %0,%2" \ 1510 : "=r" (sh), "=&r" (sl) \ 1511 : "r" ((UDItype)(bh)), \ 1512 "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \ 1513 __CLOBBER_CC); \ 1514 else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \ 1515 __asm__ ("addic %1,%3,%4\n\tsubfme %0,%2" \ 1516 : "=r" (sh), "=&r" (sl) \ 1517 : "r" ((UDItype)(bh)), \ 1518 "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \ 1519 __CLOBBER_CC); \ 1520 else if (__builtin_constant_p (bh) && (bh) == 0) \ 1521 __asm__ ("addic %1,%3,%4\n\taddme %0,%2" \ 1522 : "=r" (sh), "=&r" (sl) \ 1523 : "r" ((UDItype)(ah)), \ 1524 "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \ 1525 __CLOBBER_CC); \ 1526 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \ 1527 __asm__ ("addic %1,%3,%4\n\taddze %0,%2" \ 1528 : "=r" (sh), "=&r" (sl) \ 1529 : "r" ((UDItype)(ah)), \ 1530 "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \ 1531 __CLOBBER_CC); \ 1532 else \ 1533 __asm__ ("addic %1,%4,%5\n\tsubfe %0,%3,%2" \ 1534 : "=r" (sh), "=&r" (sl) \ 1535 : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \ 1536 "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \ 1537 __CLOBBER_CC); \ 1538 } else { \ 1539 if (__builtin_constant_p (ah) && (ah) == 0) \ 1540 __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \ 1541 : "=r" (sh), "=&r" (sl) \ 1542 : "r" ((UDItype)(bh)), \ 1543 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \ 1544 __CLOBBER_CC); \ 1545 else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \ 1546 __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \ 1547 : "=r" (sh), "=&r" (sl) \ 1548 : "r" ((UDItype)(bh)), \ 1549 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \ 1550 __CLOBBER_CC); \ 1551 else if (__builtin_constant_p (bh) && (bh) == 0) \ 1552 __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \ 1553 : "=r" (sh), "=&r" (sl) \ 1554 : "r" ((UDItype)(ah)), \ 1555 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \ 1556 __CLOBBER_CC); \ 1557 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \ 1558 __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \ 1559 : "=r" (sh), "=&r" (sl) \ 1560 : "r" ((UDItype)(ah)), \ 1561 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \ 1562 __CLOBBER_CC); \ 1563 else \ 1564 __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \ 1565 : "=r" (sh), "=&r" (sl) \ 1566 : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \ 1567 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \ 1568 __CLOBBER_CC); \ 1569 } \ 1570 } while (0) 1571 #endif /* ! 
_LONG_LONG_LIMB */ 1572 #define count_leading_zeros(count, x) \ 1573 __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x)) 1574 #define COUNT_LEADING_ZEROS_0 64 1575 /* XXXMRG GCC-9 era, pre-req went from 4.4 to 4.8, check this. */ 1576 #if 0 && (__GMP_GNUC_PREREQ (4,8) || defined(__clang__)) /* Disable, this results in libcalls! */ 1577 #define umul_ppmm(w1, w0, u, v) \ 1578 do { \ 1579 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \ 1580 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \ 1581 w1 = __ll >> 64; \ 1582 w0 = __ll; \ 1583 } while (0) 1584 #endif 1585 #if !defined (umul_ppmm) 1586 #define umul_ppmm(ph, pl, m0, m1) \ 1587 do { \ 1588 UDItype __m0 = (m0), __m1 = (m1); \ 1589 __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \ 1590 (pl) = __m0 * __m1; \ 1591 } while (0) 1592 #endif 1593 #define smul_ppmm(ph, pl, m0, m1) \ 1594 do { \ 1595 DItype __m0 = (m0), __m1 = (m1); \ 1596 __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \ 1597 (pl) = __m0 * __m1; \ 1598 } while (0) 1599 #endif /* 64-bit PowerPC. */ 1600 1601 #if defined (__pyr__) && W_TYPE_SIZE == 32 1602 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1603 __asm__ ("addw %5,%1\n\taddwc %3,%0" \ 1604 : "=r" (sh), "=&r" (sl) \ 1605 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 1606 "%1" ((USItype)(al)), "g" ((USItype)(bl))) 1607 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1608 __asm__ ("subw %5,%1\n\tsubwb %3,%0" \ 1609 : "=r" (sh), "=&r" (sl) \ 1610 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 1611 "1" ((USItype)(al)), "g" ((USItype)(bl))) 1612 /* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP. */ 1613 #define umul_ppmm(w1, w0, u, v) \ 1614 ({union {UDItype __ll; \ 1615 struct {USItype __h, __l;} __i; \ 1616 } __x; \ 1617 __asm__ ("movw %1,%R0\n\tuemul %2,%0" \ 1618 : "=&r" (__x.__ll) \ 1619 : "g" ((USItype) (u)), "g" ((USItype)(v))); \ 1620 (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) 1621 #endif /* __pyr__ */ 1622 1623 #if defined (__ibm032__) /* RT/ROMP */ && W_TYPE_SIZE == 32 1624 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1625 __asm__ ("a %1,%5\n\tae %0,%3" \ 1626 : "=r" (sh), "=&r" (sl) \ 1627 : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \ 1628 "%1" ((USItype)(al)), "r" ((USItype)(bl))) 1629 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1630 __asm__ ("s %1,%5\n\tse %0,%3" \ 1631 : "=r" (sh), "=&r" (sl) \ 1632 : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \ 1633 "1" ((USItype)(al)), "r" ((USItype)(bl))) 1634 #define smul_ppmm(ph, pl, m0, m1) \ 1635 __asm__ ( \ 1636 "s r2,r2\n" \ 1637 " mts r10,%2\n" \ 1638 " m r2,%3\n" \ 1639 " m r2,%3\n" \ 1640 " m r2,%3\n" \ 1641 " m r2,%3\n" \ 1642 " m r2,%3\n" \ 1643 " m r2,%3\n" \ 1644 " m r2,%3\n" \ 1645 " m r2,%3\n" \ 1646 " m r2,%3\n" \ 1647 " m r2,%3\n" \ 1648 " m r2,%3\n" \ 1649 " m r2,%3\n" \ 1650 " m r2,%3\n" \ 1651 " m r2,%3\n" \ 1652 " m r2,%3\n" \ 1653 " m r2,%3\n" \ 1654 " cas %0,r2,r0\n" \ 1655 " mfs r10,%1" \ 1656 : "=r" (ph), "=r" (pl) \ 1657 : "%r" ((USItype)(m0)), "r" ((USItype)(m1)) \ 1658 : "r2") 1659 #define count_leading_zeros(count, x) \ 1660 do { \ 1661 if ((x) >= 0x10000) \ 1662 __asm__ ("clz %0,%1" \ 1663 : "=r" (count) : "r" ((USItype)(x) >> 16)); \ 1664 else \ 1665 { \ 1666 __asm__ ("clz %0,%1" \ 1667 : "=r" (count) : "r" ((USItype)(x))); \ 1668 (count) += 16; \ 1669 } \ 1670 } while (0) 1671 #endif /* RT/ROMP */ 1672 1673 #if defined (__riscv) && defined (__riscv_mul) && W_TYPE_SIZE == 64 1674 #define umul_ppmm(ph, pl, u, v) \ 1675 do { \ 1676 UDItype __u = (u), __v = (v); \ 1677 (pl) = __u * __v; 
\
1678     __asm__ ("mulhu\t%0, %1, %2" : "=r" (ph) : "%r" (__u), "r" (__v)); \
1679   } while (0)
1680 #endif
1681
1682 #if (defined (__SH2__) || defined (__SH3__) || defined (__SH4__)) && W_TYPE_SIZE == 32
1683 #define umul_ppmm(w1, w0, u, v) \
1684   __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0" \
1685            : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
1686 #endif
1687
1688 #if defined (__sparc__) && W_TYPE_SIZE == 32
1689 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1690   __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0" \
1691            : "=r" (sh), "=&r" (sl) \
1692            : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \
1693              __CLOBBER_CC)
1694 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1695   __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0" \
1696            : "=r" (sh), "=&r" (sl) \
1697            : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \
1698              __CLOBBER_CC)
1699 /* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
1700    doesn't define anything to indicate that to us, it only sets __sparcv8.  */
1701 #if defined (__sparc_v9__) || defined (__sparcv9)
1702 /* Perhaps we should use floating-point operations here?  */
1703 #if 0
1704 /* Triggers a bug making mpz/tests/t-gcd.c fail.
1705    Perhaps we simply need to explicitly zero-extend the inputs?  */
1706 #define umul_ppmm(w1, w0, u, v) \
1707   __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" : \
1708            "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
1709 #else
1710 /* Use v8 umul until above bug is fixed.  */
1711 #define umul_ppmm(w1, w0, u, v) \
1712   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1713 #endif
1714 /* Use a plain v8 divide for v9.  */
1715 #define udiv_qrnnd(q, r, n1, n0, d) \
1716   do { \
1717     USItype __q; \
1718     __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
1719              : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
1720     (r) = (n0) - __q * (d); \
1721     (q) = __q; \
1722   } while (0)
1723 #else
1724 #if defined (__sparc_v8__)   /* gcc normal */ \
1725   || defined (__sparcv8)     /* gcc solaris */ \
1726   || HAVE_HOST_CPU_supersparc
1727 /* Don't match the immediate range because: 1) it is not often useful;
1728    2) the 'I' constraint thinks of the range as a 13-bit signed interval,
1729    while we want to match a 13-bit interval, sign-extended to 32 bits,
1730    but INTERPRETED AS UNSIGNED.  */
1731 #define umul_ppmm(w1, w0, u, v) \
1732   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1733
1734 #if HAVE_HOST_CPU_supersparc
1735 #else
1736 /* Don't use this on SuperSPARC because its udiv only handles 53 bit
1737    dividends and will trap to the kernel for the rest.  */
1738 #define udiv_qrnnd(q, r, n1, n0, d) \
1739   do { \
1740     USItype __q; \
1741     __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
1742              : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
1743     (r) = (n0) - __q * (d); \
1744     (q) = __q; \
1745   } while (0)
1746 #endif /* HAVE_HOST_CPU_supersparc */
1747
1748 #else /* ! __sparc_v8__ */
1749 #if defined (__sparclite__)
1750 /* This has hardware multiply but not divide.  It also has two additional
1751    instructions, scan (ffs from the high bit) and divscc.  */
1752 #define umul_ppmm(w1, w0, u, v) \
1753   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1754 #define udiv_qrnnd(q, r, n1, n0, d) \
1755   __asm__ ("! Inlined udiv_qrnnd\n" \
1756            " wr %%g0,%2,%%y !
Not a delayed write for sparclite\n" \ 1757 " tst %%g0\n" \ 1758 " divscc %3,%4,%%g1\n" \ 1759 " divscc %%g1,%4,%%g1\n" \ 1760 " divscc %%g1,%4,%%g1\n" \ 1761 " divscc %%g1,%4,%%g1\n" \ 1762 " divscc %%g1,%4,%%g1\n" \ 1763 " divscc %%g1,%4,%%g1\n" \ 1764 " divscc %%g1,%4,%%g1\n" \ 1765 " divscc %%g1,%4,%%g1\n" \ 1766 " divscc %%g1,%4,%%g1\n" \ 1767 " divscc %%g1,%4,%%g1\n" \ 1768 " divscc %%g1,%4,%%g1\n" \ 1769 " divscc %%g1,%4,%%g1\n" \ 1770 " divscc %%g1,%4,%%g1\n" \ 1771 " divscc %%g1,%4,%%g1\n" \ 1772 " divscc %%g1,%4,%%g1\n" \ 1773 " divscc %%g1,%4,%%g1\n" \ 1774 " divscc %%g1,%4,%%g1\n" \ 1775 " divscc %%g1,%4,%%g1\n" \ 1776 " divscc %%g1,%4,%%g1\n" \ 1777 " divscc %%g1,%4,%%g1\n" \ 1778 " divscc %%g1,%4,%%g1\n" \ 1779 " divscc %%g1,%4,%%g1\n" \ 1780 " divscc %%g1,%4,%%g1\n" \ 1781 " divscc %%g1,%4,%%g1\n" \ 1782 " divscc %%g1,%4,%%g1\n" \ 1783 " divscc %%g1,%4,%%g1\n" \ 1784 " divscc %%g1,%4,%%g1\n" \ 1785 " divscc %%g1,%4,%%g1\n" \ 1786 " divscc %%g1,%4,%%g1\n" \ 1787 " divscc %%g1,%4,%%g1\n" \ 1788 " divscc %%g1,%4,%%g1\n" \ 1789 " divscc %%g1,%4,%0\n" \ 1790 " rd %%y,%1\n" \ 1791 " bl,a 1f\n" \ 1792 " add %1,%4,%1\n" \ 1793 "1: ! End of inline udiv_qrnnd" \ 1794 : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d) \ 1795 : "%g1" __AND_CLOBBER_CC) 1796 #define count_leading_zeros(count, x) \ 1797 __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x)) 1798 /* Early sparclites return 63 for an argument of 0, but they warn that future 1799 implementations might change this. Therefore, leave COUNT_LEADING_ZEROS_0 1800 undefined. */ 1801 #endif /* __sparclite__ */ 1802 #endif /* __sparc_v8__ */ 1803 #endif /* __sparc_v9__ */ 1804 /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd. */ 1805 #ifndef umul_ppmm 1806 #define umul_ppmm(w1, w0, u, v) \ 1807 __asm__ ("! Inlined umul_ppmm\n" \ 1808 " wr %%g0,%2,%%y ! SPARC has 0-3 delay insn after a wr\n" \ 1809 " sra %3,31,%%g2 ! Don't move this insn\n" \ 1810 " and %2,%%g2,%%g2 ! Don't move this insn\n" \ 1811 " andcc %%g0,0,%%g1 ! 
Don't move this insn\n" \ 1812 " mulscc %%g1,%3,%%g1\n" \ 1813 " mulscc %%g1,%3,%%g1\n" \ 1814 " mulscc %%g1,%3,%%g1\n" \ 1815 " mulscc %%g1,%3,%%g1\n" \ 1816 " mulscc %%g1,%3,%%g1\n" \ 1817 " mulscc %%g1,%3,%%g1\n" \ 1818 " mulscc %%g1,%3,%%g1\n" \ 1819 " mulscc %%g1,%3,%%g1\n" \ 1820 " mulscc %%g1,%3,%%g1\n" \ 1821 " mulscc %%g1,%3,%%g1\n" \ 1822 " mulscc %%g1,%3,%%g1\n" \ 1823 " mulscc %%g1,%3,%%g1\n" \ 1824 " mulscc %%g1,%3,%%g1\n" \ 1825 " mulscc %%g1,%3,%%g1\n" \ 1826 " mulscc %%g1,%3,%%g1\n" \ 1827 " mulscc %%g1,%3,%%g1\n" \ 1828 " mulscc %%g1,%3,%%g1\n" \ 1829 " mulscc %%g1,%3,%%g1\n" \ 1830 " mulscc %%g1,%3,%%g1\n" \ 1831 " mulscc %%g1,%3,%%g1\n" \ 1832 " mulscc %%g1,%3,%%g1\n" \ 1833 " mulscc %%g1,%3,%%g1\n" \ 1834 " mulscc %%g1,%3,%%g1\n" \ 1835 " mulscc %%g1,%3,%%g1\n" \ 1836 " mulscc %%g1,%3,%%g1\n" \ 1837 " mulscc %%g1,%3,%%g1\n" \ 1838 " mulscc %%g1,%3,%%g1\n" \ 1839 " mulscc %%g1,%3,%%g1\n" \ 1840 " mulscc %%g1,%3,%%g1\n" \ 1841 " mulscc %%g1,%3,%%g1\n" \ 1842 " mulscc %%g1,%3,%%g1\n" \ 1843 " mulscc %%g1,%3,%%g1\n" \ 1844 " mulscc %%g1,0,%%g1\n" \ 1845 " add %%g1,%%g2,%0\n" \ 1846 " rd %%y,%1" \ 1847 : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v) \ 1848 : "%g1", "%g2" __AND_CLOBBER_CC) 1849 #endif 1850 #ifndef udiv_qrnnd 1851 #ifndef LONGLONG_STANDALONE 1852 #define udiv_qrnnd(q, r, n1, n0, d) \ 1853 do { UWtype __r; \ 1854 (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \ 1855 (r) = __r; \ 1856 } while (0) 1857 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype); 1858 #endif /* LONGLONG_STANDALONE */ 1859 #endif /* udiv_qrnnd */ 1860 #endif /* __sparc__ */ 1861 1862 #if defined (__sparc__) && W_TYPE_SIZE == 64 1863 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1864 __asm__ ( \ 1865 "addcc %r4,%5,%1\n" \ 1866 " addccc %r6,%7,%%g0\n" \ 1867 " addc %r2,%3,%0" \ 1868 : "=r" (sh), "=&r" (sl) \ 1869 : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)), \ 1870 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)), \ 1871 "%rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32) \ 1872 __CLOBBER_CC) 1873 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1874 __asm__ ( \ 1875 "subcc %r4,%5,%1\n" \ 1876 " subccc %r6,%7,%%g0\n" \ 1877 " subc %r2,%3,%0" \ 1878 : "=r" (sh), "=&r" (sl) \ 1879 : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)), \ 1880 "rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)), \ 1881 "rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32) \ 1882 __CLOBBER_CC) 1883 #if __VIS__ >= 0x300 1884 #undef add_ssaaaa 1885 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1886 __asm__ ( \ 1887 "addcc %r4, %5, %1\n" \ 1888 " addxc %r2, %r3, %0" \ 1889 : "=r" (sh), "=&r" (sl) \ 1890 : "rJ" ((UDItype)(ah)), "rJ" ((UDItype)(bh)), \ 1891 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC) 1892 #define umul_ppmm(ph, pl, m0, m1) \ 1893 do { \ 1894 UDItype __m0 = (m0), __m1 = (m1); \ 1895 (pl) = __m0 * __m1; \ 1896 __asm__ ("umulxhi\t%2, %1, %0" \ 1897 : "=r" (ph) \ 1898 : "%r" (__m0), "r" (__m1)); \ 1899 } while (0) 1900 #define count_leading_zeros(count, x) \ 1901 __asm__ ("lzd\t%1,%0" : "=r" (count) : "r" (x)) 1902 /* Needed by count_leading_zeros_32 in sparc64.h. 
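   (The 64-bit count_leading_zeros just above gets by with lzd alone, but the
   32-bit helper still indexes the 8-bit __clz_tab lookup table, so the table
   must be emitted even when VIS 3.0 is available.)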
*/ 1903 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB 1904 #endif 1905 #endif 1906 1907 #if (defined (__vax) || defined (__vax__)) && W_TYPE_SIZE == 32 1908 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1909 __asm__ ("addl2 %5,%1\n\tadwc %3,%0" \ 1910 : "=g" (sh), "=&g" (sl) \ 1911 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 1912 "%1" ((USItype)(al)), "g" ((USItype)(bl))) 1913 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1914 __asm__ ("subl2 %5,%1\n\tsbwc %3,%0" \ 1915 : "=g" (sh), "=&g" (sl) \ 1916 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 1917 "1" ((USItype)(al)), "g" ((USItype)(bl))) 1918 #define smul_ppmm(xh, xl, m0, m1) \ 1919 do { \ 1920 union {UDItype __ll; \ 1921 struct {USItype __l, __h;} __i; \ 1922 } __x; \ 1923 USItype __m0 = (m0), __m1 = (m1); \ 1924 __asm__ ("emul %1,%2,$0,%0" \ 1925 : "=g" (__x.__ll) : "g" (__m0), "g" (__m1)); \ 1926 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ 1927 } while (0) 1928 #define sdiv_qrnnd(q, r, n1, n0, d) \ 1929 do { \ 1930 union {DItype __ll; \ 1931 struct {SItype __l, __h;} __i; \ 1932 } __x; \ 1933 __x.__i.__h = n1; __x.__i.__l = n0; \ 1934 __asm__ ("ediv %3,%2,%0,%1" \ 1935 : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d)); \ 1936 } while (0) 1937 #if 0 1938 /* FIXME: This instruction appears to be unimplemented on some systems (vax 1939 8800 maybe). */ 1940 #define count_trailing_zeros(count,x) \ 1941 do { \ 1942 __asm__ ("ffs 0, 31, %1, %0" \ 1943 : "=g" (count) \ 1944 : "g" ((USItype) (x))); \ 1945 } while (0) 1946 #endif 1947 #endif /* vax */ 1948 1949 #if defined (__z8000__) && W_TYPE_SIZE == 16 1950 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1951 __asm__ ("add %H1,%H5\n\tadc %H0,%H3" \ 1952 : "=r" (sh), "=&r" (sl) \ 1953 : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \ 1954 "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl))) 1955 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1956 __asm__ ("sub %H1,%H5\n\tsbc %H0,%H3" \ 1957 : "=r" (sh), "=&r" (sl) \ 1958 : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \ 1959 "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl))) 1960 #define umul_ppmm(xh, xl, m0, m1) \ 1961 do { \ 1962 union {long int __ll; \ 1963 struct {unsigned int __h, __l;} __i; \ 1964 } __x; \ 1965 unsigned int __m0 = (m0), __m1 = (m1); \ 1966 __asm__ ("mult %S0,%H3" \ 1967 : "=r" (__x.__i.__h), "=r" (__x.__i.__l) \ 1968 : "%1" (m0), "rQR" (m1)); \ 1969 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ 1970 (xh) += ((((signed int) __m0 >> 15) & __m1) \ 1971 + (((signed int) __m1 >> 15) & __m0)); \ 1972 } while (0) 1973 #endif /* __z8000__ */ 1974 1975 #endif /* __GNUC__ */ 1976 1977 #endif /* NO_ASM */ 1978 1979 1980 /* FIXME: "sidi" here is highly doubtful, should sometimes be "diti". */ 1981 #if !defined (umul_ppmm) && defined (__umulsidi3) 1982 #define umul_ppmm(ph, pl, m0, m1) \ 1983 do { \ 1984 UDWtype __ll = __umulsidi3 (m0, m1); \ 1985 ph = (UWtype) (__ll >> W_TYPE_SIZE); \ 1986 pl = (UWtype) __ll; \ 1987 } while (0) 1988 #endif 1989 1990 #if !defined (__umulsidi3) 1991 #define __umulsidi3(u, v) \ 1992 ({UWtype __hi, __lo; \ 1993 umul_ppmm (__hi, __lo, u, v); \ 1994 ((UDWtype) __hi << W_TYPE_SIZE) | __lo; }) 1995 #endif 1996 1997 1998 #if defined (__cplusplus) 1999 #define __longlong_h_C "C" 2000 #else 2001 #define __longlong_h_C 2002 #endif 2003 2004 /* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist. The "_r" 2005 forms have "reversed" arguments, meaning the pointer is last, which 2006 sometimes allows better parameter passing, in particular on 64-bit 2007 hppa. 
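
   For illustration, the two conventions (prototyped just below) are used as
   follows, for word-sized u and v:

       UWtype hi, lo;
       hi = mpn_umul_ppmm (&lo, u, v);      pointer first
       hi = mpn_umul_ppmm_r (u, v, &lo);    pointer last, the "_r" form

   With the pointer last, u and v can stay in the first two argument
   registers on ABIs where that placement is cheaper.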
*/
2008
2009 #define mpn_umul_ppmm  __MPN(umul_ppmm)
2010 extern __longlong_h_C UWtype mpn_umul_ppmm (UWtype *, UWtype, UWtype);
2011
2012 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm \
2013   && ! defined (LONGLONG_STANDALONE)
2014 #define umul_ppmm(wh, wl, u, v) \
2015   do { \
2016     UWtype __umul_ppmm__p0; \
2017     (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v)); \
2018     (wl) = __umul_ppmm__p0; \
2019   } while (0)
2020 #endif
2021
2022 #define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
2023 extern __longlong_h_C UWtype mpn_umul_ppmm_r (UWtype, UWtype, UWtype *);
2024
2025 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r \
2026   && ! defined (LONGLONG_STANDALONE)
2027 #define umul_ppmm(wh, wl, u, v) \
2028   do { \
2029     UWtype __umul_p0; \
2030     (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_p0); \
2031     (wl) = __umul_p0; \
2032   } while (0)
2033 #endif
2034
2035 #define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
2036 extern __longlong_h_C UWtype mpn_udiv_qrnnd (UWtype *, UWtype, UWtype, UWtype);
2037
2038 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd \
2039   && ! defined (LONGLONG_STANDALONE)
2040 #define udiv_qrnnd(q, r, n1, n0, d) \
2041   do { \
2042     UWtype __udiv_qrnnd_r; \
2043     (q) = mpn_udiv_qrnnd (&__udiv_qrnnd_r, \
2044                           (UWtype) (n1), (UWtype) (n0), (UWtype) (d)); \
2045     (r) = __udiv_qrnnd_r; \
2046   } while (0)
2047 #endif
2048
2049 #define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
2050 extern __longlong_h_C UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, UWtype *);
2051
2052 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r \
2053   && ! defined (LONGLONG_STANDALONE)
2054 #define udiv_qrnnd(q, r, n1, n0, d) \
2055   do { \
2056     UWtype __udiv_qrnnd_r; \
2057     (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) (d), \
2058                             &__udiv_qrnnd_r); \
2059     (r) = __udiv_qrnnd_r; \
2060   } while (0)
2061 #endif
2062
2063
2064 /* If this machine has no inline assembler, use C macros.  */
2065
2066 #if !defined (add_ssaaaa)
2067 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
2068   do { \
2069     UWtype __x; \
2070     UWtype __al = (al); \
2071     UWtype __bl = (bl); \
2072     __x = __al + __bl; \
2073     (sh) = (ah) + (bh) + (__x < __al); \
2074     (sl) = __x; \
2075   } while (0)
2076 #endif
2077
2078 #if !defined (sub_ddmmss)
2079 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
2080   do { \
2081     UWtype __x; \
2082     UWtype __al = (al); \
2083     UWtype __bl = (bl); \
2084     __x = __al - __bl; \
2085     (sh) = (ah) - (bh) - (__al < __bl); \
2086     (sl) = __x; \
2087   } while (0)
2088 #endif
2089
2090 /* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
2091    smul_ppmm.  */
2092 #if !defined (umul_ppmm) && defined (smul_ppmm)
2093 #define umul_ppmm(w1, w0, u, v) \
2094   do { \
2095     UWtype __w1; \
2096     UWtype __xm0 = (u), __xm1 = (v); \
2097     smul_ppmm (__w1, w0, __xm0, __xm1); \
2098     (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
2099                 + (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
2100   } while (0)
2101 #endif
2102
2103 /* If we still don't have umul_ppmm, define it using plain C.
2104
2105    For reference, when this code is used for squaring (i.e. u and v are
2106    identical expressions), gcc recognises that __x1 and __x2 are the same
2107    and generates 3 multiplies, not 4.  The subsequent additions could be
2108    optimized a bit, but the only place GMP currently uses such a square is
2109    mpn_sqr_basecase, and chips obliged to use this generic C umul will have
2110    plenty of worse performance problems than a couple of extra instructions
2111    on the diagonal of sqr_basecase.
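
   For reference too, the decomposition computed below is, with
   B = 2^(W_TYPE_SIZE/2), u = uh*B + ul and v = vh*B + vl:

       u*v = uh*vh*B^2 + (uh*vl + ul*vh)*B + ul*vl

   __x3, __x2, __x1 and __x0 hold the four partial products.  The middle sum
   __x1 + __x2 can carry out of a word; the (__x1 < __x2) test detects that
   and credits the lost carry, worth __ll_B in high-word units, to __x3.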
*/
2112
2113 #if !defined (umul_ppmm)
2114 #define umul_ppmm(w1, w0, u, v) \
2115   do { \
2116     UWtype __x0, __x1, __x2, __x3; \
2117     UHWtype __ul, __vl, __uh, __vh; \
2118     UWtype __u = (u), __v = (v); \
2119     \
2120     __ul = __ll_lowpart (__u); \
2121     __uh = __ll_highpart (__u); \
2122     __vl = __ll_lowpart (__v); \
2123     __vh = __ll_highpart (__v); \
2124     \
2125     __x0 = (UWtype) __ul * __vl; \
2126     __x1 = (UWtype) __ul * __vh; \
2127     __x2 = (UWtype) __uh * __vl; \
2128     __x3 = (UWtype) __uh * __vh; \
2129     \
2130     __x1 += __ll_highpart (__x0); /* this can't give carry */ \
2131     __x1 += __x2;                 /* but this indeed can */ \
2132     if (__x1 < __x2)              /* did we get it? */ \
2133       __x3 += __ll_B;             /* yes, add it in the proper pos. */ \
2134     \
2135     (w1) = __x3 + __ll_highpart (__x1); \
2136     (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0); \
2137   } while (0)
2138 #endif
2139
2140 /* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
2141    exist in one form or another).  */
2142 #if !defined (smul_ppmm)
2143 #define smul_ppmm(w1, w0, u, v) \
2144   do { \
2145     UWtype __w1; \
2146     UWtype __xm0 = (u), __xm1 = (v); \
2147     umul_ppmm (__w1, w0, __xm0, __xm1); \
2148     (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
2149                 - (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
2150   } while (0)
2151 #endif
2152
2153 /* Define this unconditionally, so it can be used for debugging.  */
2154 #define __udiv_qrnnd_c(q, r, n1, n0, d) \
2155   do { \
2156     UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \
2157     \
2158     ASSERT ((d) != 0); \
2159     ASSERT ((n1) < (d)); \
2160     \
2161     __d1 = __ll_highpart (d); \
2162     __d0 = __ll_lowpart (d); \
2163     \
2164     __q1 = (n1) / __d1; \
2165     __r1 = (n1) - __q1 * __d1; \
2166     __m = __q1 * __d0; \
2167     __r1 = __r1 * __ll_B | __ll_highpart (n0); \
2168     if (__r1 < __m) \
2169       { \
2170         __q1--, __r1 += (d); \
2171         if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */ \
2172           if (__r1 < __m) \
2173             __q1--, __r1 += (d); \
2174       } \
2175     __r1 -= __m; \
2176     \
2177     __q0 = __r1 / __d1; \
2178     __r0 = __r1 - __q0 * __d1; \
2179     __m = __q0 * __d0; \
2180     __r0 = __r0 * __ll_B | __ll_lowpart (n0); \
2181     if (__r0 < __m) \
2182       { \
2183         __q0--, __r0 += (d); \
2184         if (__r0 >= (d)) \
2185           if (__r0 < __m) \
2186             __q0--, __r0 += (d); \
2187       } \
2188     __r0 -= __m; \
2189     \
2190     (q) = __q1 * __ll_B | __q0; \
2191     (r) = __r0; \
2192   } while (0)
2193
2194 /* If the processor has no udiv_qrnnd but has sdiv_qrnnd, go through
2195    __udiv_w_sdiv (defined in libgcc or elsewhere).  */
2196 #if !defined (udiv_qrnnd) && defined (sdiv_qrnnd) \
2197   && ! defined (LONGLONG_STANDALONE)
2198 #define udiv_qrnnd(q, r, nh, nl, d) \
2199   do { \
2200     UWtype __r; \
2201     (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d); \
2202     (r) = __r; \
2203   } while (0)
2204 __MPFR_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
2205 #endif
2206
2207 /* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
2208 #if !defined (udiv_qrnnd)
2209 #define UDIV_NEEDS_NORMALIZATION 1
2210 #define udiv_qrnnd __udiv_qrnnd_c
2211 #endif
2212
2213 #if !defined (count_leading_zeros)
2214 #define count_leading_zeros(count, x) \
2215   do { \
2216     UWtype __xr = (x); \
2217     UWtype __a; \
2218     \
2219     if (W_TYPE_SIZE == 32) \
2220       { \
2221         __a = __xr < ((UWtype) 1 << 2*__BITS4) \
2222           ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1) \
2223           : (__xr < ((UWtype) 1 << 3*__BITS4) ?
2*__BITS4 + 1 \ 2224 : 3*__BITS4 + 1); \ 2225 } \ 2226 else \ 2227 { \ 2228 for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8) \ 2229 if (((__xr >> __a) & 0xff) != 0) \ 2230 break; \ 2231 ++__a; \ 2232 } \ 2233 \ 2234 (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a]; \ 2235 } while (0) 2236 /* This version gives a well-defined value for zero. */ 2237 #define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1) 2238 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB 2239 #define COUNT_LEADING_ZEROS_SLOW 2240 #endif 2241 2242 /* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */ 2243 #if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY 2244 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB 2245 #endif 2246 2247 #ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB 2248 extern const unsigned char __MPFR_DECLSPEC __clz_tab[129]; 2249 #endif 2250 2251 #if !defined (count_trailing_zeros) 2252 #if !defined (COUNT_LEADING_ZEROS_SLOW) 2253 /* Define count_trailing_zeros using an asm count_leading_zeros. */ 2254 #define count_trailing_zeros(count, x) \ 2255 do { \ 2256 UWtype __ctz_x = (x); \ 2257 UWtype __ctz_c; \ 2258 ASSERT (__ctz_x != 0); \ 2259 count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x); \ 2260 (count) = W_TYPE_SIZE - 1 - __ctz_c; \ 2261 } while (0) 2262 #else 2263 /* Define count_trailing_zeros in plain C, assuming small counts are common. 2264 We use clz_tab without ado, since the C count_leading_zeros above will have 2265 pulled it in. */ 2266 #define count_trailing_zeros(count, x) \ 2267 do { \ 2268 UWtype __ctz_x = (x); \ 2269 int __ctz_c; \ 2270 \ 2271 if (LIKELY ((__ctz_x & 0xff) != 0)) \ 2272 (count) = __clz_tab[__ctz_x & -__ctz_x] - 2; \ 2273 else \ 2274 { \ 2275 for (__ctz_c = 8 - 2; __ctz_c < W_TYPE_SIZE - 2; __ctz_c += 8) \ 2276 { \ 2277 __ctz_x >>= 8; \ 2278 if (LIKELY ((__ctz_x & 0xff) != 0)) \ 2279 break; \ 2280 } \ 2281 \ 2282 (count) = __ctz_c + __clz_tab[__ctz_x & -__ctz_x]; \ 2283 } \ 2284 } while (0) 2285 #endif 2286 #endif 2287 2288 #ifndef UDIV_NEEDS_NORMALIZATION 2289 #define UDIV_NEEDS_NORMALIZATION 0 2290 #endif 2291 2292 /* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and 2293 that hence the latter should always be used. */ 2294 #ifndef UDIV_PREINV_ALWAYS 2295 #define UDIV_PREINV_ALWAYS 0 2296 #endif 2297
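
/* A minimal, compiled-out sanity check of whatever umul_ppmm and add_ssaaaa
   definitions are in effect above.  Illustrative only, hence the #if 0: it
   assumes a 64-bit limb (W_TYPE_SIZE == 64) and a compiler providing
   unsigned __int128, such as gcc or clang on a 64-bit target; the helper
   name is hypothetical.  */
#if 0
#include <assert.h>
static void
__longlong_h_selftest (void)
{
  UWtype u = (UWtype) 0x0123456789abcdefULL, v = (UWtype) 0xfedcba9876543210ULL;
  UWtype hi, lo, sh, sl;
  unsigned __int128 __ref;

  /* umul_ppmm must produce the full 128-bit product in (hi:lo).  */
  umul_ppmm (hi, lo, u, v);
  __ref = (unsigned __int128) u * v;
  assert (hi == (UWtype) (__ref >> 64) && lo == (UWtype) __ref);

  /* add_ssaaaa must produce the (wrapped) 128-bit sum in (sh:sl).  */
  add_ssaaaa (sh, sl, hi, lo, (UWtype) 1, v);
  __ref = (((unsigned __int128) hi << 64) | lo)
          + (((unsigned __int128) 1 << 64) | v);
  assert (sh == (UWtype) (__ref >> 64) && sl == (UWtype) __ref);
}
#endif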
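
/* Since the __udiv_qrnnd_c fallback sets UDIV_NEEDS_NORMALIZATION to 1,
   callers that cannot guarantee a normalized divisor must pre-shift so that
   the most significant bit of d is set.  A typical pattern, shown here as a
   compiled-out sketch only (the helper name is illustrative; d != 0 and
   n1 < d are required, as asserted by __udiv_qrnnd_c above):  */
#if 0
static void
__udiv_qrnnd_norm_example (UWtype *qp, UWtype *rp,
                           UWtype n1, UWtype n0, UWtype d)
{
  UWtype __q, __r;
  int __cnt;

  /* Shift the divisor up until its msb is set, shifting the two-word
     numerator by the same amount.  n1 < d guarantees that the bits shifted
     out of n1 are zero, so the quotient is unchanged.  */
  count_leading_zeros (__cnt, d);
  if (__cnt != 0)
    {
      d <<= __cnt;
      n1 = (n1 << __cnt) | (n0 >> (W_TYPE_SIZE - __cnt));
      n0 <<= __cnt;
    }
  udiv_qrnnd (__q, __r, n1, n0, d);
  *qp = __q;
  *rp = __r >> __cnt;   /* undo the scaling of the remainder */
}
#endif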