/* longlong.h -- definitions for mixed size 32/64 bit arithmetic.

Copyright 1991-1994, 1996, 1997, 1999-2005, 2007-2009, 2011-2020 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

/* You have to define the following before including this file:

   UWtype -- An unsigned type, default type for operations (typically a "word")
   UHWtype -- An unsigned type, at least half the size of UWtype
   UDWtype -- An unsigned type, at least twice as large as UWtype
   W_TYPE_SIZE -- size in bits of UWtype

   SItype, USItype -- Signed and unsigned 32 bit types
   DItype, UDItype -- Signed and unsigned 64 bit types

   On a 32 bit machine UWtype should typically be USItype;
   on a 64 bit machine, UWtype should typically be UDItype.

   Optionally, define:

   LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
   NO_ASM -- Disable inline asm


   CAUTION!  Using this version of longlong.h outside of GMP is not safe.  You
   need to include gmp.h and gmp-impl.h, or certain things might not work as
   expected.
*/

#define __BITS4 (W_TYPE_SIZE / 4)
#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))

/* This is used to make sure no undesirable sharing between different libraries
   that use this file takes place.  */
#ifndef __MPN
#define __MPN(x) __##x
#endif

/* Define auxiliary asm macros.

   1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
   UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two-word
   UWtype product in HIGH_PROD and LOW_PROD.

   2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
   UDWtype product.  This is just a variant of umul_ppmm.

   3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
   denominator) divides a UDWtype, composed of the UWtype integers
   HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
   in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
   than DENOMINATOR for correct operation.  If, in addition, the most
   significant bit of DENOMINATOR is required to be 1, then the pre-processor
   symbol UDIV_NEEDS_NORMALIZATION is defined to 1.

   4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
   denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
   is rounded towards 0.

   5) count_leading_zeros(count, x) counts the number of zero-bits from the
   msb to the first non-zero bit in the UWtype X.  This is the number of
   steps X needs to be shifted left to set the msb.  Undefined for X == 0,
   unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.

   6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
   from the least significant end.

   7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
   high_addend_2, low_addend_2) adds two two-word UWtype integers, composed
   of HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
   respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
   (i.e. carry out) is not stored anywhere, and is lost.

   8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
   high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
   composed of HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
   LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
   and LOW_DIFFERENCE.  Underflow (i.e. borrow out) is not stored anywhere,
   and is lost.

   If any of these macros are left undefined for a particular CPU,
   C macros are used.


   Notes:

   For add_ssaaaa the two high and two low addends can both commute, but
   unfortunately gcc only supports one "%" commutative in each asm block.
   This has always been so but is only documented in recent versions
   (e.g. pre-release 3.3).  Having two or more "%"s can cause an internal
   compiler error in certain rare circumstances.

   Apparently it was only the last "%" that was ever actually respected, so
   the code has been updated to leave just that.  Clearly there's a free
   choice whether high or low should get it, if there's a reason to favour
   one over the other.  Also obviously when the constraints on the two
   operands are identical there's no benefit to the reloader in any "%" at
   all.

*/

/* The CPUs come in alphabetical order below.

   Please add support for more CPUs here, or improve the current support
   for the CPUs below!  */


/* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
   3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
   Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
   __builtin_ctzll.

   These builtins are only used where we have checked the code they generate;
   on some chips they are merely libgcc calls, and in that case we instead
   want an inline version (either asm or generic C).

   These builtins are better than an asm block of the same insn, since an
   asm block doesn't give gcc any information about scheduling or resource
   usage.  We keep an asm block for use on prior versions of gcc though.

   For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
   it's not used (for count_leading_zeros) because it generally gives extra
   code to ensure the result is 0 when the input is 0, which we don't need
   or want.
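
   Whichever implementation of count_leading_zeros ends up being used, the
   typical consumer looks the same.  As a documentation-only sketch (not code
   used by GMP; it assumes UWtype and W_TYPE_SIZE set up as described at the
   top of this file, d != 0 and n1 < d), a caller normalizing for udiv_qrnnd
   when UDIV_NEEDS_NORMALIZATION is non-zero does roughly

     int __cnt;
     UWtype __dn, __n1n, __n0n, __q, __r;
     count_leading_zeros (__cnt, d);
     __dn  = d << __cnt;
     __n1n = (n1 << __cnt) | (__cnt == 0 ? 0 : n0 >> (W_TYPE_SIZE - __cnt));
     __n0n = n0 << __cnt;
     udiv_qrnnd (__q, __r, __n1n, __n0n, __dn);
     __r >>= __cnt;

   i.e. shift numerator and denominator left until the denominator's msb is
   set, divide, then shift the remainder back down; the quotient is not
   affected by the scaling.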
*/

#ifdef _LONG_LONG_LIMB
#define count_leading_zeros_gcc_clz(count,x) \
  do { \
    ASSERT ((x) != 0); \
    (count) = __builtin_clzll (x); \
  } while (0)
#else
#define count_leading_zeros_gcc_clz(count,x) \
  do { \
    ASSERT ((x) != 0); \
    (count) = __builtin_clzl (x); \
  } while (0)
#endif

#ifdef _LONG_LONG_LIMB
#define count_trailing_zeros_gcc_ctz(count,x) \
  do { \
    ASSERT ((x) != 0); \
    (count) = __builtin_ctzll (x); \
  } while (0)
#else
#define count_trailing_zeros_gcc_ctz(count,x) \
  do { \
    ASSERT ((x) != 0); \
    (count) = __builtin_ctzl (x); \
  } while (0)
#endif


/* FIXME: The macros using external routines like __MPN(count_leading_zeros)
   don't need to be under !NO_ASM */
#if ! defined (NO_ASM)

#if defined (__alpha) && W_TYPE_SIZE == 64
/* Most alpha-based machines, except Cray systems. */
#if defined (__GNUC__)
#if __GMP_GNUC_PREREQ (3,3)
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UDItype __m0 = (m0), __m1 = (m1); \
    (ph) = __builtin_alpha_umulh (__m0, __m1); \
    (pl) = __m0 * __m1; \
  } while (0)
#else
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UDItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("umulh %r1,%2,%0" \
	     : "=r" (ph) \
	     : "%rJ" (__m0), "rI" (__m1)); \
    (pl) = __m0 * __m1; \
  } while (0)
#endif
#else /* ! __GNUC__ */
#include <machine/builtins.h>
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UDItype __m0 = (m0), __m1 = (m1); \
    (ph) = __UMULH (__m0, __m1); \
    (pl) = __m0 * __m1; \
  } while (0)
#endif
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di; \
    __di = __MPN(invert_limb) (d); \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
  } while (0)
#define UDIV_PREINV_ALWAYS 1
#define UDIV_NEEDS_NORMALIZATION 1
#endif /* LONGLONG_STANDALONE */

/* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
   always goes into libgmp.so, even when not actually used.  */
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB

#if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
#define count_leading_zeros(COUNT,X) \
  __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
#define count_trailing_zeros(COUNT,X) \
  __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
#endif /* clz/ctz using cix */

#if ! defined (count_leading_zeros) \
  && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
/* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
   "$31" is written explicitly in the asm, since an "r" constraint won't
   select reg 31.  There seems no need to worry about "r31" syntax for cray,
   since gcc itself (pre-release 3.4) emits just $31 in various places.  */
#define ALPHA_CMPBGE_0(dst, src) \
  do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
/* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
   them, locating the highest non-zero byte.  A second __clz_tab lookup
   counts the leading zero bits in that byte, giving the result.
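
   (For reference, the same two-step search written as plain C -- an
   illustration for this comment only, not code used by GMP; it assumes a
   64-bit x with x != 0:

     int __byte = 7, __bits = 0;
     unsigned char __top;
     while (((x >> (8 * __byte)) & 0xff) == 0)
       __byte--;
     __top = (x >> (8 * __byte)) & 0xff;
     while ((__top & 0x80) == 0)
       { __top <<= 1; __bits++; }
     count = 8 * (7 - __byte) + __bits;

   The first loop locates the highest non-zero byte, which cmpbge plus one
   __clz_tab lookup does without looping; the second loop counts the leading
   zero bits inside that byte, which the second __clz_tab lookup replaces.)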
*/ 248 #define count_leading_zeros(count, x) \ 249 do { \ 250 UWtype __clz__b, __clz__c, __clz__x = (x); \ 251 ALPHA_CMPBGE_0 (__clz__b, __clz__x); /* zero bytes */ \ 252 __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F]; /* 8 to 1 byte */ \ 253 __clz__b = __clz__b * 8 - 7; /* 57 to 1 shift */ \ 254 __clz__x >>= __clz__b; \ 255 __clz__c = __clz_tab [__clz__x]; /* 8 to 1 bit */ \ 256 __clz__b = 65 - __clz__b; \ 257 (count) = __clz__b - __clz__c; \ 258 } while (0) 259 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB 260 #endif /* clz using cmpbge */ 261 262 #if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE) 263 #if HAVE_ATTRIBUTE_CONST 264 long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const)); 265 #else 266 long __MPN(count_leading_zeros) (UDItype); 267 #endif 268 #define count_leading_zeros(count, x) \ 269 ((count) = __MPN(count_leading_zeros) (x)) 270 #endif /* clz using mpn */ 271 #endif /* __alpha */ 272 273 #if defined (__AVR) && W_TYPE_SIZE == 8 274 #define umul_ppmm(ph, pl, m0, m1) \ 275 do { \ 276 unsigned short __p = (unsigned short) (m0) * (m1); \ 277 (ph) = __p >> 8; \ 278 (pl) = __p; \ 279 } while (0) 280 #endif /* AVR */ 281 282 #if defined (_CRAY) && W_TYPE_SIZE == 64 283 #include <intrinsics.h> 284 #define UDIV_PREINV_ALWAYS 1 285 #define UDIV_NEEDS_NORMALIZATION 1 286 long __MPN(count_leading_zeros) (UDItype); 287 #define count_leading_zeros(count, x) \ 288 ((count) = _leadz ((UWtype) (x))) 289 #if defined (_CRAYIEEE) /* I.e., Cray T90/ieee, T3D, and T3E */ 290 #define umul_ppmm(ph, pl, m0, m1) \ 291 do { \ 292 UDItype __m0 = (m0), __m1 = (m1); \ 293 (ph) = _int_mult_upper (__m0, __m1); \ 294 (pl) = __m0 * __m1; \ 295 } while (0) 296 #ifndef LONGLONG_STANDALONE 297 #define udiv_qrnnd(q, r, n1, n0, d) \ 298 do { UWtype __di; \ 299 __di = __MPN(invert_limb) (d); \ 300 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ 301 } while (0) 302 #endif /* LONGLONG_STANDALONE */ 303 #endif /* _CRAYIEEE */ 304 #endif /* _CRAY */ 305 306 #if defined (__ia64) && W_TYPE_SIZE == 64 307 /* This form encourages gcc (pre-release 3.4 at least) to emit predicated 308 "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency. The generic 309 code using "al<bl" arithmetically comes out making an actual 0 or 1 in a 310 register, which takes an extra cycle. */ 311 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 312 do { \ 313 UWtype __x; \ 314 __x = (al) - (bl); \ 315 if ((al) < (bl)) \ 316 (sh) = (ah) - (bh) - 1; \ 317 else \ 318 (sh) = (ah) - (bh); \ 319 (sl) = __x; \ 320 } while (0) 321 #if defined (__GNUC__) && ! defined (__INTEL_COMPILER) 322 /* Do both product parts in assembly, since that gives better code with 323 all gcc versions. Some callers will just use the upper part, and in 324 that situation we waste an instruction, but not any cycles. 
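
   (For comparison -- an illustration only, not the code used here: with a
   128-bit UDWtype available the same result can be written in plain C as

     UDWtype __p = (UDWtype) (m0) * (m1);
     (ph) = (UWtype) (__p >> 64);
     (pl) = (UWtype) __p;

   The asm below gets the two halves from the two xma instructions instead,
   for the better code generation described above.)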
*/ 325 #define umul_ppmm(ph, pl, m0, m1) \ 326 __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0" \ 327 : "=&f" (ph), "=f" (pl) \ 328 : "f" (m0), "f" (m1)) 329 #define count_leading_zeros(count, x) \ 330 do { \ 331 UWtype _x = (x), _y, _a, _c; \ 332 __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x)); \ 333 __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y)); \ 334 _c = (_a - 1) << 3; \ 335 _x >>= _c; \ 336 if (_x >= 1 << 4) \ 337 _x >>= 4, _c += 4; \ 338 if (_x >= 1 << 2) \ 339 _x >>= 2, _c += 2; \ 340 _c += _x >> 1; \ 341 (count) = W_TYPE_SIZE - 1 - _c; \ 342 } while (0) 343 /* similar to what gcc does for __builtin_ffs, but 0 based rather than 1 344 based, and we don't need a special case for x==0 here */ 345 #define count_trailing_zeros(count, x) \ 346 do { \ 347 UWtype __ctz_x = (x); \ 348 __asm__ ("popcnt %0 = %1" \ 349 : "=r" (count) \ 350 : "r" ((__ctz_x-1) & ~__ctz_x)); \ 351 } while (0) 352 #endif 353 #if defined (__INTEL_COMPILER) 354 #include <ia64intrin.h> 355 #define umul_ppmm(ph, pl, m0, m1) \ 356 do { \ 357 UWtype __m0 = (m0), __m1 = (m1); \ 358 ph = _m64_xmahu (__m0, __m1, 0); \ 359 pl = __m0 * __m1; \ 360 } while (0) 361 #endif 362 #ifndef LONGLONG_STANDALONE 363 #define udiv_qrnnd(q, r, n1, n0, d) \ 364 do { UWtype __di; \ 365 __di = __MPN(invert_limb) (d); \ 366 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ 367 } while (0) 368 #define UDIV_PREINV_ALWAYS 1 369 #define UDIV_NEEDS_NORMALIZATION 1 370 #endif 371 #endif 372 373 374 #if defined (__GNUC__) 375 376 /* We sometimes need to clobber "cc" with gcc2, but that would not be 377 understood by gcc1. Use cpp to avoid major code duplication. */ 378 #if __GNUC__ < 2 379 #define __CLOBBER_CC 380 #define __AND_CLOBBER_CC 381 #else /* __GNUC__ >= 2 */ 382 #define __CLOBBER_CC : "cc" 383 #define __AND_CLOBBER_CC , "cc" 384 #endif /* __GNUC__ < 2 */ 385 386 #if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32 387 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 388 __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3" \ 389 : "=r" (sh), "=&r" (sl) \ 390 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl)) 391 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 392 __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3" \ 393 : "=r" (sh), "=&r" (sl) \ 394 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl)) 395 #define umul_ppmm(xh, xl, m0, m1) \ 396 do { \ 397 USItype __m0 = (m0), __m1 = (m1); \ 398 __asm__ ("multiplu %0,%1,%2" \ 399 : "=r" (xl) \ 400 : "r" (__m0), "r" (__m1)); \ 401 __asm__ ("multmu %0,%1,%2" \ 402 : "=r" (xh) \ 403 : "r" (__m0), "r" (__m1)); \ 404 } while (0) 405 #define udiv_qrnnd(q, r, n1, n0, d) \ 406 __asm__ ("dividu %0,%3,%4" \ 407 : "=r" (q), "=q" (r) \ 408 : "1" (n1), "r" (n0), "r" (d)) 409 #define count_leading_zeros(count, x) \ 410 __asm__ ("clz %0,%1" \ 411 : "=r" (count) \ 412 : "r" (x)) 413 #define COUNT_LEADING_ZEROS_0 32 414 #endif /* __a29k__ */ 415 416 #if defined (__arc__) 417 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 418 __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3" \ 419 : "=r" (sh), \ 420 "=&r" (sl) \ 421 : "r" ((USItype) (ah)), \ 422 "rICal" ((USItype) (bh)), \ 423 "%r" ((USItype) (al)), \ 424 "rICal" ((USItype) (bl))) 425 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 426 __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ 427 : "=r" (sh), \ 428 "=&r" (sl) \ 429 : "r" ((USItype) (ah)), \ 430 "rICal" ((USItype) (bh)), \ 431 "r" ((USItype) (al)), \ 432 "rICal" ((USItype) (bl))) 433 #endif 434 435 #if defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) \ 436 && W_TYPE_SIZE == 32 437 #define 
add_ssaaaa(sh, sl, ah, al, bh, bl) \ 438 do { \ 439 if (__builtin_constant_p (bl) && -(USItype)(bl) < (USItype)(bl)) \ 440 __asm__ ("subs\t%1, %4, %5\n\tadc\t%0, %2, %3" \ 441 : "=r" (sh), "=&r" (sl) \ 442 : "r" (ah), "rI" (bh), \ 443 "%r" (al), "rI" (-(USItype)(bl)) __CLOBBER_CC); \ 444 else \ 445 __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3" \ 446 : "=r" (sh), "=&r" (sl) \ 447 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC); \ 448 } while (0) 449 /* FIXME: Extend the immediate range for the low word by using both ADDS and 450 SUBS, since they set carry in the same way. We need separate definitions 451 for thumb and non-thumb since thumb lacks RSC. */ 452 #if defined (__thumb__) 453 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 454 do { \ 455 if (__builtin_constant_p (ah) && __builtin_constant_p (bh) \ 456 && (ah) == (bh)) \ 457 __asm__ ("subs\t%1, %2, %3\n\tsbc\t%0, %0, %0" \ 458 : "=r" (sh), "=r" (sl) \ 459 : "r" (al), "rI" (bl) __CLOBBER_CC); \ 460 else if (__builtin_constant_p (al)) \ 461 __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \ 462 : "=r" (sh), "=&r" (sl) \ 463 : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \ 464 else if (__builtin_constant_p (bl)) \ 465 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ 466 : "=r" (sh), "=&r" (sl) \ 467 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ 468 else \ 469 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ 470 : "=r" (sh), "=&r" (sl) \ 471 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ 472 } while (0) 473 #else 474 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 475 do { \ 476 if (__builtin_constant_p (ah) && __builtin_constant_p (bh) \ 477 && (ah) == (bh)) \ 478 __asm__ ("subs\t%1, %2, %3\n\tsbc\t%0, %0, %0" \ 479 : "=r" (sh), "=r" (sl) \ 480 : "r" (al), "rI" (bl) __CLOBBER_CC); \ 481 else if (__builtin_constant_p (al)) \ 482 { \ 483 if (__builtin_constant_p (ah)) \ 484 __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \ 485 : "=r" (sh), "=&r" (sl) \ 486 : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \ 487 else \ 488 __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \ 489 : "=r" (sh), "=&r" (sl) \ 490 : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \ 491 } \ 492 else if (__builtin_constant_p (ah)) \ 493 { \ 494 if (__builtin_constant_p (bl)) \ 495 __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \ 496 : "=r" (sh), "=&r" (sl) \ 497 : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ 498 else \ 499 __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \ 500 : "=r" (sh), "=&r" (sl) \ 501 : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \ 502 } \ 503 else if (__builtin_constant_p (bl)) \ 504 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ 505 : "=r" (sh), "=&r" (sl) \ 506 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ 507 else \ 508 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ 509 : "=r" (sh), "=&r" (sl) \ 510 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ 511 } while (0) 512 #endif 513 #if defined (__ARM_ARCH_2__) || defined (__ARM_ARCH_2A__) \ 514 || defined (__ARM_ARCH_3__) 515 #define umul_ppmm(xh, xl, a, b) \ 516 do { \ 517 register USItype __t0, __t1, __t2; \ 518 __asm__ ("%@ Inlined umul_ppmm\n" \ 519 " mov %2, %5, lsr #16\n" \ 520 " mov %0, %6, lsr #16\n" \ 521 " bic %3, %5, %2, lsl #16\n" \ 522 " bic %4, %6, %0, lsl #16\n" \ 523 " mul %1, %3, %4\n" \ 524 " mul %4, %2, %4\n" \ 525 " mul %3, %0, %3\n" \ 526 " mul %0, %2, %0\n" \ 527 " adds %3, %4, %3\n" \ 528 " addcs %0, %0, #65536\n" \ 529 " adds %1, %1, %3, lsl #16\n" \ 530 " adc %0, 
%0, %3, lsr #16" \ 531 : "=&r" ((USItype) (xh)), "=r" ((USItype) (xl)), \ 532 "=&r" (__t0), "=&r" (__t1), "=r" (__t2) \ 533 : "r" ((USItype) (a)), "r" ((USItype) (b)) __CLOBBER_CC); \ 534 } while (0) 535 #ifndef LONGLONG_STANDALONE 536 #define udiv_qrnnd(q, r, n1, n0, d) \ 537 do { UWtype __r; \ 538 (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \ 539 (r) = __r; \ 540 } while (0) 541 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype); 542 #endif /* LONGLONG_STANDALONE */ 543 #else /* ARMv4 or newer */ 544 #define umul_ppmm(xh, xl, a, b) \ 545 __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b)) 546 #define smul_ppmm(xh, xl, a, b) \ 547 __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b)) 548 #ifndef LONGLONG_STANDALONE 549 #define udiv_qrnnd(q, r, n1, n0, d) \ 550 do { UWtype __di; \ 551 __di = __MPN(invert_limb) (d); \ 552 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ 553 } while (0) 554 #define UDIV_PREINV_ALWAYS 1 555 #define UDIV_NEEDS_NORMALIZATION 1 556 #endif /* LONGLONG_STANDALONE */ 557 #endif /* defined(__ARM_ARCH_2__) ... */ 558 #define count_leading_zeros(count, x) count_leading_zeros_gcc_clz(count, x) 559 #define count_trailing_zeros(count, x) count_trailing_zeros_gcc_ctz(count, x) 560 #endif /* __arm__ */ 561 562 #if defined (__aarch64__) && W_TYPE_SIZE == 64 563 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 564 do { \ 565 if (__builtin_constant_p (bl) && ~(UDItype)(bl) <= (UDItype)(bl)) \ 566 __asm__ ("subs\t%1, %x4, %5\n\tadc\t%0, %x2, %x3" \ 567 : "=r" (sh), "=&r" (sl) \ 568 : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)), \ 569 "%r" ((UDItype)(al)), "rI" (-(UDItype)(bl)) __CLOBBER_CC);\ 570 else \ 571 __asm__ ("adds\t%1, %x4, %5\n\tadc\t%0, %x2, %x3" \ 572 : "=r" (sh), "=&r" (sl) \ 573 : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)), \ 574 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC);\ 575 } while (0) 576 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 577 do { \ 578 if (__builtin_constant_p (bl) && ~(UDItype)(bl) <= (UDItype)(bl)) \ 579 __asm__ ("adds\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3" \ 580 : "=r,r" (sh), "=&r,&r" (sl) \ 581 : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)), \ 582 "r,Z" ((UDItype)(al)), "rI,r" (-(UDItype)(bl)) __CLOBBER_CC);\ 583 else \ 584 __asm__ ("subs\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3" \ 585 : "=r,r" (sh), "=&r,&r" (sl) \ 586 : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)), \ 587 "r,Z" ((UDItype)(al)), "rI,r" ((UDItype)(bl)) __CLOBBER_CC);\ 588 } while(0); 589 #if __GMP_GNUC_PREREQ (4,9) 590 #define umul_ppmm(w1, w0, u, v) \ 591 do { \ 592 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \ 593 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \ 594 w1 = __ll >> 64; \ 595 w0 = __ll; \ 596 } while (0) 597 #endif 598 #if !defined (umul_ppmm) 599 #define umul_ppmm(ph, pl, m0, m1) \ 600 do { \ 601 UDItype __m0 = (m0), __m1 = (m1); \ 602 __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (__m0), "r" (__m1)); \ 603 (pl) = __m0 * __m1; \ 604 } while (0) 605 #endif 606 #define count_leading_zeros(count, x) count_leading_zeros_gcc_clz(count, x) 607 #define count_trailing_zeros(count, x) count_trailing_zeros_gcc_ctz(count, x) 608 #endif /* __aarch64__ */ 609 610 #if defined (__clipper__) && W_TYPE_SIZE == 32 611 #define umul_ppmm(w1, w0, u, v) \ 612 ({union {UDItype __ll; \ 613 struct {USItype __l, __h;} __i; \ 614 } __x; \ 615 __asm__ ("mulwux %2,%0" \ 616 : "=r" (__x.__ll) \ 617 : "%0" ((USItype)(u)), "r" ((USItype)(v))); \ 618 (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) 619 #define 
smul_ppmm(w1, w0, u, v) \ 620 ({union {DItype __ll; \ 621 struct {SItype __l, __h;} __i; \ 622 } __x; \ 623 __asm__ ("mulwx %2,%0" \ 624 : "=r" (__x.__ll) \ 625 : "%0" ((SItype)(u)), "r" ((SItype)(v))); \ 626 (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) 627 #define __umulsidi3(u, v) \ 628 ({UDItype __w; \ 629 __asm__ ("mulwux %2,%0" \ 630 : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v))); \ 631 __w; }) 632 #endif /* __clipper__ */ 633 634 /* Fujitsu vector computers. */ 635 #if defined (__uxp__) && W_TYPE_SIZE == 32 636 #define umul_ppmm(ph, pl, u, v) \ 637 do { \ 638 union {UDItype __ll; \ 639 struct {USItype __h, __l;} __i; \ 640 } __x; \ 641 __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\ 642 (ph) = __x.__i.__h; \ 643 (pl) = __x.__i.__l; \ 644 } while (0) 645 #define smul_ppmm(ph, pl, u, v) \ 646 do { \ 647 union {UDItype __ll; \ 648 struct {USItype __h, __l;} __i; \ 649 } __x; \ 650 __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \ 651 (ph) = __x.__i.__h; \ 652 (pl) = __x.__i.__l; \ 653 } while (0) 654 #endif 655 656 #if defined (__gmicro__) && W_TYPE_SIZE == 32 657 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 658 __asm__ ("add.w %5,%1\n\taddx %3,%0" \ 659 : "=g" (sh), "=&g" (sl) \ 660 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 661 "%1" ((USItype)(al)), "g" ((USItype)(bl))) 662 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 663 __asm__ ("sub.w %5,%1\n\tsubx %3,%0" \ 664 : "=g" (sh), "=&g" (sl) \ 665 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 666 "1" ((USItype)(al)), "g" ((USItype)(bl))) 667 #define umul_ppmm(ph, pl, m0, m1) \ 668 __asm__ ("mulx %3,%0,%1" \ 669 : "=g" (ph), "=r" (pl) \ 670 : "%0" ((USItype)(m0)), "g" ((USItype)(m1))) 671 #define udiv_qrnnd(q, r, nh, nl, d) \ 672 __asm__ ("divx %4,%0,%1" \ 673 : "=g" (q), "=r" (r) \ 674 : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d))) 675 #define count_leading_zeros(count, x) \ 676 __asm__ ("bsch/1 %1,%0" \ 677 : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0)) 678 #endif 679 680 #if defined (__hppa) && W_TYPE_SIZE == 32 681 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 682 __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0" \ 683 : "=r" (sh), "=&r" (sl) \ 684 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl)) 685 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 686 __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0" \ 687 : "=r" (sh), "=&r" (sl) \ 688 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl)) 689 #if defined (_PA_RISC1_1) 690 #define umul_ppmm(wh, wl, u, v) \ 691 do { \ 692 union {UDItype __ll; \ 693 struct {USItype __h, __l;} __i; \ 694 } __x; \ 695 __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \ 696 (wh) = __x.__i.__h; \ 697 (wl) = __x.__i.__l; \ 698 } while (0) 699 #endif 700 #define count_leading_zeros(count, x) \ 701 do { \ 702 USItype __tmp; \ 703 __asm__ ( \ 704 "ldi 1,%0\n" \ 705 " extru,= %1,15,16,%%r0 ; Bits 31..16 zero?\n" \ 706 " extru,tr %1,15,16,%1 ; No. Shift down, skip add.\n" \ 707 " ldo 16(%0),%0 ; Yes. Perform add.\n" \ 708 " extru,= %1,23,8,%%r0 ; Bits 15..8 zero?\n" \ 709 " extru,tr %1,23,8,%1 ; No. Shift down, skip add.\n" \ 710 " ldo 8(%0),%0 ; Yes. Perform add.\n" \ 711 " extru,= %1,27,4,%%r0 ; Bits 7..4 zero?\n" \ 712 " extru,tr %1,27,4,%1 ; No. Shift down, skip add.\n" \ 713 " ldo 4(%0),%0 ; Yes. Perform add.\n" \ 714 " extru,= %1,29,2,%%r0 ; Bits 3..2 zero?\n" \ 715 " extru,tr %1,29,2,%1 ; No. Shift down, skip add.\n" \ 716 " ldo 2(%0),%0 ; Yes. 
Perform add.\n" \ 717 " extru %1,30,1,%1 ; Extract bit 1.\n" \ 718 " sub %0,%1,%0 ; Subtract it.\n" \ 719 : "=r" (count), "=r" (__tmp) : "1" (x)); \ 720 } while (0) 721 #endif /* hppa */ 722 723 /* These macros are for ABI=2.0w. In ABI=2.0n they can't be used, since GCC 724 (3.2) puts longlong into two adjacent 32-bit registers. Presumably this 725 is just a case of no direct support for 2.0n but treating it like 1.0. */ 726 #if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB) 727 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 728 __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0" \ 729 : "=r" (sh), "=&r" (sl) \ 730 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl)) 731 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 732 __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0" \ 733 : "=r" (sh), "=&r" (sl) \ 734 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl)) 735 #endif /* hppa */ 736 737 #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32 738 #if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch) 739 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 740 do { \ 741 /* if (__builtin_constant_p (bl)) \ 742 __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3" \ 743 : "=r" (sh), "=&r" (sl) \ 744 : "0" (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\ 745 else \ 746 */ __asm__ ("alr\t%1,%5\n\talcr\t%0,%3" \ 747 : "=r" (sh), "=&r" (sl) \ 748 : "0" (ah), "r" (bh), "%1" (al), "r" (bl)__CLOBBER_CC); \ 749 } while (0) 750 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 751 do { \ 752 /* if (__builtin_constant_p (bl)) \ 753 __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3" \ 754 : "=r" (sh), "=&r" (sl) \ 755 : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC); \ 756 else \ 757 */ __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3" \ 758 : "=r" (sh), "=&r" (sl) \ 759 : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC); \ 760 } while (0) 761 #if __GMP_GNUC_PREREQ (4,5) 762 #define umul_ppmm(xh, xl, m0, m1) \ 763 do { \ 764 union {UDItype __ll; \ 765 struct {USItype __h, __l;} __i; \ 766 } __x; \ 767 __x.__ll = (UDItype) (m0) * (UDItype) (m1); \ 768 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ 769 } while (0) 770 #else 771 #if 0 772 /* FIXME: this fails if gcc knows about the 64-bit registers. Use only 773 with a new enough processor pretending we have 32-bit registers. */ 774 #define umul_ppmm(xh, xl, m0, m1) \ 775 do { \ 776 union {UDItype __ll; \ 777 struct {USItype __h, __l;} __i; \ 778 } __x; \ 779 __asm__ ("mlr\t%0,%2" \ 780 : "=r" (__x.__ll) \ 781 : "%0" (m0), "r" (m1)); \ 782 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ 783 } while (0) 784 #else 785 #define umul_ppmm(xh, xl, m0, m1) \ 786 do { \ 787 /* When we have 64-bit regs and gcc is aware of that, we cannot simply use 788 DImode for the product, since that would be allocated to a single 64-bit 789 register, whereas mlr uses the low 32-bits of an even-odd register pair. 790 */ \ 791 register USItype __r0 __asm__ ("0"); \ 792 register USItype __r1 __asm__ ("1") = (m0); \ 793 __asm__ ("mlr\t%0,%3" \ 794 : "=r" (__r0), "=r" (__r1) \ 795 : "r" (__r1), "r" (m1)); \ 796 (xh) = __r0; (xl) = __r1; \ 797 } while (0) 798 #endif /* if 0 */ 799 #endif 800 #if 0 801 /* FIXME: this fails if gcc knows about the 64-bit registers. Use only 802 with a new enough processor pretending we have 32-bit registers. 
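   The problem is presumably the same even-odd register pair issue described
   for mlr above: dlr wants its 64-bit dividend in an even-odd pair of 32-bit
   registers (here r0:r1, with the remainder ending up in r0 and the quotient
   in r1), whereas gcc, once it knows about the 64-bit registers, may put the
   union's UDItype member into a single 64-bit register.  The enabled variant
   below pins the registers explicitly instead.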
*/ 803 #define udiv_qrnnd(q, r, n1, n0, d) \ 804 do { \ 805 union {UDItype __ll; \ 806 struct {USItype __h, __l;} __i; \ 807 } __x; \ 808 __x.__i.__h = n1; __x.__i.__l = n0; \ 809 __asm__ ("dlr\t%0,%2" \ 810 : "=r" (__x.__ll) \ 811 : "0" (__x.__ll), "r" (d)); \ 812 (q) = __x.__i.__l; (r) = __x.__i.__h; \ 813 } while (0) 814 #else 815 #define udiv_qrnnd(q, r, n1, n0, d) \ 816 do { \ 817 register USItype __r0 __asm__ ("0") = (n1); \ 818 register USItype __r1 __asm__ ("1") = (n0); \ 819 __asm__ ("dlr\t%0,%4" \ 820 : "=r" (__r0), "=r" (__r1) \ 821 : "r" (__r0), "r" (__r1), "r" (d)); \ 822 (q) = __r1; (r) = __r0; \ 823 } while (0) 824 #endif /* if 0 */ 825 #else /* if __zarch__ */ 826 /* FIXME: this fails if gcc knows about the 64-bit registers. */ 827 #define smul_ppmm(xh, xl, m0, m1) \ 828 do { \ 829 union {DItype __ll; \ 830 struct {USItype __h, __l;} __i; \ 831 } __x; \ 832 __asm__ ("mr\t%0,%2" \ 833 : "=r" (__x.__ll) \ 834 : "%0" (m0), "r" (m1)); \ 835 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ 836 } while (0) 837 /* FIXME: this fails if gcc knows about the 64-bit registers. */ 838 #define sdiv_qrnnd(q, r, n1, n0, d) \ 839 do { \ 840 union {DItype __ll; \ 841 struct {USItype __h, __l;} __i; \ 842 } __x; \ 843 __x.__i.__h = n1; __x.__i.__l = n0; \ 844 __asm__ ("dr\t%0,%2" \ 845 : "=r" (__x.__ll) \ 846 : "0" (__x.__ll), "r" (d)); \ 847 (q) = __x.__i.__l; (r) = __x.__i.__h; \ 848 } while (0) 849 #endif /* if __zarch__ */ 850 #endif 851 852 #if defined (__s390x__) && W_TYPE_SIZE == 64 853 /* We need to cast operands with register constraints, otherwise their types 854 will be assumed to be SImode by gcc. For these machines, such operations 855 will insert a value into the low 32 bits, and leave the high 32 bits with 856 garbage. */ 857 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 858 do { \ 859 __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3" \ 860 : "=r" (sh), "=&r" (sl) \ 861 : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \ 862 "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \ 863 } while (0) 864 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 865 do { \ 866 __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3" \ 867 : "=r" (sh), "=&r" (sl) \ 868 : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \ 869 "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \ 870 } while (0) 871 #define umul_ppmm(xh, xl, m0, m1) \ 872 do { \ 873 union {unsigned int __attribute__ ((mode(TI))) __ll; \ 874 struct {UDItype __h, __l;} __i; \ 875 } __x; \ 876 __asm__ ("mlgr\t%0,%2" \ 877 : "=r" (__x.__ll) \ 878 : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1))); \ 879 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ 880 } while (0) 881 #define udiv_qrnnd(q, r, n1, n0, d) \ 882 do { \ 883 union {unsigned int __attribute__ ((mode(TI))) __ll; \ 884 struct {UDItype __h, __l;} __i; \ 885 } __x; \ 886 __x.__i.__h = n1; __x.__i.__l = n0; \ 887 __asm__ ("dlgr\t%0,%2" \ 888 : "=r" (__x.__ll) \ 889 : "0" (__x.__ll), "r" ((UDItype)(d))); \ 890 (q) = __x.__i.__l; (r) = __x.__i.__h; \ 891 } while (0) 892 #if 0 /* FIXME: Enable for z10 (?) */ 893 #define count_leading_zeros(cnt, x) \ 894 do { \ 895 union {unsigned int __attribute__ ((mode(TI))) __ll; \ 896 struct {UDItype __h, __l;} __i; \ 897 } __clr_cnt; \ 898 __asm__ ("flogr\t%0,%1" \ 899 : "=r" (__clr_cnt.__ll) \ 900 : "r" (x) __CLOBBER_CC); \ 901 (cnt) = __clr_cnt.__i.__h; \ 902 } while (0) 903 #endif 904 #endif 905 906 /* On x86 and x86_64, every asm implicitly clobbers "flags" and "fpsr", 907 so we don't need __CLOBBER_CC. 
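   (As a reference point for the asm that follows -- an illustration only,
   not code used here: without inline asm the same two-word operations can
   be written in plain C roughly as

     UWtype __x = (al) + (bl);
     (sh) = (ah) + (bh) + (__x < (al));
     (sl) = __x;

   for add_ssaaaa, and

     UWtype __x = (al) - (bl);
     (sh) = (ah) - (bh) - ((al) < (bl));
     (sl) = __x;

   for sub_ddmmss.  The asm versions exist to take the carry/borrow directly
   from the flags set by the first instruction instead of re-deriving it
   with a compare.)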
*/ 908 #if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32 909 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 910 __asm__ ("addl %5,%k1\n\tadcl %3,%k0" \ 911 : "=r" (sh), "=&r" (sl) \ 912 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 913 "%1" ((USItype)(al)), "g" ((USItype)(bl))) 914 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 915 __asm__ ("subl %5,%k1\n\tsbbl %3,%k0" \ 916 : "=r" (sh), "=&r" (sl) \ 917 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 918 "1" ((USItype)(al)), "g" ((USItype)(bl))) 919 #define umul_ppmm(w1, w0, u, v) \ 920 __asm__ ("mull %3" \ 921 : "=a" (w0), "=d" (w1) \ 922 : "%0" ((USItype)(u)), "rm" ((USItype)(v))) 923 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\ 924 __asm__ ("divl %4" /* stringification in K&R C */ \ 925 : "=a" (q), "=d" (r) \ 926 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx))) 927 928 #if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx 929 /* Pentium bsrl takes between 10 and 72 cycles depending where the most 930 significant 1 bit is, hence the use of the following alternatives. bsfl 931 is slow too, between 18 and 42 depending where the least significant 1 932 bit is, so let the generic count_trailing_zeros below make use of the 933 count_leading_zeros here too. */ 934 935 #if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE) 936 /* The following should be a fixed 14 or 15 cycles, but possibly plus an L1 937 cache miss reading from __clz_tab. For P55 it's favoured over the float 938 below so as to avoid mixing MMX and x87, since the penalty for switching 939 between the two is about 100 cycles. 940 941 The asm block sets __shift to -3 if the high 24 bits are clear, -2 for 942 16, -1 for 8, or 0 otherwise. This could be written equivalently as 943 follows, but as of gcc 2.95.2 it results in conditional jumps. 944 945 __shift = -(__n < 0x1000000); 946 __shift -= (__n < 0x10000); 947 __shift -= (__n < 0x100); 948 949 The middle two sbbl and cmpl's pair, and with luck something gcc 950 generates might pair with the first cmpl and the last sbbl. The "32+1" 951 constant could be folded into __clz_tab[], but it doesn't seem worth 952 making a different table just for that. */ 953 954 #define count_leading_zeros(c,n) \ 955 do { \ 956 USItype __n = (n); \ 957 USItype __shift; \ 958 __asm__ ("cmpl $0x1000000, %1\n" \ 959 "sbbl %0, %0\n" \ 960 "cmpl $0x10000, %1\n" \ 961 "sbbl $0, %0\n" \ 962 "cmpl $0x100, %1\n" \ 963 "sbbl $0, %0\n" \ 964 : "=&r" (__shift) : "r" (__n)); \ 965 __shift = __shift*8 + 24 + 1; \ 966 (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift]; \ 967 } while (0) 968 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB 969 #define COUNT_LEADING_ZEROS_0 31 /* n==0 indistinguishable from n==1 */ 970 971 #else /* ! pentiummmx || LONGLONG_STANDALONE */ 972 /* The following should be a fixed 14 cycles or so. Some scheduling 973 opportunities should be available between the float load/store too. This 974 sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is 975 apparently suggested by the Intel optimizing manual (don't know exactly 976 where). gcc 2.95 or up will be best for this, so the "double" is 977 correctly aligned on the stack. */ 978 #define count_leading_zeros(c,n) \ 979 do { \ 980 union { \ 981 double d; \ 982 unsigned a[2]; \ 983 } __u; \ 984 __u.d = (UWtype) (n); \ 985 (c) = 0x3FF + 31 - (__u.a[1] >> 20); \ 986 } while (0) 987 #define COUNT_LEADING_ZEROS_0 (0x3FF + 31) 988 #endif /* pentiummx */ 989 990 #else /* ! 
pentium */ 991 992 #if __GMP_GNUC_PREREQ (3,4) /* using bsrl */ 993 #define count_leading_zeros(count,x) count_leading_zeros_gcc_clz(count,x) 994 #endif /* gcc clz */ 995 996 /* On P6, gcc prior to 3.0 generates a partial register stall for 997 __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former 998 being 1 code byte smaller. "31-__cbtmp" is a workaround, probably at the 999 cost of one extra instruction. Do this for "i386" too, since that means 1000 generic x86. */ 1001 #if ! defined (count_leading_zeros) && __GNUC__ < 3 \ 1002 && (HAVE_HOST_CPU_i386 \ 1003 || HAVE_HOST_CPU_i686 \ 1004 || HAVE_HOST_CPU_pentiumpro \ 1005 || HAVE_HOST_CPU_pentium2 \ 1006 || HAVE_HOST_CPU_pentium3) 1007 #define count_leading_zeros(count, x) \ 1008 do { \ 1009 USItype __cbtmp; \ 1010 ASSERT ((x) != 0); \ 1011 __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \ 1012 (count) = 31 - __cbtmp; \ 1013 } while (0) 1014 #endif /* gcc<3 asm bsrl */ 1015 1016 #ifndef count_leading_zeros 1017 #define count_leading_zeros(count, x) \ 1018 do { \ 1019 USItype __cbtmp; \ 1020 ASSERT ((x) != 0); \ 1021 __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \ 1022 (count) = __cbtmp ^ 31; \ 1023 } while (0) 1024 #endif /* asm bsrl */ 1025 1026 #if __GMP_GNUC_PREREQ (3,4) /* using bsfl */ 1027 #define count_trailing_zeros(count,x) count_trailing_zeros_gcc_ctz(count,x) 1028 #endif /* gcc ctz */ 1029 1030 #ifndef count_trailing_zeros 1031 #define count_trailing_zeros(count, x) \ 1032 do { \ 1033 ASSERT ((x) != 0); \ 1034 __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x))); \ 1035 } while (0) 1036 #endif /* asm bsfl */ 1037 1038 #endif /* ! pentium */ 1039 1040 #endif /* 80x86 */ 1041 1042 #if defined (__amd64__) && W_TYPE_SIZE == 64 1043 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1044 __asm__ ("addq %5,%q1\n\tadcq %3,%q0" \ 1045 : "=r" (sh), "=&r" (sl) \ 1046 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \ 1047 "%1" ((UDItype)(al)), "rme" ((UDItype)(bl))) 1048 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1049 __asm__ ("subq %5,%q1\n\tsbbq %3,%q0" \ 1050 : "=r" (sh), "=&r" (sl) \ 1051 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \ 1052 "1" ((UDItype)(al)), "rme" ((UDItype)(bl))) 1053 #if X86_ASM_MULX \ 1054 && (HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell \ 1055 || HAVE_HOST_CPU_skylake || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen) 1056 #define umul_ppmm(w1, w0, u, v) \ 1057 __asm__ ("mulx\t%3, %q0, %q1" \ 1058 : "=r" (w0), "=r" (w1) \ 1059 : "%d" ((UDItype)(u)), "rm" ((UDItype)(v))) 1060 #else 1061 #define umul_ppmm(w1, w0, u, v) \ 1062 __asm__ ("mulq\t%3" \ 1063 : "=a" (w0), "=d" (w1) \ 1064 : "%0" ((UDItype)(u)), "rm" ((UDItype)(v))) 1065 #endif 1066 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\ 1067 __asm__ ("divq %4" /* stringification in K&R C */ \ 1068 : "=a" (q), "=d" (r) \ 1069 : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx))) 1070 1071 #if HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell || HAVE_HOST_CPU_skylake \ 1072 || HAVE_HOST_CPU_k10 || HAVE_HOST_CPU_bd1 || HAVE_HOST_CPU_bd2 \ 1073 || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen \ 1074 || HAVE_HOST_CPU_bobcat || HAVE_HOST_CPU_jaguar 1075 #define count_leading_zeros(count, x) \ 1076 do { \ 1077 /* This is lzcnt, spelled for older assemblers. Destination and */ \ 1078 /* source must be a 64-bit registers, hence cast and %q. 
*/ \ 1079 __asm__ ("rep;bsr\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \ 1080 } while (0) 1081 #define COUNT_LEADING_ZEROS_0 64 1082 #else 1083 #define count_leading_zeros(count, x) \ 1084 do { \ 1085 UDItype __cbtmp; \ 1086 ASSERT ((x) != 0); \ 1087 __asm__ ("bsr\t%1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \ 1088 (count) = __cbtmp ^ 63; \ 1089 } while (0) 1090 #endif 1091 1092 #if HAVE_HOST_CPU_bd2 || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 \ 1093 || HAVE_HOST_CPU_zen || HAVE_HOST_CPU_jaguar 1094 #define count_trailing_zeros(count, x) \ 1095 do { \ 1096 /* This is tzcnt, spelled for older assemblers. Destination and */ \ 1097 /* source must be a 64-bit registers, hence cast and %q. */ \ 1098 __asm__ ("rep;bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \ 1099 } while (0) 1100 #define COUNT_TRAILING_ZEROS_0 64 1101 #else 1102 #define count_trailing_zeros(count, x) \ 1103 do { \ 1104 ASSERT ((x) != 0); \ 1105 __asm__ ("bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \ 1106 } while (0) 1107 #endif 1108 #endif /* __amd64__ */ 1109 1110 #if defined (__i860__) && W_TYPE_SIZE == 32 1111 #define rshift_rhlc(r,h,l,c) \ 1112 __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0" \ 1113 "=r" (r) : "r" (h), "r" (l), "rn" (c)) 1114 #endif /* i860 */ 1115 1116 #if defined (__i960__) && W_TYPE_SIZE == 32 1117 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1118 __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0" \ 1119 : "=r" (sh), "=&r" (sl) \ 1120 : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl)) 1121 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1122 __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0" \ 1123 : "=r" (sh), "=&r" (sl) \ 1124 : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl)) 1125 #define umul_ppmm(w1, w0, u, v) \ 1126 ({union {UDItype __ll; \ 1127 struct {USItype __l, __h;} __i; \ 1128 } __x; \ 1129 __asm__ ("emul %2,%1,%0" \ 1130 : "=d" (__x.__ll) : "%dI" (u), "dI" (v)); \ 1131 (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) 1132 #define __umulsidi3(u, v) \ 1133 ({UDItype __w; \ 1134 __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v)); \ 1135 __w; }) 1136 #define udiv_qrnnd(q, r, nh, nl, d) \ 1137 do { \ 1138 union {UDItype __ll; \ 1139 struct {USItype __l, __h;} __i; \ 1140 } __nn; \ 1141 __nn.__i.__h = (nh); __nn.__i.__l = (nl); \ 1142 __asm__ ("ediv %d,%n,%0" \ 1143 : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d)); \ 1144 (r) = __rq.__i.__l; (q) = __rq.__i.__h; \ 1145 } while (0) 1146 #define count_leading_zeros(count, x) \ 1147 do { \ 1148 USItype __cbtmp; \ 1149 __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x)); \ 1150 (count) = __cbtmp ^ 31; \ 1151 } while (0) 1152 #define COUNT_LEADING_ZEROS_0 (-32) /* sic */ 1153 #if defined (__i960mx) /* what is the proper symbol to test??? 
*/ 1154 #define rshift_rhlc(r,h,l,c) \ 1155 do { \ 1156 union {UDItype __ll; \ 1157 struct {USItype __l, __h;} __i; \ 1158 } __nn; \ 1159 __nn.__i.__h = (h); __nn.__i.__l = (l); \ 1160 __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c)); \ 1161 } 1162 #endif /* i960mx */ 1163 #endif /* i960 */ 1164 1165 #if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \ 1166 || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \ 1167 || defined (__mc5307__)) && W_TYPE_SIZE == 32 1168 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1169 __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0" \ 1170 : "=d" (sh), "=&d" (sl) \ 1171 : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \ 1172 "%1" ((USItype)(al)), "g" ((USItype)(bl))) 1173 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1174 __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0" \ 1175 : "=d" (sh), "=&d" (sl) \ 1176 : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \ 1177 "1" ((USItype)(al)), "g" ((USItype)(bl))) 1178 /* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r. */ 1179 #if defined (__mc68020__) || defined(mc68020) \ 1180 || defined (__mc68030__) || defined (mc68030) \ 1181 || defined (__mc68040__) || defined (mc68040) \ 1182 || defined (__mcpu32__) || defined (mcpu32) \ 1183 || defined (__NeXT__) 1184 #define umul_ppmm(w1, w0, u, v) \ 1185 __asm__ ("mulu%.l %3,%1:%0" \ 1186 : "=d" (w0), "=d" (w1) \ 1187 : "%0" ((USItype)(u)), "dmi" ((USItype)(v))) 1188 #define udiv_qrnnd(q, r, n1, n0, d) \ 1189 __asm__ ("divu%.l %4,%1:%0" \ 1190 : "=d" (q), "=d" (r) \ 1191 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d))) 1192 #define sdiv_qrnnd(q, r, n1, n0, d) \ 1193 __asm__ ("divs%.l %4,%1:%0" \ 1194 : "=d" (q), "=d" (r) \ 1195 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d))) 1196 #else /* for other 68k family members use 16x16->32 multiplication */ 1197 #define umul_ppmm(xh, xl, a, b) \ 1198 do { USItype __umul_tmp1, __umul_tmp2; \ 1199 __asm__ ("| Inlined umul_ppmm\n" \ 1200 " move%.l %5,%3\n" \ 1201 " move%.l %2,%0\n" \ 1202 " move%.w %3,%1\n" \ 1203 " swap %3\n" \ 1204 " swap %0\n" \ 1205 " mulu%.w %2,%1\n" \ 1206 " mulu%.w %3,%0\n" \ 1207 " mulu%.w %2,%3\n" \ 1208 " swap %2\n" \ 1209 " mulu%.w %5,%2\n" \ 1210 " add%.l %3,%2\n" \ 1211 " jcc 1f\n" \ 1212 " add%.l %#0x10000,%0\n" \ 1213 "1: move%.l %2,%3\n" \ 1214 " clr%.w %2\n" \ 1215 " swap %2\n" \ 1216 " swap %3\n" \ 1217 " clr%.w %3\n" \ 1218 " add%.l %3,%1\n" \ 1219 " addx%.l %2,%0\n" \ 1220 " | End inlined umul_ppmm" \ 1221 : "=&d" (xh), "=&d" (xl), \ 1222 "=d" (__umul_tmp1), "=&d" (__umul_tmp2) \ 1223 : "%2" ((USItype)(a)), "d" ((USItype)(b))); \ 1224 } while (0) 1225 #endif /* not mc68020 */ 1226 /* The '020, '030, '040 and '060 have bitfield insns. 1227 GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to 1228 exclude bfffo on that chip (bitfield insns not available). */ 1229 #if (defined (__mc68020__) || defined (mc68020) \ 1230 || defined (__mc68030__) || defined (mc68030) \ 1231 || defined (__mc68040__) || defined (mc68040) \ 1232 || defined (__mc68060__) || defined (mc68060) \ 1233 || defined (__NeXT__)) \ 1234 && ! 
defined (__mcpu32__) 1235 #define count_leading_zeros(count, x) \ 1236 __asm__ ("bfffo %1{%b2:%b2},%0" \ 1237 : "=d" (count) \ 1238 : "od" ((USItype) (x)), "n" (0)) 1239 #define COUNT_LEADING_ZEROS_0 32 1240 #endif 1241 #endif /* mc68000 */ 1242 1243 #if defined (__m88000__) && W_TYPE_SIZE == 32 1244 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1245 __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3" \ 1246 : "=r" (sh), "=&r" (sl) \ 1247 : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl)) 1248 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1249 __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3" \ 1250 : "=r" (sh), "=&r" (sl) \ 1251 : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl)) 1252 #define count_leading_zeros(count, x) \ 1253 do { \ 1254 USItype __cbtmp; \ 1255 __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x)); \ 1256 (count) = __cbtmp ^ 31; \ 1257 } while (0) 1258 #define COUNT_LEADING_ZEROS_0 63 /* sic */ 1259 #if defined (__m88110__) 1260 #define umul_ppmm(wh, wl, u, v) \ 1261 do { \ 1262 union {UDItype __ll; \ 1263 struct {USItype __h, __l;} __i; \ 1264 } __x; \ 1265 __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v)); \ 1266 (wh) = __x.__i.__h; \ 1267 (wl) = __x.__i.__l; \ 1268 } while (0) 1269 #define udiv_qrnnd(q, r, n1, n0, d) \ 1270 ({union {UDItype __ll; \ 1271 struct {USItype __h, __l;} __i; \ 1272 } __x, __q; \ 1273 __x.__i.__h = (n1); __x.__i.__l = (n0); \ 1274 __asm__ ("divu.d %0,%1,%2" \ 1275 : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d)); \ 1276 (r) = (n0) - __q.__l * (d); (q) = __q.__l; }) 1277 #endif /* __m88110__ */ 1278 #endif /* __m88000__ */ 1279 1280 #if defined (__mips) && W_TYPE_SIZE == 32 1281 #if __GMP_GNUC_PREREQ (4,4) 1282 #define umul_ppmm(w1, w0, u, v) \ 1283 do { \ 1284 UDItype __ll = (UDItype)(u) * (v); \ 1285 w1 = __ll >> 32; \ 1286 w0 = __ll; \ 1287 } while (0) 1288 #endif 1289 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__) 1290 #define umul_ppmm(w1, w0, u, v) \ 1291 __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v)) 1292 #endif 1293 #if !defined (umul_ppmm) 1294 #define umul_ppmm(w1, w0, u, v) \ 1295 __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1" \ 1296 : "=d" (w0), "=d" (w1) : "d" (u), "d" (v)) 1297 #endif 1298 #endif /* __mips */ 1299 1300 #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64 1301 #if defined (_MIPS_ARCH_MIPS64R6) 1302 #define umul_ppmm(w1, w0, u, v) \ 1303 do { \ 1304 UDItype __m0 = (u), __m1 = (v); \ 1305 (w0) = __m0 * __m1; \ 1306 __asm__ ("dmuhu\t%0, %1, %2" : "=d" (w1) : "d" (__m0), "d" (__m1)); \ 1307 } while (0) 1308 #endif 1309 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (4,4) 1310 #define umul_ppmm(w1, w0, u, v) \ 1311 do { \ 1312 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \ 1313 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \ 1314 w1 = __ll >> 64; \ 1315 w0 = __ll; \ 1316 } while (0) 1317 #endif 1318 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__) 1319 #define umul_ppmm(w1, w0, u, v) \ 1320 __asm__ ("dmultu %2,%3" \ 1321 : "=l" (w0), "=h" (w1) \ 1322 : "d" ((UDItype)(u)), "d" ((UDItype)(v))) 1323 #endif 1324 #if !defined (umul_ppmm) 1325 #define umul_ppmm(w1, w0, u, v) \ 1326 __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1" \ 1327 : "=d" (w0), "=d" (w1) \ 1328 : "d" ((UDItype)(u)), "d" ((UDItype)(v))) 1329 #endif 1330 #endif /* __mips */ 1331 1332 #if defined (__mmix__) && W_TYPE_SIZE == 64 1333 #define umul_ppmm(w1, w0, u, v) \ 1334 __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v)) 1335 #endif 1336 1337 #if 
defined (__ns32000__) && W_TYPE_SIZE == 32 1338 #define umul_ppmm(w1, w0, u, v) \ 1339 ({union {UDItype __ll; \ 1340 struct {USItype __l, __h;} __i; \ 1341 } __x; \ 1342 __asm__ ("meid %2,%0" \ 1343 : "=g" (__x.__ll) \ 1344 : "%0" ((USItype)(u)), "g" ((USItype)(v))); \ 1345 (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) 1346 #define __umulsidi3(u, v) \ 1347 ({UDItype __w; \ 1348 __asm__ ("meid %2,%0" \ 1349 : "=g" (__w) \ 1350 : "%0" ((USItype)(u)), "g" ((USItype)(v))); \ 1351 __w; }) 1352 #define udiv_qrnnd(q, r, n1, n0, d) \ 1353 ({union {UDItype __ll; \ 1354 struct {USItype __l, __h;} __i; \ 1355 } __x; \ 1356 __x.__i.__h = (n1); __x.__i.__l = (n0); \ 1357 __asm__ ("deid %2,%0" \ 1358 : "=g" (__x.__ll) \ 1359 : "0" (__x.__ll), "g" ((USItype)(d))); \ 1360 (r) = __x.__i.__l; (q) = __x.__i.__h; }) 1361 #define count_trailing_zeros(count,x) \ 1362 do { \ 1363 __asm__ ("ffsd %2,%0" \ 1364 : "=r" (count) \ 1365 : "0" ((USItype) 0), "r" ((USItype) (x))); \ 1366 } while (0) 1367 #endif /* __ns32000__ */ 1368 1369 /* In the past we had a block of various #defines tested 1370 _ARCH_PPC - AIX 1371 _ARCH_PWR - AIX 1372 __powerpc__ - gcc 1373 __POWERPC__ - BEOS 1374 __ppc__ - Darwin 1375 PPC - old gcc, GNU/Linux, SysV 1376 The plain PPC test was not good for vxWorks, since PPC is defined on all 1377 CPUs there (eg. m68k too), as a constant one is expected to compare 1378 CPU_FAMILY against. 1379 1380 At any rate, this was pretty unattractive and a bit fragile. The use of 1381 HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of 1382 getting the desired effect. 1383 1384 ENHANCE-ME: We should test _IBMR2 here when we add assembly support for 1385 the system vendor compilers. (Is that vendor compilers with inline asm, 1386 or what?) */ 1387 1388 #if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc) \ 1389 && W_TYPE_SIZE == 32 1390 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1391 do { \ 1392 if (__builtin_constant_p (bh) && (bh) == 0) \ 1393 __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \ 1394 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \ 1395 __CLOBBER_CC); \ 1396 else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \ 1397 __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \ 1398 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \ 1399 __CLOBBER_CC); \ 1400 else \ 1401 __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \ 1402 : "=r" (sh), "=&r" (sl) \ 1403 : "r" (ah), "r" (bh), "%r" (al), "rI" (bl) \ 1404 __CLOBBER_CC); \ 1405 } while (0) 1406 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1407 do { \ 1408 if (__builtin_constant_p (ah) && (ah) == 0) \ 1409 __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \ 1410 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl) \ 1411 __CLOBBER_CC); \ 1412 else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0) \ 1413 __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \ 1414 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl) \ 1415 __CLOBBER_CC); \ 1416 else if (__builtin_constant_p (bh) && (bh) == 0) \ 1417 __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \ 1418 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl) \ 1419 __CLOBBER_CC); \ 1420 else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \ 1421 __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \ 1422 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl) \ 1423 __CLOBBER_CC); \ 1424 else \ 1425 __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \ 1426 : "=r" (sh), "=&r" (sl) \ 1427 : "r" (ah), "r" (bh), "rI" (al), "r" (bl) \ 1428 __CLOBBER_CC); \ 1429 } while (0) 1430 #define 
count_leading_zeros(count, x) \ 1431 __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x)) 1432 #define COUNT_LEADING_ZEROS_0 32 1433 #if HAVE_HOST_CPU_FAMILY_powerpc 1434 #if __GMP_GNUC_PREREQ (4,4) 1435 #define umul_ppmm(w1, w0, u, v) \ 1436 do { \ 1437 UDItype __ll = (UDItype)(u) * (v); \ 1438 w1 = __ll >> 32; \ 1439 w0 = __ll; \ 1440 } while (0) 1441 #endif 1442 #if !defined (umul_ppmm) 1443 #define umul_ppmm(ph, pl, m0, m1) \ 1444 do { \ 1445 USItype __m0 = (m0), __m1 = (m1); \ 1446 __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \ 1447 (pl) = __m0 * __m1; \ 1448 } while (0) 1449 #endif 1450 #define smul_ppmm(ph, pl, m0, m1) \ 1451 do { \ 1452 SItype __m0 = (m0), __m1 = (m1); \ 1453 __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \ 1454 (pl) = __m0 * __m1; \ 1455 } while (0) 1456 #else 1457 #define smul_ppmm(xh, xl, m0, m1) \ 1458 __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1)) 1459 #define sdiv_qrnnd(q, r, nh, nl, d) \ 1460 __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d)) 1461 #endif 1462 #endif /* 32-bit POWER architecture variants. */ 1463 1464 /* We should test _IBMR2 here when we add assembly support for the system 1465 vendor compilers. */ 1466 #if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64 1467 #if !defined (_LONG_LONG_LIMB) 1468 /* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values. So 1469 use adde etc only when not _LONG_LONG_LIMB. */ 1470 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1471 do { \ 1472 if (__builtin_constant_p (bh) && (bh) == 0) \ 1473 __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \ 1474 : "=r" (sh), "=&r" (sl) \ 1475 : "r" ((UDItype)(ah)), \ 1476 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) \ 1477 __CLOBBER_CC); \ 1478 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \ 1479 __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \ 1480 : "=r" (sh), "=&r" (sl) \ 1481 : "r" ((UDItype)(ah)), \ 1482 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) \ 1483 __CLOBBER_CC); \ 1484 else \ 1485 __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \ 1486 : "=r" (sh), "=&r" (sl) \ 1487 : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \ 1488 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) \ 1489 __CLOBBER_CC); \ 1490 } while (0) 1491 /* We use "*rI" for the constant operand here, since with just "I", gcc barfs. 1492 This might seem strange, but gcc folds away the dead code late. 
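   For reference, the bh/ah special cases in these macros rely on the usual
   PowerPC carry identities (an explanatory note, not a change to the code):

     adde  rD,rA,rB  computes  rA + rB + CA
     addze rD,rA     computes  rA + 0  + CA    -- used when the high addend is 0
     addme rD,rA     computes  rA - 1  + CA    -- used when it is ~0 (i.e. -1)

   and likewise subfe/subfze/subfme for the subtraction forms, so a known
   all-zero or all-one high word avoids materializing that constant in a
   register.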
*/ 1493 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1494 do { \ 1495 if (__builtin_constant_p (bl) \ 1496 && (bl) > -0x8000 && (bl) <= 0x8000 && (bl) != 0) { \ 1497 if (__builtin_constant_p (ah) && (ah) == 0) \ 1498 __asm__ ("addic %1,%3,%4\n\tsubfze %0,%2" \ 1499 : "=r" (sh), "=&r" (sl) \ 1500 : "r" ((UDItype)(bh)), \ 1501 "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \ 1502 __CLOBBER_CC); \ 1503 else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \ 1504 __asm__ ("addic %1,%3,%4\n\tsubfme %0,%2" \ 1505 : "=r" (sh), "=&r" (sl) \ 1506 : "r" ((UDItype)(bh)), \ 1507 "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \ 1508 __CLOBBER_CC); \ 1509 else if (__builtin_constant_p (bh) && (bh) == 0) \ 1510 __asm__ ("addic %1,%3,%4\n\taddme %0,%2" \ 1511 : "=r" (sh), "=&r" (sl) \ 1512 : "r" ((UDItype)(ah)), \ 1513 "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \ 1514 __CLOBBER_CC); \ 1515 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \ 1516 __asm__ ("addic %1,%3,%4\n\taddze %0,%2" \ 1517 : "=r" (sh), "=&r" (sl) \ 1518 : "r" ((UDItype)(ah)), \ 1519 "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \ 1520 __CLOBBER_CC); \ 1521 else \ 1522 __asm__ ("addic %1,%4,%5\n\tsubfe %0,%3,%2" \ 1523 : "=r" (sh), "=&r" (sl) \ 1524 : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \ 1525 "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \ 1526 __CLOBBER_CC); \ 1527 } else { \ 1528 if (__builtin_constant_p (ah) && (ah) == 0) \ 1529 __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \ 1530 : "=r" (sh), "=&r" (sl) \ 1531 : "r" ((UDItype)(bh)), \ 1532 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \ 1533 __CLOBBER_CC); \ 1534 else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \ 1535 __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \ 1536 : "=r" (sh), "=&r" (sl) \ 1537 : "r" ((UDItype)(bh)), \ 1538 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \ 1539 __CLOBBER_CC); \ 1540 else if (__builtin_constant_p (bh) && (bh) == 0) \ 1541 __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \ 1542 : "=r" (sh), "=&r" (sl) \ 1543 : "r" ((UDItype)(ah)), \ 1544 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \ 1545 __CLOBBER_CC); \ 1546 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \ 1547 __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \ 1548 : "=r" (sh), "=&r" (sl) \ 1549 : "r" ((UDItype)(ah)), \ 1550 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \ 1551 __CLOBBER_CC); \ 1552 else \ 1553 __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \ 1554 : "=r" (sh), "=&r" (sl) \ 1555 : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \ 1556 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \ 1557 __CLOBBER_CC); \ 1558 } \ 1559 } while (0) 1560 #endif /* ! _LONG_LONG_LIMB */ 1561 #define count_leading_zeros(count, x) \ 1562 __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x)) 1563 #define COUNT_LEADING_ZEROS_0 64 1564 #if __GMP_GNUC_PREREQ (4,8) 1565 #define umul_ppmm(w1, w0, u, v) \ 1566 do { \ 1567 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \ 1568 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \ 1569 w1 = __ll >> 64; \ 1570 w0 = __ll; \ 1571 } while (0) 1572 #endif 1573 #if !defined (umul_ppmm) 1574 #define umul_ppmm(ph, pl, m0, m1) \ 1575 do { \ 1576 UDItype __m0 = (m0), __m1 = (m1); \ 1577 __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \ 1578 (pl) = __m0 * __m1; \ 1579 } while (0) 1580 #endif 1581 #define smul_ppmm(ph, pl, m0, m1) \ 1582 do { \ 1583 DItype __m0 = (m0), __m1 = (m1); \ 1584 __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \ 1585 (pl) = __m0 * __m1; \ 1586 } while (0) 1587 #endif /* 64-bit PowerPC. 
*/
1588
1589 #if defined (__pyr__) && W_TYPE_SIZE == 32
1590 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1591   __asm__ ("addw %5,%1\n\taddwc %3,%0" \
1592            : "=r" (sh), "=&r" (sl) \
1593            : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1594              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1595 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1596   __asm__ ("subw %5,%1\n\tsubwb %3,%0" \
1597            : "=r" (sh), "=&r" (sl) \
1598            : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1599              "1" ((USItype)(al)), "g" ((USItype)(bl)))
1600 /* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP. */
1601 #define umul_ppmm(w1, w0, u, v) \
1602   ({union {UDItype __ll; \
1603            struct {USItype __h, __l;} __i; \
1604           } __x; \
1605   __asm__ ("movw %1,%R0\n\tuemul %2,%0" \
1606            : "=&r" (__x.__ll) \
1607            : "g" ((USItype) (u)), "g" ((USItype)(v))); \
1608   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1609 #endif /* __pyr__ */
1610
1611 #if defined (__ibm032__) /* RT/ROMP */ && W_TYPE_SIZE == 32
1612 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1613   __asm__ ("a %1,%5\n\tae %0,%3" \
1614            : "=r" (sh), "=&r" (sl) \
1615            : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
1616              "%1" ((USItype)(al)), "r" ((USItype)(bl)))
1617 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1618   __asm__ ("s %1,%5\n\tse %0,%3" \
1619            : "=r" (sh), "=&r" (sl) \
1620            : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
1621              "1" ((USItype)(al)), "r" ((USItype)(bl)))
1622 #define smul_ppmm(ph, pl, m0, m1) \
1623   __asm__ ( \
1624        "s r2,r2\n" \
1625 "       mts r10,%2\n" \
1626 "       m r2,%3\n" \
1627 "       m r2,%3\n" \
1628 "       m r2,%3\n" \
1629 "       m r2,%3\n" \
1630 "       m r2,%3\n" \
1631 "       m r2,%3\n" \
1632 "       m r2,%3\n" \
1633 "       m r2,%3\n" \
1634 "       m r2,%3\n" \
1635 "       m r2,%3\n" \
1636 "       m r2,%3\n" \
1637 "       m r2,%3\n" \
1638 "       m r2,%3\n" \
1639 "       m r2,%3\n" \
1640 "       m r2,%3\n" \
1641 "       m r2,%3\n" \
1642 "       cas %0,r2,r0\n" \
1643 "       mfs r10,%1" \
1644            : "=r" (ph), "=r" (pl) \
1645            : "%r" ((USItype)(m0)), "r" ((USItype)(m1)) \
1646            : "r2")
1647 #define count_leading_zeros(count, x) \
1648   do { \
1649     if ((x) >= 0x10000) \
1650       __asm__ ("clz %0,%1" \
1651                : "=r" (count) : "r" ((USItype)(x) >> 16)); \
1652     else \
1653       { \
1654         __asm__ ("clz %0,%1" \
1655                  : "=r" (count) : "r" ((USItype)(x))); \
1656         (count) += 16; \
1657       } \
1658   } while (0)
1659 #endif /* RT/ROMP */
1660
1661 #if defined (__riscv64) && W_TYPE_SIZE == 64
1662 #define umul_ppmm(ph, pl, u, v) \
1663   do { \
1664     UDItype __u = (u), __v = (v); \
1665     (pl) = __u * __v; \
1666     __asm__ ("mulhu\t%0, %1, %2" : "=r" (ph) : "%r" (__u), "r" (__v)); \
1667   } while (0)
1668 #endif
1669
1670 #if (defined (__SH2__) || defined (__SH3__) || defined (__SH4__)) && W_TYPE_SIZE == 32
1671 #define umul_ppmm(w1, w0, u, v) \
1672   __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0" \
1673            : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
1674 #endif
1675
1676 #if defined (__sparc__) && W_TYPE_SIZE == 32
1677 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1678   __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0" \
1679            : "=r" (sh), "=&r" (sl) \
1680            : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \
1681            __CLOBBER_CC)
1682 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1683   __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0" \
1684            : "=r" (sh), "=&r" (sl) \
1685            : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \
1686            __CLOBBER_CC)
1687 /* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
1688    doesn't define anything to indicate that to us, it only sets __sparcv8.
*/ 1689 #if defined (__sparc_v9__) || defined (__sparcv9) 1690 /* Perhaps we should use floating-point operations here? */ 1691 #if 0 1692 /* Triggers a bug making mpz/tests/t-gcd.c fail. 1693 Perhaps we simply need explicitly zero-extend the inputs? */ 1694 #define umul_ppmm(w1, w0, u, v) \ 1695 __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" : \ 1696 "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1") 1697 #else 1698 /* Use v8 umul until above bug is fixed. */ 1699 #define umul_ppmm(w1, w0, u, v) \ 1700 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v)) 1701 #endif 1702 /* Use a plain v8 divide for v9. */ 1703 #define udiv_qrnnd(q, r, n1, n0, d) \ 1704 do { \ 1705 USItype __q; \ 1706 __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \ 1707 : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \ 1708 (r) = (n0) - __q * (d); \ 1709 (q) = __q; \ 1710 } while (0) 1711 #else 1712 #if defined (__sparc_v8__) /* gcc normal */ \ 1713 || defined (__sparcv8) /* gcc solaris */ \ 1714 || HAVE_HOST_CPU_supersparc 1715 /* Don't match immediate range because, 1) it is not often useful, 1716 2) the 'I' flag thinks of the range as a 13 bit signed interval, 1717 while we want to match a 13 bit interval, sign extended to 32 bits, 1718 but INTERPRETED AS UNSIGNED. */ 1719 #define umul_ppmm(w1, w0, u, v) \ 1720 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v)) 1721 1722 #if HAVE_HOST_CPU_supersparc 1723 #else 1724 /* Don't use this on SuperSPARC because its udiv only handles 53 bit 1725 dividends and will trap to the kernel for the rest. */ 1726 #define udiv_qrnnd(q, r, n1, n0, d) \ 1727 do { \ 1728 USItype __q; \ 1729 __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \ 1730 : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \ 1731 (r) = (n0) - __q * (d); \ 1732 (q) = __q; \ 1733 } while (0) 1734 #endif /* HAVE_HOST_CPU_supersparc */ 1735 1736 #else /* ! __sparc_v8__ */ 1737 #if defined (__sparclite__) 1738 /* This has hardware multiply but not divide. It also has two additional 1739 instructions scan (ffs from high bit) and divscc. */ 1740 #define umul_ppmm(w1, w0, u, v) \ 1741 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v)) 1742 #define udiv_qrnnd(q, r, n1, n0, d) \ 1743 __asm__ ("! Inlined udiv_qrnnd\n" \ 1744 " wr %%g0,%2,%%y ! Not a delayed write for sparclite\n" \ 1745 " tst %%g0\n" \ 1746 " divscc %3,%4,%%g1\n" \ 1747 " divscc %%g1,%4,%%g1\n" \ 1748 " divscc %%g1,%4,%%g1\n" \ 1749 " divscc %%g1,%4,%%g1\n" \ 1750 " divscc %%g1,%4,%%g1\n" \ 1751 " divscc %%g1,%4,%%g1\n" \ 1752 " divscc %%g1,%4,%%g1\n" \ 1753 " divscc %%g1,%4,%%g1\n" \ 1754 " divscc %%g1,%4,%%g1\n" \ 1755 " divscc %%g1,%4,%%g1\n" \ 1756 " divscc %%g1,%4,%%g1\n" \ 1757 " divscc %%g1,%4,%%g1\n" \ 1758 " divscc %%g1,%4,%%g1\n" \ 1759 " divscc %%g1,%4,%%g1\n" \ 1760 " divscc %%g1,%4,%%g1\n" \ 1761 " divscc %%g1,%4,%%g1\n" \ 1762 " divscc %%g1,%4,%%g1\n" \ 1763 " divscc %%g1,%4,%%g1\n" \ 1764 " divscc %%g1,%4,%%g1\n" \ 1765 " divscc %%g1,%4,%%g1\n" \ 1766 " divscc %%g1,%4,%%g1\n" \ 1767 " divscc %%g1,%4,%%g1\n" \ 1768 " divscc %%g1,%4,%%g1\n" \ 1769 " divscc %%g1,%4,%%g1\n" \ 1770 " divscc %%g1,%4,%%g1\n" \ 1771 " divscc %%g1,%4,%%g1\n" \ 1772 " divscc %%g1,%4,%%g1\n" \ 1773 " divscc %%g1,%4,%%g1\n" \ 1774 " divscc %%g1,%4,%%g1\n" \ 1775 " divscc %%g1,%4,%%g1\n" \ 1776 " divscc %%g1,%4,%%g1\n" \ 1777 " divscc %%g1,%4,%0\n" \ 1778 " rd %%y,%1\n" \ 1779 " bl,a 1f\n" \ 1780 " add %1,%4,%1\n" \ 1781 "1: ! 
End of inline udiv_qrnnd" \ 1782 : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d) \ 1783 : "%g1" __AND_CLOBBER_CC) 1784 #define count_leading_zeros(count, x) \ 1785 __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x)) 1786 /* Early sparclites return 63 for an argument of 0, but they warn that future 1787 implementations might change this. Therefore, leave COUNT_LEADING_ZEROS_0 1788 undefined. */ 1789 #endif /* __sparclite__ */ 1790 #endif /* __sparc_v8__ */ 1791 #endif /* __sparc_v9__ */ 1792 /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd. */ 1793 #ifndef umul_ppmm 1794 #define umul_ppmm(w1, w0, u, v) \ 1795 __asm__ ("! Inlined umul_ppmm\n" \ 1796 " wr %%g0,%2,%%y ! SPARC has 0-3 delay insn after a wr\n" \ 1797 " sra %3,31,%%g2 ! Don't move this insn\n" \ 1798 " and %2,%%g2,%%g2 ! Don't move this insn\n" \ 1799 " andcc %%g0,0,%%g1 ! Don't move this insn\n" \ 1800 " mulscc %%g1,%3,%%g1\n" \ 1801 " mulscc %%g1,%3,%%g1\n" \ 1802 " mulscc %%g1,%3,%%g1\n" \ 1803 " mulscc %%g1,%3,%%g1\n" \ 1804 " mulscc %%g1,%3,%%g1\n" \ 1805 " mulscc %%g1,%3,%%g1\n" \ 1806 " mulscc %%g1,%3,%%g1\n" \ 1807 " mulscc %%g1,%3,%%g1\n" \ 1808 " mulscc %%g1,%3,%%g1\n" \ 1809 " mulscc %%g1,%3,%%g1\n" \ 1810 " mulscc %%g1,%3,%%g1\n" \ 1811 " mulscc %%g1,%3,%%g1\n" \ 1812 " mulscc %%g1,%3,%%g1\n" \ 1813 " mulscc %%g1,%3,%%g1\n" \ 1814 " mulscc %%g1,%3,%%g1\n" \ 1815 " mulscc %%g1,%3,%%g1\n" \ 1816 " mulscc %%g1,%3,%%g1\n" \ 1817 " mulscc %%g1,%3,%%g1\n" \ 1818 " mulscc %%g1,%3,%%g1\n" \ 1819 " mulscc %%g1,%3,%%g1\n" \ 1820 " mulscc %%g1,%3,%%g1\n" \ 1821 " mulscc %%g1,%3,%%g1\n" \ 1822 " mulscc %%g1,%3,%%g1\n" \ 1823 " mulscc %%g1,%3,%%g1\n" \ 1824 " mulscc %%g1,%3,%%g1\n" \ 1825 " mulscc %%g1,%3,%%g1\n" \ 1826 " mulscc %%g1,%3,%%g1\n" \ 1827 " mulscc %%g1,%3,%%g1\n" \ 1828 " mulscc %%g1,%3,%%g1\n" \ 1829 " mulscc %%g1,%3,%%g1\n" \ 1830 " mulscc %%g1,%3,%%g1\n" \ 1831 " mulscc %%g1,%3,%%g1\n" \ 1832 " mulscc %%g1,0,%%g1\n" \ 1833 " add %%g1,%%g2,%0\n" \ 1834 " rd %%y,%1" \ 1835 : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v) \ 1836 : "%g1", "%g2" __AND_CLOBBER_CC) 1837 #endif 1838 #ifndef udiv_qrnnd 1839 #ifndef LONGLONG_STANDALONE 1840 #define udiv_qrnnd(q, r, n1, n0, d) \ 1841 do { UWtype __r; \ 1842 (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \ 1843 (r) = __r; \ 1844 } while (0) 1845 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype); 1846 #endif /* LONGLONG_STANDALONE */ 1847 #endif /* udiv_qrnnd */ 1848 #endif /* __sparc__ */ 1849 1850 #if defined (__sparc__) && W_TYPE_SIZE == 64 1851 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1852 __asm__ ( \ 1853 "addcc %r4,%5,%1\n" \ 1854 " addccc %r6,%7,%%g0\n" \ 1855 " addc %r2,%3,%0" \ 1856 : "=r" (sh), "=&r" (sl) \ 1857 : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)), \ 1858 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)), \ 1859 "%rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32) \ 1860 __CLOBBER_CC) 1861 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1862 __asm__ ( \ 1863 "subcc %r4,%5,%1\n" \ 1864 " subccc %r6,%7,%%g0\n" \ 1865 " subc %r2,%3,%0" \ 1866 : "=r" (sh), "=&r" (sl) \ 1867 : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)), \ 1868 "rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)), \ 1869 "rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32) \ 1870 __CLOBBER_CC) 1871 #if __VIS__ >= 0x300 1872 #undef add_ssaaaa 1873 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1874 __asm__ ( \ 1875 "addcc %r4, %5, %1\n" \ 1876 " addxc %r2, %r3, %0" \ 1877 : "=r" (sh), "=&r" (sl) \ 1878 : "rJ" ((UDItype)(ah)), "rJ" ((UDItype)(bh)), \ 1879 "%rJ" ((UDItype)(al)), "rI" 
((UDItype)(bl)) __CLOBBER_CC) 1880 #define umul_ppmm(ph, pl, m0, m1) \ 1881 do { \ 1882 UDItype __m0 = (m0), __m1 = (m1); \ 1883 (pl) = __m0 * __m1; \ 1884 __asm__ ("umulxhi\t%2, %1, %0" \ 1885 : "=r" (ph) \ 1886 : "%r" (__m0), "r" (__m1)); \ 1887 } while (0) 1888 #define count_leading_zeros(count, x) \ 1889 __asm__ ("lzd\t%1,%0" : "=r" (count) : "r" (x)) 1890 /* Needed by count_leading_zeros_32 in sparc64.h. */ 1891 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB 1892 #endif 1893 #endif 1894 1895 #if (defined (__vax) || defined (__vax__)) && W_TYPE_SIZE == 32 1896 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1897 __asm__ ("addl2 %5,%1\n\tadwc %3,%0" \ 1898 : "=g" (sh), "=&g" (sl) \ 1899 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 1900 "%1" ((USItype)(al)), "g" ((USItype)(bl))) 1901 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1902 __asm__ ("subl2 %5,%1\n\tsbwc %3,%0" \ 1903 : "=g" (sh), "=&g" (sl) \ 1904 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 1905 "1" ((USItype)(al)), "g" ((USItype)(bl))) 1906 #define smul_ppmm(xh, xl, m0, m1) \ 1907 do { \ 1908 union {UDItype __ll; \ 1909 struct {USItype __l, __h;} __i; \ 1910 } __x; \ 1911 USItype __m0 = (m0), __m1 = (m1); \ 1912 __asm__ ("emul %1,%2,$0,%0" \ 1913 : "=g" (__x.__ll) : "g" (__m0), "g" (__m1)); \ 1914 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ 1915 } while (0) 1916 #define sdiv_qrnnd(q, r, n1, n0, d) \ 1917 do { \ 1918 union {DItype __ll; \ 1919 struct {SItype __l, __h;} __i; \ 1920 } __x; \ 1921 __x.__i.__h = n1; __x.__i.__l = n0; \ 1922 __asm__ ("ediv %3,%2,%0,%1" \ 1923 : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d)); \ 1924 } while (0) 1925 #if 0 1926 /* FIXME: This instruction appears to be unimplemented on some systems (vax 1927 8800 maybe). */ 1928 #define count_trailing_zeros(count,x) \ 1929 do { \ 1930 __asm__ ("ffs 0, 31, %1, %0" \ 1931 : "=g" (count) \ 1932 : "g" ((USItype) (x))); \ 1933 } while (0) 1934 #endif 1935 #endif /* vax */ 1936 1937 #if defined (__z8000__) && W_TYPE_SIZE == 16 1938 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1939 __asm__ ("add %H1,%H5\n\tadc %H0,%H3" \ 1940 : "=r" (sh), "=&r" (sl) \ 1941 : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \ 1942 "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl))) 1943 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1944 __asm__ ("sub %H1,%H5\n\tsbc %H0,%H3" \ 1945 : "=r" (sh), "=&r" (sl) \ 1946 : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \ 1947 "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl))) 1948 #define umul_ppmm(xh, xl, m0, m1) \ 1949 do { \ 1950 union {long int __ll; \ 1951 struct {unsigned int __h, __l;} __i; \ 1952 } __x; \ 1953 unsigned int __m0 = (m0), __m1 = (m1); \ 1954 __asm__ ("mult %S0,%H3" \ 1955 : "=r" (__x.__i.__h), "=r" (__x.__i.__l) \ 1956 : "%1" (m0), "rQR" (m1)); \ 1957 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ 1958 (xh) += ((((signed int) __m0 >> 15) & __m1) \ 1959 + (((signed int) __m1 >> 15) & __m0)); \ 1960 } while (0) 1961 #endif /* __z8000__ */ 1962 1963 #endif /* __GNUC__ */ 1964 1965 #endif /* NO_ASM */ 1966 1967 1968 /* FIXME: "sidi" here is highly doubtful, should sometimes be "diti". 
*/ 1969 #if !defined (umul_ppmm) && defined (__umulsidi3) 1970 #define umul_ppmm(ph, pl, m0, m1) \ 1971 do { \ 1972 UDWtype __ll = __umulsidi3 (m0, m1); \ 1973 ph = (UWtype) (__ll >> W_TYPE_SIZE); \ 1974 pl = (UWtype) __ll; \ 1975 } while (0) 1976 #endif 1977 1978 #if !defined (__umulsidi3) 1979 #define __umulsidi3(u, v) \ 1980 ({UWtype __hi, __lo; \ 1981 umul_ppmm (__hi, __lo, u, v); \ 1982 ((UDWtype) __hi << W_TYPE_SIZE) | __lo; }) 1983 #endif 1984 1985 1986 #if defined (__cplusplus) 1987 #define __longlong_h_C "C" 1988 #else 1989 #define __longlong_h_C 1990 #endif 1991 1992 /* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist. The "_r" 1993 forms have "reversed" arguments, meaning the pointer is last, which 1994 sometimes allows better parameter passing, in particular on 64-bit 1995 hppa. */ 1996 1997 #define mpn_umul_ppmm __MPN(umul_ppmm) 1998 extern __longlong_h_C UWtype mpn_umul_ppmm (UWtype *, UWtype, UWtype); 1999 2000 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm \ 2001 && ! defined (LONGLONG_STANDALONE) 2002 #define umul_ppmm(wh, wl, u, v) \ 2003 do { \ 2004 UWtype __umul_ppmm__p0; \ 2005 (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v));\ 2006 (wl) = __umul_ppmm__p0; \ 2007 } while (0) 2008 #endif 2009 2010 #define mpn_umul_ppmm_r __MPN(umul_ppmm_r) 2011 extern __longlong_h_C UWtype mpn_umul_ppmm_r (UWtype, UWtype, UWtype *); 2012 2013 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r \ 2014 && ! defined (LONGLONG_STANDALONE) 2015 #define umul_ppmm(wh, wl, u, v) \ 2016 do { \ 2017 UWtype __umul_p0; \ 2018 (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_p0); \ 2019 (wl) = __umul_p0; \ 2020 } while (0) 2021 #endif 2022 2023 #define mpn_udiv_qrnnd __MPN(udiv_qrnnd) 2024 extern __longlong_h_C UWtype mpn_udiv_qrnnd (UWtype *, UWtype, UWtype, UWtype); 2025 2026 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd \ 2027 && ! defined (LONGLONG_STANDALONE) 2028 #define udiv_qrnnd(q, r, n1, n0, d) \ 2029 do { \ 2030 UWtype __udiv_qrnnd_r; \ 2031 (q) = mpn_udiv_qrnnd (&__udiv_qrnnd_r, \ 2032 (UWtype) (n1), (UWtype) (n0), (UWtype) d); \ 2033 (r) = __udiv_qrnnd_r; \ 2034 } while (0) 2035 #endif 2036 2037 #define mpn_udiv_qrnnd_r __MPN(udiv_qrnnd_r) 2038 extern __longlong_h_C UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, UWtype *); 2039 2040 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r \ 2041 && ! defined (LONGLONG_STANDALONE) 2042 #define udiv_qrnnd(q, r, n1, n0, d) \ 2043 do { \ 2044 UWtype __udiv_qrnnd_r; \ 2045 (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) d, \ 2046 &__udiv_qrnnd_r); \ 2047 (r) = __udiv_qrnnd_r; \ 2048 } while (0) 2049 #endif 2050 2051 2052 /* If this machine has no inline assembler, use C macros. */ 2053 2054 #if !defined (add_ssaaaa) 2055 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 2056 do { \ 2057 UWtype __x; \ 2058 UWtype __al = (al); \ 2059 UWtype __bl = (bl); \ 2060 __x = __al + __bl; \ 2061 (sh) = (ah) + (bh) + (__x < __al); \ 2062 (sl) = __x; \ 2063 } while (0) 2064 #endif 2065 2066 #if !defined (sub_ddmmss) 2067 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 2068 do { \ 2069 UWtype __x; \ 2070 UWtype __al = (al); \ 2071 UWtype __bl = (bl); \ 2072 __x = __al - __bl; \ 2073 (sh) = (ah) - (bh) - (__al < __bl); \ 2074 (sl) = __x; \ 2075 } while (0) 2076 #endif 2077 2078 /* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of 2079 smul_ppmm. 
*/
2080 #if !defined (umul_ppmm) && defined (smul_ppmm)
2081 #define umul_ppmm(w1, w0, u, v) \
2082   do { \
2083     UWtype __w1; \
2084     UWtype __xm0 = (u), __xm1 = (v); \
2085     smul_ppmm (__w1, w0, __xm0, __xm1); \
2086     (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
2087                 + (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
2088   } while (0)
2089 #endif
2090
2091 /* If we still don't have umul_ppmm, define it using plain C.
2092
2093    For reference, when this code is used for squaring (ie. u and v identical
2094    expressions), gcc recognises __x1 and __x2 are the same and generates 3
2095    multiplies, not 4.  The subsequent additions could be optimized a bit,
2096    but the only place GMP currently uses such a square is mpn_sqr_basecase,
2097    and chips obliged to use this generic C umul will have plenty of worse
2098    performance problems than a couple of extra instructions on the diagonal
2099    of sqr_basecase. */
2100
2101 #if !defined (umul_ppmm)
2102 #define umul_ppmm(w1, w0, u, v) \
2103   do { \
2104     UWtype __x0, __x1, __x2, __x3; \
2105     UHWtype __ul, __vl, __uh, __vh; \
2106     UWtype __u = (u), __v = (v); \
2107  \
2108     __ul = __ll_lowpart (__u); \
2109     __uh = __ll_highpart (__u); \
2110     __vl = __ll_lowpart (__v); \
2111     __vh = __ll_highpart (__v); \
2112  \
2113     __x0 = (UWtype) __ul * __vl; \
2114     __x1 = (UWtype) __ul * __vh; \
2115     __x2 = (UWtype) __uh * __vl; \
2116     __x3 = (UWtype) __uh * __vh; \
2117  \
2118     __x1 += __ll_highpart (__x0);/* this can't give carry */ \
2119     __x1 += __x2; /* but this indeed can */ \
2120     if (__x1 < __x2) /* did we get it? */ \
2121       __x3 += __ll_B; /* yes, add it in the proper pos. */ \
2122  \
2123     (w1) = __x3 + __ll_highpart (__x1); \
2124     (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0); \
2125   } while (0)
2126 #endif
2127
2128 /* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
2129    exist in one form or another). */
2130 #if !defined (smul_ppmm)
2131 #define smul_ppmm(w1, w0, u, v) \
2132   do { \
2133     UWtype __w1; \
2134     UWtype __xm0 = (u), __xm1 = (v); \
2135     umul_ppmm (__w1, w0, __xm0, __xm1); \
2136     (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
2137                 - (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
2138   } while (0)
2139 #endif
2140
2141 /* Define this unconditionally, so it can be used for debugging. */
2142 #define __udiv_qrnnd_c(q, r, n1, n0, d) \
2143   do { \
2144     UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \
2145  \
2146     ASSERT ((d) != 0); \
2147     ASSERT ((n1) < (d)); \
2148  \
2149     __d1 = __ll_highpart (d); \
2150     __d0 = __ll_lowpart (d); \
2151  \
2152     __q1 = (n1) / __d1; \
2153     __r1 = (n1) - __q1 * __d1; \
2154     __m = __q1 * __d0; \
2155     __r1 = __r1 * __ll_B | __ll_highpart (n0); \
2156     if (__r1 < __m) \
2157       { \
2158         __q1--, __r1 += (d); \
2159         if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
2160           if (__r1 < __m) \
2161             __q1--, __r1 += (d); \
2162       } \
2163     __r1 -= __m; \
2164  \
2165     __q0 = __r1 / __d1; \
2166     __r0 = __r1 - __q0 * __d1; \
2167     __m = __q0 * __d0; \
2168     __r0 = __r0 * __ll_B | __ll_lowpart (n0); \
2169     if (__r0 < __m) \
2170       { \
2171         __q0--, __r0 += (d); \
2172         if (__r0 >= (d)) \
2173           if (__r0 < __m) \
2174             __q0--, __r0 += (d); \
2175       } \
2176     __r0 -= __m; \
2177  \
2178     (q) = __q1 * __ll_B | __q0; \
2179     (r) = __r0; \
2180   } while (0)
2181
2182 /* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
2183    __udiv_w_sdiv (defined in libgcc or elsewhere). */
2184 #if !defined (udiv_qrnnd) && defined (sdiv_qrnnd) \
2185     && !
defined (LONGLONG_STANDALONE) 2186 #define udiv_qrnnd(q, r, nh, nl, d) \ 2187 do { \ 2188 UWtype __r; \ 2189 (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d); \ 2190 (r) = __r; \ 2191 } while (0) 2192 __GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype); 2193 #endif 2194 2195 /* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c. */ 2196 #if !defined (udiv_qrnnd) 2197 #define UDIV_NEEDS_NORMALIZATION 1 2198 #define udiv_qrnnd __udiv_qrnnd_c 2199 #endif 2200 2201 #if !defined (count_leading_zeros) 2202 #define count_leading_zeros(count, x) \ 2203 do { \ 2204 UWtype __xr = (x); \ 2205 UWtype __a; \ 2206 \ 2207 if (W_TYPE_SIZE == 32) \ 2208 { \ 2209 __a = __xr < ((UWtype) 1 << 2*__BITS4) \ 2210 ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1) \ 2211 : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1 \ 2212 : 3*__BITS4 + 1); \ 2213 } \ 2214 else \ 2215 { \ 2216 for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8) \ 2217 if (((__xr >> __a) & 0xff) != 0) \ 2218 break; \ 2219 ++__a; \ 2220 } \ 2221 \ 2222 (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a]; \ 2223 } while (0) 2224 /* This version gives a well-defined value for zero. */ 2225 #define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1) 2226 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB 2227 #define COUNT_LEADING_ZEROS_SLOW 2228 #endif 2229 2230 /* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */ 2231 #if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY 2232 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB 2233 #endif 2234 2235 #ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB 2236 extern const unsigned char __GMP_DECLSPEC __clz_tab[129]; 2237 #endif 2238 2239 #if !defined (count_trailing_zeros) 2240 #if !defined (COUNT_LEADING_ZEROS_SLOW) 2241 /* Define count_trailing_zeros using an asm count_leading_zeros. */ 2242 #define count_trailing_zeros(count, x) \ 2243 do { \ 2244 UWtype __ctz_x = (x); \ 2245 UWtype __ctz_c; \ 2246 ASSERT (__ctz_x != 0); \ 2247 count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x); \ 2248 (count) = W_TYPE_SIZE - 1 - __ctz_c; \ 2249 } while (0) 2250 #else 2251 /* Define count_trailing_zeros in plain C, assuming small counts are common. 2252 We use clz_tab without ado, since the C count_leading_zeros above will have 2253 pulled it in. */ 2254 #define count_trailing_zeros(count, x) \ 2255 do { \ 2256 UWtype __ctz_x = (x); \ 2257 int __ctz_c; \ 2258 \ 2259 if (LIKELY ((__ctz_x & 0xff) != 0)) \ 2260 (count) = __clz_tab[__ctz_x & -__ctz_x] - 2; \ 2261 else \ 2262 { \ 2263 for (__ctz_c = 8 - 2; __ctz_c < W_TYPE_SIZE - 2; __ctz_c += 8) \ 2264 { \ 2265 __ctz_x >>= 8; \ 2266 if (LIKELY ((__ctz_x & 0xff) != 0)) \ 2267 break; \ 2268 } \ 2269 \ 2270 (count) = __ctz_c + __clz_tab[__ctz_x & -__ctz_x]; \ 2271 } \ 2272 } while (0) 2273 #endif 2274 #endif 2275 2276 #ifndef UDIV_NEEDS_NORMALIZATION 2277 #define UDIV_NEEDS_NORMALIZATION 0 2278 #endif 2279 2280 /* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and 2281 that hence the latter should always be used. */ 2282 #ifndef UDIV_PREINV_ALWAYS 2283 #define UDIV_PREINV_ALWAYS 0 2284 #endif 2285
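
/* Illustration of the generic C paths above: the sketch below mirrors the
   plain-C umul_ppmm (four half-word partial products) and __udiv_qrnnd_c
   (two half-word division steps, needing n1 < d and a normalized divisor,
   i.e. the UDIV_NEEDS_NORMALIZATION condition) for a hypothetical 32-bit
   word built from the standard <stdint.h> types, and checks the results
   against native 64-bit arithmetic.  It is kept under "#if 0" so it is never
   compiled as part of this header; the "ex_" names are illustrative only and
   exist nowhere else in GMP.  */
#if 0
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Half-word split helpers, mirroring __ll_B, __ll_lowpart and __ll_highpart
   for W_TYPE_SIZE == 32.  */
#define EX_B       ((uint32_t) 1 << 16)
#define EX_LOW(t)  ((uint32_t) (t) & (EX_B - 1))
#define EX_HIGH(t) ((uint32_t) (t) >> 16)

/* 32x32->64 multiply built from four 16x16 partial products, as in the
   generic umul_ppmm.  */
static void
ex_umul_ppmm (uint32_t *w1, uint32_t *w0, uint32_t u, uint32_t v)
{
  uint32_t ul = EX_LOW (u), uh = EX_HIGH (u);
  uint32_t vl = EX_LOW (v), vh = EX_HIGH (v);
  uint32_t x0 = ul * vl, x1 = ul * vh, x2 = uh * vl, x3 = uh * vh;

  x1 += EX_HIGH (x0);           /* cannot carry */
  x1 += x2;                     /* but this can */
  if (x1 < x2)                  /* carry into the high product */
    x3 += EX_B;

  *w1 = x3 + EX_HIGH (x1);
  *w0 = (x1 << 16) + EX_LOW (x0);
}

/* Divide n1*2^32+n0 by d, as in __udiv_qrnnd_c: one half-word quotient digit
   per step, with the same two adjustment tests.  Requires n1 < d and the
   most significant bit of d set.  */
static void
ex_udiv_qrnnd (uint32_t *q, uint32_t *r,
               uint32_t n1, uint32_t n0, uint32_t d)
{
  uint32_t d1 = EX_HIGH (d), d0 = EX_LOW (d);
  uint32_t q1, q0, r1, r0, m;

  assert (n1 < d && (d & 0x80000000u) != 0);

  q1 = n1 / d1;
  r1 = n1 - q1 * d1;
  m = q1 * d0;
  r1 = r1 * EX_B | EX_HIGH (n0);
  if (r1 < m)
    {
      q1--, r1 += d;
      if (r1 >= d)              /* i.e. no carry when adding d */
        if (r1 < m)
          q1--, r1 += d;
    }
  r1 -= m;

  q0 = r1 / d1;
  r0 = r1 - q0 * d1;
  m = q0 * d0;
  r0 = r0 * EX_B | EX_LOW (n0);
  if (r0 < m)
    {
      q0--, r0 += d;
      if (r0 >= d)
        if (r0 < m)
          q0--, r0 += d;
    }
  r0 -= m;

  *q = q1 * EX_B | q0;
  *r = r0;
}

int
main (void)
{
  uint32_t w1, w0, q, r;

  ex_umul_ppmm (&w1, &w0, 0xdeadbeefu, 0x12345679u);
  assert (((uint64_t) w1 << 32 | w0) == (uint64_t) 0xdeadbeefu * 0x12345679u);

  ex_udiv_qrnnd (&q, &r, 0xfedcba97u, 0, 0xfedcba98u);  /* d already normalized */
  assert (r < 0xfedcba98u
          && ((uint64_t) 0xfedcba97u << 32) == (uint64_t) q * 0xfedcba98u + r);

  puts ("generic umul_ppmm / __udiv_qrnnd_c sketch OK");
  return 0;
}
#endif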