/* longlong.h -- definitions for mixed size 32/64 bit arithmetic.

Copyright 1991-2020 Free Software Foundation, Inc.

This file is free software; you can redistribute it and/or modify it under the
terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option) any
later version.

This file is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
details.

You should have received a copy of the GNU Lesser General Public License
along with this file.  If not, see https://www.gnu.org/licenses/.  */

/* You have to define the following before including this file:

   UWtype -- An unsigned type, default type for operations (typically a "word")
   UHWtype -- An unsigned type, at least half the size of UWtype
   UDWtype -- An unsigned type, at least twice as large as UWtype
   W_TYPE_SIZE -- size in bits of UWtype

   SItype, USItype -- Signed and unsigned 32 bit types
   DItype, UDItype -- Signed and unsigned 64 bit types

   On a 32 bit machine UWtype should typically be USItype;
   on a 64 bit machine, UWtype should typically be UDItype.

   Optionally, define:

   LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
   NO_ASM -- Disable inline asm


   CAUTION!  Using this version of longlong.h outside of GMP is not safe.  You
   need to include gmp.h and gmp-impl.h, or certain things might not work as
   expected.
*/

#define __BITS4 (W_TYPE_SIZE / 4)
#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))

/* This is used to make sure no undesirable sharing between different libraries
   that use this file takes place.  */
#ifndef __MPN
#define __MPN(x) __##x
#endif

/* Define auxiliary asm macros.

   1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
   UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
   word product in HIGH_PROD and LOW_PROD.

   2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
   UDWtype product.  This is just a variant of umul_ppmm.

   3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
   denominator) divides a UDWtype, composed of the UWtype integers
   HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
   in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
   than DENOMINATOR for correct operation.  If, in addition, the most
   significant bit of DENOMINATOR is required to be 1 (i.e. the divisor must
   be normalized), then the pre-processor symbol UDIV_NEEDS_NORMALIZATION is
   defined to 1.

   4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
   denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
   is rounded towards 0.

   5) count_leading_zeros(count, x) counts the number of zero-bits from the
   msb to the first non-zero bit in the UWtype X.  This is the number of
   steps X needs to be shifted left to set the msb.  Undefined for X == 0,
   unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.

   6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
   from the least significant end.

   7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
   high_addend_2, low_addend_2) adds two two-word UWtype integers, composed
   of HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
   respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
   (i.e. carry out) is not stored anywhere, and is lost.

   8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
   high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
   composed of HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
   LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
   and LOW_DIFFERENCE.  Overflow (i.e. carry out) is not stored anywhere,
   and is lost.

   (A short usage sketch showing how these primitives compose follows this
   comment.)

   If any of these macros are left undefined for a particular CPU,
   C macros are used.


   Notes:

   For add_ssaaaa the two high and two low addends can both commute, but
   unfortunately gcc only supports one "%" commutative in each asm block.
   This has always been so but is only documented in recent versions
   (eg. pre-release 3.3).  Having two or more "%"s can cause an internal
   compiler error in certain rare circumstances.

   Apparently it was only the last "%" that was ever actually respected, so
   the code has been updated to leave just that.  Clearly there's a free
   choice whether high or low should get it, if there's a reason to favour
   one over the other.  Also obviously when the constraints on the two
   operands are identical there's no benefit to the reloader in any "%" at
   all.

*/
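/* Usage sketch (added for illustration; not part of the configured macros,
   and kept under "#if 0" so it is never compiled).  It assumes the
   UWtype/W_TYPE_SIZE setup described above.  The helper names
   example_addmul_1 and example_div_2by1 are hypothetical and exist only in
   this sketch.  */
#if 0
/* (hi:lo) += a * b, using umul_ppmm for the two-word product and add_ssaaaa
   for the two-word addition.  Any carry out of the high word is lost,
   exactly as documented for add_ssaaaa above.  */
static void
example_addmul_1 (UWtype *hi, UWtype *lo, UWtype a, UWtype b)
{
  UWtype ph, pl;
  umul_ppmm (ph, pl, a, b);
  add_ssaaaa (*hi, *lo, *hi, *lo, ph, pl);
}

/* q = floor ((n1:n0) / d) and r = (n1:n0) mod d, assuming d != 0 and
   n1 < d (so the quotient fits in one word).  When UDIV_NEEDS_NORMALIZATION
   is non-zero, udiv_qrnnd additionally wants the most significant bit of
   the divisor set, so numerator and divisor are first shifted left by
   count_leading_zeros(d) and the remainder is shifted back afterwards.
   Shifting preserves the n1 < d precondition.  */
static void
example_div_2by1 (UWtype *q, UWtype *r, UWtype n1, UWtype n0, UWtype d)
{
  int cnt;
  count_leading_zeros (cnt, d);	/* defined, since d != 0 */
  if (cnt != 0)
    {
      d <<= cnt;
      n1 = (n1 << cnt) | (n0 >> (W_TYPE_SIZE - cnt));
      n0 <<= cnt;
    }
  udiv_qrnnd (*q, *r, n1, n0, d);
  *r >>= cnt;
}
#endif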
/* The CPUs come in alphabetical order below.

   Please add support for more CPUs here, or improve the current support
   for the CPUs below!  */


/* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
   3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
   Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
   __builtin_ctzll.

   These builtins are only used where we have checked what code comes out;
   on some chips they're merely libgcc calls, in which case we instead want
   an inline (either asm or generic C).

   These builtins are better than an asm block of the same insn, since an
   asm block doesn't give gcc any information about scheduling or resource
   usage.  We keep an asm block for use on prior versions of gcc though.

   For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
   it's not used (for count_leading_zeros) because it generally gives extra
   code to ensure the result is 0 when the input is 0, which we don't need
   or want.  */

#ifdef _LONG_LONG_LIMB
#define count_leading_zeros_gcc_clz(count,x)	\
  do {						\
    ASSERT ((x) != 0);				\
    (count) = __builtin_clzll (x);		\
  } while (0)
#else
#define count_leading_zeros_gcc_clz(count,x)	\
  do {						\
    ASSERT ((x) != 0);				\
    (count) = __builtin_clzl (x);		\
  } while (0)
#endif

#ifdef _LONG_LONG_LIMB
#define count_trailing_zeros_gcc_ctz(count,x)	\
  do {						\
    ASSERT ((x) != 0);				\
    (count) = __builtin_ctzll (x);		\
  } while (0)
#else
#define count_trailing_zeros_gcc_ctz(count,x)	\
  do {						\
    ASSERT ((x) != 0);				\
    (count) = __builtin_ctzl (x);		\
  } while (0)
#endif
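/* For reference (added for illustration, not used anywhere): the semantics
   of count_leading_zeros can be written as a plain C loop.  The builtin and
   asm versions in this file compute the same value in a handful of
   instructions.  The macro name below is hypothetical and exists only in
   this sketch; like the real macros, it is undefined for x == 0.  */
#if 0
#define count_leading_zeros_reference(count, x)				\
  do {									\
    UWtype __clz_ref_x = (x);						\
    int __clz_ref_c = 0;						\
    ASSERT (__clz_ref_x != 0);						\
    while ((__clz_ref_x & ((UWtype) 1 << (W_TYPE_SIZE - 1))) == 0)	\
      {									\
	__clz_ref_x <<= 1;						\
	__clz_ref_c++;							\
      }									\
    (count) = __clz_ref_c;						\
  } while (0)
#endif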
/* FIXME: The macros using external routines like __MPN(count_leading_zeros)
   don't need to be under !NO_ASM */
#if ! defined (NO_ASM)

#if defined (__alpha) && W_TYPE_SIZE == 64
/* Most alpha-based machines, except Cray systems. */
#if defined (__GNUC__)
#if __GMP_GNUC_PREREQ (3,3)
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __m0 = (m0), __m1 = (m1);					\
    (ph) = __builtin_alpha_umulh (__m0, __m1);				\
    (pl) = __m0 * __m1;							\
  } while (0)
#else
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("umulh %r1,%2,%0"						\
	     : "=r" (ph)						\
	     : "%rJ" (__m0), "rI" (__m1));				\
    (pl) = __m0 * __m1;							\
  } while (0)
#endif
#else /* ! __GNUC__ */
#include <machine/builtins.h>
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __m0 = (m0), __m1 = (m1);					\
    (ph) = __UMULH (__m0, __m1);					\
    (pl) = __m0 * __m1;							\
  } while (0)
#endif
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;							\
    __di = __MPN(invert_limb) (d);					\
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
  } while (0)
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#endif /* LONGLONG_STANDALONE */

/* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
   always goes into libgmp.so, even when not actually used.  */
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB

#if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
#define count_leading_zeros(COUNT,X) \
  __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
#define count_trailing_zeros(COUNT,X) \
  __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
#endif /* clz/ctz using cix */

#if ! defined (count_leading_zeros)				\
  && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
/* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
   "$31" is written explicitly in the asm, since an "r" constraint won't
   select reg 31.  There seems no need to worry about "r31" syntax for cray,
   since gcc itself (pre-release 3.4) emits just $31 in various places.  */
#define ALPHA_CMPBGE_0(dst, src) \
  do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
/* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
   them, locating the highest non-zero byte.  A second __clz_tab lookup
   counts the leading zero bits in that byte, giving the result.  */
#define count_leading_zeros(count, x)					\
  do {									\
    UWtype  __clz__b, __clz__c, __clz__x = (x);				\
    ALPHA_CMPBGE_0 (__clz__b,  __clz__x);	    /* zero bytes */	\
    __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F];  /* 8 to 1 byte */	\
    __clz__b = __clz__b * 8 - 7;		    /* 57 to 1 shift */	\
    __clz__x >>= __clz__b;						\
    __clz__c = __clz_tab [__clz__x];		    /* 8 to 1 bit */	\
    __clz__b = 65 - __clz__b;						\
    (count) = __clz__b - __clz__c;					\
  } while (0)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif /* clz using cmpbge */

#if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
#if HAVE_ATTRIBUTE_CONST
long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const));
#else
long __MPN(count_leading_zeros) (UDItype);
#endif
#define count_leading_zeros(count, x) \
  ((count) = __MPN(count_leading_zeros) (x))
#endif /* clz using mpn */
#endif /* __alpha */
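/* Added note (illustration only): several sections in this file, including
   the alpha one above, implement udiv_qrnnd via __MPN(invert_limb) and
   udiv_qrnnd_preinv, both of which come from gmp-impl.h rather than from
   this header.  Assuming the usual GMP convention that invert_limb(d)
   returns floor ((B*B - 1) / d) - B for a normalized d, with
   B = 2^W_TYPE_SIZE, the one-off inversion turns every subsequent division
   by the same d into a multiply plus small adjustments.  The function name
   example_mod_1 below is hypothetical and exists only in this sketch.  */
#if 0
static UWtype
example_mod_1 (const UWtype *np, long n, UWtype d)
{
  UWtype di, r = 0;
  long i;
  ASSERT ((d >> (W_TYPE_SIZE - 1)) != 0);   /* d must be normalized */
  di = __MPN(invert_limb) (d);	   /* precompute the reciprocal of d once */
  for (i = n - 1; i >= 0; i--)
    {
      UWtype q;			   /* quotient limb, unused here */
      udiv_qrnnd_preinv (q, r, r, np[i], d, di);  /* keeps r < d */
    }
  return r;			   /* {np,n} mod d */
}
#endif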
#if defined (__AVR) && W_TYPE_SIZE == 8
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    unsigned short __p = (unsigned short) (m0) * (m1);			\
    (ph) = __p >> 8;							\
    (pl) = __p;								\
  } while (0)
#endif /* AVR */

#if defined (_CRAY) && W_TYPE_SIZE == 64
#include <intrinsics.h>
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
long __MPN(count_leading_zeros) (UDItype);
#define count_leading_zeros(count, x) \
  ((count) = _leadz ((UWtype) (x)))
#if defined (_CRAYIEEE)		/* I.e., Cray T90/ieee, T3D, and T3E */
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __m0 = (m0), __m1 = (m1);					\
    (ph) = _int_mult_upper (__m0, __m1);				\
    (pl) = __m0 * __m1;							\
  } while (0)
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;							\
    __di = __MPN(invert_limb) (d);					\
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
  } while (0)
#endif /* LONGLONG_STANDALONE */
#endif /* _CRAYIEEE */
#endif /* _CRAY */

#if defined (__ia64) && W_TYPE_SIZE == 64
/* This form encourages gcc (pre-release 3.4 at least) to emit predicated
   "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
   code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
   register, which takes an extra cycle.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl)	\
  do {						\
    UWtype __x;					\
    __x = (al) - (bl);				\
    if ((al) < (bl))				\
      (sh) = (ah) - (bh) - 1;			\
    else					\
      (sh) = (ah) - (bh);			\
    (sl) = __x;					\
  } while (0)
#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
/* Do both product parts in assembly, since that gives better code with
   all gcc versions.  Some callers will just use the upper part, and in
   that situation we waste an instruction, but not any cycles.
*/ 311 #define umul_ppmm(ph, pl, m0, m1) \ 312 __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0" \ 313 : "=&f" (ph), "=f" (pl) \ 314 : "f" (m0), "f" (m1)) 315 #define count_leading_zeros(count, x) \ 316 do { \ 317 UWtype _x = (x), _y, _a, _c; \ 318 __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x)); \ 319 __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y)); \ 320 _c = (_a - 1) << 3; \ 321 _x >>= _c; \ 322 if (_x >= 1 << 4) \ 323 _x >>= 4, _c += 4; \ 324 if (_x >= 1 << 2) \ 325 _x >>= 2, _c += 2; \ 326 _c += _x >> 1; \ 327 (count) = W_TYPE_SIZE - 1 - _c; \ 328 } while (0) 329 /* similar to what gcc does for __builtin_ffs, but 0 based rather than 1 330 based, and we don't need a special case for x==0 here */ 331 #define count_trailing_zeros(count, x) \ 332 do { \ 333 UWtype __ctz_x = (x); \ 334 __asm__ ("popcnt %0 = %1" \ 335 : "=r" (count) \ 336 : "r" ((__ctz_x-1) & ~__ctz_x)); \ 337 } while (0) 338 #endif 339 #if defined (__INTEL_COMPILER) 340 #include <ia64intrin.h> 341 #define umul_ppmm(ph, pl, m0, m1) \ 342 do { \ 343 UWtype __m0 = (m0), __m1 = (m1); \ 344 ph = _m64_xmahu (__m0, __m1, 0); \ 345 pl = __m0 * __m1; \ 346 } while (0) 347 #endif 348 #ifndef LONGLONG_STANDALONE 349 #define udiv_qrnnd(q, r, n1, n0, d) \ 350 do { UWtype __di; \ 351 __di = __MPN(invert_limb) (d); \ 352 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ 353 } while (0) 354 #define UDIV_PREINV_ALWAYS 1 355 #define UDIV_NEEDS_NORMALIZATION 1 356 #endif 357 #endif 358 359 360 #if defined (__GNUC__) 361 362 /* We sometimes need to clobber "cc" with gcc2, but that would not be 363 understood by gcc1. Use cpp to avoid major code duplication. */ 364 #if __GNUC__ < 2 365 #define __CLOBBER_CC 366 #define __AND_CLOBBER_CC 367 #else /* __GNUC__ >= 2 */ 368 #define __CLOBBER_CC : "cc" 369 #define __AND_CLOBBER_CC , "cc" 370 #endif /* __GNUC__ < 2 */ 371 372 #if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32 373 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 374 __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3" \ 375 : "=r" (sh), "=&r" (sl) \ 376 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl)) 377 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 378 __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3" \ 379 : "=r" (sh), "=&r" (sl) \ 380 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl)) 381 #define umul_ppmm(xh, xl, m0, m1) \ 382 do { \ 383 USItype __m0 = (m0), __m1 = (m1); \ 384 __asm__ ("multiplu %0,%1,%2" \ 385 : "=r" (xl) \ 386 : "r" (__m0), "r" (__m1)); \ 387 __asm__ ("multmu %0,%1,%2" \ 388 : "=r" (xh) \ 389 : "r" (__m0), "r" (__m1)); \ 390 } while (0) 391 #define udiv_qrnnd(q, r, n1, n0, d) \ 392 __asm__ ("dividu %0,%3,%4" \ 393 : "=r" (q), "=q" (r) \ 394 : "1" (n1), "r" (n0), "r" (d)) 395 #define count_leading_zeros(count, x) \ 396 __asm__ ("clz %0,%1" \ 397 : "=r" (count) \ 398 : "r" (x)) 399 #define COUNT_LEADING_ZEROS_0 32 400 #endif /* __a29k__ */ 401 402 #if defined (__arc__) 403 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 404 __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3" \ 405 : "=r" (sh), \ 406 "=&r" (sl) \ 407 : "r" ((USItype) (ah)), \ 408 "rICal" ((USItype) (bh)), \ 409 "%r" ((USItype) (al)), \ 410 "rICal" ((USItype) (bl))) 411 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 412 __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ 413 : "=r" (sh), \ 414 "=&r" (sl) \ 415 : "r" ((USItype) (ah)), \ 416 "rICal" ((USItype) (bh)), \ 417 "r" ((USItype) (al)), \ 418 "rICal" ((USItype) (bl))) 419 #endif 420 421 #if defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) \ 422 && W_TYPE_SIZE == 32 423 #define 
add_ssaaaa(sh, sl, ah, al, bh, bl) \ 424 do { \ 425 if (__builtin_constant_p (bl) && -(USItype)(bl) < 0x100) \ 426 __asm__ ("subs\t%1, %4, %5\n\tadc\t%0, %2, %3" \ 427 : "=r" (sh), "=&r" (sl) \ 428 : "r" (ah), "rI" (bh), \ 429 "%r" (al), "rI" (-(USItype)(bl)) __CLOBBER_CC); \ 430 else \ 431 __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3" \ 432 : "=r" (sh), "=&r" (sl) \ 433 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC); \ 434 } while (0) 435 /* FIXME: Extend the immediate range for the low word by using both ADDS and 436 SUBS, since they set carry in the same way. Note: We need separate 437 definitions for thumb and non-thumb due to the absence of RSC on thumb. */ 438 #if defined (__thumb__) 439 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 440 do { \ 441 if (__builtin_constant_p (ah) && __builtin_constant_p (bh) \ 442 && (ah) == (bh)) \ 443 __asm__ ("subs\t%1, %2, %3\n\tsbc\t%0, %0, %0" \ 444 : "=r" (sh), "=r" (sl) \ 445 : "r" (al), "rI" (bl) __CLOBBER_CC); \ 446 else if (__builtin_constant_p (al)) \ 447 __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \ 448 : "=r" (sh), "=&r" (sl) \ 449 : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \ 450 else if (__builtin_constant_p (bl)) \ 451 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ 452 : "=r" (sh), "=&r" (sl) \ 453 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ 454 else \ 455 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ 456 : "=r" (sh), "=&r" (sl) \ 457 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ 458 } while (0) 459 #else 460 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 461 do { \ 462 if (__builtin_constant_p (ah) && __builtin_constant_p (bh) \ 463 && (ah) == (bh)) \ 464 __asm__ ("subs\t%1, %2, %3\n\tsbc\t%0, %0, %0" \ 465 : "=r" (sh), "=r" (sl) \ 466 : "r" (al), "rI" (bl) __CLOBBER_CC); \ 467 else if (__builtin_constant_p (al)) \ 468 { \ 469 if (__builtin_constant_p (ah)) \ 470 __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \ 471 : "=r" (sh), "=&r" (sl) \ 472 : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \ 473 else \ 474 __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \ 475 : "=r" (sh), "=&r" (sl) \ 476 : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \ 477 } \ 478 else if (__builtin_constant_p (ah)) \ 479 { \ 480 if (__builtin_constant_p (bl)) \ 481 __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \ 482 : "=r" (sh), "=&r" (sl) \ 483 : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ 484 else \ 485 __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \ 486 : "=r" (sh), "=&r" (sl) \ 487 : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \ 488 } \ 489 else if (__builtin_constant_p (bl)) \ 490 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ 491 : "=r" (sh), "=&r" (sl) \ 492 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ 493 else /* only bh might be a constant */ \ 494 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ 495 : "=r" (sh), "=&r" (sl) \ 496 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ 497 } while (0) 498 #endif 499 #if defined (__ARM_ARCH_2__) || defined (__ARM_ARCH_2A__) \ 500 || defined (__ARM_ARCH_3__) 501 #define umul_ppmm(xh, xl, a, b) \ 502 do { \ 503 register USItype __t0, __t1, __t2; \ 504 __asm__ ("%@ Inlined umul_ppmm\n" \ 505 " mov %2, %5, lsr #16\n" \ 506 " mov %0, %6, lsr #16\n" \ 507 " bic %3, %5, %2, lsl #16\n" \ 508 " bic %4, %6, %0, lsl #16\n" \ 509 " mul %1, %3, %4\n" \ 510 " mul %4, %2, %4\n" \ 511 " mul %3, %0, %3\n" \ 512 " mul %0, %2, %0\n" \ 513 " adds %3, %4, %3\n" \ 514 " addcs %0, %0, #65536\n" \ 515 " 
adds %1, %1, %3, lsl #16\n" \ 516 " adc %0, %0, %3, lsr #16" \ 517 : "=&r" ((USItype) (xh)), "=r" ((USItype) (xl)), \ 518 "=&r" (__t0), "=&r" (__t1), "=r" (__t2) \ 519 : "r" ((USItype) (a)), "r" ((USItype) (b)) __CLOBBER_CC); \ 520 } while (0) 521 #ifndef LONGLONG_STANDALONE 522 #define udiv_qrnnd(q, r, n1, n0, d) \ 523 do { UWtype __r; \ 524 (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \ 525 (r) = __r; \ 526 } while (0) 527 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype); 528 #endif /* LONGLONG_STANDALONE */ 529 #else /* ARMv4 or newer */ 530 #define umul_ppmm(xh, xl, a, b) \ 531 __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b)) 532 #define smul_ppmm(xh, xl, a, b) \ 533 __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b)) 534 #ifndef LONGLONG_STANDALONE 535 #define udiv_qrnnd(q, r, n1, n0, d) \ 536 do { UWtype __di; \ 537 __di = __MPN(invert_limb) (d); \ 538 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ 539 } while (0) 540 #define UDIV_PREINV_ALWAYS 1 541 #define UDIV_NEEDS_NORMALIZATION 1 542 #endif /* LONGLONG_STANDALONE */ 543 #endif /* defined(__ARM_ARCH_2__) ... */ 544 #define count_leading_zeros(count, x) count_leading_zeros_gcc_clz(count, x) 545 #define count_trailing_zeros(count, x) count_trailing_zeros_gcc_ctz(count, x) 546 #endif /* __arm__ */ 547 548 #if defined (__aarch64__) && W_TYPE_SIZE == 64 549 /* FIXME: Extend the immediate range for the low word by using both 550 ADDS and SUBS, since they set carry in the same way. */ 551 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 552 do { \ 553 if (__builtin_constant_p (bl) && -(UDItype)(bl) < 0x1000) \ 554 __asm__ ("subs\t%1, %x4, %5\n\tadc\t%0, %x2, %x3" \ 555 : "=r" (sh), "=&r" (sl) \ 556 : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)), \ 557 "%r" ((UDItype)(al)), "rI" (-(UDItype)(bl)) __CLOBBER_CC);\ 558 else \ 559 __asm__ ("adds\t%1, %x4, %5\n\tadc\t%0, %x2, %x3" \ 560 : "=r" (sh), "=&r" (sl) \ 561 : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)), \ 562 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC);\ 563 } while (0) 564 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 565 do { \ 566 if (__builtin_constant_p (bl) && -(UDItype)(bl) < 0x1000) \ 567 __asm__ ("adds\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3" \ 568 : "=r,r" (sh), "=&r,&r" (sl) \ 569 : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)), \ 570 "r,Z" ((UDItype)(al)), "rI,r" (-(UDItype)(bl)) __CLOBBER_CC);\ 571 else \ 572 __asm__ ("subs\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3" \ 573 : "=r,r" (sh), "=&r,&r" (sl) \ 574 : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)), \ 575 "r,Z" ((UDItype)(al)), "rI,r" ((UDItype)(bl)) __CLOBBER_CC);\ 576 } while(0); 577 #if __GMP_GNUC_PREREQ (4,9) 578 #define umul_ppmm(w1, w0, u, v) \ 579 do { \ 580 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \ 581 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \ 582 w1 = __ll >> 64; \ 583 w0 = __ll; \ 584 } while (0) 585 #endif 586 #if !defined (umul_ppmm) 587 #define umul_ppmm(ph, pl, m0, m1) \ 588 do { \ 589 UDItype __m0 = (m0), __m1 = (m1); \ 590 __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (__m0), "r" (__m1)); \ 591 (pl) = __m0 * __m1; \ 592 } while (0) 593 #endif 594 #define count_leading_zeros(count, x) count_leading_zeros_gcc_clz(count, x) 595 #define count_trailing_zeros(count, x) count_trailing_zeros_gcc_ctz(count, x) 596 #endif /* __aarch64__ */ 597 598 #if defined (__clipper__) && W_TYPE_SIZE == 32 599 #define umul_ppmm(w1, w0, u, v) \ 600 ({union {UDItype __ll; \ 601 struct {USItype __l, __h;} __i; \ 602 } __x; \ 603 __asm__ 
("mulwux %2,%0" \ 604 : "=r" (__x.__ll) \ 605 : "%0" ((USItype)(u)), "r" ((USItype)(v))); \ 606 (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) 607 #define smul_ppmm(w1, w0, u, v) \ 608 ({union {DItype __ll; \ 609 struct {SItype __l, __h;} __i; \ 610 } __x; \ 611 __asm__ ("mulwx %2,%0" \ 612 : "=r" (__x.__ll) \ 613 : "%0" ((SItype)(u)), "r" ((SItype)(v))); \ 614 (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) 615 #define __umulsidi3(u, v) \ 616 ({UDItype __w; \ 617 __asm__ ("mulwux %2,%0" \ 618 : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v))); \ 619 __w; }) 620 #endif /* __clipper__ */ 621 622 /* Fujitsu vector computers. */ 623 #if defined (__uxp__) && W_TYPE_SIZE == 32 624 #define umul_ppmm(ph, pl, u, v) \ 625 do { \ 626 union {UDItype __ll; \ 627 struct {USItype __h, __l;} __i; \ 628 } __x; \ 629 __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\ 630 (ph) = __x.__i.__h; \ 631 (pl) = __x.__i.__l; \ 632 } while (0) 633 #define smul_ppmm(ph, pl, u, v) \ 634 do { \ 635 union {UDItype __ll; \ 636 struct {USItype __h, __l;} __i; \ 637 } __x; \ 638 __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \ 639 (ph) = __x.__i.__h; \ 640 (pl) = __x.__i.__l; \ 641 } while (0) 642 #endif 643 644 #if defined (__gmicro__) && W_TYPE_SIZE == 32 645 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 646 __asm__ ("add.w %5,%1\n\taddx %3,%0" \ 647 : "=g" (sh), "=&g" (sl) \ 648 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 649 "%1" ((USItype)(al)), "g" ((USItype)(bl))) 650 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 651 __asm__ ("sub.w %5,%1\n\tsubx %3,%0" \ 652 : "=g" (sh), "=&g" (sl) \ 653 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 654 "1" ((USItype)(al)), "g" ((USItype)(bl))) 655 #define umul_ppmm(ph, pl, m0, m1) \ 656 __asm__ ("mulx %3,%0,%1" \ 657 : "=g" (ph), "=r" (pl) \ 658 : "%0" ((USItype)(m0)), "g" ((USItype)(m1))) 659 #define udiv_qrnnd(q, r, nh, nl, d) \ 660 __asm__ ("divx %4,%0,%1" \ 661 : "=g" (q), "=r" (r) \ 662 : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d))) 663 #define count_leading_zeros(count, x) \ 664 __asm__ ("bsch/1 %1,%0" \ 665 : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0)) 666 #endif 667 668 #if defined (__hppa) && W_TYPE_SIZE == 32 669 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 670 __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0" \ 671 : "=r" (sh), "=&r" (sl) \ 672 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl)) 673 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 674 __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0" \ 675 : "=r" (sh), "=&r" (sl) \ 676 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl)) 677 #if defined (_PA_RISC1_1) 678 #define umul_ppmm(wh, wl, u, v) \ 679 do { \ 680 union {UDItype __ll; \ 681 struct {USItype __h, __l;} __i; \ 682 } __x; \ 683 __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \ 684 (wh) = __x.__i.__h; \ 685 (wl) = __x.__i.__l; \ 686 } while (0) 687 #endif 688 #define count_leading_zeros(count, x) \ 689 do { \ 690 USItype __tmp; \ 691 __asm__ ( \ 692 "ldi 1,%0\n" \ 693 " extru,= %1,15,16,%%r0 ; Bits 31..16 zero?\n" \ 694 " extru,tr %1,15,16,%1 ; No. Shift down, skip add.\n" \ 695 " ldo 16(%0),%0 ; Yes. Perform add.\n" \ 696 " extru,= %1,23,8,%%r0 ; Bits 15..8 zero?\n" \ 697 " extru,tr %1,23,8,%1 ; No. Shift down, skip add.\n" \ 698 " ldo 8(%0),%0 ; Yes. Perform add.\n" \ 699 " extru,= %1,27,4,%%r0 ; Bits 7..4 zero?\n" \ 700 " extru,tr %1,27,4,%1 ; No. Shift down, skip add.\n" \ 701 " ldo 4(%0),%0 ; Yes. 
Perform add.\n" \ 702 " extru,= %1,29,2,%%r0 ; Bits 3..2 zero?\n" \ 703 " extru,tr %1,29,2,%1 ; No. Shift down, skip add.\n" \ 704 " ldo 2(%0),%0 ; Yes. Perform add.\n" \ 705 " extru %1,30,1,%1 ; Extract bit 1.\n" \ 706 " sub %0,%1,%0 ; Subtract it.\n" \ 707 : "=r" (count), "=r" (__tmp) : "1" (x)); \ 708 } while (0) 709 #endif /* hppa */ 710 711 /* These macros are for ABI=2.0w. In ABI=2.0n they can't be used, since GCC 712 (3.2) puts longlong into two adjacent 32-bit registers. Presumably this 713 is just a case of no direct support for 2.0n but treating it like 1.0. */ 714 #if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB) 715 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 716 __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0" \ 717 : "=r" (sh), "=&r" (sl) \ 718 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl)) 719 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 720 __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0" \ 721 : "=r" (sh), "=&r" (sl) \ 722 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl)) 723 #endif /* hppa */ 724 725 #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32 726 #if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch) 727 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 728 do { \ 729 /* if (__builtin_constant_p (bl)) \ 730 __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3" \ 731 : "=r" (sh), "=&r" (sl) \ 732 : "0" (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\ 733 else \ 734 */ __asm__ ("alr\t%1,%5\n\talcr\t%0,%3" \ 735 : "=r" (sh), "=&r" (sl) \ 736 : "0" (ah), "r" (bh), "%1" (al), "r" (bl)__CLOBBER_CC); \ 737 } while (0) 738 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 739 do { \ 740 /* if (__builtin_constant_p (bl)) \ 741 __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3" \ 742 : "=r" (sh), "=&r" (sl) \ 743 : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC); \ 744 else \ 745 */ __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3" \ 746 : "=r" (sh), "=&r" (sl) \ 747 : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC); \ 748 } while (0) 749 #if __GMP_GNUC_PREREQ (4,5) 750 #define umul_ppmm(xh, xl, m0, m1) \ 751 do { \ 752 union {UDItype __ll; \ 753 struct {USItype __h, __l;} __i; \ 754 } __x; \ 755 __x.__ll = (UDItype) (m0) * (UDItype) (m1); \ 756 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ 757 } while (0) 758 #else 759 #if 0 760 /* FIXME: this fails if gcc knows about the 64-bit registers. Use only 761 with a new enough processor pretending we have 32-bit registers. */ 762 #define umul_ppmm(xh, xl, m0, m1) \ 763 do { \ 764 union {UDItype __ll; \ 765 struct {USItype __h, __l;} __i; \ 766 } __x; \ 767 __asm__ ("mlr\t%0,%2" \ 768 : "=r" (__x.__ll) \ 769 : "%0" (m0), "r" (m1)); \ 770 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ 771 } while (0) 772 #else 773 #define umul_ppmm(xh, xl, m0, m1) \ 774 do { \ 775 /* When we have 64-bit regs and gcc is aware of that, we cannot simply use 776 DImode for the product, since that would be allocated to a single 64-bit 777 register, whereas mlr uses the low 32-bits of an even-odd register pair. 778 */ \ 779 register USItype __r0 __asm__ ("0"); \ 780 register USItype __r1 __asm__ ("1") = (m0); \ 781 __asm__ ("mlr\t%0,%3" \ 782 : "=r" (__r0), "=r" (__r1) \ 783 : "r" (__r1), "r" (m1)); \ 784 (xh) = __r0; (xl) = __r1; \ 785 } while (0) 786 #endif /* if 0 */ 787 #endif 788 #if 0 789 /* FIXME: this fails if gcc knows about the 64-bit registers. Use only 790 with a new enough processor pretending we have 32-bit registers. 
*/ 791 #define udiv_qrnnd(q, r, n1, n0, d) \ 792 do { \ 793 union {UDItype __ll; \ 794 struct {USItype __h, __l;} __i; \ 795 } __x; \ 796 __x.__i.__h = n1; __x.__i.__l = n0; \ 797 __asm__ ("dlr\t%0,%2" \ 798 : "=r" (__x.__ll) \ 799 : "0" (__x.__ll), "r" (d)); \ 800 (q) = __x.__i.__l; (r) = __x.__i.__h; \ 801 } while (0) 802 #else 803 #define udiv_qrnnd(q, r, n1, n0, d) \ 804 do { \ 805 register USItype __r0 __asm__ ("0") = (n1); \ 806 register USItype __r1 __asm__ ("1") = (n0); \ 807 __asm__ ("dlr\t%0,%4" \ 808 : "=r" (__r0), "=r" (__r1) \ 809 : "r" (__r0), "r" (__r1), "r" (d)); \ 810 (q) = __r1; (r) = __r0; \ 811 } while (0) 812 #endif /* if 0 */ 813 #else /* if __zarch__ */ 814 /* FIXME: this fails if gcc knows about the 64-bit registers. */ 815 #define smul_ppmm(xh, xl, m0, m1) \ 816 do { \ 817 union {DItype __ll; \ 818 struct {USItype __h, __l;} __i; \ 819 } __x; \ 820 __asm__ ("mr\t%0,%2" \ 821 : "=r" (__x.__ll) \ 822 : "%0" (m0), "r" (m1)); \ 823 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ 824 } while (0) 825 /* FIXME: this fails if gcc knows about the 64-bit registers. */ 826 #define sdiv_qrnnd(q, r, n1, n0, d) \ 827 do { \ 828 union {DItype __ll; \ 829 struct {USItype __h, __l;} __i; \ 830 } __x; \ 831 __x.__i.__h = n1; __x.__i.__l = n0; \ 832 __asm__ ("dr\t%0,%2" \ 833 : "=r" (__x.__ll) \ 834 : "0" (__x.__ll), "r" (d)); \ 835 (q) = __x.__i.__l; (r) = __x.__i.__h; \ 836 } while (0) 837 #endif /* if __zarch__ */ 838 #endif 839 840 #if defined (__s390x__) && W_TYPE_SIZE == 64 841 /* We need to cast operands with register constraints, otherwise their types 842 will be assumed to be SImode by gcc. For these machines, such operations 843 will insert a value into the low 32 bits, and leave the high 32 bits with 844 garbage. */ 845 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 846 do { \ 847 __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3" \ 848 : "=r" (sh), "=&r" (sl) \ 849 : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \ 850 "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \ 851 } while (0) 852 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 853 do { \ 854 __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3" \ 855 : "=r" (sh), "=&r" (sl) \ 856 : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \ 857 "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \ 858 } while (0) 859 #define umul_ppmm(xh, xl, m0, m1) \ 860 do { \ 861 union {unsigned int __attribute__ ((mode(TI))) __ll; \ 862 struct {UDItype __h, __l;} __i; \ 863 } __x; \ 864 __asm__ ("mlgr\t%0,%2" \ 865 : "=r" (__x.__ll) \ 866 : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1))); \ 867 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ 868 } while (0) 869 #define udiv_qrnnd(q, r, n1, n0, d) \ 870 do { \ 871 union {unsigned int __attribute__ ((mode(TI))) __ll; \ 872 struct {UDItype __h, __l;} __i; \ 873 } __x; \ 874 __x.__i.__h = n1; __x.__i.__l = n0; \ 875 __asm__ ("dlgr\t%0,%2" \ 876 : "=r" (__x.__ll) \ 877 : "0" (__x.__ll), "r" ((UDItype)(d))); \ 878 (q) = __x.__i.__l; (r) = __x.__i.__h; \ 879 } while (0) 880 #if 0 /* FIXME: Enable for z10 (?) */ 881 #define count_leading_zeros(cnt, x) \ 882 do { \ 883 union {unsigned int __attribute__ ((mode(TI))) __ll; \ 884 struct {UDItype __h, __l;} __i; \ 885 } __clr_cnt; \ 886 __asm__ ("flogr\t%0,%1" \ 887 : "=r" (__clr_cnt.__ll) \ 888 : "r" (x) __CLOBBER_CC); \ 889 (cnt) = __clr_cnt.__i.__h; \ 890 } while (0) 891 #endif 892 #endif 893 894 /* On x86 and x86_64, every asm implicitly clobbers "flags" and "fpsr", 895 so we don't need __CLOBBER_CC. 
*/ 896 #if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32 897 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 898 __asm__ ("addl %5,%k1\n\tadcl %3,%k0" \ 899 : "=r" (sh), "=&r" (sl) \ 900 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 901 "%1" ((USItype)(al)), "g" ((USItype)(bl))) 902 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 903 __asm__ ("subl %5,%k1\n\tsbbl %3,%k0" \ 904 : "=r" (sh), "=&r" (sl) \ 905 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 906 "1" ((USItype)(al)), "g" ((USItype)(bl))) 907 #define umul_ppmm(w1, w0, u, v) \ 908 __asm__ ("mull %3" \ 909 : "=a" (w0), "=d" (w1) \ 910 : "%0" ((USItype)(u)), "rm" ((USItype)(v))) 911 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\ 912 __asm__ ("divl %4" /* stringification in K&R C */ \ 913 : "=a" (q), "=d" (r) \ 914 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx))) 915 916 #if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx 917 /* Pentium bsrl takes between 10 and 72 cycles depending where the most 918 significant 1 bit is, hence the use of the following alternatives. bsfl 919 is slow too, between 18 and 42 depending where the least significant 1 920 bit is, so let the generic count_trailing_zeros below make use of the 921 count_leading_zeros here too. */ 922 923 #if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE) 924 /* The following should be a fixed 14 or 15 cycles, but possibly plus an L1 925 cache miss reading from __clz_tab. For P55 it's favoured over the float 926 below so as to avoid mixing MMX and x87, since the penalty for switching 927 between the two is about 100 cycles. 928 929 The asm block sets __shift to -3 if the high 24 bits are clear, -2 for 930 16, -1 for 8, or 0 otherwise. This could be written equivalently as 931 follows, but as of gcc 2.95.2 it results in conditional jumps. 932 933 __shift = -(__n < 0x1000000); 934 __shift -= (__n < 0x10000); 935 __shift -= (__n < 0x100); 936 937 The middle two sbbl and cmpl's pair, and with luck something gcc 938 generates might pair with the first cmpl and the last sbbl. The "32+1" 939 constant could be folded into __clz_tab[], but it doesn't seem worth 940 making a different table just for that. */ 941 942 #define count_leading_zeros(c,n) \ 943 do { \ 944 USItype __n = (n); \ 945 USItype __shift; \ 946 __asm__ ("cmpl $0x1000000, %1\n" \ 947 "sbbl %0, %0\n" \ 948 "cmpl $0x10000, %1\n" \ 949 "sbbl $0, %0\n" \ 950 "cmpl $0x100, %1\n" \ 951 "sbbl $0, %0\n" \ 952 : "=&r" (__shift) : "r" (__n)); \ 953 __shift = __shift*8 + 24 + 1; \ 954 (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift]; \ 955 } while (0) 956 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB 957 #define COUNT_LEADING_ZEROS_0 31 /* n==0 indistinguishable from n==1 */ 958 959 #else /* ! pentiummmx || LONGLONG_STANDALONE */ 960 /* The following should be a fixed 14 cycles or so. Some scheduling 961 opportunities should be available between the float load/store too. This 962 sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is 963 apparently suggested by the Intel optimizing manual (don't know exactly 964 where). gcc 2.95 or up will be best for this, so the "double" is 965 correctly aligned on the stack. */ 966 #define count_leading_zeros(c,n) \ 967 do { \ 968 union { \ 969 double d; \ 970 unsigned a[2]; \ 971 } __u; \ 972 __u.d = (UWtype) (n); \ 973 (c) = 0x3FF + 31 - (__u.a[1] >> 20); \ 974 } while (0) 975 #define COUNT_LEADING_ZEROS_0 (0x3FF + 31) 976 #endif /* pentiummx */ 977 978 #else /* ! 
pentium */ 979 980 #if __GMP_GNUC_PREREQ (3,4) /* using bsrl */ 981 #define count_leading_zeros(count,x) count_leading_zeros_gcc_clz(count,x) 982 #endif /* gcc clz */ 983 984 /* On P6, gcc prior to 3.0 generates a partial register stall for 985 __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former 986 being 1 code byte smaller. "31-__cbtmp" is a workaround, probably at the 987 cost of one extra instruction. Do this for "i386" too, since that means 988 generic x86. */ 989 #if ! defined (count_leading_zeros) && __GNUC__ < 3 \ 990 && (HAVE_HOST_CPU_i386 \ 991 || HAVE_HOST_CPU_i686 \ 992 || HAVE_HOST_CPU_pentiumpro \ 993 || HAVE_HOST_CPU_pentium2 \ 994 || HAVE_HOST_CPU_pentium3) 995 #define count_leading_zeros(count, x) \ 996 do { \ 997 USItype __cbtmp; \ 998 ASSERT ((x) != 0); \ 999 __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \ 1000 (count) = 31 - __cbtmp; \ 1001 } while (0) 1002 #endif /* gcc<3 asm bsrl */ 1003 1004 #ifndef count_leading_zeros 1005 #define count_leading_zeros(count, x) \ 1006 do { \ 1007 USItype __cbtmp; \ 1008 ASSERT ((x) != 0); \ 1009 __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \ 1010 (count) = __cbtmp ^ 31; \ 1011 } while (0) 1012 #endif /* asm bsrl */ 1013 1014 #if __GMP_GNUC_PREREQ (3,4) /* using bsfl */ 1015 #define count_trailing_zeros(count,x) count_trailing_zeros_gcc_ctz(count,x) 1016 #endif /* gcc ctz */ 1017 1018 #ifndef count_trailing_zeros 1019 #define count_trailing_zeros(count, x) \ 1020 do { \ 1021 ASSERT ((x) != 0); \ 1022 __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x))); \ 1023 } while (0) 1024 #endif /* asm bsfl */ 1025 1026 #endif /* ! pentium */ 1027 1028 #endif /* 80x86 */ 1029 1030 #if defined (__amd64__) && W_TYPE_SIZE == 64 1031 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1032 __asm__ ("addq %5,%q1\n\tadcq %3,%q0" \ 1033 : "=r" (sh), "=&r" (sl) \ 1034 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \ 1035 "%1" ((UDItype)(al)), "rme" ((UDItype)(bl))) 1036 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1037 __asm__ ("subq %5,%q1\n\tsbbq %3,%q0" \ 1038 : "=r" (sh), "=&r" (sl) \ 1039 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \ 1040 "1" ((UDItype)(al)), "rme" ((UDItype)(bl))) 1041 #if X86_ASM_MULX \ 1042 && (HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell \ 1043 || HAVE_HOST_CPU_skylake || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen) 1044 #define umul_ppmm(w1, w0, u, v) \ 1045 __asm__ ("mulx\t%3, %0, %1" \ 1046 : "=r" (w0), "=r" (w1) \ 1047 : "%d" ((UDItype)(u)), "rm" ((UDItype)(v))) 1048 #else 1049 #define umul_ppmm(w1, w0, u, v) \ 1050 __asm__ ("mulq\t%3" \ 1051 : "=a" (w0), "=d" (w1) \ 1052 : "%0" ((UDItype)(u)), "rm" ((UDItype)(v))) 1053 #endif 1054 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\ 1055 __asm__ ("divq %4" /* stringification in K&R C */ \ 1056 : "=a" (q), "=d" (r) \ 1057 : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx))) 1058 1059 #if HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell || HAVE_HOST_CPU_skylake \ 1060 || HAVE_HOST_CPU_k10 || HAVE_HOST_CPU_bd1 || HAVE_HOST_CPU_bd2 \ 1061 || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen \ 1062 || HAVE_HOST_CPU_bobcat || HAVE_HOST_CPU_jaguar 1063 #define count_leading_zeros(count, x) \ 1064 do { \ 1065 /* This is lzcnt, spelled for older assemblers. Destination and */ \ 1066 /* source must be a 64-bit registers, hence cast and %q. 
*/ \ 1067 __asm__ ("rep;bsr\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \ 1068 } while (0) 1069 #define COUNT_LEADING_ZEROS_0 64 1070 #else 1071 #define count_leading_zeros(count, x) \ 1072 do { \ 1073 UDItype __cbtmp; \ 1074 ASSERT ((x) != 0); \ 1075 __asm__ ("bsr\t%1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \ 1076 (count) = __cbtmp ^ 63; \ 1077 } while (0) 1078 #endif 1079 1080 #if HAVE_HOST_CPU_bd2 || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 \ 1081 || HAVE_HOST_CPU_zen || HAVE_HOST_CPU_jaguar 1082 #define count_trailing_zeros(count, x) \ 1083 do { \ 1084 /* This is tzcnt, spelled for older assemblers. Destination and */ \ 1085 /* source must be a 64-bit registers, hence cast and %q. */ \ 1086 __asm__ ("rep;bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \ 1087 } while (0) 1088 #define COUNT_TRAILING_ZEROS_0 64 1089 #else 1090 #define count_trailing_zeros(count, x) \ 1091 do { \ 1092 ASSERT ((x) != 0); \ 1093 __asm__ ("bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \ 1094 } while (0) 1095 #endif 1096 #endif /* __amd64__ */ 1097 1098 #if defined (__i860__) && W_TYPE_SIZE == 32 1099 #define rshift_rhlc(r,h,l,c) \ 1100 __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0" \ 1101 "=r" (r) : "r" (h), "r" (l), "rn" (c)) 1102 #endif /* i860 */ 1103 1104 #if defined (__i960__) && W_TYPE_SIZE == 32 1105 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1106 __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0" \ 1107 : "=r" (sh), "=&r" (sl) \ 1108 : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl)) 1109 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1110 __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0" \ 1111 : "=r" (sh), "=&r" (sl) \ 1112 : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl)) 1113 #define umul_ppmm(w1, w0, u, v) \ 1114 ({union {UDItype __ll; \ 1115 struct {USItype __l, __h;} __i; \ 1116 } __x; \ 1117 __asm__ ("emul %2,%1,%0" \ 1118 : "=d" (__x.__ll) : "%dI" (u), "dI" (v)); \ 1119 (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) 1120 #define __umulsidi3(u, v) \ 1121 ({UDItype __w; \ 1122 __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v)); \ 1123 __w; }) 1124 #define udiv_qrnnd(q, r, nh, nl, d) \ 1125 do { \ 1126 union {UDItype __ll; \ 1127 struct {USItype __l, __h;} __i; \ 1128 } __nn; \ 1129 __nn.__i.__h = (nh); __nn.__i.__l = (nl); \ 1130 __asm__ ("ediv %d,%n,%0" \ 1131 : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d)); \ 1132 (r) = __rq.__i.__l; (q) = __rq.__i.__h; \ 1133 } while (0) 1134 #define count_leading_zeros(count, x) \ 1135 do { \ 1136 USItype __cbtmp; \ 1137 __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x)); \ 1138 (count) = __cbtmp ^ 31; \ 1139 } while (0) 1140 #define COUNT_LEADING_ZEROS_0 (-32) /* sic */ 1141 #if defined (__i960mx) /* what is the proper symbol to test??? 
*/ 1142 #define rshift_rhlc(r,h,l,c) \ 1143 do { \ 1144 union {UDItype __ll; \ 1145 struct {USItype __l, __h;} __i; \ 1146 } __nn; \ 1147 __nn.__i.__h = (h); __nn.__i.__l = (l); \ 1148 __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c)); \ 1149 } 1150 #endif /* i960mx */ 1151 #endif /* i960 */ 1152 1153 #if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \ 1154 || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \ 1155 || defined (__mc5307__)) && W_TYPE_SIZE == 32 1156 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1157 __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0" \ 1158 : "=d" (sh), "=&d" (sl) \ 1159 : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \ 1160 "%1" ((USItype)(al)), "g" ((USItype)(bl))) 1161 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1162 __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0" \ 1163 : "=d" (sh), "=&d" (sl) \ 1164 : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \ 1165 "1" ((USItype)(al)), "g" ((USItype)(bl))) 1166 /* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r. */ 1167 #if defined (__mc68020__) || defined(mc68020) \ 1168 || defined (__mc68030__) || defined (mc68030) \ 1169 || defined (__mc68040__) || defined (mc68040) \ 1170 || defined (__mcpu32__) || defined (mcpu32) \ 1171 || defined (__NeXT__) 1172 #define umul_ppmm(w1, w0, u, v) \ 1173 __asm__ ("mulu%.l %3,%1:%0" \ 1174 : "=d" (w0), "=d" (w1) \ 1175 : "%0" ((USItype)(u)), "dmi" ((USItype)(v))) 1176 #define udiv_qrnnd(q, r, n1, n0, d) \ 1177 __asm__ ("divu%.l %4,%1:%0" \ 1178 : "=d" (q), "=d" (r) \ 1179 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d))) 1180 #define sdiv_qrnnd(q, r, n1, n0, d) \ 1181 __asm__ ("divs%.l %4,%1:%0" \ 1182 : "=d" (q), "=d" (r) \ 1183 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d))) 1184 #else /* for other 68k family members use 16x16->32 multiplication */ 1185 #define umul_ppmm(xh, xl, a, b) \ 1186 do { USItype __umul_tmp1, __umul_tmp2; \ 1187 __asm__ ("| Inlined umul_ppmm\n" \ 1188 " move%.l %5,%3\n" \ 1189 " move%.l %2,%0\n" \ 1190 " move%.w %3,%1\n" \ 1191 " swap %3\n" \ 1192 " swap %0\n" \ 1193 " mulu%.w %2,%1\n" \ 1194 " mulu%.w %3,%0\n" \ 1195 " mulu%.w %2,%3\n" \ 1196 " swap %2\n" \ 1197 " mulu%.w %5,%2\n" \ 1198 " add%.l %3,%2\n" \ 1199 " jcc 1f\n" \ 1200 " add%.l %#0x10000,%0\n" \ 1201 "1: move%.l %2,%3\n" \ 1202 " clr%.w %2\n" \ 1203 " swap %2\n" \ 1204 " swap %3\n" \ 1205 " clr%.w %3\n" \ 1206 " add%.l %3,%1\n" \ 1207 " addx%.l %2,%0\n" \ 1208 " | End inlined umul_ppmm" \ 1209 : "=&d" (xh), "=&d" (xl), \ 1210 "=d" (__umul_tmp1), "=&d" (__umul_tmp2) \ 1211 : "%2" ((USItype)(a)), "d" ((USItype)(b))); \ 1212 } while (0) 1213 #endif /* not mc68020 */ 1214 /* The '020, '030, '040 and '060 have bitfield insns. 1215 GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to 1216 exclude bfffo on that chip (bitfield insns not available). */ 1217 #if (defined (__mc68020__) || defined (mc68020) \ 1218 || defined (__mc68030__) || defined (mc68030) \ 1219 || defined (__mc68040__) || defined (mc68040) \ 1220 || defined (__mc68060__) || defined (mc68060) \ 1221 || defined (__NeXT__)) \ 1222 && ! 
defined (__mcpu32__) 1223 #define count_leading_zeros(count, x) \ 1224 __asm__ ("bfffo %1{%b2:%b2},%0" \ 1225 : "=d" (count) \ 1226 : "od" ((USItype) (x)), "n" (0)) 1227 #define COUNT_LEADING_ZEROS_0 32 1228 #endif 1229 #endif /* mc68000 */ 1230 1231 #if defined (__m88000__) && W_TYPE_SIZE == 32 1232 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1233 __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3" \ 1234 : "=r" (sh), "=&r" (sl) \ 1235 : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl)) 1236 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1237 __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3" \ 1238 : "=r" (sh), "=&r" (sl) \ 1239 : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl)) 1240 #define count_leading_zeros(count, x) \ 1241 do { \ 1242 USItype __cbtmp; \ 1243 __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x)); \ 1244 (count) = __cbtmp ^ 31; \ 1245 } while (0) 1246 #define COUNT_LEADING_ZEROS_0 63 /* sic */ 1247 #if defined (__m88110__) 1248 #define umul_ppmm(wh, wl, u, v) \ 1249 do { \ 1250 union {UDItype __ll; \ 1251 struct {USItype __h, __l;} __i; \ 1252 } __x; \ 1253 __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v)); \ 1254 (wh) = __x.__i.__h; \ 1255 (wl) = __x.__i.__l; \ 1256 } while (0) 1257 #define udiv_qrnnd(q, r, n1, n0, d) \ 1258 ({union {UDItype __ll; \ 1259 struct {USItype __h, __l;} __i; \ 1260 } __x, __q; \ 1261 __x.__i.__h = (n1); __x.__i.__l = (n0); \ 1262 __asm__ ("divu.d %0,%1,%2" \ 1263 : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d)); \ 1264 (r) = (n0) - __q.__l * (d); (q) = __q.__l; }) 1265 #endif /* __m88110__ */ 1266 #endif /* __m88000__ */ 1267 1268 #if defined (__mips) && W_TYPE_SIZE == 32 1269 #if __GMP_GNUC_PREREQ (4,4) 1270 #define umul_ppmm(w1, w0, u, v) \ 1271 do { \ 1272 UDItype __ll = (UDItype)(u) * (v); \ 1273 w1 = __ll >> 32; \ 1274 w0 = __ll; \ 1275 } while (0) 1276 #endif 1277 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__) 1278 #define umul_ppmm(w1, w0, u, v) \ 1279 __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v)) 1280 #endif 1281 #if !defined (umul_ppmm) 1282 #define umul_ppmm(w1, w0, u, v) \ 1283 __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1" \ 1284 : "=d" (w0), "=d" (w1) : "d" (u), "d" (v)) 1285 #endif 1286 #endif /* __mips */ 1287 1288 #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64 1289 #if defined (_MIPS_ARCH_MIPS64R6) 1290 #define umul_ppmm(w1, w0, u, v) \ 1291 do { \ 1292 UDItype __m0 = (u), __m1 = (v); \ 1293 (w0) = __m0 * __m1; \ 1294 __asm__ ("dmuhu\t%0, %1, %2" : "=d" (w1) : "d" (__m0), "d" (__m1)); \ 1295 } while (0) 1296 #endif 1297 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (4,4) 1298 #define umul_ppmm(w1, w0, u, v) \ 1299 do { \ 1300 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \ 1301 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \ 1302 w1 = __ll >> 64; \ 1303 w0 = __ll; \ 1304 } while (0) 1305 #endif 1306 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__) 1307 #define umul_ppmm(w1, w0, u, v) \ 1308 __asm__ ("dmultu %2,%3" \ 1309 : "=l" (w0), "=h" (w1) \ 1310 : "d" ((UDItype)(u)), "d" ((UDItype)(v))) 1311 #endif 1312 #if !defined (umul_ppmm) 1313 #define umul_ppmm(w1, w0, u, v) \ 1314 __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1" \ 1315 : "=d" (w0), "=d" (w1) \ 1316 : "d" ((UDItype)(u)), "d" ((UDItype)(v))) 1317 #endif 1318 #endif /* __mips */ 1319 1320 #if defined (__mmix__) && W_TYPE_SIZE == 64 1321 #define umul_ppmm(w1, w0, u, v) \ 1322 __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v)) 1323 #endif 1324 1325 #if 
defined (__ns32000__) && W_TYPE_SIZE == 32 1326 #define umul_ppmm(w1, w0, u, v) \ 1327 ({union {UDItype __ll; \ 1328 struct {USItype __l, __h;} __i; \ 1329 } __x; \ 1330 __asm__ ("meid %2,%0" \ 1331 : "=g" (__x.__ll) \ 1332 : "%0" ((USItype)(u)), "g" ((USItype)(v))); \ 1333 (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) 1334 #define __umulsidi3(u, v) \ 1335 ({UDItype __w; \ 1336 __asm__ ("meid %2,%0" \ 1337 : "=g" (__w) \ 1338 : "%0" ((USItype)(u)), "g" ((USItype)(v))); \ 1339 __w; }) 1340 #define udiv_qrnnd(q, r, n1, n0, d) \ 1341 ({union {UDItype __ll; \ 1342 struct {USItype __l, __h;} __i; \ 1343 } __x; \ 1344 __x.__i.__h = (n1); __x.__i.__l = (n0); \ 1345 __asm__ ("deid %2,%0" \ 1346 : "=g" (__x.__ll) \ 1347 : "0" (__x.__ll), "g" ((USItype)(d))); \ 1348 (r) = __x.__i.__l; (q) = __x.__i.__h; }) 1349 #define count_trailing_zeros(count,x) \ 1350 do { \ 1351 __asm__ ("ffsd %2,%0" \ 1352 : "=r" (count) \ 1353 : "0" ((USItype) 0), "r" ((USItype) (x))); \ 1354 } while (0) 1355 #endif /* __ns32000__ */ 1356 1357 /* In the past we had a block of various #defines tested 1358 _ARCH_PPC - AIX 1359 _ARCH_PWR - AIX 1360 __powerpc__ - gcc 1361 __POWERPC__ - BEOS 1362 __ppc__ - Darwin 1363 PPC - old gcc, GNU/Linux, SysV 1364 The plain PPC test was not good for vxWorks, since PPC is defined on all 1365 CPUs there (eg. m68k too), as a constant one is expected to compare 1366 CPU_FAMILY against. 1367 1368 At any rate, this was pretty unattractive and a bit fragile. The use of 1369 HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of 1370 getting the desired effect. 1371 1372 ENHANCE-ME: We should test _IBMR2 here when we add assembly support for 1373 the system vendor compilers. (Is that vendor compilers with inline asm, 1374 or what?) */ 1375 1376 #if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc) \ 1377 && W_TYPE_SIZE == 32 1378 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1379 do { \ 1380 if (__builtin_constant_p (bh) && (bh) == 0) \ 1381 __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \ 1382 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \ 1383 __CLOBBER_CC); \ 1384 else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \ 1385 __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \ 1386 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \ 1387 __CLOBBER_CC); \ 1388 else \ 1389 __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \ 1390 : "=r" (sh), "=&r" (sl) \ 1391 : "r" (ah), "r" (bh), "%r" (al), "rI" (bl) \ 1392 __CLOBBER_CC); \ 1393 } while (0) 1394 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1395 do { \ 1396 if (__builtin_constant_p (ah) && (ah) == 0) \ 1397 __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \ 1398 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl) \ 1399 __CLOBBER_CC); \ 1400 else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0) \ 1401 __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \ 1402 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl) \ 1403 __CLOBBER_CC); \ 1404 else if (__builtin_constant_p (bh) && (bh) == 0) \ 1405 __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \ 1406 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl) \ 1407 __CLOBBER_CC); \ 1408 else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \ 1409 __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \ 1410 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl) \ 1411 __CLOBBER_CC); \ 1412 else \ 1413 __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \ 1414 : "=r" (sh), "=&r" (sl) \ 1415 : "r" (ah), "r" (bh), "rI" (al), "r" (bl) \ 1416 __CLOBBER_CC); \ 1417 } while (0) 1418 #define 
count_leading_zeros(count, x) \ 1419 __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x)) 1420 #define COUNT_LEADING_ZEROS_0 32 1421 #if HAVE_HOST_CPU_FAMILY_powerpc 1422 #if __GMP_GNUC_PREREQ (4,4) 1423 #define umul_ppmm(w1, w0, u, v) \ 1424 do { \ 1425 UDItype __ll = (UDItype)(u) * (v); \ 1426 w1 = __ll >> 32; \ 1427 w0 = __ll; \ 1428 } while (0) 1429 #endif 1430 #if !defined (umul_ppmm) 1431 #define umul_ppmm(ph, pl, m0, m1) \ 1432 do { \ 1433 USItype __m0 = (m0), __m1 = (m1); \ 1434 __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \ 1435 (pl) = __m0 * __m1; \ 1436 } while (0) 1437 #endif 1438 #define smul_ppmm(ph, pl, m0, m1) \ 1439 do { \ 1440 SItype __m0 = (m0), __m1 = (m1); \ 1441 __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \ 1442 (pl) = __m0 * __m1; \ 1443 } while (0) 1444 #else 1445 #define smul_ppmm(xh, xl, m0, m1) \ 1446 __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1)) 1447 #define sdiv_qrnnd(q, r, nh, nl, d) \ 1448 __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d)) 1449 #endif 1450 #endif /* 32-bit POWER architecture variants. */ 1451 1452 /* We should test _IBMR2 here when we add assembly support for the system 1453 vendor compilers. */ 1454 #if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64 1455 #if !defined (_LONG_LONG_LIMB) 1456 /* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values. So 1457 use adde etc only when not _LONG_LONG_LIMB. */ 1458 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1459 do { \ 1460 if (__builtin_constant_p (bh) && (bh) == 0) \ 1461 __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \ 1462 : "=r" (sh), "=&r" (sl) \ 1463 : "r" ((UDItype)(ah)), \ 1464 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) \ 1465 __CLOBBER_CC); \ 1466 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \ 1467 __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \ 1468 : "=r" (sh), "=&r" (sl) \ 1469 : "r" ((UDItype)(ah)), \ 1470 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) \ 1471 __CLOBBER_CC); \ 1472 else \ 1473 __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \ 1474 : "=r" (sh), "=&r" (sl) \ 1475 : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \ 1476 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) \ 1477 __CLOBBER_CC); \ 1478 } while (0) 1479 /* We use "*rI" for the constant operand here, since with just "I", gcc barfs. 1480 This might seem strange, but gcc folds away the dead code late. 
*/ 1481 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1482 do { \ 1483 if (__builtin_constant_p (bl) && bl > -0x8000 && bl <= 0x8000) { \ 1484 if (__builtin_constant_p (ah) && (ah) == 0) \ 1485 __asm__ ("addic %1,%3,%4\n\tsubfze %0,%2" \ 1486 : "=r" (sh), "=&r" (sl) \ 1487 : "r" ((UDItype)(bh)), \ 1488 "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \ 1489 __CLOBBER_CC); \ 1490 else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \ 1491 __asm__ ("addic %1,%3,%4\n\tsubfme %0,%2" \ 1492 : "=r" (sh), "=&r" (sl) \ 1493 : "r" ((UDItype)(bh)), \ 1494 "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \ 1495 __CLOBBER_CC); \ 1496 else if (__builtin_constant_p (bh) && (bh) == 0) \ 1497 __asm__ ("addic %1,%3,%4\n\taddme %0,%2" \ 1498 : "=r" (sh), "=&r" (sl) \ 1499 : "r" ((UDItype)(ah)), \ 1500 "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \ 1501 __CLOBBER_CC); \ 1502 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \ 1503 __asm__ ("addic %1,%3,%4\n\taddze %0,%2" \ 1504 : "=r" (sh), "=&r" (sl) \ 1505 : "r" ((UDItype)(ah)), \ 1506 "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \ 1507 __CLOBBER_CC); \ 1508 else \ 1509 __asm__ ("addic %1,%4,%5\n\tsubfe %0,%3,%2" \ 1510 : "=r" (sh), "=&r" (sl) \ 1511 : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \ 1512 "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \ 1513 __CLOBBER_CC); \ 1514 } else { \ 1515 if (__builtin_constant_p (ah) && (ah) == 0) \ 1516 __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \ 1517 : "=r" (sh), "=&r" (sl) \ 1518 : "r" ((UDItype)(bh)), \ 1519 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \ 1520 __CLOBBER_CC); \ 1521 else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \ 1522 __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \ 1523 : "=r" (sh), "=&r" (sl) \ 1524 : "r" ((UDItype)(bh)), \ 1525 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \ 1526 __CLOBBER_CC); \ 1527 else if (__builtin_constant_p (bh) && (bh) == 0) \ 1528 __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \ 1529 : "=r" (sh), "=&r" (sl) \ 1530 : "r" ((UDItype)(ah)), \ 1531 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \ 1532 __CLOBBER_CC); \ 1533 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \ 1534 __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \ 1535 : "=r" (sh), "=&r" (sl) \ 1536 : "r" ((UDItype)(ah)), \ 1537 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \ 1538 __CLOBBER_CC); \ 1539 else \ 1540 __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \ 1541 : "=r" (sh), "=&r" (sl) \ 1542 : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \ 1543 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \ 1544 __CLOBBER_CC); \ 1545 } \ 1546 } while (0) 1547 #endif /* ! _LONG_LONG_LIMB */ 1548 #define count_leading_zeros(count, x) \ 1549 __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x)) 1550 #define COUNT_LEADING_ZEROS_0 64 1551 #if __GMP_GNUC_PREREQ (4,8) 1552 #define umul_ppmm(w1, w0, u, v) \ 1553 do { \ 1554 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \ 1555 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \ 1556 w1 = __ll >> 64; \ 1557 w0 = __ll; \ 1558 } while (0) 1559 #endif 1560 #if !defined (umul_ppmm) 1561 #define umul_ppmm(ph, pl, m0, m1) \ 1562 do { \ 1563 UDItype __m0 = (m0), __m1 = (m1); \ 1564 __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \ 1565 (pl) = __m0 * __m1; \ 1566 } while (0) 1567 #endif 1568 #define smul_ppmm(ph, pl, m0, m1) \ 1569 do { \ 1570 DItype __m0 = (m0), __m1 = (m1); \ 1571 __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \ 1572 (pl) = __m0 * __m1; \ 1573 } while (0) 1574 #endif /* 64-bit PowerPC. 
*/ 1575 1576 #if defined (__pyr__) && W_TYPE_SIZE == 32 1577 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1578 __asm__ ("addw %5,%1\n\taddwc %3,%0" \ 1579 : "=r" (sh), "=&r" (sl) \ 1580 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 1581 "%1" ((USItype)(al)), "g" ((USItype)(bl))) 1582 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1583 __asm__ ("subw %5,%1\n\tsubwb %3,%0" \ 1584 : "=r" (sh), "=&r" (sl) \ 1585 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 1586 "1" ((USItype)(al)), "g" ((USItype)(bl))) 1587 /* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP. */ 1588 #define umul_ppmm(w1, w0, u, v) \ 1589 ({union {UDItype __ll; \ 1590 struct {USItype __h, __l;} __i; \ 1591 } __x; \ 1592 __asm__ ("movw %1,%R0\n\tuemul %2,%0" \ 1593 : "=&r" (__x.__ll) \ 1594 : "g" ((USItype) (u)), "g" ((USItype)(v))); \ 1595 (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) 1596 #endif /* __pyr__ */ 1597 1598 #if defined (__ibm032__) /* RT/ROMP */ && W_TYPE_SIZE == 32 1599 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1600 __asm__ ("a %1,%5\n\tae %0,%3" \ 1601 : "=r" (sh), "=&r" (sl) \ 1602 : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \ 1603 "%1" ((USItype)(al)), "r" ((USItype)(bl))) 1604 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1605 __asm__ ("s %1,%5\n\tse %0,%3" \ 1606 : "=r" (sh), "=&r" (sl) \ 1607 : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \ 1608 "1" ((USItype)(al)), "r" ((USItype)(bl))) 1609 #define smul_ppmm(ph, pl, m0, m1) \ 1610 __asm__ ( \ 1611 "s r2,r2\n" \ 1612 " mts r10,%2\n" \ 1613 " m r2,%3\n" \ 1614 " m r2,%3\n" \ 1615 " m r2,%3\n" \ 1616 " m r2,%3\n" \ 1617 " m r2,%3\n" \ 1618 " m r2,%3\n" \ 1619 " m r2,%3\n" \ 1620 " m r2,%3\n" \ 1621 " m r2,%3\n" \ 1622 " m r2,%3\n" \ 1623 " m r2,%3\n" \ 1624 " m r2,%3\n" \ 1625 " m r2,%3\n" \ 1626 " m r2,%3\n" \ 1627 " m r2,%3\n" \ 1628 " m r2,%3\n" \ 1629 " cas %0,r2,r0\n" \ 1630 " mfs r10,%1" \ 1631 : "=r" (ph), "=r" (pl) \ 1632 : "%r" ((USItype)(m0)), "r" ((USItype)(m1)) \ 1633 : "r2") 1634 #define count_leading_zeros(count, x) \ 1635 do { \ 1636 if ((x) >= 0x10000) \ 1637 __asm__ ("clz %0,%1" \ 1638 : "=r" (count) : "r" ((USItype)(x) >> 16)); \ 1639 else \ 1640 { \ 1641 __asm__ ("clz %0,%1" \ 1642 : "=r" (count) : "r" ((USItype)(x))); \ 1643 (count) += 16; \ 1644 } \ 1645 } while (0) 1646 #endif /* RT/ROMP */ 1647 1648 #if defined (__riscv64) && W_TYPE_SIZE == 64 1649 #define umul_ppmm(ph, pl, u, v) \ 1650 do { \ 1651 UDItype __u = (u), __v = (v); \ 1652 (pl) = __u * __v; \ 1653 __asm__ ("mulhu\t%0, %1, %2" : "=r" (ph) : "%r" (__u), "r" (__v)); \ 1654 } while (0) 1655 #endif 1656 1657 #if (defined (__SH2__) || defined (__SH3__) || defined (__SH4__)) && W_TYPE_SIZE == 32 1658 #define umul_ppmm(w1, w0, u, v) \ 1659 __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0" \ 1660 : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach") 1661 #endif 1662 1663 #if defined (__sparc__) && W_TYPE_SIZE == 32 1664 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1665 __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0" \ 1666 : "=r" (sh), "=&r" (sl) \ 1667 : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl) \ 1668 __CLOBBER_CC) 1669 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1670 __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0" \ 1671 : "=r" (sh), "=&r" (sl) \ 1672 : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \ 1673 __CLOBBER_CC) 1674 /* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h 1675 doesn't define anything to indicate that to us, it only sets __sparcv8.
*/ 1676 #if defined (__sparc_v9__) || defined (__sparcv9) 1677 /* Perhaps we should use floating-point operations here? */ 1678 #if 0 1679 /* Triggers a bug making mpz/tests/t-gcd.c fail. 1680 Perhaps we simply need explicitly zero-extend the inputs? */ 1681 #define umul_ppmm(w1, w0, u, v) \ 1682 __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" : \ 1683 "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1") 1684 #else 1685 /* Use v8 umul until above bug is fixed. */ 1686 #define umul_ppmm(w1, w0, u, v) \ 1687 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v)) 1688 #endif 1689 /* Use a plain v8 divide for v9. */ 1690 #define udiv_qrnnd(q, r, n1, n0, d) \ 1691 do { \ 1692 USItype __q; \ 1693 __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \ 1694 : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \ 1695 (r) = (n0) - __q * (d); \ 1696 (q) = __q; \ 1697 } while (0) 1698 #else 1699 #if defined (__sparc_v8__) /* gcc normal */ \ 1700 || defined (__sparcv8) /* gcc solaris */ \ 1701 || HAVE_HOST_CPU_supersparc 1702 /* Don't match immediate range because, 1) it is not often useful, 1703 2) the 'I' flag thinks of the range as a 13 bit signed interval, 1704 while we want to match a 13 bit interval, sign extended to 32 bits, 1705 but INTERPRETED AS UNSIGNED. */ 1706 #define umul_ppmm(w1, w0, u, v) \ 1707 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v)) 1708 1709 #if HAVE_HOST_CPU_supersparc 1710 #else 1711 /* Don't use this on SuperSPARC because its udiv only handles 53 bit 1712 dividends and will trap to the kernel for the rest. */ 1713 #define udiv_qrnnd(q, r, n1, n0, d) \ 1714 do { \ 1715 USItype __q; \ 1716 __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \ 1717 : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \ 1718 (r) = (n0) - __q * (d); \ 1719 (q) = __q; \ 1720 } while (0) 1721 #endif /* HAVE_HOST_CPU_supersparc */ 1722 1723 #else /* ! __sparc_v8__ */ 1724 #if defined (__sparclite__) 1725 /* This has hardware multiply but not divide. It also has two additional 1726 instructions scan (ffs from high bit) and divscc. */ 1727 #define umul_ppmm(w1, w0, u, v) \ 1728 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v)) 1729 #define udiv_qrnnd(q, r, n1, n0, d) \ 1730 __asm__ ("! Inlined udiv_qrnnd\n" \ 1731 " wr %%g0,%2,%%y ! Not a delayed write for sparclite\n" \ 1732 " tst %%g0\n" \ 1733 " divscc %3,%4,%%g1\n" \ 1734 " divscc %%g1,%4,%%g1\n" \ 1735 " divscc %%g1,%4,%%g1\n" \ 1736 " divscc %%g1,%4,%%g1\n" \ 1737 " divscc %%g1,%4,%%g1\n" \ 1738 " divscc %%g1,%4,%%g1\n" \ 1739 " divscc %%g1,%4,%%g1\n" \ 1740 " divscc %%g1,%4,%%g1\n" \ 1741 " divscc %%g1,%4,%%g1\n" \ 1742 " divscc %%g1,%4,%%g1\n" \ 1743 " divscc %%g1,%4,%%g1\n" \ 1744 " divscc %%g1,%4,%%g1\n" \ 1745 " divscc %%g1,%4,%%g1\n" \ 1746 " divscc %%g1,%4,%%g1\n" \ 1747 " divscc %%g1,%4,%%g1\n" \ 1748 " divscc %%g1,%4,%%g1\n" \ 1749 " divscc %%g1,%4,%%g1\n" \ 1750 " divscc %%g1,%4,%%g1\n" \ 1751 " divscc %%g1,%4,%%g1\n" \ 1752 " divscc %%g1,%4,%%g1\n" \ 1753 " divscc %%g1,%4,%%g1\n" \ 1754 " divscc %%g1,%4,%%g1\n" \ 1755 " divscc %%g1,%4,%%g1\n" \ 1756 " divscc %%g1,%4,%%g1\n" \ 1757 " divscc %%g1,%4,%%g1\n" \ 1758 " divscc %%g1,%4,%%g1\n" \ 1759 " divscc %%g1,%4,%%g1\n" \ 1760 " divscc %%g1,%4,%%g1\n" \ 1761 " divscc %%g1,%4,%%g1\n" \ 1762 " divscc %%g1,%4,%%g1\n" \ 1763 " divscc %%g1,%4,%%g1\n" \ 1764 " divscc %%g1,%4,%0\n" \ 1765 " rd %%y,%1\n" \ 1766 " bl,a 1f\n" \ 1767 " add %1,%4,%1\n" \ 1768 "1: ! 
End of inline udiv_qrnnd" \ 1769 : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d) \ 1770 : "%g1" __AND_CLOBBER_CC) 1771 #define count_leading_zeros(count, x) \ 1772 __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x)) 1773 /* Early sparclites return 63 for an argument of 0, but they warn that future 1774 implementations might change this. Therefore, leave COUNT_LEADING_ZEROS_0 1775 undefined. */ 1776 #endif /* __sparclite__ */ 1777 #endif /* __sparc_v8__ */ 1778 #endif /* __sparc_v9__ */ 1779 /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd. */ 1780 #ifndef umul_ppmm 1781 #define umul_ppmm(w1, w0, u, v) \ 1782 __asm__ ("! Inlined umul_ppmm\n" \ 1783 " wr %%g0,%2,%%y ! SPARC has 0-3 delay insn after a wr\n" \ 1784 " sra %3,31,%%g2 ! Don't move this insn\n" \ 1785 " and %2,%%g2,%%g2 ! Don't move this insn\n" \ 1786 " andcc %%g0,0,%%g1 ! Don't move this insn\n" \ 1787 " mulscc %%g1,%3,%%g1\n" \ 1788 " mulscc %%g1,%3,%%g1\n" \ 1789 " mulscc %%g1,%3,%%g1\n" \ 1790 " mulscc %%g1,%3,%%g1\n" \ 1791 " mulscc %%g1,%3,%%g1\n" \ 1792 " mulscc %%g1,%3,%%g1\n" \ 1793 " mulscc %%g1,%3,%%g1\n" \ 1794 " mulscc %%g1,%3,%%g1\n" \ 1795 " mulscc %%g1,%3,%%g1\n" \ 1796 " mulscc %%g1,%3,%%g1\n" \ 1797 " mulscc %%g1,%3,%%g1\n" \ 1798 " mulscc %%g1,%3,%%g1\n" \ 1799 " mulscc %%g1,%3,%%g1\n" \ 1800 " mulscc %%g1,%3,%%g1\n" \ 1801 " mulscc %%g1,%3,%%g1\n" \ 1802 " mulscc %%g1,%3,%%g1\n" \ 1803 " mulscc %%g1,%3,%%g1\n" \ 1804 " mulscc %%g1,%3,%%g1\n" \ 1805 " mulscc %%g1,%3,%%g1\n" \ 1806 " mulscc %%g1,%3,%%g1\n" \ 1807 " mulscc %%g1,%3,%%g1\n" \ 1808 " mulscc %%g1,%3,%%g1\n" \ 1809 " mulscc %%g1,%3,%%g1\n" \ 1810 " mulscc %%g1,%3,%%g1\n" \ 1811 " mulscc %%g1,%3,%%g1\n" \ 1812 " mulscc %%g1,%3,%%g1\n" \ 1813 " mulscc %%g1,%3,%%g1\n" \ 1814 " mulscc %%g1,%3,%%g1\n" \ 1815 " mulscc %%g1,%3,%%g1\n" \ 1816 " mulscc %%g1,%3,%%g1\n" \ 1817 " mulscc %%g1,%3,%%g1\n" \ 1818 " mulscc %%g1,%3,%%g1\n" \ 1819 " mulscc %%g1,0,%%g1\n" \ 1820 " add %%g1,%%g2,%0\n" \ 1821 " rd %%y,%1" \ 1822 : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v) \ 1823 : "%g1", "%g2" __AND_CLOBBER_CC) 1824 #endif 1825 #ifndef udiv_qrnnd 1826 #ifndef LONGLONG_STANDALONE 1827 #define udiv_qrnnd(q, r, n1, n0, d) \ 1828 do { UWtype __r; \ 1829 (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \ 1830 (r) = __r; \ 1831 } while (0) 1832 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype); 1833 #endif /* LONGLONG_STANDALONE */ 1834 #endif /* udiv_qrnnd */ 1835 #endif /* __sparc__ */ 1836 1837 #if defined (__sparc__) && W_TYPE_SIZE == 64 1838 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1839 __asm__ ( \ 1840 "addcc %r4,%5,%1\n" \ 1841 " addccc %r6,%7,%%g0\n" \ 1842 " addc %r2,%3,%0" \ 1843 : "=r" (sh), "=&r" (sl) \ 1844 : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)), \ 1845 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)), \ 1846 "%rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32) \ 1847 __CLOBBER_CC) 1848 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1849 __asm__ ( \ 1850 "subcc %r4,%5,%1\n" \ 1851 " subccc %r6,%7,%%g0\n" \ 1852 " subc %r2,%3,%0" \ 1853 : "=r" (sh), "=&r" (sl) \ 1854 : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)), \ 1855 "rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)), \ 1856 "rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32) \ 1857 __CLOBBER_CC) 1858 #if __VIS__ >= 0x300 1859 #undef add_ssaaaa 1860 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1861 __asm__ ( \ 1862 "addcc %r4, %5, %1\n" \ 1863 " addxc %r2, %r3, %0" \ 1864 : "=r" (sh), "=&r" (sl) \ 1865 : "rJ" ((UDItype)(ah)), "rJ" ((UDItype)(bh)), \ 1866 "%rJ" ((UDItype)(al)), "rI" 
((UDItype)(bl)) __CLOBBER_CC) 1867 #define umul_ppmm(ph, pl, m0, m1) \ 1868 do { \ 1869 UDItype __m0 = (m0), __m1 = (m1); \ 1870 (pl) = __m0 * __m1; \ 1871 __asm__ ("umulxhi\t%2, %1, %0" \ 1872 : "=r" (ph) \ 1873 : "%r" (__m0), "r" (__m1)); \ 1874 } while (0) 1875 #define count_leading_zeros(count, x) \ 1876 __asm__ ("lzd\t%1,%0" : "=r" (count) : "r" (x)) 1877 /* Needed by count_leading_zeros_32 in sparc64.h. */ 1878 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB 1879 #endif 1880 #endif 1881 1882 #if (defined (__vax) || defined (__vax__)) && W_TYPE_SIZE == 32 1883 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1884 __asm__ ("addl2 %5,%1\n\tadwc %3,%0" \ 1885 : "=g" (sh), "=&g" (sl) \ 1886 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 1887 "%1" ((USItype)(al)), "g" ((USItype)(bl))) 1888 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1889 __asm__ ("subl2 %5,%1\n\tsbwc %3,%0" \ 1890 : "=g" (sh), "=&g" (sl) \ 1891 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 1892 "1" ((USItype)(al)), "g" ((USItype)(bl))) 1893 #define smul_ppmm(xh, xl, m0, m1) \ 1894 do { \ 1895 union {UDItype __ll; \ 1896 struct {USItype __l, __h;} __i; \ 1897 } __x; \ 1898 USItype __m0 = (m0), __m1 = (m1); \ 1899 __asm__ ("emul %1,%2,$0,%0" \ 1900 : "=g" (__x.__ll) : "g" (__m0), "g" (__m1)); \ 1901 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ 1902 } while (0) 1903 #define sdiv_qrnnd(q, r, n1, n0, d) \ 1904 do { \ 1905 union {DItype __ll; \ 1906 struct {SItype __l, __h;} __i; \ 1907 } __x; \ 1908 __x.__i.__h = n1; __x.__i.__l = n0; \ 1909 __asm__ ("ediv %3,%2,%0,%1" \ 1910 : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d)); \ 1911 } while (0) 1912 #if 0 1913 /* FIXME: This instruction appears to be unimplemented on some systems (vax 1914 8800 maybe). */ 1915 #define count_trailing_zeros(count,x) \ 1916 do { \ 1917 __asm__ ("ffs 0, 31, %1, %0" \ 1918 : "=g" (count) \ 1919 : "g" ((USItype) (x))); \ 1920 } while (0) 1921 #endif 1922 #endif /* vax */ 1923 1924 #if defined (__z8000__) && W_TYPE_SIZE == 16 1925 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1926 __asm__ ("add %H1,%H5\n\tadc %H0,%H3" \ 1927 : "=r" (sh), "=&r" (sl) \ 1928 : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \ 1929 "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl))) 1930 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1931 __asm__ ("sub %H1,%H5\n\tsbc %H0,%H3" \ 1932 : "=r" (sh), "=&r" (sl) \ 1933 : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \ 1934 "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl))) 1935 #define umul_ppmm(xh, xl, m0, m1) \ 1936 do { \ 1937 union {long int __ll; \ 1938 struct {unsigned int __h, __l;} __i; \ 1939 } __x; \ 1940 unsigned int __m0 = (m0), __m1 = (m1); \ 1941 __asm__ ("mult %S0,%H3" \ 1942 : "=r" (__x.__i.__h), "=r" (__x.__i.__l) \ 1943 : "%1" (m0), "rQR" (m1)); \ 1944 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ 1945 (xh) += ((((signed int) __m0 >> 15) & __m1) \ 1946 + (((signed int) __m1 >> 15) & __m0)); \ 1947 } while (0) 1948 #endif /* __z8000__ */ 1949 1950 #endif /* __GNUC__ */ 1951 1952 #endif /* NO_ASM */ 1953 1954 1955 /* FIXME: "sidi" here is highly doubtful, should sometimes be "diti". 
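   (The suffix follows the GCC mode-naming convention: "sidi" means SImode
   (32-bit) operands producing a DImode (64-bit) result, whereas a build with
   64-bit limbs really wants the "diti" form, DImode operands producing a
   TImode (128-bit) result.)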
*/ 1956 #if !defined (umul_ppmm) && defined (__umulsidi3) 1957 #define umul_ppmm(ph, pl, m0, m1) \ 1958 do { \ 1959 UDWtype __ll = __umulsidi3 (m0, m1); \ 1960 ph = (UWtype) (__ll >> W_TYPE_SIZE); \ 1961 pl = (UWtype) __ll; \ 1962 } while (0) 1963 #endif 1964 1965 #if !defined (__umulsidi3) 1966 #define __umulsidi3(u, v) \ 1967 ({UWtype __hi, __lo; \ 1968 umul_ppmm (__hi, __lo, u, v); \ 1969 ((UDWtype) __hi << W_TYPE_SIZE) | __lo; }) 1970 #endif 1971 1972 1973 #if defined (__cplusplus) 1974 #define __longlong_h_C "C" 1975 #else 1976 #define __longlong_h_C 1977 #endif 1978 1979 /* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist. The "_r" 1980 forms have "reversed" arguments, meaning the pointer is last, which 1981 sometimes allows better parameter passing, in particular on 64-bit 1982 hppa. */ 1983 1984 #define mpn_umul_ppmm __MPN(umul_ppmm) 1985 extern __longlong_h_C UWtype mpn_umul_ppmm (UWtype *, UWtype, UWtype); 1986 1987 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm \ 1988 && ! defined (LONGLONG_STANDALONE) 1989 #define umul_ppmm(wh, wl, u, v) \ 1990 do { \ 1991 UWtype __umul_ppmm__p0; \ 1992 (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v));\ 1993 (wl) = __umul_ppmm__p0; \ 1994 } while (0) 1995 #endif 1996 1997 #define mpn_umul_ppmm_r __MPN(umul_ppmm_r) 1998 extern __longlong_h_C UWtype mpn_umul_ppmm_r (UWtype, UWtype, UWtype *); 1999 2000 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r \ 2001 && ! defined (LONGLONG_STANDALONE) 2002 #define umul_ppmm(wh, wl, u, v) \ 2003 do { \ 2004 UWtype __umul_p0; \ 2005 (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_p0); \ 2006 (wl) = __umul_p0; \ 2007 } while (0) 2008 #endif 2009 2010 #define mpn_udiv_qrnnd __MPN(udiv_qrnnd) 2011 extern __longlong_h_C UWtype mpn_udiv_qrnnd (UWtype *, UWtype, UWtype, UWtype); 2012 2013 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd \ 2014 && ! defined (LONGLONG_STANDALONE) 2015 #define udiv_qrnnd(q, r, n1, n0, d) \ 2016 do { \ 2017 UWtype __udiv_qrnnd_r; \ 2018 (q) = mpn_udiv_qrnnd (&__udiv_qrnnd_r, \ 2019 (UWtype) (n1), (UWtype) (n0), (UWtype) d); \ 2020 (r) = __udiv_qrnnd_r; \ 2021 } while (0) 2022 #endif 2023 2024 #define mpn_udiv_qrnnd_r __MPN(udiv_qrnnd_r) 2025 extern __longlong_h_C UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, UWtype *); 2026 2027 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r \ 2028 && ! defined (LONGLONG_STANDALONE) 2029 #define udiv_qrnnd(q, r, n1, n0, d) \ 2030 do { \ 2031 UWtype __udiv_qrnnd_r; \ 2032 (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) d, \ 2033 &__udiv_qrnnd_r); \ 2034 (r) = __udiv_qrnnd_r; \ 2035 } while (0) 2036 #endif 2037 2038 2039 /* If this machine has no inline assembler, use C macros. */ 2040 2041 #if !defined (add_ssaaaa) 2042 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 2043 do { \ 2044 UWtype __x; \ 2045 __x = (al) + (bl); \ 2046 (sh) = (ah) + (bh) + (__x < (al)); \ 2047 (sl) = __x; \ 2048 } while (0) 2049 #endif 2050 2051 #if !defined (sub_ddmmss) 2052 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 2053 do { \ 2054 UWtype __x; \ 2055 __x = (al) - (bl); \ 2056 (sh) = (ah) - (bh) - ((al) < (bl)); \ 2057 (sl) = __x; \ 2058 } while (0) 2059 #endif 2060 2061 /* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of 2062 smul_ppmm. 
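   Reinterpreting an operand whose most significant bit is set as signed
   changes its value by exactly 2^W_TYPE_SIZE, which modulo 2^(2*W_TYPE_SIZE)
   affects only the high word of the product; so adding the other operand
   back into the high word (mod 2^W_TYPE_SIZE) for each operand whose top bit
   is set converts the signed high word into the unsigned one, which is what
   the masked additions below do.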
*/ 2063 #if !defined (umul_ppmm) && defined (smul_ppmm) 2064 #define umul_ppmm(w1, w0, u, v) \ 2065 do { \ 2066 UWtype __w1; \ 2067 UWtype __xm0 = (u), __xm1 = (v); \ 2068 smul_ppmm (__w1, w0, __xm0, __xm1); \ 2069 (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \ 2070 + (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \ 2071 } while (0) 2072 #endif 2073 2074 /* If we still don't have umul_ppmm, define it using plain C. 2075 2076 For reference, when this code is used for squaring (ie. u and v identical 2077 expressions), gcc recognises __x1 and __x2 are the same and generates 3 2078 multiplies, not 4. The subsequent additions could be optimized a bit, 2079 but the only place GMP currently uses such a square is mpn_sqr_basecase, 2080 and chips obliged to use this generic C umul will have plenty of worse 2081 performance problems than a couple of extra instructions on the diagonal 2082 of sqr_basecase. */ 2083 2084 #if !defined (umul_ppmm) 2085 #define umul_ppmm(w1, w0, u, v) \ 2086 do { \ 2087 UWtype __x0, __x1, __x2, __x3; \ 2088 UHWtype __ul, __vl, __uh, __vh; \ 2089 UWtype __u = (u), __v = (v); \ 2090 \ 2091 __ul = __ll_lowpart (__u); \ 2092 __uh = __ll_highpart (__u); \ 2093 __vl = __ll_lowpart (__v); \ 2094 __vh = __ll_highpart (__v); \ 2095 \ 2096 __x0 = (UWtype) __ul * __vl; \ 2097 __x1 = (UWtype) __ul * __vh; \ 2098 __x2 = (UWtype) __uh * __vl; \ 2099 __x3 = (UWtype) __uh * __vh; \ 2100 \ 2101 __x1 += __ll_highpart (__x0);/* this can't give carry */ \ 2102 __x1 += __x2; /* but this indeed can */ \ 2103 if (__x1 < __x2) /* did we get it? */ \ 2104 __x3 += __ll_B; /* yes, add it in the proper pos. */ \ 2105 \ 2106 (w1) = __x3 + __ll_highpart (__x1); \ 2107 (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0); \ 2108 } while (0) 2109 #endif 2110 2111 /* If we don't have smul_ppmm, define it using umul_ppmm (which surely will 2112 exist in one form or another. */ 2113 #if !defined (smul_ppmm) 2114 #define smul_ppmm(w1, w0, u, v) \ 2115 do { \ 2116 UWtype __w1; \ 2117 UWtype __xm0 = (u), __xm1 = (v); \ 2118 umul_ppmm (__w1, w0, __xm0, __xm1); \ 2119 (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \ 2120 - (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \ 2121 } while (0) 2122 #endif 2123 2124 /* Define this unconditionally, so it can be used for debugging. */ 2125 #define __udiv_qrnnd_c(q, r, n1, n0, d) \ 2126 do { \ 2127 UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \ 2128 \ 2129 ASSERT ((d) != 0); \ 2130 ASSERT ((n1) < (d)); \ 2131 \ 2132 __d1 = __ll_highpart (d); \ 2133 __d0 = __ll_lowpart (d); \ 2134 \ 2135 __q1 = (n1) / __d1; \ 2136 __r1 = (n1) - __q1 * __d1; \ 2137 __m = __q1 * __d0; \ 2138 __r1 = __r1 * __ll_B | __ll_highpart (n0); \ 2139 if (__r1 < __m) \ 2140 { \ 2141 __q1--, __r1 += (d); \ 2142 if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\ 2143 if (__r1 < __m) \ 2144 __q1--, __r1 += (d); \ 2145 } \ 2146 __r1 -= __m; \ 2147 \ 2148 __q0 = __r1 / __d1; \ 2149 __r0 = __r1 - __q0 * __d1; \ 2150 __m = __q0 * __d0; \ 2151 __r0 = __r0 * __ll_B | __ll_lowpart (n0); \ 2152 if (__r0 < __m) \ 2153 { \ 2154 __q0--, __r0 += (d); \ 2155 if (__r0 >= (d)) \ 2156 if (__r0 < __m) \ 2157 __q0--, __r0 += (d); \ 2158 } \ 2159 __r0 -= __m; \ 2160 \ 2161 (q) = __q1 * __ll_B | __q0; \ 2162 (r) = __r0; \ 2163 } while (0) 2164 2165 /* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through 2166 __udiv_w_sdiv (defined in libgcc or elsewhere). */ 2167 #if !defined (udiv_qrnnd) && defined (sdiv_qrnnd) \ 2168 && ! 
defined (LONGLONG_STANDALONE) 2169 #define udiv_qrnnd(q, r, nh, nl, d) \ 2170 do { \ 2171 UWtype __r; \ 2172 (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d); \ 2173 (r) = __r; \ 2174 } while (0) 2175 __GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype); 2176 #endif 2177 2178 /* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c. */ 2179 #if !defined (udiv_qrnnd) 2180 #define UDIV_NEEDS_NORMALIZATION 1 2181 #define udiv_qrnnd __udiv_qrnnd_c 2182 #endif 2183 2184 #if !defined (count_leading_zeros) 2185 #define count_leading_zeros(count, x) \ 2186 do { \ 2187 UWtype __xr = (x); \ 2188 UWtype __a; \ 2189 \ 2190 if (W_TYPE_SIZE == 32) \ 2191 { \ 2192 __a = __xr < ((UWtype) 1 << 2*__BITS4) \ 2193 ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1) \ 2194 : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1 \ 2195 : 3*__BITS4 + 1); \ 2196 } \ 2197 else \ 2198 { \ 2199 for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8) \ 2200 if (((__xr >> __a) & 0xff) != 0) \ 2201 break; \ 2202 ++__a; \ 2203 } \ 2204 \ 2205 (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a]; \ 2206 } while (0) 2207 /* This version gives a well-defined value for zero. */ 2208 #define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1) 2209 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB 2210 #define COUNT_LEADING_ZEROS_SLOW 2211 #endif 2212 2213 /* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */ 2214 #if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY 2215 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB 2216 #endif 2217 2218 #ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB 2219 extern const unsigned char __GMP_DECLSPEC __clz_tab[129]; 2220 #endif 2221 2222 #if !defined (count_trailing_zeros) 2223 #if !defined (COUNT_LEADING_ZEROS_SLOW) 2224 /* Define count_trailing_zeros using an asm count_leading_zeros. */ 2225 #define count_trailing_zeros(count, x) \ 2226 do { \ 2227 UWtype __ctz_x = (x); \ 2228 UWtype __ctz_c; \ 2229 ASSERT (__ctz_x != 0); \ 2230 count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x); \ 2231 (count) = W_TYPE_SIZE - 1 - __ctz_c; \ 2232 } while (0) 2233 #else 2234 /* Define count_trailing_zeros in plain C, assuming small counts are common. 2235 We use clz_tab without ado, since the C count_leading_zeros above will have 2236 pulled it in. */ 2237 #define count_trailing_zeros(count, x) \ 2238 do { \ 2239 UWtype __ctz_x = (x); \ 2240 int __ctz_c; \ 2241 \ 2242 if (LIKELY ((__ctz_x & 0xff) != 0)) \ 2243 (count) = __clz_tab[__ctz_x & -__ctz_x] - 2; \ 2244 else \ 2245 { \ 2246 for (__ctz_c = 8 - 2; __ctz_c < W_TYPE_SIZE - 2; __ctz_c += 8) \ 2247 { \ 2248 __ctz_x >>= 8; \ 2249 if (LIKELY ((__ctz_x & 0xff) != 0)) \ 2250 break; \ 2251 } \ 2252 \ 2253 (count) = __ctz_c + __clz_tab[__ctz_x & -__ctz_x]; \ 2254 } \ 2255 } while (0) 2256 #endif 2257 #endif 2258 2259 #ifndef UDIV_NEEDS_NORMALIZATION 2260 #define UDIV_NEEDS_NORMALIZATION 0 2261 #endif 2262 2263 /* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and 2264 that hence the latter should always be used. */ 2265 #ifndef UDIV_PREINV_ALWAYS 2266 #define UDIV_PREINV_ALWAYS 0 2267 #endif 2268
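
/* Illustrative sketch only: one conventional way for a caller to cope with
   UDIV_NEEDS_NORMALIZATION being non-zero is to shift the divisor left until
   its most significant bit is set, shift the two-word numerator by the same
   amount, and shift the remainder back afterwards; the quotient is unchanged
   by that scaling.  The helper name below is hypothetical and the block is
   disabled; it merely demonstrates the calling convention of the macros
   above and is not a GMP interface.  */
#if 0
static void
__example_udiv_normalizing (UWtype *qp, UWtype *rp,
                            UWtype n1, UWtype n0, UWtype d)
{
  UWtype __q, __r;
  int __cnt;

  ASSERT (d != 0);
  ASSERT (n1 < d);              /* otherwise the quotient overflows a word */

  count_leading_zeros (__cnt, d);
  if (__cnt != 0)
    {
      /* Scale numerator and divisor by 2^__cnt; the remainder scales too. */
      n1 = (n1 << __cnt) | (n0 >> (W_TYPE_SIZE - __cnt));
      n0 <<= __cnt;
      d <<= __cnt;
    }
  udiv_qrnnd (__q, __r, n1, n0, d);
  *qp = __q;
  *rp = __r >> __cnt;
}
#endif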