1 /* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
2 
3 Copyright 1991-1994, 1996, 1997, 1999-2005, 2007-2009, 2011-2023 Free Software Foundation, Inc.
4 
5 This file is part of the GNU MPFR Library and has been copied from
6 GNU MP 18339:32dc4af70f95, with the following changes:
7   * the copyright notice (note: only LGPL 3+ is used in MPFR);
8   * the code declared as added for MPFR just below these comments;
9   * __GMP_DECLSPEC renamed to __MPFR_DECLSPEC.
10 
11 The GNU MPFR Library is free software; you can redistribute it and/or modify
12 it under the terms of the GNU Lesser General Public License as published by
13 the Free Software Foundation; either version 3 of the License, or (at your
14 option) any later version.
15 
16 The GNU MPFR Library is distributed in the hope that it will be useful, but
17 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
18 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
19 License for more details.
20 
21 You should have received a copy of the GNU Lesser General Public License
22 along with the GNU MPFR Library; see the file COPYING.LESSER.  If not, see
23 https://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
24 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
25 
26 /* You have to define the following before including this file:
27 
28    UWtype -- An unsigned type, default type for operations (typically a "word")
29    UHWtype -- An unsigned type, at least half the size of UWtype
   UDWtype -- An unsigned type, at least twice as large as UWtype
31    W_TYPE_SIZE -- size in bits of UWtype
32 
33    SItype, USItype -- Signed and unsigned 32 bit types
34    DItype, UDItype -- Signed and unsigned 64 bit types
35 
36    On a 32 bit machine UWtype should typically be USItype;
   on a 64 bit machine, UWtype should typically be UDItype.
   (An illustrative set of such definitions is sketched just after this
   comment.)
38 
39    Optionally, define:
40 
41    LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
42    NO_ASM -- Disable inline asm
43 
44 
45    CAUTION!  Using this version of longlong.h outside of GMP is not safe.  You
46    need to include gmp.h and gmp-impl.h, or certain things might not work as
47    expected.
48 */
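
/* For illustration only: a hypothetical set of the definitions listed above,
   roughly what a 64-bit configuration might use.  The real definitions are
   supplied by the including code (gmp.h/gmp-impl.h, or MPFR's own headers),
   so this block is not compiled.  */
#if 0
typedef int                SItype;    /* assumes a 32-bit "int" */
typedef unsigned int       USItype;
typedef long long          DItype;
typedef unsigned long long UDItype;
#define UWtype       UDItype
#define UHWtype      USItype
#define UDWtype      unsigned __int128   /* only where the compiler has it */
#define W_TYPE_SIZE  64
#endif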
49 
50 /* Code added for MPFR */
51 
52 #ifndef MPFR_NEED_LONGLONG_H
53 # error "Never include mpfr-longlong.h directly; define MPFR_NEED_LONGLONG_H instead."
54 #endif
55 
56 #ifndef __GMP_GNUC_PREREQ
57 # define __GMP_GNUC_PREREQ(X,Y) __MPFR_GNUC(X,Y)
58 #endif
59 
60 /* End of code added for MPFR */
61 
62 #define __BITS4 (W_TYPE_SIZE / 4)
63 #define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
64 #define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
65 #define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
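/* Example: with W_TYPE_SIZE == 32, __ll_B is 0x10000, so
   __ll_lowpart (0x12345678) is 0x5678 and __ll_highpart (0x12345678) is
   0x1234; any word t equals __ll_highpart (t) * __ll_B + __ll_lowpart (t).  */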
66 
67 /* This is used to make sure no undesirable sharing between different libraries
68    that use this file takes place.  */
69 #ifndef __MPN
70 #define __MPN(x) __##x
71 #endif
72 
73 /* Define auxiliary asm macros.
74 
75    1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
76    UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
77    word product in HIGH_PROD and LOW_PROD.
78 
79    2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
80    UDWtype product.  This is just a variant of umul_ppmm.
81 
82    3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
83    denominator) divides a UDWtype, composed by the UWtype integers
84    HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
85    in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
   than DENOMINATOR for correct operation.  If, in addition, the most
   significant bit of DENOMINATOR is required to be 1, then the preprocessor
   symbol UDIV_NEEDS_NORMALIZATION is defined to 1.
89 
90    4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
91    denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
92    is rounded towards 0.
93 
94    5) count_leading_zeros(count, x) counts the number of zero-bits from the
95    msb to the first non-zero bit in the UWtype X.  This is the number of
96    steps X needs to be shifted left to set the msb.  Undefined for X == 0,
97    unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
98 
99    6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
100    from the least significant end.
101 
102    7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
103    high_addend_2, low_addend_2) adds two UWtype integers, composed by
104    HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
105    respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
106    (i.e. carry out) is not stored anywhere, and is lost.
107 
108    8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
109    high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
   composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
   LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
112    and LOW_DIFFERENCE.  Overflow (i.e. carry out) is not stored anywhere,
113    and is lost.
114 
   If any of these macros are left undefined for a particular CPU,
   generic C macros are used; a plain-C sketch of that scheme is shown just
   after this comment.
117 
118 
119    Notes:
120 
121    For add_ssaaaa the two high and two low addends can both commute, but
122    unfortunately gcc only supports one "%" commutative in each asm block.
123    This has always been so but is only documented in recent versions
124    (eg. pre-release 3.3).  Having two or more "%"s can cause an internal
125    compiler error in certain rare circumstances.
126 
127    Apparently it was only the last "%" that was ever actually respected, so
128    the code has been updated to leave just that.  Clearly there's a free
129    choice whether high or low should get it, if there's a reason to favour
130    one over the other.  Also obviously when the constraints on the two
131    operands are identical there's no benefit to the reloader in any "%" at
132    all.
133 
134    */
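
/* For illustration only: hypothetical plain-C versions, in the spirit of the
   generic fallbacks mentioned above, showing how add_ssaaaa and umul_ppmm can
   be written without asm.  The "_sketch" names are made up; this block is not
   compiled and is not what this file actually uses.  */
#if 0
#define add_ssaaaa_sketch(sh, sl, ah, al, bh, bl)			\
  do {									\
    UWtype __al = (al), __s = __al + (bl);				\
    (sh) = (ah) + (bh) + (__s < __al);	/* carry out of the low word */	\
    (sl) = __s;								\
  } while (0)
#define umul_ppmm_sketch(w1, w0, u, v)					\
  do {									\
    UWtype __ul = __ll_lowpart (u), __uh = __ll_highpart (u);		\
    UWtype __vl = __ll_lowpart (v), __vh = __ll_highpart (v);		\
    UWtype __x0 = __ul * __vl;						\
    UWtype __x1 = __ul * __vh;						\
    UWtype __x2 = __uh * __vl;						\
    UWtype __x3 = __uh * __vh;						\
    __x1 += __ll_highpart (__x0);	/* cannot carry */		\
    __x1 += __x2;			/* but this can */		\
    if (__x1 < __x2)			/* carry into the high product */\
      __x3 += __ll_B;							\
    (w1) = __x3 + __ll_highpart (__x1);					\
    (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0);		\
  } while (0)
#endif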
135 
136 /* The CPUs come in alphabetical order below.
137 
138    Please add support for more CPUs here, or improve the current support
139    for the CPUs below!  */
140 
141 
142 /* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
143    3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
144    Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
145    __builtin_ctzll.
146 
   These builtins are only used where we have checked what code comes out;
   on some chips they're merely libgcc calls, in which case we instead want
   an inline version (either asm or generic C).
150 
151    These builtins are better than an asm block of the same insn, since an
152    asm block doesn't give gcc any information about scheduling or resource
153    usage.  We keep an asm block for use on prior versions of gcc though.
154 
155    For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
156    it's not used (for count_leading_zeros) because it generally gives extra
157    code to ensure the result is 0 when the input is 0, which we don't need
   or want.  (A small usage sketch follows the macro definitions below.)  */
159 
160 #ifdef _LONG_LONG_LIMB
161 #define count_leading_zeros_gcc_clz(count,x)	\
162   do {						\
163     ASSERT ((x) != 0);				\
164     (count) = __builtin_clzll (x);		\
165   } while (0)
166 #else
167 #define count_leading_zeros_gcc_clz(count,x)	\
168   do {						\
169     ASSERT ((x) != 0);				\
170     (count) = __builtin_clzl (x);		\
171   } while (0)
172 #endif
173 
174 #ifdef _LONG_LONG_LIMB
175 #define count_trailing_zeros_gcc_ctz(count,x)	\
176   do {						\
177     ASSERT ((x) != 0);				\
178     (count) = __builtin_ctzll (x);		\
179   } while (0)
180 #else
181 #define count_trailing_zeros_gcc_ctz(count,x)	\
182   do {						\
183     ASSERT ((x) != 0);				\
184     (count) = __builtin_ctzl (x);		\
185   } while (0)
186 #endif
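
/* For illustration only: a hypothetical helper showing how the macros above
   are typically combined -- count_leading_zeros normalizes the divisor so
   that udiv_qrnnd can be used even when UDIV_NEEDS_NORMALIZATION is 1.  The
   function name is made up and the block is not compiled.  Requires d != 0
   and n1 < d, as documented earlier.  */
#if 0
static UWtype
udiv_2by1_sketch (UWtype *rp, UWtype n1, UWtype n0, UWtype d)
{
  UWtype q, r;
  int cnt;
  count_leading_zeros (cnt, d);
  if (cnt != 0)
    {
      /* Shift so that the most significant bit of d is set.  */
      d <<= cnt;
      n1 = (n1 << cnt) | (n0 >> (W_TYPE_SIZE - cnt));
      n0 <<= cnt;
    }
  udiv_qrnnd (q, r, n1, n0, d);
  *rp = r >> cnt;		/* undo the normalization of the remainder */
  return q;
}
#endif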
187 
188 
189 /* FIXME: The macros using external routines like __MPN(count_leading_zeros)
190    don't need to be under !NO_ASM */
191 #if ! defined (NO_ASM)
192 
193 #if defined (__alpha) && W_TYPE_SIZE == 64
194 /* Most alpha-based machines, except Cray systems. */
195 #if defined (__GNUC__)
196 #if __GMP_GNUC_PREREQ (3,3)
197 #define umul_ppmm(ph, pl, m0, m1) \
198   do {									\
199     UDItype __m0 = (m0), __m1 = (m1);					\
200     (ph) = __builtin_alpha_umulh (__m0, __m1);				\
201     (pl) = __m0 * __m1;							\
202   } while (0)
203 #else
204 #define umul_ppmm(ph, pl, m0, m1) \
205   do {									\
206     UDItype __m0 = (m0), __m1 = (m1);					\
207     __asm__ ("umulh %r1,%2,%0"						\
208 	     : "=r" (ph)						\
209 	     : "%rJ" (__m0), "rI" (__m1));				\
210     (pl) = __m0 * __m1;							\
211   } while (0)
212 #endif
213 #else /* ! __GNUC__ */
214 #include <machine/builtins.h>
215 #define umul_ppmm(ph, pl, m0, m1) \
216   do {									\
217     UDItype __m0 = (m0), __m1 = (m1);					\
218     (ph) = __UMULH (__m0, __m1);					\
219     (pl) = __m0 * __m1;							\
220   } while (0)
221 #endif
222 #ifndef LONGLONG_STANDALONE
223 #define udiv_qrnnd(q, r, n1, n0, d) \
224   do { UWtype __di;							\
225     __di = __MPN(invert_limb) (d);					\
226     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
227   } while (0)
228 #define UDIV_PREINV_ALWAYS  1
229 #define UDIV_NEEDS_NORMALIZATION 1
230 #endif /* LONGLONG_STANDALONE */
231 
232 /* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
233    always goes into libgmp.so, even when not actually used.  */
234 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
235 
236 #if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
237 #define count_leading_zeros(COUNT,X) \
238   __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
239 #define count_trailing_zeros(COUNT,X) \
240   __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
241 #endif /* clz/ctz using cix */
242 
243 #if ! defined (count_leading_zeros)				\
244   && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
245 /* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
246    "$31" is written explicitly in the asm, since an "r" constraint won't
247    select reg 31.  There seems no need to worry about "r31" syntax for cray,
248    since gcc itself (pre-release 3.4) emits just $31 in various places.	 */
249 #define ALPHA_CMPBGE_0(dst, src)					\
250   do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
251 /* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
252    them, locating the highest non-zero byte.  A second __clz_tab lookup
253    counts the leading zero bits in that byte, giving the result.  */
254 #define count_leading_zeros(count, x)					\
255   do {									\
256     UWtype  __clz__b, __clz__c, __clz__x = (x);				\
257     ALPHA_CMPBGE_0 (__clz__b,  __clz__x);	    /* zero bytes */	\
258     __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F];  /* 8 to 1 byte */	\
259     __clz__b = __clz__b * 8 - 7;		    /* 57 to 1 shift */ \
260     __clz__x >>= __clz__b;						\
261     __clz__c = __clz_tab [__clz__x];		    /* 8 to 1 bit */	\
262     __clz__b = 65 - __clz__b;						\
263     (count) = __clz__b - __clz__c;					\
264   } while (0)
265 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
266 #endif /* clz using cmpbge */
267 
268 #if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
269 #if HAVE_ATTRIBUTE_CONST
270 long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const));
271 #else
272 long __MPN(count_leading_zeros) (UDItype);
273 #endif
274 #define count_leading_zeros(count, x) \
275   ((count) = __MPN(count_leading_zeros) (x))
276 #endif /* clz using mpn */
277 #endif /* __alpha */
278 
279 #if defined (__AVR) && W_TYPE_SIZE == 8
280 #define umul_ppmm(ph, pl, m0, m1) \
281   do {									\
282     unsigned short __p = (unsigned short) (m0) * (m1);			\
283     (ph) = __p >> 8;							\
284     (pl) = __p;								\
285   } while (0)
286 #endif /* AVR */
287 
288 #if defined (_CRAY) && W_TYPE_SIZE == 64
289 #include <intrinsics.h>
290 #define UDIV_PREINV_ALWAYS  1
291 #define UDIV_NEEDS_NORMALIZATION 1
292 long __MPN(count_leading_zeros) (UDItype);
293 #define count_leading_zeros(count, x) \
294   ((count) = _leadz ((UWtype) (x)))
295 #if defined (_CRAYIEEE)		/* I.e., Cray T90/ieee, T3D, and T3E */
296 #define umul_ppmm(ph, pl, m0, m1) \
297   do {									\
298     UDItype __m0 = (m0), __m1 = (m1);					\
299     (ph) = _int_mult_upper (__m0, __m1);				\
300     (pl) = __m0 * __m1;							\
301   } while (0)
302 #ifndef LONGLONG_STANDALONE
303 #define udiv_qrnnd(q, r, n1, n0, d) \
304   do { UWtype __di;							\
305     __di = __MPN(invert_limb) (d);					\
306     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
307   } while (0)
308 #endif /* LONGLONG_STANDALONE */
309 #endif /* _CRAYIEEE */
310 #endif /* _CRAY */
311 
312 #if defined (__ia64) && W_TYPE_SIZE == 64
313 /* This form encourages gcc (pre-release 3.4 at least) to emit predicated
314    "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
315    code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
316    register, which takes an extra cycle.  */
317 #define sub_ddmmss(sh, sl, ah, al, bh, bl)      \
318   do {						\
319     UWtype __x;					\
320     __x = (al) - (bl);				\
321     if ((al) < (bl))				\
322       (sh) = (ah) - (bh) - 1;			\
323     else					\
324       (sh) = (ah) - (bh);			\
325     (sl) = __x;					\
326   } while (0)
327 #if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
328 /* Do both product parts in assembly, since that gives better code with
329    all gcc versions.  Some callers will just use the upper part, and in
330    that situation we waste an instruction, but not any cycles.  */
331 #define umul_ppmm(ph, pl, m0, m1) \
332     __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0"		\
333 	     : "=&f" (ph), "=f" (pl)					\
334 	     : "f" (m0), "f" (m1))
335 #define count_leading_zeros(count, x) \
336   do {									\
337     UWtype _x = (x), _y, _a, _c;					\
338     __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x));		\
339     __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y));		\
340     _c = (_a - 1) << 3;							\
341     _x >>= _c;								\
342     if (_x >= 1 << 4)							\
343       _x >>= 4, _c += 4;						\
344     if (_x >= 1 << 2)							\
345       _x >>= 2, _c += 2;						\
346     _c += _x >> 1;							\
347     (count) =  W_TYPE_SIZE - 1 - _c;					\
348   } while (0)
349 /* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
350    based, and we don't need a special case for x==0 here */
351 #define count_trailing_zeros(count, x)					\
352   do {									\
353     UWtype __ctz_x = (x);						\
354     __asm__ ("popcnt %0 = %1"						\
355 	     : "=r" (count)						\
356 	     : "r" ((__ctz_x-1) & ~__ctz_x));				\
357   } while (0)
358 #endif
359 #if defined (__INTEL_COMPILER)
360 #include <ia64intrin.h>
361 #define umul_ppmm(ph, pl, m0, m1)					\
362   do {									\
363     UWtype __m0 = (m0), __m1 = (m1);					\
364     ph = _m64_xmahu (__m0, __m1, 0);					\
365     pl = __m0 * __m1;							\
366   } while (0)
367 #endif
368 #ifndef LONGLONG_STANDALONE
369 #define udiv_qrnnd(q, r, n1, n0, d) \
370   do { UWtype __di;							\
371     __di = __MPN(invert_limb) (d);					\
372     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
373   } while (0)
374 #define UDIV_PREINV_ALWAYS  1
375 #define UDIV_NEEDS_NORMALIZATION 1
376 #endif
377 #endif
378 
379 
380 #if defined (__GNUC__)
381 
382 /* We sometimes need to clobber "cc" with gcc2, but that would not be
383    understood by gcc1.  Use cpp to avoid major code duplication.  */
384 #if __GNUC__ < 2
385 #define __CLOBBER_CC
386 #define __AND_CLOBBER_CC
387 #else /* __GNUC__ >= 2 */
388 #define __CLOBBER_CC : "cc"
389 #define __AND_CLOBBER_CC , "cc"
390 #endif /* __GNUC__ < 2 */
391 
392 #if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
393 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
394   __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3"				\
395 	   : "=r" (sh), "=&r" (sl)					\
396 	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
397 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
398   __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3"				\
399 	   : "=r" (sh), "=&r" (sl)					\
400 	   : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
401 #define umul_ppmm(xh, xl, m0, m1) \
402   do {									\
403     USItype __m0 = (m0), __m1 = (m1);					\
404     __asm__ ("multiplu %0,%1,%2"					\
405 	     : "=r" (xl)						\
406 	     : "r" (__m0), "r" (__m1));					\
407     __asm__ ("multmu %0,%1,%2"						\
408 	     : "=r" (xh)						\
409 	     : "r" (__m0), "r" (__m1));					\
410   } while (0)
411 #define udiv_qrnnd(q, r, n1, n0, d) \
412   __asm__ ("dividu %0,%3,%4"						\
413 	   : "=r" (q), "=q" (r)						\
414 	   : "1" (n1), "r" (n0), "r" (d))
415 #define count_leading_zeros(count, x) \
416     __asm__ ("clz %0,%1"						\
417 	     : "=r" (count)						\
418 	     : "r" (x))
419 #define COUNT_LEADING_ZEROS_0 32
420 #endif /* __a29k__ */
421 
422 #if defined (__arc__)
423 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
424   __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3"			\
425 	   : "=r" (sh),							\
426 	     "=&r" (sl)							\
427 	   : "r"  ((USItype) (ah)),					\
428 	     "rICal" ((USItype) (bh)),					\
429 	     "%r" ((USItype) (al)),					\
430 	     "rICal" ((USItype) (bl)))
431 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
432   __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
433 	   : "=r" (sh),							\
434 	     "=&r" (sl)							\
435 	   : "r" ((USItype) (ah)),					\
436 	     "rICal" ((USItype) (bh)),					\
437 	     "r" ((USItype) (al)),					\
438 	     "rICal" ((USItype) (bl)))
439 #endif
440 
441 #if defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) \
442     && W_TYPE_SIZE == 32
443 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
444   do {									\
445     if (__builtin_constant_p (bl) && -(USItype)(bl) < (USItype)(bl))	\
446       __asm__ ("subs\t%1, %4, %5\n\tadc\t%0, %2, %3"			\
447 	   : "=r" (sh), "=&r" (sl)					\
448 	       : "r" (ah), "rI" (bh),					\
449 		 "%r" (al), "rI" (-(USItype)(bl)) __CLOBBER_CC);	\
450     else								\
451       __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3"			\
452 	   : "=r" (sh), "=&r" (sl)					\
453 	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC);	\
454   } while (0)
455 /* FIXME: Extend the immediate range for the low word by using both ADDS and
456    SUBS, since they set carry in the same way.  We need separate definitions
457    for thumb and non-thumb since thumb lacks RSC.  */
458 #if defined (__thumb__)
459 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
460   do {									\
461     if (__builtin_constant_p (ah) && __builtin_constant_p (bh)		\
462 	&& (ah) == (bh))						\
463       __asm__ ("subs\t%1, %2, %3\n\tsbc\t%0, %0, %0"			\
464 	       : "=r" (sh), "=r" (sl)					\
465 	       : "r" (al), "rI" (bl) __CLOBBER_CC);			\
466     else if (__builtin_constant_p (al))					\
467       __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3"			\
468 	       : "=r" (sh), "=&r" (sl)					\
469 	       : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
470     else								\
471       __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
472 	       : "=r" (sh), "=&r" (sl)					\
473 	       : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
474     } while (0)
475 #else
476 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
477   do {									\
478     if (__builtin_constant_p (ah) && __builtin_constant_p (bh)		\
479 	&& (ah) == (bh))						\
480       __asm__ ("subs\t%1, %2, %3\n\tsbc\t%0, %0, %0"			\
481 	       : "=r" (sh), "=r" (sl)					\
482 	       : "r" (al), "rI" (bl) __CLOBBER_CC);			\
483     else if (__builtin_constant_p (al))					\
484       {									\
485 	if (__builtin_constant_p (ah))					\
486 	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"		\
487 		   : "=r" (sh), "=&r" (sl)				\
488 		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
489 	else								\
490 	  __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3"		\
491 		   : "=r" (sh), "=&r" (sl)				\
492 		   : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
493       }									\
494     else if (__builtin_constant_p (ah))					\
495       {									\
496 	if (__builtin_constant_p (bl))					\
497 	  __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"		\
498 		   : "=r" (sh), "=&r" (sl)				\
499 		   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
500 	else								\
501 	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"		\
502 		   : "=r" (sh), "=&r" (sl)				\
503 		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
504       }									\
505     else								\
506       __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
507 	       : "=r" (sh), "=&r" (sl)					\
508 	       : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
509     } while (0)
510 #endif
511 #if defined (__ARM_ARCH_2__) || defined (__ARM_ARCH_2A__) \
512     || defined (__ARM_ARCH_3__)
513 #define umul_ppmm(xh, xl, a, b)						\
514   do {									\
515     register USItype __t0, __t1, __t2;					\
516     __asm__ ("%@ Inlined umul_ppmm\n"					\
517 	   "	mov	%2, %5, lsr #16\n"				\
518 	   "	mov	%0, %6, lsr #16\n"				\
519 	   "	bic	%3, %5, %2, lsl #16\n"				\
520 	   "	bic	%4, %6, %0, lsl #16\n"				\
521 	   "	mul	%1, %3, %4\n"					\
522 	   "	mul	%4, %2, %4\n"					\
523 	   "	mul	%3, %0, %3\n"					\
524 	   "	mul	%0, %2, %0\n"					\
525 	   "	adds	%3, %4, %3\n"					\
526 	   "	addcs	%0, %0, #65536\n"				\
527 	   "	adds	%1, %1, %3, lsl #16\n"				\
528 	   "	adc	%0, %0, %3, lsr #16"				\
529 	   : "=&r" ((USItype) (xh)), "=r" ((USItype) (xl)),		\
530 	     "=&r" (__t0), "=&r" (__t1), "=r" (__t2)			\
531 	   : "r" ((USItype) (a)), "r" ((USItype) (b)) __CLOBBER_CC);	\
532   } while (0)
533 #ifndef LONGLONG_STANDALONE
534 #define udiv_qrnnd(q, r, n1, n0, d) \
535   do { UWtype __r;							\
536     (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));			\
537     (r) = __r;								\
538   } while (0)
539 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
540 #endif /* LONGLONG_STANDALONE */
541 #else /* ARMv4 or newer */
542 #define umul_ppmm(xh, xl, a, b) \
543   __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
544 #define smul_ppmm(xh, xl, a, b) \
545   __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
546 #ifndef LONGLONG_STANDALONE
547 #define udiv_qrnnd(q, r, n1, n0, d) \
548   do { UWtype __di;							\
549     __di = __MPN(invert_limb) (d);					\
550     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
551   } while (0)
552 #define UDIV_PREINV_ALWAYS  1
553 #define UDIV_NEEDS_NORMALIZATION 1
554 #endif /* LONGLONG_STANDALONE */
555 #endif /* defined(__ARM_ARCH_2__) ... */
556 #define count_leading_zeros(count, x)  count_leading_zeros_gcc_clz(count, x)
557 #define count_trailing_zeros(count, x)  count_trailing_zeros_gcc_ctz(count, x)
558 #endif /* __arm__ */
559 
560 #if defined (__aarch64__) && W_TYPE_SIZE == 64
561 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
562   do {									\
563     if (__builtin_constant_p (bl) && ~(UDItype)(bl) <= (UDItype)(bl))	\
564       __asm__ ("subs\t%1, %x4, %5\n\tadc\t%0, %x2, %x3"			\
565 	       : "=r" (sh), "=&r" (sl)					\
566 	       : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)),		\
567 		 "%r" ((UDItype)(al)), "rI" (-(UDItype)(bl)) __CLOBBER_CC);\
568     else								\
569       __asm__ ("adds\t%1, %x4, %5\n\tadc\t%0, %x2, %x3"			\
570 	       : "=r" (sh), "=&r" (sl)					\
571 	       : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)),		\
572 		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC);\
573   } while (0)
574 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
575   do {									\
576     if (__builtin_constant_p (bl) && ~(UDItype)(bl) <= (UDItype)(bl))	\
577       __asm__ ("adds\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3"			\
578 	       : "=r,r" (sh), "=&r,&r" (sl)				\
579 	       : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)),	\
580 		 "r,Z"   ((UDItype)(al)), "rI,r" (-(UDItype)(bl)) __CLOBBER_CC);\
581     else								\
582       __asm__ ("subs\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3"			\
583 	       : "=r,r" (sh), "=&r,&r" (sl)				\
584 	       : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)),	\
585 		 "r,Z"   ((UDItype)(al)), "rI,r"  ((UDItype)(bl)) __CLOBBER_CC);\
  } while (0)
587 #if __GMP_GNUC_PREREQ (4,9)
588 #define umul_ppmm(w1, w0, u, v) \
589   do {									\
590     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
591     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
592     w1 = __ll >> 64;							\
593     w0 = __ll;								\
594   } while (0)
595 #endif
596 #if !defined (umul_ppmm)
597 #define umul_ppmm(ph, pl, m0, m1) \
598   do {									\
599     UDItype __m0 = (m0), __m1 = (m1);					\
600     __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (__m0), "r" (__m1));	\
601     (pl) = __m0 * __m1;							\
602   } while (0)
603 #endif
604 #define count_leading_zeros(count, x)  count_leading_zeros_gcc_clz(count, x)
605 #define count_trailing_zeros(count, x)  count_trailing_zeros_gcc_ctz(count, x)
606 #endif /* __aarch64__ */
607 
608 #if defined (__clipper__) && W_TYPE_SIZE == 32
609 #define umul_ppmm(w1, w0, u, v) \
610   ({union {UDItype __ll;						\
611 	   struct {USItype __l, __h;} __i;				\
612 	  } __x;							\
613   __asm__ ("mulwux %2,%0"						\
614 	   : "=r" (__x.__ll)						\
615 	   : "%0" ((USItype)(u)), "r" ((USItype)(v)));			\
616   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
617 #define smul_ppmm(w1, w0, u, v) \
618   ({union {DItype __ll;							\
619 	   struct {SItype __l, __h;} __i;				\
620 	  } __x;							\
621   __asm__ ("mulwx %2,%0"						\
622 	   : "=r" (__x.__ll)						\
623 	   : "%0" ((SItype)(u)), "r" ((SItype)(v)));			\
624   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
625 #define __umulsidi3(u, v) \
626   ({UDItype __w;							\
627     __asm__ ("mulwux %2,%0"						\
628 	     : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v)));	\
629     __w; })
630 #endif /* __clipper__ */
631 
632 /* Fujitsu vector computers.  */
633 #if defined (__uxp__) && W_TYPE_SIZE == 32
634 #define umul_ppmm(ph, pl, u, v) \
635   do {									\
636     union {UDItype __ll;						\
637 	   struct {USItype __h, __l;} __i;				\
638 	  } __x;							\
639     __asm__ ("mult.lu %1,%2,%0"	: "=r" (__x.__ll) : "%r" (u), "rK" (v));\
640     (ph) = __x.__i.__h;							\
641     (pl) = __x.__i.__l;							\
642   } while (0)
643 #define smul_ppmm(ph, pl, u, v) \
644   do {									\
645     union {UDItype __ll;						\
646 	   struct {USItype __h, __l;} __i;				\
647 	  } __x;							\
648     __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));	\
649     (ph) = __x.__i.__h;							\
650     (pl) = __x.__i.__l;							\
651   } while (0)
652 #endif
653 
654 #if defined (__gmicro__) && W_TYPE_SIZE == 32
655 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
656   __asm__ ("add.w %5,%1\n\taddx %3,%0"					\
657 	   : "=g" (sh), "=&g" (sl)					\
658 	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
659 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
660 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
661   __asm__ ("sub.w %5,%1\n\tsubx %3,%0"					\
662 	   : "=g" (sh), "=&g" (sl)					\
663 	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
664 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
665 #define umul_ppmm(ph, pl, m0, m1) \
666   __asm__ ("mulx %3,%0,%1"						\
667 	   : "=g" (ph), "=r" (pl)					\
668 	   : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
669 #define udiv_qrnnd(q, r, nh, nl, d) \
670   __asm__ ("divx %4,%0,%1"						\
671 	   : "=g" (q), "=r" (r)						\
672 	   : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
673 #define count_leading_zeros(count, x) \
674   __asm__ ("bsch/1 %1,%0"						\
675 	   : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
676 #endif
677 
678 #if defined (__hppa) && W_TYPE_SIZE == 32
679 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
680   __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0"			\
681 	   : "=r" (sh), "=&r" (sl)					\
682 	   : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
683 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
684   __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0"			\
685 	   : "=r" (sh), "=&r" (sl)					\
686 	   : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
687 #if defined (_PA_RISC1_1)
688 #define umul_ppmm(wh, wl, u, v) \
689   do {									\
690     union {UDItype __ll;						\
691 	   struct {USItype __h, __l;} __i;				\
692 	  } __x;							\
693     __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v));	\
694     (wh) = __x.__i.__h;							\
695     (wl) = __x.__i.__l;							\
696   } while (0)
697 #endif
698 #define count_leading_zeros(count, x) \
699   do {									\
700     USItype __tmp;							\
701     __asm__ (								\
702        "ldi		1,%0\n"						\
703 "	extru,=		%1,15,16,%%r0	; Bits 31..16 zero?\n"		\
704 "	extru,tr	%1,15,16,%1	; No.  Shift down, skip add.\n"	\
705 "	ldo		16(%0),%0	; Yes.  Perform add.\n"		\
706 "	extru,=		%1,23,8,%%r0	; Bits 15..8 zero?\n"		\
707 "	extru,tr	%1,23,8,%1	; No.  Shift down, skip add.\n"	\
708 "	ldo		8(%0),%0	; Yes.  Perform add.\n"		\
709 "	extru,=		%1,27,4,%%r0	; Bits 7..4 zero?\n"		\
710 "	extru,tr	%1,27,4,%1	; No.  Shift down, skip add.\n"	\
711 "	ldo		4(%0),%0	; Yes.  Perform add.\n"		\
712 "	extru,=		%1,29,2,%%r0	; Bits 3..2 zero?\n"		\
713 "	extru,tr	%1,29,2,%1	; No.  Shift down, skip add.\n"	\
714 "	ldo		2(%0),%0	; Yes.  Perform add.\n"		\
715 "	extru		%1,30,1,%1	; Extract bit 1.\n"		\
716 "	sub		%0,%1,%0	; Subtract it.\n"		\
717 	: "=r" (count), "=r" (__tmp) : "1" (x));			\
718   } while (0)
719 #endif /* hppa */
720 
721 /* These macros are for ABI=2.0w.  In ABI=2.0n they can't be used, since GCC
722    (3.2) puts longlong into two adjacent 32-bit registers.  Presumably this
723    is just a case of no direct support for 2.0n but treating it like 1.0. */
724 #if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
725 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
726   __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0"			\
727 	   : "=r" (sh), "=&r" (sl)					\
728 	   : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
729 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
730   __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0"			\
731 	   : "=r" (sh), "=&r" (sl)					\
732 	   : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
733 #endif /* hppa */
734 
735 #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
736 #if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch)
737 #define add_ssaaaa(sh, sl, ah, al, bh, bl)				\
738   do {									\
739 /*  if (__builtin_constant_p (bl))					\
740       __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3"				\
741 	       : "=r" (sh), "=&r" (sl)					\
742 	       : "0"  (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\
743     else								\
744 */    __asm__ ("alr\t%1,%5\n\talcr\t%0,%3"				\
745 	       : "=r" (sh), "=&r" (sl)					\
	       : "0"  (ah), "r" (bh), "%1" (al), "r" (bl) __CLOBBER_CC);	\
747   } while (0)
748 #define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
749   do {									\
750 /*  if (__builtin_constant_p (bl))					\
751       __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3"				\
752 	       : "=r" (sh), "=&r" (sl)					\
753 	       : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC);	\
754     else								\
755 */    __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3"				\
756 	       : "=r" (sh), "=&r" (sl)					\
757 	       : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC);	\
758   } while (0)
759 #if __GMP_GNUC_PREREQ (4,5)
760 #define umul_ppmm(xh, xl, m0, m1)					\
761   do {									\
762     union {UDItype __ll;						\
763 	   struct {USItype __h, __l;} __i;				\
764 	  } __x;							\
765     __x.__ll = (UDItype) (m0) * (UDItype) (m1);				\
766     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
767   } while (0)
768 #else
769 #if 0
770 /* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
771    with a new enough processor pretending we have 32-bit registers.  */
772 #define umul_ppmm(xh, xl, m0, m1)					\
773   do {									\
774     union {UDItype __ll;						\
775 	   struct {USItype __h, __l;} __i;				\
776 	  } __x;							\
777     __asm__ ("mlr\t%0,%2"						\
778 	     : "=r" (__x.__ll)						\
779 	     : "%0" (m0), "r" (m1));					\
780     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
781   } while (0)
782 #else
783 #define umul_ppmm(xh, xl, m0, m1)					\
784   do {									\
785   /* When we have 64-bit regs and gcc is aware of that, we cannot simply use
786      DImode for the product, since that would be allocated to a single 64-bit
787      register, whereas mlr uses the low 32-bits of an even-odd register pair.
788   */									\
789     register USItype __r0 __asm__ ("0");				\
790     register USItype __r1 __asm__ ("1") = (m0);				\
791     __asm__ ("mlr\t%0,%3"						\
792 	     : "=r" (__r0), "=r" (__r1)					\
793 	     : "r" (__r1), "r" (m1));					\
794     (xh) = __r0; (xl) = __r1;						\
795   } while (0)
796 #endif /* if 0 */
797 #endif
798 #if 0
799 /* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
800    with a new enough processor pretending we have 32-bit registers.  */
801 #define udiv_qrnnd(q, r, n1, n0, d)					\
802   do {									\
803     union {UDItype __ll;						\
804 	   struct {USItype __h, __l;} __i;				\
805 	  } __x;							\
806     __x.__i.__h = n1; __x.__i.__l = n0;					\
807     __asm__ ("dlr\t%0,%2"						\
808 	     : "=r" (__x.__ll)						\
809 	     : "0" (__x.__ll), "r" (d));				\
810     (q) = __x.__i.__l; (r) = __x.__i.__h;				\
811   } while (0)
812 #else
813 #define udiv_qrnnd(q, r, n1, n0, d)					\
814   do {									\
815     register USItype __r0 __asm__ ("0") = (n1);				\
816     register USItype __r1 __asm__ ("1") = (n0);				\
817     __asm__ ("dlr\t%0,%4"						\
818 	     : "=r" (__r0), "=r" (__r1)					\
819 	     : "r" (__r0), "r" (__r1), "r" (d));			\
820     (q) = __r1; (r) = __r0;						\
821   } while (0)
822 #endif /* if 0 */
823 #else /* if __zarch__ */
824 /* FIXME: this fails if gcc knows about the 64-bit registers.  */
825 #define smul_ppmm(xh, xl, m0, m1)					\
826   do {									\
827     union {DItype __ll;							\
828 	   struct {USItype __h, __l;} __i;				\
829 	  } __x;							\
830     __asm__ ("mr\t%0,%2"						\
831 	     : "=r" (__x.__ll)						\
832 	     : "%0" (m0), "r" (m1));					\
833     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
834   } while (0)
835 /* FIXME: this fails if gcc knows about the 64-bit registers.  */
836 #define sdiv_qrnnd(q, r, n1, n0, d)					\
837   do {									\
838     union {DItype __ll;							\
839 	   struct {USItype __h, __l;} __i;				\
840 	  } __x;							\
841     __x.__i.__h = n1; __x.__i.__l = n0;					\
842     __asm__ ("dr\t%0,%2"						\
843 	     : "=r" (__x.__ll)						\
844 	     : "0" (__x.__ll), "r" (d));				\
845     (q) = __x.__i.__l; (r) = __x.__i.__h;				\
846   } while (0)
847 #endif /* if __zarch__ */
848 #endif
849 
850 #if defined (__s390x__) && W_TYPE_SIZE == 64
851 /* We need to cast operands with register constraints, otherwise their types
852    will be assumed to be SImode by gcc.  For these machines, such operations
853    will insert a value into the low 32 bits, and leave the high 32 bits with
854    garbage.  */
855 #define add_ssaaaa(sh, sl, ah, al, bh, bl)				\
856   do {									\
857     __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3"				\
858 	       : "=r" (sh), "=&r" (sl)					\
859 	       : "0"  ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
860 		 "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
861   } while (0)
862 #define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
863   do {									\
864     __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3"				\
865 	     : "=r" (sh), "=&r" (sl)					\
866 	     : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
867 	       "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC);	\
868   } while (0)
869 #if !defined (__clang__)
870 #define umul_ppmm(xh, xl, m0, m1)					\
871   do {									\
872     union {unsigned int __attribute__ ((mode(TI))) __ll;		\
873 	   struct {UDItype __h, __l;} __i;				\
874 	  } __x;							\
875     __asm__ ("mlgr\t%0,%2"						\
876 	     : "=r" (__x.__ll)						\
877 	     : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1)));		\
878     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
879   } while (0)
880 #define udiv_qrnnd(q, r, n1, n0, d)					\
881   do {									\
882     union {unsigned int __attribute__ ((mode(TI))) __ll;		\
883 	   struct {UDItype __h, __l;} __i;				\
884 	  } __x;							\
885     __x.__i.__h = n1; __x.__i.__l = n0;					\
886     __asm__ ("dlgr\t%0,%2"						\
887 	     : "=r" (__x.__ll)						\
888 	     : "0" (__x.__ll), "r" ((UDItype)(d)));			\
889     (q) = __x.__i.__l; (r) = __x.__i.__h;				\
890   } while (0)
891 #endif
892 #if 0 /* FIXME: Enable for z10 (?) */
893 #define count_leading_zeros(cnt, x)					\
894   do {									\
895     union {unsigned int __attribute__ ((mode(TI))) __ll;		\
896 	   struct {UDItype __h, __l;} __i;				\
897 	  } __clr_cnt;							\
898     __asm__ ("flogr\t%0,%1"						\
899 	     : "=r" (__clr_cnt.__ll)					\
900 	     : "r" (x) __CLOBBER_CC);					\
901     (cnt) = __clr_cnt.__i.__h;						\
902   } while (0)
903 #endif
904 #endif
905 
906 /* On x86 and x86_64, every asm implicitly clobbers "flags" and "fpsr",
907    so we don't need __CLOBBER_CC.  */
908 #if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
909 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
910   __asm__ ("addl %5,%k1\n\tadcl %3,%k0"					\
911 	   : "=r" (sh), "=&r" (sl)					\
912 	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
913 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
914 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
915   __asm__ ("subl %5,%k1\n\tsbbl %3,%k0"					\
916 	   : "=r" (sh), "=&r" (sl)					\
917 	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
918 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
919 #define umul_ppmm(w1, w0, u, v) \
920   __asm__ ("mull %3"							\
921 	   : "=a" (w0), "=d" (w1)					\
922 	   : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
923 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
924   __asm__ ("divl %4"		     /* stringification in K&R C */	\
925 	   : "=a" (q), "=d" (r)						\
926 	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
927 
928 #if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
929 /* Pentium bsrl takes between 10 and 72 cycles depending where the most
930    significant 1 bit is, hence the use of the following alternatives.  bsfl
931    is slow too, between 18 and 42 depending where the least significant 1
932    bit is, so let the generic count_trailing_zeros below make use of the
933    count_leading_zeros here too.  */
934 
935 #if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
936 /* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
937    cache miss reading from __clz_tab.  For P55 it's favoured over the float
938    below so as to avoid mixing MMX and x87, since the penalty for switching
939    between the two is about 100 cycles.
940 
941    The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
942    16, -1 for 8, or 0 otherwise.  This could be written equivalently as
943    follows, but as of gcc 2.95.2 it results in conditional jumps.
944 
945        __shift = -(__n < 0x1000000);
946        __shift -= (__n < 0x10000);
947        __shift -= (__n < 0x100);
948 
949    The middle two sbbl and cmpl's pair, and with luck something gcc
950    generates might pair with the first cmpl and the last sbbl.  The "32+1"
951    constant could be folded into __clz_tab[], but it doesn't seem worth
952    making a different table just for that.  */
953 
954 #define count_leading_zeros(c,n)					\
955   do {									\
956     USItype  __n = (n);							\
957     USItype  __shift;							\
958     __asm__ ("cmpl  $0x1000000, %1\n"					\
959 	     "sbbl  %0, %0\n"						\
960 	     "cmpl  $0x10000, %1\n"					\
961 	     "sbbl  $0, %0\n"						\
962 	     "cmpl  $0x100, %1\n"					\
963 	     "sbbl  $0, %0\n"						\
964 	     : "=&r" (__shift) : "r"  (__n));				\
965     __shift = __shift*8 + 24 + 1;					\
966     (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift];			\
967   } while (0)
968 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
969 #define COUNT_LEADING_ZEROS_0   31   /* n==0 indistinguishable from n==1 */
970 
971 #else /* ! pentiummmx || LONGLONG_STANDALONE */
972 /* The following should be a fixed 14 cycles or so.  Some scheduling
973    opportunities should be available between the float load/store too.  This
974    sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
975    apparently suggested by the Intel optimizing manual (don't know exactly
   where).  gcc 2.95 or later is best for this, since it keeps the "double"
   correctly aligned on the stack.  */
978 #define count_leading_zeros(c,n)					\
979   do {									\
980     union {								\
981       double    d;							\
982       unsigned  a[2];							\
983     } __u;								\
984     __u.d = (UWtype) (n);						\
985     (c) = 0x3FF + 31 - (__u.a[1] >> 20);				\
986   } while (0)
987 #define COUNT_LEADING_ZEROS_0   (0x3FF + 31)
#endif /* pentiummmx */
989 
990 #else /* ! pentium */
991 
992 #if __GMP_GNUC_PREREQ (3,4)  /* using bsrl */
993 #define count_leading_zeros(count,x)  count_leading_zeros_gcc_clz(count,x)
994 #endif /* gcc clz */
995 
996 /* On P6, gcc prior to 3.0 generates a partial register stall for
997    __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
998    being 1 code byte smaller.  "31-__cbtmp" is a workaround, probably at the
999    cost of one extra instruction.  Do this for "i386" too, since that means
1000    generic x86.  */
1001 #if ! defined (count_leading_zeros) && __GNUC__ < 3			\
1002   && (HAVE_HOST_CPU_i386						\
1003       || HAVE_HOST_CPU_i686						\
1004       || HAVE_HOST_CPU_pentiumpro					\
1005       || HAVE_HOST_CPU_pentium2						\
1006       || HAVE_HOST_CPU_pentium3)
1007 #define count_leading_zeros(count, x)					\
1008   do {									\
1009     USItype __cbtmp;							\
1010     ASSERT ((x) != 0);							\
1011     __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));	\
1012     (count) = 31 - __cbtmp;						\
1013   } while (0)
1014 #endif /* gcc<3 asm bsrl */
1015 
1016 #ifndef count_leading_zeros
1017 #define count_leading_zeros(count, x)					\
1018   do {									\
1019     USItype __cbtmp;							\
1020     ASSERT ((x) != 0);							\
1021     __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));	\
1022     (count) = __cbtmp ^ 31;						\
1023   } while (0)
1024 #endif /* asm bsrl */
1025 
1026 #if __GMP_GNUC_PREREQ (3,4)  /* using bsfl */
1027 #define count_trailing_zeros(count,x)  count_trailing_zeros_gcc_ctz(count,x)
1028 #endif /* gcc ctz */
1029 
1030 #ifndef count_trailing_zeros
1031 #define count_trailing_zeros(count, x)					\
1032   do {									\
1033     ASSERT ((x) != 0);							\
1034     __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x)));	\
1035   } while (0)
1036 #endif /* asm bsfl */
1037 
1038 #endif /* ! pentium */
1039 
1040 #endif /* 80x86 */
1041 
1042 #if defined (__amd64__) && W_TYPE_SIZE == 64
1043 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1044   __asm__ ("addq %5,%q1\n\tadcq %3,%q0"					\
1045 	   : "=r" (sh), "=&r" (sl)					\
1046 	   : "0"  ((UDItype)(ah)), "rme" ((UDItype)(bh)),		\
1047 	     "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
1048 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1049   __asm__ ("subq %5,%q1\n\tsbbq %3,%q0"					\
1050 	   : "=r" (sh), "=&r" (sl)					\
1051 	   : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)),		\
1052 	     "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
1053 #if X86_ASM_MULX \
1054    && (HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell \
1055        || HAVE_HOST_CPU_skylake || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen)
1056 #define umul_ppmm(w1, w0, u, v) \
1057   __asm__ ("mulx\t%3, %q0, %q1"						\
1058 	   : "=r" (w0), "=r" (w1)					\
1059 	   : "%d" ((UDItype)(u)), "rm" ((UDItype)(v)))
1060 #else
1061 #define umul_ppmm(w1, w0, u, v) \
1062   __asm__ ("mulq\t%3"							\
1063 	   : "=a" (w0), "=d" (w1)					\
1064 	   : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
1065 #endif
1066 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
1067   __asm__ ("divq %4"		     /* stringification in K&R C */	\
1068 	   : "=a" (q), "=d" (r)						\
1069 	   : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
1070 
1071 #if HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell || HAVE_HOST_CPU_skylake \
1072   || HAVE_HOST_CPU_k10 || HAVE_HOST_CPU_bd1 || HAVE_HOST_CPU_bd2	\
1073   || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen	\
1074   || HAVE_HOST_CPU_bobcat || HAVE_HOST_CPU_jaguar
1075 #define count_leading_zeros(count, x)					\
1076   do {									\
1077     /* This is lzcnt, spelled for older assemblers.  Destination and */	\
    /* source must be 64-bit registers, hence the cast and %q.       */	\
1079     __asm__ ("rep;bsr\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x)));	\
1080   } while (0)
1081 #define COUNT_LEADING_ZEROS_0 64
1082 #else
1083 #define count_leading_zeros(count, x)					\
1084   do {									\
1085     UDItype __cbtmp;							\
1086     ASSERT ((x) != 0);							\
1087     __asm__ ("bsr\t%1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x)));	\
1088     (count) = __cbtmp ^ 63;						\
1089   } while (0)
1090 #endif
1091 
1092 #if HAVE_HOST_CPU_bd2 || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 \
1093   || HAVE_HOST_CPU_zen || HAVE_HOST_CPU_jaguar
1094 #define count_trailing_zeros(count, x)					\
1095   do {									\
1096     /* This is tzcnt, spelled for older assemblers.  Destination and */	\
    /* source must be 64-bit registers, hence the cast and %q.       */	\
1098     __asm__ ("rep;bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x)));	\
1099   } while (0)
1100 #define COUNT_TRAILING_ZEROS_0 64
1101 #else
1102 #define count_trailing_zeros(count, x)					\
1103   do {									\
1104     ASSERT ((x) != 0);							\
1105     __asm__ ("bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x)));	\
1106   } while (0)
1107 #endif
1108 #endif /* __amd64__ */
1109 
1110 #if defined (__i860__) && W_TYPE_SIZE == 32
1111 #define rshift_rhlc(r,h,l,c) \
1112   __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0"				\
	   : "=r" (r) : "r" (h), "r" (l), "rn" (c))
1114 #endif /* i860 */
1115 
1116 #if defined (__i960__) && W_TYPE_SIZE == 32
1117 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1118   __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0"			\
1119 	   : "=r" (sh), "=&r" (sl)					\
1120 	   : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
1121 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1122   __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0"			\
1123 	   : "=r" (sh), "=&r" (sl)					\
1124 	   : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
1125 #define umul_ppmm(w1, w0, u, v) \
1126   ({union {UDItype __ll;						\
1127 	   struct {USItype __l, __h;} __i;				\
1128 	  } __x;							\
1129   __asm__ ("emul %2,%1,%0"						\
1130 	   : "=d" (__x.__ll) : "%dI" (u), "dI" (v));			\
1131   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1132 #define __umulsidi3(u, v) \
1133   ({UDItype __w;							\
1134     __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v));	\
1135     __w; })
1136 #define udiv_qrnnd(q, r, nh, nl, d) \
1137   do {									\
1138     union {UDItype __ll;						\
1139 	   struct {USItype __l, __h;} __i;				\
	  } __nn, __rq;							\
1141     __nn.__i.__h = (nh); __nn.__i.__l = (nl);				\
    __asm__ ("ediv %2,%1,%0"						\
1143 	   : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d));		\
1144     (r) = __rq.__i.__l; (q) = __rq.__i.__h;				\
1145   } while (0)
1146 #define count_leading_zeros(count, x) \
1147   do {									\
1148     USItype __cbtmp;							\
1149     __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x));		\
1150     (count) = __cbtmp ^ 31;						\
1151   } while (0)
1152 #define COUNT_LEADING_ZEROS_0 (-32) /* sic */
1153 #if defined (__i960mx)		/* what is the proper symbol to test??? */
1154 #define rshift_rhlc(r,h,l,c) \
1155   do {									\
1156     union {UDItype __ll;						\
1157 	   struct {USItype __l, __h;} __i;				\
1158 	  } __nn;							\
1159     __nn.__i.__h = (h); __nn.__i.__l = (l);				\
1160     __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c));	\
  } while (0)
1162 #endif /* i960mx */
1163 #endif /* i960 */
1164 
1165 
1166 #if defined (__loongarch64) && W_TYPE_SIZE == 64
1167 #define umul_ppmm(w1, w0, u, v) \
1168   do {									\
1169     UDItype __u = (u), __v = (v);					\
1170     (w0) = __u * __v;							\
    (w1) = (unsigned __int128) __u * __v >> 64;			\
1172   } while (0)
1173 #endif
1174 
1175 
1176 #if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
1177      || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
1178      || defined (__mc5307__)) && W_TYPE_SIZE == 32
1179 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1180   __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0"				\
1181 	   : "=d" (sh), "=&d" (sl)					\
1182 	   : "0"  ((USItype)(ah)), "d" ((USItype)(bh)),			\
1183 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1184 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1185   __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0"				\
1186 	   : "=d" (sh), "=&d" (sl)					\
1187 	   : "0" ((USItype)(ah)), "d" ((USItype)(bh)),			\
1188 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
1189 /* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r.  */
1190 #if defined (__mc68020__) || defined(mc68020) \
1191      || defined (__mc68030__) || defined (mc68030) \
1192      || defined (__mc68040__) || defined (mc68040) \
1193      || defined (__mcpu32__) || defined (mcpu32) \
1194      || defined (__NeXT__)
1195 #define umul_ppmm(w1, w0, u, v) \
1196   __asm__ ("mulu%.l %3,%1:%0"						\
1197 	   : "=d" (w0), "=d" (w1)					\
1198 	   : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
1199 #define udiv_qrnnd(q, r, n1, n0, d) \
1200   __asm__ ("divu%.l %4,%1:%0"						\
1201 	   : "=d" (q), "=d" (r)						\
1202 	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1203 #define sdiv_qrnnd(q, r, n1, n0, d) \
1204   __asm__ ("divs%.l %4,%1:%0"						\
1205 	   : "=d" (q), "=d" (r)						\
1206 	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1207 #else /* for other 68k family members use 16x16->32 multiplication */
1208 #define umul_ppmm(xh, xl, a, b) \
1209   do { USItype __umul_tmp1, __umul_tmp2;				\
1210 	__asm__ ("| Inlined umul_ppmm\n"				\
1211 "	move%.l	%5,%3\n"						\
1212 "	move%.l	%2,%0\n"						\
1213 "	move%.w	%3,%1\n"						\
1214 "	swap	%3\n"							\
1215 "	swap	%0\n"							\
1216 "	mulu%.w	%2,%1\n"						\
1217 "	mulu%.w	%3,%0\n"						\
1218 "	mulu%.w	%2,%3\n"						\
1219 "	swap	%2\n"							\
1220 "	mulu%.w	%5,%2\n"						\
1221 "	add%.l	%3,%2\n"						\
1222 "	jcc	1f\n"							\
1223 "	add%.l	%#0x10000,%0\n"						\
1224 "1:	move%.l	%2,%3\n"						\
1225 "	clr%.w	%2\n"							\
1226 "	swap	%2\n"							\
1227 "	swap	%3\n"							\
1228 "	clr%.w	%3\n"							\
1229 "	add%.l	%3,%1\n"						\
1230 "	addx%.l	%2,%0\n"						\
1231 "	| End inlined umul_ppmm"					\
1232 	      : "=&d" (xh), "=&d" (xl),					\
1233 		"=&d" (__umul_tmp1), "=&d" (__umul_tmp2)			\
1234 	      : "%2" ((USItype)(a)), "d" ((USItype)(b)));		\
1235   } while (0)
1236 #endif /* not mc68020 */
1237 /* The '020, '030, '040 and '060 have bitfield insns.
   GCC 3.4 defines __mc68020__ when in CPU32 mode; check for __mcpu32__ to
1239    exclude bfffo on that chip (bitfield insns not available).  */
1240 #if (defined (__mc68020__) || defined (mc68020)    \
1241      || defined (__mc68030__) || defined (mc68030) \
1242      || defined (__mc68040__) || defined (mc68040) \
1243      || defined (__mc68060__) || defined (mc68060) \
1244      || defined (__NeXT__))			   \
1245   && ! defined (__mcpu32__)
1246 #define count_leading_zeros(count, x) \
1247   __asm__ ("bfffo %1{%b2:%b2},%0"					\
1248 	   : "=d" (count)						\
1249 	   : "od" ((USItype) (x)), "n" (0))
1250 #define COUNT_LEADING_ZEROS_0 32
1251 #endif
1252 #endif /* mc68000 */
1253 
1254 #if defined (__m88000__) && W_TYPE_SIZE == 32
1255 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1256   __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3"			\
1257 	   : "=r" (sh), "=&r" (sl)					\
1258 	   : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
1259 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1260   __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3"			\
1261 	   : "=r" (sh), "=&r" (sl)					\
1262 	   : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
1263 #define count_leading_zeros(count, x) \
1264   do {									\
1265     USItype __cbtmp;							\
1266     __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x));			\
1267     (count) = __cbtmp ^ 31;						\
1268   } while (0)
1269 #define COUNT_LEADING_ZEROS_0 63 /* sic */
1270 #if defined (__m88110__)
1271 #define umul_ppmm(wh, wl, u, v) \
1272   do {									\
1273     union {UDItype __ll;						\
1274 	   struct {USItype __h, __l;} __i;				\
1275 	  } __x;							\
1276     __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v));	\
1277     (wh) = __x.__i.__h;							\
1278     (wl) = __x.__i.__l;							\
1279   } while (0)
1280 #define udiv_qrnnd(q, r, n1, n0, d) \
1281   ({union {UDItype __ll;						\
1282 	   struct {USItype __h, __l;} __i;				\
1283 	  } __x, __q;							\
1284   __x.__i.__h = (n1); __x.__i.__l = (n0);				\
1285   __asm__ ("divu.d %0,%1,%2"						\
1286 	   : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d));		\
  (r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; })
1288 #endif /* __m88110__ */
1289 #endif /* __m88000__ */
1290 
1291 #if defined (__mips) && W_TYPE_SIZE == 32
1292 #if __GMP_GNUC_PREREQ (4,4) || defined(__clang__)
1293 #define umul_ppmm(w1, w0, u, v) \
1294   do {									\
1295     UDItype __ll = (UDItype)(u) * (v);					\
1296     w1 = __ll >> 32;							\
1297     w0 = __ll;								\
1298   } while (0)
1299 #endif
1300 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
1301 #define umul_ppmm(w1, w0, u, v) \
1302   __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1303 #endif
1304 #if !defined (umul_ppmm)
1305 #define umul_ppmm(w1, w0, u, v) \
1306   __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1"				\
1307 	   : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1308 #endif
1309 #endif /* __mips */
1310 
1311 #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
1312 #if defined (_MIPS_ARCH_MIPS64R6)
1313 #define umul_ppmm(w1, w0, u, v) \
1314   do {									\
1315     UDItype __m0 = (u), __m1 = (v);					\
1316     (w0) = __m0 * __m1;							\
1317     __asm__ ("dmuhu\t%0, %1, %2" : "=d" (w1) : "d" (__m0), "d" (__m1));	\
1318   } while (0)
1319 #endif
1320 #if !defined (umul_ppmm) && (__GMP_GNUC_PREREQ (4,4) || defined(__clang__))
1321 #define umul_ppmm(w1, w0, u, v) \
1322   do {									\
1323     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
1324     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
1325     w1 = __ll >> 64;							\
1326     w0 = __ll;								\
1327   } while (0)
1328 #endif
1329 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
1330 #define umul_ppmm(w1, w0, u, v) \
1331   __asm__ ("dmultu %2,%3"						\
1332 	   : "=l" (w0), "=h" (w1)					\
1333 	   : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
1334 #endif
1335 #if !defined (umul_ppmm)
1336 #define umul_ppmm(w1, w0, u, v) \
1337   __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1"				\
1338 	   : "=d" (w0), "=d" (w1)					\
1339 	   : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
1340 #endif
1341 #endif /* __mips */
1342 
1343 #if defined (__mmix__) && W_TYPE_SIZE == 64
1344 #define umul_ppmm(w1, w0, u, v) \
1345   __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
1346 #endif
1347 
1348 #if defined (__ns32000__) && W_TYPE_SIZE == 32
1349 #define umul_ppmm(w1, w0, u, v) \
1350   ({union {UDItype __ll;						\
1351 	   struct {USItype __l, __h;} __i;				\
1352 	  } __x;							\
1353   __asm__ ("meid %2,%0"							\
1354 	   : "=g" (__x.__ll)						\
1355 	   : "%0" ((USItype)(u)), "g" ((USItype)(v)));			\
1356   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1357 #define __umulsidi3(u, v) \
1358   ({UDItype __w;							\
1359     __asm__ ("meid %2,%0"						\
1360 	     : "=g" (__w)						\
1361 	     : "%0" ((USItype)(u)), "g" ((USItype)(v)));		\
1362     __w; })
1363 #define udiv_qrnnd(q, r, n1, n0, d) \
1364   ({union {UDItype __ll;						\
1365 	   struct {USItype __l, __h;} __i;				\
1366 	  } __x;							\
1367   __x.__i.__h = (n1); __x.__i.__l = (n0);				\
1368   __asm__ ("deid %2,%0"							\
1369 	   : "=g" (__x.__ll)						\
1370 	   : "0" (__x.__ll), "g" ((USItype)(d)));			\
1371   (r) = __x.__i.__l; (q) = __x.__i.__h; })
1372 #define count_trailing_zeros(count,x) \
1373   do {									\
1374     __asm__ ("ffsd	%2,%0"						\
1375 	     : "=r" (count)						\
1376 	     : "0" ((USItype) 0), "r" ((USItype) (x)));			\
1377   } while (0)
1378 #endif /* __ns32000__ */
1379 
/* In the past we had a block of various #defines that were tested:
       _ARCH_PPC    - AIX
       _ARCH_PWR    - AIX
       __powerpc__  - gcc
       __POWERPC__  - BEOS
       __ppc__      - Darwin
       PPC          - old gcc, GNU/Linux, SysV
   The plain PPC test was no good for vxWorks, since PPC is defined on all
   CPUs there (e.g. m68k too), as a constant that CPU_FAMILY is expected to
   be compared against.
1390 
1391    At any rate, this was pretty unattractive and a bit fragile.  The use of
1392    HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
1393    getting the desired effect.
1394 
1395    ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
1396    the system vendor compilers.  (Is that vendor compilers with inline asm,
1397    or what?)  */
1398 
1399 #if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc)	\
1400   && W_TYPE_SIZE == 32
1401 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1402   do {									\
1403     if (__builtin_constant_p (bh) && (bh) == 0)				\
1404       __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"			\
1405 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)	\
1406 		 __CLOBBER_CC);						\
1407     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
1408       __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"			\
1409 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)	\
1410 		 __CLOBBER_CC);						\
1411     else								\
1412       __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"			\
1413 	       : "=r" (sh), "=&r" (sl)					\
1414 	       : "r" (ah), "r" (bh), "%r" (al), "rI" (bl)		\
1415 		 __CLOBBER_CC);						\
1416   } while (0)
1417 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1418   do {									\
1419     if (__builtin_constant_p (ah) && (ah) == 0)				\
1420       __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"			\
1421 	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)	\
1422 		 __CLOBBER_CC);						\
1423     else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)		\
1424       __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"			\
1425 	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)	\
1426 		 __CLOBBER_CC);						\
1427     else if (__builtin_constant_p (bh) && (bh) == 0)			\
1428       __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"			\
1429 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)	\
1430 		 __CLOBBER_CC);						\
1431     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
1432       __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"			\
1433 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)	\
1434 		 __CLOBBER_CC);						\
1435     else								\
1436       __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"			\
1437 	       : "=r" (sh), "=&r" (sl)					\
1438 	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl)		\
1439 		 __CLOBBER_CC);						\
1440   } while (0)
1441 #define count_leading_zeros(count, x) \
1442   __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x))
1443 #define COUNT_LEADING_ZEROS_0 32
1444 #if HAVE_HOST_CPU_FAMILY_powerpc
1445 #if __GMP_GNUC_PREREQ (4,4) || defined(__clang__)
1446 #define umul_ppmm(w1, w0, u, v) \
1447   do {									\
1448     UDItype __ll = (UDItype)(u) * (v);					\
1449     w1 = __ll >> 32;							\
1450     w0 = __ll;								\
1451   } while (0)
1452 #endif
1453 #if !defined (umul_ppmm)
1454 #define umul_ppmm(ph, pl, m0, m1) \
1455   do {									\
1456     USItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
1458     (pl) = __m0 * __m1;							\
1459   } while (0)
1460 #endif
1461 #define smul_ppmm(ph, pl, m0, m1) \
1462   do {									\
1463     SItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
1465     (pl) = __m0 * __m1;							\
1466   } while (0)
1467 #else
1468 #define smul_ppmm(xh, xl, m0, m1) \
1469   __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
1470 #define sdiv_qrnnd(q, r, nh, nl, d) \
1471   __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
1472 #endif
1473 #endif /* 32-bit POWER architecture variants.  */
1474 
1475 /* We should test _IBMR2 here when we add assembly support for the system
1476    vendor compilers.  */
1477 #if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
1478 #if !defined (_LONG_LONG_LIMB)
1479 /* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values.  So
1480    use adde etc only when not _LONG_LONG_LIMB.  */
1481 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1482   do {									\
1483     if (__builtin_constant_p (bh) && (bh) == 0)				\
1484       __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"			\
1485 	       : "=r" (sh), "=&r" (sl)					\
1486 	       : "r"  ((UDItype)(ah)),					\
1487 		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl))		\
1488 		 __CLOBBER_CC);						\
1489     else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)		\
1490       __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"			\
1491 	       : "=r" (sh), "=&r" (sl)					\
1492 	       : "r"  ((UDItype)(ah)),					\
1493 		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl))		\
1494 		 __CLOBBER_CC);						\
1495     else								\
1496       __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"			\
1497 	       : "=r" (sh), "=&r" (sl)					\
1498 	       : "r"  ((UDItype)(ah)), "r"  ((UDItype)(bh)),		\
1499 		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl))		\
1500 		 __CLOBBER_CC);						\
1501   } while (0)
/* We use "*rI" for the constant operand here, since with just "I", gcc
   rejects the asm.  This might seem strange, but gcc folds away the dead
   code late.  */
1504 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1505   do {									\
1506     if (__builtin_constant_p (bl)					\
1507 	&& (bl) > -0x8000 && (bl) <= 0x8000 && (bl) != 0) {		\
1508 	if (__builtin_constant_p (ah) && (ah) == 0)			\
1509 	  __asm__ ("addic %1,%3,%4\n\tsubfze %0,%2"			\
1510 		   : "=r" (sh), "=&r" (sl)				\
1511 		   :                       "r" ((UDItype)(bh)),		\
1512 		     "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))	\
1513 		     __CLOBBER_CC);					\
1514 	else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)	\
1515 	  __asm__ ("addic %1,%3,%4\n\tsubfme %0,%2"			\
1516 		   : "=r" (sh), "=&r" (sl)				\
1517 		   :                       "r" ((UDItype)(bh)),		\
1518 		     "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))	\
1519 		     __CLOBBER_CC);					\
1520 	else if (__builtin_constant_p (bh) && (bh) == 0)		\
1521 	  __asm__ ("addic %1,%3,%4\n\taddme %0,%2"			\
1522 		   : "=r" (sh), "=&r" (sl)				\
1523 		   : "r" ((UDItype)(ah)),				\
1524 		     "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))	\
1525 		     __CLOBBER_CC);					\
1526 	else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)	\
1527 	  __asm__ ("addic %1,%3,%4\n\taddze %0,%2"			\
1528 		   : "=r" (sh), "=&r" (sl)				\
1529 		   : "r" ((UDItype)(ah)),				\
1530 		     "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))	\
1531 		     __CLOBBER_CC);					\
1532 	else								\
1533 	  __asm__ ("addic %1,%4,%5\n\tsubfe %0,%3,%2"			\
1534 		   : "=r" (sh), "=&r" (sl)				\
1535 		   : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
1536 		     "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))	\
1537 		     __CLOBBER_CC);					\
1538     } else {								\
1539 	if (__builtin_constant_p (ah) && (ah) == 0)			\
1540 	  __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"			\
1541 		   : "=r" (sh), "=&r" (sl)				\
1542 		   :                       "r" ((UDItype)(bh)),		\
1543 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))		\
1544 		     __CLOBBER_CC);					\
1545 	else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)	\
1546 	  __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"			\
1547 		   : "=r" (sh), "=&r" (sl)				\
1548 		   :                       "r" ((UDItype)(bh)),		\
1549 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))		\
1550 		     __CLOBBER_CC);					\
1551 	else if (__builtin_constant_p (bh) && (bh) == 0)		\
1552 	  __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"			\
1553 		   : "=r" (sh), "=&r" (sl)				\
1554 		   : "r"  ((UDItype)(ah)),				\
1555 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))		\
1556 		     __CLOBBER_CC);					\
1557 	else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)	\
1558 	  __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"			\
1559 		   : "=r" (sh), "=&r" (sl)				\
1560 		   : "r"  ((UDItype)(ah)),				\
1561 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))		\
1562 		     __CLOBBER_CC);					\
1563 	else								\
1564 	  __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"		\
1565 		   : "=r" (sh), "=&r" (sl)				\
1566 		   : "r"  ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
1567 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))		\
1568 		     __CLOBBER_CC);					\
1569     }									\
1570   } while (0)
1571 #endif /* ! _LONG_LONG_LIMB */
1572 #define count_leading_zeros(count, x) \
1573   __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
1574 #define COUNT_LEADING_ZEROS_0 64
/* XXXMRG: around the GCC 9 era the version prerequisite below went from 4.4 to 4.8; check whether this is still right. */
1576 #if 0 && (__GMP_GNUC_PREREQ (4,8) || defined(__clang__)) /* Disable, this results in libcalls! */
1577 #define umul_ppmm(w1, w0, u, v) \
1578   do {									\
1579     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
1580     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
1581     w1 = __ll >> 64;							\
1582     w0 = __ll;								\
1583   } while (0)
1584 #endif
1585 #if !defined (umul_ppmm)
1586 #define umul_ppmm(ph, pl, m0, m1) \
1587   do {									\
1588     UDItype __m0 = (m0), __m1 = (m1);					\
1589     __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
1590     (pl) = __m0 * __m1;							\
1591   } while (0)
1592 #endif
1593 #define smul_ppmm(ph, pl, m0, m1) \
1594   do {									\
1595     DItype __m0 = (m0), __m1 = (m1);					\
1596     __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
1597     (pl) = __m0 * __m1;							\
1598   } while (0)
1599 #endif /* 64-bit PowerPC.  */
1600 
1601 #if defined (__pyr__) && W_TYPE_SIZE == 32
1602 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1603   __asm__ ("addw %5,%1\n\taddwc %3,%0"					\
1604 	   : "=r" (sh), "=&r" (sl)					\
1605 	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
1606 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1607 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1608   __asm__ ("subw %5,%1\n\tsubwb %3,%0"					\
1609 	   : "=r" (sh), "=&r" (sl)					\
1610 	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
1611 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
1612 /* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP.  */
1613 #define umul_ppmm(w1, w0, u, v) \
1614   ({union {UDItype __ll;						\
1615 	   struct {USItype __h, __l;} __i;				\
1616 	  } __x;							\
1617   __asm__ ("movw %1,%R0\n\tuemul %2,%0"					\
1618 	   : "=&r" (__x.__ll)						\
1619 	   : "g" ((USItype) (u)), "g" ((USItype)(v)));			\
1620   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1621 #endif /* __pyr__ */
1622 
1623 #if defined (__ibm032__) /* RT/ROMP */  && W_TYPE_SIZE == 32
1624 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1625   __asm__ ("a %1,%5\n\tae %0,%3"					\
1626 	   : "=r" (sh), "=&r" (sl)					\
1627 	   : "0"  ((USItype)(ah)), "r" ((USItype)(bh)),			\
1628 	     "%1" ((USItype)(al)), "r" ((USItype)(bl)))
1629 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1630   __asm__ ("s %1,%5\n\tse %0,%3"					\
1631 	   : "=r" (sh), "=&r" (sl)					\
1632 	   : "0" ((USItype)(ah)), "r" ((USItype)(bh)),			\
1633 	     "1" ((USItype)(al)), "r" ((USItype)(bl)))
1634 #define smul_ppmm(ph, pl, m0, m1) \
1635   __asm__ (								\
1636        "s	r2,r2\n"						\
1637 "	mts r10,%2\n"							\
1638 "	m	r2,%3\n"						\
1639 "	m	r2,%3\n"						\
1640 "	m	r2,%3\n"						\
1641 "	m	r2,%3\n"						\
1642 "	m	r2,%3\n"						\
1643 "	m	r2,%3\n"						\
1644 "	m	r2,%3\n"						\
1645 "	m	r2,%3\n"						\
1646 "	m	r2,%3\n"						\
1647 "	m	r2,%3\n"						\
1648 "	m	r2,%3\n"						\
1649 "	m	r2,%3\n"						\
1650 "	m	r2,%3\n"						\
1651 "	m	r2,%3\n"						\
1652 "	m	r2,%3\n"						\
1653 "	m	r2,%3\n"						\
1654 "	cas	%0,r2,r0\n"						\
1655 "	mfs	r10,%1"							\
1656 	   : "=r" (ph), "=r" (pl)					\
1657 	   : "%r" ((USItype)(m0)), "r" ((USItype)(m1))			\
1658 	   : "r2")
1659 #define count_leading_zeros(count, x) \
1660   do {									\
1661     if ((x) >= 0x10000)							\
1662       __asm__ ("clz	%0,%1"						\
1663 	       : "=r" (count) : "r" ((USItype)(x) >> 16));		\
1664     else								\
1665       {									\
1666 	__asm__ ("clz	%0,%1"						\
1667 		 : "=r" (count) : "r" ((USItype)(x)));			\
1668 	(count) += 16;							\
1669       }									\
1670   } while (0)
1671 #endif /* RT/ROMP */
1672 
1673 #if defined (__riscv) && defined (__riscv_mul) && W_TYPE_SIZE == 64
1674 #define umul_ppmm(ph, pl, u, v) \
1675   do {									\
1676     UDItype __u = (u), __v = (v);					\
1677     (pl) = __u * __v;							\
1678     __asm__ ("mulhu\t%0, %1, %2" : "=r" (ph) : "%r" (__u), "r" (__v));	\
1679   } while (0)
1680 #endif
1681 
1682 #if (defined (__SH2__) || defined (__SH3__) || defined (__SH4__)) && W_TYPE_SIZE == 32
1683 #define umul_ppmm(w1, w0, u, v) \
1684   __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0"		\
1685 	   : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
1686 #endif
1687 
1688 #if defined (__sparc__) && W_TYPE_SIZE == 32
1689 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1690   __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0"				\
1691 	   : "=r" (sh), "=&r" (sl)					\
1692 	   : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl)			\
1693 	   __CLOBBER_CC)
1694 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1695   __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0"				\
1696 	   : "=r" (sh), "=&r" (sl)					\
1697 	   : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl)	\
1698 	   __CLOBBER_CC)
/* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
   doesn't define anything to indicate that to us; it only sets __sparcv8. */
1701 #if defined (__sparc_v9__) || defined (__sparcv9)
1702 /* Perhaps we should use floating-point operations here?  */
1703 #if 0
1704 /* Triggers a bug making mpz/tests/t-gcd.c fail.
   Perhaps we simply need to explicitly zero-extend the inputs?  */
1706 #define umul_ppmm(w1, w0, u, v) \
1707   __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" :		\
1708 	   "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
1709 #else
1710 /* Use v8 umul until above bug is fixed.  */
1711 #define umul_ppmm(w1, w0, u, v) \
1712   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1713 #endif
1714 /* Use a plain v8 divide for v9.  */
1715 #define udiv_qrnnd(q, r, n1, n0, d) \
1716   do {									\
1717     USItype __q;							\
1718     __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"			\
1719 	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));		\
1720     (r) = (n0) - __q * (d);						\
1721     (q) = __q;								\
1722   } while (0)
1723 #else
1724 #if defined (__sparc_v8__)   /* gcc normal */				\
1725   || defined (__sparcv8)     /* gcc solaris */				\
1726   || HAVE_HOST_CPU_supersparc
/* Don't match an immediate range because (1) it is not often useful, and
   (2) the 'I' constraint thinks of the range as a 13 bit signed interval,
   while we want to match a 13 bit interval, sign extended to 32 bits, but
   INTERPRETED AS UNSIGNED.  */
1731 #define umul_ppmm(w1, w0, u, v) \
1732   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1733 
1734 #if HAVE_HOST_CPU_supersparc
1735 #else
1736 /* Don't use this on SuperSPARC because its udiv only handles 53 bit
1737    dividends and will trap to the kernel for the rest. */
1738 #define udiv_qrnnd(q, r, n1, n0, d) \
1739   do {									\
1740     USItype __q;							\
1741     __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"			\
1742 	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));		\
1743     (r) = (n0) - __q * (d);						\
1744     (q) = __q;								\
1745   } while (0)
1746 #endif /* HAVE_HOST_CPU_supersparc */
1747 
1748 #else /* ! __sparc_v8__ */
1749 #if defined (__sparclite__)
/* This has hardware multiply but not divide.  It also has two additional
   instructions, scan (ffs from high bit) and divscc.  */
1752 #define umul_ppmm(w1, w0, u, v) \
1753   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1754 #define udiv_qrnnd(q, r, n1, n0, d) \
1755   __asm__ ("! Inlined udiv_qrnnd\n"					\
1756 "	wr	%%g0,%2,%%y	! Not a delayed write for sparclite\n"	\
1757 "	tst	%%g0\n"							\
1758 "	divscc	%3,%4,%%g1\n"						\
1759 "	divscc	%%g1,%4,%%g1\n"						\
1760 "	divscc	%%g1,%4,%%g1\n"						\
1761 "	divscc	%%g1,%4,%%g1\n"						\
1762 "	divscc	%%g1,%4,%%g1\n"						\
1763 "	divscc	%%g1,%4,%%g1\n"						\
1764 "	divscc	%%g1,%4,%%g1\n"						\
1765 "	divscc	%%g1,%4,%%g1\n"						\
1766 "	divscc	%%g1,%4,%%g1\n"						\
1767 "	divscc	%%g1,%4,%%g1\n"						\
1768 "	divscc	%%g1,%4,%%g1\n"						\
1769 "	divscc	%%g1,%4,%%g1\n"						\
1770 "	divscc	%%g1,%4,%%g1\n"						\
1771 "	divscc	%%g1,%4,%%g1\n"						\
1772 "	divscc	%%g1,%4,%%g1\n"						\
1773 "	divscc	%%g1,%4,%%g1\n"						\
1774 "	divscc	%%g1,%4,%%g1\n"						\
1775 "	divscc	%%g1,%4,%%g1\n"						\
1776 "	divscc	%%g1,%4,%%g1\n"						\
1777 "	divscc	%%g1,%4,%%g1\n"						\
1778 "	divscc	%%g1,%4,%%g1\n"						\
1779 "	divscc	%%g1,%4,%%g1\n"						\
1780 "	divscc	%%g1,%4,%%g1\n"						\
1781 "	divscc	%%g1,%4,%%g1\n"						\
1782 "	divscc	%%g1,%4,%%g1\n"						\
1783 "	divscc	%%g1,%4,%%g1\n"						\
1784 "	divscc	%%g1,%4,%%g1\n"						\
1785 "	divscc	%%g1,%4,%%g1\n"						\
1786 "	divscc	%%g1,%4,%%g1\n"						\
1787 "	divscc	%%g1,%4,%%g1\n"						\
1788 "	divscc	%%g1,%4,%%g1\n"						\
1789 "	divscc	%%g1,%4,%0\n"						\
1790 "	rd	%%y,%1\n"						\
1791 "	bl,a 1f\n"							\
1792 "	add	%1,%4,%1\n"						\
1793 "1:	! End of inline udiv_qrnnd"					\
1794 	   : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d)		\
1795 	   : "%g1" __AND_CLOBBER_CC)
1796 #define count_leading_zeros(count, x) \
1797   __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
/* Early SPARClite chips return 63 for an argument of 0, but their documentation
   warns that future implementations might change this.  Therefore, leave
   COUNT_LEADING_ZEROS_0 undefined.  */
1801 #endif /* __sparclite__ */
1802 #endif /* __sparc_v8__ */
1803 #endif /* __sparc_v9__ */
1804 /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd.  */
1805 #ifndef umul_ppmm
1806 #define umul_ppmm(w1, w0, u, v) \
1807   __asm__ ("! Inlined umul_ppmm\n"					\
1808 "	wr	%%g0,%2,%%y	! SPARC has 0-3 delay insn after a wr\n" \
1809 "	sra	%3,31,%%g2	! Don't move this insn\n"		\
1810 "	and	%2,%%g2,%%g2	! Don't move this insn\n"		\
1811 "	andcc	%%g0,0,%%g1	! Don't move this insn\n"		\
1812 "	mulscc	%%g1,%3,%%g1\n"						\
1813 "	mulscc	%%g1,%3,%%g1\n"						\
1814 "	mulscc	%%g1,%3,%%g1\n"						\
1815 "	mulscc	%%g1,%3,%%g1\n"						\
1816 "	mulscc	%%g1,%3,%%g1\n"						\
1817 "	mulscc	%%g1,%3,%%g1\n"						\
1818 "	mulscc	%%g1,%3,%%g1\n"						\
1819 "	mulscc	%%g1,%3,%%g1\n"						\
1820 "	mulscc	%%g1,%3,%%g1\n"						\
1821 "	mulscc	%%g1,%3,%%g1\n"						\
1822 "	mulscc	%%g1,%3,%%g1\n"						\
1823 "	mulscc	%%g1,%3,%%g1\n"						\
1824 "	mulscc	%%g1,%3,%%g1\n"						\
1825 "	mulscc	%%g1,%3,%%g1\n"						\
1826 "	mulscc	%%g1,%3,%%g1\n"						\
1827 "	mulscc	%%g1,%3,%%g1\n"						\
1828 "	mulscc	%%g1,%3,%%g1\n"						\
1829 "	mulscc	%%g1,%3,%%g1\n"						\
1830 "	mulscc	%%g1,%3,%%g1\n"						\
1831 "	mulscc	%%g1,%3,%%g1\n"						\
1832 "	mulscc	%%g1,%3,%%g1\n"						\
1833 "	mulscc	%%g1,%3,%%g1\n"						\
1834 "	mulscc	%%g1,%3,%%g1\n"						\
1835 "	mulscc	%%g1,%3,%%g1\n"						\
1836 "	mulscc	%%g1,%3,%%g1\n"						\
1837 "	mulscc	%%g1,%3,%%g1\n"						\
1838 "	mulscc	%%g1,%3,%%g1\n"						\
1839 "	mulscc	%%g1,%3,%%g1\n"						\
1840 "	mulscc	%%g1,%3,%%g1\n"						\
1841 "	mulscc	%%g1,%3,%%g1\n"						\
1842 "	mulscc	%%g1,%3,%%g1\n"						\
1843 "	mulscc	%%g1,%3,%%g1\n"						\
1844 "	mulscc	%%g1,0,%%g1\n"						\
1845 "	add	%%g1,%%g2,%0\n"						\
1846 "	rd	%%y,%1"							\
1847 	   : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v)			\
1848 	   : "%g1", "%g2" __AND_CLOBBER_CC)
1849 #endif
1850 #ifndef udiv_qrnnd
1851 #ifndef LONGLONG_STANDALONE
1852 #define udiv_qrnnd(q, r, n1, n0, d) \
1853   do { UWtype __r;							\
1854     (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));			\
1855     (r) = __r;								\
1856   } while (0)
1857 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
1858 #endif /* LONGLONG_STANDALONE */
1859 #endif /* udiv_qrnnd */
1860 #endif /* __sparc__ */
1861 
1862 #if defined (__sparc__) && W_TYPE_SIZE == 64
1863 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1864   __asm__ (								\
1865        "addcc	%r4,%5,%1\n"						\
1866       "	addccc	%r6,%7,%%g0\n"						\
1867       "	addc	%r2,%3,%0"						\
1868        : "=r" (sh), "=&r" (sl)						\
1869        : "rJ"  ((UDItype)(ah)), "rI" ((UDItype)(bh)),			\
1870 	 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)),			\
1871 	 "%rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32)	\
1872 	   __CLOBBER_CC)
1873 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1874   __asm__ (								\
1875        "subcc	%r4,%5,%1\n"						\
1876       "	subccc	%r6,%7,%%g0\n"						\
1877       "	subc	%r2,%3,%0"						\
1878        : "=r" (sh), "=&r" (sl)						\
1879        : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)),			\
1880 	 "rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)),			\
1881 	 "rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32)		\
1882 	   __CLOBBER_CC)
1883 #if __VIS__ >= 0x300
1884 #undef add_ssaaaa
1885 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1886   __asm__ (								\
1887        "addcc	%r4, %5, %1\n"						\
1888       "	addxc	%r2, %r3, %0"						\
1889 	  : "=r" (sh), "=&r" (sl)					\
1890        : "rJ"  ((UDItype)(ah)), "rJ" ((UDItype)(bh)),			\
1891 	 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC)
1892 #define umul_ppmm(ph, pl, m0, m1) \
1893   do {									\
1894     UDItype __m0 = (m0), __m1 = (m1);					\
1895     (pl) = __m0 * __m1;							\
1896     __asm__ ("umulxhi\t%2, %1, %0"					\
1897 	     : "=r" (ph)						\
1898 	     : "%r" (__m0), "r" (__m1));				\
1899   } while (0)
1900 #define count_leading_zeros(count, x) \
1901   __asm__ ("lzd\t%1,%0" : "=r" (count) : "r" (x))
1902 /* Needed by count_leading_zeros_32 in sparc64.h.  */
1903 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
1904 #endif
1905 #endif
1906 
1907 #if (defined (__vax) || defined (__vax__)) && W_TYPE_SIZE == 32
1908 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1909   __asm__ ("addl2 %5,%1\n\tadwc %3,%0"					\
1910 	   : "=g" (sh), "=&g" (sl)					\
1911 	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
1912 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1913 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1914   __asm__ ("subl2 %5,%1\n\tsbwc %3,%0"					\
1915 	   : "=g" (sh), "=&g" (sl)					\
1916 	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
1917 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
1918 #define smul_ppmm(xh, xl, m0, m1) \
1919   do {									\
1920     union {UDItype __ll;						\
1921 	   struct {USItype __l, __h;} __i;				\
1922 	  } __x;							\
1923     USItype __m0 = (m0), __m1 = (m1);					\
1924     __asm__ ("emul %1,%2,$0,%0"						\
1925 	     : "=g" (__x.__ll) : "g" (__m0), "g" (__m1));		\
1926     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
1927   } while (0)
1928 #define sdiv_qrnnd(q, r, n1, n0, d) \
1929   do {									\
1930     union {DItype __ll;							\
1931 	   struct {SItype __l, __h;} __i;				\
1932 	  } __x;							\
1933     __x.__i.__h = n1; __x.__i.__l = n0;					\
1934     __asm__ ("ediv %3,%2,%0,%1"						\
1935 	     : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d));		\
1936   } while (0)
1937 #if 0
1938 /* FIXME: This instruction appears to be unimplemented on some systems (vax
1939    8800 maybe). */
1940 #define count_trailing_zeros(count,x)					\
1941   do {									\
1942     __asm__ ("ffs 0, 31, %1, %0"					\
1943 	     : "=g" (count)						\
1944 	     : "g" ((USItype) (x)));					\
1945   } while (0)
1946 #endif
1947 #endif /* vax */
1948 
1949 #if defined (__z8000__) && W_TYPE_SIZE == 16
1950 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1951   __asm__ ("add	%H1,%H5\n\tadc	%H0,%H3"				\
1952 	   : "=r" (sh), "=&r" (sl)					\
1953 	   : "0"  ((unsigned int)(ah)), "r" ((unsigned int)(bh)),	\
1954 	     "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1955 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1956   __asm__ ("sub	%H1,%H5\n\tsbc	%H0,%H3"				\
1957 	   : "=r" (sh), "=&r" (sl)					\
1958 	   : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)),	\
1959 	     "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1960 #define umul_ppmm(xh, xl, m0, m1) \
1961   do {									\
1962     union {long int __ll;						\
1963 	   struct {unsigned int __h, __l;} __i;				\
1964 	  } __x;							\
1965     unsigned int __m0 = (m0), __m1 = (m1);				\
1966     __asm__ ("mult	%S0,%H3"					\
1967 	     : "=r" (__x.__i.__h), "=r" (__x.__i.__l)			\
	     : "%1" (__m0), "rQR" (__m1));					\
1969     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
1970     (xh) += ((((signed int) __m0 >> 15) & __m1)				\
1971 	     + (((signed int) __m1 >> 15) & __m0));			\
1972   } while (0)
1973 #endif /* __z8000__ */
1974 
1975 #endif /* __GNUC__ */
1976 
1977 #endif /* NO_ASM */
1978 
1979 
1980 /* FIXME: "sidi" here is highly doubtful, should sometimes be "diti".  */
1981 #if !defined (umul_ppmm) && defined (__umulsidi3)
1982 #define umul_ppmm(ph, pl, m0, m1) \
1983   do {									\
1984     UDWtype __ll = __umulsidi3 (m0, m1);				\
1985     ph = (UWtype) (__ll >> W_TYPE_SIZE);				\
1986     pl = (UWtype) __ll;							\
1987   } while (0)
1988 #endif
1989 
1990 #if !defined (__umulsidi3)
1991 #define __umulsidi3(u, v) \
1992   ({UWtype __hi, __lo;							\
1993     umul_ppmm (__hi, __lo, u, v);					\
1994     ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
1995 #endif
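
/* For illustration only (with W_TYPE_SIZE == 32): __umulsidi3 (0x10000, 0x10000)
   is the UDWtype value 0x100000000, which umul_ppmm delivers as high word 1
   and low word 0.  */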
1996 
1997 
1998 #if defined (__cplusplus)
1999 #define __longlong_h_C "C"
2000 #else
2001 #define __longlong_h_C
2002 #endif
2003 
2004 /* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist.  The "_r"
2005    forms have "reversed" arguments, meaning the pointer is last, which
2006    sometimes allows better parameter passing, in particular on 64-bit
2007    hppa. */
2008 
2009 #define mpn_umul_ppmm  __MPN(umul_ppmm)
2010 extern __longlong_h_C UWtype mpn_umul_ppmm (UWtype *, UWtype, UWtype);
2011 
2012 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm  \
2013   && ! defined (LONGLONG_STANDALONE)
2014 #define umul_ppmm(wh, wl, u, v)						\
2015   do {									\
2016     UWtype __umul_ppmm__p0;						\
2017     (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v));\
2018     (wl) = __umul_ppmm__p0;						\
2019   } while (0)
2020 #endif
2021 
2022 #define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
2023 extern __longlong_h_C UWtype mpn_umul_ppmm_r (UWtype, UWtype, UWtype *);
2024 
2025 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r	\
2026   && ! defined (LONGLONG_STANDALONE)
2027 #define umul_ppmm(wh, wl, u, v)						\
2028   do {									\
2029     UWtype __umul_p0;							\
2030     (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_p0);	\
2031     (wl) = __umul_p0;							\
2032   } while (0)
2033 #endif
2034 
2035 #define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
2036 extern __longlong_h_C UWtype mpn_udiv_qrnnd (UWtype *, UWtype, UWtype, UWtype);
2037 
2038 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd	\
2039   && ! defined (LONGLONG_STANDALONE)
2040 #define udiv_qrnnd(q, r, n1, n0, d)					\
2041   do {									\
2042     UWtype __udiv_qrnnd_r;						\
2043     (q) = mpn_udiv_qrnnd (&__udiv_qrnnd_r,				\
2044 			  (UWtype) (n1), (UWtype) (n0), (UWtype) d);	\
2045     (r) = __udiv_qrnnd_r;						\
2046   } while (0)
2047 #endif
2048 
2049 #define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
2050 extern __longlong_h_C UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, UWtype *);
2051 
2052 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r	\
2053   && ! defined (LONGLONG_STANDALONE)
2054 #define udiv_qrnnd(q, r, n1, n0, d)					\
2055   do {									\
2056     UWtype __udiv_qrnnd_r;						\
2057     (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) d,	\
2058 			    &__udiv_qrnnd_r);				\
2059     (r) = __udiv_qrnnd_r;						\
2060   } while (0)
2061 #endif
2062 
2063 
2064 /* If this machine has no inline assembler, use C macros.  */
2065 
2066 #if !defined (add_ssaaaa)
2067 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
2068   do {									\
2069     UWtype __x;								\
2070     UWtype __al = (al);							\
2071     UWtype __bl = (bl);							\
2072     __x = __al + __bl;							\
2073     (sh) = (ah) + (bh) + (__x < __al);					\
2074     (sl) = __x;								\
2075   } while (0)
2076 #endif
2077 
2078 #if !defined (sub_ddmmss)
2079 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
2080   do {									\
2081     UWtype __x;								\
2082     UWtype __al = (al);							\
2083     UWtype __bl = (bl);							\
2084     __x = __al - __bl;							\
2085     (sh) = (ah) - (bh) - (__al < __bl);					\
2086     (sl) = __x;								\
2087   } while (0)
2088 #endif
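
/* In the two generic macros above, the carry out of the low-word addition is
   detected as (__x < __al), since an unsigned sum wraps exactly when it ends
   up smaller than one of its addends; the borrow of the low-word subtraction
   is likewise (__al < __bl).  */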
2089 
2090 /* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
2091    smul_ppmm.  */
2092 #if !defined (umul_ppmm) && defined (smul_ppmm)
2093 #define umul_ppmm(w1, w0, u, v)						\
2094   do {									\
2095     UWtype __w1;							\
2096     UWtype __xm0 = (u), __xm1 = (v);					\
2097     smul_ppmm (__w1, w0, __xm0, __xm1);					\
2098     (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
2099 		+ (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);		\
2100   } while (0)
2101 #endif
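
/* The correction terms above come from the identity (words taken modulo 2^W,
   W = W_TYPE_SIZE):
     high(u*v unsigned) = high(u*v signed) + (u's sign bit ? v : 0)
					   + (v's sign bit ? u : 0)
   Reinterpreting a word as unsigned adds 2^W exactly when its sign bit is
   set; expanding the product and reducing modulo 2^(2W) leaves just these
   two word-sized corrections in the high word.  The analogous subtraction
   appears in the smul_ppmm fallback further down.  */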
2102 
2103 /* If we still don't have umul_ppmm, define it using plain C.
2104 
   For reference, when this code is used for squaring (i.e. u and v identical
2106    expressions), gcc recognises __x1 and __x2 are the same and generates 3
2107    multiplies, not 4.  The subsequent additions could be optimized a bit,
2108    but the only place GMP currently uses such a square is mpn_sqr_basecase,
2109    and chips obliged to use this generic C umul will have plenty of worse
2110    performance problems than a couple of extra instructions on the diagonal
2111    of sqr_basecase.  */
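
/* The macro below uses the half-word decomposition: with B = 2^(W_TYPE_SIZE/2)
   and u = uh*B + ul, v = vh*B + vl,
     u*v = uh*vh*B^2 + (uh*vl + ul*vh)*B + ul*vl
   where each of the four partial products fits in a single word, so the
   double word result can be assembled with word additions and an explicit
   carry fixup.  */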
2112 
2113 #if !defined (umul_ppmm)
2114 #define umul_ppmm(w1, w0, u, v)						\
2115   do {									\
2116     UWtype __x0, __x1, __x2, __x3;					\
2117     UHWtype __ul, __vl, __uh, __vh;					\
2118     UWtype __u = (u), __v = (v);					\
2119 									\
2120     __ul = __ll_lowpart (__u);						\
2121     __uh = __ll_highpart (__u);						\
2122     __vl = __ll_lowpart (__v);						\
2123     __vh = __ll_highpart (__v);						\
2124 									\
2125     __x0 = (UWtype) __ul * __vl;					\
2126     __x1 = (UWtype) __ul * __vh;					\
2127     __x2 = (UWtype) __uh * __vl;					\
2128     __x3 = (UWtype) __uh * __vh;					\
2129 									\
2130     __x1 += __ll_highpart (__x0);/* this can't give carry */		\
2131     __x1 += __x2;		/* but this indeed can */		\
2132     if (__x1 < __x2)		/* did we get it? */			\
2133       __x3 += __ll_B;		/* yes, add it in the proper pos. */	\
2134 									\
2135     (w1) = __x3 + __ll_highpart (__x1);					\
2136     (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0);		\
2137   } while (0)
2138 #endif
2139 
/* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
   exist in one form or another).  */
2142 #if !defined (smul_ppmm)
2143 #define smul_ppmm(w1, w0, u, v)						\
2144   do {									\
2145     UWtype __w1;							\
2146     UWtype __xm0 = (u), __xm1 = (v);					\
2147     umul_ppmm (__w1, w0, __xm0, __xm1);					\
2148     (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
2149 		- (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);		\
2150   } while (0)
2151 #endif
2152 
2153 /* Define this unconditionally, so it can be used for debugging.  */
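/* The generic version below does classical schoolbook division, producing the
   quotient one half-word digit at a time from the half-words of the divisor.
   It relies on the divisor being normalized (most significant bit set), so
   that, by the usual schoolbook division argument, each initial quotient
   estimate is at most two too large, matching the two correction steps in the
   code.  This is why UDIV_NEEDS_NORMALIZATION gets defined to 1 below when
   this fallback is selected as udiv_qrnnd.  */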
2154 #define __udiv_qrnnd_c(q, r, n1, n0, d) \
2155   do {									\
2156     UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m;			\
2157 									\
2158     ASSERT ((d) != 0);							\
2159     ASSERT ((n1) < (d));						\
2160 									\
2161     __d1 = __ll_highpart (d);						\
2162     __d0 = __ll_lowpart (d);						\
2163 									\
2164     __q1 = (n1) / __d1;							\
2165     __r1 = (n1) - __q1 * __d1;						\
2166     __m = __q1 * __d0;							\
2167     __r1 = __r1 * __ll_B | __ll_highpart (n0);				\
2168     if (__r1 < __m)							\
2169       {									\
2170 	__q1--, __r1 += (d);						\
2171 	if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
2172 	  if (__r1 < __m)						\
2173 	    __q1--, __r1 += (d);					\
2174       }									\
2175     __r1 -= __m;							\
2176 									\
2177     __q0 = __r1 / __d1;							\
2178     __r0 = __r1  - __q0 * __d1;						\
2179     __m = __q0 * __d0;							\
2180     __r0 = __r0 * __ll_B | __ll_lowpart (n0);				\
2181     if (__r0 < __m)							\
2182       {									\
2183 	__q0--, __r0 += (d);						\
2184 	if (__r0 >= (d))						\
2185 	  if (__r0 < __m)						\
2186 	    __q0--, __r0 += (d);					\
2187       }									\
2188     __r0 -= __m;							\
2189 									\
2190     (q) = __q1 * __ll_B | __q0;						\
2191     (r) = __r0;								\
2192   } while (0)
2193 
/* If the processor has no udiv_qrnnd but does have sdiv_qrnnd, go through
   __udiv_w_sdiv (defined in libgcc or elsewhere).  */
2196 #if !defined (udiv_qrnnd) && defined (sdiv_qrnnd) \
2197   && ! defined (LONGLONG_STANDALONE)
2198 #define udiv_qrnnd(q, r, nh, nl, d) \
2199   do {									\
2200     UWtype __r;								\
2201     (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d);				\
2202     (r) = __r;								\
2203   } while (0)
2204 __MPFR_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
2205 #endif
2206 
2207 /* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
2208 #if !defined (udiv_qrnnd)
2209 #define UDIV_NEEDS_NORMALIZATION 1
2210 #define udiv_qrnnd __udiv_qrnnd_c
2211 #endif
2212 
2213 #if !defined (count_leading_zeros)
2214 #define count_leading_zeros(count, x) \
2215   do {									\
2216     UWtype __xr = (x);							\
2217     UWtype __a;								\
2218 									\
2219     if (W_TYPE_SIZE == 32)						\
2220       {									\
2221 	__a = __xr < ((UWtype) 1 << 2*__BITS4)				\
2222 	  ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1)		\
2223 	  : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1		\
2224 	  : 3*__BITS4 + 1);						\
2225       }									\
2226     else								\
2227       {									\
2228 	for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8)			\
2229 	  if (((__xr >> __a) & 0xff) != 0)				\
2230 	    break;							\
2231 	++__a;								\
2232       }									\
2233 									\
2234     (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a];		\
2235   } while (0)
2236 /* This version gives a well-defined value for zero. */
2237 #define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
2238 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2239 #define COUNT_LEADING_ZEROS_SLOW
2240 #endif
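
/* As used above, __clz_tab[t] is expected to hold one more than the bit
   length of t (so __clz_tab[0] == 1), which makes
     (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a]
   reduce to W_TYPE_SIZE minus the bit length of x.  */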
2241 
2242 /* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
2243 #if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
2244 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2245 #endif
2246 
2247 #ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2248 extern const unsigned char __MPFR_DECLSPEC __clz_tab[129];
2249 #endif
2250 
2251 #if !defined (count_trailing_zeros)
2252 #if !defined (COUNT_LEADING_ZEROS_SLOW)
2253 /* Define count_trailing_zeros using an asm count_leading_zeros.  */
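/* __ctz_x & -__ctz_x isolates the lowest set bit of __ctz_x; counting its
   leading zeros and subtracting the result from W_TYPE_SIZE - 1 therefore
   yields the index of that bit, which is exactly the trailing zero count.  */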
2254 #define count_trailing_zeros(count, x)					\
2255   do {									\
2256     UWtype __ctz_x = (x);						\
2257     UWtype __ctz_c;							\
2258     ASSERT (__ctz_x != 0);						\
2259     count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x);			\
2260     (count) = W_TYPE_SIZE - 1 - __ctz_c;				\
2261   } while (0)
2262 #else
2263 /* Define count_trailing_zeros in plain C, assuming small counts are common.
2264    We use clz_tab without ado, since the C count_leading_zeros above will have
2265    pulled it in.  */
2266 #define count_trailing_zeros(count, x)					\
2267   do {									\
2268     UWtype __ctz_x = (x);						\
2269     int __ctz_c;							\
2270 									\
2271     if (LIKELY ((__ctz_x & 0xff) != 0))					\
2272       (count) = __clz_tab[__ctz_x & -__ctz_x] - 2;			\
2273     else								\
2274       {									\
2275 	for (__ctz_c = 8 - 2; __ctz_c < W_TYPE_SIZE - 2; __ctz_c += 8)	\
2276 	  {								\
2277 	    __ctz_x >>= 8;						\
2278 	    if (LIKELY ((__ctz_x & 0xff) != 0))				\
2279 	      break;							\
2280 	  }								\
2281 									\
2282 	(count) = __ctz_c + __clz_tab[__ctz_x & -__ctz_x];		\
2283       }									\
2284   } while (0)
2285 #endif
2286 #endif
2287 
2288 #ifndef UDIV_NEEDS_NORMALIZATION
2289 #define UDIV_NEEDS_NORMALIZATION 0
2290 #endif
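
/* Illustrative sketch only, kept under "#if 0" so it is never compiled: one
   way a caller might divide the two-word value {n1,n0} by d using the
   udiv_qrnnd selected above while honouring UDIV_NEEDS_NORMALIZATION.  The
   function name example_udiv_qrnnd is hypothetical and not part of this
   file's interface; ASSERT comes from the including implementation headers,
   as in __udiv_qrnnd_c above.  */
#if 0
static UWtype
example_udiv_qrnnd (UWtype *rp, UWtype n1, UWtype n0, UWtype d)
{
  UWtype q, r;
  int cnt = 0;

  ASSERT (n1 < d);   /* needed so that the quotient fits in a single word */

  if (UDIV_NEEDS_NORMALIZATION)
    {
      count_leading_zeros (cnt, d);
      if (cnt != 0)
        {
          /* Shift divisor and numerator left until d has its top bit set.  */
          d <<= cnt;
          n1 = (n1 << cnt) | (n0 >> (W_TYPE_SIZE - cnt));
          n0 <<= cnt;
        }
    }

  udiv_qrnnd (q, r, n1, n0, d);

  *rp = r >> cnt;    /* undo the normalization on the remainder */
  return q;
}
#endif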
2291 
/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and
   hence whether the latter should always be used.  */
2294 #ifndef UDIV_PREINV_ALWAYS
2295 #define UDIV_PREINV_ALWAYS 0
2296 #endif
2297