1 /* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
2 
3 Copyright 1991-1994, 1996, 1997, 1999-2005, 2007-2009, 2011-2020 Free Software
4 Foundation, Inc.
5 
6 This file is part of the GNU MP Library.
7 
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of either:
10 
11   * the GNU Lesser General Public License as published by the Free
12     Software Foundation; either version 3 of the License, or (at your
13     option) any later version.
14 
15 or
16 
17   * the GNU General Public License as published by the Free Software
18     Foundation; either version 2 of the License, or (at your option) any
19     later version.
20 
21 or both in parallel, as here.
22 
23 The GNU MP Library is distributed in the hope that it will be useful, but
24 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26 for more details.
27 
28 You should have received copies of the GNU General Public License and the
29 GNU Lesser General Public License along with the GNU MP Library.  If not,
30 see https://www.gnu.org/licenses/.  */
31 
32 /* You have to define the following before including this file:
33 
34    UWtype -- An unsigned type, default type for operations (typically a "word")
35    UHWtype -- An unsigned type, at least half the size of UWtype
36    UDWtype -- An unsigned type, at least twice as large as UWtype
37    W_TYPE_SIZE -- size in bits of UWtype
38 
39    SItype, USItype -- Signed and unsigned 32 bit types
40    DItype, UDItype -- Signed and unsigned 64 bit types
41 
42    On a 32 bit machine UWtype should typically be USItype;
43    on a 64 bit machine, UWtype should typically be UDItype.
44 
45    Optionally, define:
46 
47    LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
48    NO_ASM -- Disable inline asm
49 
50 
51    CAUTION!  Using this version of longlong.h outside of GMP is not safe.  You
52    need to include gmp.h and gmp-impl.h, or certain things might not work as
53    expected.
54 */
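/* For illustration only, a rough sketch of what the required definitions
   might look like on a typical 64-bit LP64 system (the concrete types below
   are an assumption for the example; within GMP they come from gmp.h and
   gmp-impl.h, per the CAUTION above):

     typedef int		SItype;
     typedef unsigned int	USItype;
     typedef long		DItype;
     typedef unsigned long	UDItype;
     #define UWtype		UDItype
     #define UHWtype		USItype
     #define UDWtype		unsigned __int128
     #define W_TYPE_SIZE	64
     #include "longlong.h"
*/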
55 
56 #define __BITS4 (W_TYPE_SIZE / 4)
57 #define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
58 #define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
59 #define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
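/* For example, with W_TYPE_SIZE == 32, __ll_B is 0x10000, so
   __ll_highpart (0x12345678) is 0x1234 and __ll_lowpart (0x12345678) is
   0x5678; any word t equals __ll_highpart (t) * __ll_B + __ll_lowpart (t).  */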
60 
61 /* This is used to make sure that no undesirable sharing takes place
62    between different libraries that use this file.  */
63 #ifndef __MPN
64 #define __MPN(x) __##x
65 #endif
66 
67 /* Define auxiliary asm macros.
68 
69    1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
70    UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
71    word product in HIGH_PROD and LOW_PROD.
72 
73    2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
74    UDWtype product.  This is just a variant of umul_ppmm.
75 
76    3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
77    denominator) divides a UDWtype, composed of the UWtype integers
78    HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
79    in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
80    than DENOMINATOR for correct operation.  If, in addition, the macro
81    requires the most significant bit of DENOMINATOR to be 1, then the
82    pre-processor symbol UDIV_NEEDS_NORMALIZATION is defined to 1.
83 
84    4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
85    denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
86    is rounded towards 0.
87 
88    5) count_leading_zeros(count, x) counts the number of zero-bits from the
89    msb to the first non-zero bit in the UWtype X.  This is the number of
90    steps X needs to be shifted left to set the msb.  Undefined for X == 0,
91    unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
92 
93    6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
94    from the least significant end.
95 
96    7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
97    high_addend_2, low_addend_2) adds two two-word UWtype integers,
98    composed of HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and
99    LOW_ADDEND_2 respectively.  The result is placed in HIGH_SUM and
100    LOW_SUM.  Overflow (i.e. carry out) is not stored anywhere, and is lost.
101 
102    8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
103    high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
104    composed of HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
105    LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
106    and LOW_DIFFERENCE.  Overflow (i.e. carry out) is not stored anywhere,
107    and is lost.
108 
109    If any of these macros are left undefined for a particular CPU,
110    C macros are used.
111 
112 
113    Notes:
114 
115    For add_ssaaaa the two high and two low addends can both commute, but
116    unfortunately gcc only supports one "%" commutative in each asm block.
117    This has always been so but is only documented in recent versions
118    (eg. pre-release 3.3).  Having two or more "%"s can cause an internal
119    compiler error in certain rare circumstances.
120 
121    Apparently it was only the last "%" that was ever actually respected, so
122    the code has been updated to leave just that.  Clearly there's a free
123    choice whether high or low should get it, if there's a reason to favour
124    one over the other.  Also obviously when the constraints on the two
125    operands are identical there's no benefit to the reloader in any "%" at
126    all.
127 
128    */
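/* Illustrative sketch (typical usage assumed for the example, not part of
   the interface): with the macros above, a two-word by one-word product
   {n1,n0} * m can be formed as

     UWtype p2, p1, p0, hi;
     umul_ppmm (p1, p0, n0, m);		    p1:p0 = n0 * m
     umul_ppmm (p2, hi, n1, m);		    p2:hi = n1 * m
     add_ssaaaa (p2, p1, p2, p1, 0, hi);    fold hi into the middle word

   leaving the three-word result in {p2, p1, p0}; no carry out of p2 can
   occur here, and any carry out of add_ssaaaa would otherwise be lost, as
   noted above.  */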
129 
130 /* The CPUs come in alphabetical order below.
131 
132    Please add support for more CPUs here, or improve the current support
133    for the CPUs below!  */
134 
135 
136 /* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
137    3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
138    Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
139    __builtin_ctzll.
140 
141    These builtins are only used where we have checked what code comes out;
142    on some chips they're merely libgcc calls, in which case we instead want
143    an inline (either asm or generic C).
144 
145    These builtins are better than an asm block of the same insn, since an
146    asm block doesn't give gcc any information about scheduling or resource
147    usage.  We keep an asm block for use on prior versions of gcc though.
148 
149    For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
150    it's not used (for count_leading_zeros) because it generally gives extra
151    code to ensure the result is 0 when the input is 0, which we don't need
152    or want.  */
153 
154 #ifdef _LONG_LONG_LIMB
155 #define count_leading_zeros_gcc_clz(count,x)	\
156   do {						\
157     ASSERT ((x) != 0);				\
158     (count) = __builtin_clzll (x);		\
159   } while (0)
160 #else
161 #define count_leading_zeros_gcc_clz(count,x)	\
162   do {						\
163     ASSERT ((x) != 0);				\
164     (count) = __builtin_clzl (x);		\
165   } while (0)
166 #endif
167 
168 #ifdef _LONG_LONG_LIMB
169 #define count_trailing_zeros_gcc_ctz(count,x)	\
170   do {						\
171     ASSERT ((x) != 0);				\
172     (count) = __builtin_ctzll (x);		\
173   } while (0)
174 #else
175 #define count_trailing_zeros_gcc_ctz(count,x)	\
176   do {						\
177     ASSERT ((x) != 0);				\
178     (count) = __builtin_ctzl (x);		\
179   } while (0)
180 #endif
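/* For example, on a 64-bit limb x = 0x00F0000000000000 (highest set bit 55,
   lowest set bit 52) the wrappers above give a leading zero count of 8 and a
   trailing zero count of 52, matching the count_leading_zeros and
   count_trailing_zeros specifications earlier in this file.  */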
181 
182 
183 /* FIXME: The macros using external routines like __MPN(count_leading_zeros)
184    don't need to be under !NO_ASM */
185 #if ! defined (NO_ASM)
186 
187 #if defined (__alpha) && W_TYPE_SIZE == 64
188 /* Most alpha-based machines, except Cray systems. */
189 #if defined (__GNUC__)
190 #if __GMP_GNUC_PREREQ (3,3)
191 #define umul_ppmm(ph, pl, m0, m1) \
192   do {									\
193     UDItype __m0 = (m0), __m1 = (m1);					\
194     (ph) = __builtin_alpha_umulh (__m0, __m1);				\
195     (pl) = __m0 * __m1;							\
196   } while (0)
197 #else
198 #define umul_ppmm(ph, pl, m0, m1) \
199   do {									\
200     UDItype __m0 = (m0), __m1 = (m1);					\
201     __asm__ ("umulh %r1,%2,%0"						\
202 	     : "=r" (ph)						\
203 	     : "%rJ" (__m0), "rI" (__m1));				\
204     (pl) = __m0 * __m1;							\
205   } while (0)
206 #endif
207 #else /* ! __GNUC__ */
208 #include <machine/builtins.h>
209 #define umul_ppmm(ph, pl, m0, m1) \
210   do {									\
211     UDItype __m0 = (m0), __m1 = (m1);					\
212     (ph) = __UMULH (__m0, __m1);					\
213     (pl) = __m0 * __m1;							\
214   } while (0)
215 #endif
216 #ifndef LONGLONG_STANDALONE
217 #define udiv_qrnnd(q, r, n1, n0, d) \
218   do { UWtype __di;							\
219     __di = __MPN(invert_limb) (d);					\
220     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
221   } while (0)
222 #define UDIV_PREINV_ALWAYS  1
223 #define UDIV_NEEDS_NORMALIZATION 1
224 #endif /* LONGLONG_STANDALONE */
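/* A sketch of the preinv idea used here and in several blocks below (the
   real __MPN(invert_limb) and udiv_qrnnd_preinv are defined elsewhere in
   GMP): for a normalized divisor d (most significant bit set), invert_limb
   returns

     di = floor ((B*B - 1) / d) - B,   with B = 2^W_TYPE_SIZE,

   and udiv_qrnnd_preinv then derives the quotient from a umul_ppmm by di
   plus a few add/adjust steps, avoiding a hardware divide instruction.  */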
225 
226 /* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
227    always goes into libgmp.so, even when not actually used.  */
228 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
229 
230 #if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
231 #define count_leading_zeros(COUNT,X) \
232   __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
233 #define count_trailing_zeros(COUNT,X) \
234   __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
235 #endif /* clz/ctz using cix */
236 
237 #if ! defined (count_leading_zeros)				\
238   && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
239 /* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
240    "$31" is written explicitly in the asm, since an "r" constraint won't
241    select reg 31.  There seems no need to worry about "r31" syntax for cray,
242    since gcc itself (pre-release 3.4) emits just $31 in various places.	 */
243 #define ALPHA_CMPBGE_0(dst, src)					\
244   do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
245 /* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
246    them, locating the highest non-zero byte.  A second __clz_tab lookup
247    counts the leading zero bits in that byte, giving the result.  */
248 #define count_leading_zeros(count, x)					\
249   do {									\
250     UWtype  __clz__b, __clz__c, __clz__x = (x);				\
251     ALPHA_CMPBGE_0 (__clz__b,  __clz__x);	    /* zero bytes */	\
252     __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F];  /* 8 to 1 byte */	\
253     __clz__b = __clz__b * 8 - 7;		    /* 57 to 1 shift */ \
254     __clz__x >>= __clz__b;						\
255     __clz__c = __clz_tab [__clz__x];		    /* 8 to 1 bit */	\
256     __clz__b = 65 - __clz__b;						\
257     (count) = __clz__b - __clz__c;					\
258   } while (0)
259 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
260 #endif /* clz using cmpbge */
261 
262 #if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
263 #if HAVE_ATTRIBUTE_CONST
264 long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const));
265 #else
266 long __MPN(count_leading_zeros) (UDItype);
267 #endif
268 #define count_leading_zeros(count, x) \
269   ((count) = __MPN(count_leading_zeros) (x))
270 #endif /* clz using mpn */
271 #endif /* __alpha */
272 
273 #if defined (__AVR) && W_TYPE_SIZE == 8
274 #define umul_ppmm(ph, pl, m0, m1) \
275   do {									\
276     unsigned short __p = (unsigned short) (m0) * (m1);			\
277     (ph) = __p >> 8;							\
278     (pl) = __p;								\
279   } while (0)
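/* For example, with the 8-bit limbs used here, m0 == 200 and m1 == 100 give
   the 16-bit product 20000 == 0x4e20, hence ph == 0x4e and pl == 0x20.  */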
280 #endif /* AVR */
281 
282 #if defined (_CRAY) && W_TYPE_SIZE == 64
283 #include <intrinsics.h>
284 #define UDIV_PREINV_ALWAYS  1
285 #define UDIV_NEEDS_NORMALIZATION 1
286 long __MPN(count_leading_zeros) (UDItype);
287 #define count_leading_zeros(count, x) \
288   ((count) = _leadz ((UWtype) (x)))
289 #if defined (_CRAYIEEE)		/* I.e., Cray T90/ieee, T3D, and T3E */
290 #define umul_ppmm(ph, pl, m0, m1) \
291   do {									\
292     UDItype __m0 = (m0), __m1 = (m1);					\
293     (ph) = _int_mult_upper (__m0, __m1);				\
294     (pl) = __m0 * __m1;							\
295   } while (0)
296 #ifndef LONGLONG_STANDALONE
297 #define udiv_qrnnd(q, r, n1, n0, d) \
298   do { UWtype __di;							\
299     __di = __MPN(invert_limb) (d);					\
300     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
301   } while (0)
302 #endif /* LONGLONG_STANDALONE */
303 #endif /* _CRAYIEEE */
304 #endif /* _CRAY */
305 
306 #if defined (__ia64) && W_TYPE_SIZE == 64
307 /* This form encourages gcc (pre-release 3.4 at least) to emit predicated
308    "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
309    code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
310    register, which takes an extra cycle.  */
311 #define sub_ddmmss(sh, sl, ah, al, bh, bl)      \
312   do {						\
313     UWtype __x;					\
314     __x = (al) - (bl);				\
315     if ((al) < (bl))				\
316       (sh) = (ah) - (bh) - 1;			\
317     else					\
318       (sh) = (ah) - (bh);			\
319     (sl) = __x;					\
320   } while (0)
321 #if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
322 /* Do both product parts in assembly, since that gives better code with
323    all gcc versions.  Some callers will just use the upper part, and in
324    that situation we waste an instruction, but not any cycles.  */
325 #define umul_ppmm(ph, pl, m0, m1) \
326     __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0"		\
327 	     : "=&f" (ph), "=f" (pl)					\
328 	     : "f" (m0), "f" (m1))
329 #define count_leading_zeros(count, x) \
330   do {									\
331     UWtype _x = (x), _y, _a, _c;					\
332     __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x));		\
333     __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y));		\
334     _c = (_a - 1) << 3;							\
335     _x >>= _c;								\
336     if (_x >= 1 << 4)							\
337       _x >>= 4, _c += 4;						\
338     if (_x >= 1 << 2)							\
339       _x >>= 2, _c += 2;						\
340     _c += _x >> 1;							\
341     (count) =  W_TYPE_SIZE - 1 - _c;					\
342   } while (0)
343 /* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
344    based, and we don't need a special case for x==0 here */
345 #define count_trailing_zeros(count, x)					\
346   do {									\
347     UWtype __ctz_x = (x);						\
348     __asm__ ("popcnt %0 = %1"						\
349 	     : "=r" (count)						\
350 	     : "r" ((__ctz_x-1) & ~__ctz_x));				\
351   } while (0)
352 #endif
353 #if defined (__INTEL_COMPILER)
354 #include <ia64intrin.h>
355 #define umul_ppmm(ph, pl, m0, m1)					\
356   do {									\
357     UWtype __m0 = (m0), __m1 = (m1);					\
358     ph = _m64_xmahu (__m0, __m1, 0);					\
359     pl = __m0 * __m1;							\
360   } while (0)
361 #endif
362 #ifndef LONGLONG_STANDALONE
363 #define udiv_qrnnd(q, r, n1, n0, d) \
364   do { UWtype __di;							\
365     __di = __MPN(invert_limb) (d);					\
366     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
367   } while (0)
368 #define UDIV_PREINV_ALWAYS  1
369 #define UDIV_NEEDS_NORMALIZATION 1
370 #endif
371 #endif
372 
373 
374 #if defined (__GNUC__)
375 
376 /* We sometimes need to clobber "cc" with gcc2, but that would not be
377    understood by gcc1.  Use cpp to avoid major code duplication.  */
378 #if __GNUC__ < 2
379 #define __CLOBBER_CC
380 #define __AND_CLOBBER_CC
381 #else /* __GNUC__ >= 2 */
382 #define __CLOBBER_CC : "cc"
383 #define __AND_CLOBBER_CC , "cc"
384 #endif /* __GNUC__ < 2 */
385 
386 #if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
387 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
388   __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3"				\
389 	   : "=r" (sh), "=&r" (sl)					\
390 	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
391 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
392   __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3"				\
393 	   : "=r" (sh), "=&r" (sl)					\
394 	   : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
395 #define umul_ppmm(xh, xl, m0, m1) \
396   do {									\
397     USItype __m0 = (m0), __m1 = (m1);					\
398     __asm__ ("multiplu %0,%1,%2"					\
399 	     : "=r" (xl)						\
400 	     : "r" (__m0), "r" (__m1));					\
401     __asm__ ("multmu %0,%1,%2"						\
402 	     : "=r" (xh)						\
403 	     : "r" (__m0), "r" (__m1));					\
404   } while (0)
405 #define udiv_qrnnd(q, r, n1, n0, d) \
406   __asm__ ("dividu %0,%3,%4"						\
407 	   : "=r" (q), "=q" (r)						\
408 	   : "1" (n1), "r" (n0), "r" (d))
409 #define count_leading_zeros(count, x) \
410     __asm__ ("clz %0,%1"						\
411 	     : "=r" (count)						\
412 	     : "r" (x))
413 #define COUNT_LEADING_ZEROS_0 32
414 #endif /* __a29k__ */
415 
416 #if defined (__arc__)
417 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
418   __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3"			\
419 	   : "=r" (sh),							\
420 	     "=&r" (sl)							\
421 	   : "r"  ((USItype) (ah)),					\
422 	     "rICal" ((USItype) (bh)),					\
423 	     "%r" ((USItype) (al)),					\
424 	     "rICal" ((USItype) (bl)))
425 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
426   __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
427 	   : "=r" (sh),							\
428 	     "=&r" (sl)							\
429 	   : "r" ((USItype) (ah)),					\
430 	     "rICal" ((USItype) (bh)),					\
431 	     "r" ((USItype) (al)),					\
432 	     "rICal" ((USItype) (bl)))
433 #endif
434 
435 #if defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) \
436     && W_TYPE_SIZE == 32
437 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
438   do {									\
439     if (__builtin_constant_p (bl) && -(USItype)(bl) < (USItype)(bl))	\
440       __asm__ ("subs\t%1, %4, %5\n\tadc\t%0, %2, %3"			\
441 	   : "=r" (sh), "=&r" (sl)					\
442 	       : "r" (ah), "rI" (bh),					\
443 		 "%r" (al), "rI" (-(USItype)(bl)) __CLOBBER_CC);	\
444     else								\
445       __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3"			\
446 	   : "=r" (sh), "=&r" (sl)					\
447 	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC);	\
448   } while (0)
449 /* FIXME: Extend the immediate range for the low word by using both ADDS and
450    SUBS, since they set carry in the same way.  We need separate definitions
451    for thumb and non-thumb since thumb lacks RSC.  */
452 #if defined (__thumb__)
453 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
454   do {									\
455     if (__builtin_constant_p (ah) && __builtin_constant_p (bh)		\
456 	&& (ah) == (bh))						\
457       __asm__ ("subs\t%1, %2, %3\n\tsbc\t%0, %0, %0"			\
458 	       : "=r" (sh), "=r" (sl)					\
459 	       : "r" (al), "rI" (bl) __CLOBBER_CC);			\
460     else if (__builtin_constant_p (al))					\
461       __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3"			\
462 	       : "=r" (sh), "=&r" (sl)					\
463 	       : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
464     else if (__builtin_constant_p (bl))					\
465       __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
466 	       : "=r" (sh), "=&r" (sl)					\
467 	       : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
468     else								\
469       __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
470 	       : "=r" (sh), "=&r" (sl)					\
471 	       : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
472     } while (0)
473 #else
474 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
475   do {									\
476     if (__builtin_constant_p (ah) && __builtin_constant_p (bh)		\
477 	&& (ah) == (bh))						\
478       __asm__ ("subs\t%1, %2, %3\n\tsbc\t%0, %0, %0"			\
479 	       : "=r" (sh), "=r" (sl)					\
480 	       : "r" (al), "rI" (bl) __CLOBBER_CC);			\
481     else if (__builtin_constant_p (al))					\
482       {									\
483 	if (__builtin_constant_p (ah))					\
484 	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"		\
485 		   : "=r" (sh), "=&r" (sl)				\
486 		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
487 	else								\
488 	  __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3"		\
489 		   : "=r" (sh), "=&r" (sl)				\
490 		   : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
491       }									\
492     else if (__builtin_constant_p (ah))					\
493       {									\
494 	if (__builtin_constant_p (bl))					\
495 	  __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"		\
496 		   : "=r" (sh), "=&r" (sl)				\
497 		   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
498 	else								\
499 	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"		\
500 		   : "=r" (sh), "=&r" (sl)				\
501 		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
502       }									\
503     else if (__builtin_constant_p (bl))					\
504       __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
505 	       : "=r" (sh), "=&r" (sl)					\
506 	       : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
507     else								\
508       __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
509 	       : "=r" (sh), "=&r" (sl)					\
510 	       : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
511     } while (0)
512 #endif
513 #if defined (__ARM_ARCH_2__) || defined (__ARM_ARCH_2A__) \
514     || defined (__ARM_ARCH_3__)
515 #define umul_ppmm(xh, xl, a, b)						\
516   do {									\
517     register USItype __t0, __t1, __t2;					\
518     __asm__ ("%@ Inlined umul_ppmm\n"					\
519 	   "	mov	%2, %5, lsr #16\n"				\
520 	   "	mov	%0, %6, lsr #16\n"				\
521 	   "	bic	%3, %5, %2, lsl #16\n"				\
522 	   "	bic	%4, %6, %0, lsl #16\n"				\
523 	   "	mul	%1, %3, %4\n"					\
524 	   "	mul	%4, %2, %4\n"					\
525 	   "	mul	%3, %0, %3\n"					\
526 	   "	mul	%0, %2, %0\n"					\
527 	   "	adds	%3, %4, %3\n"					\
528 	   "	addcs	%0, %0, #65536\n"				\
529 	   "	adds	%1, %1, %3, lsl #16\n"				\
530 	   "	adc	%0, %0, %3, lsr #16"				\
531 	   : "=&r" ((USItype) (xh)), "=r" ((USItype) (xl)),		\
532 	     "=&r" (__t0), "=&r" (__t1), "=r" (__t2)			\
533 	   : "r" ((USItype) (a)), "r" ((USItype) (b)) __CLOBBER_CC);	\
534   } while (0)
535 #ifndef LONGLONG_STANDALONE
536 #define udiv_qrnnd(q, r, n1, n0, d) \
537   do { UWtype __r;							\
538     (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));			\
539     (r) = __r;								\
540   } while (0)
541 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
542 #endif /* LONGLONG_STANDALONE */
543 #else /* ARMv4 or newer */
544 #define umul_ppmm(xh, xl, a, b) \
545   __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
546 #define smul_ppmm(xh, xl, a, b) \
547   __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
548 #ifndef LONGLONG_STANDALONE
549 #define udiv_qrnnd(q, r, n1, n0, d) \
550   do { UWtype __di;							\
551     __di = __MPN(invert_limb) (d);					\
552     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
553   } while (0)
554 #define UDIV_PREINV_ALWAYS  1
555 #define UDIV_NEEDS_NORMALIZATION 1
556 #endif /* LONGLONG_STANDALONE */
557 #endif /* defined(__ARM_ARCH_2__) ... */
558 #define count_leading_zeros(count, x)  count_leading_zeros_gcc_clz(count, x)
559 #define count_trailing_zeros(count, x)  count_trailing_zeros_gcc_ctz(count, x)
560 #endif /* __arm__ */
561 
562 #if defined (__aarch64__) && W_TYPE_SIZE == 64
563 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
564   do {									\
565     if (__builtin_constant_p (bl) && ~(UDItype)(bl) <= (UDItype)(bl))	\
566       __asm__ ("subs\t%1, %x4, %5\n\tadc\t%0, %x2, %x3"			\
567 	       : "=r" (sh), "=&r" (sl)					\
568 	       : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)),		\
569 		 "%r" ((UDItype)(al)), "rI" (-(UDItype)(bl)) __CLOBBER_CC);\
570     else								\
571       __asm__ ("adds\t%1, %x4, %5\n\tadc\t%0, %x2, %x3"			\
572 	       : "=r" (sh), "=&r" (sl)					\
573 	       : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)),		\
574 		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC);\
575   } while (0)
576 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
577   do {									\
578     if (__builtin_constant_p (bl) && ~(UDItype)(bl) <= (UDItype)(bl))	\
579       __asm__ ("adds\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3"			\
580 	       : "=r,r" (sh), "=&r,&r" (sl)				\
581 	       : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)),	\
582 		 "r,Z"   ((UDItype)(al)), "rI,r" (-(UDItype)(bl)) __CLOBBER_CC);\
583     else								\
584       __asm__ ("subs\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3"			\
585 	       : "=r,r" (sh), "=&r,&r" (sl)				\
586 	       : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)),	\
587 		 "r,Z"   ((UDItype)(al)), "rI,r"  ((UDItype)(bl)) __CLOBBER_CC);\
588   } while (0)
589 #if __GMP_GNUC_PREREQ (4,9)
590 #define umul_ppmm(w1, w0, u, v) \
591   do {									\
592     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
593     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
594     w1 = __ll >> 64;							\
595     w0 = __ll;								\
596   } while (0)
597 #endif
598 #if !defined (umul_ppmm)
599 #define umul_ppmm(ph, pl, m0, m1) \
600   do {									\
601     UDItype __m0 = (m0), __m1 = (m1);					\
602     __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (__m0), "r" (__m1));	\
603     (pl) = __m0 * __m1;							\
604   } while (0)
605 #endif
606 #define count_leading_zeros(count, x)  count_leading_zeros_gcc_clz(count, x)
607 #define count_trailing_zeros(count, x)  count_trailing_zeros_gcc_ctz(count, x)
608 #endif /* __aarch64__ */
609 
610 #if defined (__clipper__) && W_TYPE_SIZE == 32
611 #define umul_ppmm(w1, w0, u, v) \
612   ({union {UDItype __ll;						\
613 	   struct {USItype __l, __h;} __i;				\
614 	  } __x;							\
615   __asm__ ("mulwux %2,%0"						\
616 	   : "=r" (__x.__ll)						\
617 	   : "%0" ((USItype)(u)), "r" ((USItype)(v)));			\
618   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
619 #define smul_ppmm(w1, w0, u, v) \
620   ({union {DItype __ll;							\
621 	   struct {SItype __l, __h;} __i;				\
622 	  } __x;							\
623   __asm__ ("mulwx %2,%0"						\
624 	   : "=r" (__x.__ll)						\
625 	   : "%0" ((SItype)(u)), "r" ((SItype)(v)));			\
626   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
627 #define __umulsidi3(u, v) \
628   ({UDItype __w;							\
629     __asm__ ("mulwux %2,%0"						\
630 	     : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v)));	\
631     __w; })
632 #endif /* __clipper__ */
633 
634 /* Fujitsu vector computers.  */
635 #if defined (__uxp__) && W_TYPE_SIZE == 32
636 #define umul_ppmm(ph, pl, u, v) \
637   do {									\
638     union {UDItype __ll;						\
639 	   struct {USItype __h, __l;} __i;				\
640 	  } __x;							\
641     __asm__ ("mult.lu %1,%2,%0"	: "=r" (__x.__ll) : "%r" (u), "rK" (v));\
642     (ph) = __x.__i.__h;							\
643     (pl) = __x.__i.__l;							\
644   } while (0)
645 #define smul_ppmm(ph, pl, u, v) \
646   do {									\
647     union {UDItype __ll;						\
648 	   struct {USItype __h, __l;} __i;				\
649 	  } __x;							\
650     __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));	\
651     (ph) = __x.__i.__h;							\
652     (pl) = __x.__i.__l;							\
653   } while (0)
654 #endif
655 
656 #if defined (__gmicro__) && W_TYPE_SIZE == 32
657 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
658   __asm__ ("add.w %5,%1\n\taddx %3,%0"					\
659 	   : "=g" (sh), "=&g" (sl)					\
660 	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
661 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
662 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
663   __asm__ ("sub.w %5,%1\n\tsubx %3,%0"					\
664 	   : "=g" (sh), "=&g" (sl)					\
665 	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
666 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
667 #define umul_ppmm(ph, pl, m0, m1) \
668   __asm__ ("mulx %3,%0,%1"						\
669 	   : "=g" (ph), "=r" (pl)					\
670 	   : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
671 #define udiv_qrnnd(q, r, nh, nl, d) \
672   __asm__ ("divx %4,%0,%1"						\
673 	   : "=g" (q), "=r" (r)						\
674 	   : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
675 #define count_leading_zeros(count, x) \
676   __asm__ ("bsch/1 %1,%0"						\
677 	   : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
678 #endif
679 
680 #if defined (__hppa) && W_TYPE_SIZE == 32
681 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
682   __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0"			\
683 	   : "=r" (sh), "=&r" (sl)					\
684 	   : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
685 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
686   __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0"			\
687 	   : "=r" (sh), "=&r" (sl)					\
688 	   : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
689 #if defined (_PA_RISC1_1)
690 #define umul_ppmm(wh, wl, u, v) \
691   do {									\
692     union {UDItype __ll;						\
693 	   struct {USItype __h, __l;} __i;				\
694 	  } __x;							\
695     __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v));	\
696     (wh) = __x.__i.__h;							\
697     (wl) = __x.__i.__l;							\
698   } while (0)
699 #endif
700 #define count_leading_zeros(count, x) \
701   do {									\
702     USItype __tmp;							\
703     __asm__ (								\
704        "ldi		1,%0\n"						\
705 "	extru,=		%1,15,16,%%r0	; Bits 31..16 zero?\n"		\
706 "	extru,tr	%1,15,16,%1	; No.  Shift down, skip add.\n"	\
707 "	ldo		16(%0),%0	; Yes.  Perform add.\n"		\
708 "	extru,=		%1,23,8,%%r0	; Bits 15..8 zero?\n"		\
709 "	extru,tr	%1,23,8,%1	; No.  Shift down, skip add.\n"	\
710 "	ldo		8(%0),%0	; Yes.  Perform add.\n"		\
711 "	extru,=		%1,27,4,%%r0	; Bits 7..4 zero?\n"		\
712 "	extru,tr	%1,27,4,%1	; No.  Shift down, skip add.\n"	\
713 "	ldo		4(%0),%0	; Yes.  Perform add.\n"		\
714 "	extru,=		%1,29,2,%%r0	; Bits 3..2 zero?\n"		\
715 "	extru,tr	%1,29,2,%1	; No.  Shift down, skip add.\n"	\
716 "	ldo		2(%0),%0	; Yes.  Perform add.\n"		\
717 "	extru		%1,30,1,%1	; Extract bit 1.\n"		\
718 "	sub		%0,%1,%0	; Subtract it.\n"		\
719 	: "=r" (count), "=r" (__tmp) : "1" (x));			\
720   } while (0)
721 #endif /* hppa */
722 
723 /* These macros are for ABI=2.0w.  In ABI=2.0n they can't be used, since GCC
724    (3.2) puts a long long into two adjacent 32-bit registers.  Presumably this
725    is just a case of no direct support for 2.0n but treating it like 1.0. */
726 #if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
727 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
728   __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0"			\
729 	   : "=r" (sh), "=&r" (sl)					\
730 	   : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
731 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
732   __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0"			\
733 	   : "=r" (sh), "=&r" (sl)					\
734 	   : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
735 #endif /* hppa */
736 
737 #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
738 #if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch)
739 #define add_ssaaaa(sh, sl, ah, al, bh, bl)				\
740   do {									\
741 /*  if (__builtin_constant_p (bl))					\
742       __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3"				\
743 	       : "=r" (sh), "=&r" (sl)					\
744 	       : "0"  (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\
745     else								\
746 */    __asm__ ("alr\t%1,%5\n\talcr\t%0,%3"				\
747 	       : "=r" (sh), "=&r" (sl)					\
748 	       : "0"  (ah), "r" (bh), "%1" (al), "r" (bl)__CLOBBER_CC);	\
749   } while (0)
750 #define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
751   do {									\
752 /*  if (__builtin_constant_p (bl))					\
753       __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3"				\
754 	       : "=r" (sh), "=&r" (sl)					\
755 	       : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC);	\
756     else								\
757 */    __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3"				\
758 	       : "=r" (sh), "=&r" (sl)					\
759 	       : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC);	\
760   } while (0)
761 #if __GMP_GNUC_PREREQ (4,5)
762 #define umul_ppmm(xh, xl, m0, m1)					\
763   do {									\
764     union {UDItype __ll;						\
765 	   struct {USItype __h, __l;} __i;				\
766 	  } __x;							\
767     __x.__ll = (UDItype) (m0) * (UDItype) (m1);				\
768     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
769   } while (0)
770 #else
771 #if 0
772 /* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
773    with a new enough processor pretending we have 32-bit registers.  */
774 #define umul_ppmm(xh, xl, m0, m1)					\
775   do {									\
776     union {UDItype __ll;						\
777 	   struct {USItype __h, __l;} __i;				\
778 	  } __x;							\
779     __asm__ ("mlr\t%0,%2"						\
780 	     : "=r" (__x.__ll)						\
781 	     : "%0" (m0), "r" (m1));					\
782     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
783   } while (0)
784 #else
785 #define umul_ppmm(xh, xl, m0, m1)					\
786   do {									\
787   /* When we have 64-bit regs and gcc is aware of that, we cannot simply use
788      DImode for the product, since that would be allocated to a single 64-bit
789      register, whereas mlr uses the low 32-bits of an even-odd register pair.
790   */									\
791     register USItype __r0 __asm__ ("0");				\
792     register USItype __r1 __asm__ ("1") = (m0);				\
793     __asm__ ("mlr\t%0,%3"						\
794 	     : "=r" (__r0), "=r" (__r1)					\
795 	     : "r" (__r1), "r" (m1));					\
796     (xh) = __r0; (xl) = __r1;						\
797   } while (0)
798 #endif /* if 0 */
799 #endif
800 #if 0
801 /* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
802    with a new enough processor pretending we have 32-bit registers.  */
803 #define udiv_qrnnd(q, r, n1, n0, d)					\
804   do {									\
805     union {UDItype __ll;						\
806 	   struct {USItype __h, __l;} __i;				\
807 	  } __x;							\
808     __x.__i.__h = n1; __x.__i.__l = n0;					\
809     __asm__ ("dlr\t%0,%2"						\
810 	     : "=r" (__x.__ll)						\
811 	     : "0" (__x.__ll), "r" (d));				\
812     (q) = __x.__i.__l; (r) = __x.__i.__h;				\
813   } while (0)
814 #else
815 #define udiv_qrnnd(q, r, n1, n0, d)					\
816   do {									\
817     register USItype __r0 __asm__ ("0") = (n1);				\
818     register USItype __r1 __asm__ ("1") = (n0);				\
819     __asm__ ("dlr\t%0,%4"						\
820 	     : "=r" (__r0), "=r" (__r1)					\
821 	     : "r" (__r0), "r" (__r1), "r" (d));			\
822     (q) = __r1; (r) = __r0;						\
823   } while (0)
824 #endif /* if 0 */
825 #else /* if __zarch__ */
826 /* FIXME: this fails if gcc knows about the 64-bit registers.  */
827 #define smul_ppmm(xh, xl, m0, m1)					\
828   do {									\
829     union {DItype __ll;							\
830 	   struct {USItype __h, __l;} __i;				\
831 	  } __x;							\
832     __asm__ ("mr\t%0,%2"						\
833 	     : "=r" (__x.__ll)						\
834 	     : "%0" (m0), "r" (m1));					\
835     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
836   } while (0)
837 /* FIXME: this fails if gcc knows about the 64-bit registers.  */
838 #define sdiv_qrnnd(q, r, n1, n0, d)					\
839   do {									\
840     union {DItype __ll;							\
841 	   struct {USItype __h, __l;} __i;				\
842 	  } __x;							\
843     __x.__i.__h = n1; __x.__i.__l = n0;					\
844     __asm__ ("dr\t%0,%2"						\
845 	     : "=r" (__x.__ll)						\
846 	     : "0" (__x.__ll), "r" (d));				\
847     (q) = __x.__i.__l; (r) = __x.__i.__h;				\
848   } while (0)
849 #endif /* if __zarch__ */
850 #endif
851 
852 #if defined (__s390x__) && W_TYPE_SIZE == 64
853 /* We need to cast operands with register constraints, otherwise their types
854    will be assumed to be SImode by gcc.  For these machines, such operations
855    will insert a value into the low 32 bits, and leave the high 32 bits with
856    garbage.  */
857 #define add_ssaaaa(sh, sl, ah, al, bh, bl)				\
858   do {									\
859     __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3"				\
860 	       : "=r" (sh), "=&r" (sl)					\
861 	       : "0"  ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
862 		 "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
863   } while (0)
864 #define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
865   do {									\
866     __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3"				\
867 	     : "=r" (sh), "=&r" (sl)					\
868 	     : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
869 	       "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC);	\
870   } while (0)
871 #define umul_ppmm(xh, xl, m0, m1)					\
872   do {									\
873     union {unsigned int __attribute__ ((mode(TI))) __ll;		\
874 	   struct {UDItype __h, __l;} __i;				\
875 	  } __x;							\
876     __asm__ ("mlgr\t%0,%2"						\
877 	     : "=r" (__x.__ll)						\
878 	     : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1)));		\
879     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
880   } while (0)
881 #define udiv_qrnnd(q, r, n1, n0, d)					\
882   do {									\
883     union {unsigned int __attribute__ ((mode(TI))) __ll;		\
884 	   struct {UDItype __h, __l;} __i;				\
885 	  } __x;							\
886     __x.__i.__h = n1; __x.__i.__l = n0;					\
887     __asm__ ("dlgr\t%0,%2"						\
888 	     : "=r" (__x.__ll)						\
889 	     : "0" (__x.__ll), "r" ((UDItype)(d)));			\
890     (q) = __x.__i.__l; (r) = __x.__i.__h;				\
891   } while (0)
892 #if 0 /* FIXME: Enable for z10 (?) */
893 #define count_leading_zeros(cnt, x)					\
894   do {									\
895     union {unsigned int __attribute__ ((mode(TI))) __ll;		\
896 	   struct {UDItype __h, __l;} __i;				\
897 	  } __clr_cnt;							\
898     __asm__ ("flogr\t%0,%1"						\
899 	     : "=r" (__clr_cnt.__ll)					\
900 	     : "r" (x) __CLOBBER_CC);					\
901     (cnt) = __clr_cnt.__i.__h;						\
902   } while (0)
903 #endif
904 #endif
905 
906 /* On x86 and x86_64, every asm implicitly clobbers "flags" and "fpsr",
907    so we don't need __CLOBBER_CC.  */
908 #if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
909 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
910   __asm__ ("addl %5,%k1\n\tadcl %3,%k0"					\
911 	   : "=r" (sh), "=&r" (sl)					\
912 	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
913 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
914 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
915   __asm__ ("subl %5,%k1\n\tsbbl %3,%k0"					\
916 	   : "=r" (sh), "=&r" (sl)					\
917 	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
918 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
919 #define umul_ppmm(w1, w0, u, v) \
920   __asm__ ("mull %3"							\
921 	   : "=a" (w0), "=d" (w1)					\
922 	   : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
923 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
924   __asm__ ("divl %4"		     /* stringification in K&R C */	\
925 	   : "=a" (q), "=d" (r)						\
926 	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
927 
928 #if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
929 /* Pentium bsrl takes between 10 and 72 cycles depending where the most
930    significant 1 bit is, hence the use of the following alternatives.  bsfl
931    is slow too, between 18 and 42 depending where the least significant 1
932    bit is, so let the generic count_trailing_zeros below make use of the
933    count_leading_zeros here too.  */
934 
935 #if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
936 /* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
937    cache miss reading from __clz_tab.  For P55 it's favoured over the float
938    below so as to avoid mixing MMX and x87, since the penalty for switching
939    between the two is about 100 cycles.
940 
941    The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
942    16, -1 for 8, or 0 otherwise.  This could be written equivalently as
943    follows, but as of gcc 2.95.2 it results in conditional jumps.
944 
945        __shift = -(__n < 0x1000000);
946        __shift -= (__n < 0x10000);
947        __shift -= (__n < 0x100);
948 
949    The middle two sbbl and cmpl's pair, and with luck something gcc
950    generates might pair with the first cmpl and the last sbbl.  The "32+1"
951    constant could be folded into __clz_tab[], but it doesn't seem worth
952    making a different table just for that.  */
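/* Worked example, assuming (as in GMP's mp_clz_tab.c) that __clz_tab[i] is
   floor(log2(i)) + 2 for i >= 1: for __n = 0x4000 the three compares leave
   __shift = -2, so __shift becomes -2*8 + 24 + 1 = 9, __n >> 9 = 0x20,
   __clz_tab[0x20] = 7, and (c) = 32 + 1 - 9 - 7 = 17, the correct leading
   zero count for 0x4000.  */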
953 
954 #define count_leading_zeros(c,n)					\
955   do {									\
956     USItype  __n = (n);							\
957     USItype  __shift;							\
958     __asm__ ("cmpl  $0x1000000, %1\n"					\
959 	     "sbbl  %0, %0\n"						\
960 	     "cmpl  $0x10000, %1\n"					\
961 	     "sbbl  $0, %0\n"						\
962 	     "cmpl  $0x100, %1\n"					\
963 	     "sbbl  $0, %0\n"						\
964 	     : "=&r" (__shift) : "r"  (__n));				\
965     __shift = __shift*8 + 24 + 1;					\
966     (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift];			\
967   } while (0)
968 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
969 #define COUNT_LEADING_ZEROS_0   31   /* n==0 indistinguishable from n==1 */
970 
971 #else /* ! pentiummmx || LONGLONG_STANDALONE */
972 /* The following should be a fixed 14 cycles or so.  Some scheduling
973    opportunities should be available between the float load/store too.  This
974    sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
975    apparently suggested by the Intel optimizing manual (don't know exactly
976    where).  gcc 2.95 or up will be best for this, so the "double" is
977    correctly aligned on the stack.  */
978 #define count_leading_zeros(c,n)					\
979   do {									\
980     union {								\
981       double    d;							\
982       unsigned  a[2];							\
983     } __u;								\
984     __u.d = (UWtype) (n);						\
985     (c) = 0x3FF + 31 - (__u.a[1] >> 20);				\
986   } while (0)
987 #define COUNT_LEADING_ZEROS_0   (0x3FF + 31)
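/* Why the constants above work: for nonzero n the conversion to double puts
   0x3FF + floor(log2(n)) in the IEEE exponent field, which on this
   little-endian target occupies bits 20..30 of __u.a[1], so
   0x3FF + 31 - (__u.a[1] >> 20) is 31 - floor(log2(n)), i.e. the leading
   zero count.  For n == 0 the exponent field is 0, giving the
   COUNT_LEADING_ZEROS_0 value above.  */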
988 #endif /* pentiummx */
989 
990 #else /* ! pentium */
991 
992 #if __GMP_GNUC_PREREQ (3,4)  /* using bsrl */
993 #define count_leading_zeros(count,x)  count_leading_zeros_gcc_clz(count,x)
994 #endif /* gcc clz */
995 
996 /* On P6, gcc prior to 3.0 generates a partial register stall for
997    __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
998    being 1 code byte smaller.  "31-__cbtmp" is a workaround, probably at the
999    cost of one extra instruction.  Do this for "i386" too, since that means
1000    generic x86.  */
1001 #if ! defined (count_leading_zeros) && __GNUC__ < 3			\
1002   && (HAVE_HOST_CPU_i386						\
1003       || HAVE_HOST_CPU_i686						\
1004       || HAVE_HOST_CPU_pentiumpro					\
1005       || HAVE_HOST_CPU_pentium2						\
1006       || HAVE_HOST_CPU_pentium3)
1007 #define count_leading_zeros(count, x)					\
1008   do {									\
1009     USItype __cbtmp;							\
1010     ASSERT ((x) != 0);							\
1011     __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));	\
1012     (count) = 31 - __cbtmp;						\
1013   } while (0)
1014 #endif /* gcc<3 asm bsrl */
1015 
1016 #ifndef count_leading_zeros
1017 #define count_leading_zeros(count, x)					\
1018   do {									\
1019     USItype __cbtmp;							\
1020     ASSERT ((x) != 0);							\
1021     __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));	\
1022     (count) = __cbtmp ^ 31;						\
1023   } while (0)
1024 #endif /* asm bsrl */
1025 
1026 #if __GMP_GNUC_PREREQ (3,4)  /* using bsfl */
1027 #define count_trailing_zeros(count,x)  count_trailing_zeros_gcc_ctz(count,x)
1028 #endif /* gcc ctz */
1029 
1030 #ifndef count_trailing_zeros
1031 #define count_trailing_zeros(count, x)					\
1032   do {									\
1033     ASSERT ((x) != 0);							\
1034     __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x)));	\
1035   } while (0)
1036 #endif /* asm bsfl */
1037 
1038 #endif /* ! pentium */
1039 
1040 #endif /* 80x86 */
1041 
1042 #if defined (__amd64__) && W_TYPE_SIZE == 64
1043 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1044   __asm__ ("addq %5,%q1\n\tadcq %3,%q0"					\
1045 	   : "=r" (sh), "=&r" (sl)					\
1046 	   : "0"  ((UDItype)(ah)), "rme" ((UDItype)(bh)),		\
1047 	     "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
1048 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1049   __asm__ ("subq %5,%q1\n\tsbbq %3,%q0"					\
1050 	   : "=r" (sh), "=&r" (sl)					\
1051 	   : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)),		\
1052 	     "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
1053 #if X86_ASM_MULX \
1054    && (HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell \
1055        || HAVE_HOST_CPU_skylake || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen)
1056 #define umul_ppmm(w1, w0, u, v) \
1057   __asm__ ("mulx\t%3, %q0, %q1"						\
1058 	   : "=r" (w0), "=r" (w1)					\
1059 	   : "%d" ((UDItype)(u)), "rm" ((UDItype)(v)))
1060 #else
1061 #define umul_ppmm(w1, w0, u, v) \
1062   __asm__ ("mulq\t%3"							\
1063 	   : "=a" (w0), "=d" (w1)					\
1064 	   : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
1065 #endif
1066 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
1067   __asm__ ("divq %4"		     /* stringification in K&R C */	\
1068 	   : "=a" (q), "=d" (r)						\
1069 	   : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
1070 
1071 #if HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell || HAVE_HOST_CPU_skylake \
1072   || HAVE_HOST_CPU_k10 || HAVE_HOST_CPU_bd1 || HAVE_HOST_CPU_bd2	\
1073   || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen	\
1074   || HAVE_HOST_CPU_bobcat || HAVE_HOST_CPU_jaguar
1075 #define count_leading_zeros(count, x)					\
1076   do {									\
1077     /* This is lzcnt, spelled for older assemblers.  Destination and */	\
1078     /* source must be 64-bit registers, hence cast and %q.           */	\
1079     __asm__ ("rep;bsr\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x)));	\
1080   } while (0)
1081 #define COUNT_LEADING_ZEROS_0 64
1082 #else
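/* Fall back on bsr: it leaves the bit index of the most significant set bit
   in __cbtmp, and since that index is in the range 0..63, xoring with 63 is
   the same as 63 - __cbtmp, i.e. the leading zero count.  */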
1083 #define count_leading_zeros(count, x)					\
1084   do {									\
1085     UDItype __cbtmp;							\
1086     ASSERT ((x) != 0);							\
1087     __asm__ ("bsr\t%1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x)));	\
1088     (count) = __cbtmp ^ 63;						\
1089   } while (0)
1090 #endif
1091 
1092 #if HAVE_HOST_CPU_bd2 || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 \
1093   || HAVE_HOST_CPU_zen || HAVE_HOST_CPU_jaguar
1094 #define count_trailing_zeros(count, x)					\
1095   do {									\
1096     /* This is tzcnt, spelled for older assemblers.  Destination and */	\
1097     /* source must be 64-bit registers, hence cast and %q.           */	\
1098     __asm__ ("rep;bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x)));	\
1099   } while (0)
1100 #define COUNT_TRAILING_ZEROS_0 64
1101 #else
1102 #define count_trailing_zeros(count, x)					\
1103   do {									\
1104     ASSERT ((x) != 0);							\
1105     __asm__ ("bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x)));	\
1106   } while (0)
1107 #endif
1108 #endif /* __amd64__ */
1109 
1110 #if defined (__i860__) && W_TYPE_SIZE == 32
1111 #define rshift_rhlc(r,h,l,c) \
1112   __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0"				\
1113 	   : "=r" (r) : "r" (h), "r" (l), "rn" (c))
1114 #endif /* i860 */
1115 
1116 #if defined (__i960__) && W_TYPE_SIZE == 32
1117 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1118   __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0"			\
1119 	   : "=r" (sh), "=&r" (sl)					\
1120 	   : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
1121 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1122   __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0"			\
1123 	   : "=r" (sh), "=&r" (sl)					\
1124 	   : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
1125 #define umul_ppmm(w1, w0, u, v) \
1126   ({union {UDItype __ll;						\
1127 	   struct {USItype __l, __h;} __i;				\
1128 	  } __x;							\
1129   __asm__ ("emul %2,%1,%0"						\
1130 	   : "=d" (__x.__ll) : "%dI" (u), "dI" (v));			\
1131   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1132 #define __umulsidi3(u, v) \
1133   ({UDItype __w;							\
1134     __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v));	\
1135     __w; })
1136 #define udiv_qrnnd(q, r, nh, nl, d) \
1137   do {									\
1138     union {UDItype __ll;						\
1139 	   struct {USItype __l, __h;} __i;				\
1140 	  } __nn, __rq;							\
1141     __nn.__i.__h = (nh); __nn.__i.__l = (nl);				\
1142     __asm__ ("ediv %d,%n,%0"						\
1143 	   : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d));		\
1144     (r) = __rq.__i.__l; (q) = __rq.__i.__h;				\
1145   } while (0)
1146 #define count_leading_zeros(count, x) \
1147   do {									\
1148     USItype __cbtmp;							\
1149     __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x));		\
1150     (count) = __cbtmp ^ 31;						\
1151   } while (0)
1152 #define COUNT_LEADING_ZEROS_0 (-32) /* sic */
1153 #if defined (__i960mx)		/* what is the proper symbol to test??? */
1154 #define rshift_rhlc(r,h,l,c) \
1155   do {									\
1156     union {UDItype __ll;						\
1157 	   struct {USItype __l, __h;} __i;				\
1158 	  } __nn;							\
1159     __nn.__i.__h = (h); __nn.__i.__l = (l);				\
1160     __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c));	\
1161   } while (0)
1162 #endif /* i960mx */
1163 #endif /* i960 */
1164 
1165 #if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
1166      || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
1167      || defined (__mc5307__)) && W_TYPE_SIZE == 32
1168 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1169   __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0"				\
1170 	   : "=d" (sh), "=&d" (sl)					\
1171 	   : "0"  ((USItype)(ah)), "d" ((USItype)(bh)),			\
1172 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1173 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1174   __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0"				\
1175 	   : "=d" (sh), "=&d" (sl)					\
1176 	   : "0" ((USItype)(ah)), "d" ((USItype)(bh)),			\
1177 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
1178 /* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r.  */
1179 #if defined (__mc68020__) || defined(mc68020) \
1180      || defined (__mc68030__) || defined (mc68030) \
1181      || defined (__mc68040__) || defined (mc68040) \
1182      || defined (__mcpu32__) || defined (mcpu32) \
1183      || defined (__NeXT__)
1184 #define umul_ppmm(w1, w0, u, v) \
1185   __asm__ ("mulu%.l %3,%1:%0"						\
1186 	   : "=d" (w0), "=d" (w1)					\
1187 	   : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
1188 #define udiv_qrnnd(q, r, n1, n0, d) \
1189   __asm__ ("divu%.l %4,%1:%0"						\
1190 	   : "=d" (q), "=d" (r)						\
1191 	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1192 #define sdiv_qrnnd(q, r, n1, n0, d) \
1193   __asm__ ("divs%.l %4,%1:%0"						\
1194 	   : "=d" (q), "=d" (r)						\
1195 	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1196 #else /* for other 68k family members use 16x16->32 multiplication */
1197 #define umul_ppmm(xh, xl, a, b) \
1198   do { USItype __umul_tmp1, __umul_tmp2;				\
1199 	__asm__ ("| Inlined umul_ppmm\n"				\
1200 "	move%.l	%5,%3\n"						\
1201 "	move%.l	%2,%0\n"						\
1202 "	move%.w	%3,%1\n"						\
1203 "	swap	%3\n"							\
1204 "	swap	%0\n"							\
1205 "	mulu%.w	%2,%1\n"						\
1206 "	mulu%.w	%3,%0\n"						\
1207 "	mulu%.w	%2,%3\n"						\
1208 "	swap	%2\n"							\
1209 "	mulu%.w	%5,%2\n"						\
1210 "	add%.l	%3,%2\n"						\
1211 "	jcc	1f\n"							\
1212 "	add%.l	%#0x10000,%0\n"						\
1213 "1:	move%.l	%2,%3\n"						\
1214 "	clr%.w	%2\n"							\
1215 "	swap	%2\n"							\
1216 "	swap	%3\n"							\
1217 "	clr%.w	%3\n"							\
1218 "	add%.l	%3,%1\n"						\
1219 "	addx%.l	%2,%0\n"						\
1220 "	| End inlined umul_ppmm"					\
1221 	      : "=&d" (xh), "=&d" (xl),					\
1222 		"=d" (__umul_tmp1), "=&d" (__umul_tmp2)			\
1223 	      : "%2" ((USItype)(a)), "d" ((USItype)(b)));		\
1224   } while (0)
1225 #endif /* not mc68020 */
1226 /* The '020, '030, '040 and '060 have bitfield insns.
1227    GCC 3.4 defines __mc68020__ when in CPU32 mode; check for __mcpu32__ to
1228    exclude bfffo on that chip (bitfield insns not available).  */
1229 #if (defined (__mc68020__) || defined (mc68020)    \
1230      || defined (__mc68030__) || defined (mc68030) \
1231      || defined (__mc68040__) || defined (mc68040) \
1232      || defined (__mc68060__) || defined (mc68060) \
1233      || defined (__NeXT__))			   \
1234   && ! defined (__mcpu32__)
1235 #define count_leading_zeros(count, x) \
1236   __asm__ ("bfffo %1{%b2:%b2},%0"					\
1237 	   : "=d" (count)						\
1238 	   : "od" ((USItype) (x)), "n" (0))
1239 #define COUNT_LEADING_ZEROS_0 32
1240 #endif
1241 #endif /* mc68000 */
1242 
1243 #if defined (__m88000__) && W_TYPE_SIZE == 32
1244 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1245   __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3"			\
1246 	   : "=r" (sh), "=&r" (sl)					\
1247 	   : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
1248 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1249   __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3"			\
1250 	   : "=r" (sh), "=&r" (sl)					\
1251 	   : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
1252 #define count_leading_zeros(count, x) \
1253   do {									\
1254     USItype __cbtmp;							\
1255     __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x));			\
1256     (count) = __cbtmp ^ 31;						\
1257   } while (0)
1258 #define COUNT_LEADING_ZEROS_0 63 /* sic */
1259 #if defined (__m88110__)
1260 #define umul_ppmm(wh, wl, u, v) \
1261   do {									\
1262     union {UDItype __ll;						\
1263 	   struct {USItype __h, __l;} __i;				\
1264 	  } __x;							\
1265     __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v));	\
1266     (wh) = __x.__i.__h;							\
1267     (wl) = __x.__i.__l;							\
1268   } while (0)
1269 #define udiv_qrnnd(q, r, n1, n0, d) \
1270   ({union {UDItype __ll;						\
1271 	   struct {USItype __h, __l;} __i;				\
1272 	  } __x, __q;							\
1273   __x.__i.__h = (n1); __x.__i.__l = (n0);				\
1274   __asm__ ("divu.d %0,%1,%2"						\
1275 	   : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d));		\
1276   (r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; })
1277 #endif /* __m88110__ */
1278 #endif /* __m88000__ */
1279 
1280 #if defined (__mips) && W_TYPE_SIZE == 32
1281 #if __GMP_GNUC_PREREQ (4,4)
1282 #define umul_ppmm(w1, w0, u, v) \
1283   do {									\
1284     UDItype __ll = (UDItype)(u) * (v);					\
1285     w1 = __ll >> 32;							\
1286     w0 = __ll;								\
1287   } while (0)
1288 #endif
1289 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
1290 #define umul_ppmm(w1, w0, u, v) \
1291   __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1292 #endif
1293 #if !defined (umul_ppmm)
1294 #define umul_ppmm(w1, w0, u, v) \
1295   __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1"				\
1296 	   : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1297 #endif
1298 #endif /* __mips */
1299 
1300 #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
1301 #if defined (_MIPS_ARCH_MIPS64R6)
1302 #define umul_ppmm(w1, w0, u, v) \
1303   do {									\
1304     UDItype __m0 = (u), __m1 = (v);					\
1305     (w0) = __m0 * __m1;							\
1306     __asm__ ("dmuhu\t%0, %1, %2" : "=d" (w1) : "d" (__m0), "d" (__m1));	\
1307   } while (0)
1308 #endif
1309 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (4,4)
1310 #define umul_ppmm(w1, w0, u, v) \
1311   do {									\
1312     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
1313     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
1314     w1 = __ll >> 64;							\
1315     w0 = __ll;								\
1316   } while (0)
1317 #endif
1318 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
1319 #define umul_ppmm(w1, w0, u, v) \
1320   __asm__ ("dmultu %2,%3"						\
1321 	   : "=l" (w0), "=h" (w1)					\
1322 	   : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
1323 #endif
1324 #if !defined (umul_ppmm)
1325 #define umul_ppmm(w1, w0, u, v) \
1326   __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1"				\
1327 	   : "=d" (w0), "=d" (w1)					\
1328 	   : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
1329 #endif
1330 #endif /* __mips */
1331 
1332 #if defined (__mmix__) && W_TYPE_SIZE == 64
1333 #define umul_ppmm(w1, w0, u, v) \
1334   __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
1335 #endif
1336 
1337 #if defined (__ns32000__) && W_TYPE_SIZE == 32
1338 #define umul_ppmm(w1, w0, u, v) \
1339   ({union {UDItype __ll;						\
1340 	   struct {USItype __l, __h;} __i;				\
1341 	  } __x;							\
1342   __asm__ ("meid %2,%0"							\
1343 	   : "=g" (__x.__ll)						\
1344 	   : "%0" ((USItype)(u)), "g" ((USItype)(v)));			\
1345   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1346 #define __umulsidi3(u, v) \
1347   ({UDItype __w;							\
1348     __asm__ ("meid %2,%0"						\
1349 	     : "=g" (__w)						\
1350 	     : "%0" ((USItype)(u)), "g" ((USItype)(v)));		\
1351     __w; })
1352 #define udiv_qrnnd(q, r, n1, n0, d) \
1353   ({union {UDItype __ll;						\
1354 	   struct {USItype __l, __h;} __i;				\
1355 	  } __x;							\
1356   __x.__i.__h = (n1); __x.__i.__l = (n0);				\
1357   __asm__ ("deid %2,%0"							\
1358 	   : "=g" (__x.__ll)						\
1359 	   : "0" (__x.__ll), "g" ((USItype)(d)));			\
1360   (r) = __x.__i.__l; (q) = __x.__i.__h; })
1361 #define count_trailing_zeros(count,x) \
1362   do {									\
1363     __asm__ ("ffsd	%2,%0"						\
1364 	     : "=r" (count)						\
1365 	     : "0" ((USItype) 0), "r" ((USItype) (x)));			\
1366   } while (0)
1367 #endif /* __ns32000__ */
1368 
1369 /* In the past we had a block that tested various #defines:
1370        _ARCH_PPC    - AIX
1371        _ARCH_PWR    - AIX
1372        __powerpc__  - gcc
1373        __POWERPC__  - BEOS
1374        __ppc__      - Darwin
1375        PPC          - old gcc, GNU/Linux, SysV
1376    The plain PPC test was no good for vxWorks, since PPC is defined there
1377    on all CPUs (eg. m68k too), as a constant that CPU_FAMILY is expected
1378    to be compared against.
1379 
1380    At any rate, this was pretty unattractive and a bit fragile.  The use of
1381    HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
1382    getting the desired effect.
1383 
1384    ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
1385    the system vendor compilers.  (Is that vendor compilers with inline asm,
1386    or what?)  */
1387 
1388 #if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc)	\
1389   && W_TYPE_SIZE == 32
1390 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1391   do {									\
1392     if (__builtin_constant_p (bh) && (bh) == 0)				\
1393       __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"			\
1394 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)	\
1395 		 __CLOBBER_CC);						\
1396     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
1397       __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"			\
1398 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)	\
1399 		 __CLOBBER_CC);						\
1400     else								\
1401       __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"			\
1402 	       : "=r" (sh), "=&r" (sl)					\
1403 	       : "r" (ah), "r" (bh), "%r" (al), "rI" (bl)		\
1404 		 __CLOBBER_CC);						\
1405   } while (0)
1406 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1407   do {									\
1408     if (__builtin_constant_p (ah) && (ah) == 0)				\
1409       __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"			\
1410 	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)	\
1411 		 __CLOBBER_CC);						\
1412     else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)		\
1413       __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"			\
1414 	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)	\
1415 		 __CLOBBER_CC);						\
1416     else if (__builtin_constant_p (bh) && (bh) == 0)			\
1417       __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"			\
1418 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)	\
1419 		 __CLOBBER_CC);						\
1420     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
1421       __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"			\
1422 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)	\
1423 		 __CLOBBER_CC);						\
1424     else								\
1425       __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"			\
1426 	       : "=r" (sh), "=&r" (sl)					\
1427 	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl)		\
1428 		 __CLOBBER_CC);						\
1429   } while (0)
1430 #define count_leading_zeros(count, x) \
1431   __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x))
1432 #define COUNT_LEADING_ZEROS_0 32
1433 #if HAVE_HOST_CPU_FAMILY_powerpc
1434 #if __GMP_GNUC_PREREQ (4,4)
1435 #define umul_ppmm(w1, w0, u, v) \
1436   do {									\
1437     UDItype __ll = (UDItype)(u) * (v);					\
1438     w1 = __ll >> 32;							\
1439     w0 = __ll;								\
1440   } while (0)
1441 #endif
1442 #if !defined (umul_ppmm)
1443 #define umul_ppmm(ph, pl, m0, m1) \
1444   do {									\
1445     USItype __m0 = (m0), __m1 = (m1);					\
1446     __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
1447     (pl) = __m0 * __m1;							\
1448   } while (0)
1449 #endif
1450 #define smul_ppmm(ph, pl, m0, m1) \
1451   do {									\
1452     SItype __m0 = (m0), __m1 = (m1);					\
1453     __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
1454     (pl) = __m0 * __m1;							\
1455   } while (0)
1456 #else
1457 #define smul_ppmm(xh, xl, m0, m1) \
1458   __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
1459 #define sdiv_qrnnd(q, r, nh, nl, d) \
1460   __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
1461 #endif
1462 #endif /* 32-bit POWER architecture variants.  */
1463 
1464 /* We should test _IBMR2 here when we add assembly support for the system
1465    vendor compilers.  */
1466 #if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
1467 #if !defined (_LONG_LONG_LIMB)
1468 /* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values.  So
1469    use adde etc only when not _LONG_LONG_LIMB.  */
1470 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1471   do {									\
1472     if (__builtin_constant_p (bh) && (bh) == 0)				\
1473       __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"			\
1474 	       : "=r" (sh), "=&r" (sl)					\
1475 	       : "r"  ((UDItype)(ah)),					\
1476 		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl))		\
1477 		 __CLOBBER_CC);						\
1478     else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)		\
1479       __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"			\
1480 	       : "=r" (sh), "=&r" (sl)					\
1481 	       : "r"  ((UDItype)(ah)),					\
1482 		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl))		\
1483 		 __CLOBBER_CC);						\
1484     else								\
1485       __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"			\
1486 	       : "=r" (sh), "=&r" (sl)					\
1487 	       : "r"  ((UDItype)(ah)), "r"  ((UDItype)(bh)),		\
1488 		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl))		\
1489 		 __CLOBBER_CC);						\
1490   } while (0)
1491 /* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
1492    This might seem strange, but gcc folds away the dead code late.  */
1493 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1494   do {									\
1495     if (__builtin_constant_p (bl)					\
1496 	&& (bl) > -0x8000 && (bl) <= 0x8000 && (bl) != 0) {		\
1497 	if (__builtin_constant_p (ah) && (ah) == 0)			\
1498 	  __asm__ ("addic %1,%3,%4\n\tsubfze %0,%2"			\
1499 		   : "=r" (sh), "=&r" (sl)				\
1500 		   :                       "r" ((UDItype)(bh)),		\
1501 		     "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))	\
1502 		     __CLOBBER_CC);					\
1503 	else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)	\
1504 	  __asm__ ("addic %1,%3,%4\n\tsubfme %0,%2"			\
1505 		   : "=r" (sh), "=&r" (sl)				\
1506 		   :                       "r" ((UDItype)(bh)),		\
1507 		     "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))	\
1508 		     __CLOBBER_CC);					\
1509 	else if (__builtin_constant_p (bh) && (bh) == 0)		\
1510 	  __asm__ ("addic %1,%3,%4\n\taddme %0,%2"			\
1511 		   : "=r" (sh), "=&r" (sl)				\
1512 		   : "r" ((UDItype)(ah)),				\
1513 		     "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))	\
1514 		     __CLOBBER_CC);					\
1515 	else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)	\
1516 	  __asm__ ("addic %1,%3,%4\n\taddze %0,%2"			\
1517 		   : "=r" (sh), "=&r" (sl)				\
1518 		   : "r" ((UDItype)(ah)),				\
1519 		     "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))	\
1520 		     __CLOBBER_CC);					\
1521 	else								\
1522 	  __asm__ ("addic %1,%4,%5\n\tsubfe %0,%3,%2"			\
1523 		   : "=r" (sh), "=&r" (sl)				\
1524 		   : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
1525 		     "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))	\
1526 		     __CLOBBER_CC);					\
1527     } else {								\
1528 	if (__builtin_constant_p (ah) && (ah) == 0)			\
1529 	  __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"			\
1530 		   : "=r" (sh), "=&r" (sl)				\
1531 		   :                       "r" ((UDItype)(bh)),		\
1532 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))		\
1533 		     __CLOBBER_CC);					\
1534 	else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)	\
1535 	  __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"			\
1536 		   : "=r" (sh), "=&r" (sl)				\
1537 		   :                       "r" ((UDItype)(bh)),		\
1538 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))		\
1539 		     __CLOBBER_CC);					\
1540 	else if (__builtin_constant_p (bh) && (bh) == 0)		\
1541 	  __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"			\
1542 		   : "=r" (sh), "=&r" (sl)				\
1543 		   : "r"  ((UDItype)(ah)),				\
1544 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))		\
1545 		     __CLOBBER_CC);					\
1546 	else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)	\
1547 	  __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"			\
1548 		   : "=r" (sh), "=&r" (sl)				\
1549 		   : "r"  ((UDItype)(ah)),				\
1550 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))		\
1551 		     __CLOBBER_CC);					\
1552 	else								\
1553 	  __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"		\
1554 		   : "=r" (sh), "=&r" (sl)				\
1555 		   : "r"  ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
1556 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))		\
1557 		     __CLOBBER_CC);					\
1558     }									\
1559   } while (0)
1560 #endif /* ! _LONG_LONG_LIMB */
1561 #define count_leading_zeros(count, x) \
1562   __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
1563 #define COUNT_LEADING_ZEROS_0 64
1564 #if __GMP_GNUC_PREREQ (4,8)
1565 #define umul_ppmm(w1, w0, u, v) \
1566   do {									\
1567     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
1568     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
1569     w1 = __ll >> 64;							\
1570     w0 = __ll;								\
1571   } while (0)
1572 #endif
1573 #if !defined (umul_ppmm)
1574 #define umul_ppmm(ph, pl, m0, m1) \
1575   do {									\
1576     UDItype __m0 = (m0), __m1 = (m1);					\
1577     __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
1578     (pl) = __m0 * __m1;							\
1579   } while (0)
1580 #endif
1581 #define smul_ppmm(ph, pl, m0, m1) \
1582   do {									\
1583     DItype __m0 = (m0), __m1 = (m1);					\
1584     __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
1585     (pl) = __m0 * __m1;							\
1586   } while (0)
1587 #endif /* 64-bit PowerPC.  */
1588 
1589 #if defined (__pyr__) && W_TYPE_SIZE == 32
1590 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1591   __asm__ ("addw %5,%1\n\taddwc %3,%0"					\
1592 	   : "=r" (sh), "=&r" (sl)					\
1593 	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
1594 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1595 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1596   __asm__ ("subw %5,%1\n\tsubwb %3,%0"					\
1597 	   : "=r" (sh), "=&r" (sl)					\
1598 	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
1599 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
1600 /* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP.  */
1601 #define umul_ppmm(w1, w0, u, v) \
1602   ({union {UDItype __ll;						\
1603 	   struct {USItype __h, __l;} __i;				\
1604 	  } __x;							\
1605   __asm__ ("movw %1,%R0\n\tuemul %2,%0"					\
1606 	   : "=&r" (__x.__ll)						\
1607 	   : "g" ((USItype) (u)), "g" ((USItype)(v)));			\
1608   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1609 #endif /* __pyr__ */
1610 
1611 #if defined (__ibm032__) /* RT/ROMP */  && W_TYPE_SIZE == 32
1612 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1613   __asm__ ("a %1,%5\n\tae %0,%3"					\
1614 	   : "=r" (sh), "=&r" (sl)					\
1615 	   : "0"  ((USItype)(ah)), "r" ((USItype)(bh)),			\
1616 	     "%1" ((USItype)(al)), "r" ((USItype)(bl)))
1617 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1618   __asm__ ("s %1,%5\n\tse %0,%3"					\
1619 	   : "=r" (sh), "=&r" (sl)					\
1620 	   : "0" ((USItype)(ah)), "r" ((USItype)(bh)),			\
1621 	     "1" ((USItype)(al)), "r" ((USItype)(bl)))
1622 #define smul_ppmm(ph, pl, m0, m1) \
1623   __asm__ (								\
1624        "s	r2,r2\n"						\
1625 "	mts r10,%2\n"							\
1626 "	m	r2,%3\n"						\
1627 "	m	r2,%3\n"						\
1628 "	m	r2,%3\n"						\
1629 "	m	r2,%3\n"						\
1630 "	m	r2,%3\n"						\
1631 "	m	r2,%3\n"						\
1632 "	m	r2,%3\n"						\
1633 "	m	r2,%3\n"						\
1634 "	m	r2,%3\n"						\
1635 "	m	r2,%3\n"						\
1636 "	m	r2,%3\n"						\
1637 "	m	r2,%3\n"						\
1638 "	m	r2,%3\n"						\
1639 "	m	r2,%3\n"						\
1640 "	m	r2,%3\n"						\
1641 "	m	r2,%3\n"						\
1642 "	cas	%0,r2,r0\n"						\
1643 "	mfs	r10,%1"							\
1644 	   : "=r" (ph), "=r" (pl)					\
1645 	   : "%r" ((USItype)(m0)), "r" ((USItype)(m1))			\
1646 	   : "r2")
1647 #define count_leading_zeros(count, x) \
1648   do {									\
1649     if ((x) >= 0x10000)							\
1650       __asm__ ("clz	%0,%1"						\
1651 	       : "=r" (count) : "r" ((USItype)(x) >> 16));		\
1652     else								\
1653       {									\
1654 	__asm__ ("clz	%0,%1"						\
1655 		 : "=r" (count) : "r" ((USItype)(x)));			\
1656 	(count) += 16;							\
1657       }									\
1658   } while (0)
1659 #endif /* RT/ROMP */
1660 
1661 #if defined (__riscv64) && W_TYPE_SIZE == 64
1662 #define umul_ppmm(ph, pl, u, v) \
1663   do {									\
1664     UDItype __u = (u), __v = (v);					\
1665     (pl) = __u * __v;							\
1666     __asm__ ("mulhu\t%0, %1, %2" : "=r" (ph) : "%r" (__u), "r" (__v));	\
1667   } while (0)
1668 #endif
1669 
1670 #if (defined (__SH2__) || defined (__SH3__) || defined (__SH4__)) && W_TYPE_SIZE == 32
1671 #define umul_ppmm(w1, w0, u, v) \
1672   __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0"		\
1673 	   : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
1674 #endif
1675 
1676 #if defined (__sparc__) && W_TYPE_SIZE == 32
1677 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1678   __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0"				\
1679 	   : "=r" (sh), "=&r" (sl)					\
1680 	   : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl)			\
1681 	   __CLOBBER_CC)
1682 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1683   __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0"				\
1684 	   : "=r" (sh), "=&r" (sl)					\
1685 	   : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl)	\
1686 	   __CLOBBER_CC)
1687 /* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
1688    doesn't define anything to indicate that to us, it only sets __sparcv8. */
1689 #if defined (__sparc_v9__) || defined (__sparcv9)
1690 /* Perhaps we should use floating-point operations here?  */
1691 #if 0
1692 /* Triggers a bug making mpz/tests/t-gcd.c fail.
1693    Perhaps we simply need to explicitly zero-extend the inputs?  */
1694 #define umul_ppmm(w1, w0, u, v) \
1695   __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" :		\
1696 	   "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
1697 #else
1698 /* Use v8 umul until above bug is fixed.  */
1699 #define umul_ppmm(w1, w0, u, v) \
1700   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1701 #endif
1702 /* Use a plain v8 divide for v9.  */
1703 #define udiv_qrnnd(q, r, n1, n0, d) \
1704   do {									\
1705     USItype __q;							\
1706     __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"			\
1707 	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));		\
1708     (r) = (n0) - __q * (d);						\
1709     (q) = __q;								\
1710   } while (0)
1711 #else
1712 #if defined (__sparc_v8__)   /* gcc normal */				\
1713   || defined (__sparcv8)     /* gcc solaris */				\
1714   || HAVE_HOST_CPU_supersparc
1715 /* Don't match an immediate range because 1) it is not often useful, and
1716    2) the 'I' constraint treats the range as a 13 bit signed interval,
1717    while we want to match a 13 bit interval, sign extended to 32 bits,
1718    but INTERPRETED AS UNSIGNED.  */
1719 #define umul_ppmm(w1, w0, u, v) \
1720   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1721 
1722 #if HAVE_HOST_CPU_supersparc
1723 #else
1724 /* Don't use this on SuperSPARC because its udiv only handles 53 bit
1725    dividends and will trap to the kernel for the rest. */
1726 #define udiv_qrnnd(q, r, n1, n0, d) \
1727   do {									\
1728     USItype __q;							\
1729     __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"			\
1730 	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));		\
1731     (r) = (n0) - __q * (d);						\
1732     (q) = __q;								\
1733   } while (0)
1734 #endif /* HAVE_HOST_CPU_supersparc */
1735 
1736 #else /* ! __sparc_v8__ */
1737 #if defined (__sparclite__)
1738 /* This has hardware multiply but not divide.  It also has two additional
1739    instructions scan (ffs from high bit) and divscc.  */
1740 #define umul_ppmm(w1, w0, u, v) \
1741   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1742 #define udiv_qrnnd(q, r, n1, n0, d) \
1743   __asm__ ("! Inlined udiv_qrnnd\n"					\
1744 "	wr	%%g0,%2,%%y	! Not a delayed write for sparclite\n"	\
1745 "	tst	%%g0\n"							\
1746 "	divscc	%3,%4,%%g1\n"						\
1747 "	divscc	%%g1,%4,%%g1\n"						\
1748 "	divscc	%%g1,%4,%%g1\n"						\
1749 "	divscc	%%g1,%4,%%g1\n"						\
1750 "	divscc	%%g1,%4,%%g1\n"						\
1751 "	divscc	%%g1,%4,%%g1\n"						\
1752 "	divscc	%%g1,%4,%%g1\n"						\
1753 "	divscc	%%g1,%4,%%g1\n"						\
1754 "	divscc	%%g1,%4,%%g1\n"						\
1755 "	divscc	%%g1,%4,%%g1\n"						\
1756 "	divscc	%%g1,%4,%%g1\n"						\
1757 "	divscc	%%g1,%4,%%g1\n"						\
1758 "	divscc	%%g1,%4,%%g1\n"						\
1759 "	divscc	%%g1,%4,%%g1\n"						\
1760 "	divscc	%%g1,%4,%%g1\n"						\
1761 "	divscc	%%g1,%4,%%g1\n"						\
1762 "	divscc	%%g1,%4,%%g1\n"						\
1763 "	divscc	%%g1,%4,%%g1\n"						\
1764 "	divscc	%%g1,%4,%%g1\n"						\
1765 "	divscc	%%g1,%4,%%g1\n"						\
1766 "	divscc	%%g1,%4,%%g1\n"						\
1767 "	divscc	%%g1,%4,%%g1\n"						\
1768 "	divscc	%%g1,%4,%%g1\n"						\
1769 "	divscc	%%g1,%4,%%g1\n"						\
1770 "	divscc	%%g1,%4,%%g1\n"						\
1771 "	divscc	%%g1,%4,%%g1\n"						\
1772 "	divscc	%%g1,%4,%%g1\n"						\
1773 "	divscc	%%g1,%4,%%g1\n"						\
1774 "	divscc	%%g1,%4,%%g1\n"						\
1775 "	divscc	%%g1,%4,%%g1\n"						\
1776 "	divscc	%%g1,%4,%%g1\n"						\
1777 "	divscc	%%g1,%4,%0\n"						\
1778 "	rd	%%y,%1\n"						\
1779 "	bl,a 1f\n"							\
1780 "	add	%1,%4,%1\n"						\
1781 "1:	! End of inline udiv_qrnnd"					\
1782 	   : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d)		\
1783 	   : "%g1" __AND_CLOBBER_CC)
1784 #define count_leading_zeros(count, x) \
1785   __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
1786 /* Early sparclites return 63 for an argument of 0, but they warn that future
1787    implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
1788    undefined.  */
1789 #endif /* __sparclite__ */
1790 #endif /* __sparc_v8__ */
1791 #endif /* __sparc_v9__ */
1792 /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd.  */
1793 #ifndef umul_ppmm
1794 #define umul_ppmm(w1, w0, u, v) \
1795   __asm__ ("! Inlined umul_ppmm\n"					\
1796 "	wr	%%g0,%2,%%y	! SPARC has 0-3 delay insn after a wr\n" \
1797 "	sra	%3,31,%%g2	! Don't move this insn\n"		\
1798 "	and	%2,%%g2,%%g2	! Don't move this insn\n"		\
1799 "	andcc	%%g0,0,%%g1	! Don't move this insn\n"		\
1800 "	mulscc	%%g1,%3,%%g1\n"						\
1801 "	mulscc	%%g1,%3,%%g1\n"						\
1802 "	mulscc	%%g1,%3,%%g1\n"						\
1803 "	mulscc	%%g1,%3,%%g1\n"						\
1804 "	mulscc	%%g1,%3,%%g1\n"						\
1805 "	mulscc	%%g1,%3,%%g1\n"						\
1806 "	mulscc	%%g1,%3,%%g1\n"						\
1807 "	mulscc	%%g1,%3,%%g1\n"						\
1808 "	mulscc	%%g1,%3,%%g1\n"						\
1809 "	mulscc	%%g1,%3,%%g1\n"						\
1810 "	mulscc	%%g1,%3,%%g1\n"						\
1811 "	mulscc	%%g1,%3,%%g1\n"						\
1812 "	mulscc	%%g1,%3,%%g1\n"						\
1813 "	mulscc	%%g1,%3,%%g1\n"						\
1814 "	mulscc	%%g1,%3,%%g1\n"						\
1815 "	mulscc	%%g1,%3,%%g1\n"						\
1816 "	mulscc	%%g1,%3,%%g1\n"						\
1817 "	mulscc	%%g1,%3,%%g1\n"						\
1818 "	mulscc	%%g1,%3,%%g1\n"						\
1819 "	mulscc	%%g1,%3,%%g1\n"						\
1820 "	mulscc	%%g1,%3,%%g1\n"						\
1821 "	mulscc	%%g1,%3,%%g1\n"						\
1822 "	mulscc	%%g1,%3,%%g1\n"						\
1823 "	mulscc	%%g1,%3,%%g1\n"						\
1824 "	mulscc	%%g1,%3,%%g1\n"						\
1825 "	mulscc	%%g1,%3,%%g1\n"						\
1826 "	mulscc	%%g1,%3,%%g1\n"						\
1827 "	mulscc	%%g1,%3,%%g1\n"						\
1828 "	mulscc	%%g1,%3,%%g1\n"						\
1829 "	mulscc	%%g1,%3,%%g1\n"						\
1830 "	mulscc	%%g1,%3,%%g1\n"						\
1831 "	mulscc	%%g1,%3,%%g1\n"						\
1832 "	mulscc	%%g1,0,%%g1\n"						\
1833 "	add	%%g1,%%g2,%0\n"						\
1834 "	rd	%%y,%1"							\
1835 	   : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v)			\
1836 	   : "%g1", "%g2" __AND_CLOBBER_CC)
1837 #endif
1838 #ifndef udiv_qrnnd
1839 #ifndef LONGLONG_STANDALONE
1840 #define udiv_qrnnd(q, r, n1, n0, d) \
1841   do { UWtype __r;							\
1842     (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));			\
1843     (r) = __r;								\
1844   } while (0)
1845 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
1846 #endif /* LONGLONG_STANDALONE */
1847 #endif /* udiv_qrnnd */
1848 #endif /* __sparc__ */
1849 
1850 #if defined (__sparc__) && W_TYPE_SIZE == 64
1851 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1852   __asm__ (								\
1853        "addcc	%r4,%5,%1\n"						\
1854       "	addccc	%r6,%7,%%g0\n"						\
1855       "	addc	%r2,%3,%0"						\
1856        : "=r" (sh), "=&r" (sl)						\
1857        : "rJ"  ((UDItype)(ah)), "rI" ((UDItype)(bh)),			\
1858 	 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)),			\
1859 	 "%rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32)	\
1860 	   __CLOBBER_CC)
1861 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1862   __asm__ (								\
1863        "subcc	%r4,%5,%1\n"						\
1864       "	subccc	%r6,%7,%%g0\n"						\
1865       "	subc	%r2,%3,%0"						\
1866        : "=r" (sh), "=&r" (sl)						\
1867        : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)),			\
1868 	 "rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)),			\
1869 	 "rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32)		\
1870 	   __CLOBBER_CC)
1871 #if __VIS__ >= 0x300
1872 #undef add_ssaaaa
1873 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1874   __asm__ (								\
1875        "addcc	%r4, %5, %1\n"						\
1876       "	addxc	%r2, %r3, %0"						\
1877 	  : "=r" (sh), "=&r" (sl)					\
1878        : "rJ"  ((UDItype)(ah)), "rJ" ((UDItype)(bh)),			\
1879 	 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC)
1880 #define umul_ppmm(ph, pl, m0, m1) \
1881   do {									\
1882     UDItype __m0 = (m0), __m1 = (m1);					\
1883     (pl) = __m0 * __m1;							\
1884     __asm__ ("umulxhi\t%2, %1, %0"					\
1885 	     : "=r" (ph)						\
1886 	     : "%r" (__m0), "r" (__m1));				\
1887   } while (0)
1888 #define count_leading_zeros(count, x) \
1889   __asm__ ("lzd\t%1,%0" : "=r" (count) : "r" (x))
1890 /* Needed by count_leading_zeros_32 in sparc64.h.  */
1891 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
1892 #endif
1893 #endif
1894 
1895 #if (defined (__vax) || defined (__vax__)) && W_TYPE_SIZE == 32
1896 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1897   __asm__ ("addl2 %5,%1\n\tadwc %3,%0"					\
1898 	   : "=g" (sh), "=&g" (sl)					\
1899 	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
1900 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1901 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1902   __asm__ ("subl2 %5,%1\n\tsbwc %3,%0"					\
1903 	   : "=g" (sh), "=&g" (sl)					\
1904 	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
1905 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
1906 #define smul_ppmm(xh, xl, m0, m1) \
1907   do {									\
1908     union {UDItype __ll;						\
1909 	   struct {USItype __l, __h;} __i;				\
1910 	  } __x;							\
1911     USItype __m0 = (m0), __m1 = (m1);					\
1912     __asm__ ("emul %1,%2,$0,%0"						\
1913 	     : "=g" (__x.__ll) : "g" (__m0), "g" (__m1));		\
1914     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
1915   } while (0)
1916 #define sdiv_qrnnd(q, r, n1, n0, d) \
1917   do {									\
1918     union {DItype __ll;							\
1919 	   struct {SItype __l, __h;} __i;				\
1920 	  } __x;							\
1921     __x.__i.__h = n1; __x.__i.__l = n0;					\
1922     __asm__ ("ediv %3,%2,%0,%1"						\
1923 	     : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d));		\
1924   } while (0)
1925 #if 0
1926 /* FIXME: This instruction appears to be unimplemented on some systems (vax
1927    8800 maybe). */
1928 #define count_trailing_zeros(count,x)					\
1929   do {									\
1930     __asm__ ("ffs 0, 31, %1, %0"					\
1931 	     : "=g" (count)						\
1932 	     : "g" ((USItype) (x)));					\
1933   } while (0)
1934 #endif
1935 #endif /* vax */
1936 
1937 #if defined (__z8000__) && W_TYPE_SIZE == 16
1938 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1939   __asm__ ("add	%H1,%H5\n\tadc	%H0,%H3"				\
1940 	   : "=r" (sh), "=&r" (sl)					\
1941 	   : "0"  ((unsigned int)(ah)), "r" ((unsigned int)(bh)),	\
1942 	     "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1943 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1944   __asm__ ("sub	%H1,%H5\n\tsbc	%H0,%H3"				\
1945 	   : "=r" (sh), "=&r" (sl)					\
1946 	   : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)),	\
1947 	     "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1948 #define umul_ppmm(xh, xl, m0, m1) \
1949   do {									\
1950     union {long int __ll;						\
1951 	   struct {unsigned int __h, __l;} __i;				\
1952 	  } __x;							\
1953     unsigned int __m0 = (m0), __m1 = (m1);				\
1954     __asm__ ("mult	%S0,%H3"					\
1955 	     : "=r" (__x.__i.__h), "=r" (__x.__i.__l)			\
1956 	     : "%1" (__m0), "rQR" (__m1));					\
1957     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
1958     (xh) += ((((signed int) __m0 >> 15) & __m1)				\
1959 	     + (((signed int) __m1 >> 15) & __m0));			\
1960   } while (0)
1961 #endif /* __z8000__ */
1962 
1963 #endif /* __GNUC__ */
1964 
1965 #endif /* NO_ASM */
1966 
1967 
1968 /* FIXME: "sidi" here is highly doubtful, should sometimes be "diti".  */
1969 #if !defined (umul_ppmm) && defined (__umulsidi3)
1970 #define umul_ppmm(ph, pl, m0, m1) \
1971   do {									\
1972     UDWtype __ll = __umulsidi3 (m0, m1);				\
1973     ph = (UWtype) (__ll >> W_TYPE_SIZE);				\
1974     pl = (UWtype) __ll;							\
1975   } while (0)
1976 #endif
1977 
1978 #if !defined (__umulsidi3)
1979 #define __umulsidi3(u, v) \
1980   ({UWtype __hi, __lo;							\
1981     umul_ppmm (__hi, __lo, u, v);					\
1982     ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
1983 #endif
1984 
1985 
1986 #if defined (__cplusplus)
1987 #define __longlong_h_C "C"
1988 #else
1989 #define __longlong_h_C
1990 #endif
1991 
1992 /* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist.  The "_r"
1993    forms have "reversed" arguments, meaning the pointer is last, which
1994    sometimes allows better parameter passing, in particular on 64-bit
1995    hppa. */
1996 
1997 #define mpn_umul_ppmm  __MPN(umul_ppmm)
1998 extern __longlong_h_C UWtype mpn_umul_ppmm (UWtype *, UWtype, UWtype);
1999 
2000 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm  \
2001   && ! defined (LONGLONG_STANDALONE)
2002 #define umul_ppmm(wh, wl, u, v)						\
2003   do {									\
2004     UWtype __umul_ppmm__p0;						\
2005     (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v));\
2006     (wl) = __umul_ppmm__p0;						\
2007   } while (0)
2008 #endif
2009 
2010 #define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
2011 extern __longlong_h_C UWtype mpn_umul_ppmm_r (UWtype, UWtype, UWtype *);
2012 
2013 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r	\
2014   && ! defined (LONGLONG_STANDALONE)
2015 #define umul_ppmm(wh, wl, u, v)						\
2016   do {									\
2017     UWtype __umul_p0;							\
2018     (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_p0);	\
2019     (wl) = __umul_p0;							\
2020   } while (0)
2021 #endif
2022 
2023 #define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
2024 extern __longlong_h_C UWtype mpn_udiv_qrnnd (UWtype *, UWtype, UWtype, UWtype);
2025 
2026 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd	\
2027   && ! defined (LONGLONG_STANDALONE)
2028 #define udiv_qrnnd(q, r, n1, n0, d)					\
2029   do {									\
2030     UWtype __udiv_qrnnd_r;						\
2031     (q) = mpn_udiv_qrnnd (&__udiv_qrnnd_r,				\
2032 			  (UWtype) (n1), (UWtype) (n0), (UWtype) (d));	\
2033     (r) = __udiv_qrnnd_r;						\
2034   } while (0)
2035 #endif
2036 
2037 #define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
2038 extern __longlong_h_C UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, UWtype *);
2039 
2040 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r	\
2041   && ! defined (LONGLONG_STANDALONE)
2042 #define udiv_qrnnd(q, r, n1, n0, d)					\
2043   do {									\
2044     UWtype __udiv_qrnnd_r;						\
2045     (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) (d),	\
2046 			    &__udiv_qrnnd_r);				\
2047     (r) = __udiv_qrnnd_r;						\
2048   } while (0)
2049 #endif
2050 
2051 
2052 /* If this machine has no inline assembler, use C macros.  */
2053 
2054 #if !defined (add_ssaaaa)
2055 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
2056   do {									\
2057     UWtype __x;								\
2058     UWtype __al = (al);							\
2059     UWtype __bl = (bl);							\
2060     __x = __al + __bl;							\
2061     (sh) = (ah) + (bh) + (__x < __al);					\
2062     (sl) = __x;								\
2063   } while (0)
2064 #endif
2065 
2066 #if !defined (sub_ddmmss)
2067 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
2068   do {									\
2069     UWtype __x;								\
2070     UWtype __al = (al);							\
2071     UWtype __bl = (bl);							\
2072     __x = __al - __bl;							\
2073     (sh) = (ah) - (bh) - (__al < __bl);					\
2074     (sl) = __x;								\
2075   } while (0)
2076 #endif
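
/* For illustration, a minimal sketch of how add_ssaaaa and sub_ddmmss are
   typically consumed: double-word (two-limb) addition and subtraction.  The
   struct and helper names below are hypothetical and not part of GMP; UWtype
   is assumed to be defined by the includer as described at the top of this
   file.  */
#if 0
struct example_dword { UWtype hi, lo; };

static struct example_dword
example_dword_add (struct example_dword a, struct example_dword b)
{
  struct example_dword s;
  /* (s.hi,s.lo) = (a.hi,a.lo) + (b.hi,b.lo); the carry out of the low limb
     is propagated into the high limb by the macro.  */
  add_ssaaaa (s.hi, s.lo, a.hi, a.lo, b.hi, b.lo);
  return s;
}

static struct example_dword
example_dword_sub (struct example_dword a, struct example_dword b)
{
  struct example_dword d;
  /* (d.hi,d.lo) = (a.hi,a.lo) - (b.hi,b.lo); the borrow is handled
     likewise.  */
  sub_ddmmss (d.hi, d.lo, a.hi, a.lo, b.hi, b.lo);
  return d;
}
#endif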
2077 
2078 /* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
2079    smul_ppmm.  */
2080 #if !defined (umul_ppmm) && defined (smul_ppmm)
2081 #define umul_ppmm(w1, w0, u, v)						\
2082   do {									\
2083     UWtype __w1;							\
2084     UWtype __xm0 = (u), __xm1 = (v);					\
2085     smul_ppmm (__w1, w0, __xm0, __xm1);					\
2086     (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
2087 		+ (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);		\
2088   } while (0)
2089 #endif
2090 
2091 /* If we still don't have umul_ppmm, define it using plain C.
2092 
2093    For reference, when this code is used for squaring (ie. u and v identical
2094    expressions), gcc recognises __x1 and __x2 are the same and generates 3
2095    multiplies, not 4.  The subsequent additions could be optimized a bit,
2096    but the only place GMP currently uses such a square is mpn_sqr_basecase,
2097    and chips obliged to use this generic C umul will have plenty of worse
2098    performance problems than a couple of extra instructions on the diagonal
2099    of sqr_basecase.  */
2100 
2101 #if !defined (umul_ppmm)
2102 #define umul_ppmm(w1, w0, u, v)						\
2103   do {									\
2104     UWtype __x0, __x1, __x2, __x3;					\
2105     UHWtype __ul, __vl, __uh, __vh;					\
2106     UWtype __u = (u), __v = (v);					\
2107 									\
2108     __ul = __ll_lowpart (__u);						\
2109     __uh = __ll_highpart (__u);						\
2110     __vl = __ll_lowpart (__v);						\
2111     __vh = __ll_highpart (__v);						\
2112 									\
2113     __x0 = (UWtype) __ul * __vl;					\
2114     __x1 = (UWtype) __ul * __vh;					\
2115     __x2 = (UWtype) __uh * __vl;					\
2116     __x3 = (UWtype) __uh * __vh;					\
2117 									\
2118     __x1 += __ll_highpart (__x0);/* this can't give carry */		\
2119     __x1 += __x2;		/* but this indeed can */		\
2120     if (__x1 < __x2)		/* did we get it? */			\
2121       __x3 += __ll_B;		/* yes, add it in the proper pos. */	\
2122 									\
2123     (w1) = __x3 + __ll_highpart (__x1);					\
2124     (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0);		\
2125   } while (0)
2126 #endif
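
/* For illustration, a minimal sketch of the identity behind the generic
   umul_ppmm above, with B = 2^(W_TYPE_SIZE/2), u = uh*B + ul and
   v = vh*B + vl:

     u * v = uh*vh*B^2 + (uh*vl + ul*vh)*B + ul*vl

   Adding __x2 into __x1 can wrap around 2^W_TYPE_SIZE at most once; the
   conditional "__x3 += __ll_B" puts that lost carry back into the high
   word.  The helper name below is hypothetical.  */
#if 0
static void
example_umul (UWtype u, UWtype v, UWtype *hip, UWtype *lop)
{
  UWtype hi, lo;
  umul_ppmm (hi, lo, u, v);	/* (hi,lo) = full 2*W_TYPE_SIZE bit product */
  *hip = hi;
  *lop = lo;
}
#endif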
2127 
2128 /* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
2129    exist in one form or another).  */
2130 #if !defined (smul_ppmm)
2131 #define smul_ppmm(w1, w0, u, v)						\
2132   do {									\
2133     UWtype __w1;							\
2134     UWtype __xm0 = (u), __xm1 = (v);					\
2135     umul_ppmm (__w1, w0, __xm0, __xm1);					\
2136     (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
2137 		- (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);		\
2138   } while (0)
2139 #endif
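
/* For illustration, a minimal sketch of the identity behind the two
   conversion macros above (umul_ppmm from smul_ppmm and the reverse),
   assuming the usual two's complement UWtype set up by the includer.  With
   s = 2^W_TYPE_SIZE and [u<0] being 1 when u is negative and 0 otherwise, a
   negative u is represented by the unsigned value u + s, so

     (u + s*[u<0]) * (v + s*[v<0])
	= u*v + s*([u<0]*v + [v<0]*u) + s^2*[u<0]*[v<0]

   The low word is identical either way; modulo s, the unsigned high word
   exceeds the signed one by [u<0]*v + [v<0]*u, which is exactly the
   correction added or subtracted above.  The expression
   -(x >> (W_TYPE_SIZE - 1)) & y computes [x<0]*y, as the hypothetical helper
   below spells out.  */
#if 0
static UWtype
example_mask_if_negative (UWtype x, UWtype y)
{
  UWtype mask = -(x >> (W_TYPE_SIZE - 1)); /* all ones iff top bit of x set */
  return mask & y;			   /* y when (signed) x < 0, else 0 */
}
#endif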
2140 
2141 /* Define this unconditionally, so it can be used for debugging.  */
2142 #define __udiv_qrnnd_c(q, r, n1, n0, d) \
2143   do {									\
2144     UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m;			\
2145 									\
2146     ASSERT ((d) != 0);							\
2147     ASSERT ((n1) < (d));						\
2148 									\
2149     __d1 = __ll_highpart (d);						\
2150     __d0 = __ll_lowpart (d);						\
2151 									\
2152     __q1 = (n1) / __d1;							\
2153     __r1 = (n1) - __q1 * __d1;						\
2154     __m = __q1 * __d0;							\
2155     __r1 = __r1 * __ll_B | __ll_highpart (n0);				\
2156     if (__r1 < __m)							\
2157       {									\
2158 	__q1--, __r1 += (d);						\
2159 	if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
2160 	  if (__r1 < __m)						\
2161 	    __q1--, __r1 += (d);					\
2162       }									\
2163     __r1 -= __m;							\
2164 									\
2165     __q0 = __r1 / __d1;							\
2166     __r0 = __r1  - __q0 * __d1;						\
2167     __m = __q0 * __d0;							\
2168     __r0 = __r0 * __ll_B | __ll_lowpart (n0);				\
2169     if (__r0 < __m)							\
2170       {									\
2171 	__q0--, __r0 += (d);						\
2172 	if (__r0 >= (d))						\
2173 	  if (__r0 < __m)						\
2174 	    __q0--, __r0 += (d);					\
2175       }									\
2176     __r0 -= __m;							\
2177 									\
2178     (q) = __q1 * __ll_B | __q0;						\
2179     (r) = __r0;								\
2180   } while (0)
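
/* For illustration, a minimal usage sketch of __udiv_qrnnd_c.  It needs a
   normalized divisor (most significant bit set), which is why the fallback
   below defines UDIV_NEEDS_NORMALIZATION to 1; a caller with an arbitrary
   nonzero divisor can normalize and denormalize around the call as shown.
   The helper name is hypothetical; count_leading_zeros and ASSERT are
   assumed to be available as elsewhere in this file.  */
#if 0
static void
example_udiv_qrnnd (UWtype *qp, UWtype *rp, UWtype n1, UWtype n0, UWtype d)
{
  UWtype q, r;
  int cnt;
  ASSERT (d != 0);
  ASSERT (n1 < d);
  count_leading_zeros (cnt, d);
  if (cnt != 0)
    {
      /* Shift divisor and dividend left so the divisor becomes normalized;
	 the quotient is unchanged and the remainder is scaled by 2^cnt.  */
      n1 = (n1 << cnt) | (n0 >> (W_TYPE_SIZE - cnt));
      n0 <<= cnt;
      d <<= cnt;
    }
  __udiv_qrnnd_c (q, r, n1, n0, d);
  *qp = q;
  *rp = r >> cnt;		/* undo the scaling of the remainder */
}
#endif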
2181 
2182 /* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
2183    __udiv_w_sdiv (defined in libgcc or elsewhere).  */
2184 #if !defined (udiv_qrnnd) && defined (sdiv_qrnnd) \
2185   && ! defined (LONGLONG_STANDALONE)
2186 #define udiv_qrnnd(q, r, nh, nl, d) \
2187   do {									\
2188     UWtype __r;								\
2189     (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d);				\
2190     (r) = __r;								\
2191   } while (0)
2192 __GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
2193 #endif
2194 
2195 /* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
2196 #if !defined (udiv_qrnnd)
2197 #define UDIV_NEEDS_NORMALIZATION 1
2198 #define udiv_qrnnd __udiv_qrnnd_c
2199 #endif
2200 
2201 #if !defined (count_leading_zeros)
2202 #define count_leading_zeros(count, x) \
2203   do {									\
2204     UWtype __xr = (x);							\
2205     UWtype __a;								\
2206 									\
2207     if (W_TYPE_SIZE == 32)						\
2208       {									\
2209 	__a = __xr < ((UWtype) 1 << 2*__BITS4)				\
2210 	  ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1)		\
2211 	  : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1		\
2212 	  : 3*__BITS4 + 1);						\
2213       }									\
2214     else								\
2215       {									\
2216 	for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8)			\
2217 	  if (((__xr >> __a) & 0xff) != 0)				\
2218 	    break;							\
2219 	++__a;								\
2220       }									\
2221 									\
2222     (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a];		\
2223   } while (0)
2224 /* This version gives a well-defined value for zero. */
2225 #define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
2226 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2227 #define COUNT_LEADING_ZEROS_SLOW
2228 #endif
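
/* For illustration, a minimal sketch of a typical count_leading_zeros
   consumer: the bit size of a nonzero limb.  The helper name is
   hypothetical; x != 0 is required because only some of the definitions in
   this file give a defined result for zero (those that do also define
   COUNT_LEADING_ZEROS_0).  */
#if 0
static int
example_bit_size (UWtype x)
{
  int cnt;
  ASSERT (x != 0);
  count_leading_zeros (cnt, x);
  return W_TYPE_SIZE - cnt;	/* number of significant bits in x */
}
#endif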
2229 
2230 /* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
2231 #if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
2232 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2233 #endif
2234 
2235 #ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2236 extern const unsigned char __GMP_DECLSPEC __clz_tab[129];
2237 #endif
2238 
2239 #if !defined (count_trailing_zeros)
2240 #if !defined (COUNT_LEADING_ZEROS_SLOW)
2241 /* Define count_trailing_zeros using an asm count_leading_zeros.  */
2242 #define count_trailing_zeros(count, x)					\
2243   do {									\
2244     UWtype __ctz_x = (x);						\
2245     UWtype __ctz_c;							\
2246     ASSERT (__ctz_x != 0);						\
2247     count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x);			\
2248     (count) = W_TYPE_SIZE - 1 - __ctz_c;				\
2249   } while (0)
2250 #else
2251 /* Define count_trailing_zeros in plain C, assuming small counts are common.
2252    We use clz_tab without ado, since the C count_leading_zeros above will have
2253    pulled it in.  */
2254 #define count_trailing_zeros(count, x)					\
2255   do {									\
2256     UWtype __ctz_x = (x);						\
2257     int __ctz_c;							\
2258 									\
2259     if (LIKELY ((__ctz_x & 0xff) != 0))					\
2260       (count) = __clz_tab[__ctz_x & -__ctz_x] - 2;			\
2261     else								\
2262       {									\
2263 	for (__ctz_c = 8 - 2; __ctz_c < W_TYPE_SIZE - 2; __ctz_c += 8)	\
2264 	  {								\
2265 	    __ctz_x >>= 8;						\
2266 	    if (LIKELY ((__ctz_x & 0xff) != 0))				\
2267 	      break;							\
2268 	  }								\
2269 									\
2270 	(count) = __ctz_c + __clz_tab[__ctz_x & -__ctz_x];		\
2271       }									\
2272   } while (0)
2273 #endif
2274 #endif
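
/* For illustration, a minimal sketch of a typical count_trailing_zeros
   consumer: removing the factors of two from a nonzero limb, binary-gcd
   style.  The helper name is hypothetical; both definitions above require a
   nonzero argument.  */
#if 0
static UWtype
example_make_odd (UWtype x, int *shiftp)
{
  int cnt;
  ASSERT (x != 0);
  count_trailing_zeros (cnt, x);
  *shiftp = cnt;
  return x >> cnt;		/* x with its low zero bits stripped */
}
#endif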
2275 
2276 #ifndef UDIV_NEEDS_NORMALIZATION
2277 #define UDIV_NEEDS_NORMALIZATION 0
2278 #endif
2279 
2280 /* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and
2281    hence whether the latter should always be used.  */
2282 #ifndef UDIV_PREINV_ALWAYS
2283 #define UDIV_PREINV_ALWAYS 0
2284 #endif
2285