1 /* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
2 
3 Copyright 1991-2020 Free Software Foundation, Inc.
4 
5 This file is free software; you can redistribute it and/or modify it under the
6 terms of the GNU Lesser General Public License as published by the Free
7 Software Foundation; either version 3 of the License, or (at your option) any
8 later version.
9 
10 This file is distributed in the hope that it will be useful, but WITHOUT ANY
11 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
12 PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
13 details.
14 
15 You should have received a copy of the GNU Lesser General Public License
16 along with this file.  If not, see https://www.gnu.org/licenses/.  */
17 
18 /* You have to define the following before including this file:
19 
20    UWtype -- An unsigned type, default type for operations (typically a "word")
21    UHWtype -- An unsigned type, at least half the size of UWtype
   UDWtype -- An unsigned type, at least twice as large as UWtype
23    W_TYPE_SIZE -- size in bits of UWtype
24 
25    SItype, USItype -- Signed and unsigned 32 bit types
26    DItype, UDItype -- Signed and unsigned 64 bit types
27 
28    On a 32 bit machine UWtype should typically be USItype;
29    on a 64 bit machine, UWtype should typically be UDItype.
30 
31    Optionally, define:
32 
33    LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
34    NO_ASM -- Disable inline asm
35 
36 
37    CAUTION!  Using this version of longlong.h outside of GMP is not safe.  You
38    need to include gmp.h and gmp-impl.h, or certain things might not work as
39    expected.
40 */
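
/* Purely as an illustrative sketch (the real definitions come from gmp.h and
   gmp-impl.h and differ between configurations), a 64-bit build might set up
   something along these lines before including this file, assuming the
   compiler provides a 128-bit type for UDWtype:

       typedef unsigned int USItype __attribute__ ((mode (SI)));
       typedef          int SItype  __attribute__ ((mode (SI)));
       typedef unsigned int UDItype __attribute__ ((mode (DI)));
       typedef          int DItype  __attribute__ ((mode (DI)));
       #define UWtype   UDItype
       #define UHWtype  USItype
       #define UDWtype  unsigned __int128
       #define W_TYPE_SIZE 64
*/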
41 
42 #define __BITS4 (W_TYPE_SIZE / 4)
43 #define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
44 #define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
45 #define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
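
/* For example, with W_TYPE_SIZE == 32, __ll_B is 0x10000, so
   __ll_highpart (0x12345678) is 0x1234 and __ll_lowpart (0x12345678) is
   0x5678; a word t is recovered as __ll_highpart(t)*__ll_B + __ll_lowpart(t).  */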
46 
/* This is used to make sure that no undesirable sharing takes place between
   different libraries that use this file.  */
49 #ifndef __MPN
50 #define __MPN(x) __##x
51 #endif
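
/* For instance, with the default definition above __MPN(invert_limb) expands
   to __invert_limb; when gmp.h is included first it normally supplies its own
   __MPN, giving each library its own prefix (e.g. __gmpn_invert_limb).  */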
52 
53 /* Define auxiliary asm macros.
54 
   1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
   UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two-UWtype-word
   product in HIGH_PROD and LOW_PROD.
58 
59    2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
60    UDWtype product.  This is just a variant of umul_ppmm.
61 
   3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
   denominator) divides a UDWtype, composed of the UWtype integers
   HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
   in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
   than DENOMINATOR for correct operation.  If, in addition, the most
   significant bit of DENOMINATOR is required to be 1, then the pre-processor
   symbol UDIV_NEEDS_NORMALIZATION is defined to 1.
69 
70    4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
71    denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
72    is rounded towards 0.
73 
74    5) count_leading_zeros(count, x) counts the number of zero-bits from the
75    msb to the first non-zero bit in the UWtype X.  This is the number of
76    steps X needs to be shifted left to set the msb.  Undefined for X == 0,
77    unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
78 
79    6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
80    from the least significant end.
81 
   7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
   high_addend_2, low_addend_2) adds two two-word UWtype integers, composed of
   HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
   respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
   (i.e. carry out) is not stored anywhere, and is lost.
87 
   8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
   high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
   composed of HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
   LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
   and LOW_DIFFERENCE.  Overflow (i.e. carry out) is not stored anywhere,
   and is lost.
94 
95    If any of these macros are left undefined for a particular CPU,
96    C macros are used.
97 
98 
99    Notes:
100 
101    For add_ssaaaa the two high and two low addends can both commute, but
102    unfortunately gcc only supports one "%" commutative in each asm block.
103    This has always been so but is only documented in recent versions
104    (eg. pre-release 3.3).  Having two or more "%"s can cause an internal
105    compiler error in certain rare circumstances.
106 
107    Apparently it was only the last "%" that was ever actually respected, so
108    the code has been updated to leave just that.  Clearly there's a free
109    choice whether high or low should get it, if there's a reason to favour
110    one over the other.  Also obviously when the constraints on the two
111    operands are identical there's no benefit to the reloader in any "%" at
112    all.
113 
114    */
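
/* As a rough usage sketch (not something this file relies on; x, y and d
   stand for arbitrary UWtype operands of the caller), a two-word accumulator
   might be driven like this:

       UWtype hi = 0, lo = 0, ph, pl, q, r;
       umul_ppmm (ph, pl, x, y);              (ph:pl = x*y)
       add_ssaaaa (hi, lo, hi, lo, ph, pl);   (hi:lo += ph:pl, carry out lost)
       udiv_qrnnd (q, r, hi, lo, d);          (requires hi < d, and d
                                               normalized when the selected
                                               udiv_qrnnd needs it)
*/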
115 
116 /* The CPUs come in alphabetical order below.
117 
118    Please add support for more CPUs here, or improve the current support
119    for the CPUs below!  */
120 
121 
122 /* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
123    3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
124    Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
125    __builtin_ctzll.
126 
   These builtins are only used where we have checked what code comes out; on
   some chips they're merely libgcc calls, in which case we instead want an
   inline (either asm or generic C).
130 
131    These builtins are better than an asm block of the same insn, since an
132    asm block doesn't give gcc any information about scheduling or resource
133    usage.  We keep an asm block for use on prior versions of gcc though.
134 
   For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
   it's not used (for count_leading_zeros) because it generally generates
   extra code to ensure the result is 0 when the input is 0, which we don't
   need or want.  */
139 
140 #ifdef _LONG_LONG_LIMB
141 #define count_leading_zeros_gcc_clz(count,x)	\
142   do {						\
143     ASSERT ((x) != 0);				\
144     (count) = __builtin_clzll (x);		\
145   } while (0)
146 #else
147 #define count_leading_zeros_gcc_clz(count,x)	\
148   do {						\
149     ASSERT ((x) != 0);				\
150     (count) = __builtin_clzl (x);		\
151   } while (0)
152 #endif
153 
154 #ifdef _LONG_LONG_LIMB
155 #define count_trailing_zeros_gcc_ctz(count,x)	\
156   do {						\
157     ASSERT ((x) != 0);				\
158     (count) = __builtin_ctzll (x);		\
159   } while (0)
160 #else
161 #define count_trailing_zeros_gcc_ctz(count,x)	\
162   do {						\
163     ASSERT ((x) != 0);				\
164     (count) = __builtin_ctzl (x);		\
165   } while (0)
166 #endif
167 
168 
169 /* FIXME: The macros using external routines like __MPN(count_leading_zeros)
170    don't need to be under !NO_ASM */
171 #if ! defined (NO_ASM)
172 
173 #if defined (__alpha) && W_TYPE_SIZE == 64
174 /* Most alpha-based machines, except Cray systems. */
175 #if defined (__GNUC__)
176 #if __GMP_GNUC_PREREQ (3,3)
177 #define umul_ppmm(ph, pl, m0, m1) \
178   do {									\
179     UDItype __m0 = (m0), __m1 = (m1);					\
180     (ph) = __builtin_alpha_umulh (__m0, __m1);				\
181     (pl) = __m0 * __m1;							\
182   } while (0)
183 #else
184 #define umul_ppmm(ph, pl, m0, m1) \
185   do {									\
186     UDItype __m0 = (m0), __m1 = (m1);					\
187     __asm__ ("umulh %r1,%2,%0"						\
188 	     : "=r" (ph)						\
189 	     : "%rJ" (__m0), "rI" (__m1));				\
190     (pl) = __m0 * __m1;							\
191   } while (0)
192 #endif
193 #else /* ! __GNUC__ */
194 #include <machine/builtins.h>
195 #define umul_ppmm(ph, pl, m0, m1) \
196   do {									\
197     UDItype __m0 = (m0), __m1 = (m1);					\
198     (ph) = __UMULH (__m0, __m1);					\
199     (pl) = __m0 * __m1;							\
200   } while (0)
201 #endif
202 #ifndef LONGLONG_STANDALONE
203 #define udiv_qrnnd(q, r, n1, n0, d) \
204   do { UWtype __di;							\
205     __di = __MPN(invert_limb) (d);					\
206     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
207   } while (0)
208 #define UDIV_PREINV_ALWAYS  1
209 #define UDIV_NEEDS_NORMALIZATION 1
210 #endif /* LONGLONG_STANDALONE */
211 
212 /* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
213    always goes into libgmp.so, even when not actually used.  */
214 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
215 
216 #if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
217 #define count_leading_zeros(COUNT,X) \
218   __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
219 #define count_trailing_zeros(COUNT,X) \
220   __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
221 #endif /* clz/ctz using cix */
222 
223 #if ! defined (count_leading_zeros)				\
224   && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
225 /* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
226    "$31" is written explicitly in the asm, since an "r" constraint won't
227    select reg 31.  There seems no need to worry about "r31" syntax for cray,
228    since gcc itself (pre-release 3.4) emits just $31 in various places.	 */
229 #define ALPHA_CMPBGE_0(dst, src)					\
230   do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
231 /* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
232    them, locating the highest non-zero byte.  A second __clz_tab lookup
233    counts the leading zero bits in that byte, giving the result.  */
234 #define count_leading_zeros(count, x)					\
235   do {									\
236     UWtype  __clz__b, __clz__c, __clz__x = (x);				\
237     ALPHA_CMPBGE_0 (__clz__b,  __clz__x);	    /* zero bytes */	\
238     __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F];  /* 8 to 1 byte */	\
239     __clz__b = __clz__b * 8 - 7;		    /* 57 to 1 shift */ \
240     __clz__x >>= __clz__b;						\
241     __clz__c = __clz_tab [__clz__x];		    /* 8 to 1 bit */	\
242     __clz__b = 65 - __clz__b;						\
243     (count) = __clz__b - __clz__c;					\
244   } while (0)
245 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
246 #endif /* clz using cmpbge */
247 
248 #if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
249 #if HAVE_ATTRIBUTE_CONST
250 long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const));
251 #else
252 long __MPN(count_leading_zeros) (UDItype);
253 #endif
254 #define count_leading_zeros(count, x) \
255   ((count) = __MPN(count_leading_zeros) (x))
256 #endif /* clz using mpn */
257 #endif /* __alpha */
258 
259 #if defined (__AVR) && W_TYPE_SIZE == 8
260 #define umul_ppmm(ph, pl, m0, m1) \
261   do {									\
262     unsigned short __p = (unsigned short) (m0) * (m1);			\
263     (ph) = __p >> 8;							\
264     (pl) = __p;								\
265   } while (0)
266 #endif /* AVR */
267 
268 #if defined (_CRAY) && W_TYPE_SIZE == 64
269 #include <intrinsics.h>
270 #define UDIV_PREINV_ALWAYS  1
271 #define UDIV_NEEDS_NORMALIZATION 1
272 long __MPN(count_leading_zeros) (UDItype);
273 #define count_leading_zeros(count, x) \
274   ((count) = _leadz ((UWtype) (x)))
275 #if defined (_CRAYIEEE)		/* I.e., Cray T90/ieee, T3D, and T3E */
276 #define umul_ppmm(ph, pl, m0, m1) \
277   do {									\
278     UDItype __m0 = (m0), __m1 = (m1);					\
279     (ph) = _int_mult_upper (__m0, __m1);				\
280     (pl) = __m0 * __m1;							\
281   } while (0)
282 #ifndef LONGLONG_STANDALONE
283 #define udiv_qrnnd(q, r, n1, n0, d) \
284   do { UWtype __di;							\
285     __di = __MPN(invert_limb) (d);					\
286     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
287   } while (0)
288 #endif /* LONGLONG_STANDALONE */
289 #endif /* _CRAYIEEE */
290 #endif /* _CRAY */
291 
292 #if defined (__ia64) && W_TYPE_SIZE == 64
/* This form encourages gcc (pre-release 3.4 at least) to emit predicated
   "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
   code, which computes "al<bl" arithmetically, ends up materializing an
   actual 0 or 1 in a register, which takes an extra cycle.  */
297 #define sub_ddmmss(sh, sl, ah, al, bh, bl)      \
298   do {						\
299     UWtype __x;					\
300     __x = (al) - (bl);				\
301     if ((al) < (bl))				\
302       (sh) = (ah) - (bh) - 1;			\
303     else					\
304       (sh) = (ah) - (bh);			\
305     (sl) = __x;					\
306   } while (0)
307 #if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
308 /* Do both product parts in assembly, since that gives better code with
309    all gcc versions.  Some callers will just use the upper part, and in
310    that situation we waste an instruction, but not any cycles.  */
311 #define umul_ppmm(ph, pl, m0, m1) \
312     __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0"		\
313 	     : "=&f" (ph), "=f" (pl)					\
314 	     : "f" (m0), "f" (m1))
315 #define count_leading_zeros(count, x) \
316   do {									\
317     UWtype _x = (x), _y, _a, _c;					\
318     __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x));		\
319     __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y));		\
320     _c = (_a - 1) << 3;							\
321     _x >>= _c;								\
322     if (_x >= 1 << 4)							\
323       _x >>= 4, _c += 4;						\
324     if (_x >= 1 << 2)							\
325       _x >>= 2, _c += 2;						\
326     _c += _x >> 1;							\
327     (count) =  W_TYPE_SIZE - 1 - _c;					\
328   } while (0)
/* Similar to what gcc does for __builtin_ffs, but 0-based rather than
   1-based, and we don't need a special case for x==0 here.  */
331 #define count_trailing_zeros(count, x)					\
332   do {									\
333     UWtype __ctz_x = (x);						\
334     __asm__ ("popcnt %0 = %1"						\
335 	     : "=r" (count)						\
336 	     : "r" ((__ctz_x-1) & ~__ctz_x));				\
337   } while (0)
338 #endif
339 #if defined (__INTEL_COMPILER)
340 #include <ia64intrin.h>
341 #define umul_ppmm(ph, pl, m0, m1)					\
342   do {									\
343     UWtype __m0 = (m0), __m1 = (m1);					\
344     ph = _m64_xmahu (__m0, __m1, 0);					\
345     pl = __m0 * __m1;							\
346   } while (0)
347 #endif
348 #ifndef LONGLONG_STANDALONE
349 #define udiv_qrnnd(q, r, n1, n0, d) \
350   do { UWtype __di;							\
351     __di = __MPN(invert_limb) (d);					\
352     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
353   } while (0)
354 #define UDIV_PREINV_ALWAYS  1
355 #define UDIV_NEEDS_NORMALIZATION 1
356 #endif
357 #endif
358 
359 
360 #if defined (__GNUC__)
361 
362 /* We sometimes need to clobber "cc" with gcc2, but that would not be
363    understood by gcc1.  Use cpp to avoid major code duplication.  */
364 #if __GNUC__ < 2
365 #define __CLOBBER_CC
366 #define __AND_CLOBBER_CC
367 #else /* __GNUC__ >= 2 */
368 #define __CLOBBER_CC : "cc"
369 #define __AND_CLOBBER_CC , "cc"
370 #endif /* __GNUC__ < 2 */
371 
372 #if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
373 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
374   __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3"				\
375 	   : "=r" (sh), "=&r" (sl)					\
376 	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
377 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
378   __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3"				\
379 	   : "=r" (sh), "=&r" (sl)					\
380 	   : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
381 #define umul_ppmm(xh, xl, m0, m1) \
382   do {									\
383     USItype __m0 = (m0), __m1 = (m1);					\
384     __asm__ ("multiplu %0,%1,%2"					\
385 	     : "=r" (xl)						\
386 	     : "r" (__m0), "r" (__m1));					\
387     __asm__ ("multmu %0,%1,%2"						\
388 	     : "=r" (xh)						\
389 	     : "r" (__m0), "r" (__m1));					\
390   } while (0)
391 #define udiv_qrnnd(q, r, n1, n0, d) \
392   __asm__ ("dividu %0,%3,%4"						\
393 	   : "=r" (q), "=q" (r)						\
394 	   : "1" (n1), "r" (n0), "r" (d))
395 #define count_leading_zeros(count, x) \
396     __asm__ ("clz %0,%1"						\
397 	     : "=r" (count)						\
398 	     : "r" (x))
399 #define COUNT_LEADING_ZEROS_0 32
400 #endif /* __a29k__ */
401 
402 #if defined (__arc__)
403 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
404   __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3"			\
405 	   : "=r" (sh),							\
406 	     "=&r" (sl)							\
407 	   : "r"  ((USItype) (ah)),					\
408 	     "rICal" ((USItype) (bh)),					\
409 	     "%r" ((USItype) (al)),					\
410 	     "rICal" ((USItype) (bl)))
411 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
412   __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
413 	   : "=r" (sh),							\
414 	     "=&r" (sl)							\
415 	   : "r" ((USItype) (ah)),					\
416 	     "rICal" ((USItype) (bh)),					\
417 	     "r" ((USItype) (al)),					\
418 	     "rICal" ((USItype) (bl)))
419 #endif
420 
421 #if defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) \
422     && W_TYPE_SIZE == 32
423 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
424   do {									\
425     if (__builtin_constant_p (bl) && -(USItype)(bl) < 0x100)		\
426       __asm__ ("subs\t%1, %4, %5\n\tadc\t%0, %2, %3"			\
427 	   : "=r" (sh), "=&r" (sl)					\
428 	       : "r" (ah), "rI" (bh),					\
429 		 "%r" (al), "rI" (-(USItype)(bl)) __CLOBBER_CC);	\
430     else								\
431       __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3"			\
432 	   : "=r" (sh), "=&r" (sl)					\
433 	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC);	\
434   } while (0)
435 /* FIXME: Extend the immediate range for the low word by using both ADDS and
436    SUBS, since they set carry in the same way.  Note: We need separate
437    definitions for thumb and non-thumb due to the absence of RSC on thumb.  */
438 #if defined (__thumb__)
439 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
440   do {									\
441     if (__builtin_constant_p (ah) && __builtin_constant_p (bh)		\
442 	&& (ah) == (bh))						\
443       __asm__ ("subs\t%1, %2, %3\n\tsbc\t%0, %0, %0"			\
444 	       : "=r" (sh), "=r" (sl)					\
445 	       : "r" (al), "rI" (bl) __CLOBBER_CC);			\
446     else if (__builtin_constant_p (al))					\
447       __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3"			\
448 	       : "=r" (sh), "=&r" (sl)					\
449 	       : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
450     else if (__builtin_constant_p (bl))					\
451       __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
452 	       : "=r" (sh), "=&r" (sl)					\
453 	       : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
454     else								\
455       __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
456 	       : "=r" (sh), "=&r" (sl)					\
457 	       : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
458     } while (0)
459 #else
460 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
461   do {									\
462     if (__builtin_constant_p (ah) && __builtin_constant_p (bh)		\
463 	&& (ah) == (bh))						\
464       __asm__ ("subs\t%1, %2, %3\n\tsbc\t%0, %0, %0"			\
465 	       : "=r" (sh), "=r" (sl)					\
466 	       : "r" (al), "rI" (bl) __CLOBBER_CC);			\
467     else if (__builtin_constant_p (al))					\
468       {									\
469 	if (__builtin_constant_p (ah))					\
470 	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"		\
471 		   : "=r" (sh), "=&r" (sl)				\
472 		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
473 	else								\
474 	  __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3"		\
475 		   : "=r" (sh), "=&r" (sl)				\
476 		   : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
477       }									\
478     else if (__builtin_constant_p (ah))					\
479       {									\
480 	if (__builtin_constant_p (bl))					\
481 	  __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"		\
482 		   : "=r" (sh), "=&r" (sl)				\
483 		   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
484 	else								\
485 	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"		\
486 		   : "=r" (sh), "=&r" (sl)				\
487 		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
488       }									\
489     else if (__builtin_constant_p (bl))					\
490       __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
491 	       : "=r" (sh), "=&r" (sl)					\
492 	       : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
493     else /* only bh might be a constant */				\
494       __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
495 	       : "=r" (sh), "=&r" (sl)					\
496 	       : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
497     } while (0)
498 #endif
499 #if defined (__ARM_ARCH_2__) || defined (__ARM_ARCH_2A__) \
500     || defined (__ARM_ARCH_3__)
501 #define umul_ppmm(xh, xl, a, b)						\
502   do {									\
503     register USItype __t0, __t1, __t2;					\
504     __asm__ ("%@ Inlined umul_ppmm\n"					\
505 	   "	mov	%2, %5, lsr #16\n"				\
506 	   "	mov	%0, %6, lsr #16\n"				\
507 	   "	bic	%3, %5, %2, lsl #16\n"				\
508 	   "	bic	%4, %6, %0, lsl #16\n"				\
509 	   "	mul	%1, %3, %4\n"					\
510 	   "	mul	%4, %2, %4\n"					\
511 	   "	mul	%3, %0, %3\n"					\
512 	   "	mul	%0, %2, %0\n"					\
513 	   "	adds	%3, %4, %3\n"					\
514 	   "	addcs	%0, %0, #65536\n"				\
515 	   "	adds	%1, %1, %3, lsl #16\n"				\
516 	   "	adc	%0, %0, %3, lsr #16"				\
517 	   : "=&r" ((USItype) (xh)), "=r" ((USItype) (xl)),		\
518 	     "=&r" (__t0), "=&r" (__t1), "=r" (__t2)			\
519 	   : "r" ((USItype) (a)), "r" ((USItype) (b)) __CLOBBER_CC);	\
520   } while (0)
521 #ifndef LONGLONG_STANDALONE
522 #define udiv_qrnnd(q, r, n1, n0, d) \
523   do { UWtype __r;							\
524     (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));			\
525     (r) = __r;								\
526   } while (0)
527 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
528 #endif /* LONGLONG_STANDALONE */
529 #else /* ARMv4 or newer */
530 #define umul_ppmm(xh, xl, a, b) \
531   __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
532 #define smul_ppmm(xh, xl, a, b) \
533   __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
534 #ifndef LONGLONG_STANDALONE
535 #define udiv_qrnnd(q, r, n1, n0, d) \
536   do { UWtype __di;							\
537     __di = __MPN(invert_limb) (d);					\
538     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
539   } while (0)
540 #define UDIV_PREINV_ALWAYS  1
541 #define UDIV_NEEDS_NORMALIZATION 1
542 #endif /* LONGLONG_STANDALONE */
543 #endif /* defined(__ARM_ARCH_2__) ... */
544 #define count_leading_zeros(count, x)  count_leading_zeros_gcc_clz(count, x)
545 #define count_trailing_zeros(count, x)  count_trailing_zeros_gcc_ctz(count, x)
546 #endif /* __arm__ */
547 
548 #if defined (__aarch64__) && W_TYPE_SIZE == 64
549 /* FIXME: Extend the immediate range for the low word by using both
550    ADDS and SUBS, since they set carry in the same way.  */
551 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
552   do {									\
553     if (__builtin_constant_p (bl) && -(UDItype)(bl) < 0x1000)		\
554       __asm__ ("subs\t%1, %x4, %5\n\tadc\t%0, %x2, %x3"			\
555 	       : "=r" (sh), "=&r" (sl)					\
556 	       : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)),		\
557 		 "%r" ((UDItype)(al)), "rI" (-(UDItype)(bl)) __CLOBBER_CC);\
558     else								\
559       __asm__ ("adds\t%1, %x4, %5\n\tadc\t%0, %x2, %x3"			\
560 	       : "=r" (sh), "=&r" (sl)					\
561 	       : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)),		\
562 		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC);\
563   } while (0)
564 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
565   do {									\
566     if (__builtin_constant_p (bl) && -(UDItype)(bl) < 0x1000)		\
567       __asm__ ("adds\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3"			\
568 	       : "=r,r" (sh), "=&r,&r" (sl)				\
569 	       : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)),	\
570 		 "r,Z"   ((UDItype)(al)), "rI,r" (-(UDItype)(bl)) __CLOBBER_CC);\
571     else								\
572       __asm__ ("subs\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3"			\
573 	       : "=r,r" (sh), "=&r,&r" (sl)				\
574 	       : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)),	\
575 		 "r,Z"   ((UDItype)(al)), "rI,r"  ((UDItype)(bl)) __CLOBBER_CC);\
  } while (0)
577 #if __GMP_GNUC_PREREQ (4,9)
578 #define umul_ppmm(w1, w0, u, v) \
579   do {									\
580     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
581     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
582     w1 = __ll >> 64;							\
583     w0 = __ll;								\
584   } while (0)
585 #endif
586 #if !defined (umul_ppmm)
587 #define umul_ppmm(ph, pl, m0, m1) \
588   do {									\
589     UDItype __m0 = (m0), __m1 = (m1);					\
590     __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (__m0), "r" (__m1));	\
591     (pl) = __m0 * __m1;							\
592   } while (0)
593 #endif
594 #define count_leading_zeros(count, x)  count_leading_zeros_gcc_clz(count, x)
595 #define count_trailing_zeros(count, x)  count_trailing_zeros_gcc_ctz(count, x)
596 #endif /* __aarch64__ */
597 
598 #if defined (__clipper__) && W_TYPE_SIZE == 32
599 #define umul_ppmm(w1, w0, u, v) \
600   ({union {UDItype __ll;						\
601 	   struct {USItype __l, __h;} __i;				\
602 	  } __x;							\
603   __asm__ ("mulwux %2,%0"						\
604 	   : "=r" (__x.__ll)						\
605 	   : "%0" ((USItype)(u)), "r" ((USItype)(v)));			\
606   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
607 #define smul_ppmm(w1, w0, u, v) \
608   ({union {DItype __ll;							\
609 	   struct {SItype __l, __h;} __i;				\
610 	  } __x;							\
611   __asm__ ("mulwx %2,%0"						\
612 	   : "=r" (__x.__ll)						\
613 	   : "%0" ((SItype)(u)), "r" ((SItype)(v)));			\
614   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
615 #define __umulsidi3(u, v) \
616   ({UDItype __w;							\
617     __asm__ ("mulwux %2,%0"						\
618 	     : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v)));	\
619     __w; })
620 #endif /* __clipper__ */
621 
622 /* Fujitsu vector computers.  */
623 #if defined (__uxp__) && W_TYPE_SIZE == 32
624 #define umul_ppmm(ph, pl, u, v) \
625   do {									\
626     union {UDItype __ll;						\
627 	   struct {USItype __h, __l;} __i;				\
628 	  } __x;							\
629     __asm__ ("mult.lu %1,%2,%0"	: "=r" (__x.__ll) : "%r" (u), "rK" (v));\
630     (ph) = __x.__i.__h;							\
631     (pl) = __x.__i.__l;							\
632   } while (0)
633 #define smul_ppmm(ph, pl, u, v) \
634   do {									\
635     union {UDItype __ll;						\
636 	   struct {USItype __h, __l;} __i;				\
637 	  } __x;							\
638     __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));	\
639     (ph) = __x.__i.__h;							\
640     (pl) = __x.__i.__l;							\
641   } while (0)
642 #endif
643 
644 #if defined (__gmicro__) && W_TYPE_SIZE == 32
645 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
646   __asm__ ("add.w %5,%1\n\taddx %3,%0"					\
647 	   : "=g" (sh), "=&g" (sl)					\
648 	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
649 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
650 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
651   __asm__ ("sub.w %5,%1\n\tsubx %3,%0"					\
652 	   : "=g" (sh), "=&g" (sl)					\
653 	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
654 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
655 #define umul_ppmm(ph, pl, m0, m1) \
656   __asm__ ("mulx %3,%0,%1"						\
657 	   : "=g" (ph), "=r" (pl)					\
658 	   : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
659 #define udiv_qrnnd(q, r, nh, nl, d) \
660   __asm__ ("divx %4,%0,%1"						\
661 	   : "=g" (q), "=r" (r)						\
662 	   : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
663 #define count_leading_zeros(count, x) \
664   __asm__ ("bsch/1 %1,%0"						\
665 	   : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
666 #endif
667 
668 #if defined (__hppa) && W_TYPE_SIZE == 32
669 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
670   __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0"			\
671 	   : "=r" (sh), "=&r" (sl)					\
672 	   : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
673 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
674   __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0"			\
675 	   : "=r" (sh), "=&r" (sl)					\
676 	   : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
677 #if defined (_PA_RISC1_1)
678 #define umul_ppmm(wh, wl, u, v) \
679   do {									\
680     union {UDItype __ll;						\
681 	   struct {USItype __h, __l;} __i;				\
682 	  } __x;							\
683     __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v));	\
684     (wh) = __x.__i.__h;							\
685     (wl) = __x.__i.__l;							\
686   } while (0)
687 #endif
688 #define count_leading_zeros(count, x) \
689   do {									\
690     USItype __tmp;							\
691     __asm__ (								\
692        "ldi		1,%0\n"						\
693 "	extru,=		%1,15,16,%%r0	; Bits 31..16 zero?\n"		\
694 "	extru,tr	%1,15,16,%1	; No.  Shift down, skip add.\n"	\
695 "	ldo		16(%0),%0	; Yes.  Perform add.\n"		\
696 "	extru,=		%1,23,8,%%r0	; Bits 15..8 zero?\n"		\
697 "	extru,tr	%1,23,8,%1	; No.  Shift down, skip add.\n"	\
698 "	ldo		8(%0),%0	; Yes.  Perform add.\n"		\
699 "	extru,=		%1,27,4,%%r0	; Bits 7..4 zero?\n"		\
700 "	extru,tr	%1,27,4,%1	; No.  Shift down, skip add.\n"	\
701 "	ldo		4(%0),%0	; Yes.  Perform add.\n"		\
702 "	extru,=		%1,29,2,%%r0	; Bits 3..2 zero?\n"		\
703 "	extru,tr	%1,29,2,%1	; No.  Shift down, skip add.\n"	\
704 "	ldo		2(%0),%0	; Yes.  Perform add.\n"		\
705 "	extru		%1,30,1,%1	; Extract bit 1.\n"		\
706 "	sub		%0,%1,%0	; Subtract it.\n"		\
707 	: "=r" (count), "=r" (__tmp) : "1" (x));			\
708   } while (0)
709 #endif /* hppa */
710 
/* These macros are for ABI=2.0w.  In ABI=2.0n they can't be used, since GCC
   (3.2) puts a long long into two adjacent 32-bit registers.  Presumably this
   is just a case of no direct support for 2.0n but treating it like 1.0. */
714 #if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
715 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
716   __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0"			\
717 	   : "=r" (sh), "=&r" (sl)					\
718 	   : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
719 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
720   __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0"			\
721 	   : "=r" (sh), "=&r" (sl)					\
722 	   : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
723 #endif /* hppa */
724 
725 #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
726 #if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch)
727 #define add_ssaaaa(sh, sl, ah, al, bh, bl)				\
728   do {									\
729 /*  if (__builtin_constant_p (bl))					\
730       __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3"				\
731 	       : "=r" (sh), "=&r" (sl)					\
732 	       : "0"  (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\
733     else								\
734 */    __asm__ ("alr\t%1,%5\n\talcr\t%0,%3"				\
735 	       : "=r" (sh), "=&r" (sl)					\
736 	       : "0"  (ah), "r" (bh), "%1" (al), "r" (bl)__CLOBBER_CC);	\
737   } while (0)
738 #define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
739   do {									\
740 /*  if (__builtin_constant_p (bl))					\
741       __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3"				\
742 	       : "=r" (sh), "=&r" (sl)					\
743 	       : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC);	\
744     else								\
745 */    __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3"				\
746 	       : "=r" (sh), "=&r" (sl)					\
747 	       : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC);	\
748   } while (0)
749 #if __GMP_GNUC_PREREQ (4,5)
750 #define umul_ppmm(xh, xl, m0, m1)					\
751   do {									\
752     union {UDItype __ll;						\
753 	   struct {USItype __h, __l;} __i;				\
754 	  } __x;							\
755     __x.__ll = (UDItype) (m0) * (UDItype) (m1);				\
756     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
757   } while (0)
758 #else
759 #if 0
760 /* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
761    with a new enough processor pretending we have 32-bit registers.  */
762 #define umul_ppmm(xh, xl, m0, m1)					\
763   do {									\
764     union {UDItype __ll;						\
765 	   struct {USItype __h, __l;} __i;				\
766 	  } __x;							\
767     __asm__ ("mlr\t%0,%2"						\
768 	     : "=r" (__x.__ll)						\
769 	     : "%0" (m0), "r" (m1));					\
770     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
771   } while (0)
772 #else
773 #define umul_ppmm(xh, xl, m0, m1)					\
774   do {									\
775   /* When we have 64-bit regs and gcc is aware of that, we cannot simply use
776      DImode for the product, since that would be allocated to a single 64-bit
777      register, whereas mlr uses the low 32-bits of an even-odd register pair.
778   */									\
779     register USItype __r0 __asm__ ("0");				\
780     register USItype __r1 __asm__ ("1") = (m0);				\
781     __asm__ ("mlr\t%0,%3"						\
782 	     : "=r" (__r0), "=r" (__r1)					\
783 	     : "r" (__r1), "r" (m1));					\
784     (xh) = __r0; (xl) = __r1;						\
785   } while (0)
786 #endif /* if 0 */
787 #endif
788 #if 0
789 /* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
790    with a new enough processor pretending we have 32-bit registers.  */
791 #define udiv_qrnnd(q, r, n1, n0, d)					\
792   do {									\
793     union {UDItype __ll;						\
794 	   struct {USItype __h, __l;} __i;				\
795 	  } __x;							\
796     __x.__i.__h = n1; __x.__i.__l = n0;					\
797     __asm__ ("dlr\t%0,%2"						\
798 	     : "=r" (__x.__ll)						\
799 	     : "0" (__x.__ll), "r" (d));				\
800     (q) = __x.__i.__l; (r) = __x.__i.__h;				\
801   } while (0)
802 #else
803 #define udiv_qrnnd(q, r, n1, n0, d)					\
804   do {									\
805     register USItype __r0 __asm__ ("0") = (n1);				\
806     register USItype __r1 __asm__ ("1") = (n0);				\
807     __asm__ ("dlr\t%0,%4"						\
808 	     : "=r" (__r0), "=r" (__r1)					\
809 	     : "r" (__r0), "r" (__r1), "r" (d));			\
810     (q) = __r1; (r) = __r0;						\
811   } while (0)
812 #endif /* if 0 */
813 #else /* if __zarch__ */
814 /* FIXME: this fails if gcc knows about the 64-bit registers.  */
815 #define smul_ppmm(xh, xl, m0, m1)					\
816   do {									\
817     union {DItype __ll;							\
818 	   struct {USItype __h, __l;} __i;				\
819 	  } __x;							\
820     __asm__ ("mr\t%0,%2"						\
821 	     : "=r" (__x.__ll)						\
822 	     : "%0" (m0), "r" (m1));					\
823     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
824   } while (0)
825 /* FIXME: this fails if gcc knows about the 64-bit registers.  */
826 #define sdiv_qrnnd(q, r, n1, n0, d)					\
827   do {									\
828     union {DItype __ll;							\
829 	   struct {USItype __h, __l;} __i;				\
830 	  } __x;							\
831     __x.__i.__h = n1; __x.__i.__l = n0;					\
832     __asm__ ("dr\t%0,%2"						\
833 	     : "=r" (__x.__ll)						\
834 	     : "0" (__x.__ll), "r" (d));				\
835     (q) = __x.__i.__l; (r) = __x.__i.__h;				\
836   } while (0)
837 #endif /* if __zarch__ */
838 #endif
839 
840 #if defined (__s390x__) && W_TYPE_SIZE == 64
841 /* We need to cast operands with register constraints, otherwise their types
842    will be assumed to be SImode by gcc.  For these machines, such operations
843    will insert a value into the low 32 bits, and leave the high 32 bits with
844    garbage.  */
845 #define add_ssaaaa(sh, sl, ah, al, bh, bl)				\
846   do {									\
847     __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3"				\
848 	       : "=r" (sh), "=&r" (sl)					\
849 	       : "0"  ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
850 		 "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
851   } while (0)
852 #define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
853   do {									\
854     __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3"				\
855 	     : "=r" (sh), "=&r" (sl)					\
856 	     : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
857 	       "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC);	\
858   } while (0)
859 #define umul_ppmm(xh, xl, m0, m1)					\
860   do {									\
861     union {unsigned int __attribute__ ((mode(TI))) __ll;		\
862 	   struct {UDItype __h, __l;} __i;				\
863 	  } __x;							\
864     __asm__ ("mlgr\t%0,%2"						\
865 	     : "=r" (__x.__ll)						\
866 	     : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1)));		\
867     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
868   } while (0)
869 #define udiv_qrnnd(q, r, n1, n0, d)					\
870   do {									\
871     union {unsigned int __attribute__ ((mode(TI))) __ll;		\
872 	   struct {UDItype __h, __l;} __i;				\
873 	  } __x;							\
874     __x.__i.__h = n1; __x.__i.__l = n0;					\
875     __asm__ ("dlgr\t%0,%2"						\
876 	     : "=r" (__x.__ll)						\
877 	     : "0" (__x.__ll), "r" ((UDItype)(d)));			\
878     (q) = __x.__i.__l; (r) = __x.__i.__h;				\
879   } while (0)
880 #if 0 /* FIXME: Enable for z10 (?) */
881 #define count_leading_zeros(cnt, x)					\
882   do {									\
883     union {unsigned int __attribute__ ((mode(TI))) __ll;		\
884 	   struct {UDItype __h, __l;} __i;				\
885 	  } __clr_cnt;							\
886     __asm__ ("flogr\t%0,%1"						\
887 	     : "=r" (__clr_cnt.__ll)					\
888 	     : "r" (x) __CLOBBER_CC);					\
889     (cnt) = __clr_cnt.__i.__h;						\
890   } while (0)
891 #endif
892 #endif
893 
894 /* On x86 and x86_64, every asm implicitly clobbers "flags" and "fpsr",
895    so we don't need __CLOBBER_CC.  */
896 #if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
897 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
898   __asm__ ("addl %5,%k1\n\tadcl %3,%k0"					\
899 	   : "=r" (sh), "=&r" (sl)					\
900 	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
901 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
902 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
903   __asm__ ("subl %5,%k1\n\tsbbl %3,%k0"					\
904 	   : "=r" (sh), "=&r" (sl)					\
905 	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
906 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
907 #define umul_ppmm(w1, w0, u, v) \
908   __asm__ ("mull %3"							\
909 	   : "=a" (w0), "=d" (w1)					\
910 	   : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
911 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
912   __asm__ ("divl %4"		     /* stringification in K&R C */	\
913 	   : "=a" (q), "=d" (r)						\
914 	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
915 
916 #if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
/* Pentium bsrl takes between 10 and 72 cycles depending on where the most
   significant 1 bit is, hence the use of the following alternatives.  bsfl
   is slow too, between 18 and 42 cycles depending on where the least
   significant 1 bit is, so let the generic count_trailing_zeros below make
   use of the count_leading_zeros here too.  */
922 
923 #if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
924 /* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
925    cache miss reading from __clz_tab.  For P55 it's favoured over the float
926    below so as to avoid mixing MMX and x87, since the penalty for switching
927    between the two is about 100 cycles.
928 
929    The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
930    16, -1 for 8, or 0 otherwise.  This could be written equivalently as
931    follows, but as of gcc 2.95.2 it results in conditional jumps.
932 
933        __shift = -(__n < 0x1000000);
934        __shift -= (__n < 0x10000);
935        __shift -= (__n < 0x100);
936 
937    The middle two sbbl and cmpl's pair, and with luck something gcc
938    generates might pair with the first cmpl and the last sbbl.  The "32+1"
939    constant could be folded into __clz_tab[], but it doesn't seem worth
940    making a different table just for that.  */
941 
942 #define count_leading_zeros(c,n)					\
943   do {									\
944     USItype  __n = (n);							\
945     USItype  __shift;							\
946     __asm__ ("cmpl  $0x1000000, %1\n"					\
947 	     "sbbl  %0, %0\n"						\
948 	     "cmpl  $0x10000, %1\n"					\
949 	     "sbbl  $0, %0\n"						\
950 	     "cmpl  $0x100, %1\n"					\
951 	     "sbbl  $0, %0\n"						\
952 	     : "=&r" (__shift) : "r"  (__n));				\
953     __shift = __shift*8 + 24 + 1;					\
954     (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift];			\
955   } while (0)
956 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
957 #define COUNT_LEADING_ZEROS_0   31   /* n==0 indistinguishable from n==1 */
958 
959 #else /* ! pentiummmx || LONGLONG_STANDALONE */
960 /* The following should be a fixed 14 cycles or so.  Some scheduling
961    opportunities should be available between the float load/store too.  This
962    sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
963    apparently suggested by the Intel optimizing manual (don't know exactly
964    where).  gcc 2.95 or up will be best for this, so the "double" is
965    correctly aligned on the stack.  */
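/* The conversion to double gives n (nonzero and at most 32 bits, so it is
   represented exactly) an exponent field of 0x3FF + 31 - count_leading_zeros(n),
   sitting in bits 20..30 of the high word __u.a[1] on little-endian x86;
   subtracting that shifted-down field from 0x3FF + 31 recovers the count.  */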
966 #define count_leading_zeros(c,n)					\
967   do {									\
968     union {								\
969       double    d;							\
970       unsigned  a[2];							\
971     } __u;								\
972     __u.d = (UWtype) (n);						\
973     (c) = 0x3FF + 31 - (__u.a[1] >> 20);				\
974   } while (0)
975 #define COUNT_LEADING_ZEROS_0   (0x3FF + 31)
976 #endif /* pentiummx */
977 
978 #else /* ! pentium */
979 
980 #if __GMP_GNUC_PREREQ (3,4)  /* using bsrl */
981 #define count_leading_zeros(count,x)  count_leading_zeros_gcc_clz(count,x)
982 #endif /* gcc clz */
983 
984 /* On P6, gcc prior to 3.0 generates a partial register stall for
985    __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
986    being 1 code byte smaller.  "31-__cbtmp" is a workaround, probably at the
987    cost of one extra instruction.  Do this for "i386" too, since that means
988    generic x86.  */
989 #if ! defined (count_leading_zeros) && __GNUC__ < 3			\
990   && (HAVE_HOST_CPU_i386						\
991       || HAVE_HOST_CPU_i686						\
992       || HAVE_HOST_CPU_pentiumpro					\
993       || HAVE_HOST_CPU_pentium2						\
994       || HAVE_HOST_CPU_pentium3)
995 #define count_leading_zeros(count, x)					\
996   do {									\
997     USItype __cbtmp;							\
998     ASSERT ((x) != 0);							\
999     __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));	\
1000     (count) = 31 - __cbtmp;						\
1001   } while (0)
1002 #endif /* gcc<3 asm bsrl */
1003 
1004 #ifndef count_leading_zeros
1005 #define count_leading_zeros(count, x)					\
1006   do {									\
1007     USItype __cbtmp;							\
1008     ASSERT ((x) != 0);							\
1009     __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));	\
1010     (count) = __cbtmp ^ 31;						\
1011   } while (0)
1012 #endif /* asm bsrl */
1013 
1014 #if __GMP_GNUC_PREREQ (3,4)  /* using bsfl */
1015 #define count_trailing_zeros(count,x)  count_trailing_zeros_gcc_ctz(count,x)
1016 #endif /* gcc ctz */
1017 
1018 #ifndef count_trailing_zeros
1019 #define count_trailing_zeros(count, x)					\
1020   do {									\
1021     ASSERT ((x) != 0);							\
1022     __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x)));	\
1023   } while (0)
1024 #endif /* asm bsfl */
1025 
1026 #endif /* ! pentium */
1027 
1028 #endif /* 80x86 */
1029 
1030 #if defined (__amd64__) && W_TYPE_SIZE == 64
1031 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1032   __asm__ ("addq %5,%q1\n\tadcq %3,%q0"					\
1033 	   : "=r" (sh), "=&r" (sl)					\
1034 	   : "0"  ((UDItype)(ah)), "rme" ((UDItype)(bh)),		\
1035 	     "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
1036 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1037   __asm__ ("subq %5,%q1\n\tsbbq %3,%q0"					\
1038 	   : "=r" (sh), "=&r" (sl)					\
1039 	   : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)),		\
1040 	     "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
1041 #if X86_ASM_MULX \
1042    && (HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell \
1043        || HAVE_HOST_CPU_skylake || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen)
1044 #define umul_ppmm(w1, w0, u, v) \
1045   __asm__ ("mulx\t%3, %0, %1"						\
1046 	   : "=r" (w0), "=r" (w1)					\
1047 	   : "%d" ((UDItype)(u)), "rm" ((UDItype)(v)))
1048 #else
1049 #define umul_ppmm(w1, w0, u, v) \
1050   __asm__ ("mulq\t%3"							\
1051 	   : "=a" (w0), "=d" (w1)					\
1052 	   : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
1053 #endif
1054 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
1055   __asm__ ("divq %4"		     /* stringification in K&R C */	\
1056 	   : "=a" (q), "=d" (r)						\
1057 	   : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
1058 
1059 #if HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell || HAVE_HOST_CPU_skylake \
1060   || HAVE_HOST_CPU_k10 || HAVE_HOST_CPU_bd1 || HAVE_HOST_CPU_bd2	\
1061   || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen	\
1062   || HAVE_HOST_CPU_bobcat || HAVE_HOST_CPU_jaguar
1063 #define count_leading_zeros(count, x)					\
1064   do {									\
1065     /* This is lzcnt, spelled for older assemblers.  Destination and */	\
    /* source must be 64-bit registers, hence the cast and %q.       */	\
1067     __asm__ ("rep;bsr\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x)));	\
1068   } while (0)
1069 #define COUNT_LEADING_ZEROS_0 64
1070 #else
1071 #define count_leading_zeros(count, x)					\
1072   do {									\
1073     UDItype __cbtmp;							\
1074     ASSERT ((x) != 0);							\
1075     __asm__ ("bsr\t%1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x)));	\
1076     (count) = __cbtmp ^ 63;						\
1077   } while (0)
1078 #endif
1079 
1080 #if HAVE_HOST_CPU_bd2 || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 \
1081   || HAVE_HOST_CPU_zen || HAVE_HOST_CPU_jaguar
1082 #define count_trailing_zeros(count, x)					\
1083   do {									\
1084     /* This is tzcnt, spelled for older assemblers.  Destination and */	\
    /* source must be 64-bit registers, hence the cast and %q.       */	\
1086     __asm__ ("rep;bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x)));	\
1087   } while (0)
1088 #define COUNT_TRAILING_ZEROS_0 64
1089 #else
1090 #define count_trailing_zeros(count, x)					\
1091   do {									\
1092     ASSERT ((x) != 0);							\
1093     __asm__ ("bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x)));	\
1094   } while (0)
1095 #endif
1096 #endif /* __amd64__ */
1097 
1098 #if defined (__i860__) && W_TYPE_SIZE == 32
1099 #define rshift_rhlc(r,h,l,c) \
1100   __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0"				\
	   : "=r" (r) : "r" (h), "r" (l), "rn" (c))
1102 #endif /* i860 */
1103 
1104 #if defined (__i960__) && W_TYPE_SIZE == 32
1105 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1106   __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0"			\
1107 	   : "=r" (sh), "=&r" (sl)					\
1108 	   : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
1109 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1110   __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0"			\
1111 	   : "=r" (sh), "=&r" (sl)					\
1112 	   : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
1113 #define umul_ppmm(w1, w0, u, v) \
1114   ({union {UDItype __ll;						\
1115 	   struct {USItype __l, __h;} __i;				\
1116 	  } __x;							\
1117   __asm__ ("emul %2,%1,%0"						\
1118 	   : "=d" (__x.__ll) : "%dI" (u), "dI" (v));			\
1119   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1120 #define __umulsidi3(u, v) \
1121   ({UDItype __w;							\
1122     __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v));	\
1123     __w; })
1124 #define udiv_qrnnd(q, r, nh, nl, d) \
1125   do {									\
1126     union {UDItype __ll;						\
1127 	   struct {USItype __l, __h;} __i;				\
	  } __nn, __rq;							\
1129     __nn.__i.__h = (nh); __nn.__i.__l = (nl);				\
1130     __asm__ ("ediv %d,%n,%0"						\
1131 	   : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d));		\
1132     (r) = __rq.__i.__l; (q) = __rq.__i.__h;				\
1133   } while (0)
1134 #define count_leading_zeros(count, x) \
1135   do {									\
1136     USItype __cbtmp;							\
1137     __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x));		\
1138     (count) = __cbtmp ^ 31;						\
1139   } while (0)
1140 #define COUNT_LEADING_ZEROS_0 (-32) /* sic */
1141 #if defined (__i960mx)		/* what is the proper symbol to test??? */
1142 #define rshift_rhlc(r,h,l,c) \
1143   do {									\
1144     union {UDItype __ll;						\
1145 	   struct {USItype __l, __h;} __i;				\
1146 	  } __nn;							\
1147     __nn.__i.__h = (h); __nn.__i.__l = (l);				\
1148     __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c));	\
  } while (0)
1150 #endif /* i960mx */
1151 #endif /* i960 */
1152 
1153 #if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
1154      || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
1155      || defined (__mc5307__)) && W_TYPE_SIZE == 32
1156 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1157   __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0"				\
1158 	   : "=d" (sh), "=&d" (sl)					\
1159 	   : "0"  ((USItype)(ah)), "d" ((USItype)(bh)),			\
1160 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1161 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1162   __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0"				\
1163 	   : "=d" (sh), "=&d" (sl)					\
1164 	   : "0" ((USItype)(ah)), "d" ((USItype)(bh)),			\
1165 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
1166 /* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r.  */
1167 #if defined (__mc68020__) || defined(mc68020) \
1168      || defined (__mc68030__) || defined (mc68030) \
1169      || defined (__mc68040__) || defined (mc68040) \
1170      || defined (__mcpu32__) || defined (mcpu32) \
1171      || defined (__NeXT__)
1172 #define umul_ppmm(w1, w0, u, v) \
1173   __asm__ ("mulu%.l %3,%1:%0"						\
1174 	   : "=d" (w0), "=d" (w1)					\
1175 	   : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
1176 #define udiv_qrnnd(q, r, n1, n0, d) \
1177   __asm__ ("divu%.l %4,%1:%0"						\
1178 	   : "=d" (q), "=d" (r)						\
1179 	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1180 #define sdiv_qrnnd(q, r, n1, n0, d) \
1181   __asm__ ("divs%.l %4,%1:%0"						\
1182 	   : "=d" (q), "=d" (r)						\
1183 	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1184 #else /* for other 68k family members use 16x16->32 multiplication */
1185 #define umul_ppmm(xh, xl, a, b) \
1186   do { USItype __umul_tmp1, __umul_tmp2;				\
1187 	__asm__ ("| Inlined umul_ppmm\n"				\
1188 "	move%.l	%5,%3\n"						\
1189 "	move%.l	%2,%0\n"						\
1190 "	move%.w	%3,%1\n"						\
1191 "	swap	%3\n"							\
1192 "	swap	%0\n"							\
1193 "	mulu%.w	%2,%1\n"						\
1194 "	mulu%.w	%3,%0\n"						\
1195 "	mulu%.w	%2,%3\n"						\
1196 "	swap	%2\n"							\
1197 "	mulu%.w	%5,%2\n"						\
1198 "	add%.l	%3,%2\n"						\
1199 "	jcc	1f\n"							\
1200 "	add%.l	%#0x10000,%0\n"						\
1201 "1:	move%.l	%2,%3\n"						\
1202 "	clr%.w	%2\n"							\
1203 "	swap	%2\n"							\
1204 "	swap	%3\n"							\
1205 "	clr%.w	%3\n"							\
1206 "	add%.l	%3,%1\n"						\
1207 "	addx%.l	%2,%0\n"						\
1208 "	| End inlined umul_ppmm"					\
1209 	      : "=&d" (xh), "=&d" (xl),					\
1210 		"=d" (__umul_tmp1), "=&d" (__umul_tmp2)			\
1211 	      : "%2" ((USItype)(a)), "d" ((USItype)(b)));		\
1212   } while (0)
1213 #endif /* not mc68020 */
1214 /* The '020, '030, '040 and '060 have bitfield insns.
   GCC 3.4 defines __mc68020__ when in CPU32 mode; check for __mcpu32__ to
   exclude bfffo on that chip (bitfield insns not available).  */
1217 #if (defined (__mc68020__) || defined (mc68020)    \
1218      || defined (__mc68030__) || defined (mc68030) \
1219      || defined (__mc68040__) || defined (mc68040) \
1220      || defined (__mc68060__) || defined (mc68060) \
1221      || defined (__NeXT__))			   \
1222   && ! defined (__mcpu32__)
1223 #define count_leading_zeros(count, x) \
1224   __asm__ ("bfffo %1{%b2:%b2},%0"					\
1225 	   : "=d" (count)						\
1226 	   : "od" ((USItype) (x)), "n" (0))
1227 #define COUNT_LEADING_ZEROS_0 32
1228 #endif
1229 #endif /* mc68000 */
1230 
1231 #if defined (__m88000__) && W_TYPE_SIZE == 32
1232 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1233   __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3"			\
1234 	   : "=r" (sh), "=&r" (sl)					\
1235 	   : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
1236 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1237   __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3"			\
1238 	   : "=r" (sh), "=&r" (sl)					\
1239 	   : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
1240 #define count_leading_zeros(count, x) \
1241   do {									\
1242     USItype __cbtmp;							\
1243     __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x));			\
1244     (count) = __cbtmp ^ 31;						\
1245   } while (0)
1246 #define COUNT_LEADING_ZEROS_0 63 /* sic */
1247 #if defined (__m88110__)
1248 #define umul_ppmm(wh, wl, u, v) \
1249   do {									\
1250     union {UDItype __ll;						\
1251 	   struct {USItype __h, __l;} __i;				\
1252 	  } __x;							\
1253     __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v));	\
1254     (wh) = __x.__i.__h;							\
1255     (wl) = __x.__i.__l;							\
1256   } while (0)
1257 #define udiv_qrnnd(q, r, n1, n0, d) \
1258   ({union {UDItype __ll;						\
1259 	   struct {USItype __h, __l;} __i;				\
1260 	  } __x, __q;							\
1261   __x.__i.__h = (n1); __x.__i.__l = (n0);				\
1262   __asm__ ("divu.d %0,%1,%2"						\
1263 	   : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d));		\
  (r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; })
1265 #endif /* __m88110__ */
1266 #endif /* __m88000__ */
1267 
1268 #if defined (__mips) && W_TYPE_SIZE == 32
1269 #if __GMP_GNUC_PREREQ (4,4)
1270 #define umul_ppmm(w1, w0, u, v) \
1271   do {									\
1272     UDItype __ll = (UDItype)(u) * (v);					\
1273     w1 = __ll >> 32;							\
1274     w0 = __ll;								\
1275   } while (0)
1276 #endif
1277 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
1278 #define umul_ppmm(w1, w0, u, v) \
1279   __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1280 #endif
1281 #if !defined (umul_ppmm)
1282 #define umul_ppmm(w1, w0, u, v) \
1283   __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1"				\
1284 	   : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1285 #endif
1286 #endif /* __mips */
1287 
1288 #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
1289 #if defined (_MIPS_ARCH_MIPS64R6)
1290 #define umul_ppmm(w1, w0, u, v) \
1291   do {									\
1292     UDItype __m0 = (u), __m1 = (v);					\
1293     (w0) = __m0 * __m1;							\
1294     __asm__ ("dmuhu\t%0, %1, %2" : "=d" (w1) : "d" (__m0), "d" (__m1));	\
1295   } while (0)
1296 #endif
1297 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (4,4)
1298 #define umul_ppmm(w1, w0, u, v) \
1299   do {									\
1300     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
1301     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
1302     w1 = __ll >> 64;							\
1303     w0 = __ll;								\
1304   } while (0)
1305 #endif
1306 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
1307 #define umul_ppmm(w1, w0, u, v) \
1308   __asm__ ("dmultu %2,%3"						\
1309 	   : "=l" (w0), "=h" (w1)					\
1310 	   : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
1311 #endif
1312 #if !defined (umul_ppmm)
1313 #define umul_ppmm(w1, w0, u, v) \
1314   __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1"				\
1315 	   : "=d" (w0), "=d" (w1)					\
1316 	   : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
1317 #endif
1318 #endif /* __mips */
1319 
1320 #if defined (__mmix__) && W_TYPE_SIZE == 64
1321 #define umul_ppmm(w1, w0, u, v) \
1322   __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
1323 #endif
1324 
1325 #if defined (__ns32000__) && W_TYPE_SIZE == 32
1326 #define umul_ppmm(w1, w0, u, v) \
1327   ({union {UDItype __ll;						\
1328 	   struct {USItype __l, __h;} __i;				\
1329 	  } __x;							\
1330   __asm__ ("meid %2,%0"							\
1331 	   : "=g" (__x.__ll)						\
1332 	   : "%0" ((USItype)(u)), "g" ((USItype)(v)));			\
1333   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1334 #define __umulsidi3(u, v) \
1335   ({UDItype __w;							\
1336     __asm__ ("meid %2,%0"						\
1337 	     : "=g" (__w)						\
1338 	     : "%0" ((USItype)(u)), "g" ((USItype)(v)));		\
1339     __w; })
1340 #define udiv_qrnnd(q, r, n1, n0, d) \
1341   ({union {UDItype __ll;						\
1342 	   struct {USItype __l, __h;} __i;				\
1343 	  } __x;							\
1344   __x.__i.__h = (n1); __x.__i.__l = (n0);				\
1345   __asm__ ("deid %2,%0"							\
1346 	   : "=g" (__x.__ll)						\
1347 	   : "0" (__x.__ll), "g" ((USItype)(d)));			\
1348   (r) = __x.__i.__l; (q) = __x.__i.__h; })
1349 #define count_trailing_zeros(count,x) \
1350   do {									\
1351     __asm__ ("ffsd	%2,%0"						\
1352 	     : "=r" (count)						\
1353 	     : "0" ((USItype) 0), "r" ((USItype) (x)));			\
1354   } while (0)
1355 #endif /* __ns32000__ */
1356 
1357 /* In the past we had a block of various #defines tested
1358        _ARCH_PPC    - AIX
1359        _ARCH_PWR    - AIX
1360        __powerpc__  - gcc
1361        __POWERPC__  - BEOS
1362        __ppc__      - Darwin
1363        PPC          - old gcc, GNU/Linux, SysV
   The plain PPC test was no good for vxWorks, since PPC is defined there on
   all CPUs (eg. m68k too), as a constant which one is expected to compare
   CPU_FAMILY against.
1367 
1368    At any rate, this was pretty unattractive and a bit fragile.  The use of
1369    HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
1370    getting the desired effect.
1371 
1372    ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
1373    the system vendor compilers.  (Is that vendor compilers with inline asm,
1374    or what?)  */
1375 
1376 #if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc)	\
1377   && W_TYPE_SIZE == 32
1378 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1379   do {									\
1380     if (__builtin_constant_p (bh) && (bh) == 0)				\
1381       __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"			\
1382 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)	\
1383 		 __CLOBBER_CC);						\
1384     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
1385       __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"			\
1386 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)	\
1387 		 __CLOBBER_CC);						\
1388     else								\
1389       __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"			\
1390 	       : "=r" (sh), "=&r" (sl)					\
1391 	       : "r" (ah), "r" (bh), "%r" (al), "rI" (bl)		\
1392 		 __CLOBBER_CC);						\
1393   } while (0)
1394 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1395   do {									\
1396     if (__builtin_constant_p (ah) && (ah) == 0)				\
1397       __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"			\
1398 	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)	\
1399 		 __CLOBBER_CC);						\
1400     else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)		\
1401       __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"			\
1402 	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)	\
1403 		 __CLOBBER_CC);						\
1404     else if (__builtin_constant_p (bh) && (bh) == 0)			\
1405       __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"			\
1406 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)	\
1407 		 __CLOBBER_CC);						\
1408     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
1409       __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"			\
1410 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)	\
1411 		 __CLOBBER_CC);						\
1412     else								\
1413       __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"			\
1414 	       : "=r" (sh), "=&r" (sl)					\
1415 	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl)		\
1416 		 __CLOBBER_CC);						\
1417   } while (0)
1418 #define count_leading_zeros(count, x) \
1419   __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x))
1420 #define COUNT_LEADING_ZEROS_0 32
1421 #if HAVE_HOST_CPU_FAMILY_powerpc
1422 #if __GMP_GNUC_PREREQ (4,4)
1423 #define umul_ppmm(w1, w0, u, v) \
1424   do {									\
1425     UDItype __ll = (UDItype)(u) * (v);					\
1426     w1 = __ll >> 32;							\
1427     w0 = __ll;								\
1428   } while (0)
1429 #endif
1430 #if !defined (umul_ppmm)
1431 #define umul_ppmm(ph, pl, m0, m1) \
1432   do {									\
1433     USItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
1435     (pl) = __m0 * __m1;							\
1436   } while (0)
1437 #endif
1438 #define smul_ppmm(ph, pl, m0, m1) \
1439   do {									\
1440     SItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
1442     (pl) = __m0 * __m1;							\
1443   } while (0)
1444 #else
1445 #define smul_ppmm(xh, xl, m0, m1) \
1446   __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
1447 #define sdiv_qrnnd(q, r, nh, nl, d) \
1448   __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
1449 #endif
1450 #endif /* 32-bit POWER architecture variants.  */
1451 
1452 /* We should test _IBMR2 here when we add assembly support for the system
1453    vendor compilers.  */
1454 #if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
1455 #if !defined (_LONG_LONG_LIMB)
/* _LONG_LONG_LIMB means ABI=mode32, where adde operates on 32-bit values,
   so use adde etc only when _LONG_LONG_LIMB is not defined.  */
1458 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1459   do {									\
1460     if (__builtin_constant_p (bh) && (bh) == 0)				\
1461       __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"			\
1462 	       : "=r" (sh), "=&r" (sl)					\
1463 	       : "r"  ((UDItype)(ah)),					\
1464 		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl))		\
1465 		 __CLOBBER_CC);						\
1466     else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)		\
1467       __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"			\
1468 	       : "=r" (sh), "=&r" (sl)					\
1469 	       : "r"  ((UDItype)(ah)),					\
1470 		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl))		\
1471 		 __CLOBBER_CC);						\
1472     else								\
1473       __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"			\
1474 	       : "=r" (sh), "=&r" (sl)					\
1475 	       : "r"  ((UDItype)(ah)), "r"  ((UDItype)(bh)),		\
1476 		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl))		\
1477 		 __CLOBBER_CC);						\
1478   } while (0)
1479 /* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
1480    This might seem strange, but gcc folds away the dead code late.  */
1481 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1482   do {									\
1483     if (__builtin_constant_p (bl) && bl > -0x8000 && bl <= 0x8000) {	\
1484 	if (__builtin_constant_p (ah) && (ah) == 0)			\
1485 	  __asm__ ("addic %1,%3,%4\n\tsubfze %0,%2"			\
1486 		   : "=r" (sh), "=&r" (sl)				\
1487 		   :                       "r" ((UDItype)(bh)),		\
1488 		     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))	\
1489 		     __CLOBBER_CC);					\
1490 	else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)	\
1491 	  __asm__ ("addic %1,%3,%4\n\tsubfme %0,%2"			\
1492 		   : "=r" (sh), "=&r" (sl)				\
1493 		   :                       "r" ((UDItype)(bh)),		\
1494 		     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))	\
1495 		     __CLOBBER_CC);					\
1496 	else if (__builtin_constant_p (bh) && (bh) == 0)		\
1497 	  __asm__ ("addic %1,%3,%4\n\taddme %0,%2"			\
1498 		   : "=r" (sh), "=&r" (sl)				\
1499 		   : "r"  ((UDItype)(ah)),				\
1500 		     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))	\
1501 		     __CLOBBER_CC);					\
1502 	else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)	\
1503 	  __asm__ ("addic %1,%3,%4\n\taddze %0,%2"			\
1504 		   : "=r" (sh), "=&r" (sl)				\
1505 		   : "r"  ((UDItype)(ah)),				\
1506 		     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))	\
1507 		     __CLOBBER_CC);					\
1508 	else								\
1509 	  __asm__ ("addic %1,%4,%5\n\tsubfe %0,%3,%2"			\
1510 		   : "=r" (sh), "=&r" (sl)				\
1511 		   : "r"  ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
1512 		     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))	\
1513 		     __CLOBBER_CC);					\
1514     } else {								\
1515 	if (__builtin_constant_p (ah) && (ah) == 0)			\
1516 	  __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"			\
1517 		   : "=r" (sh), "=&r" (sl)				\
1518 		   :                       "r" ((UDItype)(bh)),		\
1519 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))		\
1520 		     __CLOBBER_CC);					\
1521 	else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)	\
1522 	  __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"			\
1523 		   : "=r" (sh), "=&r" (sl)				\
1524 		   :                       "r" ((UDItype)(bh)),		\
1525 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))		\
1526 		     __CLOBBER_CC);					\
1527 	else if (__builtin_constant_p (bh) && (bh) == 0)		\
1528 	  __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"			\
1529 		   : "=r" (sh), "=&r" (sl)				\
1530 		   : "r"  ((UDItype)(ah)),				\
1531 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))		\
1532 		     __CLOBBER_CC);					\
1533 	else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)	\
1534 	  __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"			\
1535 		   : "=r" (sh), "=&r" (sl)				\
1536 		   : "r"  ((UDItype)(ah)),				\
1537 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))		\
1538 		     __CLOBBER_CC);					\
1539 	else								\
1540 	  __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"		\
1541 		   : "=r" (sh), "=&r" (sl)				\
1542 		   : "r"  ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
1543 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))		\
1544 		     __CLOBBER_CC);					\
1545     }									\
1546   } while (0)
1547 #endif /* ! _LONG_LONG_LIMB */
1548 #define count_leading_zeros(count, x) \
1549   __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
1550 #define COUNT_LEADING_ZEROS_0 64
1551 #if __GMP_GNUC_PREREQ (4,8)
1552 #define umul_ppmm(w1, w0, u, v) \
1553   do {									\
1554     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
1555     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
1556     w1 = __ll >> 64;							\
1557     w0 = __ll;								\
1558   } while (0)
1559 #endif
1560 #if !defined (umul_ppmm)
1561 #define umul_ppmm(ph, pl, m0, m1) \
1562   do {									\
1563     UDItype __m0 = (m0), __m1 = (m1);					\
1564     __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
1565     (pl) = __m0 * __m1;							\
1566   } while (0)
1567 #endif
1568 #define smul_ppmm(ph, pl, m0, m1) \
1569   do {									\
1570     DItype __m0 = (m0), __m1 = (m1);					\
1571     __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
1572     (pl) = __m0 * __m1;							\
1573   } while (0)
1574 #endif /* 64-bit PowerPC.  */
1575 
1576 #if defined (__pyr__) && W_TYPE_SIZE == 32
1577 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1578   __asm__ ("addw %5,%1\n\taddwc %3,%0"					\
1579 	   : "=r" (sh), "=&r" (sl)					\
1580 	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
1581 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1582 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1583   __asm__ ("subw %5,%1\n\tsubwb %3,%0"					\
1584 	   : "=r" (sh), "=&r" (sl)					\
1585 	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
1586 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
1587 /* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP.  */
1588 #define umul_ppmm(w1, w0, u, v) \
1589   ({union {UDItype __ll;						\
1590 	   struct {USItype __h, __l;} __i;				\
1591 	  } __x;							\
1592   __asm__ ("movw %1,%R0\n\tuemul %2,%0"					\
1593 	   : "=&r" (__x.__ll)						\
1594 	   : "g" ((USItype) (u)), "g" ((USItype)(v)));			\
1595   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1596 #endif /* __pyr__ */
1597 
1598 #if defined (__ibm032__) /* RT/ROMP */  && W_TYPE_SIZE == 32
1599 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1600   __asm__ ("a %1,%5\n\tae %0,%3"					\
1601 	   : "=r" (sh), "=&r" (sl)					\
1602 	   : "0"  ((USItype)(ah)), "r" ((USItype)(bh)),			\
1603 	     "%1" ((USItype)(al)), "r" ((USItype)(bl)))
1604 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1605   __asm__ ("s %1,%5\n\tse %0,%3"					\
1606 	   : "=r" (sh), "=&r" (sl)					\
1607 	   : "0" ((USItype)(ah)), "r" ((USItype)(bh)),			\
1608 	     "1" ((USItype)(al)), "r" ((USItype)(bl)))
1609 #define smul_ppmm(ph, pl, m0, m1) \
1610   __asm__ (								\
1611        "s	r2,r2\n"						\
1612 "	mts r10,%2\n"							\
1613 "	m	r2,%3\n"						\
1614 "	m	r2,%3\n"						\
1615 "	m	r2,%3\n"						\
1616 "	m	r2,%3\n"						\
1617 "	m	r2,%3\n"						\
1618 "	m	r2,%3\n"						\
1619 "	m	r2,%3\n"						\
1620 "	m	r2,%3\n"						\
1621 "	m	r2,%3\n"						\
1622 "	m	r2,%3\n"						\
1623 "	m	r2,%3\n"						\
1624 "	m	r2,%3\n"						\
1625 "	m	r2,%3\n"						\
1626 "	m	r2,%3\n"						\
1627 "	m	r2,%3\n"						\
1628 "	m	r2,%3\n"						\
1629 "	cas	%0,r2,r0\n"						\
1630 "	mfs	r10,%1"							\
1631 	   : "=r" (ph), "=r" (pl)					\
1632 	   : "%r" ((USItype)(m0)), "r" ((USItype)(m1))			\
1633 	   : "r2")
1634 #define count_leading_zeros(count, x) \
1635   do {									\
1636     if ((x) >= 0x10000)							\
1637       __asm__ ("clz	%0,%1"						\
1638 	       : "=r" (count) : "r" ((USItype)(x) >> 16));		\
1639     else								\
1640       {									\
1641 	__asm__ ("clz	%0,%1"						\
1642 		 : "=r" (count) : "r" ((USItype)(x)));			\
1643 	(count) += 16;							\
1644       }									\
1645   } while (0)
1646 #endif /* RT/ROMP */
1647 
1648 #if defined (__riscv64) && W_TYPE_SIZE == 64
1649 #define umul_ppmm(ph, pl, u, v) \
1650   do {									\
1651     UDItype __u = (u), __v = (v);					\
1652     (pl) = __u * __v;							\
    __asm__ ("mulhu\t%0, %1, %2" : "=r" (ph) : "%r" (__u), "r" (__v));	\
1654   } while (0)
1655 #endif
1656 
1657 #if (defined (__SH2__) || defined (__SH3__) || defined (__SH4__)) && W_TYPE_SIZE == 32
1658 #define umul_ppmm(w1, w0, u, v) \
1659   __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0"		\
1660 	   : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
1661 #endif
1662 
1663 #if defined (__sparc__) && W_TYPE_SIZE == 32
1664 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1665   __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0"				\
1666 	   : "=r" (sh), "=&r" (sl)					\
1667 	   : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl)			\
1668 	   __CLOBBER_CC)
1669 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1670   __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0"				\
1671 	   : "=r" (sh), "=&r" (sl)					\
1672 	   : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl)	\
1673 	   __CLOBBER_CC)
1674 /* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
1675    doesn't define anything to indicate that to us, it only sets __sparcv8. */
1676 #if defined (__sparc_v9__) || defined (__sparcv9)
1677 /* Perhaps we should use floating-point operations here?  */
1678 #if 0
1679 /* Triggers a bug making mpz/tests/t-gcd.c fail.
   Perhaps we simply need to explicitly zero-extend the inputs?  */
1681 #define umul_ppmm(w1, w0, u, v) \
1682   __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" :		\
1683 	   "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
1684 #else
1685 /* Use v8 umul until above bug is fixed.  */
1686 #define umul_ppmm(w1, w0, u, v) \
1687   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1688 #endif
1689 /* Use a plain v8 divide for v9.  */
1690 #define udiv_qrnnd(q, r, n1, n0, d) \
1691   do {									\
1692     USItype __q;							\
1693     __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"			\
1694 	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));		\
1695     (r) = (n0) - __q * (d);						\
1696     (q) = __q;								\
1697   } while (0)
1698 #else
1699 #if defined (__sparc_v8__)   /* gcc normal */				\
1700   || defined (__sparcv8)     /* gcc solaris */				\
1701   || HAVE_HOST_CPU_supersparc
/* Don't match an immediate range because 1) it is not often useful, and
   2) the 'I' constraint thinks of the range as a 13 bit signed interval,
   while we want to match a 13 bit interval, sign extended to 32 bits,
   but INTERPRETED AS UNSIGNED.  */
1706 #define umul_ppmm(w1, w0, u, v) \
1707   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1708 
1709 #if HAVE_HOST_CPU_supersparc
1710 #else
1711 /* Don't use this on SuperSPARC because its udiv only handles 53 bit
1712    dividends and will trap to the kernel for the rest. */
1713 #define udiv_qrnnd(q, r, n1, n0, d) \
1714   do {									\
1715     USItype __q;							\
1716     __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"			\
1717 	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));		\
1718     (r) = (n0) - __q * (d);						\
1719     (q) = __q;								\
1720   } while (0)
1721 #endif /* HAVE_HOST_CPU_supersparc */
1722 
1723 #else /* ! __sparc_v8__ */
1724 #if defined (__sparclite__)
/* This has hardware multiply but not divide.  It also has two additional
   instructions: scan (ffs from the high bit) and divscc.  */
1727 #define umul_ppmm(w1, w0, u, v) \
1728   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1729 #define udiv_qrnnd(q, r, n1, n0, d) \
1730   __asm__ ("! Inlined udiv_qrnnd\n"					\
1731 "	wr	%%g0,%2,%%y	! Not a delayed write for sparclite\n"	\
1732 "	tst	%%g0\n"							\
1733 "	divscc	%3,%4,%%g1\n"						\
1734 "	divscc	%%g1,%4,%%g1\n"						\
1735 "	divscc	%%g1,%4,%%g1\n"						\
1736 "	divscc	%%g1,%4,%%g1\n"						\
1737 "	divscc	%%g1,%4,%%g1\n"						\
1738 "	divscc	%%g1,%4,%%g1\n"						\
1739 "	divscc	%%g1,%4,%%g1\n"						\
1740 "	divscc	%%g1,%4,%%g1\n"						\
1741 "	divscc	%%g1,%4,%%g1\n"						\
1742 "	divscc	%%g1,%4,%%g1\n"						\
1743 "	divscc	%%g1,%4,%%g1\n"						\
1744 "	divscc	%%g1,%4,%%g1\n"						\
1745 "	divscc	%%g1,%4,%%g1\n"						\
1746 "	divscc	%%g1,%4,%%g1\n"						\
1747 "	divscc	%%g1,%4,%%g1\n"						\
1748 "	divscc	%%g1,%4,%%g1\n"						\
1749 "	divscc	%%g1,%4,%%g1\n"						\
1750 "	divscc	%%g1,%4,%%g1\n"						\
1751 "	divscc	%%g1,%4,%%g1\n"						\
1752 "	divscc	%%g1,%4,%%g1\n"						\
1753 "	divscc	%%g1,%4,%%g1\n"						\
1754 "	divscc	%%g1,%4,%%g1\n"						\
1755 "	divscc	%%g1,%4,%%g1\n"						\
1756 "	divscc	%%g1,%4,%%g1\n"						\
1757 "	divscc	%%g1,%4,%%g1\n"						\
1758 "	divscc	%%g1,%4,%%g1\n"						\
1759 "	divscc	%%g1,%4,%%g1\n"						\
1760 "	divscc	%%g1,%4,%%g1\n"						\
1761 "	divscc	%%g1,%4,%%g1\n"						\
1762 "	divscc	%%g1,%4,%%g1\n"						\
1763 "	divscc	%%g1,%4,%%g1\n"						\
1764 "	divscc	%%g1,%4,%0\n"						\
1765 "	rd	%%y,%1\n"						\
1766 "	bl,a 1f\n"							\
1767 "	add	%1,%4,%1\n"						\
1768 "1:	! End of inline udiv_qrnnd"					\
1769 	   : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d)		\
1770 	   : "%g1" __AND_CLOBBER_CC)
1771 #define count_leading_zeros(count, x) \
1772   __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
/* Early sparclites return 63 for an argument of 0, but the documentation
   warns that future implementations might change this.  Therefore, leave
   COUNT_LEADING_ZEROS_0 undefined.  */
1776 #endif /* __sparclite__ */
1777 #endif /* __sparc_v8__ */
1778 #endif /* __sparc_v9__ */
1779 /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd.  */
1780 #ifndef umul_ppmm
1781 #define umul_ppmm(w1, w0, u, v) \
1782   __asm__ ("! Inlined umul_ppmm\n"					\
1783 "	wr	%%g0,%2,%%y	! SPARC has 0-3 delay insn after a wr\n" \
1784 "	sra	%3,31,%%g2	! Don't move this insn\n"		\
1785 "	and	%2,%%g2,%%g2	! Don't move this insn\n"		\
1786 "	andcc	%%g0,0,%%g1	! Don't move this insn\n"		\
1787 "	mulscc	%%g1,%3,%%g1\n"						\
1788 "	mulscc	%%g1,%3,%%g1\n"						\
1789 "	mulscc	%%g1,%3,%%g1\n"						\
1790 "	mulscc	%%g1,%3,%%g1\n"						\
1791 "	mulscc	%%g1,%3,%%g1\n"						\
1792 "	mulscc	%%g1,%3,%%g1\n"						\
1793 "	mulscc	%%g1,%3,%%g1\n"						\
1794 "	mulscc	%%g1,%3,%%g1\n"						\
1795 "	mulscc	%%g1,%3,%%g1\n"						\
1796 "	mulscc	%%g1,%3,%%g1\n"						\
1797 "	mulscc	%%g1,%3,%%g1\n"						\
1798 "	mulscc	%%g1,%3,%%g1\n"						\
1799 "	mulscc	%%g1,%3,%%g1\n"						\
1800 "	mulscc	%%g1,%3,%%g1\n"						\
1801 "	mulscc	%%g1,%3,%%g1\n"						\
1802 "	mulscc	%%g1,%3,%%g1\n"						\
1803 "	mulscc	%%g1,%3,%%g1\n"						\
1804 "	mulscc	%%g1,%3,%%g1\n"						\
1805 "	mulscc	%%g1,%3,%%g1\n"						\
1806 "	mulscc	%%g1,%3,%%g1\n"						\
1807 "	mulscc	%%g1,%3,%%g1\n"						\
1808 "	mulscc	%%g1,%3,%%g1\n"						\
1809 "	mulscc	%%g1,%3,%%g1\n"						\
1810 "	mulscc	%%g1,%3,%%g1\n"						\
1811 "	mulscc	%%g1,%3,%%g1\n"						\
1812 "	mulscc	%%g1,%3,%%g1\n"						\
1813 "	mulscc	%%g1,%3,%%g1\n"						\
1814 "	mulscc	%%g1,%3,%%g1\n"						\
1815 "	mulscc	%%g1,%3,%%g1\n"						\
1816 "	mulscc	%%g1,%3,%%g1\n"						\
1817 "	mulscc	%%g1,%3,%%g1\n"						\
1818 "	mulscc	%%g1,%3,%%g1\n"						\
1819 "	mulscc	%%g1,0,%%g1\n"						\
1820 "	add	%%g1,%%g2,%0\n"						\
1821 "	rd	%%y,%1"							\
1822 	   : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v)			\
1823 	   : "%g1", "%g2" __AND_CLOBBER_CC)
1824 #endif
1825 #ifndef udiv_qrnnd
1826 #ifndef LONGLONG_STANDALONE
1827 #define udiv_qrnnd(q, r, n1, n0, d) \
1828   do { UWtype __r;							\
1829     (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));			\
1830     (r) = __r;								\
1831   } while (0)
1832 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
1833 #endif /* LONGLONG_STANDALONE */
1834 #endif /* udiv_qrnnd */
1835 #endif /* __sparc__ */
1836 
1837 #if defined (__sparc__) && W_TYPE_SIZE == 64
1838 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1839   __asm__ (								\
1840        "addcc	%r4,%5,%1\n"						\
1841       "	addccc	%r6,%7,%%g0\n"						\
1842       "	addc	%r2,%3,%0"						\
1843        : "=r" (sh), "=&r" (sl)						\
1844        : "rJ"  ((UDItype)(ah)), "rI" ((UDItype)(bh)),			\
1845 	 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)),			\
1846 	 "%rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32)	\
1847 	   __CLOBBER_CC)
1848 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1849   __asm__ (								\
1850        "subcc	%r4,%5,%1\n"						\
1851       "	subccc	%r6,%7,%%g0\n"						\
1852       "	subc	%r2,%3,%0"						\
1853        : "=r" (sh), "=&r" (sl)						\
1854        : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)),			\
1855 	 "rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)),			\
1856 	 "rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32)		\
1857 	   __CLOBBER_CC)
1858 #if __VIS__ >= 0x300
1859 #undef add_ssaaaa
1860 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1861   __asm__ (								\
1862        "addcc	%r4, %5, %1\n"						\
1863       "	addxc	%r2, %r3, %0"						\
1864 	  : "=r" (sh), "=&r" (sl)					\
1865        : "rJ"  ((UDItype)(ah)), "rJ" ((UDItype)(bh)),			\
1866 	 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC)
1867 #define umul_ppmm(ph, pl, m0, m1) \
1868   do {									\
1869     UDItype __m0 = (m0), __m1 = (m1);					\
1870     (pl) = __m0 * __m1;							\
1871     __asm__ ("umulxhi\t%2, %1, %0"					\
1872 	     : "=r" (ph)						\
1873 	     : "%r" (__m0), "r" (__m1));				\
1874   } while (0)
1875 #define count_leading_zeros(count, x) \
1876   __asm__ ("lzd\t%1,%0" : "=r" (count) : "r" (x))
1877 /* Needed by count_leading_zeros_32 in sparc64.h.  */
1878 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
1879 #endif
1880 #endif
1881 
1882 #if (defined (__vax) || defined (__vax__)) && W_TYPE_SIZE == 32
1883 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1884   __asm__ ("addl2 %5,%1\n\tadwc %3,%0"					\
1885 	   : "=g" (sh), "=&g" (sl)					\
1886 	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
1887 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1888 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1889   __asm__ ("subl2 %5,%1\n\tsbwc %3,%0"					\
1890 	   : "=g" (sh), "=&g" (sl)					\
1891 	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
1892 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
1893 #define smul_ppmm(xh, xl, m0, m1) \
1894   do {									\
1895     union {UDItype __ll;						\
1896 	   struct {USItype __l, __h;} __i;				\
1897 	  } __x;							\
1898     USItype __m0 = (m0), __m1 = (m1);					\
1899     __asm__ ("emul %1,%2,$0,%0"						\
1900 	     : "=g" (__x.__ll) : "g" (__m0), "g" (__m1));		\
1901     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
1902   } while (0)
1903 #define sdiv_qrnnd(q, r, n1, n0, d) \
1904   do {									\
1905     union {DItype __ll;							\
1906 	   struct {SItype __l, __h;} __i;				\
1907 	  } __x;							\
1908     __x.__i.__h = n1; __x.__i.__l = n0;					\
1909     __asm__ ("ediv %3,%2,%0,%1"						\
1910 	     : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d));		\
1911   } while (0)
1912 #if 0
1913 /* FIXME: This instruction appears to be unimplemented on some systems (vax
1914    8800 maybe). */
1915 #define count_trailing_zeros(count,x)					\
1916   do {									\
1917     __asm__ ("ffs 0, 31, %1, %0"					\
1918 	     : "=g" (count)						\
1919 	     : "g" ((USItype) (x)));					\
1920   } while (0)
1921 #endif
1922 #endif /* vax */
1923 
1924 #if defined (__z8000__) && W_TYPE_SIZE == 16
1925 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1926   __asm__ ("add	%H1,%H5\n\tadc	%H0,%H3"				\
1927 	   : "=r" (sh), "=&r" (sl)					\
1928 	   : "0"  ((unsigned int)(ah)), "r" ((unsigned int)(bh)),	\
1929 	     "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1930 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1931   __asm__ ("sub	%H1,%H5\n\tsbc	%H0,%H3"				\
1932 	   : "=r" (sh), "=&r" (sl)					\
1933 	   : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)),	\
1934 	     "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1935 #define umul_ppmm(xh, xl, m0, m1) \
1936   do {									\
1937     union {long int __ll;						\
1938 	   struct {unsigned int __h, __l;} __i;				\
1939 	  } __x;							\
1940     unsigned int __m0 = (m0), __m1 = (m1);				\
1941     __asm__ ("mult	%S0,%H3"					\
1942 	     : "=r" (__x.__i.__h), "=r" (__x.__i.__l)			\
	     : "%1" (__m0), "rQR" (__m1));				\
1944     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
1945     (xh) += ((((signed int) __m0 >> 15) & __m1)				\
1946 	     + (((signed int) __m1 >> 15) & __m0));			\
1947   } while (0)
1948 #endif /* __z8000__ */
1949 
1950 #endif /* __GNUC__ */
1951 
1952 #endif /* NO_ASM */
1953 
1954 
1955 /* FIXME: "sidi" here is highly doubtful, should sometimes be "diti".  */
1956 #if !defined (umul_ppmm) && defined (__umulsidi3)
1957 #define umul_ppmm(ph, pl, m0, m1) \
1958   do {									\
1959     UDWtype __ll = __umulsidi3 (m0, m1);				\
1960     ph = (UWtype) (__ll >> W_TYPE_SIZE);				\
1961     pl = (UWtype) __ll;							\
1962   } while (0)
1963 #endif
1964 
1965 #if !defined (__umulsidi3)
1966 #define __umulsidi3(u, v) \
1967   ({UWtype __hi, __lo;							\
1968     umul_ppmm (__hi, __lo, u, v);					\
1969     ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
1970 #endif
1971 
1972 
1973 #if defined (__cplusplus)
1974 #define __longlong_h_C "C"
1975 #else
1976 #define __longlong_h_C
1977 #endif
1978 
1979 /* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist.  The "_r"
1980    forms have "reversed" arguments, meaning the pointer is last, which
1981    sometimes allows better parameter passing, in particular on 64-bit
1982    hppa. */
1983 
1984 #define mpn_umul_ppmm  __MPN(umul_ppmm)
1985 extern __longlong_h_C UWtype mpn_umul_ppmm (UWtype *, UWtype, UWtype);
1986 
1987 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm  \
1988   && ! defined (LONGLONG_STANDALONE)
1989 #define umul_ppmm(wh, wl, u, v)						\
1990   do {									\
1991     UWtype __umul_ppmm__p0;						\
1992     (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v));\
1993     (wl) = __umul_ppmm__p0;						\
1994   } while (0)
1995 #endif
1996 
1997 #define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
1998 extern __longlong_h_C UWtype mpn_umul_ppmm_r (UWtype, UWtype, UWtype *);
1999 
2000 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r	\
2001   && ! defined (LONGLONG_STANDALONE)
2002 #define umul_ppmm(wh, wl, u, v)						\
2003   do {									\
2004     UWtype __umul_p0;							\
2005     (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_p0);	\
2006     (wl) = __umul_p0;							\
2007   } while (0)
2008 #endif
2009 
2010 #define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
2011 extern __longlong_h_C UWtype mpn_udiv_qrnnd (UWtype *, UWtype, UWtype, UWtype);
2012 
2013 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd	\
2014   && ! defined (LONGLONG_STANDALONE)
2015 #define udiv_qrnnd(q, r, n1, n0, d)					\
2016   do {									\
2017     UWtype __udiv_qrnnd_r;						\
2018     (q) = mpn_udiv_qrnnd (&__udiv_qrnnd_r,				\
2019 			  (UWtype) (n1), (UWtype) (n0), (UWtype) d);	\
2020     (r) = __udiv_qrnnd_r;						\
2021   } while (0)
2022 #endif
2023 
2024 #define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
2025 extern __longlong_h_C UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, UWtype *);
2026 
2027 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r	\
2028   && ! defined (LONGLONG_STANDALONE)
2029 #define udiv_qrnnd(q, r, n1, n0, d)					\
2030   do {									\
2031     UWtype __udiv_qrnnd_r;						\
2032     (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) d,	\
2033 			    &__udiv_qrnnd_r);				\
2034     (r) = __udiv_qrnnd_r;						\
2035   } while (0)
2036 #endif
2037 
2038 
2039 /* If this machine has no inline assembler, use C macros.  */
2040 
2041 #if !defined (add_ssaaaa)
2042 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
2043   do {									\
2044     UWtype __x;								\
2045     __x = (al) + (bl);							\
2046     (sh) = (ah) + (bh) + (__x < (al));					\
2047     (sl) = __x;								\
2048   } while (0)
2049 #endif
2050 
2051 #if !defined (sub_ddmmss)
2052 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
2053   do {									\
2054     UWtype __x;								\
2055     __x = (al) - (bl);							\
2056     (sh) = (ah) - (bh) - ((al) < (bl));					\
2057     (sl) = __x;								\
2058   } while (0)
2059 #endif
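
/* An illustrative usage sketch (not compiled; kept under #if 0 like the other
   disabled examples in this file): adding and subtracting two double-word
   values held as (high,low) UWtype pairs with the generic macros above.  The
   function and variable names are hypothetical.  */
#if 0
static void
example_double_word_arithmetic (void)
{
  UWtype ah = 1, al = ~(UWtype) 0;	/* a = 2*2^W - 1 */
  UWtype bh = 0, bl = 1;		/* b = 1 */
  UWtype sh, sl, dh, dl;

  add_ssaaaa (sh, sl, ah, al, bh, bl);	/* carry propagates: sh = 2, sl = 0 */
  sub_ddmmss (dh, dl, ah, al, bh, bl);	/* no borrow: dh = 1, dl = ~(UWtype) 0 - 1 */
}
#endif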
2060 
2061 /* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
2062    smul_ppmm.  */
2063 #if !defined (umul_ppmm) && defined (smul_ppmm)
2064 #define umul_ppmm(w1, w0, u, v)						\
2065   do {									\
2066     UWtype __w1;							\
2067     UWtype __xm0 = (u), __xm1 = (v);					\
2068     smul_ppmm (__w1, w0, __xm0, __xm1);					\
2069     (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
2070 		+ (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);		\
2071   } while (0)
2072 #endif
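
/* Why the correction above works (and why the reverse definition further down
   subtracts the same two terms): writing msb(x) for x >> (W_TYPE_SIZE - 1),
   the signed value of u is u - 2^W*msb(u), so
       u*v = (signed u)*(signed v) + 2^W*(msb(u)*v + msb(v)*u)   (mod 2^(2W)),
   i.e. the unsigned high word equals the signed high word plus the two masked
   terms, modulo 2^W.  Below is a standalone sanity check of that identity
   for 32-bit words; it is only an illustration and its names are
   hypothetical.  */
#if 0
#include <stdint.h>
#include <assert.h>
static void
check_umul_high_from_smul_high (uint32_t u, uint32_t v)
{
  int64_t sprod = (int64_t) (int32_t) u * (int32_t) v;	/* signed product */
  uint32_t high = (uint32_t) ((uint64_t) sprod >> 32);	/* signed high word */
  high += (-(u >> 31) & v) + (-(v >> 31) & u);		/* sign correction */
  assert (high == (uint32_t) (((uint64_t) u * v) >> 32));
}
#endif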
2073 
2074 /* If we still don't have umul_ppmm, define it using plain C.
2075 
2076    For reference, when this code is used for squaring (ie. u and v identical
2077    expressions), gcc recognises __x1 and __x2 are the same and generates 3
2078    multiplies, not 4.  The subsequent additions could be optimized a bit,
2079    but the only place GMP currently uses such a square is mpn_sqr_basecase,
2080    and chips obliged to use this generic C umul will have plenty of worse
2081    performance problems than a couple of extra instructions on the diagonal
2082    of sqr_basecase.  */
2083 
2084 #if !defined (umul_ppmm)
2085 #define umul_ppmm(w1, w0, u, v)						\
2086   do {									\
2087     UWtype __x0, __x1, __x2, __x3;					\
2088     UHWtype __ul, __vl, __uh, __vh;					\
2089     UWtype __u = (u), __v = (v);					\
2090 									\
2091     __ul = __ll_lowpart (__u);						\
2092     __uh = __ll_highpart (__u);						\
2093     __vl = __ll_lowpart (__v);						\
2094     __vh = __ll_highpart (__v);						\
2095 									\
2096     __x0 = (UWtype) __ul * __vl;					\
2097     __x1 = (UWtype) __ul * __vh;					\
2098     __x2 = (UWtype) __uh * __vl;					\
2099     __x3 = (UWtype) __uh * __vh;					\
2100 									\
    __x1 += __ll_highpart (__x0); /* this can't give carry */		\
    __x1 += __x2;		/* but this indeed can */		\
    if (__x1 < __x2)		/* did we get a carry? */		\
      __x3 += __ll_B;		/* yes, add it in the proper pos. */	\
2105 									\
2106     (w1) = __x3 + __ll_highpart (__x1);					\
2107     (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0);		\
2108   } while (0)
2109 #endif
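
/* The generic umul_ppmm above is the usual half-word schoolbook scheme: split
   each operand into high and low halves, form the four partial products, and
   fold the single possible carry out of the middle sum into the top partial
   product.  The standalone sketch below spells the same steps out for 32-bit
   words (16-bit halves) and checks them against a native 64-bit product; it
   is illustrative only and its names are hypothetical.  */
#if 0
#include <stdint.h>
#include <assert.h>
static void
check_halfword_umul (uint32_t u, uint32_t v)
{
  uint32_t ul = u & 0xffff, uh = u >> 16;
  uint32_t vl = v & 0xffff, vh = v >> 16;
  uint32_t x0 = ul * vl, x1 = ul * vh, x2 = uh * vl, x3 = uh * vh;
  uint32_t w1, w0;

  x1 += x0 >> 16;		/* cannot overflow */
  x1 += x2;			/* may wrap around */
  if (x1 < x2)			/* carry out of the middle sum */
    x3 += 0x10000;		/* add it at the proper position */

  w1 = x3 + (x1 >> 16);
  w0 = (x1 << 16) + (x0 & 0xffff);
  assert ((((uint64_t) w1 << 32) | w0) == (uint64_t) u * v);
}
#endif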
2110 
/* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
   exist in one form or another).  */
2113 #if !defined (smul_ppmm)
2114 #define smul_ppmm(w1, w0, u, v)						\
2115   do {									\
2116     UWtype __w1;							\
2117     UWtype __xm0 = (u), __xm1 = (v);					\
2118     umul_ppmm (__w1, w0, __xm0, __xm1);					\
2119     (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
2120 		- (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);		\
2121   } while (0)
2122 #endif
2123 
2124 /* Define this unconditionally, so it can be used for debugging.  */
2125 #define __udiv_qrnnd_c(q, r, n1, n0, d) \
2126   do {									\
2127     UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m;			\
2128 									\
2129     ASSERT ((d) != 0);							\
2130     ASSERT ((n1) < (d));						\
2131 									\
2132     __d1 = __ll_highpart (d);						\
2133     __d0 = __ll_lowpart (d);						\
2134 									\
2135     __q1 = (n1) / __d1;							\
2136     __r1 = (n1) - __q1 * __d1;						\
2137     __m = __q1 * __d0;							\
2138     __r1 = __r1 * __ll_B | __ll_highpart (n0);				\
2139     if (__r1 < __m)							\
2140       {									\
2141 	__q1--, __r1 += (d);						\
2142 	if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
2143 	  if (__r1 < __m)						\
2144 	    __q1--, __r1 += (d);					\
2145       }									\
2146     __r1 -= __m;							\
2147 									\
2148     __q0 = __r1 / __d1;							\
2149     __r0 = __r1  - __q0 * __d1;						\
2150     __m = __q0 * __d0;							\
2151     __r0 = __r0 * __ll_B | __ll_lowpart (n0);				\
2152     if (__r0 < __m)							\
2153       {									\
2154 	__q0--, __r0 += (d);						\
2155 	if (__r0 >= (d))						\
2156 	  if (__r0 < __m)						\
2157 	    __q0--, __r0 += (d);					\
2158       }									\
2159     __r0 -= __m;							\
2160 									\
2161     (q) = __q1 * __ll_B | __q0;						\
2162     (r) = __r0;								\
2163   } while (0)
2164 
/* If the processor has no udiv_qrnnd but does have sdiv_qrnnd, go through
   __udiv_w_sdiv (defined in libgcc or elsewhere).  */
2167 #if !defined (udiv_qrnnd) && defined (sdiv_qrnnd) \
2168   && ! defined (LONGLONG_STANDALONE)
2169 #define udiv_qrnnd(q, r, nh, nl, d) \
2170   do {									\
2171     UWtype __r;								\
2172     (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d);				\
2173     (r) = __r;								\
2174   } while (0)
2175 __GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
2176 #endif
2177 
2178 /* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
2179 #if !defined (udiv_qrnnd)
2180 #define UDIV_NEEDS_NORMALIZATION 1
2181 #define udiv_qrnnd __udiv_qrnnd_c
2182 #endif
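
/* When the __udiv_qrnnd_c fallback is in use, UDIV_NEEDS_NORMALIZATION is 1
   and callers are expected to shift the divisor up until its most significant
   bit is set, shift the two-word numerator by the same amount, and shift the
   resulting remainder back down (the quotient is unaffected).  A sketch of
   that pattern, illustrative only and with hypothetical names, assuming
   d != 0 and n1 < d:  */
#if 0
static void
example_normalized_udiv (UWtype n1, UWtype n0, UWtype d, UWtype *qp, UWtype *rp)
{
  UWtype q, r;
  int cnt;

  count_leading_zeros (cnt, d);
  if (cnt != 0)			/* avoid undefined shifts by W_TYPE_SIZE */
    {
      d <<= cnt;
      n1 = (n1 << cnt) | (n0 >> (W_TYPE_SIZE - cnt));
      n0 <<= cnt;
    }
  udiv_qrnnd (q, r, n1, n0, d);
  *qp = q;
  *rp = r >> cnt;		/* undo the normalization */
}
#endif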
2183 
2184 #if !defined (count_leading_zeros)
2185 #define count_leading_zeros(count, x) \
2186   do {									\
2187     UWtype __xr = (x);							\
2188     UWtype __a;								\
2189 									\
2190     if (W_TYPE_SIZE == 32)						\
2191       {									\
2192 	__a = __xr < ((UWtype) 1 << 2*__BITS4)				\
2193 	  ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1)		\
2194 	  : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1		\
2195 	  : 3*__BITS4 + 1);						\
2196       }									\
2197     else								\
2198       {									\
2199 	for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8)			\
2200 	  if (((__xr >> __a) & 0xff) != 0)				\
2201 	    break;							\
2202 	++__a;								\
2203       }									\
2204 									\
2205     (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a];		\
2206   } while (0)
2207 /* This version gives a well-defined value for zero. */
2208 #define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
2209 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2210 #define COUNT_LEADING_ZEROS_SLOW
2211 #endif
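
/* A small illustration (hypothetical name, not compiled) of what
   count_leading_zeros is typically used for: the bit length of a nonzero
   word, i.e. the position of its highest set bit counting from 1, is
   W_TYPE_SIZE minus the leading-zero count.  */
#if 0
static int
example_bit_length (UWtype x)
{
  int cnt;
  ASSERT (x != 0);	/* x == 0 only defined when COUNT_LEADING_ZEROS_0 is */
  count_leading_zeros (cnt, x);
  return W_TYPE_SIZE - cnt;
}
#endif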
2212 
2213 /* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
2214 #if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
2215 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2216 #endif
2217 
2218 #ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2219 extern const unsigned char __GMP_DECLSPEC __clz_tab[129];
2220 #endif
2221 
2222 #if !defined (count_trailing_zeros)
2223 #if !defined (COUNT_LEADING_ZEROS_SLOW)
2224 /* Define count_trailing_zeros using an asm count_leading_zeros.  */
2225 #define count_trailing_zeros(count, x)					\
2226   do {									\
2227     UWtype __ctz_x = (x);						\
2228     UWtype __ctz_c;							\
2229     ASSERT (__ctz_x != 0);						\
2230     count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x);			\
2231     (count) = W_TYPE_SIZE - 1 - __ctz_c;				\
2232   } while (0)
2233 #else
/* Define count_trailing_zeros in plain C, assuming small counts are common.
   We use __clz_tab without further ado, since the C count_leading_zeros above
   will have pulled it in.  */
2237 #define count_trailing_zeros(count, x)					\
2238   do {									\
2239     UWtype __ctz_x = (x);						\
2240     int __ctz_c;							\
2241 									\
2242     if (LIKELY ((__ctz_x & 0xff) != 0))					\
2243       (count) = __clz_tab[__ctz_x & -__ctz_x] - 2;			\
2244     else								\
2245       {									\
2246 	for (__ctz_c = 8 - 2; __ctz_c < W_TYPE_SIZE - 2; __ctz_c += 8)	\
2247 	  {								\
2248 	    __ctz_x >>= 8;						\
2249 	    if (LIKELY ((__ctz_x & 0xff) != 0))				\
2250 	      break;							\
2251 	  }								\
2252 									\
2253 	(count) = __ctz_c + __clz_tab[__ctz_x & -__ctz_x];		\
2254       }									\
2255   } while (0)
2256 #endif
2257 #endif
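
/* Both count_trailing_zeros definitions above rely on x & -x isolating the
   lowest set bit: for x != 0, x & -x == 2^ctz(x), so a leading-zero count (or
   a __clz_tab lookup) of that single-bit value gives the trailing-zero count.
   A standalone sketch of the same idea using gcc's __builtin_clz, assuming a
   32-bit unsigned int; illustrative only.  */
#if 0
static int
example_ctz_via_clz (unsigned int x)
{
  /* x & -x keeps only the lowest set bit; 31 minus its leading-zero count is
     the number of trailing zeros (x must be nonzero).  */
  return 31 - __builtin_clz (x & -x);
}
#endif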
2258 
2259 #ifndef UDIV_NEEDS_NORMALIZATION
2260 #define UDIV_NEEDS_NORMALIZATION 0
2261 #endif
2262 
/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and
   hence whether the latter should always be used.  */
2265 #ifndef UDIV_PREINV_ALWAYS
2266 #define UDIV_PREINV_ALWAYS 0
2267 #endif
2268