1 /* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
2 
3 Copyright 1991-1994, 1996, 1997, 1999-2005, 2007-2009, 2011-2015 Free Software
4 Foundation, Inc.
5 
6 This file is part of the GNU MP Library.
7 (Copied from GMP 6.1.0.)
8 
9 The GNU MP Library is free software; you can redistribute it and/or modify
10 it under the terms of either:
11 
12   * the GNU Lesser General Public License as published by the Free
13     Software Foundation; either version 3 of the License, or (at your
14     option) any later version.
15 
16 or
17 
18   * the GNU General Public License as published by the Free Software
19     Foundation; either version 2 of the License, or (at your option) any
20     later version.
21 
22 or both in parallel, as here.
23 
24 The GNU MP Library is distributed in the hope that it will be useful, but
25 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27 for more details.
28 
29 You should have received copies of the GNU General Public License and the
30 GNU Lesser General Public License along with the GNU MP Library.  If not,
31 see https://www.gnu.org/licenses/.  */
32 
33 /* the following was added for use within GMP-ECM */
34 #ifndef HAVE_HOST_CPU_FAMILY_power
35 #define HAVE_HOST_CPU_FAMILY_power 0
36 #endif
37 
38 #ifndef HAVE_HOST_CPU_FAMILY_powerpc
39 #define HAVE_HOST_CPU_FAMILY_powerpc 0
40 #endif
41 
42 #ifndef HAVE_HOST_CPU_FAMILY_x86
43 #define HAVE_HOST_CPU_FAMILY_x86 0
44 #endif
45 
46 #ifndef HAVE_NATIVE_mpn_umul_ppmm
47 #define HAVE_NATIVE_mpn_umul_ppmm 0
48 #endif
49 
50 #ifndef HAVE_NATIVE_mpn_umul_ppmm_r
51 #define HAVE_NATIVE_mpn_umul_ppmm_r 0
52 #endif
53 
54 #ifndef HAVE_NATIVE_mpn_udiv_qrnnd
55 #define HAVE_NATIVE_mpn_udiv_qrnnd 0
56 #endif
57 
58 #ifndef HAVE_NATIVE_mpn_udiv_qrnnd_r
59 #define HAVE_NATIVE_mpn_udiv_qrnnd_r 0
60 #endif
61 
62 #ifndef HAVE_HOST_CPU_i586
63 #define HAVE_HOST_CPU_i586 0
64 #endif
65 
66 #ifndef HAVE_HOST_CPU_pentium
67 #define HAVE_HOST_CPU_pentium 0
68 #endif
69 
70 #ifndef HAVE_HOST_CPU_pentiummmx
71 #define HAVE_HOST_CPU_pentiummmx 0
72 #endif
73 
74 /* end of stuff added for GMP-ECM */
75 
76 /* You have to define the following before including this file:
77 
78    UWtype -- An unsigned type, default type for operations (typically a "word")
79    UHWtype -- An unsigned type, at least half the size of UWtype
80    UDWtype -- An unsigned type, at least twice as large as UWtype
81    W_TYPE_SIZE -- size in bits of UWtype
82 
83    SItype, USItype -- Signed and unsigned 32 bit types
84    DItype, UDItype -- Signed and unsigned 64 bit types
85 
86    On a 32 bit machine UWtype should typically be USItype;
87    on a 64 bit machine, UWtype should typically be UDItype.
88 
89    Optionally, define:
90 
91    LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
92    NO_ASM -- Disable inline asm
93 
94 
95    CAUTION!  Using this version of longlong.h outside of GMP is not safe.  You
96    need to include gmp.h and gmp-impl.h, or certain things might not work as
97    expected.
98 */
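
/* For illustration only: a sketch of the kind of definitions a standalone
   user might supply before including this file on a typical 64-bit GNU C
   host.  The macro and type names are the ones required above; the concrete
   type choices (unsigned long, unsigned __int128, ...) are assumptions about
   the host ABI, not requirements.  Within GMP they come from gmp.h and
   gmp-impl.h instead.  */
#if 0
#define W_TYPE_SIZE 64
typedef unsigned int       UHWtype;     /* at least half a word */
typedef unsigned long      UWtype;      /* one full word */
typedef unsigned __int128  UDWtype;     /* at least two words */
typedef int                SItype;
typedef unsigned int       USItype;
typedef long               DItype;
typedef unsigned long      UDItype;
#define LONGLONG_STANDALONE             /* avoid machine-dependent support files */
#endif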
99 
100 #define __BITS4 (W_TYPE_SIZE / 4)
101 #define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
102 #define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
103 #define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
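
/* For example, with W_TYPE_SIZE == 32 these give __ll_B == 0x10000,
   __ll_lowpart (0x12345678) == 0x5678 and __ll_highpart (0x12345678) == 0x1234;
   the generic C fallbacks below build double-word results from such
   half-word pieces.  */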
104 
105 /* This is used to make sure no undesirable sharing between different libraries
106    that use this file takes place.  */
107 #ifndef __MPN
108 #define __MPN(x) __##x
109 #endif
110 
111 /* Define auxiliary asm macros.
112 
113    1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
114    UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
115    word product in HIGH_PROD and LOW_PROD.
116 
117    2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
118    UDWtype product.  This is just a variant of umul_ppmm.
119 
120    3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
121    denominator) divides a UDWtype, composed of the UWtype integers
122    HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
123    in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
124    than DENOMINATOR for correct operation.  If, in addition, the macro
125    requires the most significant bit of DENOMINATOR to be 1, then the
126    pre-processor symbol UDIV_NEEDS_NORMALIZATION is defined to 1.
127 
128    4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
129    denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
130    is rounded towards 0.
131 
132    5) count_leading_zeros(count, x) counts the number of zero-bits from the
133    msb to the first non-zero bit in the UWtype X.  This is the number of
134    steps X needs to be shifted left to set the msb.  Undefined for X == 0,
135    unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
136 
137    6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
138    from the least significant end.
139 
140    7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
141    high_addend_2, low_addend_2) adds two two-word UWtype integers, composed
142    of HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
143    respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
144    (i.e. carry out) is not stored anywhere, and is lost.
145 
146    8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
147    high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
148    composed of HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
149    LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
150    and LOW_DIFFERENCE.  Overflow (i.e. carry out) is not stored anywhere,
151    and is lost.
152 
153    If any of these macros are left undefined for a particular CPU,
154    C macros are used.
155 
156 
157    Notes:
158 
159    For add_ssaaaa the two high and two low addends can both commute, but
160    unfortunately gcc only supports one "%" commutative in each asm block.
161    This has always been so but is only documented in recent versions
162    (eg. pre-release 3.3).  Having two or more "%"s can cause an internal
163    compiler error in certain rare circumstances.
164 
165    Apparently it was only the last "%" that was ever actually respected, so
166    the code has been updated to leave just that.  Clearly there's a free
167    choice whether high or low should get it, if there's a reason to favour
168    one over the other.  Also obviously when the constraints on the two
169    operands are identical there's no benefit to the reloader in any "%" at
170    all.
171 
172    */
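
/* For illustration only (not part of the interface): a sketch of how the
   macros above are typically combined.  example_addmul accumulates the
   double-word product of two one-word operands into a two-word sum;
   example_div divides a two-word value by a one-word divisor, normalizing
   the divisor first, which is required whenever UDIV_NEEDS_NORMALIZATION is
   non-zero and harmless otherwise.  The function names are placeholders,
   not GMP entry points.  */
#if 0
static void
example_addmul (UWtype *hi, UWtype *lo, UWtype u, UWtype v)
{
  UWtype p1, p0;
  umul_ppmm (p1, p0, u, v);                  /* {p1,p0} = u * v */
  add_ssaaaa (*hi, *lo, *hi, *lo, p1, p0);   /* {hi,lo} += {p1,p0}, carry out lost */
}

static UWtype
example_div (UWtype n1, UWtype n0, UWtype d, UWtype *rem)  /* requires n1 < d */
{
  UWtype q, r;
  int cnt;
  count_leading_zeros (cnt, d);              /* d != 0, so this is defined */
  if (cnt != 0)
    {
      /* Set the msb of the divisor and shift the numerator to match; the
	 quotient is unchanged and the remainder is scaled by 2^cnt.  */
      d <<= cnt;
      n1 = (n1 << cnt) | (n0 >> (W_TYPE_SIZE - cnt));
      n0 <<= cnt;
    }
  udiv_qrnnd (q, r, n1, n0, d);
  *rem = r >> cnt;                           /* undo the remainder scaling */
  return q;
}
#endif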
173 
174 /* The CPUs come in alphabetical order below.
175 
176    Please add support for more CPUs here, or improve the current support
177    for the CPUs below!  */
178 
179 
180 /* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
181    3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
182    Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
183    __builtin_ctzll.
184 
185    These builtins are only used where we have checked what code comes out;
186    on some chips they are merely libgcc calls, in which case we instead
187    want an inline (either asm or generic C).
188 
189    These builtins are better than an asm block of the same insn, since an
190    asm block doesn't give gcc any information about scheduling or resource
191    usage.  We keep an asm block for use on prior versions of gcc though.
192 
193    For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
194    it's not used (for count_leading_zeros) because it generally gives extra
195    code to ensure the result is 0 when the input is 0, which we don't need
196    or want.  */
197 
198 #ifdef _LONG_LONG_LIMB
199 #define count_leading_zeros_gcc_clz(count,x)	\
200   do {						\
201     ASSERT ((x) != 0);				\
202     (count) = __builtin_clzll (x);		\
203   } while (0)
204 #else
205 #define count_leading_zeros_gcc_clz(count,x)	\
206   do {						\
207     ASSERT ((x) != 0);				\
208     (count) = __builtin_clzl (x);		\
209   } while (0)
210 #endif
211 
212 #ifdef _LONG_LONG_LIMB
213 #define count_trailing_zeros_gcc_ctz(count,x)	\
214   do {						\
215     ASSERT ((x) != 0);				\
216     (count) = __builtin_ctzll (x);		\
217   } while (0)
218 #else
219 #define count_trailing_zeros_gcc_ctz(count,x)	\
220   do {						\
221     ASSERT ((x) != 0);				\
222     (count) = __builtin_ctzl (x);		\
223   } while (0)
224 #endif
225 
226 
227 /* FIXME: The macros using external routines like __MPN(count_leading_zeros)
228    don't need to be under !NO_ASM */
229 #if ! defined (NO_ASM)
230 
231 #if defined (__alpha) && W_TYPE_SIZE == 64
232 /* Most alpha-based machines, except Cray systems. */
233 #if defined (__GNUC__)
234 #if __GMP_GNUC_PREREQ (3,3)
235 #define umul_ppmm(ph, pl, m0, m1) \
236   do {									\
237     UDItype __m0 = (m0), __m1 = (m1);					\
238     (ph) = __builtin_alpha_umulh (__m0, __m1);				\
239     (pl) = __m0 * __m1;							\
240   } while (0)
241 #else
242 #define umul_ppmm(ph, pl, m0, m1) \
243   do {									\
244     UDItype __m0 = (m0), __m1 = (m1);					\
245     __asm__ ("umulh %r1,%2,%0"						\
246 	     : "=r" (ph)						\
247 	     : "%rJ" (__m0), "rI" (__m1));				\
248     (pl) = __m0 * __m1;							\
249   } while (0)
250 #endif
251 #define UMUL_TIME 18
252 #else /* ! __GNUC__ */
253 #include <machine/builtins.h>
254 #define umul_ppmm(ph, pl, m0, m1) \
255   do {									\
256     UDItype __m0 = (m0), __m1 = (m1);					\
257     (ph) = __UMULH (__m0, __m1);					\
258     (pl) = __m0 * __m1;							\
259   } while (0)
260 #endif
261 #ifndef LONGLONG_STANDALONE
262 #define udiv_qrnnd(q, r, n1, n0, d) \
263   do { UWtype __di;							\
264     __di = __MPN(invert_limb) (d);					\
265     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
266   } while (0)
267 #define UDIV_PREINV_ALWAYS  1
268 #define UDIV_NEEDS_NORMALIZATION 1
269 #define UDIV_TIME 220
270 #endif /* LONGLONG_STANDALONE */
271 
272 /* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
273    always goes into libgmp.so, even when not actually used.  */
274 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
275 
276 #if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
277 #define count_leading_zeros(COUNT,X) \
278   __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
279 #define count_trailing_zeros(COUNT,X) \
280   __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
281 #endif /* clz/ctz using cix */
282 
283 #if ! defined (count_leading_zeros)				\
284   && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
285 /* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
286    "$31" is written explicitly in the asm, since an "r" constraint won't
287    select reg 31.  There seems no need to worry about "r31" syntax for cray,
288    since gcc itself (pre-release 3.4) emits just $31 in various places.	 */
289 #define ALPHA_CMPBGE_0(dst, src)					\
290   do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
291 /* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
292    them, locating the highest non-zero byte.  A second __clz_tab lookup
293    counts the leading zero bits in that byte, giving the result.  */
294 #define count_leading_zeros(count, x)					\
295   do {									\
296     UWtype  __clz__b, __clz__c, __clz__x = (x);				\
297     ALPHA_CMPBGE_0 (__clz__b,  __clz__x);	    /* zero bytes */	\
298     __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F];  /* 8 to 1 byte */	\
299     __clz__b = __clz__b * 8 - 7;		    /* 57 to 1 shift */ \
300     __clz__x >>= __clz__b;						\
301     __clz__c = __clz_tab [__clz__x];		    /* 8 to 1 bit */	\
302     __clz__b = 65 - __clz__b;						\
303     (count) = __clz__b - __clz__c;					\
304   } while (0)
305 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
306 #endif /* clz using cmpbge */
307 
308 #if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
309 #if HAVE_ATTRIBUTE_CONST
310 long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const));
311 #else
312 long __MPN(count_leading_zeros) (UDItype);
313 #endif
314 #define count_leading_zeros(count, x) \
315   ((count) = __MPN(count_leading_zeros) (x))
316 #endif /* clz using mpn */
317 #endif /* __alpha */
318 
319 #if defined (__AVR) && W_TYPE_SIZE == 8
320 #define umul_ppmm(ph, pl, m0, m1) \
321   do {									\
322     unsigned short __p = (unsigned short) (m0) * (m1);			\
323     (ph) = __p >> 8;							\
324     (pl) = __p;								\
325   } while (0)
326 #endif /* AVR */
327 
328 #if defined (_CRAY) && W_TYPE_SIZE == 64
329 #include <intrinsics.h>
330 #define UDIV_PREINV_ALWAYS  1
331 #define UDIV_NEEDS_NORMALIZATION 1
332 #define UDIV_TIME 220
333 long __MPN(count_leading_zeros) (UDItype);
334 #define count_leading_zeros(count, x) \
335   ((count) = _leadz ((UWtype) (x)))
336 #if defined (_CRAYIEEE)		/* I.e., Cray T90/ieee, T3D, and T3E */
337 #define umul_ppmm(ph, pl, m0, m1) \
338   do {									\
339     UDItype __m0 = (m0), __m1 = (m1);					\
340     (ph) = _int_mult_upper (__m0, __m1);				\
341     (pl) = __m0 * __m1;							\
342   } while (0)
343 #ifndef LONGLONG_STANDALONE
344 #define udiv_qrnnd(q, r, n1, n0, d) \
345   do { UWtype __di;							\
346     __di = __MPN(invert_limb) (d);					\
347     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
348   } while (0)
349 #endif /* LONGLONG_STANDALONE */
350 #endif /* _CRAYIEEE */
351 #endif /* _CRAY */
352 
353 #if defined (__ia64) && W_TYPE_SIZE == 64
354 /* This form encourages gcc (pre-release 3.4 at least) to emit predicated
355    "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
356    code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
357    register, which takes an extra cycle.  */
358 #define sub_ddmmss(sh, sl, ah, al, bh, bl)      \
359   do {						\
360     UWtype __x;					\
361     __x = (al) - (bl);				\
362     if ((al) < (bl))				\
363       (sh) = (ah) - (bh) - 1;			\
364     else					\
365       (sh) = (ah) - (bh);			\
366     (sl) = __x;					\
367   } while (0)
368 #if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
369 /* Do both product parts in assembly, since that gives better code with
370    all gcc versions.  Some callers will just use the upper part, and in
371    that situation we waste an instruction, but not any cycles.  */
372 #define umul_ppmm(ph, pl, m0, m1) \
373     __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0"		\
374 	     : "=&f" (ph), "=f" (pl)					\
375 	     : "f" (m0), "f" (m1))
376 #define UMUL_TIME 14
377 #define count_leading_zeros(count, x) \
378   do {									\
379     UWtype _x = (x), _y, _a, _c;					\
380     __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x));		\
381     __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y));		\
382     _c = (_a - 1) << 3;							\
383     _x >>= _c;								\
384     if (_x >= 1 << 4)							\
385       _x >>= 4, _c += 4;						\
386     if (_x >= 1 << 2)							\
387       _x >>= 2, _c += 2;						\
388     _c += _x >> 1;							\
389     (count) =  W_TYPE_SIZE - 1 - _c;					\
390   } while (0)
391 /* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
392    based, and we don't need a special case for x==0 here */
393 #define count_trailing_zeros(count, x)					\
394   do {									\
395     UWtype __ctz_x = (x);						\
396     __asm__ ("popcnt %0 = %1"						\
397 	     : "=r" (count)						\
398 	     : "r" ((__ctz_x-1) & ~__ctz_x));				\
399   } while (0)
400 #endif
401 #if defined (__INTEL_COMPILER)
402 #include <ia64intrin.h>
403 #define umul_ppmm(ph, pl, m0, m1)					\
404   do {									\
405     UWtype __m0 = (m0), __m1 = (m1);					\
406     ph = _m64_xmahu (__m0, __m1, 0);					\
407     pl = __m0 * __m1;							\
408   } while (0)
409 #endif
410 #ifndef LONGLONG_STANDALONE
411 #define udiv_qrnnd(q, r, n1, n0, d) \
412   do { UWtype __di;							\
413     __di = __MPN(invert_limb) (d);					\
414     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
415   } while (0)
416 #define UDIV_PREINV_ALWAYS  1
417 #define UDIV_NEEDS_NORMALIZATION 1
418 #endif
419 #define UDIV_TIME 220
420 #endif
421 
422 
423 #if defined (__GNUC__)
424 
425 /* We sometimes need to clobber "cc" with gcc2, but that would not be
426    understood by gcc1.  Use cpp to avoid major code duplication.  */
427 #if __GNUC__ < 2
428 #define __CLOBBER_CC
429 #define __AND_CLOBBER_CC
430 #else /* __GNUC__ >= 2 */
431 #define __CLOBBER_CC : "cc"
432 #define __AND_CLOBBER_CC , "cc"
433 #endif /* __GNUC__ < 2 */
434 
435 #if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
436 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
437   __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3"				\
438 	   : "=r" (sh), "=&r" (sl)					\
439 	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
440 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
441   __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3"				\
442 	   : "=r" (sh), "=&r" (sl)					\
443 	   : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
444 #define umul_ppmm(xh, xl, m0, m1) \
445   do {									\
446     USItype __m0 = (m0), __m1 = (m1);					\
447     __asm__ ("multiplu %0,%1,%2"					\
448 	     : "=r" (xl)						\
449 	     : "r" (__m0), "r" (__m1));					\
450     __asm__ ("multmu %0,%1,%2"						\
451 	     : "=r" (xh)						\
452 	     : "r" (__m0), "r" (__m1));					\
453   } while (0)
454 #define udiv_qrnnd(q, r, n1, n0, d) \
455   __asm__ ("dividu %0,%3,%4"						\
456 	   : "=r" (q), "=q" (r)						\
457 	   : "1" (n1), "r" (n0), "r" (d))
458 #define count_leading_zeros(count, x) \
459     __asm__ ("clz %0,%1"						\
460 	     : "=r" (count)						\
461 	     : "r" (x))
462 #define COUNT_LEADING_ZEROS_0 32
463 #endif /* __a29k__ */
464 
465 #if defined (__arc__)
466 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
467   __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3"			\
468 	   : "=r" (sh),							\
469 	     "=&r" (sl)							\
470 	   : "r"  ((USItype) (ah)),					\
471 	     "rIJ" ((USItype) (bh)),					\
472 	     "%r" ((USItype) (al)),					\
473 	     "rIJ" ((USItype) (bl)))
474 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
475   __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
476 	   : "=r" (sh),							\
477 	     "=&r" (sl)							\
478 	   : "r" ((USItype) (ah)),					\
479 	     "rIJ" ((USItype) (bh)),					\
480 	     "r" ((USItype) (al)),					\
481 	     "rIJ" ((USItype) (bl)))
482 #endif
483 
484 #if defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) \
485     && W_TYPE_SIZE == 32
486 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
487   __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3"			\
488 	   : "=r" (sh), "=&r" (sl)					\
489 	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
490 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
491   do {									\
492     if (__builtin_constant_p (al))					\
493       {									\
494 	if (__builtin_constant_p (ah))					\
495 	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"		\
496 		   : "=r" (sh), "=&r" (sl)				\
497 		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
498 	else								\
499 	  __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3"		\
500 		   : "=r" (sh), "=&r" (sl)				\
501 		   : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
502       }									\
503     else if (__builtin_constant_p (ah))					\
504       {									\
505 	if (__builtin_constant_p (bl))					\
506 	  __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"		\
507 		   : "=r" (sh), "=&r" (sl)				\
508 		   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
509 	else								\
510 	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"		\
511 		   : "=r" (sh), "=&r" (sl)				\
512 		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
513       }									\
514     else if (__builtin_constant_p (bl))					\
515       {									\
516 	if (__builtin_constant_p (bh))					\
517 	  __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"		\
518 		   : "=r" (sh), "=&r" (sl)				\
519 		   : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
520 	else								\
521 	  __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"		\
522 		   : "=r" (sh), "=&r" (sl)				\
523 		   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
524       }									\
525     else /* only bh might be a constant */				\
526       __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
527 	       : "=r" (sh), "=&r" (sl)					\
528 	       : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\
529     } while (0)
530 #if defined (__ARM_ARCH_2__) || defined (__ARM_ARCH_2A__) \
531     || defined (__ARM_ARCH_3__)
532 #define umul_ppmm(xh, xl, a, b)						\
533   do {									\
534     register USItype __t0, __t1, __t2;					\
535     __asm__ ("%@ Inlined umul_ppmm\n"					\
536 	   "	mov	%2, %5, lsr #16\n"				\
537 	   "	mov	%0, %6, lsr #16\n"				\
538 	   "	bic	%3, %5, %2, lsl #16\n"				\
539 	   "	bic	%4, %6, %0, lsl #16\n"				\
540 	   "	mul	%1, %3, %4\n"					\
541 	   "	mul	%4, %2, %4\n"					\
542 	   "	mul	%3, %0, %3\n"					\
543 	   "	mul	%0, %2, %0\n"					\
544 	   "	adds	%3, %4, %3\n"					\
545 	   "	addcs	%0, %0, #65536\n"				\
546 	   "	adds	%1, %1, %3, lsl #16\n"				\
547 	   "	adc	%0, %0, %3, lsr #16"				\
548 	   : "=&r" ((USItype) (xh)), "=r" ((USItype) (xl)),		\
549 	     "=&r" (__t0), "=&r" (__t1), "=r" (__t2)			\
550 	   : "r" ((USItype) (a)), "r" ((USItype) (b)) __CLOBBER_CC);	\
551   } while (0)
552 #define UMUL_TIME 20
553 #define udiv_qrnnd(q, r, n1, n0, d) \
554   do { UWtype __r;							\
555     (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));			\
556     (r) = __r;								\
557   } while (0)
558 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
559 #define UDIV_TIME 200
560 #else /* ARMv4 or newer */
561 #define umul_ppmm(xh, xl, a, b) \
562   __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
563 #define UMUL_TIME 5
564 #define smul_ppmm(xh, xl, a, b) \
565   __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
566 #ifndef LONGLONG_STANDALONE
567 #define udiv_qrnnd(q, r, n1, n0, d) \
568   do { UWtype __di;							\
569     __di = __MPN(invert_limb) (d);					\
570     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
571   } while (0)
572 #define UDIV_PREINV_ALWAYS  1
573 #define UDIV_NEEDS_NORMALIZATION 1
574 #define UDIV_TIME 70
575 #endif /* LONGLONG_STANDALONE */
576 #endif /* defined(__ARM_ARCH_2__) ... */
577 #define count_leading_zeros(count, x)  count_leading_zeros_gcc_clz(count, x)
578 #define count_trailing_zeros(count, x)  count_trailing_zeros_gcc_ctz(count, x)
579 #define COUNT_LEADING_ZEROS_0 32
580 #endif /* __arm__ */
581 
582 #if defined (__aarch64__) && W_TYPE_SIZE == 64
583 /* FIXME: Extend the immediate range for the low word by using both
584    ADDS and SUBS, since they set carry in the same way.  */
585 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
586   __asm__ ("adds\t%1, %x4, %5\n\tadc\t%0, %x2, %x3"			\
587 	   : "=r" (sh), "=&r" (sl)					\
588 	   : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)),		\
589 	     "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC)
590 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
591   __asm__ ("subs\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3"			\
592 	   : "=r,r" (sh), "=&r,&r" (sl)					\
593 	   : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)),		\
594 	     "r,Z"   ((UDItype)(al)), "rI,r"  ((UDItype)(bl)) __CLOBBER_CC)
595 #define umul_ppmm(ph, pl, m0, m1) \
596   do {									\
597     UDItype __m0 = (m0), __m1 = (m1);					\
598     __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (__m0), "r" (__m1));	\
599     (pl) = __m0 * __m1;							\
600   } while (0)
601 #define count_leading_zeros(count, x)  count_leading_zeros_gcc_clz(count, x)
602 #define count_trailing_zeros(count, x)  count_trailing_zeros_gcc_ctz(count, x)
603 #define COUNT_LEADING_ZEROS_0 64
604 #endif /* __aarch64__ */
605 
606 #if defined (__clipper__) && W_TYPE_SIZE == 32
607 #define umul_ppmm(w1, w0, u, v) \
608   ({union {UDItype __ll;						\
609 	   struct {USItype __l, __h;} __i;				\
610 	  } __x;							\
611   __asm__ ("mulwux %2,%0"						\
612 	   : "=r" (__x.__ll)						\
613 	   : "%0" ((USItype)(u)), "r" ((USItype)(v)));			\
614   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
615 #define smul_ppmm(w1, w0, u, v) \
616   ({union {DItype __ll;							\
617 	   struct {SItype __l, __h;} __i;				\
618 	  } __x;							\
619   __asm__ ("mulwx %2,%0"						\
620 	   : "=r" (__x.__ll)						\
621 	   : "%0" ((SItype)(u)), "r" ((SItype)(v)));			\
622   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
623 #define __umulsidi3(u, v) \
624   ({UDItype __w;							\
625     __asm__ ("mulwux %2,%0"						\
626 	     : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v)));	\
627     __w; })
628 #endif /* __clipper__ */
629 
630 /* Fujitsu vector computers.  */
631 #if defined (__uxp__) && W_TYPE_SIZE == 32
632 #define umul_ppmm(ph, pl, u, v) \
633   do {									\
634     union {UDItype __ll;						\
635 	   struct {USItype __h, __l;} __i;				\
636 	  } __x;							\
637     __asm__ ("mult.lu %1,%2,%0"	: "=r" (__x.__ll) : "%r" (u), "rK" (v));\
638     (ph) = __x.__i.__h;							\
639     (pl) = __x.__i.__l;							\
640   } while (0)
641 #define smul_ppmm(ph, pl, u, v) \
642   do {									\
643     union {UDItype __ll;						\
644 	   struct {USItype __h, __l;} __i;				\
645 	  } __x;							\
646     __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));	\
647     (ph) = __x.__i.__h;							\
648     (pl) = __x.__i.__l;							\
649   } while (0)
650 #endif
651 
652 #if defined (__gmicro__) && W_TYPE_SIZE == 32
653 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
654   __asm__ ("add.w %5,%1\n\taddx %3,%0"					\
655 	   : "=g" (sh), "=&g" (sl)					\
656 	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
657 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
658 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
659   __asm__ ("sub.w %5,%1\n\tsubx %3,%0"					\
660 	   : "=g" (sh), "=&g" (sl)					\
661 	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
662 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
663 #define umul_ppmm(ph, pl, m0, m1) \
664   __asm__ ("mulx %3,%0,%1"						\
665 	   : "=g" (ph), "=r" (pl)					\
666 	   : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
667 #define udiv_qrnnd(q, r, nh, nl, d) \
668   __asm__ ("divx %4,%0,%1"						\
669 	   : "=g" (q), "=r" (r)						\
670 	   : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
671 #define count_leading_zeros(count, x) \
672   __asm__ ("bsch/1 %1,%0"						\
673 	   : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
674 #endif
675 
676 #if defined (__hppa) && W_TYPE_SIZE == 32
677 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
678   __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0"			\
679 	   : "=r" (sh), "=&r" (sl)					\
680 	   : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
681 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
682   __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0"			\
683 	   : "=r" (sh), "=&r" (sl)					\
684 	   : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
685 #if defined (_PA_RISC1_1)
686 #define umul_ppmm(wh, wl, u, v) \
687   do {									\
688     union {UDItype __ll;						\
689 	   struct {USItype __h, __l;} __i;				\
690 	  } __x;							\
691     __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v));	\
692     (wh) = __x.__i.__h;							\
693     (wl) = __x.__i.__l;							\
694   } while (0)
695 #define UMUL_TIME 8
696 #define UDIV_TIME 60
697 #else
698 #define UMUL_TIME 40
699 #define UDIV_TIME 80
700 #endif
701 #define count_leading_zeros(count, x) \
702   do {									\
703     USItype __tmp;							\
704     __asm__ (								\
705        "ldi		1,%0\n"						\
706 "	extru,=		%1,15,16,%%r0	; Bits 31..16 zero?\n"		\
707 "	extru,tr	%1,15,16,%1	; No.  Shift down, skip add.\n"	\
708 "	ldo		16(%0),%0	; Yes.  Perform add.\n"		\
709 "	extru,=		%1,23,8,%%r0	; Bits 15..8 zero?\n"		\
710 "	extru,tr	%1,23,8,%1	; No.  Shift down, skip add.\n"	\
711 "	ldo		8(%0),%0	; Yes.  Perform add.\n"		\
712 "	extru,=		%1,27,4,%%r0	; Bits 7..4 zero?\n"		\
713 "	extru,tr	%1,27,4,%1	; No.  Shift down, skip add.\n"	\
714 "	ldo		4(%0),%0	; Yes.  Perform add.\n"		\
715 "	extru,=		%1,29,2,%%r0	; Bits 3..2 zero?\n"		\
716 "	extru,tr	%1,29,2,%1	; No.  Shift down, skip add.\n"	\
717 "	ldo		2(%0),%0	; Yes.  Perform add.\n"		\
718 "	extru		%1,30,1,%1	; Extract bit 1.\n"		\
719 "	sub		%0,%1,%0	; Subtract it.\n"		\
720 	: "=r" (count), "=r" (__tmp) : "1" (x));			\
721   } while (0)
722 #endif /* hppa */
723 
724 /* These macros are for ABI=2.0w.  In ABI=2.0n they can't be used, since GCC
725    (3.2) puts long longs into two adjacent 32-bit registers.  Presumably this
726    is just a case of no direct support for 2.0n but treating it like 1.0. */
727 #if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
728 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
729   __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0"			\
730 	   : "=r" (sh), "=&r" (sl)					\
731 	   : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
732 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
733   __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0"			\
734 	   : "=r" (sh), "=&r" (sl)					\
735 	   : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
736 #endif /* hppa */
737 
738 #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
739 #if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch)
740 #define add_ssaaaa(sh, sl, ah, al, bh, bl)				\
741   do {									\
742 /*  if (__builtin_constant_p (bl))					\
743       __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3"				\
744 	       : "=r" (sh), "=&r" (sl)					\
745 	       : "0"  (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\
746     else								\
747 */    __asm__ ("alr\t%1,%5\n\talcr\t%0,%3"				\
748 	       : "=r" (sh), "=&r" (sl)					\
749 	       : "0"  (ah), "r" (bh), "%1" (al), "r" (bl)__CLOBBER_CC);	\
750   } while (0)
751 #define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
752   do {									\
753 /*  if (__builtin_constant_p (bl))					\
754       __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3"				\
755 	       : "=r" (sh), "=&r" (sl)					\
756 	       : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC);	\
757     else								\
758 */    __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3"				\
759 	       : "=r" (sh), "=&r" (sl)					\
760 	       : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC);	\
761   } while (0)
762 #if __GMP_GNUC_PREREQ (4,5)
763 #define umul_ppmm(xh, xl, m0, m1)					\
764   do {									\
765     union {UDItype __ll;						\
766 	   struct {USItype __h, __l;} __i;				\
767 	  } __x;							\
768     __x.__ll = (UDItype) (m0) * (UDItype) (m1);				\
769     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
770   } while (0)
771 #else
772 #if 0
773 /* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
774    with a new enough processor pretending we have 32-bit registers.  */
775 #define umul_ppmm(xh, xl, m0, m1)					\
776   do {									\
777     union {UDItype __ll;						\
778 	   struct {USItype __h, __l;} __i;				\
779 	  } __x;							\
780     __asm__ ("mlr\t%0,%2"						\
781 	     : "=r" (__x.__ll)						\
782 	     : "%0" (m0), "r" (m1));					\
783     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
784   } while (0)
785 #else
786 #define umul_ppmm(xh, xl, m0, m1)					\
787   do {									\
788   /* When we have 64-bit regs and gcc is aware of that, we cannot simply use
789      DImode for the product, since that would be allocated to a single 64-bit
790      register, whereas mlr uses the low 32-bits of an even-odd register pair.
791   */									\
792     register USItype __r0 __asm__ ("0");				\
793     register USItype __r1 __asm__ ("1") = (m0);				\
794     __asm__ ("mlr\t%0,%3"						\
795 	     : "=r" (__r0), "=r" (__r1)					\
796 	     : "r" (__r1), "r" (m1));					\
797     (xh) = __r0; (xl) = __r1;						\
798   } while (0)
799 #endif /* if 0 */
800 #endif
801 #if 0
802 /* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
803    with a new enough processor pretending we have 32-bit registers.  */
804 #define udiv_qrnnd(q, r, n1, n0, d)					\
805   do {									\
806     union {UDItype __ll;						\
807 	   struct {USItype __h, __l;} __i;				\
808 	  } __x;							\
809     __x.__i.__h = n1; __x.__i.__l = n0;					\
810     __asm__ ("dlr\t%0,%2"						\
811 	     : "=r" (__x.__ll)						\
812 	     : "0" (__x.__ll), "r" (d));				\
813     (q) = __x.__i.__l; (r) = __x.__i.__h;				\
814   } while (0)
815 #else
816 #define udiv_qrnnd(q, r, n1, n0, d)					\
817   do {									\
818     register USItype __r0 __asm__ ("0") = (n1);				\
819     register USItype __r1 __asm__ ("1") = (n0);				\
820     __asm__ ("dlr\t%0,%4"						\
821 	     : "=r" (__r0), "=r" (__r1)					\
822 	     : "r" (__r0), "r" (__r1), "r" (d));			\
823     (q) = __r1; (r) = __r0;						\
824   } while (0)
825 #endif /* if 0 */
826 #else /* if __zarch__ */
827 /* FIXME: this fails if gcc knows about the 64-bit registers.  */
828 #define smul_ppmm(xh, xl, m0, m1)					\
829   do {									\
830     union {DItype __ll;							\
831 	   struct {USItype __h, __l;} __i;				\
832 	  } __x;							\
833     __asm__ ("mr\t%0,%2"						\
834 	     : "=r" (__x.__ll)						\
835 	     : "%0" (m0), "r" (m1));					\
836     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
837   } while (0)
838 /* FIXME: this fails if gcc knows about the 64-bit registers.  */
839 #define sdiv_qrnnd(q, r, n1, n0, d)					\
840   do {									\
841     union {DItype __ll;							\
842 	   struct {USItype __h, __l;} __i;				\
843 	  } __x;							\
844     __x.__i.__h = n1; __x.__i.__l = n0;					\
845     __asm__ ("dr\t%0,%2"						\
846 	     : "=r" (__x.__ll)						\
847 	     : "0" (__x.__ll), "r" (d));				\
848     (q) = __x.__i.__l; (r) = __x.__i.__h;				\
849   } while (0)
850 #endif /* if __zarch__ */
851 #endif
852 
853 #if defined (__s390x__) && W_TYPE_SIZE == 64
854 /* We need to cast operands with register constraints, otherwise their types
855    will be assumed to be SImode by gcc.  For these machines, such operations
856    will insert a value into the low 32 bits, and leave the high 32 bits with
857    garbage.  */
858 #define add_ssaaaa(sh, sl, ah, al, bh, bl)				\
859   do {									\
860     __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3"				\
861 	       : "=r" (sh), "=&r" (sl)					\
862 	       : "0"  ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
863 		 "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
864   } while (0)
865 #define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
866   do {									\
867     __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3"				\
868 	     : "=r" (sh), "=&r" (sl)					\
869 	     : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
870 	       "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC);	\
871   } while (0)
872 #define umul_ppmm(xh, xl, m0, m1)					\
873   do {									\
874     union {unsigned int __attribute__ ((mode(TI))) __ll;		\
875 	   struct {UDItype __h, __l;} __i;				\
876 	  } __x;							\
877     __asm__ ("mlgr\t%0,%2"						\
878 	     : "=r" (__x.__ll)						\
879 	     : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1)));		\
880     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
881   } while (0)
882 #define udiv_qrnnd(q, r, n1, n0, d)					\
883   do {									\
884     union {unsigned int __attribute__ ((mode(TI))) __ll;		\
885 	   struct {UDItype __h, __l;} __i;				\
886 	  } __x;							\
887     __x.__i.__h = n1; __x.__i.__l = n0;					\
888     __asm__ ("dlgr\t%0,%2"						\
889 	     : "=r" (__x.__ll)						\
890 	     : "0" (__x.__ll), "r" ((UDItype)(d)));			\
891     (q) = __x.__i.__l; (r) = __x.__i.__h;				\
892   } while (0)
893 #if 0 /* FIXME: Enable for z10 (?) */
894 #define count_leading_zeros(cnt, x)					\
895   do {									\
896     union {unsigned int __attribute__ ((mode(TI))) __ll;		\
897 	   struct {UDItype __h, __l;} __i;				\
898 	  } __clr_cnt;							\
899     __asm__ ("flogr\t%0,%1"						\
900 	     : "=r" (__clr_cnt.__ll)					\
901 	     : "r" (x) __CLOBBER_CC);					\
902     (cnt) = __clr_cnt.__i.__h;						\
903   } while (0)
904 #endif
905 #endif
906 
907 /* On x86 and x86_64, every asm implicitly clobbers "flags" and "fpsr",
908    so we don't need __CLOBBER_CC.  */
909 #if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
910 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
911   __asm__ ("addl %5,%k1\n\tadcl %3,%k0"					\
912 	   : "=r" (sh), "=&r" (sl)					\
913 	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
914 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
915 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
916   __asm__ ("subl %5,%k1\n\tsbbl %3,%k0"					\
917 	   : "=r" (sh), "=&r" (sl)					\
918 	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
919 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
920 #define umul_ppmm(w1, w0, u, v) \
921   __asm__ ("mull %3"							\
922 	   : "=a" (w0), "=d" (w1)					\
923 	   : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
924 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
925   __asm__ ("divl %4"		     /* stringification in K&R C */	\
926 	   : "=a" (q), "=d" (r)						\
927 	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
928 
929 #if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
930 /* Pentium bsrl takes between 10 and 72 cycles depending where the most
931    significant 1 bit is, hence the use of the following alternatives.  bsfl
932    is slow too, between 18 and 42 depending where the least significant 1
933    bit is, so let the generic count_trailing_zeros below make use of the
934    count_leading_zeros here too.  */
935 
936 #if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
937 /* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
938    cache miss reading from __clz_tab.  For P55 it's favoured over the float
939    below so as to avoid mixing MMX and x87, since the penalty for switching
940    between the two is about 100 cycles.
941 
942    The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
943    16, -1 for 8, or 0 otherwise.  This could be written equivalently as
944    follows, but as of gcc 2.95.2 it results in conditional jumps.
945 
946        __shift = -(__n < 0x1000000);
947        __shift -= (__n < 0x10000);
948        __shift -= (__n < 0x100);
949 
950    The middle two sbbl and cmpl's pair, and with luck something gcc
951    generates might pair with the first cmpl and the last sbbl.  The "32+1"
952    constant could be folded into __clz_tab[], but it doesn't seem worth
953    making a different table just for that.  */
954 
955 #define count_leading_zeros(c,n)					\
956   do {									\
957     USItype  __n = (n);							\
958     USItype  __shift;							\
959     __asm__ ("cmpl  $0x1000000, %1\n"					\
960 	     "sbbl  %0, %0\n"						\
961 	     "cmpl  $0x10000, %1\n"					\
962 	     "sbbl  $0, %0\n"						\
963 	     "cmpl  $0x100, %1\n"					\
964 	     "sbbl  $0, %0\n"						\
965 	     : "=&r" (__shift) : "r"  (__n));				\
966     __shift = __shift*8 + 24 + 1;					\
967     (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift];			\
968   } while (0)
969 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
970 #define COUNT_LEADING_ZEROS_0   31   /* n==0 indistinguishable from n==1 */
971 
972 #else /* ! pentiummmx || LONGLONG_STANDALONE */
973 /* The following should be a fixed 14 cycles or so.  Some scheduling
974    opportunities should be available between the float load/store too.  This
975    sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
976    apparently suggested by the Intel optimizing manual (don't know exactly
977    where).  gcc 2.95 or up will be best for this, so the "double" is
978    correctly aligned on the stack.  */
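/* Why this works: on x86 the conversion to double is exact for any 32-bit n,
   __u.a[1] is the high word (little endian), and bits 30..20 of that word
   hold the biased exponent 0x3FF + floor(log2(n)); subtracting it from
   0x3FF + 31 therefore leaves 31 - floor(log2(n)), i.e. the number of
   leading zero bits.  */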
979 #define count_leading_zeros(c,n)					\
980   do {									\
981     union {								\
982       double    d;							\
983       unsigned  a[2];							\
984     } __u;								\
985     ASSERT ((n) != 0);							\
986     __u.d = (UWtype) (n);						\
987     (c) = 0x3FF + 31 - (__u.a[1] >> 20);				\
988   } while (0)
989 #define COUNT_LEADING_ZEROS_0   (0x3FF + 31)
990 #endif /* pentiummmx */
991 
992 #else /* ! pentium */
993 
994 #if __GMP_GNUC_PREREQ (3,4)  /* using bsrl */
995 #define count_leading_zeros(count,x)  count_leading_zeros_gcc_clz(count,x)
996 #endif /* gcc clz */
997 
998 /* On P6, gcc prior to 3.0 generates a partial register stall for
999    __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
1000    being 1 code byte smaller.  "31-__cbtmp" is a workaround, probably at the
1001    cost of one extra instruction.  Do this for "i386" too, since that means
1002    generic x86.  */
1003 #if ! defined (count_leading_zeros) && __GNUC__ < 3			\
1004   && (HAVE_HOST_CPU_i386						\
1005       || HAVE_HOST_CPU_i686						\
1006       || HAVE_HOST_CPU_pentiumpro					\
1007       || HAVE_HOST_CPU_pentium2						\
1008       || HAVE_HOST_CPU_pentium3)
1009 #define count_leading_zeros(count, x)					\
1010   do {									\
1011     USItype __cbtmp;							\
1012     ASSERT ((x) != 0);							\
1013     __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));	\
1014     (count) = 31 - __cbtmp;						\
1015   } while (0)
1016 #endif /* gcc<3 asm bsrl */
1017 
1018 #ifndef count_leading_zeros
1019 #define count_leading_zeros(count, x)					\
1020   do {									\
1021     USItype __cbtmp;							\
1022     ASSERT ((x) != 0);							\
1023     __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));	\
1024     (count) = __cbtmp ^ 31;						\
1025   } while (0)
1026 #endif /* asm bsrl */
1027 
1028 #if __GMP_GNUC_PREREQ (3,4)  /* using bsfl */
1029 #define count_trailing_zeros(count,x)  count_trailing_zeros_gcc_ctz(count,x)
1030 #endif /* gcc ctz */
1031 
1032 #ifndef count_trailing_zeros
1033 #define count_trailing_zeros(count, x)					\
1034   do {									\
1035     ASSERT ((x) != 0);							\
1036     __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x)));	\
1037   } while (0)
1038 #endif /* asm bsfl */
1039 
1040 #endif /* ! pentium */
1041 
1042 #ifndef UMUL_TIME
1043 #define UMUL_TIME 10
1044 #endif
1045 #ifndef UDIV_TIME
1046 #define UDIV_TIME 40
1047 #endif
1048 #endif /* 80x86 */
1049 
1050 #if defined (__amd64__) && W_TYPE_SIZE == 64
1051 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1052   __asm__ ("addq %5,%q1\n\tadcq %3,%q0"					\
1053 	   : "=r" (sh), "=&r" (sl)					\
1054 	   : "0"  ((UDItype)(ah)), "rme" ((UDItype)(bh)),		\
1055 	     "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
1056 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1057   __asm__ ("subq %5,%q1\n\tsbbq %3,%q0"					\
1058 	   : "=r" (sh), "=&r" (sl)					\
1059 	   : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)),		\
1060 	     "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
1061 #define umul_ppmm(w1, w0, u, v) \
1062   __asm__ ("mulq %3"							\
1063 	   : "=a" (w0), "=d" (w1)					\
1064 	   : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
1065 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
1066   __asm__ ("divq %4"		     /* stringification in K&R C */	\
1067 	   : "=a" (q), "=d" (r)						\
1068 	   : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
1069 /* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */
1070 #define count_leading_zeros(count, x)					\
1071   do {									\
1072     UDItype __cbtmp;							\
1073     ASSERT ((x) != 0);							\
1074     __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x)));	\
1075     (count) = __cbtmp ^ 63;						\
1076   } while (0)
1077 /* bsfq destination must be a 64-bit register, "%q0" forces this in case
1078    count is only an int. */
1079 #define count_trailing_zeros(count, x)					\
1080   do {									\
1081     ASSERT ((x) != 0);							\
1082     __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x)));	\
1083   } while (0)
1084 #endif /* __amd64__ */
1085 
1086 #if defined (__i860__) && W_TYPE_SIZE == 32
1087 #define rshift_rhlc(r,h,l,c) \
1088   __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0"				\
1089 	   : "=r" (r) : "r" (h), "r" (l), "rn" (c))
1090 #endif /* i860 */
1091 
1092 #if defined (__i960__) && W_TYPE_SIZE == 32
1093 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1094   __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0"			\
1095 	   : "=r" (sh), "=&r" (sl)					\
1096 	   : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
1097 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1098   __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0"			\
1099 	   : "=r" (sh), "=&r" (sl)					\
1100 	   : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
1101 #define umul_ppmm(w1, w0, u, v) \
1102   ({union {UDItype __ll;						\
1103 	   struct {USItype __l, __h;} __i;				\
1104 	  } __x;							\
1105   __asm__ ("emul %2,%1,%0"						\
1106 	   : "=d" (__x.__ll) : "%dI" (u), "dI" (v));			\
1107   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1108 #define __umulsidi3(u, v) \
1109   ({UDItype __w;							\
1110     __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v));	\
1111     __w; })
1112 #define udiv_qrnnd(q, r, nh, nl, d) \
1113   do {									\
1114     union {UDItype __ll;						\
1115 	   struct {USItype __l, __h;} __i;				\
1116 	  } __nn, __rq;							\
1117     __nn.__i.__h = (nh); __nn.__i.__l = (nl);				\
1118     __asm__ ("ediv %2,%1,%0"						\
1119 	   : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d));		\
1120     (r) = __rq.__i.__l; (q) = __rq.__i.__h;				\
1121   } while (0)
1122 #define count_leading_zeros(count, x) \
1123   do {									\
1124     USItype __cbtmp;							\
1125     __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x));		\
1126     (count) = __cbtmp ^ 31;						\
1127   } while (0)
1128 #define COUNT_LEADING_ZEROS_0 (-32) /* sic */
1129 #if defined (__i960mx)		/* what is the proper symbol to test??? */
1130 #define rshift_rhlc(r,h,l,c) \
1131   do {									\
1132     union {UDItype __ll;						\
1133 	   struct {USItype __l, __h;} __i;				\
1134 	  } __nn;							\
1135     __nn.__i.__h = (h); __nn.__i.__l = (l);				\
1136     __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c));	\
1137   } while (0)
1138 #endif /* i960mx */
1139 #endif /* i960 */
1140 
1141 #if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
1142      || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
1143      || defined (__mc5307__)) && W_TYPE_SIZE == 32
1144 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1145   __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0"				\
1146 	   : "=d" (sh), "=&d" (sl)					\
1147 	   : "0"  ((USItype)(ah)), "d" ((USItype)(bh)),			\
1148 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1149 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1150   __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0"				\
1151 	   : "=d" (sh), "=&d" (sl)					\
1152 	   : "0" ((USItype)(ah)), "d" ((USItype)(bh)),			\
1153 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
1154 /* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r.  */
1155 #if defined (__mc68020__) || defined(mc68020) \
1156      || defined (__mc68030__) || defined (mc68030) \
1157      || defined (__mc68040__) || defined (mc68040) \
1158      || defined (__mcpu32__) || defined (mcpu32) \
1159      || defined (__NeXT__)
1160 #define umul_ppmm(w1, w0, u, v) \
1161   __asm__ ("mulu%.l %3,%1:%0"						\
1162 	   : "=d" (w0), "=d" (w1)					\
1163 	   : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
1164 #define UMUL_TIME 45
1165 #define udiv_qrnnd(q, r, n1, n0, d) \
1166   __asm__ ("divu%.l %4,%1:%0"						\
1167 	   : "=d" (q), "=d" (r)						\
1168 	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1169 #define UDIV_TIME 90
1170 #define sdiv_qrnnd(q, r, n1, n0, d) \
1171   __asm__ ("divs%.l %4,%1:%0"						\
1172 	   : "=d" (q), "=d" (r)						\
1173 	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1174 #else /* for other 68k family members use 16x16->32 multiplication */
1175 #define umul_ppmm(xh, xl, a, b) \
1176   do { USItype __umul_tmp1, __umul_tmp2;				\
1177 	__asm__ ("| Inlined umul_ppmm\n"				\
1178 "	move%.l	%5,%3\n"						\
1179 "	move%.l	%2,%0\n"						\
1180 "	move%.w	%3,%1\n"						\
1181 "	swap	%3\n"							\
1182 "	swap	%0\n"							\
1183 "	mulu%.w	%2,%1\n"						\
1184 "	mulu%.w	%3,%0\n"						\
1185 "	mulu%.w	%2,%3\n"						\
1186 "	swap	%2\n"							\
1187 "	mulu%.w	%5,%2\n"						\
1188 "	add%.l	%3,%2\n"						\
1189 "	jcc	1f\n"							\
1190 "	add%.l	%#0x10000,%0\n"						\
1191 "1:	move%.l	%2,%3\n"						\
1192 "	clr%.w	%2\n"							\
1193 "	swap	%2\n"							\
1194 "	swap	%3\n"							\
1195 "	clr%.w	%3\n"							\
1196 "	add%.l	%3,%1\n"						\
1197 "	addx%.l	%2,%0\n"						\
1198 "	| End inlined umul_ppmm"					\
1199 	      : "=&d" (xh), "=&d" (xl),					\
1200 		"=d" (__umul_tmp1), "=&d" (__umul_tmp2)			\
1201 	      : "%2" ((USItype)(a)), "d" ((USItype)(b)));		\
1202   } while (0)
1203 #define UMUL_TIME 100
1204 #define UDIV_TIME 400
1205 #endif /* not mc68020 */
1206 /* The '020, '030, '040 and '060 have bitfield insns.
1207    GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to
1208    exclude bfffo on that chip (bitfield insns not available).  */
1209 #if (defined (__mc68020__) || defined (mc68020)    \
1210      || defined (__mc68030__) || defined (mc68030) \
1211      || defined (__mc68040__) || defined (mc68040) \
1212      || defined (__mc68060__) || defined (mc68060) \
1213      || defined (__NeXT__))			   \
1214   && ! defined (__mcpu32__)
1215 #define count_leading_zeros(count, x) \
1216   __asm__ ("bfffo %1{%b2:%b2},%0"					\
1217 	   : "=d" (count)						\
1218 	   : "od" ((USItype) (x)), "n" (0))
1219 #define COUNT_LEADING_ZEROS_0 32
1220 #endif
1221 #endif /* mc68000 */
1222 
1223 #if defined (__m88000__) && W_TYPE_SIZE == 32
1224 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1225   __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3"			\
1226 	   : "=r" (sh), "=&r" (sl)					\
1227 	   : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
1228 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1229   __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3"			\
1230 	   : "=r" (sh), "=&r" (sl)					\
1231 	   : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
1232 #define count_leading_zeros(count, x) \
1233   do {									\
1234     USItype __cbtmp;							\
1235     __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x));			\
1236     (count) = __cbtmp ^ 31;						\
1237   } while (0)
1238 #define COUNT_LEADING_ZEROS_0 63 /* sic */
1239 #if defined (__m88110__)
1240 #define umul_ppmm(wh, wl, u, v) \
1241   do {									\
1242     union {UDItype __ll;						\
1243 	   struct {USItype __h, __l;} __i;				\
1244 	  } __x;							\
1245     __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v));	\
1246     (wh) = __x.__i.__h;							\
1247     (wl) = __x.__i.__l;							\
1248   } while (0)
1249 #define udiv_qrnnd(q, r, n1, n0, d) \
1250   ({union {UDItype __ll;						\
1251 	   struct {USItype __h, __l;} __i;				\
1252 	  } __x, __q;							\
1253   __x.__i.__h = (n1); __x.__i.__l = (n0);				\
1254   __asm__ ("divu.d %0,%1,%2"						\
1255 	   : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d));		\
1256   (r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; })
1257 #define UMUL_TIME 5
1258 #define UDIV_TIME 25
1259 #else
1260 #define UMUL_TIME 17
1261 #define UDIV_TIME 150
1262 #endif /* __m88110__ */
1263 #endif /* __m88000__ */
1264 
1265 #if defined (__mips) && W_TYPE_SIZE == 32
1266 #if __GMP_GNUC_PREREQ (4,4)
1267 #define umul_ppmm(w1, w0, u, v) \
1268   do {									\
1269     UDItype __ll = (UDItype)(u) * (v);					\
1270     w1 = __ll >> 32;							\
1271     w0 = __ll;								\
1272   } while (0)
1273 #endif
1274 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
1275 #define umul_ppmm(w1, w0, u, v) \
1276   __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1277 #endif
1278 #if !defined (umul_ppmm)
1279 #define umul_ppmm(w1, w0, u, v) \
1280   __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1"				\
1281 	   : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1282 #endif
1283 #define UMUL_TIME 10
1284 #define UDIV_TIME 100
1285 #endif /* __mips */
1286 
1287 #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
1288 #if __GMP_GNUC_PREREQ (4,4)
1289 #define umul_ppmm(w1, w0, u, v) \
1290   do {									\
1291     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
1292     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
1293     w1 = __ll >> 64;							\
1294     w0 = __ll;								\
1295   } while (0)
1296 #endif
1297 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
1298 #define umul_ppmm(w1, w0, u, v) \
1299   __asm__ ("dmultu %2,%3"						\
1300 	   : "=l" (w0), "=h" (w1)					\
1301 	   : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
1302 #endif
1303 #if !defined (umul_ppmm)
1304 #define umul_ppmm(w1, w0, u, v) \
1305   __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1"				\
1306 	   : "=d" (w0), "=d" (w1)					\
1307 	   : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
1308 #endif
1309 #define UMUL_TIME 20
1310 #define UDIV_TIME 140
1311 #endif /* __mips */
1312 
1313 #if defined (__mmix__) && W_TYPE_SIZE == 64
1314 #define umul_ppmm(w1, w0, u, v) \
1315   __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
1316 #endif
1317 
1318 #if defined (__ns32000__) && W_TYPE_SIZE == 32
1319 #define umul_ppmm(w1, w0, u, v) \
1320   ({union {UDItype __ll;						\
1321 	   struct {USItype __l, __h;} __i;				\
1322 	  } __x;							\
1323   __asm__ ("meid %2,%0"							\
1324 	   : "=g" (__x.__ll)						\
1325 	   : "%0" ((USItype)(u)), "g" ((USItype)(v)));			\
1326   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1327 #define __umulsidi3(u, v) \
1328   ({UDItype __w;							\
1329     __asm__ ("meid %2,%0"						\
1330 	     : "=g" (__w)						\
1331 	     : "%0" ((USItype)(u)), "g" ((USItype)(v)));		\
1332     __w; })
1333 #define udiv_qrnnd(q, r, n1, n0, d) \
1334   ({union {UDItype __ll;						\
1335 	   struct {USItype __l, __h;} __i;				\
1336 	  } __x;							\
1337   __x.__i.__h = (n1); __x.__i.__l = (n0);				\
1338   __asm__ ("deid %2,%0"							\
1339 	   : "=g" (__x.__ll)						\
1340 	   : "0" (__x.__ll), "g" ((USItype)(d)));			\
1341   (r) = __x.__i.__l; (q) = __x.__i.__h; })
1342 #define count_trailing_zeros(count,x) \
1343   do {									\
1344     __asm__ ("ffsd	%2,%0"						\
1345 	     : "=r" (count)						\
1346 	     : "0" ((USItype) 0), "r" ((USItype) (x)));			\
1347   } while (0)
1348 #endif /* __ns32000__ */
1349 
1350 /* In the past we had a block of various #defines tested
1351        _ARCH_PPC    - AIX
1352        _ARCH_PWR    - AIX
1353        __powerpc__  - gcc
1354        __POWERPC__  - BEOS
1355        __ppc__      - Darwin
1356        PPC          - old gcc, GNU/Linux, SysV
1357    The plain PPC test was not good for vxWorks, since PPC is defined on all
1358    CPUs there (eg. m68k too), as a constant that one is expected to compare
1359    CPU_FAMILY against.
1360 
1361    At any rate, this was pretty unattractive and a bit fragile.  The use of
1362    HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
1363    getting the desired effect.
1364 
1365    ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
1366    the system vendor compilers.  (Is that vendor compilers with inline asm,
1367    or what?)  */
1368 
1369 #if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc)	\
1370   && W_TYPE_SIZE == 32
1371 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1372   do {									\
1373     if (__builtin_constant_p (bh) && (bh) == 0)				\
1374       __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"			\
1375 	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));	\
1376     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
1377       __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"			\
1378 	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));	\
1379     else								\
1380       __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"			\
1381 	     : "=r" (sh), "=&r" (sl)					\
1382 	     : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));		\
1383   } while (0)
1384 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1385   do {									\
1386     if (__builtin_constant_p (ah) && (ah) == 0)				\
1387       __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"			\
1388 	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1389     else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)		\
1390       __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"			\
1391 	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1392     else if (__builtin_constant_p (bh) && (bh) == 0)			\
1393       __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"			\
1394 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1395     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
1396       __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"			\
1397 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1398     else								\
1399       __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"			\
1400 	       : "=r" (sh), "=&r" (sl)					\
1401 	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl));		\
1402   } while (0)
1403 #define count_leading_zeros(count, x) \
1404   __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x))
1405 #define COUNT_LEADING_ZEROS_0 32
1406 #if HAVE_HOST_CPU_FAMILY_powerpc
1407 #if __GMP_GNUC_PREREQ (4,4)
1408 #define umul_ppmm(w1, w0, u, v) \
1409   do {									\
1410     UDItype __ll = (UDItype)(u) * (v);					\
1411     w1 = __ll >> 32;							\
1412     w0 = __ll;								\
1413   } while (0)
1414 #endif
1415 #if !defined (umul_ppmm)
1416 #define umul_ppmm(ph, pl, m0, m1) \
1417   do {									\
1418     USItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
1420     (pl) = __m0 * __m1;							\
1421   } while (0)
1422 #endif
1423 #define UMUL_TIME 15
1424 #define smul_ppmm(ph, pl, m0, m1) \
1425   do {									\
1426     SItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
1428     (pl) = __m0 * __m1;							\
1429   } while (0)
1430 #define SMUL_TIME 14
1431 #define UDIV_TIME 120
1432 #else
1433 #define UMUL_TIME 8
1434 #define smul_ppmm(xh, xl, m0, m1) \
1435   __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
1436 #define SMUL_TIME 4
1437 #define sdiv_qrnnd(q, r, nh, nl, d) \
1438   __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
1439 #define UDIV_TIME 100
1440 #endif
1441 #endif /* 32-bit POWER architecture variants.  */
1442 
1443 /* We should test _IBMR2 here when we add assembly support for the system
1444    vendor compilers.  */
1445 #if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
1446 #if !defined (_LONG_LONG_LIMB)
1447 /* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values.  So
1448    use adde etc only when not _LONG_LONG_LIMB.  */
1449 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1450   do {									\
1451     if (__builtin_constant_p (bh) && (bh) == 0)				\
1452       __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"			\
1453 	       : "=r" (sh), "=&r" (sl)					\
1454 	       : "r"  ((UDItype)(ah)),					\
1455 		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)));		\
1456     else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)		\
1457       __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"			\
1458 	       : "=r" (sh), "=&r" (sl)					\
1459 	       : "r"  ((UDItype)(ah)),					\
1460 		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)));		\
1461     else								\
1462       __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"			\
1463 	       : "=r" (sh), "=&r" (sl)					\
1464 	       : "r"  ((UDItype)(ah)), "r"  ((UDItype)(bh)),		\
1465 		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)));		\
1466   } while (0)
1467 /* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
1468    This might seem strange, but gcc folds away the dead code late.  */
1469 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1470   do {									\
1471     if (__builtin_constant_p (bl) && bl > -0x8000 && bl <= 0x8000) {	\
1472 	if (__builtin_constant_p (ah) && (ah) == 0)			\
1473 	  __asm__ ("addic %1,%3,%4\n\tsubfze %0,%2"			\
1474 		   : "=r" (sh), "=&r" (sl)				\
1475 		   :                       "r" ((UDItype)(bh)),		\
1476 		     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))));	\
1477 	else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)	\
1478 	  __asm__ ("addic %1,%3,%4\n\tsubfme %0,%2"			\
1479 		   : "=r" (sh), "=&r" (sl)				\
1480 		   :                       "r" ((UDItype)(bh)),		\
1481 		     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))));	\
1482 	else if (__builtin_constant_p (bh) && (bh) == 0)		\
1483 	  __asm__ ("addic %1,%3,%4\n\taddme %0,%2"			\
1484 		   : "=r" (sh), "=&r" (sl)				\
1485 		   : "r"  ((UDItype)(ah)),				\
1486 		     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))));	\
1487 	else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)	\
1488 	  __asm__ ("addic %1,%3,%4\n\taddze %0,%2"			\
1489 		   : "=r" (sh), "=&r" (sl)				\
1490 		   : "r"  ((UDItype)(ah)),				\
1491 		     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))));	\
1492 	else								\
1493 	  __asm__ ("addic %1,%4,%5\n\tsubfe %0,%3,%2"			\
1494 		   : "=r" (sh), "=&r" (sl)				\
1495 		   : "r"  ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
1496 		     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))));	\
1497     } else {								\
1498 	if (__builtin_constant_p (ah) && (ah) == 0)			\
1499 	  __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"			\
1500 		   : "=r" (sh), "=&r" (sl)				\
1501 		   :                       "r" ((UDItype)(bh)),		\
1502 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl)));	\
1503 	else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)	\
1504 	  __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"			\
1505 		   : "=r" (sh), "=&r" (sl)				\
1506 		   :                       "r" ((UDItype)(bh)),		\
1507 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl)));	\
1508 	else if (__builtin_constant_p (bh) && (bh) == 0)		\
1509 	  __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"			\
1510 		   : "=r" (sh), "=&r" (sl)				\
1511 		   : "r"  ((UDItype)(ah)),				\
1512 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl)));	\
1513 	else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)	\
1514 	  __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"			\
1515 		   : "=r" (sh), "=&r" (sl)				\
1516 		   : "r"  ((UDItype)(ah)),				\
1517 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl)));	\
1518 	else								\
1519 	  __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"		\
1520 		   : "=r" (sh), "=&r" (sl)				\
1521 		   : "r"  ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
1522 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl)));	\
1523     }									\
1524   } while (0)
1525 #endif /* ! _LONG_LONG_LIMB */
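/* The __builtin_constant_p tests above let gcc pick a cheaper instruction
   sequence when the high word of an operand is a known 0 or ~0.  Every
   branch is still parsed and compiled; only after constant propagation are
   the untaken ones discarded, which is why even "dead" asm operands need
   satisfiable constraints such as "*rI".  A minimal sketch of the pattern
   (hypothetical helper, illustration only):  */
#if 0
static inline UWtype
example_select_on_constant (UWtype a, UWtype b)
{
  /* After inlining, gcc evaluates __builtin_constant_p (b) for literal
     arguments and keeps only the branch that was selected.  */
  if (__builtin_constant_p (b) && b == 0)
    return a;			/* specialised path for a known zero */
  else
    return a + b;		/* general path */
}
#endif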
1526 #define count_leading_zeros(count, x) \
1527   __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
1528 #define COUNT_LEADING_ZEROS_0 64
1529 #if 0 && __GMP_GNUC_PREREQ (4,4) /* Disable, this results in libcalls! */
1530 #define umul_ppmm(w1, w0, u, v) \
1531   do {									\
1532     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
1533     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
1534     w1 = __ll >> 64;							\
1535     w0 = __ll;								\
1536   } while (0)
1537 #endif
1538 #if !defined (umul_ppmm)
1539 #define umul_ppmm(ph, pl, m0, m1) \
1540   do {									\
1541     UDItype __m0 = (m0), __m1 = (m1);					\
1542     __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
1543     (pl) = __m0 * __m1;							\
1544   } while (0)
1545 #endif
1546 #define UMUL_TIME 15
1547 #define smul_ppmm(ph, pl, m0, m1) \
1548   do {									\
1549     DItype __m0 = (m0), __m1 = (m1);					\
1550     __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
1551     (pl) = __m0 * __m1;							\
1552   } while (0)
1553 #define SMUL_TIME 14  /* ??? */
1554 #define UDIV_TIME 120 /* ??? */
1555 #endif /* 64-bit PowerPC.  */
1556 
1557 #if defined (__pyr__) && W_TYPE_SIZE == 32
1558 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1559   __asm__ ("addw %5,%1\n\taddwc %3,%0"					\
1560 	   : "=r" (sh), "=&r" (sl)					\
1561 	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
1562 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1563 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1564   __asm__ ("subw %5,%1\n\tsubwb %3,%0"					\
1565 	   : "=r" (sh), "=&r" (sl)					\
1566 	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
1567 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
1568 /* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP.  */
1569 #define umul_ppmm(w1, w0, u, v) \
1570   ({union {UDItype __ll;						\
1571 	   struct {USItype __h, __l;} __i;				\
1572 	  } __x;							\
1573   __asm__ ("movw %1,%R0\n\tuemul %2,%0"					\
1574 	   : "=&r" (__x.__ll)						\
1575 	   : "g" ((USItype) (u)), "g" ((USItype)(v)));			\
1576   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1577 #endif /* __pyr__ */
1578 
1579 #if defined (__ibm032__) /* RT/ROMP */  && W_TYPE_SIZE == 32
1580 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1581   __asm__ ("a %1,%5\n\tae %0,%3"					\
1582 	   : "=r" (sh), "=&r" (sl)					\
1583 	   : "0"  ((USItype)(ah)), "r" ((USItype)(bh)),			\
1584 	     "%1" ((USItype)(al)), "r" ((USItype)(bl)))
1585 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1586   __asm__ ("s %1,%5\n\tse %0,%3"					\
1587 	   : "=r" (sh), "=&r" (sl)					\
1588 	   : "0" ((USItype)(ah)), "r" ((USItype)(bh)),			\
1589 	     "1" ((USItype)(al)), "r" ((USItype)(bl)))
1590 #define smul_ppmm(ph, pl, m0, m1) \
1591   __asm__ (								\
1592        "s	r2,r2\n"						\
1593 "	mts r10,%2\n"							\
1594 "	m	r2,%3\n"						\
1595 "	m	r2,%3\n"						\
1596 "	m	r2,%3\n"						\
1597 "	m	r2,%3\n"						\
1598 "	m	r2,%3\n"						\
1599 "	m	r2,%3\n"						\
1600 "	m	r2,%3\n"						\
1601 "	m	r2,%3\n"						\
1602 "	m	r2,%3\n"						\
1603 "	m	r2,%3\n"						\
1604 "	m	r2,%3\n"						\
1605 "	m	r2,%3\n"						\
1606 "	m	r2,%3\n"						\
1607 "	m	r2,%3\n"						\
1608 "	m	r2,%3\n"						\
1609 "	m	r2,%3\n"						\
1610 "	cas	%0,r2,r0\n"						\
1611 "	mfs	r10,%1"							\
1612 	   : "=r" (ph), "=r" (pl)					\
1613 	   : "%r" ((USItype)(m0)), "r" ((USItype)(m1))			\
1614 	   : "r2")
1615 #define UMUL_TIME 20
1616 #define UDIV_TIME 200
1617 #define count_leading_zeros(count, x) \
1618   do {									\
1619     if ((x) >= 0x10000)							\
1620       __asm__ ("clz	%0,%1"						\
1621 	       : "=r" (count) : "r" ((USItype)(x) >> 16));		\
1622     else								\
1623       {									\
1624 	__asm__ ("clz	%0,%1"						\
1625 		 : "=r" (count) : "r" ((USItype)(x)));			\
1626 	(count) += 16;							\
1627       }									\
1628   } while (0)
1629 #endif /* RT/ROMP */
1630 
1631 #if (defined (__SH2__) || defined (__SH3__) || defined (__SH4__)) && W_TYPE_SIZE == 32
1632 #define umul_ppmm(w1, w0, u, v) \
1633   __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0"		\
1634 	   : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
1635 #define UMUL_TIME 5
1636 #endif
1637 
1638 #if defined (__sparc__) && W_TYPE_SIZE == 32
1639 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1640   __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0"				\
1641 	   : "=r" (sh), "=&r" (sl)					\
1642 	   : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl)			\
1643 	   __CLOBBER_CC)
1644 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1645   __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0"				\
1646 	   : "=r" (sh), "=&r" (sl)					\
1647 	   : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl)	\
1648 	   __CLOBBER_CC)
/* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
   doesn't define anything to indicate that to us; it only sets __sparcv8. */
1651 #if defined (__sparc_v9__) || defined (__sparcv9)
1652 /* Perhaps we should use floating-point operations here?  */
1653 #if 0
1654 /* Triggers a bug making mpz/tests/t-gcd.c fail.
   Perhaps we simply need to explicitly zero-extend the inputs?  */
1656 #define umul_ppmm(w1, w0, u, v) \
1657   __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" :		\
1658 	   "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
1659 #else
1660 /* Use v8 umul until above bug is fixed.  */
1661 #define umul_ppmm(w1, w0, u, v) \
1662   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1663 #endif
1664 /* Use a plain v8 divide for v9.  */
1665 #define udiv_qrnnd(q, r, n1, n0, d) \
1666   do {									\
1667     USItype __q;							\
1668     __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"			\
1669 	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));		\
1670     (r) = (n0) - __q * (d);						\
1671     (q) = __q;								\
1672   } while (0)
1673 #else
1674 #if defined (__sparc_v8__)   /* gcc normal */				\
1675   || defined (__sparcv8)     /* gcc solaris */				\
1676   || HAVE_HOST_CPU_supersparc
/* Don't match the immediate range because (1) it is not often useful, and
   (2) the 'I' flag thinks of the range as a 13-bit signed interval, while
   we want to match a 13-bit interval, sign extended to 32 bits, but
   INTERPRETED AS UNSIGNED.  */
1681 #define umul_ppmm(w1, w0, u, v) \
1682   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1683 #define UMUL_TIME 5
1684 
1685 #if HAVE_HOST_CPU_supersparc
1686 #define UDIV_TIME 60		/* SuperSPARC timing */
1687 #else
1688 /* Don't use this on SuperSPARC because its udiv only handles 53 bit
1689    dividends and will trap to the kernel for the rest. */
1690 #define udiv_qrnnd(q, r, n1, n0, d) \
1691   do {									\
1692     USItype __q;							\
1693     __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"			\
1694 	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));		\
1695     (r) = (n0) - __q * (d);						\
1696     (q) = __q;								\
1697   } while (0)
1698 #define UDIV_TIME 25
1699 #endif /* HAVE_HOST_CPU_supersparc */
1700 
1701 #else /* ! __sparc_v8__ */
1702 #if defined (__sparclite__)
1703 /* This has hardware multiply but not divide.  It also has two additional
1704    instructions scan (ffs from high bit) and divscc.  */
1705 #define umul_ppmm(w1, w0, u, v) \
1706   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1707 #define UMUL_TIME 5
1708 #define udiv_qrnnd(q, r, n1, n0, d) \
1709   __asm__ ("! Inlined udiv_qrnnd\n"					\
1710 "	wr	%%g0,%2,%%y	! Not a delayed write for sparclite\n"	\
1711 "	tst	%%g0\n"							\
1712 "	divscc	%3,%4,%%g1\n"						\
1713 "	divscc	%%g1,%4,%%g1\n"						\
1714 "	divscc	%%g1,%4,%%g1\n"						\
1715 "	divscc	%%g1,%4,%%g1\n"						\
1716 "	divscc	%%g1,%4,%%g1\n"						\
1717 "	divscc	%%g1,%4,%%g1\n"						\
1718 "	divscc	%%g1,%4,%%g1\n"						\
1719 "	divscc	%%g1,%4,%%g1\n"						\
1720 "	divscc	%%g1,%4,%%g1\n"						\
1721 "	divscc	%%g1,%4,%%g1\n"						\
1722 "	divscc	%%g1,%4,%%g1\n"						\
1723 "	divscc	%%g1,%4,%%g1\n"						\
1724 "	divscc	%%g1,%4,%%g1\n"						\
1725 "	divscc	%%g1,%4,%%g1\n"						\
1726 "	divscc	%%g1,%4,%%g1\n"						\
1727 "	divscc	%%g1,%4,%%g1\n"						\
1728 "	divscc	%%g1,%4,%%g1\n"						\
1729 "	divscc	%%g1,%4,%%g1\n"						\
1730 "	divscc	%%g1,%4,%%g1\n"						\
1731 "	divscc	%%g1,%4,%%g1\n"						\
1732 "	divscc	%%g1,%4,%%g1\n"						\
1733 "	divscc	%%g1,%4,%%g1\n"						\
1734 "	divscc	%%g1,%4,%%g1\n"						\
1735 "	divscc	%%g1,%4,%%g1\n"						\
1736 "	divscc	%%g1,%4,%%g1\n"						\
1737 "	divscc	%%g1,%4,%%g1\n"						\
1738 "	divscc	%%g1,%4,%%g1\n"						\
1739 "	divscc	%%g1,%4,%%g1\n"						\
1740 "	divscc	%%g1,%4,%%g1\n"						\
1741 "	divscc	%%g1,%4,%%g1\n"						\
1742 "	divscc	%%g1,%4,%%g1\n"						\
1743 "	divscc	%%g1,%4,%0\n"						\
1744 "	rd	%%y,%1\n"						\
1745 "	bl,a 1f\n"							\
1746 "	add	%1,%4,%1\n"						\
1747 "1:	! End of inline udiv_qrnnd"					\
1748 	   : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d)		\
1749 	   : "%g1" __AND_CLOBBER_CC)
1750 #define UDIV_TIME 37
1751 #define count_leading_zeros(count, x) \
1752   __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
1753 /* Early sparclites return 63 for an argument of 0, but they warn that future
1754    implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
1755    undefined.  */
1756 #endif /* __sparclite__ */
1757 #endif /* __sparc_v8__ */
1758 #endif /* __sparc_v9__ */
1759 /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd.  */
1760 #ifndef umul_ppmm
1761 #define umul_ppmm(w1, w0, u, v) \
1762   __asm__ ("! Inlined umul_ppmm\n"					\
1763 "	wr	%%g0,%2,%%y	! SPARC has 0-3 delay insn after a wr\n" \
1764 "	sra	%3,31,%%g2	! Don't move this insn\n"		\
1765 "	and	%2,%%g2,%%g2	! Don't move this insn\n"		\
1766 "	andcc	%%g0,0,%%g1	! Don't move this insn\n"		\
1767 "	mulscc	%%g1,%3,%%g1\n"						\
1768 "	mulscc	%%g1,%3,%%g1\n"						\
1769 "	mulscc	%%g1,%3,%%g1\n"						\
1770 "	mulscc	%%g1,%3,%%g1\n"						\
1771 "	mulscc	%%g1,%3,%%g1\n"						\
1772 "	mulscc	%%g1,%3,%%g1\n"						\
1773 "	mulscc	%%g1,%3,%%g1\n"						\
1774 "	mulscc	%%g1,%3,%%g1\n"						\
1775 "	mulscc	%%g1,%3,%%g1\n"						\
1776 "	mulscc	%%g1,%3,%%g1\n"						\
1777 "	mulscc	%%g1,%3,%%g1\n"						\
1778 "	mulscc	%%g1,%3,%%g1\n"						\
1779 "	mulscc	%%g1,%3,%%g1\n"						\
1780 "	mulscc	%%g1,%3,%%g1\n"						\
1781 "	mulscc	%%g1,%3,%%g1\n"						\
1782 "	mulscc	%%g1,%3,%%g1\n"						\
1783 "	mulscc	%%g1,%3,%%g1\n"						\
1784 "	mulscc	%%g1,%3,%%g1\n"						\
1785 "	mulscc	%%g1,%3,%%g1\n"						\
1786 "	mulscc	%%g1,%3,%%g1\n"						\
1787 "	mulscc	%%g1,%3,%%g1\n"						\
1788 "	mulscc	%%g1,%3,%%g1\n"						\
1789 "	mulscc	%%g1,%3,%%g1\n"						\
1790 "	mulscc	%%g1,%3,%%g1\n"						\
1791 "	mulscc	%%g1,%3,%%g1\n"						\
1792 "	mulscc	%%g1,%3,%%g1\n"						\
1793 "	mulscc	%%g1,%3,%%g1\n"						\
1794 "	mulscc	%%g1,%3,%%g1\n"						\
1795 "	mulscc	%%g1,%3,%%g1\n"						\
1796 "	mulscc	%%g1,%3,%%g1\n"						\
1797 "	mulscc	%%g1,%3,%%g1\n"						\
1798 "	mulscc	%%g1,%3,%%g1\n"						\
1799 "	mulscc	%%g1,0,%%g1\n"						\
1800 "	add	%%g1,%%g2,%0\n"						\
1801 "	rd	%%y,%1"							\
1802 	   : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v)			\
1803 	   : "%g1", "%g2" __AND_CLOBBER_CC)
1804 #define UMUL_TIME 39		/* 39 instructions */
1805 #endif
1806 #ifndef udiv_qrnnd
1807 #ifndef LONGLONG_STANDALONE
1808 #define udiv_qrnnd(q, r, n1, n0, d) \
1809   do { UWtype __r;							\
1810     (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));			\
1811     (r) = __r;								\
1812   } while (0)
1813 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
1814 #ifndef UDIV_TIME
1815 #define UDIV_TIME 140
1816 #endif
1817 #endif /* LONGLONG_STANDALONE */
1818 #endif /* udiv_qrnnd */
1819 #endif /* __sparc__ */
1820 
1821 #if defined (__sparc__) && W_TYPE_SIZE == 64
1822 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1823   __asm__ (								\
1824        "addcc	%r4,%5,%1\n"						\
1825       "	addccc	%r6,%7,%%g0\n"						\
1826       "	addc	%r2,%3,%0"						\
1827        : "=r" (sh), "=&r" (sl)						\
1828        : "rJ"  ((UDItype)(ah)), "rI" ((UDItype)(bh)),			\
1829 	 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)),			\
1830 	 "%rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32)	\
1831 	   __CLOBBER_CC)
1832 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1833   __asm__ (								\
1834        "subcc	%r4,%5,%1\n"						\
1835       "	subccc	%r6,%7,%%g0\n"						\
1836       "	subc	%r2,%3,%0"						\
1837        : "=r" (sh), "=&r" (sl)						\
1838        : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)),			\
1839 	 "rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)),			\
1840 	 "rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32)		\
1841 	   __CLOBBER_CC)
1842 #if __VIS__ >= 0x300
1843 #undef add_ssaaaa
1844 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1845   __asm__ (								\
1846        "addcc	%r4, %5, %1\n"						\
1847       "	addxc	%r2, %r3, %0"						\
1848 	  : "=r" (sh), "=&r" (sl)					\
1849        : "rJ"  ((UDItype)(ah)), "rJ" ((UDItype)(bh)),			\
1850 	 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC)
1851 #define umul_ppmm(ph, pl, m0, m1) \
1852   do {									\
1853     UDItype __m0 = (m0), __m1 = (m1);					\
1854     (pl) = __m0 * __m1;							\
1855     __asm__ ("umulxhi\t%2, %1, %0"					\
1856 	     : "=r" (ph)						\
1857 	     : "%r" (__m0), "r" (__m1));				\
1858   } while (0)
1859 #define count_leading_zeros(count, x) \
1860   __asm__ ("lzd\t%1,%0" : "=r" (count) : "r" (x))
1861 /* Needed by count_leading_zeros_32 in sparc64.h.  */
1862 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
1863 #endif
1864 #endif
1865 
1866 #if (defined (__vax) || defined (__vax__)) && W_TYPE_SIZE == 32
1867 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1868   __asm__ ("addl2 %5,%1\n\tadwc %3,%0"					\
1869 	   : "=g" (sh), "=&g" (sl)					\
1870 	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
1871 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1872 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1873   __asm__ ("subl2 %5,%1\n\tsbwc %3,%0"					\
1874 	   : "=g" (sh), "=&g" (sl)					\
1875 	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
1876 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
1877 #define smul_ppmm(xh, xl, m0, m1) \
1878   do {									\
1879     union {UDItype __ll;						\
1880 	   struct {USItype __l, __h;} __i;				\
1881 	  } __x;							\
1882     USItype __m0 = (m0), __m1 = (m1);					\
1883     __asm__ ("emul %1,%2,$0,%0"						\
1884 	     : "=g" (__x.__ll) : "g" (__m0), "g" (__m1));		\
1885     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
1886   } while (0)
1887 #define sdiv_qrnnd(q, r, n1, n0, d) \
1888   do {									\
1889     union {DItype __ll;							\
1890 	   struct {SItype __l, __h;} __i;				\
1891 	  } __x;							\
1892     __x.__i.__h = n1; __x.__i.__l = n0;					\
1893     __asm__ ("ediv %3,%2,%0,%1"						\
1894 	     : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d));		\
1895   } while (0)
1896 #if 0
1897 /* FIXME: This instruction appears to be unimplemented on some systems (vax
1898    8800 maybe). */
1899 #define count_trailing_zeros(count,x)					\
1900   do {									\
1901     __asm__ ("ffs 0, 31, %1, %0"					\
1902 	     : "=g" (count)						\
1903 	     : "g" ((USItype) (x)));					\
1904   } while (0)
1905 #endif
1906 #endif /* vax */
1907 
1908 #if defined (__z8000__) && W_TYPE_SIZE == 16
1909 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1910   __asm__ ("add	%H1,%H5\n\tadc	%H0,%H3"				\
1911 	   : "=r" (sh), "=&r" (sl)					\
1912 	   : "0"  ((unsigned int)(ah)), "r" ((unsigned int)(bh)),	\
1913 	     "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1914 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1915   __asm__ ("sub	%H1,%H5\n\tsbc	%H0,%H3"				\
1916 	   : "=r" (sh), "=&r" (sl)					\
1917 	   : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)),	\
1918 	     "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1919 #define umul_ppmm(xh, xl, m0, m1) \
1920   do {									\
1921     union {long int __ll;						\
1922 	   struct {unsigned int __h, __l;} __i;				\
1923 	  } __x;							\
1924     unsigned int __m0 = (m0), __m1 = (m1);				\
1925     __asm__ ("mult	%S0,%H3"					\
1926 	     : "=r" (__x.__i.__h), "=r" (__x.__i.__l)			\
	     : "%1" (__m0), "rQR" (__m1));				\
1928     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
1929     (xh) += ((((signed int) __m0 >> 15) & __m1)				\
1930 	     + (((signed int) __m1 >> 15) & __m0));			\
1931   } while (0)
1932 #endif /* __z8000__ */
1933 
1934 #endif /* __GNUC__ */
1935 
1936 #endif /* NO_ASM */
1937 
1938 
1939 /* FIXME: "sidi" here is highly doubtful, should sometimes be "diti".  */
1940 #if !defined (umul_ppmm) && defined (__umulsidi3)
1941 #define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDWtype __ll = __umulsidi3 (m0, m1);				\
    (ph) = (UWtype) (__ll >> W_TYPE_SIZE);				\
    (pl) = (UWtype) __ll;						\
  } while (0)
1947 #endif
1948 
1949 #if !defined (__umulsidi3)
1950 #define __umulsidi3(u, v) \
1951   ({UWtype __hi, __lo;							\
1952     umul_ppmm (__hi, __lo, u, v);					\
1953     ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
1954 #endif
1955 
1956 
1957 #if defined (__cplusplus)
1958 #define __longlong_h_C "C"
1959 #else
1960 #define __longlong_h_C
1961 #endif
1962 
1963 /* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist.  The "_r"
1964    forms have "reversed" arguments, meaning the pointer is last, which
1965    sometimes allows better parameter passing, in particular on 64-bit
1966    hppa. */
1967 
1968 #define mpn_umul_ppmm  __MPN(umul_ppmm)
1969 extern __longlong_h_C UWtype mpn_umul_ppmm (UWtype *, UWtype, UWtype);
1970 
1971 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm  \
1972   && ! defined (LONGLONG_STANDALONE)
1973 #define umul_ppmm(wh, wl, u, v)						\
1974   do {									\
1975     UWtype __umul_ppmm__p0;						\
1976     (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v));\
1977     (wl) = __umul_ppmm__p0;						\
1978   } while (0)
1979 #endif
1980 
1981 #define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
1982 extern __longlong_h_C UWtype mpn_umul_ppmm_r (UWtype, UWtype, UWtype *);
1983 
1984 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r	\
1985   && ! defined (LONGLONG_STANDALONE)
1986 #define umul_ppmm(wh, wl, u, v)						\
1987   do {									\
1988     UWtype __umul_p0;							\
1989     (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_p0);	\
1990     (wl) = __umul_p0;							\
1991   } while (0)
1992 #endif
1993 
1994 #define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
1995 extern __longlong_h_C UWtype mpn_udiv_qrnnd (UWtype *, UWtype, UWtype, UWtype);
1996 
1997 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd	\
1998   && ! defined (LONGLONG_STANDALONE)
1999 #define udiv_qrnnd(q, r, n1, n0, d)					\
2000   do {									\
2001     UWtype __udiv_qrnnd_r;						\
2002     (q) = mpn_udiv_qrnnd (&__udiv_qrnnd_r,				\
			  (UWtype) (n1), (UWtype) (n0), (UWtype) (d));	\
2004     (r) = __udiv_qrnnd_r;						\
2005   } while (0)
2006 #endif
2007 
2008 #define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
2009 extern __longlong_h_C UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, UWtype *);
2010 
2011 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r	\
2012   && ! defined (LONGLONG_STANDALONE)
2013 #define udiv_qrnnd(q, r, n1, n0, d)					\
2014   do {									\
2015     UWtype __udiv_qrnnd_r;						\
    (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) (d),	\
2017 			    &__udiv_qrnnd_r);				\
2018     (r) = __udiv_qrnnd_r;						\
2019   } while (0)
2020 #endif
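
/* Whichever definition ends up selected, user code sees the same interface:
   umul_ppmm produces the full two-word product of two UWtype operands, and
   udiv_qrnnd divides a two-word numerator whose high word is less than the
   divisor.  A minimal usage sketch (illustration only, assuming the includer
   has set up UWtype and the rest of this header has been processed):  */
#if 0
static void
example_umul_udiv (UWtype u, UWtype v)
{
  UWtype p1, p0, q, r;

  umul_ppmm (p1, p0, u, v);	/* p1:p0 = u * v, all 2*W_TYPE_SIZE bits */

  /* p1 = floor (u*v / 2^W_TYPE_SIZE) < v whenever v != 0, so p1:p0 is a
     valid numerator for udiv_qrnnd.  If UDIV_NEEDS_NORMALIZATION is
     nonzero, the divisor must also have its most significant bit set; see
     __udiv_qrnnd_c below.  */
  if (v >> (W_TYPE_SIZE - 1))
    {
      udiv_qrnnd (q, r, p1, p0, v);	/* q == u, r == 0 */
      (void) q; (void) r;
    }
}
#endif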
2021 
2022 
2023 /* If this machine has no inline assembler, use C macros.  */
2024 
2025 #if !defined (add_ssaaaa)
2026 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
2027   do {									\
2028     UWtype __x;								\
2029     __x = (al) + (bl);							\
2030     (sh) = (ah) + (bh) + (__x < (al));					\
2031     (sl) = __x;								\
2032   } while (0)
2033 #endif
2034 
2035 #if !defined (sub_ddmmss)
2036 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
2037   do {									\
2038     UWtype __x;								\
2039     __x = (al) - (bl);							\
2040     (sh) = (ah) - (bh) - ((al) < (bl));					\
2041     (sl) = __x;								\
2042   } while (0)
2043 #endif
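
/* Both generic macros compute the carry or borrow in plain C: the low words
   are combined first and the one-bit carry-out (or borrow) detected from
   them is folded into the high words.  Two concrete cases (illustration
   only):  */
#if 0
static void
example_carry_borrow (void)
{
  UWtype sh, sl;

  /* (0:~0) + (0:1) = (1:0); the low-word wraparound carries into sh.  */
  add_ssaaaa (sh, sl, (UWtype) 0, ~(UWtype) 0, (UWtype) 0, (UWtype) 1);
  /* now sh == 1 and sl == 0 */

  /* (1:0) - (0:1) = (0:~0); the low-word borrow decrements sh.  */
  sub_ddmmss (sh, sl, (UWtype) 1, (UWtype) 0, (UWtype) 0, (UWtype) 1);
  /* now sh == 0 and sl == ~(UWtype) 0 */
}
#endif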
2044 
2045 /* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
2046    smul_ppmm.  */
2047 #if !defined (umul_ppmm) && defined (smul_ppmm)
2048 #define umul_ppmm(w1, w0, u, v)						\
2049   do {									\
2050     UWtype __w1;							\
2051     UWtype __xm0 = (u), __xm1 = (v);					\
2052     smul_ppmm (__w1, w0, __xm0, __xm1);					\
2053     (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
2054 		+ (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);		\
2055   } while (0)
2056 #endif
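
/* The two masked terms convert a signed high word into the unsigned one:
   reading a W_TYPE_SIZE-bit operand as unsigned rather than signed adds
   2^W_TYPE_SIZE exactly when its sign bit is set, and that excess
   contributes the other operand to the product's high word.
   -(x >> (W_TYPE_SIZE-1)) is an all-ones mask when x has its sign bit set,
   so each term conditionally adds the partner operand.  A branchy
   restatement (illustration only); the smul_ppmm fallback further below
   applies the same identity with subtractions instead:  */
#if 0
static UWtype
example_high_unsigned_from_signed (UWtype u, UWtype v, UWtype signed_high)
{
  UWtype high = signed_high;
  if ((u >> (W_TYPE_SIZE - 1)) != 0)	/* sign bit of u is set */
    high += v;
  if ((v >> (W_TYPE_SIZE - 1)) != 0)	/* sign bit of v is set */
    high += u;
  return high;		/* high word of the unsigned product u * v */
}
#endif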
2057 
2058 /* If we still don't have umul_ppmm, define it using plain C.
2059 
2060    For reference, when this code is used for squaring (ie. u and v identical
2061    expressions), gcc recognises __x1 and __x2 are the same and generates 3
2062    multiplies, not 4.  The subsequent additions could be optimized a bit,
2063    but the only place GMP currently uses such a square is mpn_sqr_basecase,
2064    and chips obliged to use this generic C umul will have plenty of worse
2065    performance problems than a couple of extra instructions on the diagonal
2066    of sqr_basecase.  */
2067 
2068 #if !defined (umul_ppmm)
2069 #define umul_ppmm(w1, w0, u, v)						\
2070   do {									\
2071     UWtype __x0, __x1, __x2, __x3;					\
2072     UHWtype __ul, __vl, __uh, __vh;					\
2073     UWtype __u = (u), __v = (v);					\
2074 									\
2075     __ul = __ll_lowpart (__u);						\
2076     __uh = __ll_highpart (__u);						\
2077     __vl = __ll_lowpart (__v);						\
2078     __vh = __ll_highpart (__v);						\
2079 									\
2080     __x0 = (UWtype) __ul * __vl;					\
2081     __x1 = (UWtype) __ul * __vh;					\
2082     __x2 = (UWtype) __uh * __vl;					\
2083     __x3 = (UWtype) __uh * __vh;					\
2084 									\
2085     __x1 += __ll_highpart (__x0);/* this can't give carry */		\
2086     __x1 += __x2;		/* but this indeed can */		\
2087     if (__x1 < __x2)		/* did we get it? */			\
2088       __x3 += __ll_B;		/* yes, add it in the proper pos. */	\
2089 									\
2090     (w1) = __x3 + __ll_highpart (__x1);					\
2091     (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0);		\
2092   } while (0)
2093 #endif
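
/* The generic umul_ppmm is schoolbook multiplication on half-words: with
   B = 2^(W_TYPE_SIZE/2), u = uh*B + ul and v = vh*B + vl, so
   u*v = uh*vh*B^2 + (uh*vl + ul*vh)*B + ul*vl.  __x1 collects the two
   middle products plus the high half of __x0; its only possible carry is
   caught by the unsigned comparison and added to __x3 with weight B.  A
   sketch that checks the macro against a native double-word product,
   meaningful only where UDWtype really is twice the width of UWtype
   (illustration only):  */
#if 0
static int
example_check_umul_ppmm (UWtype u, UWtype v)
{
  UWtype hi, lo;
  UDWtype ref = (UDWtype) u * v;
  umul_ppmm (hi, lo, u, v);
  return hi == (UWtype) (ref >> W_TYPE_SIZE) && lo == (UWtype) ref;
}
#endif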
2094 
/* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
   exist in one form or another).  */
2097 #if !defined (smul_ppmm)
2098 #define smul_ppmm(w1, w0, u, v)						\
2099   do {									\
2100     UWtype __w1;							\
2101     UWtype __xm0 = (u), __xm1 = (v);					\
2102     umul_ppmm (__w1, w0, __xm0, __xm1);					\
2103     (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
2104 		- (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);		\
2105   } while (0)
2106 #endif
2107 
2108 /* Define this unconditionally, so it can be used for debugging.  */
2109 #define __udiv_qrnnd_c(q, r, n1, n0, d) \
2110   do {									\
2111     UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m;			\
2112 									\
2113     ASSERT ((d) != 0);							\
2114     ASSERT ((n1) < (d));						\
2115 									\
2116     __d1 = __ll_highpart (d);						\
2117     __d0 = __ll_lowpart (d);						\
2118 									\
2119     __q1 = (n1) / __d1;							\
2120     __r1 = (n1) - __q1 * __d1;						\
2121     __m = __q1 * __d0;							\
2122     __r1 = __r1 * __ll_B | __ll_highpart (n0);				\
2123     if (__r1 < __m)							\
2124       {									\
2125 	__q1--, __r1 += (d);						\
2126 	if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
2127 	  if (__r1 < __m)						\
2128 	    __q1--, __r1 += (d);					\
2129       }									\
2130     __r1 -= __m;							\
2131 									\
2132     __q0 = __r1 / __d1;							\
2133     __r0 = __r1  - __q0 * __d1;						\
2134     __m = __q0 * __d0;							\
2135     __r0 = __r0 * __ll_B | __ll_lowpart (n0);				\
2136     if (__r0 < __m)							\
2137       {									\
2138 	__q0--, __r0 += (d);						\
2139 	if (__r0 >= (d))						\
2140 	  if (__r0 < __m)						\
2141 	    __q0--, __r0 += (d);					\
2142       }									\
2143     __r0 -= __m;							\
2144 									\
2145     (q) = __q1 * __ll_B | __q0;						\
2146     (r) = __r0;								\
2147   } while (0)
2148 
2149 /* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
2150    __udiv_w_sdiv (defined in libgcc or elsewhere).  */
2151 #if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
2152 #define udiv_qrnnd(q, r, nh, nl, d) \
2153   do {									\
2154     UWtype __r;								\
2155     (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d);				\
2156     (r) = __r;								\
2157   } while (0)
2158 __GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
2159 #endif
2160 
2161 /* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
2162 #if !defined (udiv_qrnnd)
2163 #define UDIV_NEEDS_NORMALIZATION 1
2164 #define udiv_qrnnd __udiv_qrnnd_c
2165 #endif
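
/* __udiv_qrnnd_c works digit by digit on half-words and, like most such
   schoolbook routines, needs a normalized divisor (most significant bit
   set); that requirement is what UDIV_NEEDS_NORMALIZATION advertises to
   callers.  A sketch of the usual normalize/divide/denormalize wrapper,
   assuming n1 < d on entry (illustration only, not part of this header):  */
#if 0
static void
example_udiv_normalizing (UWtype n1, UWtype n0, UWtype d,
			  UWtype *qp, UWtype *rp)
{
  int cnt;
  UWtype q, r;

  count_leading_zeros (cnt, d);
  if (cnt != 0)
    {
      d <<= cnt;				/* normalize the divisor */
      n1 = (n1 << cnt) | (n0 >> (W_TYPE_SIZE - cnt));
      n0 <<= cnt;				/* shift the numerator to match */
    }
  udiv_qrnnd (q, r, n1, n0, d);
  *qp = q;
  *rp = r >> cnt;				/* undo the normalization */
}
#endif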
2166 
2167 #if !defined (count_leading_zeros)
2168 #define count_leading_zeros(count, x) \
2169   do {									\
2170     UWtype __xr = (x);							\
2171     UWtype __a;								\
2172 									\
2173     if (W_TYPE_SIZE == 32)						\
2174       {									\
2175 	__a = __xr < ((UWtype) 1 << 2*__BITS4)				\
2176 	  ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1)		\
2177 	  : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1		\
2178 	  : 3*__BITS4 + 1);						\
2179       }									\
2180     else								\
2181       {									\
2182 	for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8)			\
2183 	  if (((__xr >> __a) & 0xff) != 0)				\
2184 	    break;							\
2185 	++__a;								\
2186       }									\
2187 									\
2188     (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a];		\
2189   } while (0)
2190 /* This version gives a well-defined value for zero. */
2191 #define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
2192 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2193 #define COUNT_LEADING_ZEROS_SLOW
2194 #endif
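
/* The portable count_leading_zeros first narrows the search to the top
   nonzero byte (or quarter-word, on 32-bit types) and then finishes with a
   table lookup; __clz_tab maps that top chunk to its bit length.  The same
   technique with a locally built table, so the sketch does not depend on
   __clz_tab's exact layout (illustration only):  */
#if 0
static int
example_clz (UWtype x)		/* x must be nonzero */
{
  static unsigned char bits_tab[256];	/* bits_tab[i] = bit length of i */
  int a, i;

  if (bits_tab[1] == 0)			/* build lazily; not thread safe */
    for (i = 1; i < 256; i++)
      bits_tab[i] = bits_tab[i / 2] + 1;

  for (a = W_TYPE_SIZE - 8; a > 0; a -= 8)  /* find the top nonzero byte */
    if (((x >> a) & 0xff) != 0)
      break;
  return W_TYPE_SIZE - a - bits_tab[x >> a];
}
#endif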
2195 
2196 /* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
2197 #if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
2198 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2199 #endif
2200 
2201 #ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2202 extern const unsigned char __GMP_DECLSPEC __clz_tab[129];
2203 #endif
2204 
2205 #if !defined (count_trailing_zeros)
2206 #if !defined (COUNT_LEADING_ZEROS_SLOW)
2207 /* Define count_trailing_zeros using an asm count_leading_zeros.  */
2208 #define count_trailing_zeros(count, x)					\
2209   do {									\
2210     UWtype __ctz_x = (x);						\
2211     UWtype __ctz_c;							\
2212     ASSERT (__ctz_x != 0);						\
2213     count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x);			\
2214     (count) = W_TYPE_SIZE - 1 - __ctz_c;				\
2215   } while (0)
2216 #else
2217 /* Define count_trailing_zeros in plain C, assuming small counts are common.
2218    We use clz_tab without ado, since the C count_leading_zeros above will have
2219    pulled it in.  */
2220 #define count_trailing_zeros(count, x)					\
2221   do {									\
2222     UWtype __ctz_x = (x);						\
2223     int __ctz_c;							\
2224 									\
2225     if (LIKELY ((__ctz_x & 0xff) != 0))					\
2226       (count) = __clz_tab[__ctz_x & -__ctz_x] - 2;			\
2227     else								\
2228       {									\
2229 	for (__ctz_c = 8 - 2; __ctz_c < W_TYPE_SIZE - 2; __ctz_c += 8)	\
2230 	  {								\
2231 	    __ctz_x >>= 8;						\
2232 	    if (LIKELY ((__ctz_x & 0xff) != 0))				\
2233 	      break;							\
2234 	  }								\
2235 									\
2236 	(count) = __ctz_c + __clz_tab[__ctz_x & -__ctz_x];		\
2237       }									\
2238   } while (0)
2239 #endif
2240 #endif
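
/* Isolating the lowest set bit with x & -x reduces a trailing-zero count to
   a leading-zero count: if the lowest set bit of x is bit k, then x & -x is
   exactly 2^k, and count_leading_zeros on it yields W_TYPE_SIZE - 1 - k.
   A restatement of the first definition above (illustration only):  */
#if 0
static int
example_ctz (UWtype x)			/* x must be nonzero */
{
  int c;
  count_leading_zeros (c, x & -x);	/* keep only the lowest set bit */
  return W_TYPE_SIZE - 1 - c;
}
#endif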
2241 
2242 #ifndef UDIV_NEEDS_NORMALIZATION
2243 #define UDIV_NEEDS_NORMALIZATION 0
2244 #endif
2245 
/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, in
   which case the latter should always be used.  */
2248 #ifndef UDIV_PREINV_ALWAYS
2249 #define UDIV_PREINV_ALWAYS 0
2250 #endif
2251 
2252 /* Give defaults for UMUL_TIME and UDIV_TIME.  */
2253 #ifndef UMUL_TIME
2254 #define UMUL_TIME 1
2255 #endif
2256 
2257 #ifndef UDIV_TIME
2258 #define UDIV_TIME UMUL_TIME
2259 #endif
2260