1 /* Software floating-point emulation.
2    Basic two-word fraction declaration and manipulation.
3    Copyright (C) 1997,1998,1999,2006,2007 Free Software Foundation, Inc.
4    This file is part of the GNU C Library.
5    Contributed by Richard Henderson (rth@cygnus.com),
6 		  Jakub Jelinek (jj@ultra.linux.cz),
7 		  David S. Miller (davem@redhat.com) and
8 		  Peter Maydell (pmaydell@chiark.greenend.org.uk).
9 
10    The GNU C Library is free software; you can redistribute it and/or
11    modify it under the terms of the GNU Lesser General Public
12    License as published by the Free Software Foundation; either
13    version 2.1 of the License, or (at your option) any later version.
14 
15    In addition to the permissions in the GNU Lesser General Public
16    License, the Free Software Foundation gives you unlimited
17    permission to link the compiled version of this file into
18    combinations with other programs, and to distribute those
19    combinations without any restriction coming from the use of this
20    file.  (The Lesser General Public License restrictions do apply in
21    other respects; for example, they cover modification of the file,
22    and distribution when not linked into a combine executable.)
23 
24    The GNU C Library is distributed in the hope that it will be useful,
25    but WITHOUT ANY WARRANTY; without even the implied warranty of
26    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
27    Lesser General Public License for more details.
28 
29    You should have received a copy of the GNU Lesser General Public
30    License along with the GNU C Library; if not, see
31    <http://www.gnu.org/licenses/>.  */
32 
33 #define _FP_FRAC_DECL_2(X)	_FP_W_TYPE X##_f0, X##_f1
34 #define _FP_FRAC_COPY_2(D,S)	(D##_f0 = S##_f0, D##_f1 = S##_f1)
35 #define _FP_FRAC_SET_2(X,I)	__FP_FRAC_SET_2(X, I)
36 #define _FP_FRAC_HIGH_2(X)	(X##_f1)
37 #define _FP_FRAC_LOW_2(X)	(X##_f0)
38 #define _FP_FRAC_WORD_2(X,w)	(X##_f##w)
39 
40 #define _FP_FRAC_SLL_2(X,N)						    \
41 (void)(((N) < _FP_W_TYPE_SIZE)						    \
42        ? ({								    \
43 	    if (__builtin_constant_p(N) && (N) == 1)			    \
44 	      {								    \
45 		X##_f1 = X##_f1 + X##_f1 + (((_FP_WS_TYPE)(X##_f0)) < 0);   \
46 		X##_f0 += X##_f0;					    \
47 	      }								    \
48 	    else							    \
49 	      {								    \
50 		X##_f1 = X##_f1 << (N) | X##_f0 >> (_FP_W_TYPE_SIZE - (N)); \
51 		X##_f0 <<= (N);						    \
52 	      }								    \
53 	    0;								    \
54 	  })								    \
55        : ({								    \
56 	    X##_f1 = X##_f0 << ((N) - _FP_W_TYPE_SIZE);			    \
57 	    X##_f0 = 0;							    \
58 	  }))
59 
60 
61 #define _FP_FRAC_SRL_2(X,N)						\
62 (void)(((N) < _FP_W_TYPE_SIZE)						\
63        ? ({								\
64 	    X##_f0 = X##_f0 >> (N) | X##_f1 << (_FP_W_TYPE_SIZE - (N));	\
65 	    X##_f1 >>= (N);						\
66 	  })								\
67        : ({								\
68 	    X##_f0 = X##_f1 >> ((N) - _FP_W_TYPE_SIZE);			\
69 	    X##_f1 = 0;							\
70 	  }))
71 
72 /* Right shift with sticky-lsb.  */
73 #define _FP_FRAC_SRST_2(X,S, N,sz)					  \
74 (void)(((N) < _FP_W_TYPE_SIZE)						  \
75        ? ({								  \
76 	    S = (__builtin_constant_p(N) && (N) == 1			  \
77 		 ? X##_f0 & 1						  \
78 		 : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0);		  \
79 	    X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N)); \
80 	    X##_f1 >>= (N);						  \
81 	  })								  \
82        : ({								  \
83 	    S = ((((N) == _FP_W_TYPE_SIZE				  \
84 		   ? 0							  \
85 		   : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N))))		  \
86 		  | X##_f0) != 0);					  \
87 	    X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE));		  \
88 	    X##_f1 = 0;							  \
89 	  }))
90 
91 #define _FP_FRAC_SRS_2(X,N,sz)						  \
92 (void)(((N) < _FP_W_TYPE_SIZE)						  \
93        ? ({								  \
94 	    X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N) | \
95 		      (__builtin_constant_p(N) && (N) == 1		  \
96 		       ? X##_f0 & 1					  \
97 		       : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0));	  \
98 	    X##_f1 >>= (N);						  \
99 	  })								  \
100        : ({								  \
101 	    X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE) |		  \
102 		      ((((N) == _FP_W_TYPE_SIZE				  \
103 			 ? 0						  \
104 			 : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N))))	  \
105 			| X##_f0) != 0));				  \
106 	    X##_f1 = 0;							  \
107 	  }))
108 
109 #define _FP_FRAC_ADDI_2(X,I)	\
110   __FP_FRAC_ADDI_2(X##_f1, X##_f0, I)
111 
112 #define _FP_FRAC_ADD_2(R,X,Y)	\
113   __FP_FRAC_ADD_2(R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
114 
115 #define _FP_FRAC_SUB_2(R,X,Y)	\
116   __FP_FRAC_SUB_2(R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
117 
118 #define _FP_FRAC_DEC_2(X,Y)	\
119   __FP_FRAC_DEC_2(X##_f1, X##_f0, Y##_f1, Y##_f0)
120 
121 #define _FP_FRAC_CLZ_2(R,X)	\
122   do {				\
123     if (X##_f1)			\
124       __FP_CLZ(R,X##_f1);	\
125     else 			\
126     {				\
127       __FP_CLZ(R,X##_f0);	\
128       R += _FP_W_TYPE_SIZE;	\
129     }				\
130   } while(0)
131 
132 /* Predicates */
133 #define _FP_FRAC_NEGP_2(X)	((_FP_WS_TYPE)X##_f1 < 0)
134 #define _FP_FRAC_ZEROP_2(X)	((X##_f1 | X##_f0) == 0)
135 #define _FP_FRAC_OVERP_2(fs,X)	(_FP_FRAC_HIGH_##fs(X) & _FP_OVERFLOW_##fs)
136 #define _FP_FRAC_CLEAR_OVERP_2(fs,X)	(_FP_FRAC_HIGH_##fs(X) &= ~_FP_OVERFLOW_##fs)
137 #define _FP_FRAC_EQ_2(X, Y)	(X##_f1 == Y##_f1 && X##_f0 == Y##_f0)
138 #define _FP_FRAC_GT_2(X, Y)	\
139   (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 > Y##_f0))
140 #define _FP_FRAC_GE_2(X, Y)	\
141   (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 >= Y##_f0))
142 
143 #define _FP_ZEROFRAC_2		0, 0
144 #define _FP_MINFRAC_2		0, 1
145 #define _FP_MAXFRAC_2		(~(_FP_WS_TYPE)0), (~(_FP_WS_TYPE)0)
146 
147 /*
148  * Internals
149  */
150 
151 #define __FP_FRAC_SET_2(X,I1,I0)	(X##_f0 = I0, X##_f1 = I1)
152 
153 #define __FP_CLZ_2(R, xh, xl)	\
154   do {				\
155     if (xh)			\
156       __FP_CLZ(R,xh);		\
157     else 			\
158     {				\
159       __FP_CLZ(R,xl);		\
160       R += _FP_W_TYPE_SIZE;	\
161     }				\
162   } while(0)
163 
164 #if 0
165 
166 #ifndef __FP_FRAC_ADDI_2
167 #define __FP_FRAC_ADDI_2(xh, xl, i)	\
168   (xh += ((xl += i) < i))
169 #endif
170 #ifndef __FP_FRAC_ADD_2
171 #define __FP_FRAC_ADD_2(rh, rl, xh, xl, yh, yl)	\
172   (rh = xh + yh + ((rl = xl + yl) < xl))
173 #endif
174 #ifndef __FP_FRAC_SUB_2
175 #define __FP_FRAC_SUB_2(rh, rl, xh, xl, yh, yl)	\
176   (rh = xh - yh - ((rl = xl - yl) > xl))
177 #endif
178 #ifndef __FP_FRAC_DEC_2
179 #define __FP_FRAC_DEC_2(xh, xl, yh, yl)	\
180   do {					\
181     UWtype _t = xl;			\
182     xh -= yh + ((xl -= yl) > _t);	\
183   } while (0)
184 #endif
185 
186 #else
187 
188 #undef __FP_FRAC_ADDI_2
189 #define __FP_FRAC_ADDI_2(xh, xl, i)	add_ssaaaa(xh, xl, xh, xl, 0, i)
190 #undef __FP_FRAC_ADD_2
191 #define __FP_FRAC_ADD_2			add_ssaaaa
192 #undef __FP_FRAC_SUB_2
193 #define __FP_FRAC_SUB_2			sub_ddmmss
194 #undef __FP_FRAC_DEC_2
195 #define __FP_FRAC_DEC_2(xh, xl, yh, yl)	sub_ddmmss(xh, xl, xh, xl, yh, yl)
196 
197 #endif
198 
199 /*
200  * Unpack the raw bits of a native fp value.  Do not classify or
201  * normalize the data.
202  */
203 
204 #define _FP_UNPACK_RAW_2(fs, X, val)			\
205   do {							\
206     union _FP_UNION_##fs _flo; _flo.flt = (val);	\
207 							\
208     X##_f0 = _flo.bits.frac0;				\
209     X##_f1 = _flo.bits.frac1;				\
210     X##_e  = _flo.bits.exp;				\
211     X##_s  = _flo.bits.sign;				\
212   } while (0)
213 
214 #define _FP_UNPACK_RAW_2_P(fs, X, val)			\
215   do {							\
216     union _FP_UNION_##fs *_flo =			\
217       (union _FP_UNION_##fs *)(val);			\
218 							\
219     X##_f0 = _flo->bits.frac0;				\
220     X##_f1 = _flo->bits.frac1;				\
221     X##_e  = _flo->bits.exp;				\
222     X##_s  = _flo->bits.sign;				\
223   } while (0)
224 
225 
226 /*
227  * Repack the raw bits of a native fp value.
228  */
229 
230 #define _FP_PACK_RAW_2(fs, val, X)			\
231   do {							\
232     union _FP_UNION_##fs _flo;				\
233 							\
234     _flo.bits.frac0 = X##_f0;				\
235     _flo.bits.frac1 = X##_f1;				\
236     _flo.bits.exp   = X##_e;				\
237     _flo.bits.sign  = X##_s;				\
238 							\
239     (val) = _flo.flt;					\
240   } while (0)
241 
242 #define _FP_PACK_RAW_2_P(fs, val, X)			\
243   do {							\
244     union _FP_UNION_##fs *_flo =			\
245       (union _FP_UNION_##fs *)(val);			\
246 							\
247     _flo->bits.frac0 = X##_f0;				\
248     _flo->bits.frac1 = X##_f1;				\
249     _flo->bits.exp   = X##_e;				\
250     _flo->bits.sign  = X##_s;				\
251   } while (0)
252 
253 
254 /*
255  * Multiplication algorithms:
256  */
257 
258 /* Given a 1W * 1W => 2W primitive, do the extended multiplication.  */
259 
260 #define _FP_MUL_MEAT_2_wide(wfracbits, R, X, Y, doit)			\
261   do {									\
262     _FP_FRAC_DECL_4(_z); _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c);	\
263 									\
264     doit(_FP_FRAC_WORD_4(_z,1), _FP_FRAC_WORD_4(_z,0), X##_f0, Y##_f0);	\
265     doit(_b_f1, _b_f0, X##_f0, Y##_f1);					\
266     doit(_c_f1, _c_f0, X##_f1, Y##_f0);					\
267     doit(_FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2), X##_f1, Y##_f1);	\
268 									\
269     __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
270 		    _FP_FRAC_WORD_4(_z,1), 0, _b_f1, _b_f0,		\
271 		    _FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
272 		    _FP_FRAC_WORD_4(_z,1));				\
273     __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
274 		    _FP_FRAC_WORD_4(_z,1), 0, _c_f1, _c_f0,		\
275 		    _FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
276 		    _FP_FRAC_WORD_4(_z,1));				\
277 									\
278     /* Normalize since we know where the msb of the multiplicands	\
279        were (bit B), we know that the msb of the of the product is	\
280        at either 2B or 2B-1.  */					\
281     _FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits);			\
282     R##_f0 = _FP_FRAC_WORD_4(_z,0);					\
283     R##_f1 = _FP_FRAC_WORD_4(_z,1);					\
284   } while (0)
285 
286 /* Given a 1W * 1W => 2W primitive, do the extended multiplication.
287    Do only 3 multiplications instead of four. This one is for machines
288    where multiplication is much more expensive than subtraction.  */
289 
290 #define _FP_MUL_MEAT_2_wide_3mul(wfracbits, R, X, Y, doit)		\
291   do {									\
292     _FP_FRAC_DECL_4(_z); _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c);	\
293     _FP_W_TYPE _d;							\
294     int _c1, _c2;							\
295 									\
296     _b_f0 = X##_f0 + X##_f1;						\
297     _c1 = _b_f0 < X##_f0;						\
298     _b_f1 = Y##_f0 + Y##_f1;						\
299     _c2 = _b_f1 < Y##_f0;						\
300     doit(_d, _FP_FRAC_WORD_4(_z,0), X##_f0, Y##_f0);			\
301     doit(_FP_FRAC_WORD_4(_z,2), _FP_FRAC_WORD_4(_z,1), _b_f0, _b_f1);	\
302     doit(_c_f1, _c_f0, X##_f1, Y##_f1);					\
303 									\
304     _b_f0 &= -_c2;							\
305     _b_f1 &= -_c1;							\
306     __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
307 		    _FP_FRAC_WORD_4(_z,1), (_c1 & _c2), 0, _d,		\
308 		    0, _FP_FRAC_WORD_4(_z,2), _FP_FRAC_WORD_4(_z,1));	\
309     __FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
310 		     _b_f0);						\
311     __FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
312 		     _b_f1);						\
313     __FP_FRAC_DEC_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
314 		    _FP_FRAC_WORD_4(_z,1),				\
315 		    0, _d, _FP_FRAC_WORD_4(_z,0));			\
316     __FP_FRAC_DEC_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
317 		    _FP_FRAC_WORD_4(_z,1), 0, _c_f1, _c_f0);		\
318     __FP_FRAC_ADD_2(_FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2),	\
319 		    _c_f1, _c_f0,					\
320 		    _FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2));	\
321 									\
322     /* Normalize since we know where the msb of the multiplicands	\
323        were (bit B), we know that the msb of the of the product is	\
324        at either 2B or 2B-1.  */					\
325     _FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits);			\
326     R##_f0 = _FP_FRAC_WORD_4(_z,0);					\
327     R##_f1 = _FP_FRAC_WORD_4(_z,1);					\
328   } while (0)
329 
330 #define _FP_MUL_MEAT_2_gmp(wfracbits, R, X, Y)				\
331   do {									\
332     _FP_FRAC_DECL_4(_z);						\
333     _FP_W_TYPE _x[2], _y[2];						\
334     _x[0] = X##_f0; _x[1] = X##_f1;					\
335     _y[0] = Y##_f0; _y[1] = Y##_f1;					\
336 									\
337     mpn_mul_n(_z_f, _x, _y, 2);						\
338 									\
339     /* Normalize since we know where the msb of the multiplicands	\
340        were (bit B), we know that the msb of the of the product is	\
341        at either 2B or 2B-1.  */					\
342     _FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits);			\
343     R##_f0 = _z_f[0];							\
344     R##_f1 = _z_f[1];							\
345   } while (0)
346 
347 /* Do at most 120x120=240 bits multiplication using double floating
348    point multiplication.  This is useful if floating point
349    multiplication has much bigger throughput than integer multiply.
350    It is supposed to work for _FP_W_TYPE_SIZE 64 and wfracbits
351    between 106 and 120 only.
352    Caller guarantees that X and Y has (1LLL << (wfracbits - 1)) set.
353    SETFETZ is a macro which will disable all FPU exceptions and set rounding
354    towards zero,  RESETFE should optionally reset it back.  */
355 
356 #define _FP_MUL_MEAT_2_120_240_double(wfracbits, R, X, Y, setfetz, resetfe)	\
357   do {										\
358     static const double _const[] = {						\
359       /* 2^-24 */ 5.9604644775390625e-08,					\
360       /* 2^-48 */ 3.5527136788005009e-15,					\
361       /* 2^-72 */ 2.1175823681357508e-22,					\
362       /* 2^-96 */ 1.2621774483536189e-29,					\
363       /* 2^28 */ 2.68435456e+08,						\
364       /* 2^4 */ 1.600000e+01,							\
365       /* 2^-20 */ 9.5367431640625e-07,						\
366       /* 2^-44 */ 5.6843418860808015e-14,					\
367       /* 2^-68 */ 3.3881317890172014e-21,					\
368       /* 2^-92 */ 2.0194839173657902e-28,					\
369       /* 2^-116 */ 1.2037062152420224e-35};					\
370     double _a240, _b240, _c240, _d240, _e240, _f240, 				\
371 	   _g240, _h240, _i240, _j240, _k240;					\
372     union { double d; UDItype i; } _l240, _m240, _n240, _o240,			\
373 				   _p240, _q240, _r240, _s240;			\
374     UDItype _t240, _u240, _v240, _w240, _x240, _y240 = 0;			\
375 										\
376     if (wfracbits < 106 || wfracbits > 120)					\
377       abort();									\
378 										\
379     setfetz;									\
380 										\
381     _e240 = (double)(long)(X##_f0 & 0xffffff);					\
382     _j240 = (double)(long)(Y##_f0 & 0xffffff);					\
383     _d240 = (double)(long)((X##_f0 >> 24) & 0xffffff);				\
384     _i240 = (double)(long)((Y##_f0 >> 24) & 0xffffff);				\
385     _c240 = (double)(long)(((X##_f1 << 16) & 0xffffff) | (X##_f0 >> 48));	\
386     _h240 = (double)(long)(((Y##_f1 << 16) & 0xffffff) | (Y##_f0 >> 48));	\
387     _b240 = (double)(long)((X##_f1 >> 8) & 0xffffff);				\
388     _g240 = (double)(long)((Y##_f1 >> 8) & 0xffffff);				\
389     _a240 = (double)(long)(X##_f1 >> 32);					\
390     _f240 = (double)(long)(Y##_f1 >> 32);					\
391     _e240 *= _const[3];								\
392     _j240 *= _const[3];								\
393     _d240 *= _const[2];								\
394     _i240 *= _const[2];								\
395     _c240 *= _const[1];								\
396     _h240 *= _const[1];								\
397     _b240 *= _const[0];								\
398     _g240 *= _const[0];								\
399     _s240.d =							      _e240*_j240;\
400     _r240.d =						_d240*_j240 + _e240*_i240;\
401     _q240.d =				  _c240*_j240 + _d240*_i240 + _e240*_h240;\
402     _p240.d =		    _b240*_j240 + _c240*_i240 + _d240*_h240 + _e240*_g240;\
403     _o240.d = _a240*_j240 + _b240*_i240 + _c240*_h240 + _d240*_g240 + _e240*_f240;\
404     _n240.d = _a240*_i240 + _b240*_h240 + _c240*_g240 + _d240*_f240;		\
405     _m240.d = _a240*_h240 + _b240*_g240 + _c240*_f240;				\
406     _l240.d = _a240*_g240 + _b240*_f240;					\
407     _k240 =   _a240*_f240;							\
408     _r240.d += _s240.d;								\
409     _q240.d += _r240.d;								\
410     _p240.d += _q240.d;								\
411     _o240.d += _p240.d;								\
412     _n240.d += _o240.d;								\
413     _m240.d += _n240.d;								\
414     _l240.d += _m240.d;								\
415     _k240 += _l240.d;								\
416     _s240.d -= ((_const[10]+_s240.d)-_const[10]);				\
417     _r240.d -= ((_const[9]+_r240.d)-_const[9]);					\
418     _q240.d -= ((_const[8]+_q240.d)-_const[8]);					\
419     _p240.d -= ((_const[7]+_p240.d)-_const[7]);					\
420     _o240.d += _const[7];							\
421     _n240.d += _const[6];							\
422     _m240.d += _const[5];							\
423     _l240.d += _const[4];							\
424     if (_s240.d != 0.0) _y240 = 1;						\
425     if (_r240.d != 0.0) _y240 = 1;						\
426     if (_q240.d != 0.0) _y240 = 1;						\
427     if (_p240.d != 0.0) _y240 = 1;						\
428     _t240 = (DItype)_k240;							\
429     _u240 = _l240.i;								\
430     _v240 = _m240.i;								\
431     _w240 = _n240.i;								\
432     _x240 = _o240.i;								\
433     R##_f1 = (_t240 << (128 - (wfracbits - 1)))					\
434 	     | ((_u240 & 0xffffff) >> ((wfracbits - 1) - 104));			\
435     R##_f0 = ((_u240 & 0xffffff) << (168 - (wfracbits - 1)))			\
436     	     | ((_v240 & 0xffffff) << (144 - (wfracbits - 1)))			\
437     	     | ((_w240 & 0xffffff) << (120 - (wfracbits - 1)))			\
438     	     | ((_x240 & 0xffffff) >> ((wfracbits - 1) - 96))			\
439     	     | _y240;								\
440     resetfe;									\
441   } while (0)
442 
443 /*
444  * Division algorithms:
445  */
446 
447 #define _FP_DIV_MEAT_2_udiv(fs, R, X, Y)				\
448   do {									\
449     _FP_W_TYPE _n_f2, _n_f1, _n_f0, _r_f1, _r_f0, _m_f1, _m_f0;		\
450     if (_FP_FRAC_GT_2(X, Y))						\
451       {									\
452 	_n_f2 = X##_f1 >> 1;						\
453 	_n_f1 = X##_f1 << (_FP_W_TYPE_SIZE - 1) | X##_f0 >> 1;		\
454 	_n_f0 = X##_f0 << (_FP_W_TYPE_SIZE - 1);			\
455       }									\
456     else								\
457       {									\
458 	R##_e--;							\
459 	_n_f2 = X##_f1;							\
460 	_n_f1 = X##_f0;							\
461 	_n_f0 = 0;							\
462       }									\
463 									\
464     /* Normalize, i.e. make the most significant bit of the 		\
465        denominator set. */						\
466     _FP_FRAC_SLL_2(Y, _FP_WFRACXBITS_##fs);				\
467 									\
468     udiv_qrnnd(R##_f1, _r_f1, _n_f2, _n_f1, Y##_f1);			\
469     umul_ppmm(_m_f1, _m_f0, R##_f1, Y##_f0);				\
470     _r_f0 = _n_f0;							\
471     if (_FP_FRAC_GT_2(_m, _r))						\
472       {									\
473 	R##_f1--;							\
474 	_FP_FRAC_ADD_2(_r, Y, _r);					\
475 	if (_FP_FRAC_GE_2(_r, Y) && _FP_FRAC_GT_2(_m, _r))		\
476 	  {								\
477 	    R##_f1--;							\
478 	    _FP_FRAC_ADD_2(_r, Y, _r);					\
479 	  }								\
480       }									\
481     _FP_FRAC_DEC_2(_r, _m);						\
482 									\
483     if (_r_f1 == Y##_f1)						\
484       {									\
485 	/* This is a special case, not an optimization			\
486 	   (_r/Y##_f1 would not fit into UWtype).			\
487 	   As _r is guaranteed to be < Y,  R##_f0 can be either		\
488 	   (UWtype)-1 or (UWtype)-2.  But as we know what kind		\
489 	   of bits it is (sticky, guard, round),  we don't care.	\
490 	   We also don't care what the reminder is,  because the	\
491 	   guard bit will be set anyway.  -jj */			\
492 	R##_f0 = -1;							\
493       }									\
494     else								\
495       {									\
496 	udiv_qrnnd(R##_f0, _r_f1, _r_f1, _r_f0, Y##_f1);		\
497 	umul_ppmm(_m_f1, _m_f0, R##_f0, Y##_f0);			\
498 	_r_f0 = 0;							\
499 	if (_FP_FRAC_GT_2(_m, _r))					\
500 	  {								\
501 	    R##_f0--;							\
502 	    _FP_FRAC_ADD_2(_r, Y, _r);					\
503 	    if (_FP_FRAC_GE_2(_r, Y) && _FP_FRAC_GT_2(_m, _r))		\
504 	      {								\
505 		R##_f0--;						\
506 		_FP_FRAC_ADD_2(_r, Y, _r);				\
507 	      }								\
508 	  }								\
509 	if (!_FP_FRAC_EQ_2(_r, _m))					\
510 	  R##_f0 |= _FP_WORK_STICKY;					\
511       }									\
512   } while (0)
513 
514 
515 #define _FP_DIV_MEAT_2_gmp(fs, R, X, Y)					\
516   do {									\
517     _FP_W_TYPE _x[4], _y[2], _z[4];					\
518     _y[0] = Y##_f0; _y[1] = Y##_f1;					\
519     _x[0] = _x[3] = 0;							\
520     if (_FP_FRAC_GT_2(X, Y))						\
521       {									\
522 	R##_e++;							\
523 	_x[1] = (X##_f0 << (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE) |	\
524 		 X##_f1 >> (_FP_W_TYPE_SIZE -				\
525 			    (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE)));	\
526 	_x[2] = X##_f1 << (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE);	\
527       }									\
528     else								\
529       {									\
530 	_x[1] = (X##_f0 << (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE) |	\
531 		 X##_f1 >> (_FP_W_TYPE_SIZE -				\
532 			    (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE)));	\
533 	_x[2] = X##_f1 << (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE);	\
534       }									\
535 									\
536     (void) mpn_divrem (_z, 0, _x, 4, _y, 2);				\
537     R##_f1 = _z[1];							\
538     R##_f0 = _z[0] | ((_x[0] | _x[1]) != 0);				\
539   } while (0)
540 
541 
542 /*
543  * Square root algorithms:
544  * We have just one right now, maybe Newton approximation
545  * should be added for those machines where division is fast.
546  */
547 
548 #define _FP_SQRT_MEAT_2(R, S, T, X, q)			\
549   do {							\
550     while (q)						\
551       {							\
552 	T##_f1 = S##_f1 + q;				\
553 	if (T##_f1 <= X##_f1)				\
554 	  {						\
555 	    S##_f1 = T##_f1 + q;			\
556 	    X##_f1 -= T##_f1;				\
557 	    R##_f1 += q;				\
558 	  }						\
559 	_FP_FRAC_SLL_2(X, 1);				\
560 	q >>= 1;					\
561       }							\
562     q = (_FP_W_TYPE)1 << (_FP_W_TYPE_SIZE - 1);		\
563     while (q != _FP_WORK_ROUND)				\
564       {							\
565 	T##_f0 = S##_f0 + q;				\
566 	T##_f1 = S##_f1;				\
567 	if (T##_f1 < X##_f1 || 				\
568 	    (T##_f1 == X##_f1 && T##_f0 <= X##_f0))	\
569 	  {						\
570 	    S##_f0 = T##_f0 + q;			\
571 	    S##_f1 += (T##_f0 > S##_f0);		\
572 	    _FP_FRAC_DEC_2(X, T);			\
573 	    R##_f0 += q;				\
574 	  }						\
575 	_FP_FRAC_SLL_2(X, 1);				\
576 	q >>= 1;					\
577       }							\
578     if (X##_f0 | X##_f1)				\
579       {							\
580 	if (S##_f1 < X##_f1 || 				\
581 	    (S##_f1 == X##_f1 && S##_f0 < X##_f0))	\
582 	  R##_f0 |= _FP_WORK_ROUND;			\
583 	R##_f0 |= _FP_WORK_STICKY;			\
584       }							\
585   } while (0)
586 
587 
588 /*
589  * Assembly/disassembly for converting to/from integral types.
590  * No shifting or overflow handled here.
591  */
592 
593 #define _FP_FRAC_ASSEMBLE_2(r, X, rsize)	\
594 (void)((rsize <= _FP_W_TYPE_SIZE)		\
595        ? ({ r = X##_f0; })			\
596        : ({					\
597 	    r = X##_f1;				\
598 	    r <<= _FP_W_TYPE_SIZE;		\
599 	    r += X##_f0;			\
600 	  }))
601 
602 #define _FP_FRAC_DISASSEMBLE_2(X, r, rsize)				\
603   do {									\
604     X##_f0 = r;								\
605     X##_f1 = (rsize <= _FP_W_TYPE_SIZE ? 0 : r >> _FP_W_TYPE_SIZE);	\
606   } while (0)
607 
608 /*
609  * Convert FP values between word sizes
610  */
611 
612 #define _FP_FRAC_COPY_1_2(D, S)		(D##_f = S##_f0)
613 
614 #define _FP_FRAC_COPY_2_1(D, S)		((D##_f0 = S##_f), (D##_f1 = 0))
615 
616 #define _FP_FRAC_COPY_2_2(D,S)		_FP_FRAC_COPY_2(D,S)
617