1 /* Software floating-point emulation.
2    Basic two-word fraction declaration and manipulation.
3    Copyright (C) 1997-2016 Free Software Foundation, Inc.
4    This file is part of the GNU C Library.
5    Contributed by Richard Henderson (rth@cygnus.com),
6 		  Jakub Jelinek (jj@ultra.linux.cz),
7 		  David S. Miller (davem@redhat.com) and
8 		  Peter Maydell (pmaydell@chiark.greenend.org.uk).
9 
10    The GNU C Library is free software; you can redistribute it and/or
11    modify it under the terms of the GNU Lesser General Public
12    License as published by the Free Software Foundation; either
13    version 2.1 of the License, or (at your option) any later version.
14 
15    In addition to the permissions in the GNU Lesser General Public
16    License, the Free Software Foundation gives you unlimited
17    permission to link the compiled version of this file into
18    combinations with other programs, and to distribute those
19    combinations without any restriction coming from the use of this
20    file.  (The Lesser General Public License restrictions do apply in
21    other respects; for example, they cover modification of the file,
22    and distribution when not linked into a combine executable.)
23 
24    The GNU C Library is distributed in the hope that it will be useful,
25    but WITHOUT ANY WARRANTY; without even the implied warranty of
26    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
27    Lesser General Public License for more details.
28 
29    You should have received a copy of the GNU Lesser General Public
30    License along with the GNU C Library; if not, see
31    <http://www.gnu.org/licenses/>.  */
32 
33 #ifndef SOFT_FP_OP_2_H
34 #define SOFT_FP_OP_2_H	1
35 
36 #define _FP_FRAC_DECL_2(X)				\
37   _FP_W_TYPE X##_f0 _FP_ZERO_INIT, X##_f1 _FP_ZERO_INIT
38 #define _FP_FRAC_COPY_2(D, S)	(D##_f0 = S##_f0, D##_f1 = S##_f1)
39 #define _FP_FRAC_SET_2(X, I)	__FP_FRAC_SET_2 (X, I)
40 #define _FP_FRAC_HIGH_2(X)	(X##_f1)
41 #define _FP_FRAC_LOW_2(X)	(X##_f0)
42 #define _FP_FRAC_WORD_2(X, w)	(X##_f##w)
43 
44 #define _FP_FRAC_SLL_2(X, N)						\
45   (void) (((N) < _FP_W_TYPE_SIZE)					\
46 	  ? ({								\
47 	      if (__builtin_constant_p (N) && (N) == 1)			\
48 		{							\
49 		  X##_f1 = X##_f1 + X##_f1 + (((_FP_WS_TYPE) (X##_f0)) < 0); \
50 		  X##_f0 += X##_f0;					\
51 		}							\
52 	      else							\
53 		{							\
54 		  X##_f1 = X##_f1 << (N) | X##_f0 >> (_FP_W_TYPE_SIZE - (N)); \
55 		  X##_f0 <<= (N);					\
56 		}							\
57 	      0;							\
58 	    })								\
59 	  : ({								\
60 	      X##_f1 = X##_f0 << ((N) - _FP_W_TYPE_SIZE);		\
61 	      X##_f0 = 0;						\
62 	    }))
63 
64 
65 #define _FP_FRAC_SRL_2(X, N)						\
66   (void) (((N) < _FP_W_TYPE_SIZE)					\
67 	  ? ({								\
68 	      X##_f0 = X##_f0 >> (N) | X##_f1 << (_FP_W_TYPE_SIZE - (N)); \
69 	      X##_f1 >>= (N);						\
70 	    })								\
71 	  : ({								\
72 	      X##_f0 = X##_f1 >> ((N) - _FP_W_TYPE_SIZE);		\
73 	      X##_f1 = 0;						\
74 	    }))
75 
76 /* Right shift with sticky-lsb.  */
77 #define _FP_FRAC_SRST_2(X, S, N, sz)					\
78   (void) (((N) < _FP_W_TYPE_SIZE)					\
79 	  ? ({								\
80 	      S = (__builtin_constant_p (N) && (N) == 1			\
81 		   ? X##_f0 & 1						\
82 		   : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0);		\
83 	      X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N)); \
84 	      X##_f1 >>= (N);						\
85 	    })								\
86 	  : ({								\
87 	      S = ((((N) == _FP_W_TYPE_SIZE				\
88 		     ? 0						\
89 		     : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N))))		\
90 		    | X##_f0) != 0);					\
91 	      X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE));		\
92 	      X##_f1 = 0;						\
93 	    }))
94 
95 #define _FP_FRAC_SRS_2(X, N, sz)					\
96   (void) (((N) < _FP_W_TYPE_SIZE)					\
97 	  ? ({								\
98 	      X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N) \
99 			| (__builtin_constant_p (N) && (N) == 1		\
100 			   ? X##_f0 & 1					\
101 			   : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0)); \
102 	      X##_f1 >>= (N);						\
103 	    })								\
104 	  : ({								\
105 	      X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE)		\
106 			| ((((N) == _FP_W_TYPE_SIZE			\
107 			     ? 0					\
108 			     : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N))))	\
109 			    | X##_f0) != 0));				\
110 	      X##_f1 = 0;						\
111 	    }))
112 
113 #define _FP_FRAC_ADDI_2(X, I)	\
114   __FP_FRAC_ADDI_2 (X##_f1, X##_f0, I)
115 
116 #define _FP_FRAC_ADD_2(R, X, Y)	\
117   __FP_FRAC_ADD_2 (R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
118 
119 #define _FP_FRAC_SUB_2(R, X, Y)	\
120   __FP_FRAC_SUB_2 (R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
121 
122 #define _FP_FRAC_DEC_2(X, Y)	\
123   __FP_FRAC_DEC_2 (X##_f1, X##_f0, Y##_f1, Y##_f0)
124 
125 #define _FP_FRAC_CLZ_2(R, X)			\
126   do						\
127     {						\
128       if (X##_f1)				\
129 	__FP_CLZ ((R), X##_f1);			\
130       else					\
131 	{					\
132 	  __FP_CLZ ((R), X##_f0);		\
133 	  (R) += _FP_W_TYPE_SIZE;		\
134 	}					\
135     }						\
136   while (0)
137 
138 /* Predicates.  */
139 #define _FP_FRAC_NEGP_2(X)	((_FP_WS_TYPE) X##_f1 < 0)
140 #define _FP_FRAC_ZEROP_2(X)	((X##_f1 | X##_f0) == 0)
141 #define _FP_FRAC_OVERP_2(fs, X)	(_FP_FRAC_HIGH_##fs (X) & _FP_OVERFLOW_##fs)
142 #define _FP_FRAC_CLEAR_OVERP_2(fs, X)	(_FP_FRAC_HIGH_##fs (X) &= ~_FP_OVERFLOW_##fs)
143 #define _FP_FRAC_HIGHBIT_DW_2(fs, X)	\
144   (_FP_FRAC_HIGH_DW_##fs (X) & _FP_HIGHBIT_DW_##fs)
145 #define _FP_FRAC_EQ_2(X, Y)	(X##_f1 == Y##_f1 && X##_f0 == Y##_f0)
146 #define _FP_FRAC_GT_2(X, Y)	\
147   (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 > Y##_f0))
148 #define _FP_FRAC_GE_2(X, Y)	\
149   (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 >= Y##_f0))
150 
151 #define _FP_ZEROFRAC_2		0, 0
152 #define _FP_MINFRAC_2		0, 1
153 #define _FP_MAXFRAC_2		(~(_FP_WS_TYPE) 0), (~(_FP_WS_TYPE) 0)
154 
155 /* Internals.  */
156 
157 #define __FP_FRAC_SET_2(X, I1, I0)	(X##_f0 = I0, X##_f1 = I1)
158 
159 #define __FP_CLZ_2(R, xh, xl)			\
160   do						\
161     {						\
162       if (xh)					\
163 	__FP_CLZ ((R), xh);			\
164       else					\
165 	{					\
166 	  __FP_CLZ ((R), xl);			\
167 	  (R) += _FP_W_TYPE_SIZE;		\
168 	}					\
169     }						\
170   while (0)
171 
172 #if 0
173 
174 # ifndef __FP_FRAC_ADDI_2
175 #  define __FP_FRAC_ADDI_2(xh, xl, i)	\
176   (xh += ((xl += i) < i))
177 # endif
178 # ifndef __FP_FRAC_ADD_2
179 #  define __FP_FRAC_ADD_2(rh, rl, xh, xl, yh, yl)	\
180   (rh = xh + yh + ((rl = xl + yl) < xl))
181 # endif
182 # ifndef __FP_FRAC_SUB_2
183 #  define __FP_FRAC_SUB_2(rh, rl, xh, xl, yh, yl)	\
184   (rh = xh - yh - ((rl = xl - yl) > xl))
185 # endif
186 # ifndef __FP_FRAC_DEC_2
187 #  define __FP_FRAC_DEC_2(xh, xl, yh, yl)		\
188   do							\
189     {							\
190       UWtype __FP_FRAC_DEC_2_t = xl;			\
191       xh -= yh + ((xl -= yl) > __FP_FRAC_DEC_2_t);	\
192     }							\
193   while (0)
194 # endif
195 
196 #else
197 
198 # undef __FP_FRAC_ADDI_2
199 # define __FP_FRAC_ADDI_2(xh, xl, i)	add_ssaaaa (xh, xl, xh, xl, 0, i)
200 # undef __FP_FRAC_ADD_2
201 # define __FP_FRAC_ADD_2		add_ssaaaa
202 # undef __FP_FRAC_SUB_2
203 # define __FP_FRAC_SUB_2		sub_ddmmss
204 # undef __FP_FRAC_DEC_2
205 # define __FP_FRAC_DEC_2(xh, xl, yh, yl)	\
206   sub_ddmmss (xh, xl, xh, xl, yh, yl)
207 
208 #endif
209 
210 /* Unpack the raw bits of a native fp value.  Do not classify or
211    normalize the data.  */
212 
213 #define _FP_UNPACK_RAW_2(fs, X, val)			\
214   do							\
215     {							\
216       union _FP_UNION_##fs _FP_UNPACK_RAW_2_flo;	\
217       _FP_UNPACK_RAW_2_flo.flt = (val);			\
218 							\
219       X##_f0 = _FP_UNPACK_RAW_2_flo.bits.frac0;		\
220       X##_f1 = _FP_UNPACK_RAW_2_flo.bits.frac1;		\
221       X##_e  = _FP_UNPACK_RAW_2_flo.bits.exp;		\
222       X##_s  = _FP_UNPACK_RAW_2_flo.bits.sign;		\
223     }							\
224   while (0)
225 
226 #define _FP_UNPACK_RAW_2_P(fs, X, val)			\
227   do							\
228     {							\
229       union _FP_UNION_##fs *_FP_UNPACK_RAW_2_P_flo	\
230 	= (union _FP_UNION_##fs *) (val);		\
231 							\
232       X##_f0 = _FP_UNPACK_RAW_2_P_flo->bits.frac0;	\
233       X##_f1 = _FP_UNPACK_RAW_2_P_flo->bits.frac1;	\
234       X##_e  = _FP_UNPACK_RAW_2_P_flo->bits.exp;	\
235       X##_s  = _FP_UNPACK_RAW_2_P_flo->bits.sign;	\
236     }							\
237   while (0)
238 
239 
240 /* Repack the raw bits of a native fp value.  */
241 
242 #define _FP_PACK_RAW_2(fs, val, X)		\
243   do						\
244     {						\
245       union _FP_UNION_##fs _FP_PACK_RAW_2_flo;	\
246 						\
247       _FP_PACK_RAW_2_flo.bits.frac0 = X##_f0;	\
248       _FP_PACK_RAW_2_flo.bits.frac1 = X##_f1;	\
249       _FP_PACK_RAW_2_flo.bits.exp   = X##_e;	\
250       _FP_PACK_RAW_2_flo.bits.sign  = X##_s;	\
251 						\
252       (val) = _FP_PACK_RAW_2_flo.flt;		\
253     }						\
254   while (0)
255 
256 #define _FP_PACK_RAW_2_P(fs, val, X)			\
257   do							\
258     {							\
259       union _FP_UNION_##fs *_FP_PACK_RAW_2_P_flo	\
260 	= (union _FP_UNION_##fs *) (val);		\
261 							\
262       _FP_PACK_RAW_2_P_flo->bits.frac0 = X##_f0;	\
263       _FP_PACK_RAW_2_P_flo->bits.frac1 = X##_f1;	\
264       _FP_PACK_RAW_2_P_flo->bits.exp   = X##_e;		\
265       _FP_PACK_RAW_2_P_flo->bits.sign  = X##_s;		\
266     }							\
267   while (0)
268 
269 
270 /* Multiplication algorithms: */
271 
272 /* Given a 1W * 1W => 2W primitive, do the extended multiplication.  */
273 
274 #define _FP_MUL_MEAT_DW_2_wide(wfracbits, R, X, Y, doit)		\
275   do									\
276     {									\
277       _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_b);			\
278       _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_c);			\
279 									\
280       doit (_FP_FRAC_WORD_4 (R, 1), _FP_FRAC_WORD_4 (R, 0),		\
281 	    X##_f0, Y##_f0);						\
282       doit (_FP_MUL_MEAT_DW_2_wide_b_f1, _FP_MUL_MEAT_DW_2_wide_b_f0,	\
283 	    X##_f0, Y##_f1);						\
284       doit (_FP_MUL_MEAT_DW_2_wide_c_f1, _FP_MUL_MEAT_DW_2_wide_c_f0,	\
285 	    X##_f1, Y##_f0);						\
286       doit (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),		\
287 	    X##_f1, Y##_f1);						\
288 									\
289       __FP_FRAC_ADD_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
290 		       _FP_FRAC_WORD_4 (R, 1), 0,			\
291 		       _FP_MUL_MEAT_DW_2_wide_b_f1,			\
292 		       _FP_MUL_MEAT_DW_2_wide_b_f0,			\
293 		       _FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
294 		       _FP_FRAC_WORD_4 (R, 1));				\
295       __FP_FRAC_ADD_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
296 		       _FP_FRAC_WORD_4 (R, 1), 0,			\
297 		       _FP_MUL_MEAT_DW_2_wide_c_f1,			\
298 		       _FP_MUL_MEAT_DW_2_wide_c_f0,			\
299 		       _FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
300 		       _FP_FRAC_WORD_4 (R, 1));				\
301     }									\
302   while (0)
303 
304 #define _FP_MUL_MEAT_2_wide(wfracbits, R, X, Y, doit)			\
305   do									\
306     {									\
307       _FP_FRAC_DECL_4 (_FP_MUL_MEAT_2_wide_z);				\
308 									\
309       _FP_MUL_MEAT_DW_2_wide ((wfracbits), _FP_MUL_MEAT_2_wide_z,	\
310 			      X, Y, doit);				\
311 									\
312       /* Normalize since we know where the msb of the multiplicands	\
313 	 were (bit B), we know that the msb of the of the product is	\
314 	 at either 2B or 2B-1.  */					\
315       _FP_FRAC_SRS_4 (_FP_MUL_MEAT_2_wide_z, (wfracbits)-1,		\
316 		      2*(wfracbits));					\
317       R##_f0 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_z, 0);		\
318       R##_f1 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_z, 1);		\
319     }									\
320   while (0)
321 
322 /* Given a 1W * 1W => 2W primitive, do the extended multiplication.
323    Do only 3 multiplications instead of four. This one is for machines
324    where multiplication is much more expensive than subtraction.  */
325 
326 #define _FP_MUL_MEAT_DW_2_wide_3mul(wfracbits, R, X, Y, doit)		\
327   do									\
328     {									\
329       _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_3mul_b);			\
330       _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_3mul_c);			\
331       _FP_W_TYPE _FP_MUL_MEAT_DW_2_wide_3mul_d;				\
332       int _FP_MUL_MEAT_DW_2_wide_3mul_c1;				\
333       int _FP_MUL_MEAT_DW_2_wide_3mul_c2;				\
334 									\
335       _FP_MUL_MEAT_DW_2_wide_3mul_b_f0 = X##_f0 + X##_f1;		\
336       _FP_MUL_MEAT_DW_2_wide_3mul_c1					\
337 	= _FP_MUL_MEAT_DW_2_wide_3mul_b_f0 < X##_f0;			\
338       _FP_MUL_MEAT_DW_2_wide_3mul_b_f1 = Y##_f0 + Y##_f1;		\
339       _FP_MUL_MEAT_DW_2_wide_3mul_c2					\
340 	= _FP_MUL_MEAT_DW_2_wide_3mul_b_f1 < Y##_f0;			\
341       doit (_FP_MUL_MEAT_DW_2_wide_3mul_d, _FP_FRAC_WORD_4 (R, 0),	\
342 	    X##_f0, Y##_f0);						\
343       doit (_FP_FRAC_WORD_4 (R, 2), _FP_FRAC_WORD_4 (R, 1),		\
344 	    _FP_MUL_MEAT_DW_2_wide_3mul_b_f0,				\
345 	    _FP_MUL_MEAT_DW_2_wide_3mul_b_f1);				\
346       doit (_FP_MUL_MEAT_DW_2_wide_3mul_c_f1,				\
347 	    _FP_MUL_MEAT_DW_2_wide_3mul_c_f0, X##_f1, Y##_f1);		\
348 									\
349       _FP_MUL_MEAT_DW_2_wide_3mul_b_f0					\
350 	&= -_FP_MUL_MEAT_DW_2_wide_3mul_c2;				\
351       _FP_MUL_MEAT_DW_2_wide_3mul_b_f1					\
352 	&= -_FP_MUL_MEAT_DW_2_wide_3mul_c1;				\
353       __FP_FRAC_ADD_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
354 		       _FP_FRAC_WORD_4 (R, 1),				\
355 		       (_FP_MUL_MEAT_DW_2_wide_3mul_c1			\
356 			& _FP_MUL_MEAT_DW_2_wide_3mul_c2), 0,		\
357 		       _FP_MUL_MEAT_DW_2_wide_3mul_d,			\
358 		       0, _FP_FRAC_WORD_4 (R, 2), _FP_FRAC_WORD_4 (R, 1)); \
359       __FP_FRAC_ADDI_2 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
360 			_FP_MUL_MEAT_DW_2_wide_3mul_b_f0);		\
361       __FP_FRAC_ADDI_2 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
362 			_FP_MUL_MEAT_DW_2_wide_3mul_b_f1);		\
363       __FP_FRAC_DEC_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
364 		       _FP_FRAC_WORD_4 (R, 1),				\
365 		       0, _FP_MUL_MEAT_DW_2_wide_3mul_d,		\
366 		       _FP_FRAC_WORD_4 (R, 0));				\
367       __FP_FRAC_DEC_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
368 		       _FP_FRAC_WORD_4 (R, 1), 0,			\
369 		       _FP_MUL_MEAT_DW_2_wide_3mul_c_f1,		\
370 		       _FP_MUL_MEAT_DW_2_wide_3mul_c_f0);		\
371       __FP_FRAC_ADD_2 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),	\
372 		       _FP_MUL_MEAT_DW_2_wide_3mul_c_f1,		\
373 		       _FP_MUL_MEAT_DW_2_wide_3mul_c_f0,		\
374 		       _FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2));	\
375     }									\
376   while (0)
377 
378 #define _FP_MUL_MEAT_2_wide_3mul(wfracbits, R, X, Y, doit)		\
379   do									\
380     {									\
381       _FP_FRAC_DECL_4 (_FP_MUL_MEAT_2_wide_3mul_z);			\
382 									\
383       _FP_MUL_MEAT_DW_2_wide_3mul ((wfracbits),				\
384 				   _FP_MUL_MEAT_2_wide_3mul_z,		\
385 				   X, Y, doit);				\
386 									\
387       /* Normalize since we know where the msb of the multiplicands	\
388 	 were (bit B), we know that the msb of the of the product is	\
389 	 at either 2B or 2B-1.  */					\
390       _FP_FRAC_SRS_4 (_FP_MUL_MEAT_2_wide_3mul_z,			\
391 		      (wfracbits)-1, 2*(wfracbits));			\
392       R##_f0 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_3mul_z, 0);		\
393       R##_f1 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_3mul_z, 1);		\
394     }									\
395   while (0)
396 
397 #define _FP_MUL_MEAT_DW_2_gmp(wfracbits, R, X, Y)	\
398   do							\
399     {							\
400       _FP_W_TYPE _FP_MUL_MEAT_DW_2_gmp_x[2];		\
401       _FP_W_TYPE _FP_MUL_MEAT_DW_2_gmp_y[2];		\
402       _FP_MUL_MEAT_DW_2_gmp_x[0] = X##_f0;		\
403       _FP_MUL_MEAT_DW_2_gmp_x[1] = X##_f1;		\
404       _FP_MUL_MEAT_DW_2_gmp_y[0] = Y##_f0;		\
405       _FP_MUL_MEAT_DW_2_gmp_y[1] = Y##_f1;		\
406 							\
407       mpn_mul_n (R##_f, _FP_MUL_MEAT_DW_2_gmp_x,	\
408 		 _FP_MUL_MEAT_DW_2_gmp_y, 2);		\
409     }							\
410   while (0)
411 
412 #define _FP_MUL_MEAT_2_gmp(wfracbits, R, X, Y)				\
413   do									\
414     {									\
415       _FP_FRAC_DECL_4 (_FP_MUL_MEAT_2_gmp_z);				\
416 									\
417       _FP_MUL_MEAT_DW_2_gmp ((wfracbits), _FP_MUL_MEAT_2_gmp_z, X, Y);	\
418 									\
419       /* Normalize since we know where the msb of the multiplicands	\
420 	 were (bit B), we know that the msb of the of the product is	\
421 	 at either 2B or 2B-1.  */					\
422       _FP_FRAC_SRS_4 (_FP_MUL_MEAT_2_gmp_z, (wfracbits)-1,		\
423 		      2*(wfracbits));					\
424       R##_f0 = _FP_MUL_MEAT_2_gmp_z_f[0];				\
425       R##_f1 = _FP_MUL_MEAT_2_gmp_z_f[1];				\
426     }									\
427   while (0)
428 
429 /* Do at most 120x120=240 bits multiplication using double floating
430    point multiplication.  This is useful if floating point
431    multiplication has much bigger throughput than integer multiply.
432    It is supposed to work for _FP_W_TYPE_SIZE 64 and wfracbits
433    between 106 and 120 only.
434    Caller guarantees that X and Y has (1LLL << (wfracbits - 1)) set.
435    SETFETZ is a macro which will disable all FPU exceptions and set rounding
436    towards zero,  RESETFE should optionally reset it back.  */
437 
438 #define _FP_MUL_MEAT_2_120_240_double(wfracbits, R, X, Y, setfetz, resetfe) \
439   do									\
440     {									\
441       static const double _const[] =					\
442 	{								\
443 	  /* 2^-24 */ 5.9604644775390625e-08,				\
444 	  /* 2^-48 */ 3.5527136788005009e-15,				\
445 	  /* 2^-72 */ 2.1175823681357508e-22,				\
446 	  /* 2^-96 */ 1.2621774483536189e-29,				\
447 	  /* 2^28 */ 2.68435456e+08,					\
448 	  /* 2^4 */ 1.600000e+01,					\
449 	  /* 2^-20 */ 9.5367431640625e-07,				\
450 	  /* 2^-44 */ 5.6843418860808015e-14,				\
451 	  /* 2^-68 */ 3.3881317890172014e-21,				\
452 	  /* 2^-92 */ 2.0194839173657902e-28,				\
453 	  /* 2^-116 */ 1.2037062152420224e-35				\
454 	};								\
455       double _a240, _b240, _c240, _d240, _e240, _f240,			\
456 	_g240, _h240, _i240, _j240, _k240;				\
457       union { double d; UDItype i; } _l240, _m240, _n240, _o240,	\
458 				       _p240, _q240, _r240, _s240;	\
459       UDItype _t240, _u240, _v240, _w240, _x240, _y240 = 0;		\
460 									\
461       _FP_STATIC_ASSERT ((wfracbits) >= 106 && (wfracbits) <= 120,	\
462 			 "wfracbits out of range");			\
463 									\
464       setfetz;								\
465 									\
466       _e240 = (double) (long) (X##_f0 & 0xffffff);			\
467       _j240 = (double) (long) (Y##_f0 & 0xffffff);			\
468       _d240 = (double) (long) ((X##_f0 >> 24) & 0xffffff);		\
469       _i240 = (double) (long) ((Y##_f0 >> 24) & 0xffffff);		\
470       _c240 = (double) (long) (((X##_f1 << 16) & 0xffffff) | (X##_f0 >> 48)); \
471       _h240 = (double) (long) (((Y##_f1 << 16) & 0xffffff) | (Y##_f0 >> 48)); \
472       _b240 = (double) (long) ((X##_f1 >> 8) & 0xffffff);		\
473       _g240 = (double) (long) ((Y##_f1 >> 8) & 0xffffff);		\
474       _a240 = (double) (long) (X##_f1 >> 32);				\
475       _f240 = (double) (long) (Y##_f1 >> 32);				\
476       _e240 *= _const[3];						\
477       _j240 *= _const[3];						\
478       _d240 *= _const[2];						\
479       _i240 *= _const[2];						\
480       _c240 *= _const[1];						\
481       _h240 *= _const[1];						\
482       _b240 *= _const[0];						\
483       _g240 *= _const[0];						\
484       _s240.d =							      _e240*_j240; \
485       _r240.d =						_d240*_j240 + _e240*_i240; \
486       _q240.d =				  _c240*_j240 + _d240*_i240 + _e240*_h240; \
487       _p240.d =		    _b240*_j240 + _c240*_i240 + _d240*_h240 + _e240*_g240; \
488       _o240.d = _a240*_j240 + _b240*_i240 + _c240*_h240 + _d240*_g240 + _e240*_f240; \
489       _n240.d = _a240*_i240 + _b240*_h240 + _c240*_g240 + _d240*_f240;	\
490       _m240.d = _a240*_h240 + _b240*_g240 + _c240*_f240;		\
491       _l240.d = _a240*_g240 + _b240*_f240;				\
492       _k240 =   _a240*_f240;						\
493       _r240.d += _s240.d;						\
494       _q240.d += _r240.d;						\
495       _p240.d += _q240.d;						\
496       _o240.d += _p240.d;						\
497       _n240.d += _o240.d;						\
498       _m240.d += _n240.d;						\
499       _l240.d += _m240.d;						\
500       _k240 += _l240.d;							\
501       _s240.d -= ((_const[10]+_s240.d)-_const[10]);			\
502       _r240.d -= ((_const[9]+_r240.d)-_const[9]);			\
503       _q240.d -= ((_const[8]+_q240.d)-_const[8]);			\
504       _p240.d -= ((_const[7]+_p240.d)-_const[7]);			\
505       _o240.d += _const[7];						\
506       _n240.d += _const[6];						\
507       _m240.d += _const[5];						\
508       _l240.d += _const[4];						\
509       if (_s240.d != 0.0)						\
510 	_y240 = 1;							\
511       if (_r240.d != 0.0)						\
512 	_y240 = 1;							\
513       if (_q240.d != 0.0)						\
514 	_y240 = 1;							\
515       if (_p240.d != 0.0)						\
516 	_y240 = 1;							\
517       _t240 = (DItype) _k240;						\
518       _u240 = _l240.i;							\
519       _v240 = _m240.i;							\
520       _w240 = _n240.i;							\
521       _x240 = _o240.i;							\
522       R##_f1 = ((_t240 << (128 - (wfracbits - 1)))			\
523 		| ((_u240 & 0xffffff) >> ((wfracbits - 1) - 104)));	\
524       R##_f0 = (((_u240 & 0xffffff) << (168 - (wfracbits - 1)))		\
525 		| ((_v240 & 0xffffff) << (144 - (wfracbits - 1)))	\
526 		| ((_w240 & 0xffffff) << (120 - (wfracbits - 1)))	\
527 		| ((_x240 & 0xffffff) >> ((wfracbits - 1) - 96))	\
528 		| _y240);						\
529       resetfe;								\
530     }									\
531   while (0)
532 
533 /* Division algorithms: */
534 
535 #define _FP_DIV_MEAT_2_udiv(fs, R, X, Y)				\
536   do									\
537     {									\
538       _FP_W_TYPE _FP_DIV_MEAT_2_udiv_n_f2;				\
539       _FP_W_TYPE _FP_DIV_MEAT_2_udiv_n_f1;				\
540       _FP_W_TYPE _FP_DIV_MEAT_2_udiv_n_f0;				\
541       _FP_W_TYPE _FP_DIV_MEAT_2_udiv_r_f1;				\
542       _FP_W_TYPE _FP_DIV_MEAT_2_udiv_r_f0;				\
543       _FP_W_TYPE _FP_DIV_MEAT_2_udiv_m_f1;				\
544       _FP_W_TYPE _FP_DIV_MEAT_2_udiv_m_f0;				\
545       if (_FP_FRAC_GE_2 (X, Y))						\
546 	{								\
547 	  _FP_DIV_MEAT_2_udiv_n_f2 = X##_f1 >> 1;			\
548 	  _FP_DIV_MEAT_2_udiv_n_f1					\
549 	    = X##_f1 << (_FP_W_TYPE_SIZE - 1) | X##_f0 >> 1;		\
550 	  _FP_DIV_MEAT_2_udiv_n_f0					\
551 	    = X##_f0 << (_FP_W_TYPE_SIZE - 1);				\
552 	}								\
553       else								\
554 	{								\
555 	  R##_e--;							\
556 	  _FP_DIV_MEAT_2_udiv_n_f2 = X##_f1;				\
557 	  _FP_DIV_MEAT_2_udiv_n_f1 = X##_f0;				\
558 	  _FP_DIV_MEAT_2_udiv_n_f0 = 0;					\
559 	}								\
560 									\
561       /* Normalize, i.e. make the most significant bit of the		\
562 	 denominator set.  */						\
563       _FP_FRAC_SLL_2 (Y, _FP_WFRACXBITS_##fs);				\
564 									\
565       udiv_qrnnd (R##_f1, _FP_DIV_MEAT_2_udiv_r_f1,			\
566 		  _FP_DIV_MEAT_2_udiv_n_f2, _FP_DIV_MEAT_2_udiv_n_f1,	\
567 		  Y##_f1);						\
568       umul_ppmm (_FP_DIV_MEAT_2_udiv_m_f1, _FP_DIV_MEAT_2_udiv_m_f0,	\
569 		 R##_f1, Y##_f0);					\
570       _FP_DIV_MEAT_2_udiv_r_f0 = _FP_DIV_MEAT_2_udiv_n_f0;		\
571       if (_FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m, _FP_DIV_MEAT_2_udiv_r))	\
572 	{								\
573 	  R##_f1--;							\
574 	  _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y,			\
575 			  _FP_DIV_MEAT_2_udiv_r);			\
576 	  if (_FP_FRAC_GE_2 (_FP_DIV_MEAT_2_udiv_r, Y)			\
577 	      && _FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m,			\
578 				_FP_DIV_MEAT_2_udiv_r))			\
579 	    {								\
580 	      R##_f1--;							\
581 	      _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y,			\
582 			      _FP_DIV_MEAT_2_udiv_r);			\
583 	    }								\
584 	}								\
585       _FP_FRAC_DEC_2 (_FP_DIV_MEAT_2_udiv_r, _FP_DIV_MEAT_2_udiv_m);	\
586 									\
587       if (_FP_DIV_MEAT_2_udiv_r_f1 == Y##_f1)				\
588 	{								\
589 	  /* This is a special case, not an optimization		\
590 	     (_FP_DIV_MEAT_2_udiv_r/Y##_f1 would not fit into UWtype).	\
591 	     As _FP_DIV_MEAT_2_udiv_r is guaranteed to be < Y,		\
592 	     R##_f0 can be either (UWtype)-1 or (UWtype)-2.  But as we	\
593 	     know what kind of bits it is (sticky, guard, round),	\
594 	     we don't care.  We also don't care what the reminder is,	\
595 	     because the guard bit will be set anyway.  -jj */		\
596 	  R##_f0 = -1;							\
597 	}								\
598       else								\
599 	{								\
600 	  udiv_qrnnd (R##_f0, _FP_DIV_MEAT_2_udiv_r_f1,			\
601 		      _FP_DIV_MEAT_2_udiv_r_f1,				\
602 		      _FP_DIV_MEAT_2_udiv_r_f0, Y##_f1);		\
603 	  umul_ppmm (_FP_DIV_MEAT_2_udiv_m_f1,				\
604 		     _FP_DIV_MEAT_2_udiv_m_f0, R##_f0, Y##_f0);		\
605 	  _FP_DIV_MEAT_2_udiv_r_f0 = 0;					\
606 	  if (_FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m,			\
607 			     _FP_DIV_MEAT_2_udiv_r))			\
608 	    {								\
609 	      R##_f0--;							\
610 	      _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y,			\
611 			      _FP_DIV_MEAT_2_udiv_r);			\
612 	      if (_FP_FRAC_GE_2 (_FP_DIV_MEAT_2_udiv_r, Y)		\
613 		  && _FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m,		\
614 				    _FP_DIV_MEAT_2_udiv_r))		\
615 		{							\
616 		  R##_f0--;						\
617 		  _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y,		\
618 				  _FP_DIV_MEAT_2_udiv_r);		\
619 		}							\
620 	    }								\
621 	  if (!_FP_FRAC_EQ_2 (_FP_DIV_MEAT_2_udiv_r,			\
622 			      _FP_DIV_MEAT_2_udiv_m))			\
623 	    R##_f0 |= _FP_WORK_STICKY;					\
624 	}								\
625     }									\
626   while (0)
627 
628 
629 /* Square root algorithms:
630    We have just one right now, maybe Newton approximation
631    should be added for those machines where division is fast.  */
632 
633 #define _FP_SQRT_MEAT_2(R, S, T, X, q)				\
634   do								\
635     {								\
636       while (q)							\
637 	{							\
638 	  T##_f1 = S##_f1 + (q);				\
639 	  if (T##_f1 <= X##_f1)					\
640 	    {							\
641 	      S##_f1 = T##_f1 + (q);				\
642 	      X##_f1 -= T##_f1;					\
643 	      R##_f1 += (q);					\
644 	    }							\
645 	  _FP_FRAC_SLL_2 (X, 1);				\
646 	  (q) >>= 1;						\
647 	}							\
648       (q) = (_FP_W_TYPE) 1 << (_FP_W_TYPE_SIZE - 1);		\
649       while ((q) != _FP_WORK_ROUND)				\
650 	{							\
651 	  T##_f0 = S##_f0 + (q);				\
652 	  T##_f1 = S##_f1;					\
653 	  if (T##_f1 < X##_f1					\
654 	      || (T##_f1 == X##_f1 && T##_f0 <= X##_f0))	\
655 	    {							\
656 	      S##_f0 = T##_f0 + (q);				\
657 	      S##_f1 += (T##_f0 > S##_f0);			\
658 	      _FP_FRAC_DEC_2 (X, T);				\
659 	      R##_f0 += (q);					\
660 	    }							\
661 	  _FP_FRAC_SLL_2 (X, 1);				\
662 	  (q) >>= 1;						\
663 	}							\
664       if (X##_f0 | X##_f1)					\
665 	{							\
666 	  if (S##_f1 < X##_f1					\
667 	      || (S##_f1 == X##_f1 && S##_f0 < X##_f0))		\
668 	    R##_f0 |= _FP_WORK_ROUND;				\
669 	  R##_f0 |= _FP_WORK_STICKY;				\
670 	}							\
671     }								\
672   while (0)
673 
674 
675 /* Assembly/disassembly for converting to/from integral types.
676    No shifting or overflow handled here.  */
677 
678 #define _FP_FRAC_ASSEMBLE_2(r, X, rsize)	\
679   (void) (((rsize) <= _FP_W_TYPE_SIZE)		\
680 	  ? ({ (r) = X##_f0; })			\
681 	  : ({					\
682 	      (r) = X##_f1;			\
683 	      (r) <<= _FP_W_TYPE_SIZE;		\
684 	      (r) += X##_f0;			\
685 	    }))
686 
687 #define _FP_FRAC_DISASSEMBLE_2(X, r, rsize)	\
688   do						\
689     {						\
690       X##_f0 = (r);				\
691       X##_f1 = ((rsize) <= _FP_W_TYPE_SIZE	\
692 		? 0				\
693 		: (r) >> _FP_W_TYPE_SIZE);	\
694     }						\
695   while (0)
696 
697 /* Convert FP values between word sizes.  */
698 
699 #define _FP_FRAC_COPY_1_2(D, S)		(D##_f = S##_f0)
700 
701 #define _FP_FRAC_COPY_2_1(D, S)		((D##_f0 = S##_f), (D##_f1 = 0))
702 
703 #define _FP_FRAC_COPY_2_2(D, S)		_FP_FRAC_COPY_2 (D, S)
704 
705 #endif /* !SOFT_FP_OP_2_H */
706