1 
2 /*============================================================================
3 
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5 Package, Release 2b.
6 
7 Written by John R. Hauser.  This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704.  Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980.  The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15 arithmetic/SoftFloat.html'.
16 
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25 
26 Derivative works are acceptable, even for commercial purposes, so long as
27 (1) the source code for the derivative work includes prominent notice that
28 the work is derivative, and (2) the source code includes prominent notice with
29 these four paragraphs for those parts of this code that are retained.
30 
31 =============================================================================*/
32 
33 #include "milieu.h"
34 #include "softfloat.h"
35 
36 /*----------------------------------------------------------------------------
37 | Floating-point rounding mode, extended double-precision rounding precision,
38 | and exception flags.
39 *----------------------------------------------------------------------------*/
40 int8 float_exception_flags = 0;
41 #ifdef FLOATX80
42 int8 floatx80_rounding_precision = 80;
43 #endif
44 
45 int8 float_rounding_mode = float_round_nearest_even;
46 
47 /*----------------------------------------------------------------------------
48 | Functions and definitions to determine:  (1) whether tininess for underflow
49 | is detected before or after rounding by default, (2) what (if anything)
50 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
51 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
52 | are propagated from function inputs to output.  These details are target-
53 | specific.
54 *----------------------------------------------------------------------------*/
55 #include "softfloat-specialize"
56 
57 /*----------------------------------------------------------------------------
58 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
59 | and 7, and returns the properly rounded 32-bit integer corresponding to the
60 | input.  If `zSign' is 1, the input is negated before being converted to an
61 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
62 | is simply rounded to an integer, with the inexact exception raised if the
63 | input cannot be represented exactly as an integer.  However, if the fixed-
64 | point input is too large, the invalid exception is raised and the largest
65 | positive or negative integer is returned.
66 *----------------------------------------------------------------------------*/
67 
roundAndPackInt32(flag zSign,bits64 absZ)68 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
69 {
70 	int8 roundingMode;
71 	flag roundNearestEven;
72 	int8 roundIncrement, roundBits;
73 	int32 z;
74 
75 	roundingMode = float_rounding_mode;
76 	roundNearestEven = ( roundingMode == float_round_nearest_even );
77 	roundIncrement = 0x40;
78 	if ( ! roundNearestEven ) {
79 		if ( roundingMode == float_round_to_zero ) {
80 			roundIncrement = 0;
81 		}
82 		else {
83 			roundIncrement = 0x7F;
84 			if ( zSign ) {
85 				if ( roundingMode == float_round_up ) roundIncrement = 0;
86 			}
87 			else {
88 				if ( roundingMode == float_round_down ) roundIncrement = 0;
89 			}
90 		}
91 	}
92 	roundBits = absZ & 0x7F;
93 	absZ = ( absZ + roundIncrement )>>7;
94 	absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
95 	z = absZ;
96 	if ( zSign ) z = - z;
97 	if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
98 		float_raise( float_flag_invalid );
99 		return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
100 	}
101 	if ( roundBits ) float_exception_flags |= float_flag_inexact;
102 	return z;
103 
104 }
105 
106 /*----------------------------------------------------------------------------
107 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
108 | `absZ1', with binary point between bits 63 and 64 (between the input words),
109 | and returns the properly rounded 64-bit integer corresponding to the input.
110 | If `zSign' is 1, the input is negated before being converted to an integer.
111 | Ordinarily, the fixed-point input is simply rounded to an integer, with
112 | the inexact exception raised if the input cannot be represented exactly as
113 | an integer.  However, if the fixed-point input is too large, the invalid
114 | exception is raised and the largest positive or negative integer is
115 | returned.
116 *----------------------------------------------------------------------------*/
117 
roundAndPackInt64(flag zSign,bits64 absZ0,bits64 absZ1)118 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
119 {
120 	int8 roundingMode;
121 	flag roundNearestEven, increment;
122 	int64 z;
123 
124 	roundingMode = float_rounding_mode;
125 	roundNearestEven = ( roundingMode == float_round_nearest_even );
126 	increment = ( (sbits64) absZ1 < 0 );
127 	if ( ! roundNearestEven ) {
128 		if ( roundingMode == float_round_to_zero ) {
129 			increment = 0;
130 		}
131 		else {
132 			if ( zSign ) {
133 				increment = ( roundingMode == float_round_down ) && absZ1;
134 			}
135 			else {
136 				increment = ( roundingMode == float_round_up ) && absZ1;
137 			}
138 		}
139 	}
140 	if ( increment ) {
141 		++absZ0;
142 		if ( absZ0 == 0 ) goto overflow;
143 		absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
144 	}
145 	z = absZ0;
146 	if ( zSign ) z = - z;
147 	if ( z && ( ( z < 0 ) ^ zSign ) ) {
148 	overflow:
149 		float_raise( float_flag_invalid );
150 		return
151 				zSign ? (sbits64) LIT64( 0x8000000000000000 )
152 			: LIT64( 0x7FFFFFFFFFFFFFFF );
153 	}
154 	if ( absZ1 ) float_exception_flags |= float_flag_inexact;
155 	return z;
156 
157 }
158 
159 /*----------------------------------------------------------------------------
160 | Returns the fraction bits of the single-precision floating-point value `a'.
161 *----------------------------------------------------------------------------*/
162 
extractFloat32Frac(float32 a)163 INLINE bits32 extractFloat32Frac( float32 a )
164 {
165 	return a & 0x007FFFFF;
166 
167 }
168 
169 /*----------------------------------------------------------------------------
170 | Returns the exponent bits of the single-precision floating-point value `a'.
171 *----------------------------------------------------------------------------*/
172 
extractFloat32Exp(float32 a)173 INLINE int16 extractFloat32Exp( float32 a )
174 {
175 	return ( a>>23 ) & 0xFF;
176 
177 }
178 
179 /*----------------------------------------------------------------------------
180 | Returns the sign bit of the single-precision floating-point value `a'.
181 *----------------------------------------------------------------------------*/
182 
extractFloat32Sign(float32 a)183 INLINE flag extractFloat32Sign( float32 a )
184 {
185 	return a>>31;
186 
187 }
188 
189 /*----------------------------------------------------------------------------
190 | Normalizes the subnormal single-precision floating-point value represented
191 | by the denormalized significand `aSig'.  The normalized exponent and
192 | significand are stored at the locations pointed to by `zExpPtr' and
193 | `zSigPtr', respectively.
194 *----------------------------------------------------------------------------*/
195 
196 static void
normalizeFloat32Subnormal(bits32 aSig,int16 * zExpPtr,bits32 * zSigPtr)197 	normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
198 {
199 	int8 shiftCount;
200 
201 	shiftCount = countLeadingZeros32( aSig ) - 8;
202 	*zSigPtr = aSig<<shiftCount;
203 	*zExpPtr = 1 - shiftCount;
204 
205 }
206 
207 /*----------------------------------------------------------------------------
208 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
209 | single-precision floating-point value, returning the result.  After being
210 | shifted into the proper positions, the three fields are simply added
211 | together to form the result.  This means that any integer portion of `zSig'
212 | will be added into the exponent.  Since a properly normalized significand
213 | will have an integer portion equal to 1, the `zExp' input should be 1 less
214 | than the desired result exponent whenever `zSig' is a complete, normalized
215 | significand.
216 *----------------------------------------------------------------------------*/
217 
packFloat32(flag zSign,int16 zExp,bits32 zSig)218 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
219 {
220 	return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
221 
222 }
223 
224 /*----------------------------------------------------------------------------
225 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
226 | and significand `zSig', and returns the proper single-precision floating-
227 | point value corresponding to the abstract input.  Ordinarily, the abstract
228 | value is simply rounded and packed into the single-precision format, with
229 | the inexact exception raised if the abstract input cannot be represented
230 | exactly.  However, if the abstract value is too large, the overflow and
231 | inexact exceptions are raised and an infinity or maximal finite value is
232 | returned.  If the abstract value is too small, the input value is rounded to
233 | a subnormal number, and the underflow and inexact exceptions are raised if
234 | the abstract input cannot be represented exactly as a subnormal single-
235 | precision floating-point number.
236 |     The input significand `zSig' has its binary point between bits 30
237 | and 29, which is 7 bits to the left of the usual location.  This shifted
238 | significand must be normalized or smaller.  If `zSig' is not normalized,
239 | `zExp' must be 0; in that case, the result returned is a subnormal number,
240 | and it must not require rounding.  In the usual case that `zSig' is
241 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
242 | The handling of underflow and overflow follows the IEC/IEEE Standard for
243 | Binary Floating-Point Arithmetic.
244 *----------------------------------------------------------------------------*/
245 
roundAndPackFloat32(flag zSign,int16 zExp,bits32 zSig)246 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
247 {
248 	int8 roundingMode;
249 	flag roundNearestEven;
250 	int8 roundIncrement, roundBits;
251 	flag isTiny;
252 
253 	roundingMode = float_rounding_mode;
254 	roundNearestEven = ( roundingMode == float_round_nearest_even );
255 	roundIncrement = 0x40;
256 	if ( ! roundNearestEven ) {
257 		if ( roundingMode == float_round_to_zero ) {
258 			roundIncrement = 0;
259 		}
260 		else {
261 			roundIncrement = 0x7F;
262 			if ( zSign ) {
263 				if ( roundingMode == float_round_up ) roundIncrement = 0;
264 			}
265 			else {
266 				if ( roundingMode == float_round_down ) roundIncrement = 0;
267 			}
268 		}
269 	}
270 	roundBits = zSig & 0x7F;
271 	if ( 0xFD <= (bits16) zExp ) {
272 		if (    ( 0xFD < zExp )
273 				|| (    ( zExp == 0xFD )
274 					&& ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
275 			) {
276 			float_raise( float_flag_overflow | float_flag_inexact );
277 			return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
278 		}
279 		if ( zExp < 0 ) {
280 			isTiny =
281 					( float_detect_tininess == float_tininess_before_rounding )
282 				|| ( zExp < -1 )
283 				|| ( zSig + roundIncrement < 0x80000000 );
284 			shift32RightJamming( zSig, - zExp, &zSig );
285 			zExp = 0;
286 			roundBits = zSig & 0x7F;
287 			if ( isTiny && roundBits ) float_raise( float_flag_underflow );
288 		}
289 	}
290 	if ( roundBits ) float_exception_flags |= float_flag_inexact;
291 	zSig = ( zSig + roundIncrement )>>7;
292 	zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
293 	if ( zSig == 0 ) zExp = 0;
294 	return packFloat32( zSign, zExp, zSig );
295 
296 }
297 
298 /*----------------------------------------------------------------------------
299 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
300 | and significand `zSig', and returns the proper single-precision floating-
301 | point value corresponding to the abstract input.  This routine is just like
302 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
303 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
304 | floating-point exponent.
305 *----------------------------------------------------------------------------*/
306 
307 static float32
normalizeRoundAndPackFloat32(flag zSign,int16 zExp,bits32 zSig)308 	normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
309 {
310 	int8 shiftCount;
311 
312 	shiftCount = countLeadingZeros32( zSig ) - 1;
313 	return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
314 
315 }
316 
317 /*----------------------------------------------------------------------------
318 | Returns the fraction bits of the double-precision floating-point value `a'.
319 *----------------------------------------------------------------------------*/
320 
extractFloat64Frac(float64 a)321 INLINE bits64 extractFloat64Frac( float64 a )
322 {
323 	return a & LIT64( 0x000FFFFFFFFFFFFF );
324 
325 }
326 
327 /*----------------------------------------------------------------------------
328 | Returns the exponent bits of the double-precision floating-point value `a'.
329 *----------------------------------------------------------------------------*/
330 
extractFloat64Exp(float64 a)331 INLINE int16 extractFloat64Exp( float64 a )
332 {
333 	return ( a>>52 ) & 0x7FF;
334 
335 }
336 
337 /*----------------------------------------------------------------------------
338 | Returns the sign bit of the double-precision floating-point value `a'.
339 *----------------------------------------------------------------------------*/
340 
extractFloat64Sign(float64 a)341 INLINE flag extractFloat64Sign( float64 a )
342 {
343 	return a>>63;
344 
345 }
346 
347 /*----------------------------------------------------------------------------
348 | Normalizes the subnormal double-precision floating-point value represented
349 | by the denormalized significand `aSig'.  The normalized exponent and
350 | significand are stored at the locations pointed to by `zExpPtr' and
351 | `zSigPtr', respectively.
352 *----------------------------------------------------------------------------*/
353 
354 static void
normalizeFloat64Subnormal(bits64 aSig,int16 * zExpPtr,bits64 * zSigPtr)355 	normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
356 {
357 	int8 shiftCount;
358 
359 	shiftCount = countLeadingZeros64( aSig ) - 11;
360 	*zSigPtr = aSig<<shiftCount;
361 	*zExpPtr = 1 - shiftCount;
362 
363 }
364 
365 /*----------------------------------------------------------------------------
366 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
367 | double-precision floating-point value, returning the result.  After being
368 | shifted into the proper positions, the three fields are simply added
369 | together to form the result.  This means that any integer portion of `zSig'
370 | will be added into the exponent.  Since a properly normalized significand
371 | will have an integer portion equal to 1, the `zExp' input should be 1 less
372 | than the desired result exponent whenever `zSig' is a complete, normalized
373 | significand.
374 *----------------------------------------------------------------------------*/
375 
packFloat64(flag zSign,int16 zExp,bits64 zSig)376 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
377 {
378 	return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
379 
380 }
381 
382 /*----------------------------------------------------------------------------
383 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
384 | and significand `zSig', and returns the proper double-precision floating-
385 | point value corresponding to the abstract input.  Ordinarily, the abstract
386 | value is simply rounded and packed into the double-precision format, with
387 | the inexact exception raised if the abstract input cannot be represented
388 | exactly.  However, if the abstract value is too large, the overflow and
389 | inexact exceptions are raised and an infinity or maximal finite value is
390 | returned.  If the abstract value is too small, the input value is rounded
391 | to a subnormal number, and the underflow and inexact exceptions are raised
392 | if the abstract input cannot be represented exactly as a subnormal double-
393 | precision floating-point number.
394 |     The input significand `zSig' has its binary point between bits 62
395 | and 61, which is 10 bits to the left of the usual location.  This shifted
396 | significand must be normalized or smaller.  If `zSig' is not normalized,
397 | `zExp' must be 0; in that case, the result returned is a subnormal number,
398 | and it must not require rounding.  In the usual case that `zSig' is
399 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
400 | The handling of underflow and overflow follows the IEC/IEEE Standard for
401 | Binary Floating-Point Arithmetic.
402 *----------------------------------------------------------------------------*/
403 
roundAndPackFloat64(flag zSign,int16 zExp,bits64 zSig)404 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
405 {
406 	int8 roundingMode;
407 	flag roundNearestEven;
408 	int16 roundIncrement, roundBits;
409 	flag isTiny;
410 
411 	roundingMode = float_rounding_mode;
412 	roundNearestEven = ( roundingMode == float_round_nearest_even );
413 	roundIncrement = 0x200;
414 	if ( ! roundNearestEven ) {
415 		if ( roundingMode == float_round_to_zero ) {
416 			roundIncrement = 0;
417 		}
418 		else {
419 			roundIncrement = 0x3FF;
420 			if ( zSign ) {
421 				if ( roundingMode == float_round_up ) roundIncrement = 0;
422 			}
423 			else {
424 				if ( roundingMode == float_round_down ) roundIncrement = 0;
425 			}
426 		}
427 	}
428 	roundBits = zSig & 0x3FF;
429 	if ( 0x7FD <= (bits16) zExp ) {
430 		if (    ( 0x7FD < zExp )
431 				|| (    ( zExp == 0x7FD )
432 					&& ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
433 			) {
434 			float_raise( float_flag_overflow | float_flag_inexact );
435 			return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
436 		}
437 		if ( zExp < 0 ) {
438 			isTiny =
439 					( float_detect_tininess == float_tininess_before_rounding )
440 				|| ( zExp < -1 )
441 				|| ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
442 			shift64RightJamming( zSig, - zExp, &zSig );
443 			zExp = 0;
444 			roundBits = zSig & 0x3FF;
445 			if ( isTiny && roundBits ) float_raise( float_flag_underflow );
446 		}
447 	}
448 	if ( roundBits ) float_exception_flags |= float_flag_inexact;
449 	zSig = ( zSig + roundIncrement )>>10;
450 	zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
451 	if ( zSig == 0 ) zExp = 0;
452 	return packFloat64( zSign, zExp, zSig );
453 
454 }
455 
456 /*----------------------------------------------------------------------------
457 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
458 | and significand `zSig', and returns the proper double-precision floating-
459 | point value corresponding to the abstract input.  This routine is just like
460 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
461 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
462 | floating-point exponent.
463 *----------------------------------------------------------------------------*/
464 
465 static float64
normalizeRoundAndPackFloat64(flag zSign,int16 zExp,bits64 zSig)466 	normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
467 {
468 	int8 shiftCount;
469 
470 	shiftCount = countLeadingZeros64( zSig ) - 1;
471 	return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
472 
473 }
474 
475 #ifdef FLOATX80
476 
477 /*----------------------------------------------------------------------------
478 | Normalizes the subnormal extended double-precision floating-point value
479 | represented by the denormalized significand `aSig'.  The normalized exponent
480 | and significand are stored at the locations pointed to by `zExpPtr' and
481 | `zSigPtr', respectively.
482 *----------------------------------------------------------------------------*/
483 
484 static void
normalizeFloatx80Subnormal(bits64 aSig,int32 * zExpPtr,bits64 * zSigPtr)485 	normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
486 {
487 	int8 shiftCount;
488 
489 	shiftCount = countLeadingZeros64( aSig );
490 	*zSigPtr = aSig<<shiftCount;
491 	*zExpPtr = 1 - shiftCount;
492 
493 }
494 
495 /*----------------------------------------------------------------------------
496 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
497 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
498 | and returns the proper extended double-precision floating-point value
499 | corresponding to the abstract input.  Ordinarily, the abstract value is
500 | rounded and packed into the extended double-precision format, with the
501 | inexact exception raised if the abstract input cannot be represented
502 | exactly.  However, if the abstract value is too large, the overflow and
503 | inexact exceptions are raised and an infinity or maximal finite value is
504 | returned.  If the abstract value is too small, the input value is rounded to
505 | a subnormal number, and the underflow and inexact exceptions are raised if
506 | the abstract input cannot be represented exactly as a subnormal extended
507 | double-precision floating-point number.
508 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
509 | number of bits as single or double precision, respectively.  Otherwise, the
510 | result is rounded to the full precision of the extended double-precision
511 | format.
512 |     The input significand must be normalized or smaller.  If the input
513 | significand is not normalized, `zExp' must be 0; in that case, the result
514 | returned is a subnormal number, and it must not require rounding.  The
515 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
516 | Floating-Point Arithmetic.
517 *----------------------------------------------------------------------------*/
518 
519 // roundAndPackFloatx80 is now also used in fyl2x.c
520 
521 /* static */ floatx80
roundAndPackFloatx80(int8 roundingPrecision,flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)522 	roundAndPackFloatx80(
523 		int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
524 	)
525 {
526 	int8 roundingMode;
527 	flag roundNearestEven, increment, isTiny;
528 	int64 roundIncrement, roundMask, roundBits;
529 
530 	roundingMode = float_rounding_mode;
531 	roundNearestEven = ( roundingMode == float_round_nearest_even );
532 	if ( roundingPrecision == 80 ) goto precision80;
533 	if ( roundingPrecision == 64 ) {
534 		roundIncrement = LIT64( 0x0000000000000400 );
535 		roundMask = LIT64( 0x00000000000007FF );
536 	}
537 	else if ( roundingPrecision == 32 ) {
538 		roundIncrement = LIT64( 0x0000008000000000 );
539 		roundMask = LIT64( 0x000000FFFFFFFFFF );
540 	}
541 	else {
542 		goto precision80;
543 	}
544 	zSig0 |= ( zSig1 != 0 );
545 	if ( ! roundNearestEven ) {
546 		if ( roundingMode == float_round_to_zero ) {
547 			roundIncrement = 0;
548 		}
549 		else {
550 			roundIncrement = roundMask;
551 			if ( zSign ) {
552 				if ( roundingMode == float_round_up ) roundIncrement = 0;
553 			}
554 			else {
555 				if ( roundingMode == float_round_down ) roundIncrement = 0;
556 			}
557 		}
558 	}
559 	roundBits = zSig0 & roundMask;
560 	if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
561 		if (    ( 0x7FFE < zExp )
562 				|| ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
563 			) {
564 			goto overflow;
565 		}
566 		if ( zExp <= 0 ) {
567 			isTiny =
568 					( float_detect_tininess == float_tininess_before_rounding )
569 				|| ( zExp < 0 )
570 				|| ( zSig0 <= zSig0 + roundIncrement );
571 			shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
572 			zExp = 0;
573 			roundBits = zSig0 & roundMask;
574 			if ( isTiny && roundBits ) float_raise( float_flag_underflow );
575 			if ( roundBits ) float_exception_flags |= float_flag_inexact;
576 			zSig0 += roundIncrement;
577 			if ( (sbits64) zSig0 < 0 ) zExp = 1;
578 			roundIncrement = roundMask + 1;
579 			if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
580 				roundMask |= roundIncrement;
581 			}
582 			zSig0 &= ~ roundMask;
583 			return packFloatx80( zSign, zExp, zSig0 );
584 		}
585 	}
586 	if ( roundBits ) float_exception_flags |= float_flag_inexact;
587 	zSig0 += roundIncrement;
588 	if ( zSig0 < roundIncrement ) {
589 		++zExp;
590 		zSig0 = LIT64( 0x8000000000000000 );
591 	}
592 	roundIncrement = roundMask + 1;
593 	if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
594 		roundMask |= roundIncrement;
595 	}
596 	zSig0 &= ~ roundMask;
597 	if ( zSig0 == 0 ) zExp = 0;
598 	return packFloatx80( zSign, zExp, zSig0 );
599 	precision80:
600 	increment = ( (sbits64) zSig1 < 0 );
601 	if ( ! roundNearestEven ) {
602 		if ( roundingMode == float_round_to_zero ) {
603 			increment = 0;
604 		}
605 		else {
606 			if ( zSign ) {
607 				increment = ( roundingMode == float_round_down ) && zSig1;
608 			}
609 			else {
610 				increment = ( roundingMode == float_round_up ) && zSig1;
611 			}
612 		}
613 	}
614 	if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
615 		if (    ( 0x7FFE < zExp )
616 				|| (    ( zExp == 0x7FFE )
617 					&& ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
618 					&& increment
619 				)
620 			) {
621 			roundMask = 0;
622 	overflow:
623 			float_raise( float_flag_overflow | float_flag_inexact );
624 			if (    ( roundingMode == float_round_to_zero )
625 					|| ( zSign && ( roundingMode == float_round_up ) )
626 					|| ( ! zSign && ( roundingMode == float_round_down ) )
627 				) {
628 				return packFloatx80( zSign, 0x7FFE, ~ roundMask );
629 			}
630 			return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
631 		}
632 		if ( zExp <= 0 ) {
633 			isTiny =
634 					( float_detect_tininess == float_tininess_before_rounding )
635 				|| ( zExp < 0 )
636 				|| ! increment
637 				|| ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
638 			shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
639 			zExp = 0;
640 			if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
641 			if ( zSig1 ) float_exception_flags |= float_flag_inexact;
642 			if ( roundNearestEven ) {
643 				increment = ( (sbits64) zSig1 < 0 );
644 			}
645 			else {
646 				if ( zSign ) {
647 					increment = ( roundingMode == float_round_down ) && zSig1;
648 				}
649 				else {
650 					increment = ( roundingMode == float_round_up ) && zSig1;
651 				}
652 			}
653 			if ( increment ) {
654 				++zSig0;
655 				zSig0 &=
656 					~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
657 				if ( (sbits64) zSig0 < 0 ) zExp = 1;
658 			}
659 			return packFloatx80( zSign, zExp, zSig0 );
660 		}
661 	}
662 	if ( zSig1 ) float_exception_flags |= float_flag_inexact;
663 	if ( increment ) {
664 		++zSig0;
665 		if ( zSig0 == 0 ) {
666 			++zExp;
667 			zSig0 = LIT64( 0x8000000000000000 );
668 		}
669 		else {
670 			zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
671 		}
672 	}
673 	else {
674 		if ( zSig0 == 0 ) zExp = 0;
675 	}
676 	return packFloatx80( zSign, zExp, zSig0 );
677 
678 }
679 
680 /*----------------------------------------------------------------------------
681 | Takes an abstract floating-point value having sign `zSign', exponent
682 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
683 | and returns the proper extended double-precision floating-point value
684 | corresponding to the abstract input.  This routine is just like
685 | `roundAndPackFloatx80' except that the input significand does not have to be
686 | normalized.
687 *----------------------------------------------------------------------------*/
688 
689 static floatx80
normalizeRoundAndPackFloatx80(int8 roundingPrecision,flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)690 	normalizeRoundAndPackFloatx80(
691 		int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
692 	)
693 {
694 	int8 shiftCount;
695 
696 	if ( zSig0 == 0 ) {
697 		zSig0 = zSig1;
698 		zSig1 = 0;
699 		zExp -= 64;
700 	}
701 	shiftCount = countLeadingZeros64( zSig0 );
702 	shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
703 	zExp -= shiftCount;
704 	return
705 		roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
706 
707 }
708 
709 #endif
710 
711 #ifdef FLOAT128
712 
713 /*----------------------------------------------------------------------------
714 | Returns the least-significant 64 fraction bits of the quadruple-precision
715 | floating-point value `a'.
716 *----------------------------------------------------------------------------*/
717 
extractFloat128Frac1(float128 a)718 INLINE bits64 extractFloat128Frac1( float128 a )
719 {
720 	return a.low;
721 
722 }
723 
724 /*----------------------------------------------------------------------------
725 | Returns the most-significant 48 fraction bits of the quadruple-precision
726 | floating-point value `a'.
727 *----------------------------------------------------------------------------*/
728 
extractFloat128Frac0(float128 a)729 INLINE bits64 extractFloat128Frac0( float128 a )
730 {
731 	return a.high & LIT64( 0x0000FFFFFFFFFFFF );
732 
733 }
734 
735 /*----------------------------------------------------------------------------
736 | Returns the exponent bits of the quadruple-precision floating-point value
737 | `a'.
738 *----------------------------------------------------------------------------*/
739 
extractFloat128Exp(float128 a)740 INLINE int32 extractFloat128Exp( float128 a )
741 {
742 	return ( a.high>>48 ) & 0x7FFF;
743 
744 }
745 
746 /*----------------------------------------------------------------------------
747 | Returns the sign bit of the quadruple-precision floating-point value `a'.
748 *----------------------------------------------------------------------------*/
749 
extractFloat128Sign(float128 a)750 INLINE flag extractFloat128Sign( float128 a )
751 {
752 	return a.high>>63;
753 
754 }
755 
756 /*----------------------------------------------------------------------------
757 | Normalizes the subnormal quadruple-precision floating-point value
758 | represented by the denormalized significand formed by the concatenation of
759 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
760 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
761 | significand are stored at the location pointed to by `zSig0Ptr', and the
762 | least significant 64 bits of the normalized significand are stored at the
763 | location pointed to by `zSig1Ptr'.
764 *----------------------------------------------------------------------------*/
765 
766 static void
normalizeFloat128Subnormal(bits64 aSig0,bits64 aSig1,int32 * zExpPtr,bits64 * zSig0Ptr,bits64 * zSig1Ptr)767 	normalizeFloat128Subnormal(
768 		bits64 aSig0,
769 		bits64 aSig1,
770 		int32 *zExpPtr,
771 		bits64 *zSig0Ptr,
772 		bits64 *zSig1Ptr
773 	)
774 {
775 	int8 shiftCount;
776 
777 	if ( aSig0 == 0 ) {
778 		shiftCount = countLeadingZeros64( aSig1 ) - 15;
779 		if ( shiftCount < 0 ) {
780 			*zSig0Ptr = aSig1>>( - shiftCount );
781 			*zSig1Ptr = aSig1<<( shiftCount & 63 );
782 		}
783 		else {
784 			*zSig0Ptr = aSig1<<shiftCount;
785 			*zSig1Ptr = 0;
786 		}
787 		*zExpPtr = - shiftCount - 63;
788 	}
789 	else {
790 		shiftCount = countLeadingZeros64( aSig0 ) - 15;
791 		shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
792 		*zExpPtr = 1 - shiftCount;
793 	}
794 
795 }
796 
797 #endif
798 
799 /*----------------------------------------------------------------------------
800 | Returns the result of converting the 32-bit two's complement integer `a'
801 | to the single-precision floating-point format.  The conversion is performed
802 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
803 *----------------------------------------------------------------------------*/
804 
int32_to_float32(int32 a)805 float32 int32_to_float32( int32 a )
806 {
807 	flag zSign;
808 
809 	if ( a == 0 ) return 0;
810 	if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
811 	zSign = ( a < 0 );
812 	return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
813 
814 }
815 
816 /*----------------------------------------------------------------------------
817 | Returns the result of converting the 32-bit two's complement integer `a'
818 | to the double-precision floating-point format.  The conversion is performed
819 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
820 *----------------------------------------------------------------------------*/
821 
int32_to_float64(int32 a)822 float64 int32_to_float64( int32 a )
823 {
824 	flag zSign;
825 	uint32 absA;
826 	int8 shiftCount;
827 	bits64 zSig;
828 
829 	if ( a == 0 ) return 0;
830 	zSign = ( a < 0 );
831 	absA = zSign ? - a : a;
832 	shiftCount = countLeadingZeros32( absA ) + 21;
833 	zSig = absA;
834 	return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
835 
836 }
837 
838 #ifdef FLOATX80
839 
840 /*----------------------------------------------------------------------------
841 | Returns the result of converting the 32-bit two's complement integer `a'
842 | to the extended double-precision floating-point format.  The conversion
843 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
844 | Arithmetic.
845 *----------------------------------------------------------------------------*/
846 
int32_to_floatx80(int32 a)847 floatx80 int32_to_floatx80( int32 a )
848 {
849 	flag zSign;
850 	uint32 absA;
851 	int8 shiftCount;
852 	bits64 zSig;
853 
854 	if ( a == 0 ) return packFloatx80( 0, 0, 0 );
855 	zSign = ( a < 0 );
856 	absA = zSign ? - a : a;
857 	shiftCount = countLeadingZeros32( absA ) + 32;
858 	zSig = absA;
859 	return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
860 
861 }
862 
863 #endif
864 
865 #ifdef FLOAT128
866 
867 /*----------------------------------------------------------------------------
868 | Returns the result of converting the 32-bit two's complement integer `a' to
869 | the quadruple-precision floating-point format.  The conversion is performed
870 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
871 *----------------------------------------------------------------------------*/
872 
int32_to_float128(int32 a)873 float128 int32_to_float128( int32 a )
874 {
875 	flag zSign;
876 	uint32 absA;
877 	int8 shiftCount;
878 	bits64 zSig0;
879 
880 	if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
881 	zSign = ( a < 0 );
882 	absA = zSign ? - a : a;
883 	shiftCount = countLeadingZeros32( absA ) + 17;
884 	zSig0 = absA;
885 	return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
886 
887 }
888 
889 #endif
890 
891 /*----------------------------------------------------------------------------
892 | Returns the result of converting the 64-bit two's complement integer `a'
893 | to the single-precision floating-point format.  The conversion is performed
894 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
895 *----------------------------------------------------------------------------*/
896 
int64_to_float32(int64 a)897 float32 int64_to_float32( int64 a )
898 {
899 	flag zSign;
900 	uint64 absA;
901 	int8 shiftCount;
902 //    bits32 zSig;
903 
904 	if ( a == 0 ) return 0;
905 	zSign = ( a < 0 );
906 	absA = zSign ? - a : a;
907 	shiftCount = countLeadingZeros64( absA ) - 40;
908 	if ( 0 <= shiftCount ) {
909 		return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
910 	}
911 	else {
912 		shiftCount += 7;
913 		if ( shiftCount < 0 ) {
914 			shift64RightJamming( absA, - shiftCount, &absA );
915 		}
916 		else {
917 			absA <<= shiftCount;
918 		}
919 		return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
920 	}
921 
922 }
923 
924 /*----------------------------------------------------------------------------
925 | Returns the result of converting the 64-bit two's complement integer `a'
926 | to the double-precision floating-point format.  The conversion is performed
927 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
928 *----------------------------------------------------------------------------*/
929 
int64_to_float64(int64 a)930 float64 int64_to_float64( int64 a )
931 {
932 	flag zSign;
933 
934 	if ( a == 0 ) return 0;
935 	if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
936 		return packFloat64( 1, 0x43E, 0 );
937 	}
938 	zSign = ( a < 0 );
939 	return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
940 
941 }
942 
943 #ifdef FLOATX80
944 
945 /*----------------------------------------------------------------------------
946 | Returns the result of converting the 64-bit two's complement integer `a'
947 | to the extended double-precision floating-point format.  The conversion
948 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
949 | Arithmetic.
950 *----------------------------------------------------------------------------*/
951 
int64_to_floatx80(int64 a)952 floatx80 int64_to_floatx80( int64 a )
953 {
954 	flag zSign;
955 	uint64 absA;
956 	int8 shiftCount;
957 
958 	if ( a == 0 ) return packFloatx80( 0, 0, 0 );
959 	zSign = ( a < 0 );
960 	absA = zSign ? - a : a;
961 	shiftCount = countLeadingZeros64( absA );
962 	return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
963 
964 }
965 
966 #endif
967 
968 #ifdef FLOAT128
969 
970 /*----------------------------------------------------------------------------
971 | Returns the result of converting the 64-bit two's complement integer `a' to
972 | the quadruple-precision floating-point format.  The conversion is performed
973 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
974 *----------------------------------------------------------------------------*/
975 
int64_to_float128(int64 a)976 float128 int64_to_float128( int64 a )
977 {
978 	flag zSign;
979 	uint64 absA;
980 	int8 shiftCount;
981 	int32 zExp;
982 	bits64 zSig0, zSig1;
983 
984 	if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
985 	zSign = ( a < 0 );
986 	absA = zSign ? - a : a;
987 	shiftCount = countLeadingZeros64( absA ) + 49;
988 	zExp = 0x406E - shiftCount;
989 	if ( 64 <= shiftCount ) {
990 		zSig1 = 0;
991 		zSig0 = absA;
992 		shiftCount -= 64;
993 	}
994 	else {
995 		zSig1 = absA;
996 		zSig0 = 0;
997 	}
998 	shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
999 	return packFloat128( zSign, zExp, zSig0, zSig1 );
1000 
1001 }
1002 
1003 #endif
1004 
1005 /*----------------------------------------------------------------------------
1006 | Returns the result of converting the single-precision floating-point value
1007 | `a' to the 32-bit two's complement integer format.  The conversion is
1008 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1009 | Arithmetic---which means in particular that the conversion is rounded
1010 | according to the current rounding mode.  If `a' is a NaN, the largest
1011 | positive integer is returned.  Otherwise, if the conversion overflows, the
1012 | largest integer with the same sign as `a' is returned.
1013 *----------------------------------------------------------------------------*/
1014 
float32_to_int32(float32 a)1015 int32 float32_to_int32( float32 a )
1016 {
1017 	flag aSign;
1018 	int16 aExp, shiftCount;
1019 	bits32 aSig;
1020 	bits64 aSig64;
1021 
1022 	aSig = extractFloat32Frac( a );
1023 	aExp = extractFloat32Exp( a );
1024 	aSign = extractFloat32Sign( a );
1025 	if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1026 	if ( aExp ) aSig |= 0x00800000;
1027 	shiftCount = 0xAF - aExp;
1028 	aSig64 = aSig;
1029 	aSig64 <<= 32;
1030 	if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1031 	return roundAndPackInt32( aSign, aSig64 );
1032 
1033 }
1034 
1035 /*----------------------------------------------------------------------------
1036 | Returns the result of converting the single-precision floating-point value
1037 | `a' to the 32-bit two's complement integer format.  The conversion is
1038 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1039 | Arithmetic, except that the conversion is always rounded toward zero.
1040 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1041 | the conversion overflows, the largest integer with the same sign as `a' is
1042 | returned.
1043 *----------------------------------------------------------------------------*/
1044 
float32_to_int32_round_to_zero(float32 a)1045 int32 float32_to_int32_round_to_zero( float32 a )
1046 {
1047 	flag aSign;
1048 	int16 aExp, shiftCount;
1049 	bits32 aSig;
1050 	int32 z;
1051 
1052 	aSig = extractFloat32Frac( a );
1053 	aExp = extractFloat32Exp( a );
1054 	aSign = extractFloat32Sign( a );
1055 	shiftCount = aExp - 0x9E;
1056 	if ( 0 <= shiftCount ) {
1057 		if ( a != 0xCF000000 ) {
1058 			float_raise( float_flag_invalid );
1059 			if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1060 		}
1061 		return (sbits32) 0x80000000;
1062 	}
1063 	else if ( aExp <= 0x7E ) {
1064 		if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1065 		return 0;
1066 	}
1067 	aSig = ( aSig | 0x00800000 )<<8;
1068 	z = aSig>>( - shiftCount );
1069 	if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1070 		float_exception_flags |= float_flag_inexact;
1071 	}
1072 	if ( aSign ) z = - z;
1073 	return z;
1074 
1075 }
1076 
1077 /*----------------------------------------------------------------------------
1078 | Returns the result of converting the single-precision floating-point value
1079 | `a' to the 64-bit two's complement integer format.  The conversion is
1080 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1081 | Arithmetic---which means in particular that the conversion is rounded
1082 | according to the current rounding mode.  If `a' is a NaN, the largest
1083 | positive integer is returned.  Otherwise, if the conversion overflows, the
1084 | largest integer with the same sign as `a' is returned.
1085 *----------------------------------------------------------------------------*/
1086 
float32_to_int64(float32 a)1087 int64 float32_to_int64( float32 a )
1088 {
1089 	flag aSign;
1090 	int16 aExp, shiftCount;
1091 	bits32 aSig;
1092 	bits64 aSig64, aSigExtra;
1093 
1094 	aSig = extractFloat32Frac( a );
1095 	aExp = extractFloat32Exp( a );
1096 	aSign = extractFloat32Sign( a );
1097 	shiftCount = 0xBE - aExp;
1098 	if ( shiftCount < 0 ) {
1099 		float_raise( float_flag_invalid );
1100 		if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1101 			return LIT64( 0x7FFFFFFFFFFFFFFF );
1102 		}
1103 		return (sbits64) LIT64( 0x8000000000000000 );
1104 	}
1105 	if ( aExp ) aSig |= 0x00800000;
1106 	aSig64 = aSig;
1107 	aSig64 <<= 40;
1108 	shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1109 	return roundAndPackInt64( aSign, aSig64, aSigExtra );
1110 
1111 }
1112 
1113 /*----------------------------------------------------------------------------
1114 | Returns the result of converting the single-precision floating-point value
1115 | `a' to the 64-bit two's complement integer format.  The conversion is
1116 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1117 | Arithmetic, except that the conversion is always rounded toward zero.  If
1118 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1119 | conversion overflows, the largest integer with the same sign as `a' is
1120 | returned.
1121 *----------------------------------------------------------------------------*/
1122 
float32_to_int64_round_to_zero(float32 a)1123 int64 float32_to_int64_round_to_zero( float32 a )
1124 {
1125 	flag aSign;
1126 	int16 aExp, shiftCount;
1127 	bits32 aSig;
1128 	bits64 aSig64;
1129 	int64 z;
1130 
1131 	aSig = extractFloat32Frac( a );
1132 	aExp = extractFloat32Exp( a );
1133 	aSign = extractFloat32Sign( a );
1134 	shiftCount = aExp - 0xBE;
1135 	if ( 0 <= shiftCount ) {
1136 		if ( a != 0xDF000000 ) {
1137 			float_raise( float_flag_invalid );
1138 			if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1139 				return LIT64( 0x7FFFFFFFFFFFFFFF );
1140 			}
1141 		}
1142 		return (sbits64) LIT64( 0x8000000000000000 );
1143 	}
1144 	else if ( aExp <= 0x7E ) {
1145 		if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1146 		return 0;
1147 	}
1148 	aSig64 = aSig | 0x00800000;
1149 	aSig64 <<= 40;
1150 	z = aSig64>>( - shiftCount );
1151 	if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1152 		float_exception_flags |= float_flag_inexact;
1153 	}
1154 	if ( aSign ) z = - z;
1155 	return z;
1156 
1157 }
1158 
1159 /*----------------------------------------------------------------------------
1160 | Returns the result of converting the single-precision floating-point value
1161 | `a' to the double-precision floating-point format.  The conversion is
1162 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1163 | Arithmetic.
1164 *----------------------------------------------------------------------------*/
1165 
float32_to_float64(float32 a)1166 float64 float32_to_float64( float32 a )
1167 {
1168 	flag aSign;
1169 	int16 aExp;
1170 	bits32 aSig;
1171 
1172 	aSig = extractFloat32Frac( a );
1173 	aExp = extractFloat32Exp( a );
1174 	aSign = extractFloat32Sign( a );
1175 	if ( aExp == 0xFF ) {
1176 		if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1177 		return packFloat64( aSign, 0x7FF, 0 );
1178 	}
1179 	if ( aExp == 0 ) {
1180 		if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1181 		normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1182 		--aExp;
1183 	}
1184 	return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1185 
1186 }
1187 
1188 #ifdef FLOATX80
1189 
1190 /*----------------------------------------------------------------------------
1191 | Returns the result of converting the single-precision floating-point value
1192 | `a' to the extended double-precision floating-point format.  The conversion
1193 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1194 | Arithmetic.
1195 *----------------------------------------------------------------------------*/
1196 
float32_to_floatx80(float32 a)1197 floatx80 float32_to_floatx80( float32 a )
1198 {
1199 	flag aSign;
1200 	int16 aExp;
1201 	bits32 aSig;
1202 
1203 	aSig = extractFloat32Frac( a );
1204 	aExp = extractFloat32Exp( a );
1205 	aSign = extractFloat32Sign( a );
1206 	if ( aExp == 0xFF ) {
1207 		if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1208 		return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1209 	}
1210 	if ( aExp == 0 ) {
1211 		if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1212 		normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1213 	}
1214 	aSig |= 0x00800000;
1215 	return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1216 
1217 }
1218 
1219 #endif
1220 
1221 #ifdef FLOAT128
1222 
1223 /*----------------------------------------------------------------------------
1224 | Returns the result of converting the single-precision floating-point value
1225 | `a' to the double-precision floating-point format.  The conversion is
1226 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1227 | Arithmetic.
1228 *----------------------------------------------------------------------------*/
1229 
float32_to_float128(float32 a)1230 float128 float32_to_float128( float32 a )
1231 {
1232 	flag aSign;
1233 	int16 aExp;
1234 	bits32 aSig;
1235 
1236 	aSig = extractFloat32Frac( a );
1237 	aExp = extractFloat32Exp( a );
1238 	aSign = extractFloat32Sign( a );
1239 	if ( aExp == 0xFF ) {
1240 		if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1241 		return packFloat128( aSign, 0x7FFF, 0, 0 );
1242 	}
1243 	if ( aExp == 0 ) {
1244 		if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1245 		normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1246 		--aExp;
1247 	}
1248 	return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1249 
1250 }
1251 
1252 #endif
1253 
1254 /*----------------------------------------------------------------------------
1255 | Rounds the single-precision floating-point value `a' to an integer, and
1256 | returns the result as a single-precision floating-point value.  The
1257 | operation is performed according to the IEC/IEEE Standard for Binary
1258 | Floating-Point Arithmetic.
1259 *----------------------------------------------------------------------------*/
1260 
float32_round_to_int(float32 a)1261 float32 float32_round_to_int( float32 a )
1262 {
1263 	flag aSign;
1264 	int16 aExp;
1265 	bits32 lastBitMask, roundBitsMask;
1266 	int8 roundingMode;
1267 	float32 z;
1268 
1269 	aExp = extractFloat32Exp( a );
1270 	if ( 0x96 <= aExp ) {
1271 		if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1272 			return propagateFloat32NaN( a, a );
1273 		}
1274 		return a;
1275 	}
1276 	if ( aExp <= 0x7E ) {
1277 		if ( (bits32) ( a<<1 ) == 0 ) return a;
1278 		float_exception_flags |= float_flag_inexact;
1279 		aSign = extractFloat32Sign( a );
1280 		switch ( float_rounding_mode ) {
1281 			case float_round_nearest_even:
1282 			if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1283 				return packFloat32( aSign, 0x7F, 0 );
1284 			}
1285 			break;
1286 			case float_round_down:
1287 			return aSign ? 0xBF800000 : 0;
1288 			case float_round_up:
1289 			return aSign ? 0x80000000 : 0x3F800000;
1290 		}
1291 		return packFloat32( aSign, 0, 0 );
1292 	}
1293 	lastBitMask = 1;
1294 	lastBitMask <<= 0x96 - aExp;
1295 	roundBitsMask = lastBitMask - 1;
1296 	z = a;
1297 	roundingMode = float_rounding_mode;
1298 	if ( roundingMode == float_round_nearest_even ) {
1299 		z += lastBitMask>>1;
1300 		if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1301 	}
1302 	else if ( roundingMode != float_round_to_zero ) {
1303 		if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1304 			z += roundBitsMask;
1305 		}
1306 	}
1307 	z &= ~ roundBitsMask;
1308 	if ( z != a ) float_exception_flags |= float_flag_inexact;
1309 	return z;
1310 
1311 }
1312 
1313 /*----------------------------------------------------------------------------
1314 | Returns the result of adding the absolute values of the single-precision
1315 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1316 | before being returned.  `zSign' is ignored if the result is a NaN.
1317 | The addition is performed according to the IEC/IEEE Standard for Binary
1318 | Floating-Point Arithmetic.
1319 *----------------------------------------------------------------------------*/
1320 
addFloat32Sigs(float32 a,float32 b,flag zSign)1321 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1322 {
1323 	int16 aExp, bExp, zExp;
1324 	bits32 aSig, bSig, zSig;
1325 	int16 expDiff;
1326 
1327 	aSig = extractFloat32Frac( a );
1328 	aExp = extractFloat32Exp( a );
1329 	bSig = extractFloat32Frac( b );
1330 	bExp = extractFloat32Exp( b );
1331 	expDiff = aExp - bExp;
1332 	aSig <<= 6;
1333 	bSig <<= 6;
1334 	if ( 0 < expDiff ) {
1335 		if ( aExp == 0xFF ) {
1336 			if ( aSig ) return propagateFloat32NaN( a, b );
1337 			return a;
1338 		}
1339 		if ( bExp == 0 ) {
1340 			--expDiff;
1341 		}
1342 		else {
1343 			bSig |= 0x20000000;
1344 		}
1345 		shift32RightJamming( bSig, expDiff, &bSig );
1346 		zExp = aExp;
1347 	}
1348 	else if ( expDiff < 0 ) {
1349 		if ( bExp == 0xFF ) {
1350 			if ( bSig ) return propagateFloat32NaN( a, b );
1351 			return packFloat32( zSign, 0xFF, 0 );
1352 		}
1353 		if ( aExp == 0 ) {
1354 			++expDiff;
1355 		}
1356 		else {
1357 			aSig |= 0x20000000;
1358 		}
1359 		shift32RightJamming( aSig, - expDiff, &aSig );
1360 		zExp = bExp;
1361 	}
1362 	else {
1363 		if ( aExp == 0xFF ) {
1364 			if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1365 			return a;
1366 		}
1367 		if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1368 		zSig = 0x40000000 + aSig + bSig;
1369 		zExp = aExp;
1370 		goto roundAndPack;
1371 	}
1372 	aSig |= 0x20000000;
1373 	zSig = ( aSig + bSig )<<1;
1374 	--zExp;
1375 	if ( (sbits32) zSig < 0 ) {
1376 		zSig = aSig + bSig;
1377 		++zExp;
1378 	}
1379 	roundAndPack:
1380 	return roundAndPackFloat32( zSign, zExp, zSig );
1381 
1382 }
1383 
1384 /*----------------------------------------------------------------------------
1385 | Returns the result of subtracting the absolute values of the single-
1386 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1387 | difference is negated before being returned.  `zSign' is ignored if the
1388 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1389 | Standard for Binary Floating-Point Arithmetic.
1390 *----------------------------------------------------------------------------*/
1391 
subFloat32Sigs(float32 a,float32 b,flag zSign)1392 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1393 {
1394 	int16 aExp, bExp, zExp;
1395 	bits32 aSig, bSig, zSig;
1396 	int16 expDiff;
1397 
1398 	aSig = extractFloat32Frac( a );
1399 	aExp = extractFloat32Exp( a );
1400 	bSig = extractFloat32Frac( b );
1401 	bExp = extractFloat32Exp( b );
1402 	expDiff = aExp - bExp;
1403 	aSig <<= 7;
1404 	bSig <<= 7;
1405 	if ( 0 < expDiff ) goto aExpBigger;
1406 	if ( expDiff < 0 ) goto bExpBigger;
1407 	if ( aExp == 0xFF ) {
1408 		if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1409 		float_raise( float_flag_invalid );
1410 		return float32_default_nan;
1411 	}
1412 	if ( aExp == 0 ) {
1413 		aExp = 1;
1414 		bExp = 1;
1415 	}
1416 	if ( bSig < aSig ) goto aBigger;
1417 	if ( aSig < bSig ) goto bBigger;
1418 	return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1419 	bExpBigger:
1420 	if ( bExp == 0xFF ) {
1421 		if ( bSig ) return propagateFloat32NaN( a, b );
1422 		return packFloat32( zSign ^ 1, 0xFF, 0 );
1423 	}
1424 	if ( aExp == 0 ) {
1425 		++expDiff;
1426 	}
1427 	else {
1428 		aSig |= 0x40000000;
1429 	}
1430 	shift32RightJamming( aSig, - expDiff, &aSig );
1431 	bSig |= 0x40000000;
1432 	bBigger:
1433 	zSig = bSig - aSig;
1434 	zExp = bExp;
1435 	zSign ^= 1;
1436 	goto normalizeRoundAndPack;
1437 	aExpBigger:
1438 	if ( aExp == 0xFF ) {
1439 		if ( aSig ) return propagateFloat32NaN( a, b );
1440 		return a;
1441 	}
1442 	if ( bExp == 0 ) {
1443 		--expDiff;
1444 	}
1445 	else {
1446 		bSig |= 0x40000000;
1447 	}
1448 	shift32RightJamming( bSig, expDiff, &bSig );
1449 	aSig |= 0x40000000;
1450 	aBigger:
1451 	zSig = aSig - bSig;
1452 	zExp = aExp;
1453 	normalizeRoundAndPack:
1454 	--zExp;
1455 	return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1456 
1457 }
1458 
1459 /*----------------------------------------------------------------------------
1460 | Returns the result of adding the single-precision floating-point values `a'
1461 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1462 | Binary Floating-Point Arithmetic.
1463 *----------------------------------------------------------------------------*/
1464 
float32_add(float32 a,float32 b)1465 float32 float32_add( float32 a, float32 b )
1466 {
1467 	flag aSign, bSign;
1468 
1469 	aSign = extractFloat32Sign( a );
1470 	bSign = extractFloat32Sign( b );
1471 	if ( aSign == bSign ) {
1472 		return addFloat32Sigs( a, b, aSign );
1473 	}
1474 	else {
1475 		return subFloat32Sigs( a, b, aSign );
1476 	}
1477 
1478 }
1479 
1480 /*----------------------------------------------------------------------------
1481 | Returns the result of subtracting the single-precision floating-point values
1482 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1483 | for Binary Floating-Point Arithmetic.
1484 *----------------------------------------------------------------------------*/
1485 
float32_sub(float32 a,float32 b)1486 float32 float32_sub( float32 a, float32 b )
1487 {
1488 	flag aSign, bSign;
1489 
1490 	aSign = extractFloat32Sign( a );
1491 	bSign = extractFloat32Sign( b );
1492 	if ( aSign == bSign ) {
1493 		return subFloat32Sigs( a, b, aSign );
1494 	}
1495 	else {
1496 		return addFloat32Sigs( a, b, aSign );
1497 	}
1498 
1499 }
1500 
1501 /*----------------------------------------------------------------------------
1502 | Returns the result of multiplying the single-precision floating-point values
1503 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1504 | for Binary Floating-Point Arithmetic.
1505 *----------------------------------------------------------------------------*/
1506 
float32_mul(float32 a,float32 b)1507 float32 float32_mul( float32 a, float32 b )
1508 {
1509 	flag aSign, bSign, zSign;
1510 	int16 aExp, bExp, zExp;
1511 	bits32 aSig, bSig;
1512 	bits64 zSig64;
1513 	bits32 zSig;
1514 
1515 	aSig = extractFloat32Frac( a );
1516 	aExp = extractFloat32Exp( a );
1517 	aSign = extractFloat32Sign( a );
1518 	bSig = extractFloat32Frac( b );
1519 	bExp = extractFloat32Exp( b );
1520 	bSign = extractFloat32Sign( b );
1521 	zSign = aSign ^ bSign;
1522 	if ( aExp == 0xFF ) {
1523 		if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1524 			return propagateFloat32NaN( a, b );
1525 		}
1526 		if ( ( bExp | bSig ) == 0 ) {
1527 			float_raise( float_flag_invalid );
1528 			return float32_default_nan;
1529 		}
1530 		return packFloat32( zSign, 0xFF, 0 );
1531 	}
1532 	if ( bExp == 0xFF ) {
1533 		if ( bSig ) return propagateFloat32NaN( a, b );
1534 		if ( ( aExp | aSig ) == 0 ) {
1535 			float_raise( float_flag_invalid );
1536 			return float32_default_nan;
1537 		}
1538 		return packFloat32( zSign, 0xFF, 0 );
1539 	}
1540 	if ( aExp == 0 ) {
1541 		if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1542 		normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1543 	}
1544 	if ( bExp == 0 ) {
1545 		if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1546 		normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1547 	}
1548 	zExp = aExp + bExp - 0x7F;
1549 	aSig = ( aSig | 0x00800000 )<<7;
1550 	bSig = ( bSig | 0x00800000 )<<8;
1551 	shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1552 	zSig = zSig64;
1553 	if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1554 		zSig <<= 1;
1555 		--zExp;
1556 	}
1557 	return roundAndPackFloat32( zSign, zExp, zSig );
1558 
1559 }
1560 
1561 /*----------------------------------------------------------------------------
1562 | Returns the result of dividing the single-precision floating-point value `a'
1563 | by the corresponding value `b'.  The operation is performed according to the
1564 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1565 *----------------------------------------------------------------------------*/
1566 
float32_div(float32 a,float32 b)1567 float32 float32_div( float32 a, float32 b )
1568 {
1569 	flag aSign, bSign, zSign;
1570 	int16 aExp, bExp, zExp;
1571 	bits32 aSig, bSig, zSig;
1572 
1573 	aSig = extractFloat32Frac( a );
1574 	aExp = extractFloat32Exp( a );
1575 	aSign = extractFloat32Sign( a );
1576 	bSig = extractFloat32Frac( b );
1577 	bExp = extractFloat32Exp( b );
1578 	bSign = extractFloat32Sign( b );
1579 	zSign = aSign ^ bSign;
1580 	if ( aExp == 0xFF ) {
1581 		if ( aSig ) return propagateFloat32NaN( a, b );
1582 		if ( bExp == 0xFF ) {
1583 			if ( bSig ) return propagateFloat32NaN( a, b );
1584 			float_raise( float_flag_invalid );
1585 			return float32_default_nan;
1586 		}
1587 		return packFloat32( zSign, 0xFF, 0 );
1588 	}
1589 	if ( bExp == 0xFF ) {
1590 		if ( bSig ) return propagateFloat32NaN( a, b );
1591 		return packFloat32( zSign, 0, 0 );
1592 	}
1593 	if ( bExp == 0 ) {
1594 		if ( bSig == 0 ) {
1595 			if ( ( aExp | aSig ) == 0 ) {
1596 				float_raise( float_flag_invalid );
1597 				return float32_default_nan;
1598 			}
1599 			float_raise( float_flag_divbyzero );
1600 			return packFloat32( zSign, 0xFF, 0 );
1601 		}
1602 		normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1603 	}
1604 	if ( aExp == 0 ) {
1605 		if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1606 		normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1607 	}
1608 	zExp = aExp - bExp + 0x7D;
1609 	aSig = ( aSig | 0x00800000 )<<7;
1610 	bSig = ( bSig | 0x00800000 )<<8;
1611 	if ( bSig <= ( aSig + aSig ) ) {
1612 		aSig >>= 1;
1613 		++zExp;
1614 	}
1615 	zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1616 	if ( ( zSig & 0x3F ) == 0 ) {
1617 		zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1618 	}
1619 	return roundAndPackFloat32( zSign, zExp, zSig );
1620 
1621 }
1622 
1623 /*----------------------------------------------------------------------------
1624 | Returns the remainder of the single-precision floating-point value `a'
1625 | with respect to the corresponding value `b'.  The operation is performed
1626 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1627 *----------------------------------------------------------------------------*/
1628 
float32_rem(float32 a,float32 b)1629 float32 float32_rem( float32 a, float32 b )
1630 {
1631 	flag aSign, zSign;
1632 	int16 aExp, bExp, expDiff;
1633 	bits32 aSig, bSig;
1634 	bits32 q;
1635 	bits64 aSig64, bSig64, q64;
1636 	bits32 alternateASig;
1637 	sbits32 sigMean;
1638 
1639 	aSig = extractFloat32Frac( a );
1640 	aExp = extractFloat32Exp( a );
1641 	aSign = extractFloat32Sign( a );
1642 	bSig = extractFloat32Frac( b );
1643 	bExp = extractFloat32Exp( b );
1644 //    bSign = extractFloat32Sign( b );
1645 	if ( aExp == 0xFF ) {
1646 		if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1647 			return propagateFloat32NaN( a, b );
1648 		}
1649 		float_raise( float_flag_invalid );
1650 		return float32_default_nan;
1651 	}
1652 	if ( bExp == 0xFF ) {
1653 		if ( bSig ) return propagateFloat32NaN( a, b );
1654 		return a;
1655 	}
1656 	if ( bExp == 0 ) {
1657 		if ( bSig == 0 ) {
1658 			float_raise( float_flag_invalid );
1659 			return float32_default_nan;
1660 		}
1661 		normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1662 	}
1663 	if ( aExp == 0 ) {
1664 		if ( aSig == 0 ) return a;
1665 		normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1666 	}
1667 	expDiff = aExp - bExp;
1668 	aSig |= 0x00800000;
1669 	bSig |= 0x00800000;
1670 	if ( expDiff < 32 ) {
1671 		aSig <<= 8;
1672 		bSig <<= 8;
1673 		if ( expDiff < 0 ) {
1674 			if ( expDiff < -1 ) return a;
1675 			aSig >>= 1;
1676 		}
1677 		q = ( bSig <= aSig );
1678 		if ( q ) aSig -= bSig;
1679 		if ( 0 < expDiff ) {
1680 			q = ( ( (bits64) aSig )<<32 ) / bSig;
1681 			q >>= 32 - expDiff;
1682 			bSig >>= 2;
1683 			aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1684 		}
1685 		else {
1686 			aSig >>= 2;
1687 			bSig >>= 2;
1688 		}
1689 	}
1690 	else {
1691 		if ( bSig <= aSig ) aSig -= bSig;
1692 		aSig64 = ( (bits64) aSig )<<40;
1693 		bSig64 = ( (bits64) bSig )<<40;
1694 		expDiff -= 64;
1695 		while ( 0 < expDiff ) {
1696 			q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1697 			q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1698 			aSig64 = - ( ( bSig * q64 )<<38 );
1699 			expDiff -= 62;
1700 		}
1701 		expDiff += 64;
1702 		q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1703 		q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1704 		q = q64>>( 64 - expDiff );
1705 		bSig <<= 6;
1706 		aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1707 	}
1708 	do {
1709 		alternateASig = aSig;
1710 		++q;
1711 		aSig -= bSig;
1712 	} while ( 0 <= (sbits32) aSig );
1713 	sigMean = aSig + alternateASig;
1714 	if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1715 		aSig = alternateASig;
1716 	}
1717 	zSign = ( (sbits32) aSig < 0 );
1718 	if ( zSign ) aSig = - aSig;
1719 	return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1720 
1721 }
1722 
1723 /*----------------------------------------------------------------------------
1724 | Returns the square root of the single-precision floating-point value `a'.
1725 | The operation is performed according to the IEC/IEEE Standard for Binary
1726 | Floating-Point Arithmetic.
1727 *----------------------------------------------------------------------------*/
1728 
float32_sqrt(float32 a)1729 float32 float32_sqrt( float32 a )
1730 {
1731 	flag aSign;
1732 	int16 aExp, zExp;
1733 	bits32 aSig, zSig;
1734 	bits64 rem, term;
1735 
1736 	aSig = extractFloat32Frac( a );
1737 	aExp = extractFloat32Exp( a );
1738 	aSign = extractFloat32Sign( a );
1739 	if ( aExp == 0xFF ) {
1740 		if ( aSig ) return propagateFloat32NaN( a, 0 );
1741 		if ( ! aSign ) return a;
1742 		float_raise( float_flag_invalid );
1743 		return float32_default_nan;
1744 	}
1745 	if ( aSign ) {
1746 		if ( ( aExp | aSig ) == 0 ) return a;
1747 		float_raise( float_flag_invalid );
1748 		return float32_default_nan;
1749 	}
1750 	if ( aExp == 0 ) {
1751 		if ( aSig == 0 ) return 0;
1752 		normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1753 	}
1754 	zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1755 	aSig = ( aSig | 0x00800000 )<<8;
1756 	zSig = estimateSqrt32( aExp, aSig ) + 2;
1757 	if ( ( zSig & 0x7F ) <= 5 ) {
1758 		if ( zSig < 2 ) {
1759 			zSig = 0x7FFFFFFF;
1760 			goto roundAndPack;
1761 		}
1762 		aSig >>= aExp & 1;
1763 		term = ( (bits64) zSig ) * zSig;
1764 		rem = ( ( (bits64) aSig )<<32 ) - term;
1765 		while ( (sbits64) rem < 0 ) {
1766 			--zSig;
1767 			rem += ( ( (bits64) zSig )<<1 ) | 1;
1768 		}
1769 		zSig |= ( rem != 0 );
1770 	}
1771 	shift32RightJamming( zSig, 1, &zSig );
1772 	roundAndPack:
1773 	return roundAndPackFloat32( 0, zExp, zSig );
1774 
1775 }
1776 
1777 /*----------------------------------------------------------------------------
1778 | Returns 1 if the single-precision floating-point value `a' is equal to
1779 | the corresponding value `b', and 0 otherwise.  The comparison is performed
1780 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1781 *----------------------------------------------------------------------------*/
1782 
float32_eq(float32 a,float32 b)1783 flag float32_eq( float32 a, float32 b )
1784 {
1785 	if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1786 			|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1787 		) {
1788 		if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1789 			float_raise( float_flag_invalid );
1790 		}
1791 		return 0;
1792 	}
1793 	return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1794 
1795 }
1796 
1797 /*----------------------------------------------------------------------------
1798 | Returns 1 if the single-precision floating-point value `a' is less than
1799 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
1800 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1801 | Arithmetic.
1802 *----------------------------------------------------------------------------*/
1803 
float32_le(float32 a,float32 b)1804 flag float32_le( float32 a, float32 b )
1805 {
1806 	flag aSign, bSign;
1807 
1808 	if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1809 			|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1810 		) {
1811 		float_raise( float_flag_invalid );
1812 		return 0;
1813 	}
1814 	aSign = extractFloat32Sign( a );
1815 	bSign = extractFloat32Sign( b );
1816 	if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1817 	return ( a == b ) || ( aSign ^ ( a < b ) );
1818 
1819 }
1820 
1821 /*----------------------------------------------------------------------------
1822 | Returns 1 if the single-precision floating-point value `a' is less than
1823 | the corresponding value `b', and 0 otherwise.  The comparison is performed
1824 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1825 *----------------------------------------------------------------------------*/
1826 
float32_lt(float32 a,float32 b)1827 flag float32_lt( float32 a, float32 b )
1828 {
1829 	flag aSign, bSign;
1830 
1831 	if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1832 			|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1833 		) {
1834 		float_raise( float_flag_invalid );
1835 		return 0;
1836 	}
1837 	aSign = extractFloat32Sign( a );
1838 	bSign = extractFloat32Sign( b );
1839 	if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1840 	return ( a != b ) && ( aSign ^ ( a < b ) );
1841 
1842 }
1843 
1844 /*----------------------------------------------------------------------------
1845 | Returns 1 if the single-precision floating-point value `a' is equal to
1846 | the corresponding value `b', and 0 otherwise.  The invalid exception is
1847 | raised if either operand is a NaN.  Otherwise, the comparison is performed
1848 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1849 *----------------------------------------------------------------------------*/
1850 
float32_eq_signaling(float32 a,float32 b)1851 flag float32_eq_signaling( float32 a, float32 b )
1852 {
1853 	if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1854 			|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1855 		) {
1856 		float_raise( float_flag_invalid );
1857 		return 0;
1858 	}
1859 	return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1860 
1861 }
1862 
1863 /*----------------------------------------------------------------------------
1864 | Returns 1 if the single-precision floating-point value `a' is less than or
1865 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
1866 | cause an exception.  Otherwise, the comparison is performed according to the
1867 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1868 *----------------------------------------------------------------------------*/
1869 
float32_le_quiet(float32 a,float32 b)1870 flag float32_le_quiet( float32 a, float32 b )
1871 {
1872 	flag aSign, bSign;
1873 //    int16 aExp, bExp;
1874 
1875 	if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1876 			|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1877 		) {
1878 		if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1879 			float_raise( float_flag_invalid );
1880 		}
1881 		return 0;
1882 	}
1883 	aSign = extractFloat32Sign( a );
1884 	bSign = extractFloat32Sign( b );
1885 	if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1886 	return ( a == b ) || ( aSign ^ ( a < b ) );
1887 
1888 }
1889 
1890 /*----------------------------------------------------------------------------
1891 | Returns 1 if the single-precision floating-point value `a' is less than
1892 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
1893 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
1894 | Standard for Binary Floating-Point Arithmetic.
1895 *----------------------------------------------------------------------------*/
1896 
float32_lt_quiet(float32 a,float32 b)1897 flag float32_lt_quiet( float32 a, float32 b )
1898 {
1899 	flag aSign, bSign;
1900 
1901 	if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1902 			|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1903 		) {
1904 		if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1905 			float_raise( float_flag_invalid );
1906 		}
1907 		return 0;
1908 	}
1909 	aSign = extractFloat32Sign( a );
1910 	bSign = extractFloat32Sign( b );
1911 	if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1912 	return ( a != b ) && ( aSign ^ ( a < b ) );
1913 
1914 }
1915 
1916 /*----------------------------------------------------------------------------
1917 | Returns the result of converting the double-precision floating-point value
1918 | `a' to the 32-bit two's complement integer format.  The conversion is
1919 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1920 | Arithmetic---which means in particular that the conversion is rounded
1921 | according to the current rounding mode.  If `a' is a NaN, the largest
1922 | positive integer is returned.  Otherwise, if the conversion overflows, the
1923 | largest integer with the same sign as `a' is returned.
1924 *----------------------------------------------------------------------------*/
1925 
float64_to_int32(float64 a)1926 int32 float64_to_int32( float64 a )
1927 {
1928 	flag aSign;
1929 	int16 aExp, shiftCount;
1930 	bits64 aSig;
1931 
1932 	aSig = extractFloat64Frac( a );
1933 	aExp = extractFloat64Exp( a );
1934 	aSign = extractFloat64Sign( a );
1935 	if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1936 	if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1937 	shiftCount = 0x42C - aExp;
1938 	if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1939 	return roundAndPackInt32( aSign, aSig );
1940 
1941 }
1942 
1943 /*----------------------------------------------------------------------------
1944 | Returns the result of converting the double-precision floating-point value
1945 | `a' to the 32-bit two's complement integer format.  The conversion is
1946 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1947 | Arithmetic, except that the conversion is always rounded toward zero.
1948 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1949 | the conversion overflows, the largest integer with the same sign as `a' is
1950 | returned.
1951 *----------------------------------------------------------------------------*/
1952 
float64_to_int32_round_to_zero(float64 a)1953 int32 float64_to_int32_round_to_zero( float64 a )
1954 {
1955 	flag aSign;
1956 	int16 aExp, shiftCount;
1957 	bits64 aSig, savedASig;
1958 	int32 z;
1959 
1960 	aSig = extractFloat64Frac( a );
1961 	aExp = extractFloat64Exp( a );
1962 	aSign = extractFloat64Sign( a );
1963 	if ( 0x41E < aExp ) {
1964 		if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1965 		goto invalid;
1966 	}
1967 	else if ( aExp < 0x3FF ) {
1968 		if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
1969 		return 0;
1970 	}
1971 	aSig |= LIT64( 0x0010000000000000 );
1972 	shiftCount = 0x433 - aExp;
1973 	savedASig = aSig;
1974 	aSig >>= shiftCount;
1975 	z = aSig;
1976 	if ( aSign ) z = - z;
1977 	if ( ( z < 0 ) ^ aSign ) {
1978 	invalid:
1979 		float_raise( float_flag_invalid );
1980 		return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
1981 	}
1982 	if ( ( aSig<<shiftCount ) != savedASig ) {
1983 		float_exception_flags |= float_flag_inexact;
1984 	}
1985 	return z;
1986 
1987 }
1988 
1989 /*----------------------------------------------------------------------------
1990 | Returns the result of converting the double-precision floating-point value
1991 | `a' to the 64-bit two's complement integer format.  The conversion is
1992 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1993 | Arithmetic---which means in particular that the conversion is rounded
1994 | according to the current rounding mode.  If `a' is a NaN, the largest
1995 | positive integer is returned.  Otherwise, if the conversion overflows, the
1996 | largest integer with the same sign as `a' is returned.
1997 *----------------------------------------------------------------------------*/
1998 
float64_to_int64(float64 a)1999 int64 float64_to_int64( float64 a )
2000 {
2001 	flag aSign;
2002 	int16 aExp, shiftCount;
2003 	bits64 aSig, aSigExtra;
2004 
2005 	aSig = extractFloat64Frac( a );
2006 	aExp = extractFloat64Exp( a );
2007 	aSign = extractFloat64Sign( a );
2008 	if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2009 	shiftCount = 0x433 - aExp;
2010 	if ( shiftCount <= 0 ) {
2011 		if ( 0x43E < aExp ) {
2012 			float_raise( float_flag_invalid );
2013 			if (    ! aSign
2014 					|| (    ( aExp == 0x7FF )
2015 						&& ( aSig != LIT64( 0x0010000000000000 ) ) )
2016 				) {
2017 				return LIT64( 0x7FFFFFFFFFFFFFFF );
2018 			}
2019 			return (sbits64) LIT64( 0x8000000000000000 );
2020 		}
2021 		aSigExtra = 0;
2022 		aSig <<= - shiftCount;
2023 	}
2024 	else {
2025 		shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2026 	}
2027 	return roundAndPackInt64( aSign, aSig, aSigExtra );
2028 
2029 }
2030 
2031 /*----------------------------------------------------------------------------
2032 | Returns the result of converting the double-precision floating-point value
2033 | `a' to the 64-bit two's complement integer format.  The conversion is
2034 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2035 | Arithmetic, except that the conversion is always rounded toward zero.
2036 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2037 | the conversion overflows, the largest integer with the same sign as `a' is
2038 | returned.
2039 *----------------------------------------------------------------------------*/
2040 
float64_to_int64_round_to_zero(float64 a)2041 int64 float64_to_int64_round_to_zero( float64 a )
2042 {
2043 	flag aSign;
2044 	int16 aExp, shiftCount;
2045 	bits64 aSig;
2046 	int64 z;
2047 
2048 	aSig = extractFloat64Frac( a );
2049 	aExp = extractFloat64Exp( a );
2050 	aSign = extractFloat64Sign( a );
2051 	if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2052 	shiftCount = aExp - 0x433;
2053 	if ( 0 <= shiftCount ) {
2054 		if ( 0x43E <= aExp ) {
2055 			if ( a != LIT64( 0xC3E0000000000000 ) ) {
2056 				float_raise( float_flag_invalid );
2057 				if (    ! aSign
2058 						|| (    ( aExp == 0x7FF )
2059 							&& ( aSig != LIT64( 0x0010000000000000 ) ) )
2060 					) {
2061 					return LIT64( 0x7FFFFFFFFFFFFFFF );
2062 				}
2063 			}
2064 			return (sbits64) LIT64( 0x8000000000000000 );
2065 		}
2066 		z = aSig<<shiftCount;
2067 	}
2068 	else {
2069 		if ( aExp < 0x3FE ) {
2070 			if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2071 			return 0;
2072 		}
2073 		z = aSig>>( - shiftCount );
2074 		if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2075 			float_exception_flags |= float_flag_inexact;
2076 		}
2077 	}
2078 	if ( aSign ) z = - z;
2079 	return z;
2080 
2081 }
2082 
2083 /*----------------------------------------------------------------------------
2084 | Returns the result of converting the double-precision floating-point value
2085 | `a' to the single-precision floating-point format.  The conversion is
2086 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2087 | Arithmetic.
2088 *----------------------------------------------------------------------------*/
2089 
float64_to_float32(float64 a)2090 float32 float64_to_float32( float64 a )
2091 {
2092 	flag aSign;
2093 	int16 aExp;
2094 	bits64 aSig;
2095 	bits32 zSig;
2096 
2097 	aSig = extractFloat64Frac( a );
2098 	aExp = extractFloat64Exp( a );
2099 	aSign = extractFloat64Sign( a );
2100 	if ( aExp == 0x7FF ) {
2101 		if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2102 		return packFloat32( aSign, 0xFF, 0 );
2103 	}
2104 	shift64RightJamming( aSig, 22, &aSig );
2105 	zSig = aSig;
2106 	if ( aExp || zSig ) {
2107 		zSig |= 0x40000000;
2108 		aExp -= 0x381;
2109 	}
2110 	return roundAndPackFloat32( aSign, aExp, zSig );
2111 
2112 }
2113 
2114 #ifdef FLOATX80
2115 
2116 /*----------------------------------------------------------------------------
2117 | Returns the result of converting the double-precision floating-point value
2118 | `a' to the extended double-precision floating-point format.  The conversion
2119 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2120 | Arithmetic.
2121 *----------------------------------------------------------------------------*/
2122 
float64_to_floatx80(float64 a)2123 floatx80 float64_to_floatx80( float64 a )
2124 {
2125 	flag aSign;
2126 	int16 aExp;
2127 	bits64 aSig;
2128 
2129 	aSig = extractFloat64Frac( a );
2130 	aExp = extractFloat64Exp( a );
2131 	aSign = extractFloat64Sign( a );
2132 	if ( aExp == 0x7FF ) {
2133 		if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2134 		return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2135 	}
2136 	if ( aExp == 0 ) {
2137 		if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2138 		normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2139 	}
2140 	return
2141 		packFloatx80(
2142 			aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2143 
2144 }
2145 
2146 #endif
2147 
2148 #ifdef FLOAT128
2149 
2150 /*----------------------------------------------------------------------------
2151 | Returns the result of converting the double-precision floating-point value
2152 | `a' to the quadruple-precision floating-point format.  The conversion is
2153 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2154 | Arithmetic.
2155 *----------------------------------------------------------------------------*/
2156 
float64_to_float128(float64 a)2157 float128 float64_to_float128( float64 a )
2158 {
2159 	flag aSign;
2160 	int16 aExp;
2161 	bits64 aSig, zSig0, zSig1;
2162 
2163 	aSig = extractFloat64Frac( a );
2164 	aExp = extractFloat64Exp( a );
2165 	aSign = extractFloat64Sign( a );
2166 	if ( aExp == 0x7FF ) {
2167 		if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2168 		return packFloat128( aSign, 0x7FFF, 0, 0 );
2169 	}
2170 	if ( aExp == 0 ) {
2171 		if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2172 		normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2173 		--aExp;
2174 	}
2175 	shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2176 	return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2177 
2178 }
2179 
2180 #endif
2181 
2182 /*----------------------------------------------------------------------------
2183 | Rounds the double-precision floating-point value `a' to an integer, and
2184 | returns the result as a double-precision floating-point value.  The
2185 | operation is performed according to the IEC/IEEE Standard for Binary
2186 | Floating-Point Arithmetic.
2187 *----------------------------------------------------------------------------*/
2188 
float64_round_to_int(float64 a)2189 float64 float64_round_to_int( float64 a )
2190 {
2191 	flag aSign;
2192 	int16 aExp;
2193 	bits64 lastBitMask, roundBitsMask;
2194 	int8 roundingMode;
2195 	float64 z;
2196 
2197 	aExp = extractFloat64Exp( a );
2198 	if ( 0x433 <= aExp ) {
2199 		if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2200 			return propagateFloat64NaN( a, a );
2201 		}
2202 		return a;
2203 	}
2204 	if ( aExp < 0x3FF ) {
2205 		if ( (bits64) ( a<<1 ) == 0 ) return a;
2206 		float_exception_flags |= float_flag_inexact;
2207 		aSign = extractFloat64Sign( a );
2208 		switch ( float_rounding_mode ) {
2209 			case float_round_nearest_even:
2210 			if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2211 				return packFloat64( aSign, 0x3FF, 0 );
2212 			}
2213 			break;
2214 			case float_round_down:
2215 			return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2216 			case float_round_up:
2217 			return
2218 			aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2219 		}
2220 		return packFloat64( aSign, 0, 0 );
2221 	}
2222 	lastBitMask = 1;
2223 	lastBitMask <<= 0x433 - aExp;
2224 	roundBitsMask = lastBitMask - 1;
2225 	z = a;
2226 	roundingMode = float_rounding_mode;
2227 	if ( roundingMode == float_round_nearest_even ) {
2228 		z += lastBitMask>>1;
2229 		if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2230 	}
2231 	else if ( roundingMode != float_round_to_zero ) {
2232 		if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2233 			z += roundBitsMask;
2234 		}
2235 	}
2236 	z &= ~ roundBitsMask;
2237 	if ( z != a ) float_exception_flags |= float_flag_inexact;
2238 	return z;
2239 
2240 }
2241 
2242 /*----------------------------------------------------------------------------
2243 | Returns the result of adding the absolute values of the double-precision
2244 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2245 | before being returned.  `zSign' is ignored if the result is a NaN.
2246 | The addition is performed according to the IEC/IEEE Standard for Binary
2247 | Floating-Point Arithmetic.
2248 *----------------------------------------------------------------------------*/
2249 
addFloat64Sigs(float64 a,float64 b,flag zSign)2250 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2251 {
2252 	int16 aExp, bExp, zExp;
2253 	bits64 aSig, bSig, zSig;
2254 	int16 expDiff;
2255 
2256 	aSig = extractFloat64Frac( a );
2257 	aExp = extractFloat64Exp( a );
2258 	bSig = extractFloat64Frac( b );
2259 	bExp = extractFloat64Exp( b );
2260 	expDiff = aExp - bExp;
2261 	aSig <<= 9;
2262 	bSig <<= 9;
2263 	if ( 0 < expDiff ) {
2264 		if ( aExp == 0x7FF ) {
2265 			if ( aSig ) return propagateFloat64NaN( a, b );
2266 			return a;
2267 		}
2268 		if ( bExp == 0 ) {
2269 			--expDiff;
2270 		}
2271 		else {
2272 			bSig |= LIT64( 0x2000000000000000 );
2273 		}
2274 		shift64RightJamming( bSig, expDiff, &bSig );
2275 		zExp = aExp;
2276 	}
2277 	else if ( expDiff < 0 ) {
2278 		if ( bExp == 0x7FF ) {
2279 			if ( bSig ) return propagateFloat64NaN( a, b );
2280 			return packFloat64( zSign, 0x7FF, 0 );
2281 		}
2282 		if ( aExp == 0 ) {
2283 			++expDiff;
2284 		}
2285 		else {
2286 			aSig |= LIT64( 0x2000000000000000 );
2287 		}
2288 		shift64RightJamming( aSig, - expDiff, &aSig );
2289 		zExp = bExp;
2290 	}
2291 	else {
2292 		if ( aExp == 0x7FF ) {
2293 			if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2294 			return a;
2295 		}
2296 		if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2297 		zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2298 		zExp = aExp;
2299 		goto roundAndPack;
2300 	}
2301 	aSig |= LIT64( 0x2000000000000000 );
2302 	zSig = ( aSig + bSig )<<1;
2303 	--zExp;
2304 	if ( (sbits64) zSig < 0 ) {
2305 		zSig = aSig + bSig;
2306 		++zExp;
2307 	}
2308 	roundAndPack:
2309 	return roundAndPackFloat64( zSign, zExp, zSig );
2310 
2311 }
2312 
2313 /*----------------------------------------------------------------------------
2314 | Returns the result of subtracting the absolute values of the double-
2315 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2316 | difference is negated before being returned.  `zSign' is ignored if the
2317 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2318 | Standard for Binary Floating-Point Arithmetic.
2319 *----------------------------------------------------------------------------*/
2320 
subFloat64Sigs(float64 a,float64 b,flag zSign)2321 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2322 {
2323 	int16 aExp, bExp, zExp;
2324 	bits64 aSig, bSig, zSig;
2325 	int16 expDiff;
2326 
2327 	aSig = extractFloat64Frac( a );
2328 	aExp = extractFloat64Exp( a );
2329 	bSig = extractFloat64Frac( b );
2330 	bExp = extractFloat64Exp( b );
2331 	expDiff = aExp - bExp;
2332 	aSig <<= 10;
2333 	bSig <<= 10;
2334 	if ( 0 < expDiff ) goto aExpBigger;
2335 	if ( expDiff < 0 ) goto bExpBigger;
2336 	if ( aExp == 0x7FF ) {
2337 		if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2338 		float_raise( float_flag_invalid );
2339 		return float64_default_nan;
2340 	}
2341 	if ( aExp == 0 ) {
2342 		aExp = 1;
2343 		bExp = 1;
2344 	}
2345 	if ( bSig < aSig ) goto aBigger;
2346 	if ( aSig < bSig ) goto bBigger;
2347 	return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2348 	bExpBigger:
2349 	if ( bExp == 0x7FF ) {
2350 		if ( bSig ) return propagateFloat64NaN( a, b );
2351 		return packFloat64( zSign ^ 1, 0x7FF, 0 );
2352 	}
2353 	if ( aExp == 0 ) {
2354 		++expDiff;
2355 	}
2356 	else {
2357 		aSig |= LIT64( 0x4000000000000000 );
2358 	}
2359 	shift64RightJamming( aSig, - expDiff, &aSig );
2360 	bSig |= LIT64( 0x4000000000000000 );
2361 	bBigger:
2362 	zSig = bSig - aSig;
2363 	zExp = bExp;
2364 	zSign ^= 1;
2365 	goto normalizeRoundAndPack;
2366 	aExpBigger:
2367 	if ( aExp == 0x7FF ) {
2368 		if ( aSig ) return propagateFloat64NaN( a, b );
2369 		return a;
2370 	}
2371 	if ( bExp == 0 ) {
2372 		--expDiff;
2373 	}
2374 	else {
2375 		bSig |= LIT64( 0x4000000000000000 );
2376 	}
2377 	shift64RightJamming( bSig, expDiff, &bSig );
2378 	aSig |= LIT64( 0x4000000000000000 );
2379 	aBigger:
2380 	zSig = aSig - bSig;
2381 	zExp = aExp;
2382 	normalizeRoundAndPack:
2383 	--zExp;
2384 	return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2385 
2386 }
2387 
2388 /*----------------------------------------------------------------------------
2389 | Returns the result of adding the double-precision floating-point values `a'
2390 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2391 | Binary Floating-Point Arithmetic.
2392 *----------------------------------------------------------------------------*/
2393 
float64_add(float64 a,float64 b)2394 float64 float64_add( float64 a, float64 b )
2395 {
2396 	flag aSign, bSign;
2397 
2398 	aSign = extractFloat64Sign( a );
2399 	bSign = extractFloat64Sign( b );
2400 	if ( aSign == bSign ) {
2401 		return addFloat64Sigs( a, b, aSign );
2402 	}
2403 	else {
2404 		return subFloat64Sigs( a, b, aSign );
2405 	}
2406 
2407 }
2408 
2409 /*----------------------------------------------------------------------------
2410 | Returns the result of subtracting the double-precision floating-point values
2411 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2412 | for Binary Floating-Point Arithmetic.
2413 *----------------------------------------------------------------------------*/
2414 
float64_sub(float64 a,float64 b)2415 float64 float64_sub( float64 a, float64 b )
2416 {
2417 	flag aSign, bSign;
2418 
2419 	aSign = extractFloat64Sign( a );
2420 	bSign = extractFloat64Sign( b );
2421 	if ( aSign == bSign ) {
2422 		return subFloat64Sigs( a, b, aSign );
2423 	}
2424 	else {
2425 		return addFloat64Sigs( a, b, aSign );
2426 	}
2427 
2428 }
2429 
2430 /*----------------------------------------------------------------------------
2431 | Returns the result of multiplying the double-precision floating-point values
2432 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2433 | for Binary Floating-Point Arithmetic.
2434 *----------------------------------------------------------------------------*/
2435 
float64_mul(float64 a,float64 b)2436 float64 float64_mul( float64 a, float64 b )
2437 {
2438 	flag aSign, bSign, zSign;
2439 	int16 aExp, bExp, zExp;
2440 	bits64 aSig, bSig, zSig0, zSig1;
2441 
2442 	aSig = extractFloat64Frac( a );
2443 	aExp = extractFloat64Exp( a );
2444 	aSign = extractFloat64Sign( a );
2445 	bSig = extractFloat64Frac( b );
2446 	bExp = extractFloat64Exp( b );
2447 	bSign = extractFloat64Sign( b );
2448 	zSign = aSign ^ bSign;
2449 	if ( aExp == 0x7FF ) {
2450 		if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2451 			return propagateFloat64NaN( a, b );
2452 		}
2453 		if ( ( bExp | bSig ) == 0 ) {
2454 			float_raise( float_flag_invalid );
2455 			return float64_default_nan;
2456 		}
2457 		return packFloat64( zSign, 0x7FF, 0 );
2458 	}
2459 	if ( bExp == 0x7FF ) {
2460 		if ( bSig ) return propagateFloat64NaN( a, b );
2461 		if ( ( aExp | aSig ) == 0 ) {
2462 			float_raise( float_flag_invalid );
2463 			return float64_default_nan;
2464 		}
2465 		return packFloat64( zSign, 0x7FF, 0 );
2466 	}
2467 	if ( aExp == 0 ) {
2468 		if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2469 		normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2470 	}
2471 	if ( bExp == 0 ) {
2472 		if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2473 		normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2474 	}
2475 	zExp = aExp + bExp - 0x3FF;
2476 	aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2477 	bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2478 	mul64To128( aSig, bSig, &zSig0, &zSig1 );
2479 	zSig0 |= ( zSig1 != 0 );
2480 	if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2481 		zSig0 <<= 1;
2482 		--zExp;
2483 	}
2484 	return roundAndPackFloat64( zSign, zExp, zSig0 );
2485 
2486 }
2487 
2488 /*----------------------------------------------------------------------------
2489 | Returns the result of dividing the double-precision floating-point value `a'
2490 | by the corresponding value `b'.  The operation is performed according to
2491 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2492 *----------------------------------------------------------------------------*/
2493 
float64_div(float64 a,float64 b)2494 float64 float64_div( float64 a, float64 b )
2495 {
2496 	flag aSign, bSign, zSign;
2497 	int16 aExp, bExp, zExp;
2498 	bits64 aSig, bSig, zSig;
2499 	bits64 rem0, rem1;
2500 	bits64 term0, term1;
2501 
2502 	aSig = extractFloat64Frac( a );
2503 	aExp = extractFloat64Exp( a );
2504 	aSign = extractFloat64Sign( a );
2505 	bSig = extractFloat64Frac( b );
2506 	bExp = extractFloat64Exp( b );
2507 	bSign = extractFloat64Sign( b );
2508 	zSign = aSign ^ bSign;
2509 	if ( aExp == 0x7FF ) {
2510 		if ( aSig ) return propagateFloat64NaN( a, b );
2511 		if ( bExp == 0x7FF ) {
2512 			if ( bSig ) return propagateFloat64NaN( a, b );
2513 			float_raise( float_flag_invalid );
2514 			return float64_default_nan;
2515 		}
2516 		return packFloat64( zSign, 0x7FF, 0 );
2517 	}
2518 	if ( bExp == 0x7FF ) {
2519 		if ( bSig ) return propagateFloat64NaN( a, b );
2520 		return packFloat64( zSign, 0, 0 );
2521 	}
2522 	if ( bExp == 0 ) {
2523 		if ( bSig == 0 ) {
2524 			if ( ( aExp | aSig ) == 0 ) {
2525 				float_raise( float_flag_invalid );
2526 				return float64_default_nan;
2527 			}
2528 			float_raise( float_flag_divbyzero );
2529 			return packFloat64( zSign, 0x7FF, 0 );
2530 		}
2531 		normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2532 	}
2533 	if ( aExp == 0 ) {
2534 		if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2535 		normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2536 	}
2537 	zExp = aExp - bExp + 0x3FD;
2538 	aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2539 	bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2540 	if ( bSig <= ( aSig + aSig ) ) {
2541 		aSig >>= 1;
2542 		++zExp;
2543 	}
2544 	zSig = estimateDiv128To64( aSig, 0, bSig );
2545 	if ( ( zSig & 0x1FF ) <= 2 ) {
2546 		mul64To128( bSig, zSig, &term0, &term1 );
2547 		sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2548 		while ( (sbits64) rem0 < 0 ) {
2549 			--zSig;
2550 			add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2551 		}
2552 		zSig |= ( rem1 != 0 );
2553 	}
2554 	return roundAndPackFloat64( zSign, zExp, zSig );
2555 
2556 }
2557 
2558 /*----------------------------------------------------------------------------
2559 | Returns the remainder of the double-precision floating-point value `a'
2560 | with respect to the corresponding value `b'.  The operation is performed
2561 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2562 *----------------------------------------------------------------------------*/
2563 
float64_rem(float64 a,float64 b)2564 float64 float64_rem( float64 a, float64 b )
2565 {
2566 	flag aSign, zSign;
2567 	int16 aExp, bExp, expDiff;
2568 	bits64 aSig, bSig;
2569 	bits64 q, alternateASig;
2570 	sbits64 sigMean;
2571 
2572 	aSig = extractFloat64Frac( a );
2573 	aExp = extractFloat64Exp( a );
2574 	aSign = extractFloat64Sign( a );
2575 	bSig = extractFloat64Frac( b );
2576 	bExp = extractFloat64Exp( b );
2577 //    bSign = extractFloat64Sign( b );
2578 	if ( aExp == 0x7FF ) {
2579 		if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2580 			return propagateFloat64NaN( a, b );
2581 		}
2582 		float_raise( float_flag_invalid );
2583 		return float64_default_nan;
2584 	}
2585 	if ( bExp == 0x7FF ) {
2586 		if ( bSig ) return propagateFloat64NaN( a, b );
2587 		return a;
2588 	}
2589 	if ( bExp == 0 ) {
2590 		if ( bSig == 0 ) {
2591 			float_raise( float_flag_invalid );
2592 			return float64_default_nan;
2593 		}
2594 		normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2595 	}
2596 	if ( aExp == 0 ) {
2597 		if ( aSig == 0 ) return a;
2598 		normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2599 	}
2600 	expDiff = aExp - bExp;
2601 	aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2602 	bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2603 	if ( expDiff < 0 ) {
2604 		if ( expDiff < -1 ) return a;
2605 		aSig >>= 1;
2606 	}
2607 	q = ( bSig <= aSig );
2608 	if ( q ) aSig -= bSig;
2609 	expDiff -= 64;
2610 	while ( 0 < expDiff ) {
2611 		q = estimateDiv128To64( aSig, 0, bSig );
2612 		q = ( 2 < q ) ? q - 2 : 0;
2613 		aSig = - ( ( bSig>>2 ) * q );
2614 		expDiff -= 62;
2615 	}
2616 	expDiff += 64;
2617 	if ( 0 < expDiff ) {
2618 		q = estimateDiv128To64( aSig, 0, bSig );
2619 		q = ( 2 < q ) ? q - 2 : 0;
2620 		q >>= 64 - expDiff;
2621 		bSig >>= 2;
2622 		aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2623 	}
2624 	else {
2625 		aSig >>= 2;
2626 		bSig >>= 2;
2627 	}
2628 	do {
2629 		alternateASig = aSig;
2630 		++q;
2631 		aSig -= bSig;
2632 	} while ( 0 <= (sbits64) aSig );
2633 	sigMean = aSig + alternateASig;
2634 	if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2635 		aSig = alternateASig;
2636 	}
2637 	zSign = ( (sbits64) aSig < 0 );
2638 	if ( zSign ) aSig = - aSig;
2639 	return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
2640 
2641 }
2642 
2643 /*----------------------------------------------------------------------------
2644 | Returns the square root of the double-precision floating-point value `a'.
2645 | The operation is performed according to the IEC/IEEE Standard for Binary
2646 | Floating-Point Arithmetic.
2647 *----------------------------------------------------------------------------*/
2648 
float64_sqrt(float64 a)2649 float64 float64_sqrt( float64 a )
2650 {
2651 	flag aSign;
2652 	int16 aExp, zExp;
2653 	bits64 aSig, zSig, doubleZSig;
2654 	bits64 rem0, rem1, term0, term1;
2655 //    float64 z;
2656 
2657 	aSig = extractFloat64Frac( a );
2658 	aExp = extractFloat64Exp( a );
2659 	aSign = extractFloat64Sign( a );
2660 	if ( aExp == 0x7FF ) {
2661 		if ( aSig ) return propagateFloat64NaN( a, a );
2662 		if ( ! aSign ) return a;
2663 		float_raise( float_flag_invalid );
2664 		return float64_default_nan;
2665 	}
2666 	if ( aSign ) {
2667 		if ( ( aExp | aSig ) == 0 ) return a;
2668 		float_raise( float_flag_invalid );
2669 		return float64_default_nan;
2670 	}
2671 	if ( aExp == 0 ) {
2672 		if ( aSig == 0 ) return 0;
2673 		normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2674 	}
2675 	zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2676 	aSig |= LIT64( 0x0010000000000000 );
2677 	zSig = estimateSqrt32( aExp, aSig>>21 );
2678 	aSig <<= 9 - ( aExp & 1 );
2679 	zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2680 	if ( ( zSig & 0x1FF ) <= 5 ) {
2681 		doubleZSig = zSig<<1;
2682 		mul64To128( zSig, zSig, &term0, &term1 );
2683 		sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2684 		while ( (sbits64) rem0 < 0 ) {
2685 			--zSig;
2686 			doubleZSig -= 2;
2687 			add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
2688 		}
2689 		zSig |= ( ( rem0 | rem1 ) != 0 );
2690 	}
2691 	return roundAndPackFloat64( 0, zExp, zSig );
2692 
2693 }
2694 
2695 /*----------------------------------------------------------------------------
2696 | Returns 1 if the double-precision floating-point value `a' is equal to the
2697 | corresponding value `b', and 0 otherwise.  The comparison is performed
2698 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2699 *----------------------------------------------------------------------------*/
2700 
float64_eq(float64 a,float64 b)2701 flag float64_eq( float64 a, float64 b )
2702 {
2703 	if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2704 			|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2705 		) {
2706 		if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2707 			float_raise( float_flag_invalid );
2708 		}
2709 		return 0;
2710 	}
2711 	return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2712 
2713 }
2714 
2715 /*----------------------------------------------------------------------------
2716 | Returns 1 if the double-precision floating-point value `a' is less than or
2717 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
2718 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2719 | Arithmetic.
2720 *----------------------------------------------------------------------------*/
2721 
float64_le(float64 a,float64 b)2722 flag float64_le( float64 a, float64 b )
2723 {
2724 	flag aSign, bSign;
2725 
2726 	if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2727 			|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2728 		) {
2729 		float_raise( float_flag_invalid );
2730 		return 0;
2731 	}
2732 	aSign = extractFloat64Sign( a );
2733 	bSign = extractFloat64Sign( b );
2734 	if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2735 	return ( a == b ) || ( aSign ^ ( a < b ) );
2736 
2737 }
2738 
2739 /*----------------------------------------------------------------------------
2740 | Returns 1 if the double-precision floating-point value `a' is less than
2741 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2742 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2743 *----------------------------------------------------------------------------*/
2744 
float64_lt(float64 a,float64 b)2745 flag float64_lt( float64 a, float64 b )
2746 {
2747 	flag aSign, bSign;
2748 
2749 	if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2750 			|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2751 		) {
2752 		float_raise( float_flag_invalid );
2753 		return 0;
2754 	}
2755 	aSign = extractFloat64Sign( a );
2756 	bSign = extractFloat64Sign( b );
2757 	if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2758 	return ( a != b ) && ( aSign ^ ( a < b ) );
2759 
2760 }
2761 
2762 /*----------------------------------------------------------------------------
2763 | Returns 1 if the double-precision floating-point value `a' is equal to the
2764 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
2765 | if either operand is a NaN.  Otherwise, the comparison is performed
2766 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2767 *----------------------------------------------------------------------------*/
2768 
float64_eq_signaling(float64 a,float64 b)2769 flag float64_eq_signaling( float64 a, float64 b )
2770 {
2771 	if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2772 			|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2773 		) {
2774 		float_raise( float_flag_invalid );
2775 		return 0;
2776 	}
2777 	return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2778 
2779 }
2780 
2781 /*----------------------------------------------------------------------------
2782 | Returns 1 if the double-precision floating-point value `a' is less than or
2783 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2784 | cause an exception.  Otherwise, the comparison is performed according to the
2785 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2786 *----------------------------------------------------------------------------*/
2787 
float64_le_quiet(float64 a,float64 b)2788 flag float64_le_quiet( float64 a, float64 b )
2789 {
2790 	flag aSign, bSign;
2791 //    int16 aExp, bExp;
2792 
2793 	if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2794 			|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2795 		) {
2796 		if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2797 			float_raise( float_flag_invalid );
2798 		}
2799 		return 0;
2800 	}
2801 	aSign = extractFloat64Sign( a );
2802 	bSign = extractFloat64Sign( b );
2803 	if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2804 	return ( a == b ) || ( aSign ^ ( a < b ) );
2805 
2806 }
2807 
2808 /*----------------------------------------------------------------------------
2809 | Returns 1 if the double-precision floating-point value `a' is less than
2810 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2811 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2812 | Standard for Binary Floating-Point Arithmetic.
2813 *----------------------------------------------------------------------------*/
2814 
float64_lt_quiet(float64 a,float64 b)2815 flag float64_lt_quiet( float64 a, float64 b )
2816 {
2817 	flag aSign, bSign;
2818 
2819 	if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2820 			|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2821 		) {
2822 		if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2823 			float_raise( float_flag_invalid );
2824 		}
2825 		return 0;
2826 	}
2827 	aSign = extractFloat64Sign( a );
2828 	bSign = extractFloat64Sign( b );
2829 	if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2830 	return ( a != b ) && ( aSign ^ ( a < b ) );
2831 
2832 }
2833 
2834 #ifdef FLOATX80
2835 
2836 /*----------------------------------------------------------------------------
2837 | Returns the result of converting the extended double-precision floating-
2838 | point value `a' to the 32-bit two's complement integer format.  The
2839 | conversion is performed according to the IEC/IEEE Standard for Binary
2840 | Floating-Point Arithmetic---which means in particular that the conversion
2841 | is rounded according to the current rounding mode.  If `a' is a NaN, the
2842 | largest positive integer is returned.  Otherwise, if the conversion
2843 | overflows, the largest integer with the same sign as `a' is returned.
2844 *----------------------------------------------------------------------------*/
2845 
floatx80_to_int32(floatx80 a)2846 int32 floatx80_to_int32( floatx80 a )
2847 {
2848 	flag aSign;
2849 	int32 aExp, shiftCount;
2850 	bits64 aSig;
2851 
2852 	aSig = extractFloatx80Frac( a );
2853 	aExp = extractFloatx80Exp( a );
2854 	aSign = extractFloatx80Sign( a );
2855 	if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2856 	shiftCount = 0x4037 - aExp;
2857 	if ( shiftCount <= 0 ) shiftCount = 1;
2858 	shift64RightJamming( aSig, shiftCount, &aSig );
2859 	return roundAndPackInt32( aSign, aSig );
2860 
2861 }
2862 
2863 /*----------------------------------------------------------------------------
2864 | Returns the result of converting the extended double-precision floating-
2865 | point value `a' to the 32-bit two's complement integer format.  The
2866 | conversion is performed according to the IEC/IEEE Standard for Binary
2867 | Floating-Point Arithmetic, except that the conversion is always rounded
2868 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
2869 | Otherwise, if the conversion overflows, the largest integer with the same
2870 | sign as `a' is returned.
2871 *----------------------------------------------------------------------------*/
2872 
floatx80_to_int32_round_to_zero(floatx80 a)2873 int32 floatx80_to_int32_round_to_zero( floatx80 a )
2874 {
2875 	flag aSign;
2876 	int32 aExp, shiftCount;
2877 	bits64 aSig, savedASig;
2878 	int32 z;
2879 
2880 	aSig = extractFloatx80Frac( a );
2881 	aExp = extractFloatx80Exp( a );
2882 	aSign = extractFloatx80Sign( a );
2883 	if ( 0x401E < aExp ) {
2884 		if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2885 		goto invalid;
2886 	}
2887 	else if ( aExp < 0x3FFF ) {
2888 		if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2889 		return 0;
2890 	}
2891 	shiftCount = 0x403E - aExp;
2892 	savedASig = aSig;
2893 	aSig >>= shiftCount;
2894 	z = aSig;
2895 	if ( aSign ) z = - z;
2896 	if ( ( z < 0 ) ^ aSign ) {
2897 	invalid:
2898 		float_raise( float_flag_invalid );
2899 		return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2900 	}
2901 	if ( ( aSig<<shiftCount ) != savedASig ) {
2902 		float_exception_flags |= float_flag_inexact;
2903 	}
2904 	return z;
2905 
2906 }
2907 
2908 /*----------------------------------------------------------------------------
2909 | Returns the result of converting the extended double-precision floating-
2910 | point value `a' to the 64-bit two's complement integer format.  The
2911 | conversion is performed according to the IEC/IEEE Standard for Binary
2912 | Floating-Point Arithmetic---which means in particular that the conversion
2913 | is rounded according to the current rounding mode.  If `a' is a NaN,
2914 | the largest positive integer is returned.  Otherwise, if the conversion
2915 | overflows, the largest integer with the same sign as `a' is returned.
2916 *----------------------------------------------------------------------------*/
2917 
floatx80_to_int64(floatx80 a)2918 int64 floatx80_to_int64( floatx80 a )
2919 {
2920 	flag aSign;
2921 	int32 aExp, shiftCount;
2922 	bits64 aSig, aSigExtra;
2923 
2924 	aSig = extractFloatx80Frac( a );
2925 	aExp = extractFloatx80Exp( a );
2926 	aSign = extractFloatx80Sign( a );
2927 	shiftCount = 0x403E - aExp;
2928 	if ( shiftCount <= 0 ) {
2929 		if ( shiftCount ) {
2930 			float_raise( float_flag_invalid );
2931 			if (    ! aSign
2932 					|| (    ( aExp == 0x7FFF )
2933 						&& ( aSig != LIT64( 0x8000000000000000 ) ) )
2934 				) {
2935 				return LIT64( 0x7FFFFFFFFFFFFFFF );
2936 			}
2937 			return (sbits64) LIT64( 0x8000000000000000 );
2938 		}
2939 		aSigExtra = 0;
2940 	}
2941 	else {
2942 		shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2943 	}
2944 	return roundAndPackInt64( aSign, aSig, aSigExtra );
2945 
2946 }
2947 
2948 /*----------------------------------------------------------------------------
2949 | Returns the result of converting the extended double-precision floating-
2950 | point value `a' to the 64-bit two's complement integer format.  The
2951 | conversion is performed according to the IEC/IEEE Standard for Binary
2952 | Floating-Point Arithmetic, except that the conversion is always rounded
2953 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
2954 | Otherwise, if the conversion overflows, the largest integer with the same
2955 | sign as `a' is returned.
2956 *----------------------------------------------------------------------------*/
2957 
floatx80_to_int64_round_to_zero(floatx80 a)2958 int64 floatx80_to_int64_round_to_zero( floatx80 a )
2959 {
2960 	flag aSign;
2961 	int32 aExp, shiftCount;
2962 	bits64 aSig;
2963 	int64 z;
2964 
2965 	aSig = extractFloatx80Frac( a );
2966 	aExp = extractFloatx80Exp( a );
2967 	aSign = extractFloatx80Sign( a );
2968 	shiftCount = aExp - 0x403E;
2969 	if ( 0 <= shiftCount ) {
2970 		aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
2971 		if ( ( a.high != 0xC03E ) || aSig ) {
2972 			float_raise( float_flag_invalid );
2973 			if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
2974 				return LIT64( 0x7FFFFFFFFFFFFFFF );
2975 			}
2976 		}
2977 		return (sbits64) LIT64( 0x8000000000000000 );
2978 	}
2979 	else if ( aExp < 0x3FFF ) {
2980 		if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2981 		return 0;
2982 	}
2983 	z = aSig>>( - shiftCount );
2984 	if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2985 		float_exception_flags |= float_flag_inexact;
2986 	}
2987 	if ( aSign ) z = - z;
2988 	return z;
2989 
2990 }
2991 
2992 /*----------------------------------------------------------------------------
2993 | Returns the result of converting the extended double-precision floating-
2994 | point value `a' to the single-precision floating-point format.  The
2995 | conversion is performed according to the IEC/IEEE Standard for Binary
2996 | Floating-Point Arithmetic.
2997 *----------------------------------------------------------------------------*/
2998 
floatx80_to_float32(floatx80 a)2999 float32 floatx80_to_float32( floatx80 a )
3000 {
3001 	flag aSign;
3002 	int32 aExp;
3003 	bits64 aSig;
3004 
3005 	aSig = extractFloatx80Frac( a );
3006 	aExp = extractFloatx80Exp( a );
3007 	aSign = extractFloatx80Sign( a );
3008 	if ( aExp == 0x7FFF ) {
3009 		if ( (bits64) ( aSig<<1 ) ) {
3010 			return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3011 		}
3012 		return packFloat32( aSign, 0xFF, 0 );
3013 	}
3014 	shift64RightJamming( aSig, 33, &aSig );
3015 	if ( aExp || aSig ) aExp -= 0x3F81;
3016 	return roundAndPackFloat32( aSign, aExp, aSig );
3017 
3018 }
3019 
3020 /*----------------------------------------------------------------------------
3021 | Returns the result of converting the extended double-precision floating-
3022 | point value `a' to the double-precision floating-point format.  The
3023 | conversion is performed according to the IEC/IEEE Standard for Binary
3024 | Floating-Point Arithmetic.
3025 *----------------------------------------------------------------------------*/
3026 
floatx80_to_float64(floatx80 a)3027 float64 floatx80_to_float64( floatx80 a )
3028 {
3029 	flag aSign;
3030 	int32 aExp;
3031 	bits64 aSig, zSig;
3032 
3033 	aSig = extractFloatx80Frac( a );
3034 	aExp = extractFloatx80Exp( a );
3035 	aSign = extractFloatx80Sign( a );
3036 	if ( aExp == 0x7FFF ) {
3037 		if ( (bits64) ( aSig<<1 ) ) {
3038 			return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3039 		}
3040 		return packFloat64( aSign, 0x7FF, 0 );
3041 	}
3042 	shift64RightJamming( aSig, 1, &zSig );
3043 	if ( aExp || aSig ) aExp -= 0x3C01;
3044 	return roundAndPackFloat64( aSign, aExp, zSig );
3045 
3046 }
3047 
3048 #ifdef FLOAT128
3049 
3050 /*----------------------------------------------------------------------------
3051 | Returns the result of converting the extended double-precision floating-
3052 | point value `a' to the quadruple-precision floating-point format.  The
3053 | conversion is performed according to the IEC/IEEE Standard for Binary
3054 | Floating-Point Arithmetic.
3055 *----------------------------------------------------------------------------*/
3056 
floatx80_to_float128(floatx80 a)3057 float128 floatx80_to_float128( floatx80 a )
3058 {
3059 	flag aSign;
3060 	int16 aExp;
3061 	bits64 aSig, zSig0, zSig1;
3062 
3063 	aSig = extractFloatx80Frac( a );
3064 	aExp = extractFloatx80Exp( a );
3065 	aSign = extractFloatx80Sign( a );
3066 	if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3067 		return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3068 	}
3069 	shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3070 	return packFloat128( aSign, aExp, zSig0, zSig1 );
3071 
3072 }
3073 
3074 #endif
3075 
3076 /*----------------------------------------------------------------------------
3077 | Rounds the extended double-precision floating-point value `a' to an integer,
3078 | and returns the result as an extended quadruple-precision floating-point
3079 | value.  The operation is performed according to the IEC/IEEE Standard for
3080 | Binary Floating-Point Arithmetic.
3081 *----------------------------------------------------------------------------*/
3082 
floatx80_round_to_int(floatx80 a)3083 floatx80 floatx80_round_to_int( floatx80 a )
3084 {
3085 	flag aSign;
3086 	int32 aExp;
3087 	bits64 lastBitMask, roundBitsMask;
3088 	int8 roundingMode;
3089 	floatx80 z;
3090 
3091 	aExp = extractFloatx80Exp( a );
3092 	if ( 0x403E <= aExp ) {
3093 		if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3094 			return propagateFloatx80NaN( a, a );
3095 		}
3096 		return a;
3097 	}
3098 	if ( aExp < 0x3FFF ) {
3099 		if (    ( aExp == 0 )
3100 				&& ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3101 			return a;
3102 		}
3103 		float_exception_flags |= float_flag_inexact;
3104 		aSign = extractFloatx80Sign( a );
3105 		switch ( float_rounding_mode ) {
3106 			case float_round_nearest_even:
3107 			if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3108 				) {
3109 				return
3110 					packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3111 			}
3112 			break;
3113 			case float_round_down:
3114 			return
3115 					aSign ?
3116 						packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3117 				: packFloatx80( 0, 0, 0 );
3118 			case float_round_up:
3119 			return
3120 					aSign ? packFloatx80( 1, 0, 0 )
3121 				: packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3122 		}
3123 		return packFloatx80( aSign, 0, 0 );
3124 	}
3125 	lastBitMask = 1;
3126 	lastBitMask <<= 0x403E - aExp;
3127 	roundBitsMask = lastBitMask - 1;
3128 	z = a;
3129 	roundingMode = float_rounding_mode;
3130 	if ( roundingMode == float_round_nearest_even ) {
3131 		z.low += lastBitMask>>1;
3132 		if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3133 	}
3134 	else if ( roundingMode != float_round_to_zero ) {
3135 		if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3136 			z.low += roundBitsMask;
3137 		}
3138 	}
3139 	z.low &= ~ roundBitsMask;
3140 	if ( z.low == 0 ) {
3141 		++z.high;
3142 		z.low = LIT64( 0x8000000000000000 );
3143 	}
3144 	if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3145 	return z;
3146 
3147 }
3148 
3149 /*----------------------------------------------------------------------------
3150 | Returns the result of adding the absolute values of the extended double-
3151 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3152 | negated before being returned.  `zSign' is ignored if the result is a NaN.
3153 | The addition is performed according to the IEC/IEEE Standard for Binary
3154 | Floating-Point Arithmetic.
3155 *----------------------------------------------------------------------------*/
3156 
addFloatx80Sigs(floatx80 a,floatx80 b,flag zSign)3157 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3158 {
3159 	int32 aExp, bExp, zExp;
3160 	bits64 aSig, bSig, zSig0, zSig1;
3161 	int32 expDiff;
3162 
3163 	aSig = extractFloatx80Frac( a );
3164 	aExp = extractFloatx80Exp( a );
3165 	bSig = extractFloatx80Frac( b );
3166 	bExp = extractFloatx80Exp( b );
3167 	expDiff = aExp - bExp;
3168 	if ( 0 < expDiff ) {
3169 		if ( aExp == 0x7FFF ) {
3170 			if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3171 			return a;
3172 		}
3173 		if ( bExp == 0 ) --expDiff;
3174 		shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3175 		zExp = aExp;
3176 	}
3177 	else if ( expDiff < 0 ) {
3178 		if ( bExp == 0x7FFF ) {
3179 			if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3180 			return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3181 		}
3182 		if ( aExp == 0 ) ++expDiff;
3183 		shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3184 		zExp = bExp;
3185 	}
3186 	else {
3187 		if ( aExp == 0x7FFF ) {
3188 			if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3189 				return propagateFloatx80NaN( a, b );
3190 			}
3191 			return a;
3192 		}
3193 		zSig1 = 0;
3194 		zSig0 = aSig + bSig;
3195 		if ( aExp == 0 ) {
3196 			normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3197 			goto roundAndPack;
3198 		}
3199 		zExp = aExp;
3200 		goto shiftRight1;
3201 	}
3202 	zSig0 = aSig + bSig;
3203 	if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3204 	shiftRight1:
3205 	shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3206 	zSig0 |= LIT64( 0x8000000000000000 );
3207 	++zExp;
3208 	roundAndPack:
3209 	return
3210 		roundAndPackFloatx80(
3211 			floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3212 
3213 }
3214 
3215 /*----------------------------------------------------------------------------
3216 | Returns the result of subtracting the absolute values of the extended
3217 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3218 | difference is negated before being returned.  `zSign' is ignored if the
3219 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3220 | Standard for Binary Floating-Point Arithmetic.
3221 *----------------------------------------------------------------------------*/
3222 
subFloatx80Sigs(floatx80 a,floatx80 b,flag zSign)3223 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3224 {
3225 	int32 aExp, bExp, zExp;
3226 	bits64 aSig, bSig, zSig0, zSig1;
3227 	int32 expDiff;
3228 	floatx80 z;
3229 
3230 	aSig = extractFloatx80Frac( a );
3231 	aExp = extractFloatx80Exp( a );
3232 	bSig = extractFloatx80Frac( b );
3233 	bExp = extractFloatx80Exp( b );
3234 	expDiff = aExp - bExp;
3235 	if ( 0 < expDiff ) goto aExpBigger;
3236 	if ( expDiff < 0 ) goto bExpBigger;
3237 	if ( aExp == 0x7FFF ) {
3238 		if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3239 			return propagateFloatx80NaN( a, b );
3240 		}
3241 		float_raise( float_flag_invalid );
3242 		z.low = floatx80_default_nan_low;
3243 		z.high = floatx80_default_nan_high;
3244 		return z;
3245 	}
3246 	if ( aExp == 0 ) {
3247 		aExp = 1;
3248 		bExp = 1;
3249 	}
3250 	zSig1 = 0;
3251 	if ( bSig < aSig ) goto aBigger;
3252 	if ( aSig < bSig ) goto bBigger;
3253 	return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3254 	bExpBigger:
3255 	if ( bExp == 0x7FFF ) {
3256 		if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3257 		return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3258 	}
3259 	if ( aExp == 0 ) ++expDiff;
3260 	shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3261 	bBigger:
3262 	sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3263 	zExp = bExp;
3264 	zSign ^= 1;
3265 	goto normalizeRoundAndPack;
3266 	aExpBigger:
3267 	if ( aExp == 0x7FFF ) {
3268 		if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3269 		return a;
3270 	}
3271 	if ( bExp == 0 ) --expDiff;
3272 	shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3273 	aBigger:
3274 	sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3275 	zExp = aExp;
3276 	normalizeRoundAndPack:
3277 	return
3278 		normalizeRoundAndPackFloatx80(
3279 			floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3280 
3281 }
3282 
3283 /*----------------------------------------------------------------------------
3284 | Returns the result of adding the extended double-precision floating-point
3285 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
3286 | Standard for Binary Floating-Point Arithmetic.
3287 *----------------------------------------------------------------------------*/
3288 
floatx80_add(floatx80 a,floatx80 b)3289 floatx80 floatx80_add( floatx80 a, floatx80 b )
3290 {
3291 	flag aSign, bSign;
3292 
3293 	aSign = extractFloatx80Sign( a );
3294 	bSign = extractFloatx80Sign( b );
3295 	if ( aSign == bSign ) {
3296 		return addFloatx80Sigs( a, b, aSign );
3297 	}
3298 	else {
3299 		return subFloatx80Sigs( a, b, aSign );
3300 	}
3301 
3302 }
3303 
3304 /*----------------------------------------------------------------------------
3305 | Returns the result of subtracting the extended double-precision floating-
3306 | point values `a' and `b'.  The operation is performed according to the
3307 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3308 *----------------------------------------------------------------------------*/
3309 
floatx80_sub(floatx80 a,floatx80 b)3310 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3311 {
3312 	flag aSign, bSign;
3313 
3314 	aSign = extractFloatx80Sign( a );
3315 	bSign = extractFloatx80Sign( b );
3316 	if ( aSign == bSign ) {
3317 		return subFloatx80Sigs( a, b, aSign );
3318 	}
3319 	else {
3320 		return addFloatx80Sigs( a, b, aSign );
3321 	}
3322 
3323 }
3324 
3325 /*----------------------------------------------------------------------------
3326 | Returns the result of multiplying the extended double-precision floating-
3327 | point values `a' and `b'.  The operation is performed according to the
3328 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3329 *----------------------------------------------------------------------------*/
3330 
floatx80_mul(floatx80 a,floatx80 b)3331 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3332 {
3333 	flag aSign, bSign, zSign;
3334 	int32 aExp, bExp, zExp;
3335 	bits64 aSig, bSig, zSig0, zSig1;
3336 	floatx80 z;
3337 
3338 	aSig = extractFloatx80Frac( a );
3339 	aExp = extractFloatx80Exp( a );
3340 	aSign = extractFloatx80Sign( a );
3341 	bSig = extractFloatx80Frac( b );
3342 	bExp = extractFloatx80Exp( b );
3343 	bSign = extractFloatx80Sign( b );
3344 	zSign = aSign ^ bSign;
3345 	if ( aExp == 0x7FFF ) {
3346 		if (    (bits64) ( aSig<<1 )
3347 				|| ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3348 			return propagateFloatx80NaN( a, b );
3349 		}
3350 		if ( ( bExp | bSig ) == 0 ) goto invalid;
3351 		return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3352 	}
3353 	if ( bExp == 0x7FFF ) {
3354 		if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3355 		if ( ( aExp | aSig ) == 0 ) {
3356 	invalid:
3357 			float_raise( float_flag_invalid );
3358 			z.low = floatx80_default_nan_low;
3359 			z.high = floatx80_default_nan_high;
3360 			return z;
3361 		}
3362 		return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3363 	}
3364 	if ( aExp == 0 ) {
3365 		if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3366 		normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3367 	}
3368 	if ( bExp == 0 ) {
3369 		if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3370 		normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3371 	}
3372 	zExp = aExp + bExp - 0x3FFE;
3373 	mul64To128( aSig, bSig, &zSig0, &zSig1 );
3374 	if ( 0 < (sbits64) zSig0 ) {
3375 		shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3376 		--zExp;
3377 	}
3378 	return
3379 		roundAndPackFloatx80(
3380 			floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3381 
3382 }
3383 
3384 /*----------------------------------------------------------------------------
3385 | Returns the result of dividing the extended double-precision floating-point
3386 | value `a' by the corresponding value `b'.  The operation is performed
3387 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3388 *----------------------------------------------------------------------------*/
3389 
floatx80_div(floatx80 a,floatx80 b)3390 floatx80 floatx80_div( floatx80 a, floatx80 b )
3391 {
3392 	flag aSign, bSign, zSign;
3393 	int32 aExp, bExp, zExp;
3394 	bits64 aSig, bSig, zSig0, zSig1;
3395 	bits64 rem0, rem1, rem2, term0, term1, term2;
3396 	floatx80 z;
3397 
3398 	aSig = extractFloatx80Frac( a );
3399 	aExp = extractFloatx80Exp( a );
3400 	aSign = extractFloatx80Sign( a );
3401 	bSig = extractFloatx80Frac( b );
3402 	bExp = extractFloatx80Exp( b );
3403 	bSign = extractFloatx80Sign( b );
3404 	zSign = aSign ^ bSign;
3405 	if ( aExp == 0x7FFF ) {
3406 		if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3407 		if ( bExp == 0x7FFF ) {
3408 			if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3409 			goto invalid;
3410 		}
3411 		return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3412 	}
3413 	if ( bExp == 0x7FFF ) {
3414 		if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3415 		return packFloatx80( zSign, 0, 0 );
3416 	}
3417 	if ( bExp == 0 ) {
3418 		if ( bSig == 0 ) {
3419 			if ( ( aExp | aSig ) == 0 ) {
3420 	invalid:
3421 				float_raise( float_flag_invalid );
3422 				z.low = floatx80_default_nan_low;
3423 				z.high = floatx80_default_nan_high;
3424 				return z;
3425 			}
3426 			float_raise( float_flag_divbyzero );
3427 			return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3428 		}
3429 		normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3430 	}
3431 	if ( aExp == 0 ) {
3432 		if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3433 		normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3434 	}
3435 	zExp = aExp - bExp + 0x3FFE;
3436 	rem1 = 0;
3437 	if ( bSig <= aSig ) {
3438 		shift128Right( aSig, 0, 1, &aSig, &rem1 );
3439 		++zExp;
3440 	}
3441 	zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3442 	mul64To128( bSig, zSig0, &term0, &term1 );
3443 	sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3444 	while ( (sbits64) rem0 < 0 ) {
3445 		--zSig0;
3446 		add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3447 	}
3448 	zSig1 = estimateDiv128To64( rem1, 0, bSig );
3449 	if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3450 		mul64To128( bSig, zSig1, &term1, &term2 );
3451 		sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3452 		while ( (sbits64) rem1 < 0 ) {
3453 			--zSig1;
3454 			add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3455 		}
3456 		zSig1 |= ( ( rem1 | rem2 ) != 0 );
3457 	}
3458 	return
3459 		roundAndPackFloatx80(
3460 			floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3461 
3462 }
3463 
3464 /*----------------------------------------------------------------------------
3465 | Returns the remainder of the extended double-precision floating-point value
3466 | `a' with respect to the corresponding value `b'.  The operation is performed
3467 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3468 *----------------------------------------------------------------------------*/
3469 
floatx80_rem(floatx80 a,floatx80 b)3470 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3471 {
3472 	flag aSign, zSign;
3473 	int32 aExp, bExp, expDiff;
3474 	bits64 aSig0, aSig1, bSig;
3475 	bits64 q, term0, term1, alternateASig0, alternateASig1;
3476 	floatx80 z;
3477 
3478 	aSig0 = extractFloatx80Frac( a );
3479 	aExp = extractFloatx80Exp( a );
3480 	aSign = extractFloatx80Sign( a );
3481 	bSig = extractFloatx80Frac( b );
3482 	bExp = extractFloatx80Exp( b );
3483 //    bSign = extractFloatx80Sign( b );
3484 	if ( aExp == 0x7FFF ) {
3485 		if (    (bits64) ( aSig0<<1 )
3486 				|| ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3487 			return propagateFloatx80NaN( a, b );
3488 		}
3489 		goto invalid;
3490 	}
3491 	if ( bExp == 0x7FFF ) {
3492 		if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3493 		return a;
3494 	}
3495 	if ( bExp == 0 ) {
3496 		if ( bSig == 0 ) {
3497 	invalid:
3498 			float_raise( float_flag_invalid );
3499 			z.low = floatx80_default_nan_low;
3500 			z.high = floatx80_default_nan_high;
3501 			return z;
3502 		}
3503 		normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3504 	}
3505 	if ( aExp == 0 ) {
3506 		if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3507 		normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3508 	}
3509 	bSig |= LIT64( 0x8000000000000000 );
3510 	zSign = aSign;
3511 	expDiff = aExp - bExp;
3512 	aSig1 = 0;
3513 	if ( expDiff < 0 ) {
3514 		if ( expDiff < -1 ) return a;
3515 		shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3516 		expDiff = 0;
3517 	}
3518 	q = ( bSig <= aSig0 );
3519 	if ( q ) aSig0 -= bSig;
3520 	expDiff -= 64;
3521 	while ( 0 < expDiff ) {
3522 		q = estimateDiv128To64( aSig0, aSig1, bSig );
3523 		q = ( 2 < q ) ? q - 2 : 0;
3524 		mul64To128( bSig, q, &term0, &term1 );
3525 		sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3526 		shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3527 		expDiff -= 62;
3528 	}
3529 	expDiff += 64;
3530 	if ( 0 < expDiff ) {
3531 		q = estimateDiv128To64( aSig0, aSig1, bSig );
3532 		q = ( 2 < q ) ? q - 2 : 0;
3533 		q >>= 64 - expDiff;
3534 		mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3535 		sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3536 		shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3537 		while ( le128( term0, term1, aSig0, aSig1 ) ) {
3538 			++q;
3539 			sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3540 		}
3541 	}
3542 	else {
3543 		term1 = 0;
3544 		term0 = bSig;
3545 	}
3546 	sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3547 	if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3548 			|| (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3549 				&& ( q & 1 ) )
3550 		) {
3551 		aSig0 = alternateASig0;
3552 		aSig1 = alternateASig1;
3553 		zSign = ! zSign;
3554 	}
3555 	return
3556 		normalizeRoundAndPackFloatx80(
3557 			80, zSign, bExp + expDiff, aSig0, aSig1 );
3558 
3559 }
3560 
3561 /*----------------------------------------------------------------------------
3562 | Returns the square root of the extended double-precision floating-point
3563 | value `a'.  The operation is performed according to the IEC/IEEE Standard
3564 | for Binary Floating-Point Arithmetic.
3565 *----------------------------------------------------------------------------*/
3566 
floatx80_sqrt(floatx80 a)3567 floatx80 floatx80_sqrt( floatx80 a )
3568 {
3569 	flag aSign;
3570 	int32 aExp, zExp;
3571 	bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3572 	bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3573 	floatx80 z;
3574 
3575 	aSig0 = extractFloatx80Frac( a );
3576 	aExp = extractFloatx80Exp( a );
3577 	aSign = extractFloatx80Sign( a );
3578 	if ( aExp == 0x7FFF ) {
3579 		if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3580 		if ( ! aSign ) return a;
3581 		goto invalid;
3582 	}
3583 	if ( aSign ) {
3584 		if ( ( aExp | aSig0 ) == 0 ) return a;
3585 	invalid:
3586 		float_raise( float_flag_invalid );
3587 		z.low = floatx80_default_nan_low;
3588 		z.high = floatx80_default_nan_high;
3589 		return z;
3590 	}
3591 	if ( aExp == 0 ) {
3592 		if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3593 		normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3594 	}
3595 	zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3596 	zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3597 	shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3598 	zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3599 	doubleZSig0 = zSig0<<1;
3600 	mul64To128( zSig0, zSig0, &term0, &term1 );
3601 	sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3602 	while ( (sbits64) rem0 < 0 ) {
3603 		--zSig0;
3604 		doubleZSig0 -= 2;
3605 		add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3606 	}
3607 	zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3608 	if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3609 		if ( zSig1 == 0 ) zSig1 = 1;
3610 		mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3611 		sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3612 		mul64To128( zSig1, zSig1, &term2, &term3 );
3613 		sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3614 		while ( (sbits64) rem1 < 0 ) {
3615 			--zSig1;
3616 			shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3617 			term3 |= 1;
3618 			term2 |= doubleZSig0;
3619 			add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3620 		}
3621 		zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3622 	}
3623 	shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3624 	zSig0 |= doubleZSig0;
3625 	return
3626 		roundAndPackFloatx80(
3627 			floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
3628 
3629 }
3630 
3631 /*----------------------------------------------------------------------------
3632 | Returns 1 if the extended double-precision floating-point value `a' is
3633 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
3634 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3635 | Arithmetic.
3636 *----------------------------------------------------------------------------*/
3637 
floatx80_eq(floatx80 a,floatx80 b)3638 flag floatx80_eq( floatx80 a, floatx80 b )
3639 {
3640 	if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3641 				&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
3642 			|| (    ( extractFloatx80Exp( b ) == 0x7FFF )
3643 				&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
3644 		) {
3645 		if (    floatx80_is_signaling_nan( a )
3646 				|| floatx80_is_signaling_nan( b ) ) {
3647 			float_raise( float_flag_invalid );
3648 		}
3649 		return 0;
3650 	}
3651 	return
3652 			( a.low == b.low )
3653 		&& (    ( a.high == b.high )
3654 				|| (    ( a.low == 0 )
3655 					&& ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3656 			);
3657 
3658 }
3659 
3660 /*----------------------------------------------------------------------------
3661 | Returns 1 if the extended double-precision floating-point value `a' is
3662 | less than or equal to the corresponding value `b', and 0 otherwise.  The
3663 | comparison is performed according to the IEC/IEEE Standard for Binary
3664 | Floating-Point Arithmetic.
3665 *----------------------------------------------------------------------------*/
3666 
floatx80_le(floatx80 a,floatx80 b)3667 flag floatx80_le( floatx80 a, floatx80 b )
3668 {
3669 	flag aSign, bSign;
3670 
3671 	if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3672 				&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
3673 			|| (    ( extractFloatx80Exp( b ) == 0x7FFF )
3674 				&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
3675 		) {
3676 		float_raise( float_flag_invalid );
3677 		return 0;
3678 	}
3679 	aSign = extractFloatx80Sign( a );
3680 	bSign = extractFloatx80Sign( b );
3681 	if ( aSign != bSign ) {
3682 		return
3683 				aSign
3684 			|| (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3685 					== 0 );
3686 	}
3687 	return
3688 			aSign ? le128( b.high, b.low, a.high, a.low )
3689 		: le128( a.high, a.low, b.high, b.low );
3690 
3691 }
3692 
3693 /*----------------------------------------------------------------------------
3694 | Returns 1 if the extended double-precision floating-point value `a' is
3695 | less than the corresponding value `b', and 0 otherwise.  The comparison
3696 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3697 | Arithmetic.
3698 *----------------------------------------------------------------------------*/
3699 
floatx80_lt(floatx80 a,floatx80 b)3700 flag floatx80_lt( floatx80 a, floatx80 b )
3701 {
3702 	flag aSign, bSign;
3703 
3704 	if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3705 				&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
3706 			|| (    ( extractFloatx80Exp( b ) == 0x7FFF )
3707 				&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
3708 		) {
3709 		float_raise( float_flag_invalid );
3710 		return 0;
3711 	}
3712 	aSign = extractFloatx80Sign( a );
3713 	bSign = extractFloatx80Sign( b );
3714 	if ( aSign != bSign ) {
3715 		return
3716 				aSign
3717 			&& (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3718 					!= 0 );
3719 	}
3720 	return
3721 			aSign ? lt128( b.high, b.low, a.high, a.low )
3722 		: lt128( a.high, a.low, b.high, b.low );
3723 
3724 }
3725 
3726 /*----------------------------------------------------------------------------
3727 | Returns 1 if the extended double-precision floating-point value `a' is equal
3728 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
3729 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3730 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3731 *----------------------------------------------------------------------------*/
3732 
floatx80_eq_signaling(floatx80 a,floatx80 b)3733 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3734 {
3735 	if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3736 				&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
3737 			|| (    ( extractFloatx80Exp( b ) == 0x7FFF )
3738 				&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
3739 		) {
3740 		float_raise( float_flag_invalid );
3741 		return 0;
3742 	}
3743 	return
3744 			( a.low == b.low )
3745 		&& (    ( a.high == b.high )
3746 				|| (    ( a.low == 0 )
3747 					&& ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3748 			);
3749 
3750 }
3751 
3752 /*----------------------------------------------------------------------------
3753 | Returns 1 if the extended double-precision floating-point value `a' is less
3754 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
3755 | do not cause an exception.  Otherwise, the comparison is performed according
3756 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3757 *----------------------------------------------------------------------------*/
3758 
floatx80_le_quiet(floatx80 a,floatx80 b)3759 flag floatx80_le_quiet( floatx80 a, floatx80 b )
3760 {
3761 	flag aSign, bSign;
3762 
3763 	if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3764 				&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
3765 			|| (    ( extractFloatx80Exp( b ) == 0x7FFF )
3766 				&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
3767 		) {
3768 		if (    floatx80_is_signaling_nan( a )
3769 				|| floatx80_is_signaling_nan( b ) ) {
3770 			float_raise( float_flag_invalid );
3771 		}
3772 		return 0;
3773 	}
3774 	aSign = extractFloatx80Sign( a );
3775 	bSign = extractFloatx80Sign( b );
3776 	if ( aSign != bSign ) {
3777 		return
3778 				aSign
3779 			|| (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3780 					== 0 );
3781 	}
3782 	return
3783 			aSign ? le128( b.high, b.low, a.high, a.low )
3784 		: le128( a.high, a.low, b.high, b.low );
3785 
3786 }
3787 
3788 /*----------------------------------------------------------------------------
3789 | Returns 1 if the extended double-precision floating-point value `a' is less
3790 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
3791 | an exception.  Otherwise, the comparison is performed according to the
3792 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3793 *----------------------------------------------------------------------------*/
3794 
floatx80_lt_quiet(floatx80 a,floatx80 b)3795 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3796 {
3797 	flag aSign, bSign;
3798 
3799 	if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3800 				&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
3801 			|| (    ( extractFloatx80Exp( b ) == 0x7FFF )
3802 				&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
3803 		) {
3804 		if (    floatx80_is_signaling_nan( a )
3805 				|| floatx80_is_signaling_nan( b ) ) {
3806 			float_raise( float_flag_invalid );
3807 		}
3808 		return 0;
3809 	}
3810 	aSign = extractFloatx80Sign( a );
3811 	bSign = extractFloatx80Sign( b );
3812 	if ( aSign != bSign ) {
3813 		return
3814 				aSign
3815 			&& (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3816 					!= 0 );
3817 	}
3818 	return
3819 			aSign ? lt128( b.high, b.low, a.high, a.low )
3820 		: lt128( a.high, a.low, b.high, b.low );
3821 
3822 }
3823 
3824 #endif
3825 
3826 #ifdef FLOAT128
3827 
3828 /*----------------------------------------------------------------------------
3829 | Returns the result of converting the quadruple-precision floating-point
3830 | value `a' to the 32-bit two's complement integer format.  The conversion
3831 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3832 | Arithmetic---which means in particular that the conversion is rounded
3833 | according to the current rounding mode.  If `a' is a NaN, the largest
3834 | positive integer is returned.  Otherwise, if the conversion overflows, the
3835 | largest integer with the same sign as `a' is returned.
3836 *----------------------------------------------------------------------------*/
3837 
float128_to_int32(float128 a)3838 int32 float128_to_int32( float128 a )
3839 {
3840 	flag aSign;
3841 	int32 aExp, shiftCount;
3842 	bits64 aSig0, aSig1;
3843 
3844 	aSig1 = extractFloat128Frac1( a );
3845 	aSig0 = extractFloat128Frac0( a );
3846 	aExp = extractFloat128Exp( a );
3847 	aSign = extractFloat128Sign( a );
3848 	if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
3849 	if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
3850 	aSig0 |= ( aSig1 != 0 );
3851 	shiftCount = 0x4028 - aExp;
3852 	if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
3853 	return roundAndPackInt32( aSign, aSig0 );
3854 
3855 }
3856 
3857 /*----------------------------------------------------------------------------
3858 | Returns the result of converting the quadruple-precision floating-point
3859 | value `a' to the 32-bit two's complement integer format.  The conversion
3860 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3861 | Arithmetic, except that the conversion is always rounded toward zero.  If
3862 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
3863 | conversion overflows, the largest integer with the same sign as `a' is
3864 | returned.
3865 *----------------------------------------------------------------------------*/
3866 
float128_to_int32_round_to_zero(float128 a)3867 int32 float128_to_int32_round_to_zero( float128 a )
3868 {
3869 	flag aSign;
3870 	int32 aExp, shiftCount;
3871 	bits64 aSig0, aSig1, savedASig;
3872 	int32 z;
3873 
3874 	aSig1 = extractFloat128Frac1( a );
3875 	aSig0 = extractFloat128Frac0( a );
3876 	aExp = extractFloat128Exp( a );
3877 	aSign = extractFloat128Sign( a );
3878 	aSig0 |= ( aSig1 != 0 );
3879 	if ( 0x401E < aExp ) {
3880 		if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
3881 		goto invalid;
3882 	}
3883 	else if ( aExp < 0x3FFF ) {
3884 		if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
3885 		return 0;
3886 	}
3887 	aSig0 |= LIT64( 0x0001000000000000 );
3888 	shiftCount = 0x402F - aExp;
3889 	savedASig = aSig0;
3890 	aSig0 >>= shiftCount;
3891 	z = aSig0;
3892 	if ( aSign ) z = - z;
3893 	if ( ( z < 0 ) ^ aSign ) {
3894 	invalid:
3895 		float_raise( float_flag_invalid );
3896 		return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3897 	}
3898 	if ( ( aSig0<<shiftCount ) != savedASig ) {
3899 		float_exception_flags |= float_flag_inexact;
3900 	}
3901 	return z;
3902 
3903 }
3904 
3905 /*----------------------------------------------------------------------------
3906 | Returns the result of converting the quadruple-precision floating-point
3907 | value `a' to the 64-bit two's complement integer format.  The conversion
3908 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3909 | Arithmetic---which means in particular that the conversion is rounded
3910 | according to the current rounding mode.  If `a' is a NaN, the largest
3911 | positive integer is returned.  Otherwise, if the conversion overflows, the
3912 | largest integer with the same sign as `a' is returned.
3913 *----------------------------------------------------------------------------*/
3914 
float128_to_int64(float128 a)3915 int64 float128_to_int64( float128 a )
3916 {
3917 	flag aSign;
3918 	int32 aExp, shiftCount;
3919 	bits64 aSig0, aSig1;
3920 
3921 	aSig1 = extractFloat128Frac1( a );
3922 	aSig0 = extractFloat128Frac0( a );
3923 	aExp = extractFloat128Exp( a );
3924 	aSign = extractFloat128Sign( a );
3925 	if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
3926 	shiftCount = 0x402F - aExp;
3927 	if ( shiftCount <= 0 ) {
3928 		if ( 0x403E < aExp ) {
3929 			float_raise( float_flag_invalid );
3930 			if (    ! aSign
3931 					|| (    ( aExp == 0x7FFF )
3932 						&& ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
3933 					)
3934 				) {
3935 				return LIT64( 0x7FFFFFFFFFFFFFFF );
3936 			}
3937 			return (sbits64) LIT64( 0x8000000000000000 );
3938 		}
3939 		shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
3940 	}
3941 	else {
3942 		shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
3943 	}
3944 	return roundAndPackInt64( aSign, aSig0, aSig1 );
3945 
3946 }
3947 
3948 /*----------------------------------------------------------------------------
3949 | Returns the result of converting the quadruple-precision floating-point
3950 | value `a' to the 64-bit two's complement integer format.  The conversion
3951 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3952 | Arithmetic, except that the conversion is always rounded toward zero.
3953 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3954 | the conversion overflows, the largest integer with the same sign as `a' is
3955 | returned.
3956 *----------------------------------------------------------------------------*/
3957 
float128_to_int64_round_to_zero(float128 a)3958 int64 float128_to_int64_round_to_zero( float128 a )
3959 {
3960 	flag aSign;
3961 	int32 aExp, shiftCount;
3962 	bits64 aSig0, aSig1;
3963 	int64 z;
3964 
3965 	aSig1 = extractFloat128Frac1( a );
3966 	aSig0 = extractFloat128Frac0( a );
3967 	aExp = extractFloat128Exp( a );
3968 	aSign = extractFloat128Sign( a );
3969 	if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
3970 	shiftCount = aExp - 0x402F;
3971 	if ( 0 < shiftCount ) {
3972 		if ( 0x403E <= aExp ) {
3973 			aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
3974 			if (    ( a.high == LIT64( 0xC03E000000000000 ) )
3975 					&& ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
3976 				if ( aSig1 ) float_exception_flags |= float_flag_inexact;
3977 			}
3978 			else {
3979 				float_raise( float_flag_invalid );
3980 				if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
3981 					return LIT64( 0x7FFFFFFFFFFFFFFF );
3982 				}
3983 			}
3984 			return (sbits64) LIT64( 0x8000000000000000 );
3985 		}
3986 		z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
3987 		if ( (bits64) ( aSig1<<shiftCount ) ) {
3988 			float_exception_flags |= float_flag_inexact;
3989 		}
3990 	}
3991 	else {
3992 		if ( aExp < 0x3FFF ) {
3993 			if ( aExp | aSig0 | aSig1 ) {
3994 				float_exception_flags |= float_flag_inexact;
3995 			}
3996 			return 0;
3997 		}
3998 		z = aSig0>>( - shiftCount );
3999 		if (    aSig1
4000 				|| ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4001 			float_exception_flags |= float_flag_inexact;
4002 		}
4003 	}
4004 	if ( aSign ) z = - z;
4005 	return z;
4006 
4007 }
4008 
4009 /*----------------------------------------------------------------------------
4010 | Returns the result of converting the quadruple-precision floating-point
4011 | value `a' to the single-precision floating-point format.  The conversion
4012 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4013 | Arithmetic.
4014 *----------------------------------------------------------------------------*/
4015 
float128_to_float32(float128 a)4016 float32 float128_to_float32( float128 a )
4017 {
4018 	flag aSign;
4019 	int32 aExp;
4020 	bits64 aSig0, aSig1;
4021 	bits32 zSig;
4022 
4023 	aSig1 = extractFloat128Frac1( a );
4024 	aSig0 = extractFloat128Frac0( a );
4025 	aExp = extractFloat128Exp( a );
4026 	aSign = extractFloat128Sign( a );
4027 	if ( aExp == 0x7FFF ) {
4028 		if ( aSig0 | aSig1 ) {
4029 			return commonNaNToFloat32( float128ToCommonNaN( a ) );
4030 		}
4031 		return packFloat32( aSign, 0xFF, 0 );
4032 	}
4033 	aSig0 |= ( aSig1 != 0 );
4034 	shift64RightJamming( aSig0, 18, &aSig0 );
4035 	zSig = aSig0;
4036 	if ( aExp || zSig ) {
4037 		zSig |= 0x40000000;
4038 		aExp -= 0x3F81;
4039 	}
4040 	return roundAndPackFloat32( aSign, aExp, zSig );
4041 
4042 }
4043 
4044 /*----------------------------------------------------------------------------
4045 | Returns the result of converting the quadruple-precision floating-point
4046 | value `a' to the double-precision floating-point format.  The conversion
4047 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4048 | Arithmetic.
4049 *----------------------------------------------------------------------------*/
4050 
float128_to_float64(float128 a)4051 float64 float128_to_float64( float128 a )
4052 {
4053 	flag aSign;
4054 	int32 aExp;
4055 	bits64 aSig0, aSig1;
4056 
4057 	aSig1 = extractFloat128Frac1( a );
4058 	aSig0 = extractFloat128Frac0( a );
4059 	aExp = extractFloat128Exp( a );
4060 	aSign = extractFloat128Sign( a );
4061 	if ( aExp == 0x7FFF ) {
4062 		if ( aSig0 | aSig1 ) {
4063 			return commonNaNToFloat64( float128ToCommonNaN( a ) );
4064 		}
4065 		return packFloat64( aSign, 0x7FF, 0 );
4066 	}
4067 	shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4068 	aSig0 |= ( aSig1 != 0 );
4069 	if ( aExp || aSig0 ) {
4070 		aSig0 |= LIT64( 0x4000000000000000 );
4071 		aExp -= 0x3C01;
4072 	}
4073 	return roundAndPackFloat64( aSign, aExp, aSig0 );
4074 
4075 }
4076 
4077 #ifdef FLOATX80
4078 
4079 /*----------------------------------------------------------------------------
4080 | Returns the result of converting the quadruple-precision floating-point
4081 | value `a' to the extended double-precision floating-point format.  The
4082 | conversion is performed according to the IEC/IEEE Standard for Binary
4083 | Floating-Point Arithmetic.
4084 *----------------------------------------------------------------------------*/
4085 
float128_to_floatx80(float128 a)4086 floatx80 float128_to_floatx80( float128 a )
4087 {
4088 	flag aSign;
4089 	int32 aExp;
4090 	bits64 aSig0, aSig1;
4091 
4092 	aSig1 = extractFloat128Frac1( a );
4093 	aSig0 = extractFloat128Frac0( a );
4094 	aExp = extractFloat128Exp( a );
4095 	aSign = extractFloat128Sign( a );
4096 	if ( aExp == 0x7FFF ) {
4097 		if ( aSig0 | aSig1 ) {
4098 			return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4099 		}
4100 		return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4101 	}
4102 	if ( aExp == 0 ) {
4103 		if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4104 		normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4105 	}
4106 	else {
4107 		aSig0 |= LIT64( 0x0001000000000000 );
4108 	}
4109 	shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4110 	return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4111 
4112 }
4113 
4114 #endif
4115 
4116 /*----------------------------------------------------------------------------
4117 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4118 | returns the result as a quadruple-precision floating-point value.  The
4119 | operation is performed according to the IEC/IEEE Standard for Binary
4120 | Floating-Point Arithmetic.
4121 *----------------------------------------------------------------------------*/
4122 
float128_round_to_int(float128 a)4123 float128 float128_round_to_int( float128 a )
4124 {
4125 	flag aSign;
4126 	int32 aExp;
4127 	bits64 lastBitMask, roundBitsMask;
4128 	int8 roundingMode;
4129 	float128 z;
4130 
4131 	aExp = extractFloat128Exp( a );
4132 	if ( 0x402F <= aExp ) {
4133 		if ( 0x406F <= aExp ) {
4134 			if (    ( aExp == 0x7FFF )
4135 					&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4136 				) {
4137 				return propagateFloat128NaN( a, a );
4138 			}
4139 			return a;
4140 		}
4141 		lastBitMask = 1;
4142 		lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4143 		roundBitsMask = lastBitMask - 1;
4144 		z = a;
4145 		roundingMode = float_rounding_mode;
4146 		if ( roundingMode == float_round_nearest_even ) {
4147 			if ( lastBitMask ) {
4148 				add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4149 				if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4150 			}
4151 			else {
4152 				if ( (sbits64) z.low < 0 ) {
4153 					++z.high;
4154 					if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4155 				}
4156 			}
4157 		}
4158 		else if ( roundingMode != float_round_to_zero ) {
4159 			if (   extractFloat128Sign( z )
4160 					^ ( roundingMode == float_round_up ) ) {
4161 				add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4162 			}
4163 		}
4164 		z.low &= ~ roundBitsMask;
4165 	}
4166 	else {
4167 		if ( aExp < 0x3FFF ) {
4168 			if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4169 			float_exception_flags |= float_flag_inexact;
4170 			aSign = extractFloat128Sign( a );
4171 			switch ( float_rounding_mode ) {
4172 				case float_round_nearest_even:
4173 				if (    ( aExp == 0x3FFE )
4174 						&& (   extractFloat128Frac0( a )
4175 							| extractFloat128Frac1( a ) )
4176 					) {
4177 					return packFloat128( aSign, 0x3FFF, 0, 0 );
4178 				}
4179 				break;
4180 				case float_round_down:
4181 				return
4182 						aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4183 					: packFloat128( 0, 0, 0, 0 );
4184 				case float_round_up:
4185 				return
4186 						aSign ? packFloat128( 1, 0, 0, 0 )
4187 					: packFloat128( 0, 0x3FFF, 0, 0 );
4188 			}
4189 			return packFloat128( aSign, 0, 0, 0 );
4190 		}
4191 		lastBitMask = 1;
4192 		lastBitMask <<= 0x402F - aExp;
4193 		roundBitsMask = lastBitMask - 1;
4194 		z.low = 0;
4195 		z.high = a.high;
4196 		roundingMode = float_rounding_mode;
4197 		if ( roundingMode == float_round_nearest_even ) {
4198 			z.high += lastBitMask>>1;
4199 			if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4200 				z.high &= ~ lastBitMask;
4201 			}
4202 		}
4203 		else if ( roundingMode != float_round_to_zero ) {
4204 			if (   extractFloat128Sign( z )
4205 					^ ( roundingMode == float_round_up ) ) {
4206 				z.high |= ( a.low != 0 );
4207 				z.high += roundBitsMask;
4208 			}
4209 		}
4210 		z.high &= ~ roundBitsMask;
4211 	}
4212 	if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4213 		float_exception_flags |= float_flag_inexact;
4214 	}
4215 	return z;
4216 
4217 }
4218 
4219 /*----------------------------------------------------------------------------
4220 | Returns the result of adding the absolute values of the quadruple-precision
4221 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4222 | before being returned.  `zSign' is ignored if the result is a NaN.
4223 | The addition is performed according to the IEC/IEEE Standard for Binary
4224 | Floating-Point Arithmetic.
4225 *----------------------------------------------------------------------------*/
4226 
addFloat128Sigs(float128 a,float128 b,flag zSign)4227 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4228 {
4229 	int32 aExp, bExp, zExp;
4230 	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4231 	int32 expDiff;
4232 
4233 	aSig1 = extractFloat128Frac1( a );
4234 	aSig0 = extractFloat128Frac0( a );
4235 	aExp = extractFloat128Exp( a );
4236 	bSig1 = extractFloat128Frac1( b );
4237 	bSig0 = extractFloat128Frac0( b );
4238 	bExp = extractFloat128Exp( b );
4239 	expDiff = aExp - bExp;
4240 	if ( 0 < expDiff ) {
4241 		if ( aExp == 0x7FFF ) {
4242 			if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4243 			return a;
4244 		}
4245 		if ( bExp == 0 ) {
4246 			--expDiff;
4247 		}
4248 		else {
4249 			bSig0 |= LIT64( 0x0001000000000000 );
4250 		}
4251 		shift128ExtraRightJamming(
4252 			bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4253 		zExp = aExp;
4254 	}
4255 	else if ( expDiff < 0 ) {
4256 		if ( bExp == 0x7FFF ) {
4257 			if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4258 			return packFloat128( zSign, 0x7FFF, 0, 0 );
4259 		}
4260 		if ( aExp == 0 ) {
4261 			++expDiff;
4262 		}
4263 		else {
4264 			aSig0 |= LIT64( 0x0001000000000000 );
4265 		}
4266 		shift128ExtraRightJamming(
4267 			aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4268 		zExp = bExp;
4269 	}
4270 	else {
4271 		if ( aExp == 0x7FFF ) {
4272 			if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4273 				return propagateFloat128NaN( a, b );
4274 			}
4275 			return a;
4276 		}
4277 		add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4278 		if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4279 		zSig2 = 0;
4280 		zSig0 |= LIT64( 0x0002000000000000 );
4281 		zExp = aExp;
4282 		goto shiftRight1;
4283 	}
4284 	aSig0 |= LIT64( 0x0001000000000000 );
4285 	add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4286 	--zExp;
4287 	if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4288 	++zExp;
4289 	shiftRight1:
4290 	shift128ExtraRightJamming(
4291 		zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4292 	roundAndPack:
4293 	return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4294 
4295 }
4296 
4297 /*----------------------------------------------------------------------------
4298 | Returns the result of subtracting the absolute values of the quadruple-
4299 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
4300 | difference is negated before being returned.  `zSign' is ignored if the
4301 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4302 | Standard for Binary Floating-Point Arithmetic.
4303 *----------------------------------------------------------------------------*/
4304 
subFloat128Sigs(float128 a,float128 b,flag zSign)4305 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4306 {
4307 	int32 aExp, bExp, zExp;
4308 	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4309 	int32 expDiff;
4310 	float128 z;
4311 
4312 	aSig1 = extractFloat128Frac1( a );
4313 	aSig0 = extractFloat128Frac0( a );
4314 	aExp = extractFloat128Exp( a );
4315 	bSig1 = extractFloat128Frac1( b );
4316 	bSig0 = extractFloat128Frac0( b );
4317 	bExp = extractFloat128Exp( b );
4318 	expDiff = aExp - bExp;
4319 	shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4320 	shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4321 	if ( 0 < expDiff ) goto aExpBigger;
4322 	if ( expDiff < 0 ) goto bExpBigger;
4323 	if ( aExp == 0x7FFF ) {
4324 		if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4325 			return propagateFloat128NaN( a, b );
4326 		}
4327 		float_raise( float_flag_invalid );
4328 		z.low = float128_default_nan_low;
4329 		z.high = float128_default_nan_high;
4330 		return z;
4331 	}
4332 	if ( aExp == 0 ) {
4333 		aExp = 1;
4334 		bExp = 1;
4335 	}
4336 	if ( bSig0 < aSig0 ) goto aBigger;
4337 	if ( aSig0 < bSig0 ) goto bBigger;
4338 	if ( bSig1 < aSig1 ) goto aBigger;
4339 	if ( aSig1 < bSig1 ) goto bBigger;
4340 	return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4341 	bExpBigger:
4342 	if ( bExp == 0x7FFF ) {
4343 		if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4344 		return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4345 	}
4346 	if ( aExp == 0 ) {
4347 		++expDiff;
4348 	}
4349 	else {
4350 		aSig0 |= LIT64( 0x4000000000000000 );
4351 	}
4352 	shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4353 	bSig0 |= LIT64( 0x4000000000000000 );
4354 	bBigger:
4355 	sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4356 	zExp = bExp;
4357 	zSign ^= 1;
4358 	goto normalizeRoundAndPack;
4359 	aExpBigger:
4360 	if ( aExp == 0x7FFF ) {
4361 		if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4362 		return a;
4363 	}
4364 	if ( bExp == 0 ) {
4365 		--expDiff;
4366 	}
4367 	else {
4368 		bSig0 |= LIT64( 0x4000000000000000 );
4369 	}
4370 	shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4371 	aSig0 |= LIT64( 0x4000000000000000 );
4372 	aBigger:
4373 	sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4374 	zExp = aExp;
4375 	normalizeRoundAndPack:
4376 	--zExp;
4377 	return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4378 
4379 }
4380 
4381 /*----------------------------------------------------------------------------
4382 | Returns the result of adding the quadruple-precision floating-point values
4383 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4384 | for Binary Floating-Point Arithmetic.
4385 *----------------------------------------------------------------------------*/
4386 
float128_add(float128 a,float128 b)4387 float128 float128_add( float128 a, float128 b )
4388 {
4389 	flag aSign, bSign;
4390 
4391 	aSign = extractFloat128Sign( a );
4392 	bSign = extractFloat128Sign( b );
4393 	if ( aSign == bSign ) {
4394 		return addFloat128Sigs( a, b, aSign );
4395 	}
4396 	else {
4397 		return subFloat128Sigs( a, b, aSign );
4398 	}
4399 
4400 }
4401 
4402 /*----------------------------------------------------------------------------
4403 | Returns the result of subtracting the quadruple-precision floating-point
4404 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4405 | Standard for Binary Floating-Point Arithmetic.
4406 *----------------------------------------------------------------------------*/
4407 
float128_sub(float128 a,float128 b)4408 float128 float128_sub( float128 a, float128 b )
4409 {
4410 	flag aSign, bSign;
4411 
4412 	aSign = extractFloat128Sign( a );
4413 	bSign = extractFloat128Sign( b );
4414 	if ( aSign == bSign ) {
4415 		return subFloat128Sigs( a, b, aSign );
4416 	}
4417 	else {
4418 		return addFloat128Sigs( a, b, aSign );
4419 	}
4420 
4421 }
4422 
4423 /*----------------------------------------------------------------------------
4424 | Returns the result of multiplying the quadruple-precision floating-point
4425 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4426 | Standard for Binary Floating-Point Arithmetic.
4427 *----------------------------------------------------------------------------*/
4428 
float128_mul(float128 a,float128 b)4429 float128 float128_mul( float128 a, float128 b )
4430 {
4431 	flag aSign, bSign, zSign;
4432 	int32 aExp, bExp, zExp;
4433 	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4434 	float128 z;
4435 
4436 	aSig1 = extractFloat128Frac1( a );
4437 	aSig0 = extractFloat128Frac0( a );
4438 	aExp = extractFloat128Exp( a );
4439 	aSign = extractFloat128Sign( a );
4440 	bSig1 = extractFloat128Frac1( b );
4441 	bSig0 = extractFloat128Frac0( b );
4442 	bExp = extractFloat128Exp( b );
4443 	bSign = extractFloat128Sign( b );
4444 	zSign = aSign ^ bSign;
4445 	if ( aExp == 0x7FFF ) {
4446 		if (    ( aSig0 | aSig1 )
4447 				|| ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4448 			return propagateFloat128NaN( a, b );
4449 		}
4450 		if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4451 		return packFloat128( zSign, 0x7FFF, 0, 0 );
4452 	}
4453 	if ( bExp == 0x7FFF ) {
4454 		if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4455 		if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4456 	invalid:
4457 			float_raise( float_flag_invalid );
4458 			z.low = float128_default_nan_low;
4459 			z.high = float128_default_nan_high;
4460 			return z;
4461 		}
4462 		return packFloat128( zSign, 0x7FFF, 0, 0 );
4463 	}
4464 	if ( aExp == 0 ) {
4465 		if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4466 		normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4467 	}
4468 	if ( bExp == 0 ) {
4469 		if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4470 		normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4471 	}
4472 	zExp = aExp + bExp - 0x4000;
4473 	aSig0 |= LIT64( 0x0001000000000000 );
4474 	shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4475 	mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4476 	add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4477 	zSig2 |= ( zSig3 != 0 );
4478 	if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4479 		shift128ExtraRightJamming(
4480 			zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4481 		++zExp;
4482 	}
4483 	return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4484 
4485 }
4486 
4487 /*----------------------------------------------------------------------------
4488 | Returns the result of dividing the quadruple-precision floating-point value
4489 | `a' by the corresponding value `b'.  The operation is performed according to
4490 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4491 *----------------------------------------------------------------------------*/
4492 
float128_div(float128 a,float128 b)4493 float128 float128_div( float128 a, float128 b )
4494 {
4495 	flag aSign, bSign, zSign;
4496 	int32 aExp, bExp, zExp;
4497 	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4498 	bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4499 	float128 z;
4500 
4501 	aSig1 = extractFloat128Frac1( a );
4502 	aSig0 = extractFloat128Frac0( a );
4503 	aExp = extractFloat128Exp( a );
4504 	aSign = extractFloat128Sign( a );
4505 	bSig1 = extractFloat128Frac1( b );
4506 	bSig0 = extractFloat128Frac0( b );
4507 	bExp = extractFloat128Exp( b );
4508 	bSign = extractFloat128Sign( b );
4509 	zSign = aSign ^ bSign;
4510 	if ( aExp == 0x7FFF ) {
4511 		if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4512 		if ( bExp == 0x7FFF ) {
4513 			if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4514 			goto invalid;
4515 		}
4516 		return packFloat128( zSign, 0x7FFF, 0, 0 );
4517 	}
4518 	if ( bExp == 0x7FFF ) {
4519 		if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4520 		return packFloat128( zSign, 0, 0, 0 );
4521 	}
4522 	if ( bExp == 0 ) {
4523 		if ( ( bSig0 | bSig1 ) == 0 ) {
4524 			if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4525 	invalid:
4526 				float_raise( float_flag_invalid );
4527 				z.low = float128_default_nan_low;
4528 				z.high = float128_default_nan_high;
4529 				return z;
4530 			}
4531 			float_raise( float_flag_divbyzero );
4532 			return packFloat128( zSign, 0x7FFF, 0, 0 );
4533 		}
4534 		normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4535 	}
4536 	if ( aExp == 0 ) {
4537 		if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4538 		normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4539 	}
4540 	zExp = aExp - bExp + 0x3FFD;
4541 	shortShift128Left(
4542 		aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4543 	shortShift128Left(
4544 		bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4545 	if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4546 		shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4547 		++zExp;
4548 	}
4549 	zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4550 	mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4551 	sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4552 	while ( (sbits64) rem0 < 0 ) {
4553 		--zSig0;
4554 		add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4555 	}
4556 	zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4557 	if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4558 		mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4559 		sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4560 		while ( (sbits64) rem1 < 0 ) {
4561 			--zSig1;
4562 			add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4563 		}
4564 		zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4565 	}
4566 	shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4567 	return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4568 
4569 }
4570 
4571 /*----------------------------------------------------------------------------
4572 | Returns the remainder of the quadruple-precision floating-point value `a'
4573 | with respect to the corresponding value `b'.  The operation is performed
4574 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4575 *----------------------------------------------------------------------------*/
4576 
float128_rem(float128 a,float128 b)4577 float128 float128_rem( float128 a, float128 b )
4578 {
4579 	flag aSign, zSign;
4580 	int32 aExp, bExp, expDiff;
4581 	bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4582 	bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4583 	sbits64 sigMean0;
4584 	float128 z;
4585 
4586 	aSig1 = extractFloat128Frac1( a );
4587 	aSig0 = extractFloat128Frac0( a );
4588 	aExp = extractFloat128Exp( a );
4589 	aSign = extractFloat128Sign( a );
4590 	bSig1 = extractFloat128Frac1( b );
4591 	bSig0 = extractFloat128Frac0( b );
4592 	bExp = extractFloat128Exp( b );
4593 //    bSign = extractFloat128Sign( b );
4594 	if ( aExp == 0x7FFF ) {
4595 		if (    ( aSig0 | aSig1 )
4596 				|| ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4597 			return propagateFloat128NaN( a, b );
4598 		}
4599 		goto invalid;
4600 	}
4601 	if ( bExp == 0x7FFF ) {
4602 		if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4603 		return a;
4604 	}
4605 	if ( bExp == 0 ) {
4606 		if ( ( bSig0 | bSig1 ) == 0 ) {
4607 	invalid:
4608 			float_raise( float_flag_invalid );
4609 			z.low = float128_default_nan_low;
4610 			z.high = float128_default_nan_high;
4611 			return z;
4612 		}
4613 		normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4614 	}
4615 	if ( aExp == 0 ) {
4616 		if ( ( aSig0 | aSig1 ) == 0 ) return a;
4617 		normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4618 	}
4619 	expDiff = aExp - bExp;
4620 	if ( expDiff < -1 ) return a;
4621 	shortShift128Left(
4622 		aSig0 | LIT64( 0x0001000000000000 ),
4623 		aSig1,
4624 		15 - ( expDiff < 0 ),
4625 		&aSig0,
4626 		&aSig1
4627 	);
4628 	shortShift128Left(
4629 		bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4630 	q = le128( bSig0, bSig1, aSig0, aSig1 );
4631 	if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4632 	expDiff -= 64;
4633 	while ( 0 < expDiff ) {
4634 		q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4635 		q = ( 4 < q ) ? q - 4 : 0;
4636 		mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4637 		shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4638 		shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4639 		sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4640 		expDiff -= 61;
4641 	}
4642 	if ( -64 < expDiff ) {
4643 		q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4644 		q = ( 4 < q ) ? q - 4 : 0;
4645 		q >>= - expDiff;
4646 		shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4647 		expDiff += 52;
4648 		if ( expDiff < 0 ) {
4649 			shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4650 		}
4651 		else {
4652 			shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4653 		}
4654 		mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4655 		sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4656 	}
4657 	else {
4658 		shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4659 		shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4660 	}
4661 	do {
4662 		alternateASig0 = aSig0;
4663 		alternateASig1 = aSig1;
4664 		++q;
4665 		sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4666 	} while ( 0 <= (sbits64) aSig0 );
4667 	add128(
4668 		aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
4669 	if (    ( sigMean0 < 0 )
4670 			|| ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4671 		aSig0 = alternateASig0;
4672 		aSig1 = alternateASig1;
4673 	}
4674 	zSign = ( (sbits64) aSig0 < 0 );
4675 	if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4676 	return
4677 		normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
4678 
4679 }
4680 
4681 /*----------------------------------------------------------------------------
4682 | Returns the square root of the quadruple-precision floating-point value `a'.
4683 | The operation is performed according to the IEC/IEEE Standard for Binary
4684 | Floating-Point Arithmetic.
4685 *----------------------------------------------------------------------------*/
4686 
float128_sqrt(float128 a)4687 float128 float128_sqrt( float128 a )
4688 {
4689 	flag aSign;
4690 	int32 aExp, zExp;
4691 	bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
4692 	bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4693 	float128 z;
4694 
4695 	aSig1 = extractFloat128Frac1( a );
4696 	aSig0 = extractFloat128Frac0( a );
4697 	aExp = extractFloat128Exp( a );
4698 	aSign = extractFloat128Sign( a );
4699 	if ( aExp == 0x7FFF ) {
4700 		if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
4701 		if ( ! aSign ) return a;
4702 		goto invalid;
4703 	}
4704 	if ( aSign ) {
4705 		if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
4706 	invalid:
4707 		float_raise( float_flag_invalid );
4708 		z.low = float128_default_nan_low;
4709 		z.high = float128_default_nan_high;
4710 		return z;
4711 	}
4712 	if ( aExp == 0 ) {
4713 		if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
4714 		normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4715 	}
4716 	zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
4717 	aSig0 |= LIT64( 0x0001000000000000 );
4718 	zSig0 = estimateSqrt32( aExp, aSig0>>17 );
4719 	shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
4720 	zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4721 	doubleZSig0 = zSig0<<1;
4722 	mul64To128( zSig0, zSig0, &term0, &term1 );
4723 	sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4724 	while ( (sbits64) rem0 < 0 ) {
4725 		--zSig0;
4726 		doubleZSig0 -= 2;
4727 		add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4728 	}
4729 	zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4730 	if ( ( zSig1 & 0x1FFF ) <= 5 ) {
4731 		if ( zSig1 == 0 ) zSig1 = 1;
4732 		mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4733 		sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4734 		mul64To128( zSig1, zSig1, &term2, &term3 );
4735 		sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4736 		while ( (sbits64) rem1 < 0 ) {
4737 			--zSig1;
4738 			shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4739 			term3 |= 1;
4740 			term2 |= doubleZSig0;
4741 			add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4742 		}
4743 		zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4744 	}
4745 	shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
4746 	return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
4747 
4748 }
4749 
4750 /*----------------------------------------------------------------------------
4751 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
4752 | the corresponding value `b', and 0 otherwise.  The comparison is performed
4753 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4754 *----------------------------------------------------------------------------*/
4755 
float128_eq(float128 a,float128 b)4756 flag float128_eq( float128 a, float128 b )
4757 {
4758 	if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
4759 				&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4760 			|| (    ( extractFloat128Exp( b ) == 0x7FFF )
4761 				&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4762 		) {
4763 		if (    float128_is_signaling_nan( a )
4764 				|| float128_is_signaling_nan( b ) ) {
4765 			float_raise( float_flag_invalid );
4766 		}
4767 		return 0;
4768 	}
4769 	return
4770 			( a.low == b.low )
4771 		&& (    ( a.high == b.high )
4772 				|| (    ( a.low == 0 )
4773 					&& ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
4774 			);
4775 
4776 }
4777 
4778 /*----------------------------------------------------------------------------
4779 | Returns 1 if the quadruple-precision floating-point value `a' is less than
4780 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
4781 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4782 | Arithmetic.
4783 *----------------------------------------------------------------------------*/
4784 
float128_le(float128 a,float128 b)4785 flag float128_le( float128 a, float128 b )
4786 {
4787 	flag aSign, bSign;
4788 
4789 	if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
4790 				&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4791 			|| (    ( extractFloat128Exp( b ) == 0x7FFF )
4792 				&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4793 		) {
4794 		float_raise( float_flag_invalid );
4795 		return 0;
4796 	}
4797 	aSign = extractFloat128Sign( a );
4798 	bSign = extractFloat128Sign( b );
4799 	if ( aSign != bSign ) {
4800 		return
4801 				aSign
4802 			|| (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4803 					== 0 );
4804 	}
4805 	return
4806 			aSign ? le128( b.high, b.low, a.high, a.low )
4807 		: le128( a.high, a.low, b.high, b.low );
4808 
4809 }
4810 
4811 /*----------------------------------------------------------------------------
4812 | Returns 1 if the quadruple-precision floating-point value `a' is less than
4813 | the corresponding value `b', and 0 otherwise.  The comparison is performed
4814 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4815 *----------------------------------------------------------------------------*/
4816 
float128_lt(float128 a,float128 b)4817 flag float128_lt( float128 a, float128 b )
4818 {
4819 	flag aSign, bSign;
4820 
4821 	if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
4822 				&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4823 			|| (    ( extractFloat128Exp( b ) == 0x7FFF )
4824 				&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4825 		) {
4826 		float_raise( float_flag_invalid );
4827 		return 0;
4828 	}
4829 	aSign = extractFloat128Sign( a );
4830 	bSign = extractFloat128Sign( b );
4831 	if ( aSign != bSign ) {
4832 		return
4833 				aSign
4834 			&& (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4835 					!= 0 );
4836 	}
4837 	return
4838 			aSign ? lt128( b.high, b.low, a.high, a.low )
4839 		: lt128( a.high, a.low, b.high, b.low );
4840 
4841 }
4842 
4843 /*----------------------------------------------------------------------------
4844 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
4845 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4846 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4847 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4848 *----------------------------------------------------------------------------*/
4849 
float128_eq_signaling(float128 a,float128 b)4850 flag float128_eq_signaling( float128 a, float128 b )
4851 {
4852 	if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
4853 				&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4854 			|| (    ( extractFloat128Exp( b ) == 0x7FFF )
4855 				&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4856 		) {
4857 		float_raise( float_flag_invalid );
4858 		return 0;
4859 	}
4860 	return
4861 			( a.low == b.low )
4862 		&& (    ( a.high == b.high )
4863 				|| (    ( a.low == 0 )
4864 					&& ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
4865 			);
4866 
4867 }
4868 
4869 /*----------------------------------------------------------------------------
4870 | Returns 1 if the quadruple-precision floating-point value `a' is less than
4871 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4872 | cause an exception.  Otherwise, the comparison is performed according to the
4873 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4874 *----------------------------------------------------------------------------*/
4875 
float128_le_quiet(float128 a,float128 b)4876 flag float128_le_quiet( float128 a, float128 b )
4877 {
4878 	flag aSign, bSign;
4879 
4880 	if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
4881 				&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4882 			|| (    ( extractFloat128Exp( b ) == 0x7FFF )
4883 				&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4884 		) {
4885 		if (    float128_is_signaling_nan( a )
4886 				|| float128_is_signaling_nan( b ) ) {
4887 			float_raise( float_flag_invalid );
4888 		}
4889 		return 0;
4890 	}
4891 	aSign = extractFloat128Sign( a );
4892 	bSign = extractFloat128Sign( b );
4893 	if ( aSign != bSign ) {
4894 		return
4895 				aSign
4896 			|| (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4897 					== 0 );
4898 	}
4899 	return
4900 			aSign ? le128( b.high, b.low, a.high, a.low )
4901 		: le128( a.high, a.low, b.high, b.low );
4902 
4903 }
4904 
4905 /*----------------------------------------------------------------------------
4906 | Returns 1 if the quadruple-precision floating-point value `a' is less than
4907 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4908 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4909 | Standard for Binary Floating-Point Arithmetic.
4910 *----------------------------------------------------------------------------*/
4911 
float128_lt_quiet(float128 a,float128 b)4912 flag float128_lt_quiet( float128 a, float128 b )
4913 {
4914 	flag aSign, bSign;
4915 
4916 	if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
4917 				&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4918 			|| (    ( extractFloat128Exp( b ) == 0x7FFF )
4919 				&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4920 		) {
4921 		if (    float128_is_signaling_nan( a )
4922 				|| float128_is_signaling_nan( b ) ) {
4923 			float_raise( float_flag_invalid );
4924 		}
4925 		return 0;
4926 	}
4927 	aSign = extractFloat128Sign( a );
4928 	bSign = extractFloat128Sign( b );
4929 	if ( aSign != bSign ) {
4930 		return
4931 				aSign
4932 			&& (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4933 					!= 0 );
4934 	}
4935 	return
4936 			aSign ? lt128( b.high, b.low, a.high, a.low )
4937 		: lt128( a.high, a.low, b.high, b.low );
4938 
4939 }
4940 
4941 #endif
4942