1 /* $OpenBSD: softfloat.c,v 1.6 2014/07/01 20:21:17 miod Exp $ */
2 /* $NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $ */
3
4 /*
5 * This version hacked for use with gcc -msoft-float by bjh21.
6 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
7 * itself).
8 */
9
10 /*
11 * Things you may want to define:
12 *
13 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
14 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
15 * properly renamed.
16 */
17
18 /*
19 ===============================================================================
20
21 This C source file is part of the SoftFloat IEC/IEEE Floating-point
22 Arithmetic Package, Release 2a.
23
24 Written by John R. Hauser. This work was made possible in part by the
25 International Computer Science Institute, located at Suite 600, 1947 Center
26 Street, Berkeley, California 94704. Funding was partially provided by the
27 National Science Foundation under grant MIP-9311980. The original version
28 of this code was written as part of a project to build a fixed-point vector
29 processor in collaboration with the University of California at Berkeley,
30 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
31 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
32 arithmetic/SoftFloat.html'.
33
34 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable
35 effort has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT
36 WILL AT TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS
37 RESTRICTED TO PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL
38 RESPONSIBILITY FOR ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM
39 THEIR OWN USE OF THE SOFTWARE, AND WHO ALSO EFFECTIVELY INDEMNIFY
40 (possibly via similar legal warning) JOHN HAUSER AND THE INTERNATIONAL
41 COMPUTER SCIENCE INSTITUTE AGAINST ALL LOSSES, COSTS, OR OTHER PROBLEMS
42 ARISING FROM THE USE OF THE SOFTWARE BY THEIR CUSTOMERS AND CLIENTS.
43
44 Derivative works are acceptable, even for commercial purposes, so long as
45 (1) they include prominent notice that the work is derivative, and (2) they
46 include prominent notice akin to these four paragraphs for those parts of
47 this code that are retained.
48
49 ===============================================================================
50 */
51
52 #ifdef SOFTFLOAT_FOR_GCC
53 #include "softfloat-for-gcc.h"
54 #endif
55
56 #include "milieu.h"
57 #include "softfloat.h"
58
59 float32 normalizeRoundAndPackFloat32(flag, int16, bits32);
60 float64 normalizeRoundAndPackFloat64(flag, int16, bits64);
61
62 /*
63 * Conversions between floats as stored in memory and floats as
64 * SoftFloat uses them
65 */
66 #ifndef FLOAT64_DEMANGLE
67 #define FLOAT64_DEMANGLE(a) (a)
68 #endif
69 #ifndef FLOAT64_MANGLE
70 #define FLOAT64_MANGLE(a) (a)
71 #endif
72
73 /*
74 -------------------------------------------------------------------------------
75 Floating-point rounding mode, extended double-precision rounding precision,
76 and exception flags.
77 -------------------------------------------------------------------------------
78 */
79
80 /*
81 * XXX: This may cause options-MULTIPROCESSOR or thread problems someday.
82 * Right now, it does not. I've removed all other dynamic global
83 * variables. [ross]
84 */
85 #ifdef FLOATX80
86 int8 floatx80_rounding_precision = 80;
87 #endif
88
89 /*
90 -------------------------------------------------------------------------------
91 Primitive arithmetic functions, including multi-word arithmetic, and
92 division and square root approximations. (Can be specialized to target if
93 desired.)
94 -------------------------------------------------------------------------------
95 */
96 #include "softfloat-macros.h"
97
98 /*
99 -------------------------------------------------------------------------------
100 Functions and definitions to determine: (1) whether tininess for underflow
101 is detected before or after rounding by default, (2) what (if anything)
102 happens when exceptions are raised, (3) how signaling NaNs are distinguished
103 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
104 are propagated from function inputs to output. These details are target-
105 specific.
106 -------------------------------------------------------------------------------
107 */
108 #include "softfloat-specialize.h"
109
110 #ifndef SOFTFLOAT_FOR_GCC /* Not used */
111 /*
112 -------------------------------------------------------------------------------
113 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
114 and 7, and returns the properly rounded 32-bit integer corresponding to the
115 input. If `zSign' is 1, the input is negated before being converted to an
116 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
117 is simply rounded to an integer, with the inexact exception raised if the
118 input cannot be represented exactly as an integer. However, if the fixed-
119 point input is too large, the invalid exception is raised and the largest
120 positive or negative integer is returned.
121 -------------------------------------------------------------------------------
122 */
roundAndPackInt32(flag zSign,bits64 absZ)123 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
124 {
125 int8 roundingMode;
126 flag roundNearestEven;
127 int8 roundIncrement, roundBits;
128 int32 z;
129
130 roundingMode = float_rounding_mode();
131 roundNearestEven = ( roundingMode == float_round_nearest_even );
132 roundIncrement = 0x40;
133 if ( ! roundNearestEven ) {
134 if ( roundingMode == float_round_to_zero ) {
135 roundIncrement = 0;
136 }
137 else {
138 roundIncrement = 0x7F;
139 if ( zSign ) {
140 if ( roundingMode == float_round_up ) roundIncrement = 0;
141 }
142 else {
143 if ( roundingMode == float_round_down ) roundIncrement = 0;
144 }
145 }
146 }
147 roundBits = absZ & 0x7F;
148 absZ = ( absZ + roundIncrement )>>7;
149 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
150 z = absZ;
151 if ( zSign ) z = - z;
152 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
153 float_raise( float_flag_invalid );
154 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
155 }
156 if ( roundBits ) float_set_inexact();
157 return z;
158
159 }
160
161 /*
162 -------------------------------------------------------------------------------
163 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
164 `absZ1', with binary point between bits 63 and 64 (between the input words),
165 and returns the properly rounded 64-bit integer corresponding to the input.
166 If `zSign' is 1, the input is negated before being converted to an integer.
167 Ordinarily, the fixed-point input is simply rounded to an integer, with
168 the inexact exception raised if the input cannot be represented exactly as
169 an integer. However, if the fixed-point input is too large, the invalid
170 exception is raised and the largest positive or negative integer is
171 returned.
172 -------------------------------------------------------------------------------
173 */
roundAndPackInt64(flag zSign,bits64 absZ0,bits64 absZ1)174 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
175 {
176 int8 roundingMode;
177 flag roundNearestEven, increment;
178 int64 z;
179
180 roundingMode = float_rounding_mode();
181 roundNearestEven = ( roundingMode == float_round_nearest_even );
182 increment = ( (sbits64) absZ1 < 0 );
183 if ( ! roundNearestEven ) {
184 if ( roundingMode == float_round_to_zero ) {
185 increment = 0;
186 }
187 else {
188 if ( zSign ) {
189 increment = ( roundingMode == float_round_down ) && absZ1;
190 }
191 else {
192 increment = ( roundingMode == float_round_up ) && absZ1;
193 }
194 }
195 }
196 if ( increment ) {
197 ++absZ0;
198 if ( absZ0 == 0 ) goto overflow;
199 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
200 }
201 z = absZ0;
202 if ( zSign ) z = - z;
203 if ( z && ( ( z < 0 ) ^ zSign ) ) {
204 overflow:
205 float_raise( float_flag_invalid );
206 return
207 zSign ? (sbits64) LIT64( 0x8000000000000000 )
208 : LIT64( 0x7FFFFFFFFFFFFFFF );
209 }
210 if ( absZ1 ) float_set_inexact();
211 return z;
212
213 }
214
215 #ifdef __alpha__
216 /*
217 -------------------------------------------------------------------------------
218 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
219 `absZ1', with binary point between bits 63 and 64 (between the input words),
220 and returns the properly rounded 64-bit integer corresponding to the input.
221 If `zSign' is 1, the input is negated before being converted to an integer.
222 Ordinarily, the fixed-point input is simply rounded to an integer, with
223 the inexact exception raised if the input cannot be represented exactly as
224 an integer.
225 -------------------------------------------------------------------------------
226 */
roundAndPackInt64NoOverflow(flag zSign,bits64 absZ0,bits64 absZ1)227 static int64 roundAndPackInt64NoOverflow( flag zSign, bits64 absZ0,
228 bits64 absZ1 )
229 {
230 int8 roundingMode;
231 flag roundNearestEven, increment;
232 int64 z;
233
234 roundingMode = float_rounding_mode();
235 roundNearestEven = ( roundingMode == float_round_nearest_even );
236 increment = ( (sbits64) absZ1 < 0 );
237 if ( ! roundNearestEven ) {
238 if ( roundingMode == float_round_to_zero ) {
239 increment = 0;
240 }
241 else {
242 if ( zSign ) {
243 increment = ( roundingMode == float_round_down ) && absZ1;
244 }
245 else {
246 increment = ( roundingMode == float_round_up ) && absZ1;
247 }
248 }
249 }
250 if ( increment ) {
251 ++absZ0;
252 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
253 }
254 z = absZ0;
255 if ( zSign ) z = - z;
256 if ( absZ1 ) float_set_inexact();
257 return z;
258
259 }
260 #endif /* __alpha__ */
261 #endif
262
263 /*
264 -------------------------------------------------------------------------------
265 Returns the fraction bits of the single-precision floating-point value `a'.
266 -------------------------------------------------------------------------------
267 */
extractFloat32Frac(float32 a)268 INLINE bits32 extractFloat32Frac( float32 a )
269 {
270
271 return a & 0x007FFFFF;
272
273 }
274
275 /*
276 -------------------------------------------------------------------------------
277 Returns the exponent bits of the single-precision floating-point value `a'.
278 -------------------------------------------------------------------------------
279 */
extractFloat32Exp(float32 a)280 INLINE int16 extractFloat32Exp( float32 a )
281 {
282
283 return ( a>>23 ) & 0xFF;
284
285 }
286
287 /*
288 -------------------------------------------------------------------------------
289 Returns the sign bit of the single-precision floating-point value `a'.
290 -------------------------------------------------------------------------------
291 */
extractFloat32Sign(float32 a)292 INLINE flag extractFloat32Sign( float32 a )
293 {
294
295 return a>>31;
296
297 }
298
299 /*
300 -------------------------------------------------------------------------------
301 Normalizes the subnormal single-precision floating-point value represented
302 by the denormalized significand `aSig'. The normalized exponent and
303 significand are stored at the locations pointed to by `zExpPtr' and
304 `zSigPtr', respectively.
305 -------------------------------------------------------------------------------
306 */
307 static void
normalizeFloat32Subnormal(bits32 aSig,int16 * zExpPtr,bits32 * zSigPtr)308 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
309 {
310 int8 shiftCount;
311
312 shiftCount = countLeadingZeros32( aSig ) - 8;
313 *zSigPtr = aSig<<shiftCount;
314 *zExpPtr = 1 - shiftCount;
315
316 }
317
318 /*
319 -------------------------------------------------------------------------------
320 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
321 single-precision floating-point value, returning the result. After being
322 shifted into the proper positions, the three fields are simply added
323 together to form the result. This means that any integer portion of `zSig'
324 will be added into the exponent. Since a properly normalized significand
325 will have an integer portion equal to 1, the `zExp' input should be 1 less
326 than the desired result exponent whenever `zSig' is a complete, normalized
327 significand.
328 -------------------------------------------------------------------------------
329 */
packFloat32(flag zSign,int16 zExp,bits32 zSig)330 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
331 {
332
333 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
334
335 }
336
337 /*
338 -------------------------------------------------------------------------------
339 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
340 and significand `zSig', and returns the proper single-precision floating-
341 point value corresponding to the abstract input. Ordinarily, the abstract
342 value is simply rounded and packed into the single-precision format, with
343 the inexact exception raised if the abstract input cannot be represented
344 exactly. However, if the abstract value is too large, the overflow and
345 inexact exceptions are raised and an infinity or maximal finite value is
346 returned. If the abstract value is too small, the input value is rounded to
347 a subnormal number, and the underflow and inexact exceptions are raised if
348 the abstract input cannot be represented exactly as a subnormal single-
349 precision floating-point number.
350 The input significand `zSig' has its binary point between bits 30
351 and 29, which is 7 bits to the left of the usual location. This shifted
352 significand must be normalized or smaller. If `zSig' is not normalized,
353 `zExp' must be 0; in that case, the result returned is a subnormal number,
354 and it must not require rounding. In the usual case that `zSig' is
355 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
356 The handling of underflow and overflow follows the IEC/IEEE Standard for
357 Binary Floating-Point Arithmetic.
358 -------------------------------------------------------------------------------
359 */
roundAndPackFloat32(flag zSign,int16 zExp,bits32 zSig)360 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
361 {
362 int8 roundingMode;
363 flag roundNearestEven;
364 int8 roundIncrement, roundBits;
365 flag isTiny;
366
367 roundingMode = float_rounding_mode();
368 roundNearestEven = ( roundingMode == float_round_nearest_even );
369 roundIncrement = 0x40;
370 if ( ! roundNearestEven ) {
371 if ( roundingMode == float_round_to_zero ) {
372 roundIncrement = 0;
373 }
374 else {
375 roundIncrement = 0x7F;
376 if ( zSign ) {
377 if ( roundingMode == float_round_up ) roundIncrement = 0;
378 }
379 else {
380 if ( roundingMode == float_round_down ) roundIncrement = 0;
381 }
382 }
383 }
384 roundBits = zSig & 0x7F;
385 if ( 0xFD <= (bits16) zExp ) {
386 if ( ( 0xFD < zExp )
387 || ( ( zExp == 0xFD )
388 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
389 ) {
390 float_raise( float_flag_overflow | float_flag_inexact );
391 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
392 }
393 if ( zExp < 0 ) {
394 isTiny =
395 ( float_detect_tininess == float_tininess_before_rounding )
396 || ( zExp < -1 )
397 || ( zSig + roundIncrement < 0x80000000 );
398 shift32RightJamming( zSig, - zExp, &zSig );
399 zExp = 0;
400 roundBits = zSig & 0x7F;
401 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
402 }
403 }
404 if ( roundBits ) float_set_inexact();
405 zSig = ( zSig + roundIncrement )>>7;
406 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
407 if ( zSig == 0 ) zExp = 0;
408 return packFloat32( zSign, zExp, zSig );
409
410 }
411
412 /*
413 -------------------------------------------------------------------------------
414 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
415 and significand `zSig', and returns the proper single-precision floating-
416 point value corresponding to the abstract input. This routine is just like
417 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
418 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
419 floating-point exponent.
420 -------------------------------------------------------------------------------
421 */
422 float32
normalizeRoundAndPackFloat32(flag zSign,int16 zExp,bits32 zSig)423 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
424 {
425 int8 shiftCount;
426
427 shiftCount = countLeadingZeros32( zSig ) - 1;
428 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
429
430 }
431
432 /*
433 -------------------------------------------------------------------------------
434 Returns the fraction bits of the double-precision floating-point value `a'.
435 -------------------------------------------------------------------------------
436 */
extractFloat64Frac(float64 a)437 INLINE bits64 extractFloat64Frac( float64 a )
438 {
439
440 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
441
442 }
443
444 /*
445 -------------------------------------------------------------------------------
446 Returns the exponent bits of the double-precision floating-point value `a'.
447 -------------------------------------------------------------------------------
448 */
extractFloat64Exp(float64 a)449 INLINE int16 extractFloat64Exp( float64 a )
450 {
451
452 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
453
454 }
455
456 /*
457 -------------------------------------------------------------------------------
458 Returns the sign bit of the double-precision floating-point value `a'.
459 -------------------------------------------------------------------------------
460 */
extractFloat64Sign(float64 a)461 INLINE flag extractFloat64Sign( float64 a )
462 {
463
464 return FLOAT64_DEMANGLE(a)>>63;
465
466 }
467
468 /*
469 -------------------------------------------------------------------------------
470 Normalizes the subnormal double-precision floating-point value represented
471 by the denormalized significand `aSig'. The normalized exponent and
472 significand are stored at the locations pointed to by `zExpPtr' and
473 `zSigPtr', respectively.
474 -------------------------------------------------------------------------------
475 */
476 static void
normalizeFloat64Subnormal(bits64 aSig,int16 * zExpPtr,bits64 * zSigPtr)477 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
478 {
479 int8 shiftCount;
480
481 shiftCount = countLeadingZeros64( aSig ) - 11;
482 *zSigPtr = aSig<<shiftCount;
483 *zExpPtr = 1 - shiftCount;
484
485 }
486
487 /*
488 -------------------------------------------------------------------------------
489 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
490 double-precision floating-point value, returning the result. After being
491 shifted into the proper positions, the three fields are simply added
492 together to form the result. This means that any integer portion of `zSig'
493 will be added into the exponent. Since a properly normalized significand
494 will have an integer portion equal to 1, the `zExp' input should be 1 less
495 than the desired result exponent whenever `zSig' is a complete, normalized
496 significand.
497 -------------------------------------------------------------------------------
498 */
packFloat64(flag zSign,int16 zExp,bits64 zSig)499 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
500 {
501
502 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
503 ( ( (bits64) zExp )<<52 ) + zSig );
504
505 }
506
507 /*
508 -------------------------------------------------------------------------------
509 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
510 and significand `zSig', and returns the proper double-precision floating-
511 point value corresponding to the abstract input. Ordinarily, the abstract
512 value is simply rounded and packed into the double-precision format, with
513 the inexact exception raised if the abstract input cannot be represented
514 exactly. However, if the abstract value is too large, the overflow and
515 inexact exceptions are raised and an infinity or maximal finite value is
516 returned. If the abstract value is too small, the input value is rounded to
517 a subnormal number, and the underflow and inexact exceptions are raised if
518 the abstract input cannot be represented exactly as a subnormal double-
519 precision floating-point number.
520 The input significand `zSig' has its binary point between bits 62
521 and 61, which is 10 bits to the left of the usual location. This shifted
522 significand must be normalized or smaller. If `zSig' is not normalized,
523 `zExp' must be 0; in that case, the result returned is a subnormal number,
524 and it must not require rounding. In the usual case that `zSig' is
525 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
526 The handling of underflow and overflow follows the IEC/IEEE Standard for
527 Binary Floating-Point Arithmetic.
528 -------------------------------------------------------------------------------
529 */
roundAndPackFloat64(flag zSign,int16 zExp,bits64 zSig)530 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
531 {
532 int8 roundingMode;
533 flag roundNearestEven;
534 int16 roundIncrement, roundBits;
535 flag isTiny;
536
537 roundingMode = float_rounding_mode();
538 roundNearestEven = ( roundingMode == float_round_nearest_even );
539 roundIncrement = 0x200;
540 if ( ! roundNearestEven ) {
541 if ( roundingMode == float_round_to_zero ) {
542 roundIncrement = 0;
543 }
544 else {
545 roundIncrement = 0x3FF;
546 if ( zSign ) {
547 if ( roundingMode == float_round_up ) roundIncrement = 0;
548 }
549 else {
550 if ( roundingMode == float_round_down ) roundIncrement = 0;
551 }
552 }
553 }
554 roundBits = zSig & 0x3FF;
555 if ( 0x7FD <= (bits16) zExp ) {
556 if ( ( 0x7FD < zExp )
557 || ( ( zExp == 0x7FD )
558 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
559 ) {
560 float_raise( float_flag_overflow | float_flag_inexact );
561 return FLOAT64_MANGLE(
562 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
563 ( roundIncrement == 0 ));
564 }
565 if ( zExp < 0 ) {
566 isTiny =
567 ( float_detect_tininess == float_tininess_before_rounding )
568 || ( zExp < -1 )
569 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
570 shift64RightJamming( zSig, - zExp, &zSig );
571 zExp = 0;
572 roundBits = zSig & 0x3FF;
573 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
574 }
575 }
576 if ( roundBits ) float_set_inexact();
577 zSig = ( zSig + roundIncrement )>>10;
578 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
579 if ( zSig == 0 ) zExp = 0;
580 return packFloat64( zSign, zExp, zSig );
581
582 }
583
584 /*
585 -------------------------------------------------------------------------------
586 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
587 and significand `zSig', and returns the proper double-precision floating-
588 point value corresponding to the abstract input. This routine is just like
589 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
590 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
591 floating-point exponent.
592 -------------------------------------------------------------------------------
593 */
594 float64
normalizeRoundAndPackFloat64(flag zSign,int16 zExp,bits64 zSig)595 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
596 {
597 int8 shiftCount;
598
599 shiftCount = countLeadingZeros64( zSig ) - 1;
600 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
601
602 }
603
604 #ifdef FLOATX80
605
606 /*
607 -------------------------------------------------------------------------------
608 Returns the fraction bits of the extended double-precision floating-point
609 value `a'.
610 -------------------------------------------------------------------------------
611 */
extractFloatx80Frac(floatx80 a)612 INLINE bits64 extractFloatx80Frac( floatx80 a )
613 {
614
615 return a.low;
616
617 }
618
619 /*
620 -------------------------------------------------------------------------------
621 Returns the exponent bits of the extended double-precision floating-point
622 value `a'.
623 -------------------------------------------------------------------------------
624 */
extractFloatx80Exp(floatx80 a)625 INLINE int32 extractFloatx80Exp( floatx80 a )
626 {
627
628 return a.high & 0x7FFF;
629
630 }
631
632 /*
633 -------------------------------------------------------------------------------
634 Returns the sign bit of the extended double-precision floating-point value
635 `a'.
636 -------------------------------------------------------------------------------
637 */
extractFloatx80Sign(floatx80 a)638 INLINE flag extractFloatx80Sign( floatx80 a )
639 {
640
641 return a.high>>15;
642
643 }
644
645 /*
646 -------------------------------------------------------------------------------
647 Normalizes the subnormal extended double-precision floating-point value
648 represented by the denormalized significand `aSig'. The normalized exponent
649 and significand are stored at the locations pointed to by `zExpPtr' and
650 `zSigPtr', respectively.
651 -------------------------------------------------------------------------------
652 */
653 static void
normalizeFloatx80Subnormal(bits64 aSig,int32 * zExpPtr,bits64 * zSigPtr)654 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
655 {
656 int8 shiftCount;
657
658 shiftCount = countLeadingZeros64( aSig );
659 *zSigPtr = aSig<<shiftCount;
660 *zExpPtr = 1 - shiftCount;
661
662 }
663
664 /*
665 -------------------------------------------------------------------------------
666 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
667 extended double-precision floating-point value, returning the result.
668 -------------------------------------------------------------------------------
669 */
packFloatx80(flag zSign,int32 zExp,bits64 zSig)670 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
671 {
672 floatx80 z;
673
674 z.low = zSig;
675 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
676 return z;
677
678 }
679
680 /*
681 -------------------------------------------------------------------------------
682 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
683 and extended significand formed by the concatenation of `zSig0' and `zSig1',
684 and returns the proper extended double-precision floating-point value
685 corresponding to the abstract input. Ordinarily, the abstract value is
686 rounded and packed into the extended double-precision format, with the
687 inexact exception raised if the abstract input cannot be represented
688 exactly. However, if the abstract value is too large, the overflow and
689 inexact exceptions are raised and an infinity or maximal finite value is
690 returned. If the abstract value is too small, the input value is rounded to
691 a subnormal number, and the underflow and inexact exceptions are raised if
692 the abstract input cannot be represented exactly as a subnormal extended
693 double-precision floating-point number.
694 If `roundingPrecision' is 32 or 64, the result is rounded to the same
695 number of bits as single or double precision, respectively. Otherwise, the
696 result is rounded to the full precision of the extended double-precision
697 format.
698 The input significand must be normalized or smaller. If the input
699 significand is not normalized, `zExp' must be 0; in that case, the result
700 returned is a subnormal number, and it must not require rounding. The
701 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
702 Floating-Point Arithmetic.
703 -------------------------------------------------------------------------------
704 */
705 static floatx80
roundAndPackFloatx80(int8 roundingPrecision,flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)706 roundAndPackFloatx80(
707 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
708 )
709 {
710 int8 roundingMode;
711 flag roundNearestEven, increment, isTiny;
712 int64 roundIncrement, roundMask, roundBits;
713
714 roundingMode = float_rounding_mode();
715 roundNearestEven = ( roundingMode == float_round_nearest_even );
716 if ( roundingPrecision == 80 ) goto precision80;
717 if ( roundingPrecision == 64 ) {
718 roundIncrement = LIT64( 0x0000000000000400 );
719 roundMask = LIT64( 0x00000000000007FF );
720 }
721 else if ( roundingPrecision == 32 ) {
722 roundIncrement = LIT64( 0x0000008000000000 );
723 roundMask = LIT64( 0x000000FFFFFFFFFF );
724 }
725 else {
726 goto precision80;
727 }
728 zSig0 |= ( zSig1 != 0 );
729 if ( ! roundNearestEven ) {
730 if ( roundingMode == float_round_to_zero ) {
731 roundIncrement = 0;
732 }
733 else {
734 roundIncrement = roundMask;
735 if ( zSign ) {
736 if ( roundingMode == float_round_up ) roundIncrement = 0;
737 }
738 else {
739 if ( roundingMode == float_round_down ) roundIncrement = 0;
740 }
741 }
742 }
743 roundBits = zSig0 & roundMask;
744 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
745 if ( ( 0x7FFE < zExp )
746 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
747 ) {
748 goto overflow;
749 }
750 if ( zExp <= 0 ) {
751 isTiny =
752 ( float_detect_tininess == float_tininess_before_rounding )
753 || ( zExp < 0 )
754 || ( zSig0 <= zSig0 + roundIncrement );
755 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
756 zExp = 0;
757 roundBits = zSig0 & roundMask;
758 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
759 if ( roundBits ) float_set_inexact();
760 zSig0 += roundIncrement;
761 if ( (sbits64) zSig0 < 0 ) zExp = 1;
762 roundIncrement = roundMask + 1;
763 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
764 roundMask |= roundIncrement;
765 }
766 zSig0 &= ~ roundMask;
767 return packFloatx80( zSign, zExp, zSig0 );
768 }
769 }
770 if ( roundBits ) float_set_inexact();
771 zSig0 += roundIncrement;
772 if ( zSig0 < roundIncrement ) {
773 ++zExp;
774 zSig0 = LIT64( 0x8000000000000000 );
775 }
776 roundIncrement = roundMask + 1;
777 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
778 roundMask |= roundIncrement;
779 }
780 zSig0 &= ~ roundMask;
781 if ( zSig0 == 0 ) zExp = 0;
782 return packFloatx80( zSign, zExp, zSig0 );
783 precision80:
784 increment = ( (sbits64) zSig1 < 0 );
785 if ( ! roundNearestEven ) {
786 if ( roundingMode == float_round_to_zero ) {
787 increment = 0;
788 }
789 else {
790 if ( zSign ) {
791 increment = ( roundingMode == float_round_down ) && zSig1;
792 }
793 else {
794 increment = ( roundingMode == float_round_up ) && zSig1;
795 }
796 }
797 }
798 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
799 if ( ( 0x7FFE < zExp )
800 || ( ( zExp == 0x7FFE )
801 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
802 && increment
803 )
804 ) {
805 roundMask = 0;
806 overflow:
807 float_raise( float_flag_overflow | float_flag_inexact );
808 if ( ( roundingMode == float_round_to_zero )
809 || ( zSign && ( roundingMode == float_round_up ) )
810 || ( ! zSign && ( roundingMode == float_round_down ) )
811 ) {
812 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
813 }
814 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
815 }
816 if ( zExp <= 0 ) {
817 isTiny =
818 ( float_detect_tininess == float_tininess_before_rounding )
819 || ( zExp < 0 )
820 || ! increment
821 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
822 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
823 zExp = 0;
824 if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
825 if ( zSig1 ) float_set_inexact();
826 if ( roundNearestEven ) {
827 increment = ( (sbits64) zSig1 < 0 );
828 }
829 else {
830 if ( zSign ) {
831 increment = ( roundingMode == float_round_down ) && zSig1;
832 }
833 else {
834 increment = ( roundingMode == float_round_up ) && zSig1;
835 }
836 }
837 if ( increment ) {
838 ++zSig0;
839 zSig0 &=
840 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
841 if ( (sbits64) zSig0 < 0 ) zExp = 1;
842 }
843 return packFloatx80( zSign, zExp, zSig0 );
844 }
845 }
846 if ( zSig1 ) float_set_inexact();
847 if ( increment ) {
848 ++zSig0;
849 if ( zSig0 == 0 ) {
850 ++zExp;
851 zSig0 = LIT64( 0x8000000000000000 );
852 }
853 else {
854 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
855 }
856 }
857 else {
858 if ( zSig0 == 0 ) zExp = 0;
859 }
860 return packFloatx80( zSign, zExp, zSig0 );
861
862 }
863
864 /*
865 -------------------------------------------------------------------------------
866 Takes an abstract floating-point value having sign `zSign', exponent
867 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
868 and returns the proper extended double-precision floating-point value
869 corresponding to the abstract input. This routine is just like
870 `roundAndPackFloatx80' except that the input significand does not have to be
871 normalized.
872 -------------------------------------------------------------------------------
873 */
874 static floatx80
normalizeRoundAndPackFloatx80(int8 roundingPrecision,flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)875 normalizeRoundAndPackFloatx80(
876 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
877 )
878 {
879 int8 shiftCount;
880
881 if ( zSig0 == 0 ) {
882 zSig0 = zSig1;
883 zSig1 = 0;
884 zExp -= 64;
885 }
886 shiftCount = countLeadingZeros64( zSig0 );
887 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
888 zExp -= shiftCount;
889 return
890 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
891
892 }
893
894 #endif
895
896 #ifdef FLOAT128
897
898 /*
899 -------------------------------------------------------------------------------
900 Returns the least-significant 64 fraction bits of the quadruple-precision
901 floating-point value `a'.
902 -------------------------------------------------------------------------------
903 */
extractFloat128Frac1(float128 a)904 INLINE bits64 extractFloat128Frac1( float128 a )
905 {
906
907 return a.low;
908
909 }
910
911 /*
912 -------------------------------------------------------------------------------
913 Returns the most-significant 48 fraction bits of the quadruple-precision
914 floating-point value `a'.
915 -------------------------------------------------------------------------------
916 */
extractFloat128Frac0(float128 a)917 INLINE bits64 extractFloat128Frac0( float128 a )
918 {
919
920 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
921
922 }
923
924 /*
925 -------------------------------------------------------------------------------
926 Returns the exponent bits of the quadruple-precision floating-point value
927 `a'.
928 -------------------------------------------------------------------------------
929 */
extractFloat128Exp(float128 a)930 INLINE int32 extractFloat128Exp( float128 a )
931 {
932
933 return ( a.high>>48 ) & 0x7FFF;
934
935 }
936
937 /*
938 -------------------------------------------------------------------------------
939 Returns the sign bit of the quadruple-precision floating-point value `a'.
940 -------------------------------------------------------------------------------
941 */
extractFloat128Sign(float128 a)942 INLINE flag extractFloat128Sign( float128 a )
943 {
944
945 return a.high>>63;
946
947 }
948
949 /*
950 -------------------------------------------------------------------------------
951 Normalizes the subnormal quadruple-precision floating-point value
952 represented by the denormalized significand formed by the concatenation of
953 `aSig0' and `aSig1'. The normalized exponent is stored at the location
954 pointed to by `zExpPtr'. The most significant 49 bits of the normalized
955 significand are stored at the location pointed to by `zSig0Ptr', and the
956 least significant 64 bits of the normalized significand are stored at the
957 location pointed to by `zSig1Ptr'.
958 -------------------------------------------------------------------------------
959 */
960 static void
normalizeFloat128Subnormal(bits64 aSig0,bits64 aSig1,int32 * zExpPtr,bits64 * zSig0Ptr,bits64 * zSig1Ptr)961 normalizeFloat128Subnormal(
962 bits64 aSig0,
963 bits64 aSig1,
964 int32 *zExpPtr,
965 bits64 *zSig0Ptr,
966 bits64 *zSig1Ptr
967 )
968 {
969 int8 shiftCount;
970
971 if ( aSig0 == 0 ) {
972 shiftCount = countLeadingZeros64( aSig1 ) - 15;
973 if ( shiftCount < 0 ) {
974 *zSig0Ptr = aSig1>>( - shiftCount );
975 *zSig1Ptr = aSig1<<( shiftCount & 63 );
976 }
977 else {
978 *zSig0Ptr = aSig1<<shiftCount;
979 *zSig1Ptr = 0;
980 }
981 *zExpPtr = - shiftCount - 63;
982 }
983 else {
984 shiftCount = countLeadingZeros64( aSig0 ) - 15;
985 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
986 *zExpPtr = 1 - shiftCount;
987 }
988
989 }
990
991 /*
992 -------------------------------------------------------------------------------
993 Packs the sign `zSign', the exponent `zExp', and the significand formed
994 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
995 floating-point value, returning the result. After being shifted into the
996 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
997 added together to form the most significant 32 bits of the result. This
998 means that any integer portion of `zSig0' will be added into the exponent.
999 Since a properly normalized significand will have an integer portion equal
1000 to 1, the `zExp' input should be 1 less than the desired result exponent
1001 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1002 significand.
1003 -------------------------------------------------------------------------------
1004 */
1005 INLINE float128
packFloat128(flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)1006 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1007 {
1008 float128 z;
1009
1010 z.low = zSig1;
1011 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
1012 return z;
1013
1014 }
1015
1016 /*
1017 -------------------------------------------------------------------------------
1018 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1019 and extended significand formed by the concatenation of `zSig0', `zSig1',
1020 and `zSig2', and returns the proper quadruple-precision floating-point value
1021 corresponding to the abstract input. Ordinarily, the abstract value is
1022 simply rounded and packed into the quadruple-precision format, with the
1023 inexact exception raised if the abstract input cannot be represented
1024 exactly. However, if the abstract value is too large, the overflow and
1025 inexact exceptions are raised and an infinity or maximal finite value is
1026 returned. If the abstract value is too small, the input value is rounded to
1027 a subnormal number, and the underflow and inexact exceptions are raised if
1028 the abstract input cannot be represented exactly as a subnormal quadruple-
1029 precision floating-point number.
1030 The input significand must be normalized or smaller. If the input
1031 significand is not normalized, `zExp' must be 0; in that case, the result
1032 returned is a subnormal number, and it must not require rounding. In the
1033 usual case that the input significand is normalized, `zExp' must be 1 less
1034 than the ``true'' floating-point exponent. The handling of underflow and
1035 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1036 -------------------------------------------------------------------------------
1037 */
1038 static float128
roundAndPackFloat128(flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1,bits64 zSig2)1039 roundAndPackFloat128(
1040 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
1041 {
1042 int8 roundingMode;
1043 flag roundNearestEven, increment, isTiny;
1044
1045 roundingMode = float_rounding_mode();
1046 roundNearestEven = ( roundingMode == float_round_nearest_even );
1047 increment = ( (sbits64) zSig2 < 0 );
1048 if ( ! roundNearestEven ) {
1049 if ( roundingMode == float_round_to_zero ) {
1050 increment = 0;
1051 }
1052 else {
1053 if ( zSign ) {
1054 increment = ( roundingMode == float_round_down ) && zSig2;
1055 }
1056 else {
1057 increment = ( roundingMode == float_round_up ) && zSig2;
1058 }
1059 }
1060 }
1061 if ( 0x7FFD <= (bits32) zExp ) {
1062 if ( ( 0x7FFD < zExp )
1063 || ( ( zExp == 0x7FFD )
1064 && eq128(
1065 LIT64( 0x0001FFFFFFFFFFFF ),
1066 LIT64( 0xFFFFFFFFFFFFFFFF ),
1067 zSig0,
1068 zSig1
1069 )
1070 && increment
1071 )
1072 ) {
1073 float_raise( float_flag_overflow | float_flag_inexact );
1074 if ( ( roundingMode == float_round_to_zero )
1075 || ( zSign && ( roundingMode == float_round_up ) )
1076 || ( ! zSign && ( roundingMode == float_round_down ) )
1077 ) {
1078 return
1079 packFloat128(
1080 zSign,
1081 0x7FFE,
1082 LIT64( 0x0000FFFFFFFFFFFF ),
1083 LIT64( 0xFFFFFFFFFFFFFFFF )
1084 );
1085 }
1086 return packFloat128( zSign, 0x7FFF, 0, 0 );
1087 }
1088 if ( zExp < 0 ) {
1089 isTiny =
1090 ( float_detect_tininess == float_tininess_before_rounding )
1091 || ( zExp < -1 )
1092 || ! increment
1093 || lt128(
1094 zSig0,
1095 zSig1,
1096 LIT64( 0x0001FFFFFFFFFFFF ),
1097 LIT64( 0xFFFFFFFFFFFFFFFF )
1098 );
1099 shift128ExtraRightJamming(
1100 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1101 zExp = 0;
1102 if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1103 if ( roundNearestEven ) {
1104 increment = ( (sbits64) zSig2 < 0 );
1105 }
1106 else {
1107 if ( zSign ) {
1108 increment = ( roundingMode == float_round_down ) && zSig2;
1109 }
1110 else {
1111 increment = ( roundingMode == float_round_up ) && zSig2;
1112 }
1113 }
1114 }
1115 }
1116 if ( zSig2 ) float_set_inexact();
1117 if ( increment ) {
1118 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1119 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1120 }
1121 else {
1122 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1123 }
1124 return packFloat128( zSign, zExp, zSig0, zSig1 );
1125
1126 }
1127
1128 /*
1129 -------------------------------------------------------------------------------
1130 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1131 and significand formed by the concatenation of `zSig0' and `zSig1', and
1132 returns the proper quadruple-precision floating-point value corresponding
1133 to the abstract input. This routine is just like `roundAndPackFloat128'
1134 except that the input significand has fewer bits and does not have to be
1135 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1136 point exponent.
1137 -------------------------------------------------------------------------------
1138 */
1139 static float128
normalizeRoundAndPackFloat128(flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)1140 normalizeRoundAndPackFloat128(
1141 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1142 {
1143 int8 shiftCount;
1144 bits64 zSig2;
1145
1146 if ( zSig0 == 0 ) {
1147 zSig0 = zSig1;
1148 zSig1 = 0;
1149 zExp -= 64;
1150 }
1151 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1152 if ( 0 <= shiftCount ) {
1153 zSig2 = 0;
1154 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1155 }
1156 else {
1157 shift128ExtraRightJamming(
1158 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1159 }
1160 zExp -= shiftCount;
1161 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1162
1163 }
1164
1165 #endif
1166
1167 /*
1168 -------------------------------------------------------------------------------
1169 Returns the result of converting the 32-bit two's complement integer `a'
1170 to the single-precision floating-point format. The conversion is performed
1171 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1172 -------------------------------------------------------------------------------
1173 */
int32_to_float32(int32 a)1174 float32 int32_to_float32( int32 a )
1175 {
1176 flag zSign;
1177
1178 if ( a == 0 ) return 0;
1179 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1180 zSign = ( a < 0 );
1181 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1182
1183 }
1184
1185 /*
1186 -------------------------------------------------------------------------------
1187 Returns the result of converting the 32-bit two's complement integer `a'
1188 to the double-precision floating-point format. The conversion is performed
1189 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1190 -------------------------------------------------------------------------------
1191 */
int32_to_float64(int32 a)1192 float64 int32_to_float64( int32 a )
1193 {
1194 flag zSign;
1195 uint32 absA;
1196 int8 shiftCount;
1197 bits64 zSig;
1198
1199 if ( a == 0 ) return 0;
1200 zSign = ( a < 0 );
1201 absA = zSign ? - a : a;
1202 shiftCount = countLeadingZeros32( absA ) + 21;
1203 zSig = absA;
1204 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1205
1206 }
1207
1208 #ifdef FLOATX80
1209
1210 /*
1211 -------------------------------------------------------------------------------
1212 Returns the result of converting the 32-bit two's complement integer `a'
1213 to the extended double-precision floating-point format. The conversion
1214 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1215 Arithmetic.
1216 -------------------------------------------------------------------------------
1217 */
int32_to_floatx80(int32 a)1218 floatx80 int32_to_floatx80( int32 a )
1219 {
1220 flag zSign;
1221 uint32 absA;
1222 int8 shiftCount;
1223 bits64 zSig;
1224
1225 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1226 zSign = ( a < 0 );
1227 absA = zSign ? - a : a;
1228 shiftCount = countLeadingZeros32( absA ) + 32;
1229 zSig = absA;
1230 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1231
1232 }
1233
1234 #endif
1235
1236 #ifdef FLOAT128
1237
1238 /*
1239 -------------------------------------------------------------------------------
1240 Returns the result of converting the 32-bit two's complement integer `a' to
1241 the quadruple-precision floating-point format. The conversion is performed
1242 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1243 -------------------------------------------------------------------------------
1244 */
int32_to_float128(int32 a)1245 float128 int32_to_float128( int32 a )
1246 {
1247 flag zSign;
1248 uint32 absA;
1249 int8 shiftCount;
1250 bits64 zSig0;
1251
1252 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1253 zSign = ( a < 0 );
1254 absA = zSign ? - a : a;
1255 shiftCount = countLeadingZeros32( absA ) + 17;
1256 zSig0 = absA;
1257 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1258
1259 }
1260
1261 #endif
1262
1263 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1264 /*
1265 -------------------------------------------------------------------------------
1266 Returns the result of converting the 64-bit two's complement integer `a'
1267 to the single-precision floating-point format. The conversion is performed
1268 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1269 -------------------------------------------------------------------------------
1270 */
int64_to_float32(int64 a)1271 float32 int64_to_float32( int64 a )
1272 {
1273 flag zSign;
1274 uint64 absA;
1275 int8 shiftCount;
1276
1277 if ( a == 0 ) return 0;
1278 zSign = ( a < 0 );
1279 absA = zSign ? - a : a;
1280 shiftCount = countLeadingZeros64( absA ) - 40;
1281 if ( 0 <= shiftCount ) {
1282 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1283 }
1284 else {
1285 shiftCount += 7;
1286 if ( shiftCount < 0 ) {
1287 shift64RightJamming( absA, - shiftCount, &absA );
1288 }
1289 else {
1290 absA <<= shiftCount;
1291 }
1292 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1293 }
1294
1295 }
1296
1297 /*
1298 -------------------------------------------------------------------------------
1299 Returns the result of converting the 64-bit two's complement integer `a'
1300 to the double-precision floating-point format. The conversion is performed
1301 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1302 -------------------------------------------------------------------------------
1303 */
int64_to_float64(int64 a)1304 float64 int64_to_float64( int64 a )
1305 {
1306 flag zSign;
1307
1308 if ( a == 0 ) return 0;
1309 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1310 return packFloat64( 1, 0x43E, 0 );
1311 }
1312 zSign = ( a < 0 );
1313 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1314
1315 }
1316
1317 #ifdef FLOATX80
1318
1319 /*
1320 -------------------------------------------------------------------------------
1321 Returns the result of converting the 64-bit two's complement integer `a'
1322 to the extended double-precision floating-point format. The conversion
1323 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1324 Arithmetic.
1325 -------------------------------------------------------------------------------
1326 */
int64_to_floatx80(int64 a)1327 floatx80 int64_to_floatx80( int64 a )
1328 {
1329 flag zSign;
1330 uint64 absA;
1331 int8 shiftCount;
1332
1333 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1334 zSign = ( a < 0 );
1335 absA = zSign ? - a : a;
1336 shiftCount = countLeadingZeros64( absA );
1337 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1338
1339 }
1340
1341 #endif
1342
1343 #ifdef FLOAT128
1344
1345 /*
1346 -------------------------------------------------------------------------------
1347 Returns the result of converting the 64-bit two's complement integer `a' to
1348 the quadruple-precision floating-point format. The conversion is performed
1349 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1350 -------------------------------------------------------------------------------
1351 */
int64_to_float128(int64 a)1352 float128 int64_to_float128( int64 a )
1353 {
1354 flag zSign;
1355 uint64 absA;
1356 int8 shiftCount;
1357 int32 zExp;
1358 bits64 zSig0, zSig1;
1359
1360 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1361 zSign = ( a < 0 );
1362 absA = zSign ? - a : a;
1363 shiftCount = countLeadingZeros64( absA ) + 49;
1364 zExp = 0x406E - shiftCount;
1365 if ( 64 <= shiftCount ) {
1366 zSig1 = 0;
1367 zSig0 = absA;
1368 shiftCount -= 64;
1369 }
1370 else {
1371 zSig1 = absA;
1372 zSig0 = 0;
1373 }
1374 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1375 return packFloat128( zSign, zExp, zSig0, zSig1 );
1376
1377 }
1378
1379 #endif
1380 #endif /* !SOFTFLOAT_FOR_GCC */
1381
1382 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1383 /*
1384 -------------------------------------------------------------------------------
1385 Returns the result of converting the single-precision floating-point value
1386 `a' to the 32-bit two's complement integer format. The conversion is
1387 performed according to the IEC/IEEE Standard for Binary Floating-Point
1388 Arithmetic---which means in particular that the conversion is rounded
1389 according to the current rounding mode. If `a' is a NaN, the largest
1390 positive integer is returned. Otherwise, if the conversion overflows, the
1391 largest integer with the same sign as `a' is returned.
1392 -------------------------------------------------------------------------------
1393 */
float32_to_int32(float32 a)1394 int32 float32_to_int32( float32 a )
1395 {
1396 flag aSign;
1397 int16 aExp, shiftCount;
1398 bits32 aSig;
1399 bits64 aSig64;
1400
1401 aSig = extractFloat32Frac( a );
1402 aExp = extractFloat32Exp( a );
1403 aSign = extractFloat32Sign( a );
1404 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1405 if ( aExp ) aSig |= 0x00800000;
1406 shiftCount = 0xAF - aExp;
1407 aSig64 = aSig;
1408 aSig64 <<= 32;
1409 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1410 return roundAndPackInt32( aSign, aSig64 );
1411
1412 }
1413 #endif /* !SOFTFLOAT_FOR_GCC */
1414
1415 /*
1416 -------------------------------------------------------------------------------
1417 Returns the result of converting the single-precision floating-point value
1418 `a' to the 32-bit two's complement integer format. The conversion is
1419 performed according to the IEC/IEEE Standard for Binary Floating-Point
1420 Arithmetic, except that the conversion is always rounded toward zero.
1421 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1422 the conversion overflows, the largest integer with the same sign as `a' is
1423 returned.
1424 -------------------------------------------------------------------------------
1425 */
float32_to_int32_round_to_zero(float32 a)1426 int32 float32_to_int32_round_to_zero( float32 a )
1427 {
1428 flag aSign;
1429 int16 aExp, shiftCount;
1430 bits32 aSig;
1431 int32 z;
1432
1433 aSig = extractFloat32Frac( a );
1434 aExp = extractFloat32Exp( a );
1435 aSign = extractFloat32Sign( a );
1436 shiftCount = aExp - 0x9E;
1437 if ( 0 <= shiftCount ) {
1438 if ( a != 0xCF000000 ) {
1439 float_raise( float_flag_invalid );
1440 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1441 }
1442 return (sbits32) 0x80000000;
1443 }
1444 else if ( aExp <= 0x7E ) {
1445 if ( aExp | aSig ) float_set_inexact();
1446 return 0;
1447 }
1448 aSig = ( aSig | 0x00800000 )<<8;
1449 z = aSig>>( - shiftCount );
1450 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1451 float_set_inexact();
1452 }
1453 if ( aSign ) z = - z;
1454 return z;
1455
1456 }
1457
1458 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1459 /*
1460 -------------------------------------------------------------------------------
1461 Returns the result of converting the single-precision floating-point value
1462 `a' to the 64-bit two's complement integer format. The conversion is
1463 performed according to the IEC/IEEE Standard for Binary Floating-Point
1464 Arithmetic---which means in particular that the conversion is rounded
1465 according to the current rounding mode. If `a' is a NaN, the largest
1466 positive integer is returned. Otherwise, if the conversion overflows, the
1467 largest integer with the same sign as `a' is returned.
1468 -------------------------------------------------------------------------------
1469 */
float32_to_int64(float32 a)1470 int64 float32_to_int64( float32 a )
1471 {
1472 flag aSign;
1473 int16 aExp, shiftCount;
1474 bits32 aSig;
1475 bits64 aSig64, aSigExtra;
1476
1477 aSig = extractFloat32Frac( a );
1478 aExp = extractFloat32Exp( a );
1479 aSign = extractFloat32Sign( a );
1480 shiftCount = 0xBE - aExp;
1481 if ( shiftCount < 0 ) {
1482 float_raise( float_flag_invalid );
1483 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1484 return LIT64( 0x7FFFFFFFFFFFFFFF );
1485 }
1486 return (sbits64) LIT64( 0x8000000000000000 );
1487 }
1488 if ( aExp ) aSig |= 0x00800000;
1489 aSig64 = aSig;
1490 aSig64 <<= 40;
1491 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1492 return roundAndPackInt64( aSign, aSig64, aSigExtra );
1493
1494 }
1495
1496 /*
1497 -------------------------------------------------------------------------------
1498 Returns the result of converting the single-precision floating-point value
1499 `a' to the 64-bit two's complement integer format. The conversion is
1500 performed according to the IEC/IEEE Standard for Binary Floating-Point
1501 Arithmetic, except that the conversion is always rounded toward zero. If
1502 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1503 conversion overflows, the largest integer with the same sign as `a' is
1504 returned.
1505 -------------------------------------------------------------------------------
1506 */
float32_to_int64_round_to_zero(float32 a)1507 int64 float32_to_int64_round_to_zero( float32 a )
1508 {
1509 flag aSign;
1510 int16 aExp, shiftCount;
1511 bits32 aSig;
1512 bits64 aSig64;
1513 int64 z;
1514
1515 aSig = extractFloat32Frac( a );
1516 aExp = extractFloat32Exp( a );
1517 aSign = extractFloat32Sign( a );
1518 shiftCount = aExp - 0xBE;
1519 if ( 0 <= shiftCount ) {
1520 if ( a != 0xDF000000 ) {
1521 float_raise( float_flag_invalid );
1522 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1523 return LIT64( 0x7FFFFFFFFFFFFFFF );
1524 }
1525 }
1526 return (sbits64) LIT64( 0x8000000000000000 );
1527 }
1528 else if ( aExp <= 0x7E ) {
1529 if ( aExp | aSig ) float_set_inexact();
1530 return 0;
1531 }
1532 aSig64 = aSig | 0x00800000;
1533 aSig64 <<= 40;
1534 z = aSig64>>( - shiftCount );
1535 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1536 float_set_inexact();
1537 }
1538 if ( aSign ) z = - z;
1539 return z;
1540
1541 }
1542 #endif /* !SOFTFLOAT_FOR_GCC */
1543
1544 /*
1545 -------------------------------------------------------------------------------
1546 Returns the result of converting the single-precision floating-point value
1547 `a' to the double-precision floating-point format. The conversion is
1548 performed according to the IEC/IEEE Standard for Binary Floating-Point
1549 Arithmetic.
1550 -------------------------------------------------------------------------------
1551 */
float32_to_float64(float32 a)1552 float64 float32_to_float64( float32 a )
1553 {
1554 flag aSign;
1555 int16 aExp;
1556 bits32 aSig;
1557
1558 aSig = extractFloat32Frac( a );
1559 aExp = extractFloat32Exp( a );
1560 aSign = extractFloat32Sign( a );
1561 if ( aExp == 0xFF ) {
1562 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1563 return packFloat64( aSign, 0x7FF, 0 );
1564 }
1565 if ( aExp == 0 ) {
1566 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1567 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1568 --aExp;
1569 }
1570 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1571
1572 }
1573
1574 #ifdef FLOATX80
1575
1576 /*
1577 -------------------------------------------------------------------------------
1578 Returns the result of converting the single-precision floating-point value
1579 `a' to the extended double-precision floating-point format. The conversion
1580 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1581 Arithmetic.
1582 -------------------------------------------------------------------------------
1583 */
float32_to_floatx80(float32 a)1584 floatx80 float32_to_floatx80( float32 a )
1585 {
1586 flag aSign;
1587 int16 aExp;
1588 bits32 aSig;
1589
1590 aSig = extractFloat32Frac( a );
1591 aExp = extractFloat32Exp( a );
1592 aSign = extractFloat32Sign( a );
1593 if ( aExp == 0xFF ) {
1594 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1595 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1596 }
1597 if ( aExp == 0 ) {
1598 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1599 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1600 }
1601 aSig |= 0x00800000;
1602 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1603
1604 }
1605
1606 #endif
1607
1608 #ifdef FLOAT128
1609
1610 /*
1611 -------------------------------------------------------------------------------
1612 Returns the result of converting the single-precision floating-point value
1613 `a' to the double-precision floating-point format. The conversion is
1614 performed according to the IEC/IEEE Standard for Binary Floating-Point
1615 Arithmetic.
1616 -------------------------------------------------------------------------------
1617 */
float32_to_float128(float32 a)1618 float128 float32_to_float128( float32 a )
1619 {
1620 flag aSign;
1621 int16 aExp;
1622 bits32 aSig;
1623
1624 aSig = extractFloat32Frac( a );
1625 aExp = extractFloat32Exp( a );
1626 aSign = extractFloat32Sign( a );
1627 if ( aExp == 0xFF ) {
1628 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1629 return packFloat128( aSign, 0x7FFF, 0, 0 );
1630 }
1631 if ( aExp == 0 ) {
1632 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1633 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1634 --aExp;
1635 }
1636 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1637
1638 }
1639
1640 #endif
1641
1642 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1643 /*
1644 -------------------------------------------------------------------------------
1645 Rounds the single-precision floating-point value `a' to an integer, and
1646 returns the result as a single-precision floating-point value. The
1647 operation is performed according to the IEC/IEEE Standard for Binary
1648 Floating-Point Arithmetic.
1649 -------------------------------------------------------------------------------
1650 */
float32_round_to_int(float32 a)1651 float32 float32_round_to_int( float32 a )
1652 {
1653 flag aSign;
1654 int16 aExp;
1655 bits32 lastBitMask, roundBitsMask;
1656 int8 roundingMode;
1657 float32 z;
1658
1659 aExp = extractFloat32Exp( a );
1660 if ( 0x96 <= aExp ) {
1661 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1662 return propagateFloat32NaN( a, a );
1663 }
1664 return a;
1665 }
1666 if ( aExp <= 0x7E ) {
1667 if ( (bits32) ( a<<1 ) == 0 ) return a;
1668 float_set_inexact();
1669 aSign = extractFloat32Sign( a );
1670 switch ( float_rounding_mode() ) {
1671 case float_round_nearest_even:
1672 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1673 return packFloat32( aSign, 0x7F, 0 );
1674 }
1675 break;
1676 case float_round_down:
1677 return aSign ? 0xBF800000 : 0;
1678 case float_round_up:
1679 return aSign ? 0x80000000 : 0x3F800000;
1680 }
1681 return packFloat32( aSign, 0, 0 );
1682 }
1683 lastBitMask = 1;
1684 lastBitMask <<= 0x96 - aExp;
1685 roundBitsMask = lastBitMask - 1;
1686 z = a;
1687 roundingMode = float_rounding_mode();
1688 if ( roundingMode == float_round_nearest_even ) {
1689 z += lastBitMask>>1;
1690 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1691 }
1692 else if ( roundingMode != float_round_to_zero ) {
1693 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1694 z += roundBitsMask;
1695 }
1696 }
1697 z &= ~ roundBitsMask;
1698 if ( z != a ) float_set_inexact();
1699 return z;
1700
1701 }
1702 #endif /* !SOFTFLOAT_FOR_GCC */
1703
1704 /*
1705 -------------------------------------------------------------------------------
1706 Returns the result of adding the absolute values of the single-precision
1707 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1708 before being returned. `zSign' is ignored if the result is a NaN.
1709 The addition is performed according to the IEC/IEEE Standard for Binary
1710 Floating-Point Arithmetic.
1711 -------------------------------------------------------------------------------
1712 */
addFloat32Sigs(float32 a,float32 b,flag zSign)1713 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1714 {
1715 int16 aExp, bExp, zExp;
1716 bits32 aSig, bSig, zSig;
1717 int16 expDiff;
1718
1719 aSig = extractFloat32Frac( a );
1720 aExp = extractFloat32Exp( a );
1721 bSig = extractFloat32Frac( b );
1722 bExp = extractFloat32Exp( b );
1723 expDiff = aExp - bExp;
1724 aSig <<= 6;
1725 bSig <<= 6;
1726 if ( 0 < expDiff ) {
1727 if ( aExp == 0xFF ) {
1728 if ( aSig ) return propagateFloat32NaN( a, b );
1729 return a;
1730 }
1731 if ( bExp == 0 ) {
1732 --expDiff;
1733 }
1734 else {
1735 bSig |= 0x20000000;
1736 }
1737 shift32RightJamming( bSig, expDiff, &bSig );
1738 zExp = aExp;
1739 }
1740 else if ( expDiff < 0 ) {
1741 if ( bExp == 0xFF ) {
1742 if ( bSig ) return propagateFloat32NaN( a, b );
1743 return packFloat32( zSign, 0xFF, 0 );
1744 }
1745 if ( aExp == 0 ) {
1746 ++expDiff;
1747 }
1748 else {
1749 aSig |= 0x20000000;
1750 }
1751 shift32RightJamming( aSig, - expDiff, &aSig );
1752 zExp = bExp;
1753 }
1754 else {
1755 if ( aExp == 0xFF ) {
1756 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1757 return a;
1758 }
1759 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1760 zSig = 0x40000000 + aSig + bSig;
1761 zExp = aExp;
1762 goto roundAndPack;
1763 }
1764 aSig |= 0x20000000;
1765 zSig = ( aSig + bSig )<<1;
1766 --zExp;
1767 if ( (sbits32) zSig < 0 ) {
1768 zSig = aSig + bSig;
1769 ++zExp;
1770 }
1771 roundAndPack:
1772 return roundAndPackFloat32( zSign, zExp, zSig );
1773
1774 }
1775
1776 /*
1777 -------------------------------------------------------------------------------
1778 Returns the result of subtracting the absolute values of the single-
1779 precision floating-point values `a' and `b'. If `zSign' is 1, the
1780 difference is negated before being returned. `zSign' is ignored if the
1781 result is a NaN. The subtraction is performed according to the IEC/IEEE
1782 Standard for Binary Floating-Point Arithmetic.
1783 -------------------------------------------------------------------------------
1784 */
subFloat32Sigs(float32 a,float32 b,flag zSign)1785 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1786 {
1787 int16 aExp, bExp, zExp;
1788 bits32 aSig, bSig, zSig;
1789 int16 expDiff;
1790
1791 aSig = extractFloat32Frac( a );
1792 aExp = extractFloat32Exp( a );
1793 bSig = extractFloat32Frac( b );
1794 bExp = extractFloat32Exp( b );
1795 expDiff = aExp - bExp;
1796 aSig <<= 7;
1797 bSig <<= 7;
1798 if ( 0 < expDiff ) goto aExpBigger;
1799 if ( expDiff < 0 ) goto bExpBigger;
1800 if ( aExp == 0xFF ) {
1801 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1802 float_raise( float_flag_invalid );
1803 return float32_default_nan;
1804 }
1805 if ( aExp == 0 ) {
1806 aExp = 1;
1807 bExp = 1;
1808 }
1809 if ( bSig < aSig ) goto aBigger;
1810 if ( aSig < bSig ) goto bBigger;
1811 return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );
1812 bExpBigger:
1813 if ( bExp == 0xFF ) {
1814 if ( bSig ) return propagateFloat32NaN( a, b );
1815 return packFloat32( zSign ^ 1, 0xFF, 0 );
1816 }
1817 if ( aExp == 0 ) {
1818 ++expDiff;
1819 }
1820 else {
1821 aSig |= 0x40000000;
1822 }
1823 shift32RightJamming( aSig, - expDiff, &aSig );
1824 bSig |= 0x40000000;
1825 bBigger:
1826 zSig = bSig - aSig;
1827 zExp = bExp;
1828 zSign ^= 1;
1829 goto normalizeRoundAndPack;
1830 aExpBigger:
1831 if ( aExp == 0xFF ) {
1832 if ( aSig ) return propagateFloat32NaN( a, b );
1833 return a;
1834 }
1835 if ( bExp == 0 ) {
1836 --expDiff;
1837 }
1838 else {
1839 bSig |= 0x40000000;
1840 }
1841 shift32RightJamming( bSig, expDiff, &bSig );
1842 aSig |= 0x40000000;
1843 aBigger:
1844 zSig = aSig - bSig;
1845 zExp = aExp;
1846 normalizeRoundAndPack:
1847 --zExp;
1848 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1849
1850 }
1851
1852 /*
1853 -------------------------------------------------------------------------------
1854 Returns the result of adding the single-precision floating-point values `a'
1855 and `b'. The operation is performed according to the IEC/IEEE Standard for
1856 Binary Floating-Point Arithmetic.
1857 -------------------------------------------------------------------------------
1858 */
float32_add(float32 a,float32 b)1859 float32 float32_add( float32 a, float32 b )
1860 {
1861 flag aSign, bSign;
1862
1863 aSign = extractFloat32Sign( a );
1864 bSign = extractFloat32Sign( b );
1865 if ( aSign == bSign ) {
1866 return addFloat32Sigs( a, b, aSign );
1867 }
1868 else {
1869 return subFloat32Sigs( a, b, aSign );
1870 }
1871
1872 }
1873
1874 /*
1875 -------------------------------------------------------------------------------
1876 Returns the result of subtracting the single-precision floating-point values
1877 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1878 for Binary Floating-Point Arithmetic.
1879 -------------------------------------------------------------------------------
1880 */
float32_sub(float32 a,float32 b)1881 float32 float32_sub( float32 a, float32 b )
1882 {
1883 flag aSign, bSign;
1884
1885 aSign = extractFloat32Sign( a );
1886 bSign = extractFloat32Sign( b );
1887 if ( aSign == bSign ) {
1888 return subFloat32Sigs( a, b, aSign );
1889 }
1890 else {
1891 return addFloat32Sigs( a, b, aSign );
1892 }
1893
1894 }
1895
1896 /*
1897 -------------------------------------------------------------------------------
1898 Returns the result of multiplying the single-precision floating-point values
1899 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1900 for Binary Floating-Point Arithmetic.
1901 -------------------------------------------------------------------------------
1902 */
float32_mul(float32 a,float32 b)1903 float32 float32_mul( float32 a, float32 b )
1904 {
1905 flag aSign, bSign, zSign;
1906 int16 aExp, bExp, zExp;
1907 bits32 aSig, bSig;
1908 bits64 zSig64;
1909 bits32 zSig;
1910
1911 aSig = extractFloat32Frac( a );
1912 aExp = extractFloat32Exp( a );
1913 aSign = extractFloat32Sign( a );
1914 bSig = extractFloat32Frac( b );
1915 bExp = extractFloat32Exp( b );
1916 bSign = extractFloat32Sign( b );
1917 zSign = aSign ^ bSign;
1918 if ( aExp == 0xFF ) {
1919 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1920 return propagateFloat32NaN( a, b );
1921 }
1922 if ( ( bExp | bSig ) == 0 ) {
1923 float_raise( float_flag_invalid );
1924 return float32_default_nan;
1925 }
1926 return packFloat32( zSign, 0xFF, 0 );
1927 }
1928 if ( bExp == 0xFF ) {
1929 if ( bSig ) return propagateFloat32NaN( a, b );
1930 if ( ( aExp | aSig ) == 0 ) {
1931 float_raise( float_flag_invalid );
1932 return float32_default_nan;
1933 }
1934 return packFloat32( zSign, 0xFF, 0 );
1935 }
1936 if ( aExp == 0 ) {
1937 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1938 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1939 }
1940 if ( bExp == 0 ) {
1941 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1942 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1943 }
1944 zExp = aExp + bExp - 0x7F;
1945 aSig = ( aSig | 0x00800000 )<<7;
1946 bSig = ( bSig | 0x00800000 )<<8;
1947 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1948 zSig = zSig64;
1949 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1950 zSig <<= 1;
1951 --zExp;
1952 }
1953 return roundAndPackFloat32( zSign, zExp, zSig );
1954
1955 }
1956
1957 /*
1958 -------------------------------------------------------------------------------
1959 Returns the result of dividing the single-precision floating-point value `a'
1960 by the corresponding value `b'. The operation is performed according to the
1961 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1962 -------------------------------------------------------------------------------
1963 */
float32_div(float32 a,float32 b)1964 float32 float32_div( float32 a, float32 b )
1965 {
1966 flag aSign, bSign, zSign;
1967 int16 aExp, bExp, zExp;
1968 bits32 aSig, bSig, zSig;
1969
1970 aSig = extractFloat32Frac( a );
1971 aExp = extractFloat32Exp( a );
1972 aSign = extractFloat32Sign( a );
1973 bSig = extractFloat32Frac( b );
1974 bExp = extractFloat32Exp( b );
1975 bSign = extractFloat32Sign( b );
1976 zSign = aSign ^ bSign;
1977 if ( aExp == 0xFF ) {
1978 if ( aSig ) return propagateFloat32NaN( a, b );
1979 if ( bExp == 0xFF ) {
1980 if ( bSig ) return propagateFloat32NaN( a, b );
1981 float_raise( float_flag_invalid );
1982 return float32_default_nan;
1983 }
1984 return packFloat32( zSign, 0xFF, 0 );
1985 }
1986 if ( bExp == 0xFF ) {
1987 if ( bSig ) return propagateFloat32NaN( a, b );
1988 return packFloat32( zSign, 0, 0 );
1989 }
1990 if ( bExp == 0 ) {
1991 if ( bSig == 0 ) {
1992 if ( ( aExp | aSig ) == 0 ) {
1993 float_raise( float_flag_invalid );
1994 return float32_default_nan;
1995 }
1996 float_raise( float_flag_divbyzero );
1997 return packFloat32( zSign, 0xFF, 0 );
1998 }
1999 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2000 }
2001 if ( aExp == 0 ) {
2002 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2003 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2004 }
2005 zExp = aExp - bExp + 0x7D;
2006 aSig = ( aSig | 0x00800000 )<<7;
2007 bSig = ( bSig | 0x00800000 )<<8;
2008 if ( bSig <= ( aSig + aSig ) ) {
2009 aSig >>= 1;
2010 ++zExp;
2011 }
2012 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
2013 if ( ( zSig & 0x3F ) == 0 ) {
2014 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2015 }
2016 return roundAndPackFloat32( zSign, zExp, zSig );
2017
2018 }
2019
2020 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2021 /*
2022 -------------------------------------------------------------------------------
2023 Returns the remainder of the single-precision floating-point value `a'
2024 with respect to the corresponding value `b'. The operation is performed
2025 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2026 -------------------------------------------------------------------------------
2027 */
float32_rem(float32 a,float32 b)2028 float32 float32_rem( float32 a, float32 b )
2029 {
2030 flag aSign, bSign, zSign;
2031 int16 aExp, bExp, expDiff;
2032 bits32 aSig, bSig;
2033 bits32 q;
2034 bits64 aSig64, bSig64, q64;
2035 bits32 alternateASig;
2036 sbits32 sigMean;
2037
2038 aSig = extractFloat32Frac( a );
2039 aExp = extractFloat32Exp( a );
2040 aSign = extractFloat32Sign( a );
2041 bSig = extractFloat32Frac( b );
2042 bExp = extractFloat32Exp( b );
2043 bSign = extractFloat32Sign( b );
2044 if ( aExp == 0xFF ) {
2045 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2046 return propagateFloat32NaN( a, b );
2047 }
2048 float_raise( float_flag_invalid );
2049 return float32_default_nan;
2050 }
2051 if ( bExp == 0xFF ) {
2052 if ( bSig ) return propagateFloat32NaN( a, b );
2053 return a;
2054 }
2055 if ( bExp == 0 ) {
2056 if ( bSig == 0 ) {
2057 float_raise( float_flag_invalid );
2058 return float32_default_nan;
2059 }
2060 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2061 }
2062 if ( aExp == 0 ) {
2063 if ( aSig == 0 ) return a;
2064 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2065 }
2066 expDiff = aExp - bExp;
2067 aSig |= 0x00800000;
2068 bSig |= 0x00800000;
2069 if ( expDiff < 32 ) {
2070 aSig <<= 8;
2071 bSig <<= 8;
2072 if ( expDiff < 0 ) {
2073 if ( expDiff < -1 ) return a;
2074 aSig >>= 1;
2075 }
2076 q = ( bSig <= aSig );
2077 if ( q ) aSig -= bSig;
2078 if ( 0 < expDiff ) {
2079 q = ( ( (bits64) aSig )<<32 ) / bSig;
2080 q >>= 32 - expDiff;
2081 bSig >>= 2;
2082 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2083 }
2084 else {
2085 aSig >>= 2;
2086 bSig >>= 2;
2087 }
2088 }
2089 else {
2090 if ( bSig <= aSig ) aSig -= bSig;
2091 aSig64 = ( (bits64) aSig )<<40;
2092 bSig64 = ( (bits64) bSig )<<40;
2093 expDiff -= 64;
2094 while ( 0 < expDiff ) {
2095 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2096 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2097 aSig64 = - ( ( bSig * q64 )<<38 );
2098 expDiff -= 62;
2099 }
2100 expDiff += 64;
2101 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2102 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2103 q = q64>>( 64 - expDiff );
2104 bSig <<= 6;
2105 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2106 }
2107 do {
2108 alternateASig = aSig;
2109 ++q;
2110 aSig -= bSig;
2111 } while ( 0 <= (sbits32) aSig );
2112 sigMean = aSig + alternateASig;
2113 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2114 aSig = alternateASig;
2115 }
2116 zSign = ( (sbits32) aSig < 0 );
2117 if ( zSign ) aSig = - aSig;
2118 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2119
2120 }
2121 #endif /* !SOFTFLOAT_FOR_GCC */
2122
2123 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2124 /*
2125 -------------------------------------------------------------------------------
2126 Returns the square root of the single-precision floating-point value `a'.
2127 The operation is performed according to the IEC/IEEE Standard for Binary
2128 Floating-Point Arithmetic.
2129 -------------------------------------------------------------------------------
2130 */
float32_sqrt(float32 a)2131 float32 float32_sqrt( float32 a )
2132 {
2133 flag aSign;
2134 int16 aExp, zExp;
2135 bits32 aSig, zSig;
2136 bits64 rem, term;
2137
2138 aSig = extractFloat32Frac( a );
2139 aExp = extractFloat32Exp( a );
2140 aSign = extractFloat32Sign( a );
2141 if ( aExp == 0xFF ) {
2142 if ( aSig ) return propagateFloat32NaN( a, 0 );
2143 if ( ! aSign ) return a;
2144 float_raise( float_flag_invalid );
2145 return float32_default_nan;
2146 }
2147 if ( aSign ) {
2148 if ( ( aExp | aSig ) == 0 ) return a;
2149 float_raise( float_flag_invalid );
2150 return float32_default_nan;
2151 }
2152 if ( aExp == 0 ) {
2153 if ( aSig == 0 ) return 0;
2154 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2155 }
2156 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2157 aSig = ( aSig | 0x00800000 )<<8;
2158 zSig = estimateSqrt32( aExp, aSig ) + 2;
2159 if ( ( zSig & 0x7F ) <= 5 ) {
2160 if ( zSig < 2 ) {
2161 zSig = 0x7FFFFFFF;
2162 goto roundAndPack;
2163 }
2164 aSig >>= aExp & 1;
2165 term = ( (bits64) zSig ) * zSig;
2166 rem = ( ( (bits64) aSig )<<32 ) - term;
2167 while ( (sbits64) rem < 0 ) {
2168 --zSig;
2169 rem += ( ( (bits64) zSig )<<1 ) | 1;
2170 }
2171 zSig |= ( rem != 0 );
2172 }
2173 shift32RightJamming( zSig, 1, &zSig );
2174 roundAndPack:
2175 return roundAndPackFloat32( 0, zExp, zSig );
2176
2177 }
2178 #endif /* !SOFTFLOAT_FOR_GCC */
2179
2180 /*
2181 -------------------------------------------------------------------------------
2182 Returns 1 if the single-precision floating-point value `a' is equal to
2183 the corresponding value `b', and 0 otherwise. The comparison is performed
2184 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2185 -------------------------------------------------------------------------------
2186 */
float32_eq(float32 a,float32 b)2187 flag float32_eq( float32 a, float32 b )
2188 {
2189
2190 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2191 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2192 ) {
2193 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2194 float_raise( float_flag_invalid );
2195 }
2196 return 0;
2197 }
2198 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2199
2200 }
2201
2202 /*
2203 -------------------------------------------------------------------------------
2204 Returns 1 if the single-precision floating-point value `a' is less than
2205 or equal to the corresponding value `b', and 0 otherwise. The comparison
2206 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2207 Arithmetic.
2208 -------------------------------------------------------------------------------
2209 */
float32_le(float32 a,float32 b)2210 flag float32_le( float32 a, float32 b )
2211 {
2212 flag aSign, bSign;
2213
2214 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2215 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2216 ) {
2217 float_raise( float_flag_invalid );
2218 return 0;
2219 }
2220 aSign = extractFloat32Sign( a );
2221 bSign = extractFloat32Sign( b );
2222 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2223 return ( a == b ) || ( aSign ^ ( a < b ) );
2224
2225 }
2226
2227 /*
2228 -------------------------------------------------------------------------------
2229 Returns 1 if the single-precision floating-point value `a' is less than
2230 the corresponding value `b', and 0 otherwise. The comparison is performed
2231 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2232 -------------------------------------------------------------------------------
2233 */
float32_lt(float32 a,float32 b)2234 flag float32_lt( float32 a, float32 b )
2235 {
2236 flag aSign, bSign;
2237
2238 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2239 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2240 ) {
2241 float_raise( float_flag_invalid );
2242 return 0;
2243 }
2244 aSign = extractFloat32Sign( a );
2245 bSign = extractFloat32Sign( b );
2246 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2247 return ( a != b ) && ( aSign ^ ( a < b ) );
2248
2249 }
2250
2251 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2252 /*
2253 -------------------------------------------------------------------------------
2254 Returns 1 if the single-precision floating-point value `a' is equal to
2255 the corresponding value `b', and 0 otherwise. The invalid exception is
2256 raised if either operand is a NaN. Otherwise, the comparison is performed
2257 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2258 -------------------------------------------------------------------------------
2259 */
float32_eq_signaling(float32 a,float32 b)2260 flag float32_eq_signaling( float32 a, float32 b )
2261 {
2262
2263 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2264 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2265 ) {
2266 float_raise( float_flag_invalid );
2267 return 0;
2268 }
2269 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2270
2271 }
2272
2273 /*
2274 -------------------------------------------------------------------------------
2275 Returns 1 if the single-precision floating-point value `a' is less than or
2276 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2277 cause an exception. Otherwise, the comparison is performed according to the
2278 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2279 -------------------------------------------------------------------------------
2280 */
float32_le_quiet(float32 a,float32 b)2281 flag float32_le_quiet( float32 a, float32 b )
2282 {
2283 flag aSign, bSign;
2284
2285 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2286 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2287 ) {
2288 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2289 float_raise( float_flag_invalid );
2290 }
2291 return 0;
2292 }
2293 aSign = extractFloat32Sign( a );
2294 bSign = extractFloat32Sign( b );
2295 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2296 return ( a == b ) || ( aSign ^ ( a < b ) );
2297
2298 }
2299
2300 /*
2301 -------------------------------------------------------------------------------
2302 Returns 1 if the single-precision floating-point value `a' is less than
2303 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2304 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2305 Standard for Binary Floating-Point Arithmetic.
2306 -------------------------------------------------------------------------------
2307 */
float32_lt_quiet(float32 a,float32 b)2308 flag float32_lt_quiet( float32 a, float32 b )
2309 {
2310 flag aSign, bSign;
2311
2312 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2313 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2314 ) {
2315 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2316 float_raise( float_flag_invalid );
2317 }
2318 return 0;
2319 }
2320 aSign = extractFloat32Sign( a );
2321 bSign = extractFloat32Sign( b );
2322 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2323 return ( a != b ) && ( aSign ^ ( a < b ) );
2324
2325 }
2326 #endif /* !SOFTFLOAT_FOR_GCC */
2327
2328 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2329 /*
2330 -------------------------------------------------------------------------------
2331 Returns the result of converting the double-precision floating-point value
2332 `a' to the 32-bit two's complement integer format. The conversion is
2333 performed according to the IEC/IEEE Standard for Binary Floating-Point
2334 Arithmetic---which means in particular that the conversion is rounded
2335 according to the current rounding mode. If `a' is a NaN, the largest
2336 positive integer is returned. Otherwise, if the conversion overflows, the
2337 largest integer with the same sign as `a' is returned.
2338 -------------------------------------------------------------------------------
2339 */
float64_to_int32(float64 a)2340 int32 float64_to_int32( float64 a )
2341 {
2342 flag aSign;
2343 int16 aExp, shiftCount;
2344 bits64 aSig;
2345
2346 aSig = extractFloat64Frac( a );
2347 aExp = extractFloat64Exp( a );
2348 aSign = extractFloat64Sign( a );
2349 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2350 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2351 shiftCount = 0x42C - aExp;
2352 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2353 return roundAndPackInt32( aSign, aSig );
2354
2355 }
2356 #endif /* !SOFTFLOAT_FOR_GCC */
2357
2358 /*
2359 -------------------------------------------------------------------------------
2360 Returns the result of converting the double-precision floating-point value
2361 `a' to the 32-bit two's complement integer format. The conversion is
2362 performed according to the IEC/IEEE Standard for Binary Floating-Point
2363 Arithmetic, except that the conversion is always rounded toward zero.
2364 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2365 the conversion overflows, the largest integer with the same sign as `a' is
2366 returned.
2367 -------------------------------------------------------------------------------
2368 */
float64_to_int32_round_to_zero(float64 a)2369 int32 float64_to_int32_round_to_zero( float64 a )
2370 {
2371 flag aSign;
2372 int16 aExp, shiftCount;
2373 bits64 aSig, savedASig;
2374 int32 z;
2375
2376 aSig = extractFloat64Frac( a );
2377 aExp = extractFloat64Exp( a );
2378 aSign = extractFloat64Sign( a );
2379 if ( 0x41E < aExp ) {
2380 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2381 goto invalid;
2382 }
2383 else if ( aExp < 0x3FF ) {
2384 if ( aExp || aSig ) float_set_inexact();
2385 return 0;
2386 }
2387 aSig |= LIT64( 0x0010000000000000 );
2388 shiftCount = 0x433 - aExp;
2389 savedASig = aSig;
2390 aSig >>= shiftCount;
2391 z = aSig;
2392 if ( aSign ) z = - z;
2393 if ( ( z < 0 ) ^ aSign ) {
2394 invalid:
2395 float_raise( float_flag_invalid );
2396 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2397 }
2398 if ( ( aSig<<shiftCount ) != savedASig ) {
2399 float_set_inexact();
2400 }
2401 return z;
2402
2403 }
2404
2405 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2406 /*
2407 -------------------------------------------------------------------------------
2408 Returns the result of converting the double-precision floating-point value
2409 `a' to the 64-bit two's complement integer format. The conversion is
2410 performed according to the IEC/IEEE Standard for Binary Floating-Point
2411 Arithmetic---which means in particular that the conversion is rounded
2412 according to the current rounding mode. If `a' is a NaN, the largest
2413 positive integer is returned. Otherwise, if the conversion overflows, the
2414 largest integer with the same sign as `a' is returned.
2415 -------------------------------------------------------------------------------
2416 */
float64_to_int64(float64 a)2417 int64 float64_to_int64( float64 a )
2418 {
2419 flag aSign;
2420 int16 aExp, shiftCount;
2421 bits64 aSig, aSigExtra;
2422
2423 aSig = extractFloat64Frac( a );
2424 aExp = extractFloat64Exp( a );
2425 aSign = extractFloat64Sign( a );
2426 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2427 shiftCount = 0x433 - aExp;
2428 if ( shiftCount <= 0 ) {
2429 if ( 0x43E < aExp ) {
2430 float_raise( float_flag_invalid );
2431 if ( ! aSign
2432 || ( ( aExp == 0x7FF )
2433 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2434 ) {
2435 return LIT64( 0x7FFFFFFFFFFFFFFF );
2436 }
2437 return (sbits64) LIT64( 0x8000000000000000 );
2438 }
2439 aSigExtra = 0;
2440 aSig <<= - shiftCount;
2441 }
2442 else {
2443 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2444 }
2445 return roundAndPackInt64( aSign, aSig, aSigExtra );
2446
2447 }
2448
2449 #ifdef __alpha__
2450 /*
2451 -------------------------------------------------------------------------------
2452 Returns the result of converting the double-precision floating-point value
2453 `a' to the 64-bit two's complement integer format. The conversion is
2454 performed according to the IEC/IEEE Standard for Binary Floating-Point
2455 Arithmetic---which means in particular that the conversion is rounded
2456 according to the current rounding mode. If `a' is a NaN, the invalid
2457 exception is raised and zero is returned.
2458 -------------------------------------------------------------------------------
2459 */
float64_to_int64_no_overflow(float64 a)2460 int64 float64_to_int64_no_overflow( float64 a )
2461 {
2462 flag aSign;
2463 int16 aExp, shiftCount;
2464 bits64 aSig, aSigExtra;
2465
2466 aSig = extractFloat64Frac( a );
2467 aExp = extractFloat64Exp( a );
2468 aSign = extractFloat64Sign( a );
2469 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2470 shiftCount = 0x433 - aExp;
2471 if ( shiftCount <= 0 ) {
2472 if ( 0x43E < aExp ) {
2473 float_raise( float_flag_invalid );
2474 return 0;
2475 }
2476 aSigExtra = 0;
2477 aSig <<= - shiftCount;
2478 }
2479 else {
2480 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2481 }
2482 return roundAndPackInt64NoOverflow( aSign, aSig, aSigExtra );
2483
2484 }
2485 #endif /* __alpha__ */
2486
2487 /*
2488 -------------------------------------------------------------------------------
2489 Returns the result of converting the double-precision floating-point value
2490 `a' to the 64-bit two's complement integer format. The conversion is
2491 performed according to the IEC/IEEE Standard for Binary Floating-Point
2492 Arithmetic, except that the conversion is always rounded toward zero.
2493 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2494 the conversion overflows, the largest integer with the same sign as `a' is
2495 returned.
2496 -------------------------------------------------------------------------------
2497 */
float64_to_int64_round_to_zero(float64 a)2498 int64 float64_to_int64_round_to_zero( float64 a )
2499 {
2500 flag aSign;
2501 int16 aExp, shiftCount;
2502 bits64 aSig;
2503 int64 z;
2504
2505 aSig = extractFloat64Frac( a );
2506 aExp = extractFloat64Exp( a );
2507 aSign = extractFloat64Sign( a );
2508 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2509 shiftCount = aExp - 0x433;
2510 if ( 0 <= shiftCount ) {
2511 if ( 0x43E <= aExp ) {
2512 if ( a != LIT64( 0xC3E0000000000000 ) ) {
2513 float_raise( float_flag_invalid );
2514 if ( ! aSign
2515 || ( ( aExp == 0x7FF )
2516 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2517 ) {
2518 return LIT64( 0x7FFFFFFFFFFFFFFF );
2519 }
2520 }
2521 return (sbits64) LIT64( 0x8000000000000000 );
2522 }
2523 z = aSig<<shiftCount;
2524 }
2525 else {
2526 if ( aExp < 0x3FE ) {
2527 if ( aExp | aSig ) float_set_inexact();
2528 return 0;
2529 }
2530 z = aSig>>( - shiftCount );
2531 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2532 float_set_inexact();
2533 }
2534 }
2535 if ( aSign ) z = - z;
2536 return z;
2537
2538 }
2539 #endif /* !SOFTFLOAT_FOR_GCC */
2540
2541 /*
2542 -------------------------------------------------------------------------------
2543 Returns the result of converting the double-precision floating-point value
2544 `a' to the single-precision floating-point format. The conversion is
2545 performed according to the IEC/IEEE Standard for Binary Floating-Point
2546 Arithmetic.
2547 -------------------------------------------------------------------------------
2548 */
float64_to_float32(float64 a)2549 float32 float64_to_float32( float64 a )
2550 {
2551 flag aSign;
2552 int16 aExp;
2553 bits64 aSig;
2554 bits32 zSig;
2555
2556 aSig = extractFloat64Frac( a );
2557 aExp = extractFloat64Exp( a );
2558 aSign = extractFloat64Sign( a );
2559 if ( aExp == 0x7FF ) {
2560 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2561 return packFloat32( aSign, 0xFF, 0 );
2562 }
2563 shift64RightJamming( aSig, 22, &aSig );
2564 zSig = aSig;
2565 if ( aExp || zSig ) {
2566 zSig |= 0x40000000;
2567 aExp -= 0x381;
2568 }
2569 return roundAndPackFloat32( aSign, aExp, zSig );
2570
2571 }
2572
2573 #ifdef FLOATX80
2574
2575 /*
2576 -------------------------------------------------------------------------------
2577 Returns the result of converting the double-precision floating-point value
2578 `a' to the extended double-precision floating-point format. The conversion
2579 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2580 Arithmetic.
2581 -------------------------------------------------------------------------------
2582 */
float64_to_floatx80(float64 a)2583 floatx80 float64_to_floatx80( float64 a )
2584 {
2585 flag aSign;
2586 int16 aExp;
2587 bits64 aSig;
2588
2589 aSig = extractFloat64Frac( a );
2590 aExp = extractFloat64Exp( a );
2591 aSign = extractFloat64Sign( a );
2592 if ( aExp == 0x7FF ) {
2593 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2594 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2595 }
2596 if ( aExp == 0 ) {
2597 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2598 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2599 }
2600 return
2601 packFloatx80(
2602 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2603
2604 }
2605
2606 #endif
2607
2608 #ifdef FLOAT128
2609
2610 /*
2611 -------------------------------------------------------------------------------
2612 Returns the result of converting the double-precision floating-point value
2613 `a' to the quadruple-precision floating-point format. The conversion is
2614 performed according to the IEC/IEEE Standard for Binary Floating-Point
2615 Arithmetic.
2616 -------------------------------------------------------------------------------
2617 */
float64_to_float128(float64 a)2618 float128 float64_to_float128( float64 a )
2619 {
2620 flag aSign;
2621 int16 aExp;
2622 bits64 aSig, zSig0, zSig1;
2623
2624 aSig = extractFloat64Frac( a );
2625 aExp = extractFloat64Exp( a );
2626 aSign = extractFloat64Sign( a );
2627 if ( aExp == 0x7FF ) {
2628 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2629 return packFloat128( aSign, 0x7FFF, 0, 0 );
2630 }
2631 if ( aExp == 0 ) {
2632 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2633 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2634 --aExp;
2635 }
2636 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2637 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2638
2639 }
2640
2641 #endif
2642
2643 #ifndef SOFTFLOAT_FOR_GCC
2644 /*
2645 -------------------------------------------------------------------------------
2646 Rounds the double-precision floating-point value `a' to an integer, and
2647 returns the result as a double-precision floating-point value. The
2648 operation is performed according to the IEC/IEEE Standard for Binary
2649 Floating-Point Arithmetic.
2650 -------------------------------------------------------------------------------
2651 */
float64_round_to_int(float64 a)2652 float64 float64_round_to_int( float64 a )
2653 {
2654 flag aSign;
2655 int16 aExp;
2656 bits64 lastBitMask, roundBitsMask;
2657 int8 roundingMode;
2658 float64 z;
2659
2660 aExp = extractFloat64Exp( a );
2661 if ( 0x433 <= aExp ) {
2662 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2663 return propagateFloat64NaN( a, a );
2664 }
2665 return a;
2666 }
2667 if ( aExp < 0x3FF ) {
2668 if ( (bits64) ( a<<1 ) == 0 ) return a;
2669 float_set_inexact();
2670 aSign = extractFloat64Sign( a );
2671 switch ( float_rounding_mode() ) {
2672 case float_round_nearest_even:
2673 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2674 return packFloat64( aSign, 0x3FF, 0 );
2675 }
2676 break;
2677 case float_round_down:
2678 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2679 case float_round_up:
2680 return
2681 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2682 }
2683 return packFloat64( aSign, 0, 0 );
2684 }
2685 lastBitMask = 1;
2686 lastBitMask <<= 0x433 - aExp;
2687 roundBitsMask = lastBitMask - 1;
2688 z = a;
2689 roundingMode = float_rounding_mode();
2690 if ( roundingMode == float_round_nearest_even ) {
2691 z += lastBitMask>>1;
2692 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2693 }
2694 else if ( roundingMode != float_round_to_zero ) {
2695 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2696 z += roundBitsMask;
2697 }
2698 }
2699 z &= ~ roundBitsMask;
2700 if ( z != a ) float_set_inexact();
2701 return z;
2702
2703 }
2704 #endif
2705
2706 /*
2707 -------------------------------------------------------------------------------
2708 Returns the result of adding the absolute values of the double-precision
2709 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2710 before being returned. `zSign' is ignored if the result is a NaN.
2711 The addition is performed according to the IEC/IEEE Standard for Binary
2712 Floating-Point Arithmetic.
2713 -------------------------------------------------------------------------------
2714 */
addFloat64Sigs(float64 a,float64 b,flag zSign)2715 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2716 {
2717 int16 aExp, bExp, zExp;
2718 bits64 aSig, bSig, zSig;
2719 int16 expDiff;
2720
2721 aSig = extractFloat64Frac( a );
2722 aExp = extractFloat64Exp( a );
2723 bSig = extractFloat64Frac( b );
2724 bExp = extractFloat64Exp( b );
2725 expDiff = aExp - bExp;
2726 aSig <<= 9;
2727 bSig <<= 9;
2728 if ( 0 < expDiff ) {
2729 if ( aExp == 0x7FF ) {
2730 if ( aSig ) return propagateFloat64NaN( a, b );
2731 return a;
2732 }
2733 if ( bExp == 0 ) {
2734 --expDiff;
2735 }
2736 else {
2737 bSig |= LIT64( 0x2000000000000000 );
2738 }
2739 shift64RightJamming( bSig, expDiff, &bSig );
2740 zExp = aExp;
2741 }
2742 else if ( expDiff < 0 ) {
2743 if ( bExp == 0x7FF ) {
2744 if ( bSig ) return propagateFloat64NaN( a, b );
2745 return packFloat64( zSign, 0x7FF, 0 );
2746 }
2747 if ( aExp == 0 ) {
2748 ++expDiff;
2749 }
2750 else {
2751 aSig |= LIT64( 0x2000000000000000 );
2752 }
2753 shift64RightJamming( aSig, - expDiff, &aSig );
2754 zExp = bExp;
2755 }
2756 else {
2757 if ( aExp == 0x7FF ) {
2758 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2759 return a;
2760 }
2761 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2762 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2763 zExp = aExp;
2764 goto roundAndPack;
2765 }
2766 aSig |= LIT64( 0x2000000000000000 );
2767 zSig = ( aSig + bSig )<<1;
2768 --zExp;
2769 if ( (sbits64) zSig < 0 ) {
2770 zSig = aSig + bSig;
2771 ++zExp;
2772 }
2773 roundAndPack:
2774 return roundAndPackFloat64( zSign, zExp, zSig );
2775
2776 }
2777
2778 /*
2779 -------------------------------------------------------------------------------
2780 Returns the result of subtracting the absolute values of the double-
2781 precision floating-point values `a' and `b'. If `zSign' is 1, the
2782 difference is negated before being returned. `zSign' is ignored if the
2783 result is a NaN. The subtraction is performed according to the IEC/IEEE
2784 Standard for Binary Floating-Point Arithmetic.
2785 -------------------------------------------------------------------------------
2786 */
subFloat64Sigs(float64 a,float64 b,flag zSign)2787 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2788 {
2789 int16 aExp, bExp, zExp;
2790 bits64 aSig, bSig, zSig;
2791 int16 expDiff;
2792
2793 aSig = extractFloat64Frac( a );
2794 aExp = extractFloat64Exp( a );
2795 bSig = extractFloat64Frac( b );
2796 bExp = extractFloat64Exp( b );
2797 expDiff = aExp - bExp;
2798 aSig <<= 10;
2799 bSig <<= 10;
2800 if ( 0 < expDiff ) goto aExpBigger;
2801 if ( expDiff < 0 ) goto bExpBigger;
2802 if ( aExp == 0x7FF ) {
2803 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2804 float_raise( float_flag_invalid );
2805 return float64_default_nan;
2806 }
2807 if ( aExp == 0 ) {
2808 aExp = 1;
2809 bExp = 1;
2810 }
2811 if ( bSig < aSig ) goto aBigger;
2812 if ( aSig < bSig ) goto bBigger;
2813 return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
2814 bExpBigger:
2815 if ( bExp == 0x7FF ) {
2816 if ( bSig ) return propagateFloat64NaN( a, b );
2817 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2818 }
2819 if ( aExp == 0 ) {
2820 ++expDiff;
2821 }
2822 else {
2823 aSig |= LIT64( 0x4000000000000000 );
2824 }
2825 shift64RightJamming( aSig, - expDiff, &aSig );
2826 bSig |= LIT64( 0x4000000000000000 );
2827 bBigger:
2828 zSig = bSig - aSig;
2829 zExp = bExp;
2830 zSign ^= 1;
2831 goto normalizeRoundAndPack;
2832 aExpBigger:
2833 if ( aExp == 0x7FF ) {
2834 if ( aSig ) return propagateFloat64NaN( a, b );
2835 return a;
2836 }
2837 if ( bExp == 0 ) {
2838 --expDiff;
2839 }
2840 else {
2841 bSig |= LIT64( 0x4000000000000000 );
2842 }
2843 shift64RightJamming( bSig, expDiff, &bSig );
2844 aSig |= LIT64( 0x4000000000000000 );
2845 aBigger:
2846 zSig = aSig - bSig;
2847 zExp = aExp;
2848 normalizeRoundAndPack:
2849 --zExp;
2850 return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2851
2852 }
2853
2854 /*
2855 -------------------------------------------------------------------------------
2856 Returns the result of adding the double-precision floating-point values `a'
2857 and `b'. The operation is performed according to the IEC/IEEE Standard for
2858 Binary Floating-Point Arithmetic.
2859 -------------------------------------------------------------------------------
2860 */
float64_add(float64 a,float64 b)2861 float64 float64_add( float64 a, float64 b )
2862 {
2863 flag aSign, bSign;
2864
2865 aSign = extractFloat64Sign( a );
2866 bSign = extractFloat64Sign( b );
2867 if ( aSign == bSign ) {
2868 return addFloat64Sigs( a, b, aSign );
2869 }
2870 else {
2871 return subFloat64Sigs( a, b, aSign );
2872 }
2873
2874 }
2875
2876 /*
2877 -------------------------------------------------------------------------------
2878 Returns the result of subtracting the double-precision floating-point values
2879 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2880 for Binary Floating-Point Arithmetic.
2881 -------------------------------------------------------------------------------
2882 */
float64_sub(float64 a,float64 b)2883 float64 float64_sub( float64 a, float64 b )
2884 {
2885 flag aSign, bSign;
2886
2887 aSign = extractFloat64Sign( a );
2888 bSign = extractFloat64Sign( b );
2889 if ( aSign == bSign ) {
2890 return subFloat64Sigs( a, b, aSign );
2891 }
2892 else {
2893 return addFloat64Sigs( a, b, aSign );
2894 }
2895
2896 }
2897
2898 /*
2899 -------------------------------------------------------------------------------
2900 Returns the result of multiplying the double-precision floating-point values
2901 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2902 for Binary Floating-Point Arithmetic.
2903 -------------------------------------------------------------------------------
2904 */
float64_mul(float64 a,float64 b)2905 float64 float64_mul( float64 a, float64 b )
2906 {
2907 flag aSign, bSign, zSign;
2908 int16 aExp, bExp, zExp;
2909 bits64 aSig, bSig, zSig0, zSig1;
2910
2911 aSig = extractFloat64Frac( a );
2912 aExp = extractFloat64Exp( a );
2913 aSign = extractFloat64Sign( a );
2914 bSig = extractFloat64Frac( b );
2915 bExp = extractFloat64Exp( b );
2916 bSign = extractFloat64Sign( b );
2917 zSign = aSign ^ bSign;
2918 if ( aExp == 0x7FF ) {
2919 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2920 return propagateFloat64NaN( a, b );
2921 }
2922 if ( ( bExp | bSig ) == 0 ) {
2923 float_raise( float_flag_invalid );
2924 return float64_default_nan;
2925 }
2926 return packFloat64( zSign, 0x7FF, 0 );
2927 }
2928 if ( bExp == 0x7FF ) {
2929 if ( bSig ) return propagateFloat64NaN( a, b );
2930 if ( ( aExp | aSig ) == 0 ) {
2931 float_raise( float_flag_invalid );
2932 return float64_default_nan;
2933 }
2934 return packFloat64( zSign, 0x7FF, 0 );
2935 }
2936 if ( aExp == 0 ) {
2937 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2938 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2939 }
2940 if ( bExp == 0 ) {
2941 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2942 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2943 }
2944 zExp = aExp + bExp - 0x3FF;
2945 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2946 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2947 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2948 zSig0 |= ( zSig1 != 0 );
2949 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2950 zSig0 <<= 1;
2951 --zExp;
2952 }
2953 return roundAndPackFloat64( zSign, zExp, zSig0 );
2954
2955 }
2956
2957 /*
2958 -------------------------------------------------------------------------------
2959 Returns the result of dividing the double-precision floating-point value `a'
2960 by the corresponding value `b'. The operation is performed according to
2961 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2962 -------------------------------------------------------------------------------
2963 */
float64_div(float64 a,float64 b)2964 float64 float64_div( float64 a, float64 b )
2965 {
2966 flag aSign, bSign, zSign;
2967 int16 aExp, bExp, zExp;
2968 bits64 aSig, bSig, zSig;
2969 bits64 rem0, rem1;
2970 bits64 term0, term1;
2971
2972 aSig = extractFloat64Frac( a );
2973 aExp = extractFloat64Exp( a );
2974 aSign = extractFloat64Sign( a );
2975 bSig = extractFloat64Frac( b );
2976 bExp = extractFloat64Exp( b );
2977 bSign = extractFloat64Sign( b );
2978 zSign = aSign ^ bSign;
2979 if ( aExp == 0x7FF ) {
2980 if ( aSig ) return propagateFloat64NaN( a, b );
2981 if ( bExp == 0x7FF ) {
2982 if ( bSig ) return propagateFloat64NaN( a, b );
2983 float_raise( float_flag_invalid );
2984 return float64_default_nan;
2985 }
2986 return packFloat64( zSign, 0x7FF, 0 );
2987 }
2988 if ( bExp == 0x7FF ) {
2989 if ( bSig ) return propagateFloat64NaN( a, b );
2990 return packFloat64( zSign, 0, 0 );
2991 }
2992 if ( bExp == 0 ) {
2993 if ( bSig == 0 ) {
2994 if ( ( aExp | aSig ) == 0 ) {
2995 float_raise( float_flag_invalid );
2996 return float64_default_nan;
2997 }
2998 float_raise( float_flag_divbyzero );
2999 return packFloat64( zSign, 0x7FF, 0 );
3000 }
3001 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3002 }
3003 if ( aExp == 0 ) {
3004 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3005 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3006 }
3007 zExp = aExp - bExp + 0x3FD;
3008 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3009 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3010 if ( bSig <= ( aSig + aSig ) ) {
3011 aSig >>= 1;
3012 ++zExp;
3013 }
3014 zSig = estimateDiv128To64( aSig, 0, bSig );
3015 if ( ( zSig & 0x1FF ) <= 2 ) {
3016 mul64To128( bSig, zSig, &term0, &term1 );
3017 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3018 while ( (sbits64) rem0 < 0 ) {
3019 --zSig;
3020 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3021 }
3022 zSig |= ( rem1 != 0 );
3023 }
3024 return roundAndPackFloat64( zSign, zExp, zSig );
3025
3026 }
3027
3028 #ifndef SOFTFLOAT_FOR_GCC
3029 /*
3030 -------------------------------------------------------------------------------
3031 Returns the remainder of the double-precision floating-point value `a'
3032 with respect to the corresponding value `b'. The operation is performed
3033 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3034 -------------------------------------------------------------------------------
3035 */
float64_rem(float64 a,float64 b)3036 float64 float64_rem( float64 a, float64 b )
3037 {
3038 flag aSign, bSign, zSign;
3039 int16 aExp, bExp, expDiff;
3040 bits64 aSig, bSig;
3041 bits64 q, alternateASig;
3042 sbits64 sigMean;
3043
3044 aSig = extractFloat64Frac( a );
3045 aExp = extractFloat64Exp( a );
3046 aSign = extractFloat64Sign( a );
3047 bSig = extractFloat64Frac( b );
3048 bExp = extractFloat64Exp( b );
3049 bSign = extractFloat64Sign( b );
3050 if ( aExp == 0x7FF ) {
3051 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3052 return propagateFloat64NaN( a, b );
3053 }
3054 float_raise( float_flag_invalid );
3055 return float64_default_nan;
3056 }
3057 if ( bExp == 0x7FF ) {
3058 if ( bSig ) return propagateFloat64NaN( a, b );
3059 return a;
3060 }
3061 if ( bExp == 0 ) {
3062 if ( bSig == 0 ) {
3063 float_raise( float_flag_invalid );
3064 return float64_default_nan;
3065 }
3066 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3067 }
3068 if ( aExp == 0 ) {
3069 if ( aSig == 0 ) return a;
3070 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3071 }
3072 expDiff = aExp - bExp;
3073 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3074 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3075 if ( expDiff < 0 ) {
3076 if ( expDiff < -1 ) return a;
3077 aSig >>= 1;
3078 }
3079 q = ( bSig <= aSig );
3080 if ( q ) aSig -= bSig;
3081 expDiff -= 64;
3082 while ( 0 < expDiff ) {
3083 q = estimateDiv128To64( aSig, 0, bSig );
3084 q = ( 2 < q ) ? q - 2 : 0;
3085 aSig = - ( ( bSig>>2 ) * q );
3086 expDiff -= 62;
3087 }
3088 expDiff += 64;
3089 if ( 0 < expDiff ) {
3090 q = estimateDiv128To64( aSig, 0, bSig );
3091 q = ( 2 < q ) ? q - 2 : 0;
3092 q >>= 64 - expDiff;
3093 bSig >>= 2;
3094 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3095 }
3096 else {
3097 aSig >>= 2;
3098 bSig >>= 2;
3099 }
3100 do {
3101 alternateASig = aSig;
3102 ++q;
3103 aSig -= bSig;
3104 } while ( 0 <= (sbits64) aSig );
3105 sigMean = aSig + alternateASig;
3106 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3107 aSig = alternateASig;
3108 }
3109 zSign = ( (sbits64) aSig < 0 );
3110 if ( zSign ) aSig = - aSig;
3111 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3112
3113 }
3114
3115 /*
3116 -------------------------------------------------------------------------------
3117 Returns the square root of the double-precision floating-point value `a'.
3118 The operation is performed according to the IEC/IEEE Standard for Binary
3119 Floating-Point Arithmetic.
3120 -------------------------------------------------------------------------------
3121 */
float64_sqrt(float64 a)3122 float64 float64_sqrt( float64 a )
3123 {
3124 flag aSign;
3125 int16 aExp, zExp;
3126 bits64 aSig, zSig, doubleZSig;
3127 bits64 rem0, rem1, term0, term1;
3128
3129 aSig = extractFloat64Frac( a );
3130 aExp = extractFloat64Exp( a );
3131 aSign = extractFloat64Sign( a );
3132 if ( aExp == 0x7FF ) {
3133 if ( aSig ) return propagateFloat64NaN( a, a );
3134 if ( ! aSign ) return a;
3135 float_raise( float_flag_invalid );
3136 return float64_default_nan;
3137 }
3138 if ( aSign ) {
3139 if ( ( aExp | aSig ) == 0 ) return a;
3140 float_raise( float_flag_invalid );
3141 return float64_default_nan;
3142 }
3143 if ( aExp == 0 ) {
3144 if ( aSig == 0 ) return 0;
3145 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3146 }
3147 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3148 aSig |= LIT64( 0x0010000000000000 );
3149 zSig = estimateSqrt32( aExp, aSig>>21 );
3150 aSig <<= 9 - ( aExp & 1 );
3151 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3152 if ( ( zSig & 0x1FF ) <= 5 ) {
3153 doubleZSig = zSig<<1;
3154 mul64To128( zSig, zSig, &term0, &term1 );
3155 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3156 while ( (sbits64) rem0 < 0 ) {
3157 --zSig;
3158 doubleZSig -= 2;
3159 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3160 }
3161 zSig |= ( ( rem0 | rem1 ) != 0 );
3162 }
3163 return roundAndPackFloat64( 0, zExp, zSig );
3164
3165 }
3166 #endif
3167
3168 /*
3169 -------------------------------------------------------------------------------
3170 Returns 1 if the double-precision floating-point value `a' is equal to the
3171 corresponding value `b', and 0 otherwise. The comparison is performed
3172 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3173 -------------------------------------------------------------------------------
3174 */
float64_eq(float64 a,float64 b)3175 flag float64_eq( float64 a, float64 b )
3176 {
3177
3178 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3179 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3180 ) {
3181 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3182 float_raise( float_flag_invalid );
3183 }
3184 return 0;
3185 }
3186 return ( a == b ) ||
3187 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3188
3189 }
3190
3191 /*
3192 -------------------------------------------------------------------------------
3193 Returns 1 if the double-precision floating-point value `a' is less than or
3194 equal to the corresponding value `b', and 0 otherwise. The comparison is
3195 performed according to the IEC/IEEE Standard for Binary Floating-Point
3196 Arithmetic.
3197 -------------------------------------------------------------------------------
3198 */
float64_le(float64 a,float64 b)3199 flag float64_le( float64 a, float64 b )
3200 {
3201 flag aSign, bSign;
3202
3203 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3204 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3205 ) {
3206 float_raise( float_flag_invalid );
3207 return 0;
3208 }
3209 aSign = extractFloat64Sign( a );
3210 bSign = extractFloat64Sign( b );
3211 if ( aSign != bSign )
3212 return aSign ||
3213 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3214 0 );
3215 return ( a == b ) ||
3216 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3217
3218 }
3219
3220 /*
3221 -------------------------------------------------------------------------------
3222 Returns 1 if the double-precision floating-point value `a' is less than
3223 the corresponding value `b', and 0 otherwise. The comparison is performed
3224 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3225 -------------------------------------------------------------------------------
3226 */
float64_lt(float64 a,float64 b)3227 flag float64_lt( float64 a, float64 b )
3228 {
3229 flag aSign, bSign;
3230
3231 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3232 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3233 ) {
3234 float_raise( float_flag_invalid );
3235 return 0;
3236 }
3237 aSign = extractFloat64Sign( a );
3238 bSign = extractFloat64Sign( b );
3239 if ( aSign != bSign )
3240 return aSign &&
3241 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3242 0 );
3243 return ( a != b ) &&
3244 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3245
3246 }
3247
3248 #ifndef SOFTFLOAT_FOR_GCC
3249 /*
3250 -------------------------------------------------------------------------------
3251 Returns 1 if the double-precision floating-point value `a' is equal to the
3252 corresponding value `b', and 0 otherwise. The invalid exception is raised
3253 if either operand is a NaN. Otherwise, the comparison is performed
3254 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3255 -------------------------------------------------------------------------------
3256 */
float64_eq_signaling(float64 a,float64 b)3257 flag float64_eq_signaling( float64 a, float64 b )
3258 {
3259
3260 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3261 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3262 ) {
3263 float_raise( float_flag_invalid );
3264 return 0;
3265 }
3266 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3267
3268 }
3269
3270 /*
3271 -------------------------------------------------------------------------------
3272 Returns 1 if the double-precision floating-point value `a' is less than or
3273 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3274 cause an exception. Otherwise, the comparison is performed according to the
3275 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3276 -------------------------------------------------------------------------------
3277 */
float64_le_quiet(float64 a,float64 b)3278 flag float64_le_quiet( float64 a, float64 b )
3279 {
3280 flag aSign, bSign;
3281
3282 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3283 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3284 ) {
3285 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3286 float_raise( float_flag_invalid );
3287 }
3288 return 0;
3289 }
3290 aSign = extractFloat64Sign( a );
3291 bSign = extractFloat64Sign( b );
3292 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3293 return ( a == b ) || ( aSign ^ ( a < b ) );
3294
3295 }
3296
3297 /*
3298 -------------------------------------------------------------------------------
3299 Returns 1 if the double-precision floating-point value `a' is less than
3300 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3301 exception. Otherwise, the comparison is performed according to the IEC/IEEE
3302 Standard for Binary Floating-Point Arithmetic.
3303 -------------------------------------------------------------------------------
3304 */
float64_lt_quiet(float64 a,float64 b)3305 flag float64_lt_quiet( float64 a, float64 b )
3306 {
3307 flag aSign, bSign;
3308
3309 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3310 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3311 ) {
3312 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3313 float_raise( float_flag_invalid );
3314 }
3315 return 0;
3316 }
3317 aSign = extractFloat64Sign( a );
3318 bSign = extractFloat64Sign( b );
3319 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3320 return ( a != b ) && ( aSign ^ ( a < b ) );
3321
3322 }
3323 #endif
3324
3325 #ifdef FLOATX80
3326
3327 /*
3328 -------------------------------------------------------------------------------
3329 Returns the result of converting the extended double-precision floating-
3330 point value `a' to the 32-bit two's complement integer format. The
3331 conversion is performed according to the IEC/IEEE Standard for Binary
3332 Floating-Point Arithmetic---which means in particular that the conversion
3333 is rounded according to the current rounding mode. If `a' is a NaN, the
3334 largest positive integer is returned. Otherwise, if the conversion
3335 overflows, the largest integer with the same sign as `a' is returned.
3336 -------------------------------------------------------------------------------
3337 */
floatx80_to_int32(floatx80 a)3338 int32 floatx80_to_int32( floatx80 a )
3339 {
3340 flag aSign;
3341 int32 aExp, shiftCount;
3342 bits64 aSig;
3343
3344 aSig = extractFloatx80Frac( a );
3345 aExp = extractFloatx80Exp( a );
3346 aSign = extractFloatx80Sign( a );
3347 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3348 shiftCount = 0x4037 - aExp;
3349 if ( shiftCount <= 0 ) shiftCount = 1;
3350 shift64RightJamming( aSig, shiftCount, &aSig );
3351 return roundAndPackInt32( aSign, aSig );
3352
3353 }
3354
3355 /*
3356 -------------------------------------------------------------------------------
3357 Returns the result of converting the extended double-precision floating-
3358 point value `a' to the 32-bit two's complement integer format. The
3359 conversion is performed according to the IEC/IEEE Standard for Binary
3360 Floating-Point Arithmetic, except that the conversion is always rounded
3361 toward zero. If `a' is a NaN, the largest positive integer is returned.
3362 Otherwise, if the conversion overflows, the largest integer with the same
3363 sign as `a' is returned.
3364 -------------------------------------------------------------------------------
3365 */
floatx80_to_int32_round_to_zero(floatx80 a)3366 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3367 {
3368 flag aSign;
3369 int32 aExp, shiftCount;
3370 bits64 aSig, savedASig;
3371 int32 z;
3372
3373 aSig = extractFloatx80Frac( a );
3374 aExp = extractFloatx80Exp( a );
3375 aSign = extractFloatx80Sign( a );
3376 if ( 0x401E < aExp ) {
3377 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3378 goto invalid;
3379 }
3380 else if ( aExp < 0x3FFF ) {
3381 if ( aExp || aSig ) float_set_inexact();
3382 return 0;
3383 }
3384 shiftCount = 0x403E - aExp;
3385 savedASig = aSig;
3386 aSig >>= shiftCount;
3387 z = aSig;
3388 if ( aSign ) z = - z;
3389 if ( ( z < 0 ) ^ aSign ) {
3390 invalid:
3391 float_raise( float_flag_invalid );
3392 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3393 }
3394 if ( ( aSig<<shiftCount ) != savedASig ) {
3395 float_set_inexact();
3396 }
3397 return z;
3398
3399 }
3400
3401 /*
3402 -------------------------------------------------------------------------------
3403 Returns the result of converting the extended double-precision floating-
3404 point value `a' to the 64-bit two's complement integer format. The
3405 conversion is performed according to the IEC/IEEE Standard for Binary
3406 Floating-Point Arithmetic---which means in particular that the conversion
3407 is rounded according to the current rounding mode. If `a' is a NaN,
3408 the largest positive integer is returned. Otherwise, if the conversion
3409 overflows, the largest integer with the same sign as `a' is returned.
3410 -------------------------------------------------------------------------------
3411 */
floatx80_to_int64(floatx80 a)3412 int64 floatx80_to_int64( floatx80 a )
3413 {
3414 flag aSign;
3415 int32 aExp, shiftCount;
3416 bits64 aSig, aSigExtra;
3417
3418 aSig = extractFloatx80Frac( a );
3419 aExp = extractFloatx80Exp( a );
3420 aSign = extractFloatx80Sign( a );
3421 shiftCount = 0x403E - aExp;
3422 if ( shiftCount <= 0 ) {
3423 if ( shiftCount ) {
3424 float_raise( float_flag_invalid );
3425 if ( ! aSign
3426 || ( ( aExp == 0x7FFF )
3427 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3428 ) {
3429 return LIT64( 0x7FFFFFFFFFFFFFFF );
3430 }
3431 return (sbits64) LIT64( 0x8000000000000000 );
3432 }
3433 aSigExtra = 0;
3434 }
3435 else {
3436 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3437 }
3438 return roundAndPackInt64( aSign, aSig, aSigExtra );
3439
3440 }
3441
3442 /*
3443 -------------------------------------------------------------------------------
3444 Returns the result of converting the extended double-precision floating-
3445 point value `a' to the 64-bit two's complement integer format. The
3446 conversion is performed according to the IEC/IEEE Standard for Binary
3447 Floating-Point Arithmetic, except that the conversion is always rounded
3448 toward zero. If `a' is a NaN, the largest positive integer is returned.
3449 Otherwise, if the conversion overflows, the largest integer with the same
3450 sign as `a' is returned.
3451 -------------------------------------------------------------------------------
3452 */
floatx80_to_int64_round_to_zero(floatx80 a)3453 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3454 {
3455 flag aSign;
3456 int32 aExp, shiftCount;
3457 bits64 aSig;
3458 int64 z;
3459
3460 aSig = extractFloatx80Frac( a );
3461 aExp = extractFloatx80Exp( a );
3462 aSign = extractFloatx80Sign( a );
3463 shiftCount = aExp - 0x403E;
3464 if ( 0 <= shiftCount ) {
3465 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3466 if ( ( a.high != 0xC03E ) || aSig ) {
3467 float_raise( float_flag_invalid );
3468 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3469 return LIT64( 0x7FFFFFFFFFFFFFFF );
3470 }
3471 }
3472 return (sbits64) LIT64( 0x8000000000000000 );
3473 }
3474 else if ( aExp < 0x3FFF ) {
3475 if ( aExp | aSig ) float_set_inexact();
3476 return 0;
3477 }
3478 z = aSig>>( - shiftCount );
3479 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3480 float_set_inexact();
3481 }
3482 if ( aSign ) z = - z;
3483 return z;
3484
3485 }
3486
3487 /*
3488 -------------------------------------------------------------------------------
3489 Returns the result of converting the extended double-precision floating-
3490 point value `a' to the single-precision floating-point format. The
3491 conversion is performed according to the IEC/IEEE Standard for Binary
3492 Floating-Point Arithmetic.
3493 -------------------------------------------------------------------------------
3494 */
floatx80_to_float32(floatx80 a)3495 float32 floatx80_to_float32( floatx80 a )
3496 {
3497 flag aSign;
3498 int32 aExp;
3499 bits64 aSig;
3500
3501 aSig = extractFloatx80Frac( a );
3502 aExp = extractFloatx80Exp( a );
3503 aSign = extractFloatx80Sign( a );
3504 if ( aExp == 0x7FFF ) {
3505 if ( (bits64) ( aSig<<1 ) ) {
3506 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3507 }
3508 return packFloat32( aSign, 0xFF, 0 );
3509 }
3510 shift64RightJamming( aSig, 33, &aSig );
3511 if ( aExp || aSig ) aExp -= 0x3F81;
3512 return roundAndPackFloat32( aSign, aExp, aSig );
3513
3514 }
3515
3516 /*
3517 -------------------------------------------------------------------------------
3518 Returns the result of converting the extended double-precision floating-
3519 point value `a' to the double-precision floating-point format. The
3520 conversion is performed according to the IEC/IEEE Standard for Binary
3521 Floating-Point Arithmetic.
3522 -------------------------------------------------------------------------------
3523 */
floatx80_to_float64(floatx80 a)3524 float64 floatx80_to_float64( floatx80 a )
3525 {
3526 flag aSign;
3527 int32 aExp;
3528 bits64 aSig, zSig;
3529
3530 aSig = extractFloatx80Frac( a );
3531 aExp = extractFloatx80Exp( a );
3532 aSign = extractFloatx80Sign( a );
3533 if ( aExp == 0x7FFF ) {
3534 if ( (bits64) ( aSig<<1 ) ) {
3535 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3536 }
3537 return packFloat64( aSign, 0x7FF, 0 );
3538 }
3539 shift64RightJamming( aSig, 1, &zSig );
3540 if ( aExp || aSig ) aExp -= 0x3C01;
3541 return roundAndPackFloat64( aSign, aExp, zSig );
3542
3543 }
3544
3545 #ifdef FLOAT128
3546
3547 /*
3548 -------------------------------------------------------------------------------
3549 Returns the result of converting the extended double-precision floating-
3550 point value `a' to the quadruple-precision floating-point format. The
3551 conversion is performed according to the IEC/IEEE Standard for Binary
3552 Floating-Point Arithmetic.
3553 -------------------------------------------------------------------------------
3554 */
floatx80_to_float128(floatx80 a)3555 float128 floatx80_to_float128( floatx80 a )
3556 {
3557 flag aSign;
3558 int16 aExp;
3559 bits64 aSig, zSig0, zSig1;
3560
3561 aSig = extractFloatx80Frac( a );
3562 aExp = extractFloatx80Exp( a );
3563 aSign = extractFloatx80Sign( a );
3564 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3565 return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3566 }
3567 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3568 return packFloat128( aSign, aExp, zSig0, zSig1 );
3569
3570 }
3571
3572 #endif
3573
3574 /*
3575 -------------------------------------------------------------------------------
3576 Rounds the extended double-precision floating-point value `a' to an integer,
3577 and returns the result as an extended quadruple-precision floating-point
3578 value. The operation is performed according to the IEC/IEEE Standard for
3579 Binary Floating-Point Arithmetic.
3580 -------------------------------------------------------------------------------
3581 */
floatx80_round_to_int(floatx80 a)3582 floatx80 floatx80_round_to_int( floatx80 a )
3583 {
3584 flag aSign;
3585 int32 aExp;
3586 bits64 lastBitMask, roundBitsMask;
3587 int8 roundingMode;
3588 floatx80 z;
3589
3590 aExp = extractFloatx80Exp( a );
3591 if ( 0x403E <= aExp ) {
3592 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3593 return propagateFloatx80NaN( a, a );
3594 }
3595 return a;
3596 }
3597 if ( aExp < 0x3FFF ) {
3598 if ( ( aExp == 0 )
3599 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3600 return a;
3601 }
3602 float_set_inexact();
3603 aSign = extractFloatx80Sign( a );
3604 switch ( float_rounding_mode() ) {
3605 case float_round_nearest_even:
3606 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3607 ) {
3608 return
3609 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3610 }
3611 break;
3612 case float_round_down:
3613 return
3614 aSign ?
3615 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3616 : packFloatx80( 0, 0, 0 );
3617 case float_round_up:
3618 return
3619 aSign ? packFloatx80( 1, 0, 0 )
3620 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3621 }
3622 return packFloatx80( aSign, 0, 0 );
3623 }
3624 lastBitMask = 1;
3625 lastBitMask <<= 0x403E - aExp;
3626 roundBitsMask = lastBitMask - 1;
3627 z = a;
3628 roundingMode = float_rounding_mode();
3629 if ( roundingMode == float_round_nearest_even ) {
3630 z.low += lastBitMask>>1;
3631 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3632 }
3633 else if ( roundingMode != float_round_to_zero ) {
3634 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3635 z.low += roundBitsMask;
3636 }
3637 }
3638 z.low &= ~ roundBitsMask;
3639 if ( z.low == 0 ) {
3640 ++z.high;
3641 z.low = LIT64( 0x8000000000000000 );
3642 }
3643 if ( z.low != a.low ) float_set_inexact();
3644 return z;
3645
3646 }
3647
3648 /*
3649 -------------------------------------------------------------------------------
3650 Returns the result of adding the absolute values of the extended double-
3651 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3652 negated before being returned. `zSign' is ignored if the result is a NaN.
3653 The addition is performed according to the IEC/IEEE Standard for Binary
3654 Floating-Point Arithmetic.
3655 -------------------------------------------------------------------------------
3656 */
addFloatx80Sigs(floatx80 a,floatx80 b,flag zSign)3657 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3658 {
3659 int32 aExp, bExp, zExp;
3660 bits64 aSig, bSig, zSig0, zSig1;
3661 int32 expDiff;
3662
3663 aSig = extractFloatx80Frac( a );
3664 aExp = extractFloatx80Exp( a );
3665 bSig = extractFloatx80Frac( b );
3666 bExp = extractFloatx80Exp( b );
3667 expDiff = aExp - bExp;
3668 if ( 0 < expDiff ) {
3669 if ( aExp == 0x7FFF ) {
3670 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3671 return a;
3672 }
3673 if ( bExp == 0 ) --expDiff;
3674 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3675 zExp = aExp;
3676 }
3677 else if ( expDiff < 0 ) {
3678 if ( bExp == 0x7FFF ) {
3679 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3680 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3681 }
3682 if ( aExp == 0 ) ++expDiff;
3683 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3684 zExp = bExp;
3685 }
3686 else {
3687 if ( aExp == 0x7FFF ) {
3688 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3689 return propagateFloatx80NaN( a, b );
3690 }
3691 return a;
3692 }
3693 zSig1 = 0;
3694 zSig0 = aSig + bSig;
3695 if ( aExp == 0 ) {
3696 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3697 goto roundAndPack;
3698 }
3699 zExp = aExp;
3700 goto shiftRight1;
3701 }
3702 zSig0 = aSig + bSig;
3703 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3704 shiftRight1:
3705 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3706 zSig0 |= LIT64( 0x8000000000000000 );
3707 ++zExp;
3708 roundAndPack:
3709 return
3710 roundAndPackFloatx80(
3711 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3712
3713 }
3714
3715 /*
3716 -------------------------------------------------------------------------------
3717 Returns the result of subtracting the absolute values of the extended
3718 double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3719 difference is negated before being returned. `zSign' is ignored if the
3720 result is a NaN. The subtraction is performed according to the IEC/IEEE
3721 Standard for Binary Floating-Point Arithmetic.
3722 -------------------------------------------------------------------------------
3723 */
subFloatx80Sigs(floatx80 a,floatx80 b,flag zSign)3724 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3725 {
3726 int32 aExp, bExp, zExp;
3727 bits64 aSig, bSig, zSig0, zSig1;
3728 int32 expDiff;
3729 floatx80 z;
3730
3731 aSig = extractFloatx80Frac( a );
3732 aExp = extractFloatx80Exp( a );
3733 bSig = extractFloatx80Frac( b );
3734 bExp = extractFloatx80Exp( b );
3735 expDiff = aExp - bExp;
3736 if ( 0 < expDiff ) goto aExpBigger;
3737 if ( expDiff < 0 ) goto bExpBigger;
3738 if ( aExp == 0x7FFF ) {
3739 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3740 return propagateFloatx80NaN( a, b );
3741 }
3742 float_raise( float_flag_invalid );
3743 z.low = floatx80_default_nan_low;
3744 z.high = floatx80_default_nan_high;
3745 return z;
3746 }
3747 if ( aExp == 0 ) {
3748 aExp = 1;
3749 bExp = 1;
3750 }
3751 zSig1 = 0;
3752 if ( bSig < aSig ) goto aBigger;
3753 if ( aSig < bSig ) goto bBigger;
3754 return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
3755 bExpBigger:
3756 if ( bExp == 0x7FFF ) {
3757 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3758 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3759 }
3760 if ( aExp == 0 ) ++expDiff;
3761 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3762 bBigger:
3763 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3764 zExp = bExp;
3765 zSign ^= 1;
3766 goto normalizeRoundAndPack;
3767 aExpBigger:
3768 if ( aExp == 0x7FFF ) {
3769 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3770 return a;
3771 }
3772 if ( bExp == 0 ) --expDiff;
3773 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3774 aBigger:
3775 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3776 zExp = aExp;
3777 normalizeRoundAndPack:
3778 return
3779 normalizeRoundAndPackFloatx80(
3780 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3781
3782 }
3783
3784 /*
3785 -------------------------------------------------------------------------------
3786 Returns the result of adding the extended double-precision floating-point
3787 values `a' and `b'. The operation is performed according to the IEC/IEEE
3788 Standard for Binary Floating-Point Arithmetic.
3789 -------------------------------------------------------------------------------
3790 */
floatx80_add(floatx80 a,floatx80 b)3791 floatx80 floatx80_add( floatx80 a, floatx80 b )
3792 {
3793 flag aSign, bSign;
3794
3795 aSign = extractFloatx80Sign( a );
3796 bSign = extractFloatx80Sign( b );
3797 if ( aSign == bSign ) {
3798 return addFloatx80Sigs( a, b, aSign );
3799 }
3800 else {
3801 return subFloatx80Sigs( a, b, aSign );
3802 }
3803
3804 }
3805
3806 /*
3807 -------------------------------------------------------------------------------
3808 Returns the result of subtracting the extended double-precision floating-
3809 point values `a' and `b'. The operation is performed according to the
3810 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3811 -------------------------------------------------------------------------------
3812 */
floatx80_sub(floatx80 a,floatx80 b)3813 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3814 {
3815 flag aSign, bSign;
3816
3817 aSign = extractFloatx80Sign( a );
3818 bSign = extractFloatx80Sign( b );
3819 if ( aSign == bSign ) {
3820 return subFloatx80Sigs( a, b, aSign );
3821 }
3822 else {
3823 return addFloatx80Sigs( a, b, aSign );
3824 }
3825
3826 }
3827
3828 /*
3829 -------------------------------------------------------------------------------
3830 Returns the result of multiplying the extended double-precision floating-
3831 point values `a' and `b'. The operation is performed according to the
3832 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3833 -------------------------------------------------------------------------------
3834 */
floatx80_mul(floatx80 a,floatx80 b)3835 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3836 {
3837 flag aSign, bSign, zSign;
3838 int32 aExp, bExp, zExp;
3839 bits64 aSig, bSig, zSig0, zSig1;
3840 floatx80 z;
3841
3842 aSig = extractFloatx80Frac( a );
3843 aExp = extractFloatx80Exp( a );
3844 aSign = extractFloatx80Sign( a );
3845 bSig = extractFloatx80Frac( b );
3846 bExp = extractFloatx80Exp( b );
3847 bSign = extractFloatx80Sign( b );
3848 zSign = aSign ^ bSign;
3849 if ( aExp == 0x7FFF ) {
3850 if ( (bits64) ( aSig<<1 )
3851 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3852 return propagateFloatx80NaN( a, b );
3853 }
3854 if ( ( bExp | bSig ) == 0 ) goto invalid;
3855 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3856 }
3857 if ( bExp == 0x7FFF ) {
3858 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3859 if ( ( aExp | aSig ) == 0 ) {
3860 invalid:
3861 float_raise( float_flag_invalid );
3862 z.low = floatx80_default_nan_low;
3863 z.high = floatx80_default_nan_high;
3864 return z;
3865 }
3866 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3867 }
3868 if ( aExp == 0 ) {
3869 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3870 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3871 }
3872 if ( bExp == 0 ) {
3873 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3874 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3875 }
3876 zExp = aExp + bExp - 0x3FFE;
3877 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3878 if ( 0 < (sbits64) zSig0 ) {
3879 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3880 --zExp;
3881 }
3882 return
3883 roundAndPackFloatx80(
3884 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3885
3886 }
3887
3888 /*
3889 -------------------------------------------------------------------------------
3890 Returns the result of dividing the extended double-precision floating-point
3891 value `a' by the corresponding value `b'. The operation is performed
3892 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3893 -------------------------------------------------------------------------------
3894 */
floatx80_div(floatx80 a,floatx80 b)3895 floatx80 floatx80_div( floatx80 a, floatx80 b )
3896 {
3897 flag aSign, bSign, zSign;
3898 int32 aExp, bExp, zExp;
3899 bits64 aSig, bSig, zSig0, zSig1;
3900 bits64 rem0, rem1, rem2, term0, term1, term2;
3901 floatx80 z;
3902
3903 aSig = extractFloatx80Frac( a );
3904 aExp = extractFloatx80Exp( a );
3905 aSign = extractFloatx80Sign( a );
3906 bSig = extractFloatx80Frac( b );
3907 bExp = extractFloatx80Exp( b );
3908 bSign = extractFloatx80Sign( b );
3909 zSign = aSign ^ bSign;
3910 if ( aExp == 0x7FFF ) {
3911 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3912 if ( bExp == 0x7FFF ) {
3913 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3914 goto invalid;
3915 }
3916 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3917 }
3918 if ( bExp == 0x7FFF ) {
3919 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3920 return packFloatx80( zSign, 0, 0 );
3921 }
3922 if ( bExp == 0 ) {
3923 if ( bSig == 0 ) {
3924 if ( ( aExp | aSig ) == 0 ) {
3925 invalid:
3926 float_raise( float_flag_invalid );
3927 z.low = floatx80_default_nan_low;
3928 z.high = floatx80_default_nan_high;
3929 return z;
3930 }
3931 float_raise( float_flag_divbyzero );
3932 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3933 }
3934 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3935 }
3936 if ( aExp == 0 ) {
3937 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3938 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3939 }
3940 zExp = aExp - bExp + 0x3FFE;
3941 rem1 = 0;
3942 if ( bSig <= aSig ) {
3943 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3944 ++zExp;
3945 }
3946 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3947 mul64To128( bSig, zSig0, &term0, &term1 );
3948 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3949 while ( (sbits64) rem0 < 0 ) {
3950 --zSig0;
3951 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3952 }
3953 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3954 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3955 mul64To128( bSig, zSig1, &term1, &term2 );
3956 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3957 while ( (sbits64) rem1 < 0 ) {
3958 --zSig1;
3959 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3960 }
3961 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3962 }
3963 return
3964 roundAndPackFloatx80(
3965 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3966
3967 }
3968
3969 /*
3970 -------------------------------------------------------------------------------
3971 Returns the remainder of the extended double-precision floating-point value
3972 `a' with respect to the corresponding value `b'. The operation is performed
3973 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3974 -------------------------------------------------------------------------------
3975 */
floatx80_rem(floatx80 a,floatx80 b)3976 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3977 {
3978 flag aSign, bSign, zSign;
3979 int32 aExp, bExp, expDiff;
3980 bits64 aSig0, aSig1, bSig;
3981 bits64 q, term0, term1, alternateASig0, alternateASig1;
3982 floatx80 z;
3983
3984 aSig0 = extractFloatx80Frac( a );
3985 aExp = extractFloatx80Exp( a );
3986 aSign = extractFloatx80Sign( a );
3987 bSig = extractFloatx80Frac( b );
3988 bExp = extractFloatx80Exp( b );
3989 bSign = extractFloatx80Sign( b );
3990 if ( aExp == 0x7FFF ) {
3991 if ( (bits64) ( aSig0<<1 )
3992 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3993 return propagateFloatx80NaN( a, b );
3994 }
3995 goto invalid;
3996 }
3997 if ( bExp == 0x7FFF ) {
3998 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3999 return a;
4000 }
4001 if ( bExp == 0 ) {
4002 if ( bSig == 0 ) {
4003 invalid:
4004 float_raise( float_flag_invalid );
4005 z.low = floatx80_default_nan_low;
4006 z.high = floatx80_default_nan_high;
4007 return z;
4008 }
4009 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4010 }
4011 if ( aExp == 0 ) {
4012 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4013 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4014 }
4015 bSig |= LIT64( 0x8000000000000000 );
4016 zSign = aSign;
4017 expDiff = aExp - bExp;
4018 aSig1 = 0;
4019 if ( expDiff < 0 ) {
4020 if ( expDiff < -1 ) return a;
4021 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4022 expDiff = 0;
4023 }
4024 q = ( bSig <= aSig0 );
4025 if ( q ) aSig0 -= bSig;
4026 expDiff -= 64;
4027 while ( 0 < expDiff ) {
4028 q = estimateDiv128To64( aSig0, aSig1, bSig );
4029 q = ( 2 < q ) ? q - 2 : 0;
4030 mul64To128( bSig, q, &term0, &term1 );
4031 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4032 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4033 expDiff -= 62;
4034 }
4035 expDiff += 64;
4036 if ( 0 < expDiff ) {
4037 q = estimateDiv128To64( aSig0, aSig1, bSig );
4038 q = ( 2 < q ) ? q - 2 : 0;
4039 q >>= 64 - expDiff;
4040 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4041 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4042 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4043 while ( le128( term0, term1, aSig0, aSig1 ) ) {
4044 ++q;
4045 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4046 }
4047 }
4048 else {
4049 term1 = 0;
4050 term0 = bSig;
4051 }
4052 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4053 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4054 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4055 && ( q & 1 ) )
4056 ) {
4057 aSig0 = alternateASig0;
4058 aSig1 = alternateASig1;
4059 zSign = ! zSign;
4060 }
4061 return
4062 normalizeRoundAndPackFloatx80(
4063 80, zSign, bExp + expDiff, aSig0, aSig1 );
4064
4065 }
4066
4067 /*
4068 -------------------------------------------------------------------------------
4069 Returns the square root of the extended double-precision floating-point
4070 value `a'. The operation is performed according to the IEC/IEEE Standard
4071 for Binary Floating-Point Arithmetic.
4072 -------------------------------------------------------------------------------
4073 */
floatx80_sqrt(floatx80 a)4074 floatx80 floatx80_sqrt( floatx80 a )
4075 {
4076 flag aSign;
4077 int32 aExp, zExp;
4078 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4079 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4080 floatx80 z;
4081
4082 aSig0 = extractFloatx80Frac( a );
4083 aExp = extractFloatx80Exp( a );
4084 aSign = extractFloatx80Sign( a );
4085 if ( aExp == 0x7FFF ) {
4086 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4087 if ( ! aSign ) return a;
4088 goto invalid;
4089 }
4090 if ( aSign ) {
4091 if ( ( aExp | aSig0 ) == 0 ) return a;
4092 invalid:
4093 float_raise( float_flag_invalid );
4094 z.low = floatx80_default_nan_low;
4095 z.high = floatx80_default_nan_high;
4096 return z;
4097 }
4098 if ( aExp == 0 ) {
4099 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4100 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4101 }
4102 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4103 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4104 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4105 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4106 doubleZSig0 = zSig0<<1;
4107 mul64To128( zSig0, zSig0, &term0, &term1 );
4108 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4109 while ( (sbits64) rem0 < 0 ) {
4110 --zSig0;
4111 doubleZSig0 -= 2;
4112 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4113 }
4114 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4115 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4116 if ( zSig1 == 0 ) zSig1 = 1;
4117 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4118 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4119 mul64To128( zSig1, zSig1, &term2, &term3 );
4120 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4121 while ( (sbits64) rem1 < 0 ) {
4122 --zSig1;
4123 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4124 term3 |= 1;
4125 term2 |= doubleZSig0;
4126 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4127 }
4128 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4129 }
4130 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4131 zSig0 |= doubleZSig0;
4132 return
4133 roundAndPackFloatx80(
4134 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4135
4136 }
4137
4138 /*
4139 -------------------------------------------------------------------------------
4140 Returns 1 if the extended double-precision floating-point value `a' is
4141 equal to the corresponding value `b', and 0 otherwise. The comparison is
4142 performed according to the IEC/IEEE Standard for Binary Floating-Point
4143 Arithmetic.
4144 -------------------------------------------------------------------------------
4145 */
floatx80_eq(floatx80 a,floatx80 b)4146 flag floatx80_eq( floatx80 a, floatx80 b )
4147 {
4148
4149 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4150 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4151 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4152 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4153 ) {
4154 if ( floatx80_is_signaling_nan( a )
4155 || floatx80_is_signaling_nan( b ) ) {
4156 float_raise( float_flag_invalid );
4157 }
4158 return 0;
4159 }
4160 return
4161 ( a.low == b.low )
4162 && ( ( a.high == b.high )
4163 || ( ( a.low == 0 )
4164 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4165 );
4166
4167 }
4168
4169 /*
4170 -------------------------------------------------------------------------------
4171 Returns 1 if the extended double-precision floating-point value `a' is
4172 less than or equal to the corresponding value `b', and 0 otherwise. The
4173 comparison is performed according to the IEC/IEEE Standard for Binary
4174 Floating-Point Arithmetic.
4175 -------------------------------------------------------------------------------
4176 */
floatx80_le(floatx80 a,floatx80 b)4177 flag floatx80_le( floatx80 a, floatx80 b )
4178 {
4179 flag aSign, bSign;
4180
4181 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4182 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4183 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4184 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4185 ) {
4186 float_raise( float_flag_invalid );
4187 return 0;
4188 }
4189 aSign = extractFloatx80Sign( a );
4190 bSign = extractFloatx80Sign( b );
4191 if ( aSign != bSign ) {
4192 return
4193 aSign
4194 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4195 == 0 );
4196 }
4197 return
4198 aSign ? le128( b.high, b.low, a.high, a.low )
4199 : le128( a.high, a.low, b.high, b.low );
4200
4201 }
4202
4203 /*
4204 -------------------------------------------------------------------------------
4205 Returns 1 if the extended double-precision floating-point value `a' is
4206 less than the corresponding value `b', and 0 otherwise. The comparison
4207 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4208 Arithmetic.
4209 -------------------------------------------------------------------------------
4210 */
floatx80_lt(floatx80 a,floatx80 b)4211 flag floatx80_lt( floatx80 a, floatx80 b )
4212 {
4213 flag aSign, bSign;
4214
4215 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4216 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4217 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4218 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4219 ) {
4220 float_raise( float_flag_invalid );
4221 return 0;
4222 }
4223 aSign = extractFloatx80Sign( a );
4224 bSign = extractFloatx80Sign( b );
4225 if ( aSign != bSign ) {
4226 return
4227 aSign
4228 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4229 != 0 );
4230 }
4231 return
4232 aSign ? lt128( b.high, b.low, a.high, a.low )
4233 : lt128( a.high, a.low, b.high, b.low );
4234
4235 }
4236
4237 /*
4238 -------------------------------------------------------------------------------
4239 Returns 1 if the extended double-precision floating-point value `a' is equal
4240 to the corresponding value `b', and 0 otherwise. The invalid exception is
4241 raised if either operand is a NaN. Otherwise, the comparison is performed
4242 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4243 -------------------------------------------------------------------------------
4244 */
floatx80_eq_signaling(floatx80 a,floatx80 b)4245 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4246 {
4247
4248 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4249 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4250 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4251 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4252 ) {
4253 float_raise( float_flag_invalid );
4254 return 0;
4255 }
4256 return
4257 ( a.low == b.low )
4258 && ( ( a.high == b.high )
4259 || ( ( a.low == 0 )
4260 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4261 );
4262
4263 }
4264
4265 /*
4266 -------------------------------------------------------------------------------
4267 Returns 1 if the extended double-precision floating-point value `a' is less
4268 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4269 do not cause an exception. Otherwise, the comparison is performed according
4270 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4271 -------------------------------------------------------------------------------
4272 */
floatx80_le_quiet(floatx80 a,floatx80 b)4273 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4274 {
4275 flag aSign, bSign;
4276
4277 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4278 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4279 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4280 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4281 ) {
4282 if ( floatx80_is_signaling_nan( a )
4283 || floatx80_is_signaling_nan( b ) ) {
4284 float_raise( float_flag_invalid );
4285 }
4286 return 0;
4287 }
4288 aSign = extractFloatx80Sign( a );
4289 bSign = extractFloatx80Sign( b );
4290 if ( aSign != bSign ) {
4291 return
4292 aSign
4293 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4294 == 0 );
4295 }
4296 return
4297 aSign ? le128( b.high, b.low, a.high, a.low )
4298 : le128( a.high, a.low, b.high, b.low );
4299
4300 }
4301
4302 /*
4303 -------------------------------------------------------------------------------
4304 Returns 1 if the extended double-precision floating-point value `a' is less
4305 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4306 an exception. Otherwise, the comparison is performed according to the
4307 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4308 -------------------------------------------------------------------------------
4309 */
floatx80_lt_quiet(floatx80 a,floatx80 b)4310 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4311 {
4312 flag aSign, bSign;
4313
4314 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4315 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4316 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4317 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4318 ) {
4319 if ( floatx80_is_signaling_nan( a )
4320 || floatx80_is_signaling_nan( b ) ) {
4321 float_raise( float_flag_invalid );
4322 }
4323 return 0;
4324 }
4325 aSign = extractFloatx80Sign( a );
4326 bSign = extractFloatx80Sign( b );
4327 if ( aSign != bSign ) {
4328 return
4329 aSign
4330 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4331 != 0 );
4332 }
4333 return
4334 aSign ? lt128( b.high, b.low, a.high, a.low )
4335 : lt128( a.high, a.low, b.high, b.low );
4336
4337 }
4338
4339 #endif
4340
4341 #ifdef FLOAT128
4342
4343 /*
4344 -------------------------------------------------------------------------------
4345 Returns the result of converting the quadruple-precision floating-point
4346 value `a' to the 32-bit two's complement integer format. The conversion
4347 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4348 Arithmetic---which means in particular that the conversion is rounded
4349 according to the current rounding mode. If `a' is a NaN, the largest
4350 positive integer is returned. Otherwise, if the conversion overflows, the
4351 largest integer with the same sign as `a' is returned.
4352 -------------------------------------------------------------------------------
4353 */
float128_to_int32(float128 a)4354 int32 float128_to_int32( float128 a )
4355 {
4356 flag aSign;
4357 int32 aExp, shiftCount;
4358 bits64 aSig0, aSig1;
4359
4360 aSig1 = extractFloat128Frac1( a );
4361 aSig0 = extractFloat128Frac0( a );
4362 aExp = extractFloat128Exp( a );
4363 aSign = extractFloat128Sign( a );
4364 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4365 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4366 aSig0 |= ( aSig1 != 0 );
4367 shiftCount = 0x4028 - aExp;
4368 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4369 return roundAndPackInt32( aSign, aSig0 );
4370
4371 }
4372
4373 /*
4374 -------------------------------------------------------------------------------
4375 Returns the result of converting the quadruple-precision floating-point
4376 value `a' to the 32-bit two's complement integer format. The conversion
4377 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4378 Arithmetic, except that the conversion is always rounded toward zero. If
4379 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4380 conversion overflows, the largest integer with the same sign as `a' is
4381 returned.
4382 -------------------------------------------------------------------------------
4383 */
float128_to_int32_round_to_zero(float128 a)4384 int32 float128_to_int32_round_to_zero( float128 a )
4385 {
4386 flag aSign;
4387 int32 aExp, shiftCount;
4388 bits64 aSig0, aSig1, savedASig;
4389 int32 z;
4390
4391 aSig1 = extractFloat128Frac1( a );
4392 aSig0 = extractFloat128Frac0( a );
4393 aExp = extractFloat128Exp( a );
4394 aSign = extractFloat128Sign( a );
4395 aSig0 |= ( aSig1 != 0 );
4396 if ( 0x401E < aExp ) {
4397 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4398 goto invalid;
4399 }
4400 else if ( aExp < 0x3FFF ) {
4401 if ( aExp || aSig0 ) float_set_inexact();
4402 return 0;
4403 }
4404 aSig0 |= LIT64( 0x0001000000000000 );
4405 shiftCount = 0x402F - aExp;
4406 savedASig = aSig0;
4407 aSig0 >>= shiftCount;
4408 z = aSig0;
4409 if ( aSign ) z = - z;
4410 if ( ( z < 0 ) ^ aSign ) {
4411 invalid:
4412 float_raise( float_flag_invalid );
4413 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4414 }
4415 if ( ( aSig0<<shiftCount ) != savedASig ) {
4416 float_set_inexact();
4417 }
4418 return z;
4419
4420 }
4421
4422 /*
4423 -------------------------------------------------------------------------------
4424 Returns the result of converting the quadruple-precision floating-point
4425 value `a' to the 64-bit two's complement integer format. The conversion
4426 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4427 Arithmetic---which means in particular that the conversion is rounded
4428 according to the current rounding mode. If `a' is a NaN, the largest
4429 positive integer is returned. Otherwise, if the conversion overflows, the
4430 largest integer with the same sign as `a' is returned.
4431 -------------------------------------------------------------------------------
4432 */
float128_to_int64(float128 a)4433 int64 float128_to_int64( float128 a )
4434 {
4435 flag aSign;
4436 int32 aExp, shiftCount;
4437 bits64 aSig0, aSig1;
4438
4439 aSig1 = extractFloat128Frac1( a );
4440 aSig0 = extractFloat128Frac0( a );
4441 aExp = extractFloat128Exp( a );
4442 aSign = extractFloat128Sign( a );
4443 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4444 shiftCount = 0x402F - aExp;
4445 if ( shiftCount <= 0 ) {
4446 if ( 0x403E < aExp ) {
4447 float_raise( float_flag_invalid );
4448 if ( ! aSign
4449 || ( ( aExp == 0x7FFF )
4450 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4451 )
4452 ) {
4453 return LIT64( 0x7FFFFFFFFFFFFFFF );
4454 }
4455 return (sbits64) LIT64( 0x8000000000000000 );
4456 }
4457 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4458 }
4459 else {
4460 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4461 }
4462 return roundAndPackInt64( aSign, aSig0, aSig1 );
4463
4464 }
4465
4466 /*
4467 -------------------------------------------------------------------------------
4468 Returns the result of converting the quadruple-precision floating-point
4469 value `a' to the 64-bit two's complement integer format. The conversion
4470 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4471 Arithmetic, except that the conversion is always rounded toward zero.
4472 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4473 the conversion overflows, the largest integer with the same sign as `a' is
4474 returned.
4475 -------------------------------------------------------------------------------
4476 */
float128_to_int64_round_to_zero(float128 a)4477 int64 float128_to_int64_round_to_zero( float128 a )
4478 {
4479 flag aSign;
4480 int32 aExp, shiftCount;
4481 bits64 aSig0, aSig1;
4482 int64 z;
4483
4484 aSig1 = extractFloat128Frac1( a );
4485 aSig0 = extractFloat128Frac0( a );
4486 aExp = extractFloat128Exp( a );
4487 aSign = extractFloat128Sign( a );
4488 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4489 shiftCount = aExp - 0x402F;
4490 if ( 0 < shiftCount ) {
4491 if ( 0x403E <= aExp ) {
4492 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4493 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4494 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4495 if ( aSig1 ) float_set_inexact();
4496 }
4497 else {
4498 float_raise( float_flag_invalid );
4499 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4500 return LIT64( 0x7FFFFFFFFFFFFFFF );
4501 }
4502 }
4503 return (sbits64) LIT64( 0x8000000000000000 );
4504 }
4505 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4506 if ( (bits64) ( aSig1<<shiftCount ) ) {
4507 float_set_inexact();
4508 }
4509 }
4510 else {
4511 if ( aExp < 0x3FFF ) {
4512 if ( aExp | aSig0 | aSig1 ) {
4513 float_set_inexact();
4514 }
4515 return 0;
4516 }
4517 z = aSig0>>( - shiftCount );
4518 if ( aSig1
4519 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4520 float_set_inexact();
4521 }
4522 }
4523 if ( aSign ) z = - z;
4524 return z;
4525
4526 }
4527
4528 /*
4529 -------------------------------------------------------------------------------
4530 Returns the result of converting the quadruple-precision floating-point
4531 value `a' to the single-precision floating-point format. The conversion
4532 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4533 Arithmetic.
4534 -------------------------------------------------------------------------------
4535 */
float128_to_float32(float128 a)4536 float32 float128_to_float32( float128 a )
4537 {
4538 flag aSign;
4539 int32 aExp;
4540 bits64 aSig0, aSig1;
4541 bits32 zSig;
4542
4543 aSig1 = extractFloat128Frac1( a );
4544 aSig0 = extractFloat128Frac0( a );
4545 aExp = extractFloat128Exp( a );
4546 aSign = extractFloat128Sign( a );
4547 if ( aExp == 0x7FFF ) {
4548 if ( aSig0 | aSig1 ) {
4549 return commonNaNToFloat32( float128ToCommonNaN( a ) );
4550 }
4551 return packFloat32( aSign, 0xFF, 0 );
4552 }
4553 aSig0 |= ( aSig1 != 0 );
4554 shift64RightJamming( aSig0, 18, &aSig0 );
4555 zSig = aSig0;
4556 if ( aExp || zSig ) {
4557 zSig |= 0x40000000;
4558 aExp -= 0x3F81;
4559 }
4560 return roundAndPackFloat32( aSign, aExp, zSig );
4561
4562 }
4563
4564 /*
4565 -------------------------------------------------------------------------------
4566 Returns the result of converting the quadruple-precision floating-point
4567 value `a' to the double-precision floating-point format. The conversion
4568 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4569 Arithmetic.
4570 -------------------------------------------------------------------------------
4571 */
float128_to_float64(float128 a)4572 float64 float128_to_float64( float128 a )
4573 {
4574 flag aSign;
4575 int32 aExp;
4576 bits64 aSig0, aSig1;
4577
4578 aSig1 = extractFloat128Frac1( a );
4579 aSig0 = extractFloat128Frac0( a );
4580 aExp = extractFloat128Exp( a );
4581 aSign = extractFloat128Sign( a );
4582 if ( aExp == 0x7FFF ) {
4583 if ( aSig0 | aSig1 ) {
4584 return commonNaNToFloat64( float128ToCommonNaN( a ) );
4585 }
4586 return packFloat64( aSign, 0x7FF, 0 );
4587 }
4588 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4589 aSig0 |= ( aSig1 != 0 );
4590 if ( aExp || aSig0 ) {
4591 aSig0 |= LIT64( 0x4000000000000000 );
4592 aExp -= 0x3C01;
4593 }
4594 return roundAndPackFloat64( aSign, aExp, aSig0 );
4595
4596 }
4597
4598 #ifdef FLOATX80
4599
4600 /*
4601 -------------------------------------------------------------------------------
4602 Returns the result of converting the quadruple-precision floating-point
4603 value `a' to the extended double-precision floating-point format. The
4604 conversion is performed according to the IEC/IEEE Standard for Binary
4605 Floating-Point Arithmetic.
4606 -------------------------------------------------------------------------------
4607 */
float128_to_floatx80(float128 a)4608 floatx80 float128_to_floatx80( float128 a )
4609 {
4610 flag aSign;
4611 int32 aExp;
4612 bits64 aSig0, aSig1;
4613
4614 aSig1 = extractFloat128Frac1( a );
4615 aSig0 = extractFloat128Frac0( a );
4616 aExp = extractFloat128Exp( a );
4617 aSign = extractFloat128Sign( a );
4618 if ( aExp == 0x7FFF ) {
4619 if ( aSig0 | aSig1 ) {
4620 return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4621 }
4622 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4623 }
4624 if ( aExp == 0 ) {
4625 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4626 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4627 }
4628 else {
4629 aSig0 |= LIT64( 0x0001000000000000 );
4630 }
4631 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4632 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4633
4634 }
4635
4636 #endif
4637
4638 /*
4639 -------------------------------------------------------------------------------
4640 Rounds the quadruple-precision floating-point value `a' to an integer, and
4641 returns the result as a quadruple-precision floating-point value. The
4642 operation is performed according to the IEC/IEEE Standard for Binary
4643 Floating-Point Arithmetic.
4644 -------------------------------------------------------------------------------
4645 */
float128_round_to_int(float128 a)4646 float128 float128_round_to_int( float128 a )
4647 {
4648 flag aSign;
4649 int32 aExp;
4650 bits64 lastBitMask, roundBitsMask;
4651 int8 roundingMode;
4652 float128 z;
4653
4654 aExp = extractFloat128Exp( a );
4655 if ( 0x402F <= aExp ) {
4656 if ( 0x406F <= aExp ) {
4657 if ( ( aExp == 0x7FFF )
4658 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4659 ) {
4660 return propagateFloat128NaN( a, a );
4661 }
4662 return a;
4663 }
4664 lastBitMask = 1;
4665 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4666 roundBitsMask = lastBitMask - 1;
4667 z = a;
4668 roundingMode = float_rounding_mode();
4669 if ( roundingMode == float_round_nearest_even ) {
4670 if ( lastBitMask ) {
4671 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4672 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4673 }
4674 else {
4675 if ( (sbits64) z.low < 0 ) {
4676 ++z.high;
4677 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4678 }
4679 }
4680 }
4681 else if ( roundingMode != float_round_to_zero ) {
4682 if ( extractFloat128Sign( z )
4683 ^ ( roundingMode == float_round_up ) ) {
4684 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4685 }
4686 }
4687 z.low &= ~ roundBitsMask;
4688 }
4689 else {
4690 if ( aExp < 0x3FFF ) {
4691 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4692 float_set_inexact();
4693 aSign = extractFloat128Sign( a );
4694 switch ( float_rounding_mode() ) {
4695 case float_round_nearest_even:
4696 if ( ( aExp == 0x3FFE )
4697 && ( extractFloat128Frac0( a )
4698 | extractFloat128Frac1( a ) )
4699 ) {
4700 return packFloat128( aSign, 0x3FFF, 0, 0 );
4701 }
4702 break;
4703 case float_round_down:
4704 return
4705 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4706 : packFloat128( 0, 0, 0, 0 );
4707 case float_round_up:
4708 return
4709 aSign ? packFloat128( 1, 0, 0, 0 )
4710 : packFloat128( 0, 0x3FFF, 0, 0 );
4711 }
4712 return packFloat128( aSign, 0, 0, 0 );
4713 }
4714 lastBitMask = 1;
4715 lastBitMask <<= 0x402F - aExp;
4716 roundBitsMask = lastBitMask - 1;
4717 z.low = 0;
4718 z.high = a.high;
4719 roundingMode = float_rounding_mode();
4720 if ( roundingMode == float_round_nearest_even ) {
4721 z.high += lastBitMask>>1;
4722 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4723 z.high &= ~ lastBitMask;
4724 }
4725 }
4726 else if ( roundingMode != float_round_to_zero ) {
4727 if ( extractFloat128Sign( z )
4728 ^ ( roundingMode == float_round_up ) ) {
4729 z.high |= ( a.low != 0 );
4730 z.high += roundBitsMask;
4731 }
4732 }
4733 z.high &= ~ roundBitsMask;
4734 }
4735 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4736 float_set_inexact();
4737 }
4738 return z;
4739
4740 }
4741
4742 /*
4743 -------------------------------------------------------------------------------
4744 Returns the result of adding the absolute values of the quadruple-precision
4745 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4746 before being returned. `zSign' is ignored if the result is a NaN.
4747 The addition is performed according to the IEC/IEEE Standard for Binary
4748 Floating-Point Arithmetic.
4749 -------------------------------------------------------------------------------
4750 */
addFloat128Sigs(float128 a,float128 b,flag zSign)4751 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4752 {
4753 int32 aExp, bExp, zExp;
4754 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4755 int32 expDiff;
4756
4757 aSig1 = extractFloat128Frac1( a );
4758 aSig0 = extractFloat128Frac0( a );
4759 aExp = extractFloat128Exp( a );
4760 bSig1 = extractFloat128Frac1( b );
4761 bSig0 = extractFloat128Frac0( b );
4762 bExp = extractFloat128Exp( b );
4763 expDiff = aExp - bExp;
4764 if ( 0 < expDiff ) {
4765 if ( aExp == 0x7FFF ) {
4766 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4767 return a;
4768 }
4769 if ( bExp == 0 ) {
4770 --expDiff;
4771 }
4772 else {
4773 bSig0 |= LIT64( 0x0001000000000000 );
4774 }
4775 shift128ExtraRightJamming(
4776 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4777 zExp = aExp;
4778 }
4779 else if ( expDiff < 0 ) {
4780 if ( bExp == 0x7FFF ) {
4781 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4782 return packFloat128( zSign, 0x7FFF, 0, 0 );
4783 }
4784 if ( aExp == 0 ) {
4785 ++expDiff;
4786 }
4787 else {
4788 aSig0 |= LIT64( 0x0001000000000000 );
4789 }
4790 shift128ExtraRightJamming(
4791 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4792 zExp = bExp;
4793 }
4794 else {
4795 if ( aExp == 0x7FFF ) {
4796 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4797 return propagateFloat128NaN( a, b );
4798 }
4799 return a;
4800 }
4801 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4802 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4803 zSig2 = 0;
4804 zSig0 |= LIT64( 0x0002000000000000 );
4805 zExp = aExp;
4806 goto shiftRight1;
4807 }
4808 aSig0 |= LIT64( 0x0001000000000000 );
4809 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4810 --zExp;
4811 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4812 ++zExp;
4813 shiftRight1:
4814 shift128ExtraRightJamming(
4815 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4816 roundAndPack:
4817 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4818
4819 }
4820
4821 /*
4822 -------------------------------------------------------------------------------
4823 Returns the result of subtracting the absolute values of the quadruple-
4824 precision floating-point values `a' and `b'. If `zSign' is 1, the
4825 difference is negated before being returned. `zSign' is ignored if the
4826 result is a NaN. The subtraction is performed according to the IEC/IEEE
4827 Standard for Binary Floating-Point Arithmetic.
4828 -------------------------------------------------------------------------------
4829 */
subFloat128Sigs(float128 a,float128 b,flag zSign)4830 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4831 {
4832 int32 aExp, bExp, zExp;
4833 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4834 int32 expDiff;
4835 float128 z;
4836
4837 aSig1 = extractFloat128Frac1( a );
4838 aSig0 = extractFloat128Frac0( a );
4839 aExp = extractFloat128Exp( a );
4840 bSig1 = extractFloat128Frac1( b );
4841 bSig0 = extractFloat128Frac0( b );
4842 bExp = extractFloat128Exp( b );
4843 expDiff = aExp - bExp;
4844 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4845 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4846 if ( 0 < expDiff ) goto aExpBigger;
4847 if ( expDiff < 0 ) goto bExpBigger;
4848 if ( aExp == 0x7FFF ) {
4849 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4850 return propagateFloat128NaN( a, b );
4851 }
4852 float_raise( float_flag_invalid );
4853 z.low = float128_default_nan_low;
4854 z.high = float128_default_nan_high;
4855 return z;
4856 }
4857 if ( aExp == 0 ) {
4858 aExp = 1;
4859 bExp = 1;
4860 }
4861 if ( bSig0 < aSig0 ) goto aBigger;
4862 if ( aSig0 < bSig0 ) goto bBigger;
4863 if ( bSig1 < aSig1 ) goto aBigger;
4864 if ( aSig1 < bSig1 ) goto bBigger;
4865 return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 );
4866 bExpBigger:
4867 if ( bExp == 0x7FFF ) {
4868 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4869 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4870 }
4871 if ( aExp == 0 ) {
4872 ++expDiff;
4873 }
4874 else {
4875 aSig0 |= LIT64( 0x4000000000000000 );
4876 }
4877 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4878 bSig0 |= LIT64( 0x4000000000000000 );
4879 bBigger:
4880 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4881 zExp = bExp;
4882 zSign ^= 1;
4883 goto normalizeRoundAndPack;
4884 aExpBigger:
4885 if ( aExp == 0x7FFF ) {
4886 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4887 return a;
4888 }
4889 if ( bExp == 0 ) {
4890 --expDiff;
4891 }
4892 else {
4893 bSig0 |= LIT64( 0x4000000000000000 );
4894 }
4895 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4896 aSig0 |= LIT64( 0x4000000000000000 );
4897 aBigger:
4898 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4899 zExp = aExp;
4900 normalizeRoundAndPack:
4901 --zExp;
4902 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4903
4904 }
4905
4906 /*
4907 -------------------------------------------------------------------------------
4908 Returns the result of adding the quadruple-precision floating-point values
4909 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4910 for Binary Floating-Point Arithmetic.
4911 -------------------------------------------------------------------------------
4912 */
float128_add(float128 a,float128 b)4913 float128 float128_add( float128 a, float128 b )
4914 {
4915 flag aSign, bSign;
4916
4917 aSign = extractFloat128Sign( a );
4918 bSign = extractFloat128Sign( b );
4919 if ( aSign == bSign ) {
4920 return addFloat128Sigs( a, b, aSign );
4921 }
4922 else {
4923 return subFloat128Sigs( a, b, aSign );
4924 }
4925
4926 }
4927
4928 /*
4929 -------------------------------------------------------------------------------
4930 Returns the result of subtracting the quadruple-precision floating-point
4931 values `a' and `b'. The operation is performed according to the IEC/IEEE
4932 Standard for Binary Floating-Point Arithmetic.
4933 -------------------------------------------------------------------------------
4934 */
float128_sub(float128 a,float128 b)4935 float128 float128_sub( float128 a, float128 b )
4936 {
4937 flag aSign, bSign;
4938
4939 aSign = extractFloat128Sign( a );
4940 bSign = extractFloat128Sign( b );
4941 if ( aSign == bSign ) {
4942 return subFloat128Sigs( a, b, aSign );
4943 }
4944 else {
4945 return addFloat128Sigs( a, b, aSign );
4946 }
4947
4948 }
4949
4950 /*
4951 -------------------------------------------------------------------------------
4952 Returns the result of multiplying the quadruple-precision floating-point
4953 values `a' and `b'. The operation is performed according to the IEC/IEEE
4954 Standard for Binary Floating-Point Arithmetic.
4955 -------------------------------------------------------------------------------
4956 */
float128_mul(float128 a,float128 b)4957 float128 float128_mul( float128 a, float128 b )
4958 {
4959 flag aSign, bSign, zSign;
4960 int32 aExp, bExp, zExp;
4961 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4962 float128 z;
4963
4964 aSig1 = extractFloat128Frac1( a );
4965 aSig0 = extractFloat128Frac0( a );
4966 aExp = extractFloat128Exp( a );
4967 aSign = extractFloat128Sign( a );
4968 bSig1 = extractFloat128Frac1( b );
4969 bSig0 = extractFloat128Frac0( b );
4970 bExp = extractFloat128Exp( b );
4971 bSign = extractFloat128Sign( b );
4972 zSign = aSign ^ bSign;
4973 if ( aExp == 0x7FFF ) {
4974 if ( ( aSig0 | aSig1 )
4975 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4976 return propagateFloat128NaN( a, b );
4977 }
4978 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4979 return packFloat128( zSign, 0x7FFF, 0, 0 );
4980 }
4981 if ( bExp == 0x7FFF ) {
4982 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4983 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4984 invalid:
4985 float_raise( float_flag_invalid );
4986 z.low = float128_default_nan_low;
4987 z.high = float128_default_nan_high;
4988 return z;
4989 }
4990 return packFloat128( zSign, 0x7FFF, 0, 0 );
4991 }
4992 if ( aExp == 0 ) {
4993 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4994 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4995 }
4996 if ( bExp == 0 ) {
4997 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4998 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4999 }
5000 zExp = aExp + bExp - 0x4000;
5001 aSig0 |= LIT64( 0x0001000000000000 );
5002 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5003 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5004 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5005 zSig2 |= ( zSig3 != 0 );
5006 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5007 shift128ExtraRightJamming(
5008 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5009 ++zExp;
5010 }
5011 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5012
5013 }
5014
5015 /*
5016 -------------------------------------------------------------------------------
5017 Returns the result of dividing the quadruple-precision floating-point value
5018 `a' by the corresponding value `b'. The operation is performed according to
5019 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5020 -------------------------------------------------------------------------------
5021 */
float128_div(float128 a,float128 b)5022 float128 float128_div( float128 a, float128 b )
5023 {
5024 flag aSign, bSign, zSign;
5025 int32 aExp, bExp, zExp;
5026 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5027 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5028 float128 z;
5029
5030 aSig1 = extractFloat128Frac1( a );
5031 aSig0 = extractFloat128Frac0( a );
5032 aExp = extractFloat128Exp( a );
5033 aSign = extractFloat128Sign( a );
5034 bSig1 = extractFloat128Frac1( b );
5035 bSig0 = extractFloat128Frac0( b );
5036 bExp = extractFloat128Exp( b );
5037 bSign = extractFloat128Sign( b );
5038 zSign = aSign ^ bSign;
5039 if ( aExp == 0x7FFF ) {
5040 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5041 if ( bExp == 0x7FFF ) {
5042 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5043 goto invalid;
5044 }
5045 return packFloat128( zSign, 0x7FFF, 0, 0 );
5046 }
5047 if ( bExp == 0x7FFF ) {
5048 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5049 return packFloat128( zSign, 0, 0, 0 );
5050 }
5051 if ( bExp == 0 ) {
5052 if ( ( bSig0 | bSig1 ) == 0 ) {
5053 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5054 invalid:
5055 float_raise( float_flag_invalid );
5056 z.low = float128_default_nan_low;
5057 z.high = float128_default_nan_high;
5058 return z;
5059 }
5060 float_raise( float_flag_divbyzero );
5061 return packFloat128( zSign, 0x7FFF, 0, 0 );
5062 }
5063 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5064 }
5065 if ( aExp == 0 ) {
5066 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5067 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5068 }
5069 zExp = aExp - bExp + 0x3FFD;
5070 shortShift128Left(
5071 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5072 shortShift128Left(
5073 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5074 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5075 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5076 ++zExp;
5077 }
5078 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5079 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5080 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5081 while ( (sbits64) rem0 < 0 ) {
5082 --zSig0;
5083 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5084 }
5085 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5086 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5087 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5088 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5089 while ( (sbits64) rem1 < 0 ) {
5090 --zSig1;
5091 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5092 }
5093 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5094 }
5095 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5096 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5097
5098 }
5099
5100 /*
5101 -------------------------------------------------------------------------------
5102 Returns the remainder of the quadruple-precision floating-point value `a'
5103 with respect to the corresponding value `b'. The operation is performed
5104 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5105 -------------------------------------------------------------------------------
5106 */
float128_rem(float128 a,float128 b)5107 float128 float128_rem( float128 a, float128 b )
5108 {
5109 flag aSign, bSign, zSign;
5110 int32 aExp, bExp, expDiff;
5111 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5112 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5113 sbits64 sigMean0;
5114 float128 z;
5115
5116 aSig1 = extractFloat128Frac1( a );
5117 aSig0 = extractFloat128Frac0( a );
5118 aExp = extractFloat128Exp( a );
5119 aSign = extractFloat128Sign( a );
5120 bSig1 = extractFloat128Frac1( b );
5121 bSig0 = extractFloat128Frac0( b );
5122 bExp = extractFloat128Exp( b );
5123 bSign = extractFloat128Sign( b );
5124 if ( aExp == 0x7FFF ) {
5125 if ( ( aSig0 | aSig1 )
5126 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5127 return propagateFloat128NaN( a, b );
5128 }
5129 goto invalid;
5130 }
5131 if ( bExp == 0x7FFF ) {
5132 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5133 return a;
5134 }
5135 if ( bExp == 0 ) {
5136 if ( ( bSig0 | bSig1 ) == 0 ) {
5137 invalid:
5138 float_raise( float_flag_invalid );
5139 z.low = float128_default_nan_low;
5140 z.high = float128_default_nan_high;
5141 return z;
5142 }
5143 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5144 }
5145 if ( aExp == 0 ) {
5146 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5147 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5148 }
5149 expDiff = aExp - bExp;
5150 if ( expDiff < -1 ) return a;
5151 shortShift128Left(
5152 aSig0 | LIT64( 0x0001000000000000 ),
5153 aSig1,
5154 15 - ( expDiff < 0 ),
5155 &aSig0,
5156 &aSig1
5157 );
5158 shortShift128Left(
5159 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5160 q = le128( bSig0, bSig1, aSig0, aSig1 );
5161 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5162 expDiff -= 64;
5163 while ( 0 < expDiff ) {
5164 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5165 q = ( 4 < q ) ? q - 4 : 0;
5166 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5167 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5168 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5169 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5170 expDiff -= 61;
5171 }
5172 if ( -64 < expDiff ) {
5173 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5174 q = ( 4 < q ) ? q - 4 : 0;
5175 q >>= - expDiff;
5176 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5177 expDiff += 52;
5178 if ( expDiff < 0 ) {
5179 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5180 }
5181 else {
5182 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5183 }
5184 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5185 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5186 }
5187 else {
5188 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5189 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5190 }
5191 do {
5192 alternateASig0 = aSig0;
5193 alternateASig1 = aSig1;
5194 ++q;
5195 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5196 } while ( 0 <= (sbits64) aSig0 );
5197 add128(
5198 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
5199 if ( ( sigMean0 < 0 )
5200 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5201 aSig0 = alternateASig0;
5202 aSig1 = alternateASig1;
5203 }
5204 zSign = ( (sbits64) aSig0 < 0 );
5205 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5206 return
5207 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5208
5209 }
5210
5211 /*
5212 -------------------------------------------------------------------------------
5213 Returns the square root of the quadruple-precision floating-point value `a'.
5214 The operation is performed according to the IEC/IEEE Standard for Binary
5215 Floating-Point Arithmetic.
5216 -------------------------------------------------------------------------------
5217 */
float128_sqrt(float128 a)5218 float128 float128_sqrt( float128 a )
5219 {
5220 flag aSign;
5221 int32 aExp, zExp;
5222 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5223 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5224 float128 z;
5225
5226 aSig1 = extractFloat128Frac1( a );
5227 aSig0 = extractFloat128Frac0( a );
5228 aExp = extractFloat128Exp( a );
5229 aSign = extractFloat128Sign( a );
5230 if ( aExp == 0x7FFF ) {
5231 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5232 if ( ! aSign ) return a;
5233 goto invalid;
5234 }
5235 if ( aSign ) {
5236 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5237 invalid:
5238 float_raise( float_flag_invalid );
5239 z.low = float128_default_nan_low;
5240 z.high = float128_default_nan_high;
5241 return z;
5242 }
5243 if ( aExp == 0 ) {
5244 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5245 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5246 }
5247 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5248 aSig0 |= LIT64( 0x0001000000000000 );
5249 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5250 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5251 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5252 doubleZSig0 = zSig0<<1;
5253 mul64To128( zSig0, zSig0, &term0, &term1 );
5254 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5255 while ( (sbits64) rem0 < 0 ) {
5256 --zSig0;
5257 doubleZSig0 -= 2;
5258 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5259 }
5260 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5261 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5262 if ( zSig1 == 0 ) zSig1 = 1;
5263 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5264 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5265 mul64To128( zSig1, zSig1, &term2, &term3 );
5266 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5267 while ( (sbits64) rem1 < 0 ) {
5268 --zSig1;
5269 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5270 term3 |= 1;
5271 term2 |= doubleZSig0;
5272 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5273 }
5274 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5275 }
5276 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5277 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5278
5279 }
5280
5281 /*
5282 -------------------------------------------------------------------------------
5283 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5284 the corresponding value `b', and 0 otherwise. The comparison is performed
5285 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5286 -------------------------------------------------------------------------------
5287 */
float128_eq(float128 a,float128 b)5288 flag float128_eq( float128 a, float128 b )
5289 {
5290
5291 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5292 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5293 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5294 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5295 ) {
5296 if ( float128_is_signaling_nan( a )
5297 || float128_is_signaling_nan( b ) ) {
5298 float_raise( float_flag_invalid );
5299 }
5300 return 0;
5301 }
5302 return
5303 ( a.low == b.low )
5304 && ( ( a.high == b.high )
5305 || ( ( a.low == 0 )
5306 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5307 );
5308
5309 }
5310
5311 /*
5312 -------------------------------------------------------------------------------
5313 Returns 1 if the quadruple-precision floating-point value `a' is less than
5314 or equal to the corresponding value `b', and 0 otherwise. The comparison
5315 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5316 Arithmetic.
5317 -------------------------------------------------------------------------------
5318 */
float128_le(float128 a,float128 b)5319 flag float128_le( float128 a, float128 b )
5320 {
5321 flag aSign, bSign;
5322
5323 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5324 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5325 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5326 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5327 ) {
5328 float_raise( float_flag_invalid );
5329 return 0;
5330 }
5331 aSign = extractFloat128Sign( a );
5332 bSign = extractFloat128Sign( b );
5333 if ( aSign != bSign ) {
5334 return
5335 aSign
5336 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5337 == 0 );
5338 }
5339 return
5340 aSign ? le128( b.high, b.low, a.high, a.low )
5341 : le128( a.high, a.low, b.high, b.low );
5342
5343 }
5344
5345 /*
5346 -------------------------------------------------------------------------------
5347 Returns 1 if the quadruple-precision floating-point value `a' is less than
5348 the corresponding value `b', and 0 otherwise. The comparison is performed
5349 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5350 -------------------------------------------------------------------------------
5351 */
float128_lt(float128 a,float128 b)5352 flag float128_lt( float128 a, float128 b )
5353 {
5354 flag aSign, bSign;
5355
5356 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5357 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5358 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5359 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5360 ) {
5361 float_raise( float_flag_invalid );
5362 return 0;
5363 }
5364 aSign = extractFloat128Sign( a );
5365 bSign = extractFloat128Sign( b );
5366 if ( aSign != bSign ) {
5367 return
5368 aSign
5369 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5370 != 0 );
5371 }
5372 return
5373 aSign ? lt128( b.high, b.low, a.high, a.low )
5374 : lt128( a.high, a.low, b.high, b.low );
5375
5376 }
5377
5378 /*
5379 -------------------------------------------------------------------------------
5380 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5381 the corresponding value `b', and 0 otherwise. The invalid exception is
5382 raised if either operand is a NaN. Otherwise, the comparison is performed
5383 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5384 -------------------------------------------------------------------------------
5385 */
float128_eq_signaling(float128 a,float128 b)5386 flag float128_eq_signaling( float128 a, float128 b )
5387 {
5388
5389 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5390 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5391 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5392 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5393 ) {
5394 float_raise( float_flag_invalid );
5395 return 0;
5396 }
5397 return
5398 ( a.low == b.low )
5399 && ( ( a.high == b.high )
5400 || ( ( a.low == 0 )
5401 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5402 );
5403
5404 }
5405
5406 /*
5407 -------------------------------------------------------------------------------
5408 Returns 1 if the quadruple-precision floating-point value `a' is less than
5409 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5410 cause an exception. Otherwise, the comparison is performed according to the
5411 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5412 -------------------------------------------------------------------------------
5413 */
float128_le_quiet(float128 a,float128 b)5414 flag float128_le_quiet( float128 a, float128 b )
5415 {
5416 flag aSign, bSign;
5417
5418 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5419 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5420 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5421 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5422 ) {
5423 if ( float128_is_signaling_nan( a )
5424 || float128_is_signaling_nan( b ) ) {
5425 float_raise( float_flag_invalid );
5426 }
5427 return 0;
5428 }
5429 aSign = extractFloat128Sign( a );
5430 bSign = extractFloat128Sign( b );
5431 if ( aSign != bSign ) {
5432 return
5433 aSign
5434 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5435 == 0 );
5436 }
5437 return
5438 aSign ? le128( b.high, b.low, a.high, a.low )
5439 : le128( a.high, a.low, b.high, b.low );
5440
5441 }
5442
5443 /*
5444 -------------------------------------------------------------------------------
5445 Returns 1 if the quadruple-precision floating-point value `a' is less than
5446 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5447 exception. Otherwise, the comparison is performed according to the IEC/IEEE
5448 Standard for Binary Floating-Point Arithmetic.
5449 -------------------------------------------------------------------------------
5450 */
float128_lt_quiet(float128 a,float128 b)5451 flag float128_lt_quiet( float128 a, float128 b )
5452 {
5453 flag aSign, bSign;
5454
5455 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5456 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5457 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5458 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5459 ) {
5460 if ( float128_is_signaling_nan( a )
5461 || float128_is_signaling_nan( b ) ) {
5462 float_raise( float_flag_invalid );
5463 }
5464 return 0;
5465 }
5466 aSign = extractFloat128Sign( a );
5467 bSign = extractFloat128Sign( b );
5468 if ( aSign != bSign ) {
5469 return
5470 aSign
5471 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5472 != 0 );
5473 }
5474 return
5475 aSign ? lt128( b.high, b.low, a.high, a.low )
5476 : lt128( a.high, a.low, b.high, b.low );
5477
5478 }
5479
5480 #endif
5481
5482
5483 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5484
5485 /*
5486 * These two routines are not part of the original softfloat distribution.
5487 *
5488 * They are based on the corresponding conversions to integer but return
5489 * unsigned numbers instead since these functions are required by GCC.
5490 *
5491 * Added by Mark Brinicombe <mark@netbsd.org> 27/09/97
5492 *
5493 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5494 */
5495
5496 /*
5497 -------------------------------------------------------------------------------
5498 Returns the result of converting the double-precision floating-point value
5499 `a' to the 32-bit unsigned integer format. The conversion is
5500 performed according to the IEC/IEEE Standard for Binary Floating-point
5501 Arithmetic, except that the conversion is always rounded toward zero. If
5502 `a' is a NaN, the largest positive integer is returned. If the conversion
5503 overflows, the largest integer positive is returned.
5504 -------------------------------------------------------------------------------
5505 */
float64_to_uint32_round_to_zero(float64 a)5506 uint32 float64_to_uint32_round_to_zero( float64 a )
5507 {
5508 flag aSign;
5509 int16 aExp, shiftCount;
5510 bits64 aSig, savedASig;
5511 uint32 z;
5512
5513 aSig = extractFloat64Frac( a );
5514 aExp = extractFloat64Exp( a );
5515 aSign = extractFloat64Sign( a );
5516
5517 if (aSign) {
5518 float_raise( float_flag_invalid );
5519 return(0);
5520 }
5521
5522 if ( 0x41E < aExp ) {
5523 float_raise( float_flag_invalid );
5524 return 0xffffffff;
5525 }
5526 else if ( aExp < 0x3FF ) {
5527 if ( aExp || aSig ) float_set_inexact();
5528 return 0;
5529 }
5530 aSig |= LIT64( 0x0010000000000000 );
5531 shiftCount = 0x433 - aExp;
5532 savedASig = aSig;
5533 aSig >>= shiftCount;
5534 z = aSig;
5535 if ( ( aSig<<shiftCount ) != savedASig ) {
5536 float_set_inexact();
5537 }
5538 return z;
5539
5540 }
5541
5542 /*
5543 -------------------------------------------------------------------------------
5544 Returns the result of converting the single-precision floating-point value
5545 `a' to the 32-bit unsigned integer format. The conversion is
5546 performed according to the IEC/IEEE Standard for Binary Floating-point
5547 Arithmetic, except that the conversion is always rounded toward zero. If
5548 `a' is a NaN, the largest positive integer is returned. If the conversion
5549 overflows, the largest positive integer is returned.
5550 -------------------------------------------------------------------------------
5551 */
float32_to_uint32_round_to_zero(float32 a)5552 uint32 float32_to_uint32_round_to_zero( float32 a )
5553 {
5554 flag aSign;
5555 int16 aExp, shiftCount;
5556 bits32 aSig;
5557 uint32 z;
5558
5559 aSig = extractFloat32Frac( a );
5560 aExp = extractFloat32Exp( a );
5561 aSign = extractFloat32Sign( a );
5562 shiftCount = aExp - 0x9E;
5563
5564 if (aSign) {
5565 float_raise( float_flag_invalid );
5566 return(0);
5567 }
5568 if ( 0 < shiftCount ) {
5569 float_raise( float_flag_invalid );
5570 return 0xFFFFFFFF;
5571 }
5572 else if ( aExp <= 0x7E ) {
5573 if ( aExp | aSig ) float_set_inexact();
5574 return 0;
5575 }
5576 aSig = ( aSig | 0x800000 )<<8;
5577 z = aSig>>( - shiftCount );
5578 if ( aSig<<( shiftCount & 31 ) ) {
5579 float_set_inexact();
5580 }
5581 return z;
5582
5583 }
5584
5585 #endif
5586