1
2 /*============================================================================
3
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5 Package, Release 2b.
6
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15 arithmetic/SoftFloat.html'.
16
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25
26 Derivative works are acceptable, even for commercial purposes, so long as
27 (1) the source code for the derivative work includes prominent notice that
28 the work is derivative, and (2) the source code includes prominent notice with
29 these four paragraphs for those parts of this code that are retained.
30
31 =============================================================================*/
32
33 #include "milieu.h"
34 #include "softfloat.h"
35
36 /*----------------------------------------------------------------------------
37 | Floating-point rounding mode, extended double-precision rounding precision,
38 | and exception flags.
39 *----------------------------------------------------------------------------*/
40 int8 float_exception_flags = 0;
41 #ifdef FLOATX80
42 int8 floatx80_rounding_precision = 80;
43 #endif
44
45 int8 float_rounding_mode = float_round_nearest_even;
46
47 /*----------------------------------------------------------------------------
48 | Functions and definitions to determine: (1) whether tininess for underflow
49 | is detected before or after rounding by default, (2) what (if anything)
50 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
51 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
52 | are propagated from function inputs to output. These details are target-
53 | specific.
54 *----------------------------------------------------------------------------*/
55 #include "softfloat-specialize"
56
57 /*----------------------------------------------------------------------------
58 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
59 | and 7, and returns the properly rounded 32-bit integer corresponding to the
60 | input. If `zSign' is 1, the input is negated before being converted to an
61 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
62 | is simply rounded to an integer, with the inexact exception raised if the
63 | input cannot be represented exactly as an integer. However, if the fixed-
64 | point input is too large, the invalid exception is raised and the largest
65 | positive or negative integer is returned.
66 *----------------------------------------------------------------------------*/
67
roundAndPackInt32(flag zSign,bits64 absZ)68 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
69 {
70 int8 roundingMode;
71 flag roundNearestEven;
72 int8 roundIncrement, roundBits;
73 int32 z;
74
75 roundingMode = float_rounding_mode;
76 roundNearestEven = ( roundingMode == float_round_nearest_even );
77 roundIncrement = 0x40;
78 if ( ! roundNearestEven ) {
79 if ( roundingMode == float_round_to_zero ) {
80 roundIncrement = 0;
81 }
82 else {
83 roundIncrement = 0x7F;
84 if ( zSign ) {
85 if ( roundingMode == float_round_up ) roundIncrement = 0;
86 }
87 else {
88 if ( roundingMode == float_round_down ) roundIncrement = 0;
89 }
90 }
91 }
92 roundBits = absZ & 0x7F;
93 absZ = ( absZ + roundIncrement )>>7;
94 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
95 z = absZ;
96 if ( zSign ) z = - z;
97 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
98 float_raise( float_flag_invalid );
99 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
100 }
101 if ( roundBits ) float_exception_flags |= float_flag_inexact;
102 return z;
103
104 }
105
106 /*----------------------------------------------------------------------------
107 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
108 | `absZ1', with binary point between bits 63 and 64 (between the input words),
109 | and returns the properly rounded 64-bit integer corresponding to the input.
110 | If `zSign' is 1, the input is negated before being converted to an integer.
111 | Ordinarily, the fixed-point input is simply rounded to an integer, with
112 | the inexact exception raised if the input cannot be represented exactly as
113 | an integer. However, if the fixed-point input is too large, the invalid
114 | exception is raised and the largest positive or negative integer is
115 | returned.
116 *----------------------------------------------------------------------------*/
117
roundAndPackInt64(flag zSign,bits64 absZ0,bits64 absZ1)118 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
119 {
120 int8 roundingMode;
121 flag roundNearestEven, increment;
122 int64 z;
123
124 roundingMode = float_rounding_mode;
125 roundNearestEven = ( roundingMode == float_round_nearest_even );
126 increment = ( (sbits64) absZ1 < 0 );
127 if ( ! roundNearestEven ) {
128 if ( roundingMode == float_round_to_zero ) {
129 increment = 0;
130 }
131 else {
132 if ( zSign ) {
133 increment = ( roundingMode == float_round_down ) && absZ1;
134 }
135 else {
136 increment = ( roundingMode == float_round_up ) && absZ1;
137 }
138 }
139 }
140 if ( increment ) {
141 ++absZ0;
142 if ( absZ0 == 0 ) goto overflow;
143 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
144 }
145 z = absZ0;
146 if ( zSign ) z = - z;
147 if ( z && ( ( z < 0 ) ^ zSign ) ) {
148 overflow:
149 float_raise( float_flag_invalid );
150 return
151 zSign ? (sbits64) LIT64( 0x8000000000000000 )
152 : LIT64( 0x7FFFFFFFFFFFFFFF );
153 }
154 if ( absZ1 ) float_exception_flags |= float_flag_inexact;
155 return z;
156
157 }
158
159 /*----------------------------------------------------------------------------
160 | Returns the fraction bits of the single-precision floating-point value `a'.
161 *----------------------------------------------------------------------------*/
162
extractFloat32Frac(float32 a)163 INLINE bits32 extractFloat32Frac( float32 a )
164 {
165 return a & 0x007FFFFF;
166
167 }
168
169 /*----------------------------------------------------------------------------
170 | Returns the exponent bits of the single-precision floating-point value `a'.
171 *----------------------------------------------------------------------------*/
172
extractFloat32Exp(float32 a)173 INLINE int16 extractFloat32Exp( float32 a )
174 {
175 return ( a>>23 ) & 0xFF;
176
177 }
178
179 /*----------------------------------------------------------------------------
180 | Returns the sign bit of the single-precision floating-point value `a'.
181 *----------------------------------------------------------------------------*/
182
extractFloat32Sign(float32 a)183 INLINE flag extractFloat32Sign( float32 a )
184 {
185 return a>>31;
186
187 }
188
189 /*----------------------------------------------------------------------------
190 | Normalizes the subnormal single-precision floating-point value represented
191 | by the denormalized significand `aSig'. The normalized exponent and
192 | significand are stored at the locations pointed to by `zExpPtr' and
193 | `zSigPtr', respectively.
194 *----------------------------------------------------------------------------*/
195
196 static void
normalizeFloat32Subnormal(bits32 aSig,int16 * zExpPtr,bits32 * zSigPtr)197 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
198 {
199 int8 shiftCount;
200
201 shiftCount = countLeadingZeros32( aSig ) - 8;
202 *zSigPtr = aSig<<shiftCount;
203 *zExpPtr = 1 - shiftCount;
204
205 }
206
207 /*----------------------------------------------------------------------------
208 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
209 | single-precision floating-point value, returning the result. After being
210 | shifted into the proper positions, the three fields are simply added
211 | together to form the result. This means that any integer portion of `zSig'
212 | will be added into the exponent. Since a properly normalized significand
213 | will have an integer portion equal to 1, the `zExp' input should be 1 less
214 | than the desired result exponent whenever `zSig' is a complete, normalized
215 | significand.
216 *----------------------------------------------------------------------------*/
217
packFloat32(flag zSign,int16 zExp,bits32 zSig)218 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
219 {
220 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
221
222 }
223
224 /*----------------------------------------------------------------------------
225 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
226 | and significand `zSig', and returns the proper single-precision floating-
227 | point value corresponding to the abstract input. Ordinarily, the abstract
228 | value is simply rounded and packed into the single-precision format, with
229 | the inexact exception raised if the abstract input cannot be represented
230 | exactly. However, if the abstract value is too large, the overflow and
231 | inexact exceptions are raised and an infinity or maximal finite value is
232 | returned. If the abstract value is too small, the input value is rounded to
233 | a subnormal number, and the underflow and inexact exceptions are raised if
234 | the abstract input cannot be represented exactly as a subnormal single-
235 | precision floating-point number.
236 | The input significand `zSig' has its binary point between bits 30
237 | and 29, which is 7 bits to the left of the usual location. This shifted
238 | significand must be normalized or smaller. If `zSig' is not normalized,
239 | `zExp' must be 0; in that case, the result returned is a subnormal number,
240 | and it must not require rounding. In the usual case that `zSig' is
241 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
242 | The handling of underflow and overflow follows the IEC/IEEE Standard for
243 | Binary Floating-Point Arithmetic.
244 *----------------------------------------------------------------------------*/
245
roundAndPackFloat32(flag zSign,int16 zExp,bits32 zSig)246 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
247 {
248 int8 roundingMode;
249 flag roundNearestEven;
250 int8 roundIncrement, roundBits;
251 flag isTiny;
252
253 roundingMode = float_rounding_mode;
254 roundNearestEven = ( roundingMode == float_round_nearest_even );
255 roundIncrement = 0x40;
256 if ( ! roundNearestEven ) {
257 if ( roundingMode == float_round_to_zero ) {
258 roundIncrement = 0;
259 }
260 else {
261 roundIncrement = 0x7F;
262 if ( zSign ) {
263 if ( roundingMode == float_round_up ) roundIncrement = 0;
264 }
265 else {
266 if ( roundingMode == float_round_down ) roundIncrement = 0;
267 }
268 }
269 }
270 roundBits = zSig & 0x7F;
271 if ( 0xFD <= (bits16) zExp ) {
272 if ( ( 0xFD < zExp )
273 || ( ( zExp == 0xFD )
274 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
275 ) {
276 float_raise( float_flag_overflow | float_flag_inexact );
277 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
278 }
279 if ( zExp < 0 ) {
280 isTiny =
281 ( float_detect_tininess == float_tininess_before_rounding )
282 || ( zExp < -1 )
283 || ( zSig + roundIncrement < 0x80000000 );
284 shift32RightJamming( zSig, - zExp, &zSig );
285 zExp = 0;
286 roundBits = zSig & 0x7F;
287 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
288 }
289 }
290 if ( roundBits ) float_exception_flags |= float_flag_inexact;
291 zSig = ( zSig + roundIncrement )>>7;
292 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
293 if ( zSig == 0 ) zExp = 0;
294 return packFloat32( zSign, zExp, zSig );
295
296 }
297
298 /*----------------------------------------------------------------------------
299 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
300 | and significand `zSig', and returns the proper single-precision floating-
301 | point value corresponding to the abstract input. This routine is just like
302 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
303 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
304 | floating-point exponent.
305 *----------------------------------------------------------------------------*/
306
307 static float32
normalizeRoundAndPackFloat32(flag zSign,int16 zExp,bits32 zSig)308 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
309 {
310 int8 shiftCount;
311
312 shiftCount = countLeadingZeros32( zSig ) - 1;
313 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
314
315 }
316
317 /*----------------------------------------------------------------------------
318 | Returns the fraction bits of the double-precision floating-point value `a'.
319 *----------------------------------------------------------------------------*/
320
extractFloat64Frac(float64 a)321 INLINE bits64 extractFloat64Frac( float64 a )
322 {
323 return a & LIT64( 0x000FFFFFFFFFFFFF );
324
325 }
326
327 /*----------------------------------------------------------------------------
328 | Returns the exponent bits of the double-precision floating-point value `a'.
329 *----------------------------------------------------------------------------*/
330
extractFloat64Exp(float64 a)331 INLINE int16 extractFloat64Exp( float64 a )
332 {
333 return ( a>>52 ) & 0x7FF;
334
335 }
336
337 /*----------------------------------------------------------------------------
338 | Returns the sign bit of the double-precision floating-point value `a'.
339 *----------------------------------------------------------------------------*/
340
extractFloat64Sign(float64 a)341 INLINE flag extractFloat64Sign( float64 a )
342 {
343 return a>>63;
344
345 }
346
347 /*----------------------------------------------------------------------------
348 | Normalizes the subnormal double-precision floating-point value represented
349 | by the denormalized significand `aSig'. The normalized exponent and
350 | significand are stored at the locations pointed to by `zExpPtr' and
351 | `zSigPtr', respectively.
352 *----------------------------------------------------------------------------*/
353
354 static void
normalizeFloat64Subnormal(bits64 aSig,int16 * zExpPtr,bits64 * zSigPtr)355 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
356 {
357 int8 shiftCount;
358
359 shiftCount = countLeadingZeros64( aSig ) - 11;
360 *zSigPtr = aSig<<shiftCount;
361 *zExpPtr = 1 - shiftCount;
362
363 }
364
365 /*----------------------------------------------------------------------------
366 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
367 | double-precision floating-point value, returning the result. After being
368 | shifted into the proper positions, the three fields are simply added
369 | together to form the result. This means that any integer portion of `zSig'
370 | will be added into the exponent. Since a properly normalized significand
371 | will have an integer portion equal to 1, the `zExp' input should be 1 less
372 | than the desired result exponent whenever `zSig' is a complete, normalized
373 | significand.
374 *----------------------------------------------------------------------------*/
375
packFloat64(flag zSign,int16 zExp,bits64 zSig)376 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
377 {
378 return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
379
380 }
381
382 /*----------------------------------------------------------------------------
383 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
384 | and significand `zSig', and returns the proper double-precision floating-
385 | point value corresponding to the abstract input. Ordinarily, the abstract
386 | value is simply rounded and packed into the double-precision format, with
387 | the inexact exception raised if the abstract input cannot be represented
388 | exactly. However, if the abstract value is too large, the overflow and
389 | inexact exceptions are raised and an infinity or maximal finite value is
390 | returned. If the abstract value is too small, the input value is rounded
391 | to a subnormal number, and the underflow and inexact exceptions are raised
392 | if the abstract input cannot be represented exactly as a subnormal double-
393 | precision floating-point number.
394 | The input significand `zSig' has its binary point between bits 62
395 | and 61, which is 10 bits to the left of the usual location. This shifted
396 | significand must be normalized or smaller. If `zSig' is not normalized,
397 | `zExp' must be 0; in that case, the result returned is a subnormal number,
398 | and it must not require rounding. In the usual case that `zSig' is
399 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
400 | The handling of underflow and overflow follows the IEC/IEEE Standard for
401 | Binary Floating-Point Arithmetic.
402 *----------------------------------------------------------------------------*/
403
roundAndPackFloat64(flag zSign,int16 zExp,bits64 zSig)404 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
405 {
406 int8 roundingMode;
407 flag roundNearestEven;
408 int16 roundIncrement, roundBits;
409 flag isTiny;
410
411 roundingMode = float_rounding_mode;
412 roundNearestEven = ( roundingMode == float_round_nearest_even );
413 roundIncrement = 0x200;
414 if ( ! roundNearestEven ) {
415 if ( roundingMode == float_round_to_zero ) {
416 roundIncrement = 0;
417 }
418 else {
419 roundIncrement = 0x3FF;
420 if ( zSign ) {
421 if ( roundingMode == float_round_up ) roundIncrement = 0;
422 }
423 else {
424 if ( roundingMode == float_round_down ) roundIncrement = 0;
425 }
426 }
427 }
428 roundBits = zSig & 0x3FF;
429 if ( 0x7FD <= (bits16) zExp ) {
430 if ( ( 0x7FD < zExp )
431 || ( ( zExp == 0x7FD )
432 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
433 ) {
434 float_raise( float_flag_overflow | float_flag_inexact );
435 return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
436 }
437 if ( zExp < 0 ) {
438 isTiny =
439 ( float_detect_tininess == float_tininess_before_rounding )
440 || ( zExp < -1 )
441 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
442 shift64RightJamming( zSig, - zExp, &zSig );
443 zExp = 0;
444 roundBits = zSig & 0x3FF;
445 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
446 }
447 }
448 if ( roundBits ) float_exception_flags |= float_flag_inexact;
449 zSig = ( zSig + roundIncrement )>>10;
450 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
451 if ( zSig == 0 ) zExp = 0;
452 return packFloat64( zSign, zExp, zSig );
453
454 }
455
456 /*----------------------------------------------------------------------------
457 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
458 | and significand `zSig', and returns the proper double-precision floating-
459 | point value corresponding to the abstract input. This routine is just like
460 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
461 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
462 | floating-point exponent.
463 *----------------------------------------------------------------------------*/
464
465 static float64
normalizeRoundAndPackFloat64(flag zSign,int16 zExp,bits64 zSig)466 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
467 {
468 int8 shiftCount;
469
470 shiftCount = countLeadingZeros64( zSig ) - 1;
471 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
472
473 }
474
475 #ifdef FLOATX80
476
477 /*----------------------------------------------------------------------------
478 | Normalizes the subnormal extended double-precision floating-point value
479 | represented by the denormalized significand `aSig'. The normalized exponent
480 | and significand are stored at the locations pointed to by `zExpPtr' and
481 | `zSigPtr', respectively.
482 *----------------------------------------------------------------------------*/
483
484 static void
normalizeFloatx80Subnormal(bits64 aSig,int32 * zExpPtr,bits64 * zSigPtr)485 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
486 {
487 int8 shiftCount;
488
489 shiftCount = countLeadingZeros64( aSig );
490 *zSigPtr = aSig<<shiftCount;
491 *zExpPtr = 1 - shiftCount;
492
493 }
494
495 /*----------------------------------------------------------------------------
496 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
497 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
498 | and returns the proper extended double-precision floating-point value
499 | corresponding to the abstract input. Ordinarily, the abstract value is
500 | rounded and packed into the extended double-precision format, with the
501 | inexact exception raised if the abstract input cannot be represented
502 | exactly. However, if the abstract value is too large, the overflow and
503 | inexact exceptions are raised and an infinity or maximal finite value is
504 | returned. If the abstract value is too small, the input value is rounded to
505 | a subnormal number, and the underflow and inexact exceptions are raised if
506 | the abstract input cannot be represented exactly as a subnormal extended
507 | double-precision floating-point number.
508 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
509 | number of bits as single or double precision, respectively. Otherwise, the
510 | result is rounded to the full precision of the extended double-precision
511 | format.
512 | The input significand must be normalized or smaller. If the input
513 | significand is not normalized, `zExp' must be 0; in that case, the result
514 | returned is a subnormal number, and it must not require rounding. The
515 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
516 | Floating-Point Arithmetic.
517 *----------------------------------------------------------------------------*/
518
519 // roundAndPackFloatx80 is now also used in fyl2x.c
520
521 /* static */ floatx80
roundAndPackFloatx80(int8 roundingPrecision,flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)522 roundAndPackFloatx80(
523 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
524 )
525 {
526 int8 roundingMode;
527 flag roundNearestEven, increment, isTiny;
528 int64 roundIncrement, roundMask, roundBits;
529
530 roundingMode = float_rounding_mode;
531 roundNearestEven = ( roundingMode == float_round_nearest_even );
532 if ( roundingPrecision == 80 ) goto precision80;
533 if ( roundingPrecision == 64 ) {
534 roundIncrement = LIT64( 0x0000000000000400 );
535 roundMask = LIT64( 0x00000000000007FF );
536 }
537 else if ( roundingPrecision == 32 ) {
538 roundIncrement = LIT64( 0x0000008000000000 );
539 roundMask = LIT64( 0x000000FFFFFFFFFF );
540 }
541 else {
542 goto precision80;
543 }
544 zSig0 |= ( zSig1 != 0 );
545 if ( ! roundNearestEven ) {
546 if ( roundingMode == float_round_to_zero ) {
547 roundIncrement = 0;
548 }
549 else {
550 roundIncrement = roundMask;
551 if ( zSign ) {
552 if ( roundingMode == float_round_up ) roundIncrement = 0;
553 }
554 else {
555 if ( roundingMode == float_round_down ) roundIncrement = 0;
556 }
557 }
558 }
559 roundBits = zSig0 & roundMask;
560 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
561 if ( ( 0x7FFE < zExp )
562 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
563 ) {
564 goto overflow;
565 }
566 if ( zExp <= 0 ) {
567 isTiny =
568 ( float_detect_tininess == float_tininess_before_rounding )
569 || ( zExp < 0 )
570 || ( zSig0 <= zSig0 + roundIncrement );
571 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
572 zExp = 0;
573 roundBits = zSig0 & roundMask;
574 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
575 if ( roundBits ) float_exception_flags |= float_flag_inexact;
576 zSig0 += roundIncrement;
577 if ( (sbits64) zSig0 < 0 ) zExp = 1;
578 roundIncrement = roundMask + 1;
579 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
580 roundMask |= roundIncrement;
581 }
582 zSig0 &= ~ roundMask;
583 return packFloatx80( zSign, zExp, zSig0 );
584 }
585 }
586 if ( roundBits ) float_exception_flags |= float_flag_inexact;
587 zSig0 += roundIncrement;
588 if ( zSig0 < roundIncrement ) {
589 ++zExp;
590 zSig0 = LIT64( 0x8000000000000000 );
591 }
592 roundIncrement = roundMask + 1;
593 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
594 roundMask |= roundIncrement;
595 }
596 zSig0 &= ~ roundMask;
597 if ( zSig0 == 0 ) zExp = 0;
598 return packFloatx80( zSign, zExp, zSig0 );
599 precision80:
600 increment = ( (sbits64) zSig1 < 0 );
601 if ( ! roundNearestEven ) {
602 if ( roundingMode == float_round_to_zero ) {
603 increment = 0;
604 }
605 else {
606 if ( zSign ) {
607 increment = ( roundingMode == float_round_down ) && zSig1;
608 }
609 else {
610 increment = ( roundingMode == float_round_up ) && zSig1;
611 }
612 }
613 }
614 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
615 if ( ( 0x7FFE < zExp )
616 || ( ( zExp == 0x7FFE )
617 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
618 && increment
619 )
620 ) {
621 roundMask = 0;
622 overflow:
623 float_raise( float_flag_overflow | float_flag_inexact );
624 if ( ( roundingMode == float_round_to_zero )
625 || ( zSign && ( roundingMode == float_round_up ) )
626 || ( ! zSign && ( roundingMode == float_round_down ) )
627 ) {
628 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
629 }
630 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
631 }
632 if ( zExp <= 0 ) {
633 isTiny =
634 ( float_detect_tininess == float_tininess_before_rounding )
635 || ( zExp < 0 )
636 || ! increment
637 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
638 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
639 zExp = 0;
640 if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
641 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
642 if ( roundNearestEven ) {
643 increment = ( (sbits64) zSig1 < 0 );
644 }
645 else {
646 if ( zSign ) {
647 increment = ( roundingMode == float_round_down ) && zSig1;
648 }
649 else {
650 increment = ( roundingMode == float_round_up ) && zSig1;
651 }
652 }
653 if ( increment ) {
654 ++zSig0;
655 zSig0 &=
656 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
657 if ( (sbits64) zSig0 < 0 ) zExp = 1;
658 }
659 return packFloatx80( zSign, zExp, zSig0 );
660 }
661 }
662 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
663 if ( increment ) {
664 ++zSig0;
665 if ( zSig0 == 0 ) {
666 ++zExp;
667 zSig0 = LIT64( 0x8000000000000000 );
668 }
669 else {
670 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
671 }
672 }
673 else {
674 if ( zSig0 == 0 ) zExp = 0;
675 }
676 return packFloatx80( zSign, zExp, zSig0 );
677
678 }
679
680 /*----------------------------------------------------------------------------
681 | Takes an abstract floating-point value having sign `zSign', exponent
682 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
683 | and returns the proper extended double-precision floating-point value
684 | corresponding to the abstract input. This routine is just like
685 | `roundAndPackFloatx80' except that the input significand does not have to be
686 | normalized.
687 *----------------------------------------------------------------------------*/
688
689 static floatx80
normalizeRoundAndPackFloatx80(int8 roundingPrecision,flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)690 normalizeRoundAndPackFloatx80(
691 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
692 )
693 {
694 int8 shiftCount;
695
696 if ( zSig0 == 0 ) {
697 zSig0 = zSig1;
698 zSig1 = 0;
699 zExp -= 64;
700 }
701 shiftCount = countLeadingZeros64( zSig0 );
702 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
703 zExp -= shiftCount;
704 return
705 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
706
707 }
708
709 #endif
710
711 #ifdef FLOAT128
712
713 /*----------------------------------------------------------------------------
714 | Returns the least-significant 64 fraction bits of the quadruple-precision
715 | floating-point value `a'.
716 *----------------------------------------------------------------------------*/
717
extractFloat128Frac1(float128 a)718 INLINE bits64 extractFloat128Frac1( float128 a )
719 {
720 return a.low;
721
722 }
723
724 /*----------------------------------------------------------------------------
725 | Returns the most-significant 48 fraction bits of the quadruple-precision
726 | floating-point value `a'.
727 *----------------------------------------------------------------------------*/
728
extractFloat128Frac0(float128 a)729 INLINE bits64 extractFloat128Frac0( float128 a )
730 {
731 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
732
733 }
734
735 /*----------------------------------------------------------------------------
736 | Returns the exponent bits of the quadruple-precision floating-point value
737 | `a'.
738 *----------------------------------------------------------------------------*/
739
extractFloat128Exp(float128 a)740 INLINE int32 extractFloat128Exp( float128 a )
741 {
742 return ( a.high>>48 ) & 0x7FFF;
743
744 }
745
746 /*----------------------------------------------------------------------------
747 | Returns the sign bit of the quadruple-precision floating-point value `a'.
748 *----------------------------------------------------------------------------*/
749
extractFloat128Sign(float128 a)750 INLINE flag extractFloat128Sign( float128 a )
751 {
752 return a.high>>63;
753
754 }
755
756 /*----------------------------------------------------------------------------
757 | Normalizes the subnormal quadruple-precision floating-point value
758 | represented by the denormalized significand formed by the concatenation of
759 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
760 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
761 | significand are stored at the location pointed to by `zSig0Ptr', and the
762 | least significant 64 bits of the normalized significand are stored at the
763 | location pointed to by `zSig1Ptr'.
764 *----------------------------------------------------------------------------*/
765
766 static void
normalizeFloat128Subnormal(bits64 aSig0,bits64 aSig1,int32 * zExpPtr,bits64 * zSig0Ptr,bits64 * zSig1Ptr)767 normalizeFloat128Subnormal(
768 bits64 aSig0,
769 bits64 aSig1,
770 int32 *zExpPtr,
771 bits64 *zSig0Ptr,
772 bits64 *zSig1Ptr
773 )
774 {
775 int8 shiftCount;
776
777 if ( aSig0 == 0 ) {
778 shiftCount = countLeadingZeros64( aSig1 ) - 15;
779 if ( shiftCount < 0 ) {
780 *zSig0Ptr = aSig1>>( - shiftCount );
781 *zSig1Ptr = aSig1<<( shiftCount & 63 );
782 }
783 else {
784 *zSig0Ptr = aSig1<<shiftCount;
785 *zSig1Ptr = 0;
786 }
787 *zExpPtr = - shiftCount - 63;
788 }
789 else {
790 shiftCount = countLeadingZeros64( aSig0 ) - 15;
791 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
792 *zExpPtr = 1 - shiftCount;
793 }
794
795 }
796
797 #endif
798
799 /*----------------------------------------------------------------------------
800 | Returns the result of converting the 32-bit two's complement integer `a'
801 | to the single-precision floating-point format. The conversion is performed
802 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
803 *----------------------------------------------------------------------------*/
804
int32_to_float32(int32 a)805 float32 int32_to_float32( int32 a )
806 {
807 flag zSign;
808
809 if ( a == 0 ) return 0;
810 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
811 zSign = ( a < 0 );
812 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
813
814 }
815
816 /*----------------------------------------------------------------------------
817 | Returns the result of converting the 32-bit two's complement integer `a'
818 | to the double-precision floating-point format. The conversion is performed
819 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
820 *----------------------------------------------------------------------------*/
821
int32_to_float64(int32 a)822 float64 int32_to_float64( int32 a )
823 {
824 flag zSign;
825 uint32 absA;
826 int8 shiftCount;
827 bits64 zSig;
828
829 if ( a == 0 ) return 0;
830 zSign = ( a < 0 );
831 absA = zSign ? - a : a;
832 shiftCount = countLeadingZeros32( absA ) + 21;
833 zSig = absA;
834 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
835
836 }
837
838 #ifdef FLOATX80
839
840 /*----------------------------------------------------------------------------
841 | Returns the result of converting the 32-bit two's complement integer `a'
842 | to the extended double-precision floating-point format. The conversion
843 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
844 | Arithmetic.
845 *----------------------------------------------------------------------------*/
846
int32_to_floatx80(int32 a)847 floatx80 int32_to_floatx80( int32 a )
848 {
849 flag zSign;
850 uint32 absA;
851 int8 shiftCount;
852 bits64 zSig;
853
854 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
855 zSign = ( a < 0 );
856 absA = zSign ? - a : a;
857 shiftCount = countLeadingZeros32( absA ) + 32;
858 zSig = absA;
859 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
860
861 }
862
863 #endif
864
865 #ifdef FLOAT128
866
867 /*----------------------------------------------------------------------------
868 | Returns the result of converting the 32-bit two's complement integer `a' to
869 | the quadruple-precision floating-point format. The conversion is performed
870 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
871 *----------------------------------------------------------------------------*/
872
int32_to_float128(int32 a)873 float128 int32_to_float128( int32 a )
874 {
875 flag zSign;
876 uint32 absA;
877 int8 shiftCount;
878 bits64 zSig0;
879
880 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
881 zSign = ( a < 0 );
882 absA = zSign ? - a : a;
883 shiftCount = countLeadingZeros32( absA ) + 17;
884 zSig0 = absA;
885 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
886
887 }
888
889 #endif
890
891 /*----------------------------------------------------------------------------
892 | Returns the result of converting the 64-bit two's complement integer `a'
893 | to the single-precision floating-point format. The conversion is performed
894 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
895 *----------------------------------------------------------------------------*/
896
int64_to_float32(int64 a)897 float32 int64_to_float32( int64 a )
898 {
899 flag zSign;
900 uint64 absA;
901 int8 shiftCount;
902 // bits32 zSig;
903
904 if ( a == 0 ) return 0;
905 zSign = ( a < 0 );
906 absA = zSign ? - a : a;
907 shiftCount = countLeadingZeros64( absA ) - 40;
908 if ( 0 <= shiftCount ) {
909 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
910 }
911 else {
912 shiftCount += 7;
913 if ( shiftCount < 0 ) {
914 shift64RightJamming( absA, - shiftCount, &absA );
915 }
916 else {
917 absA <<= shiftCount;
918 }
919 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
920 }
921
922 }
923
924 /*----------------------------------------------------------------------------
925 | Returns the result of converting the 64-bit two's complement integer `a'
926 | to the double-precision floating-point format. The conversion is performed
927 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
928 *----------------------------------------------------------------------------*/
929
int64_to_float64(int64 a)930 float64 int64_to_float64( int64 a )
931 {
932 flag zSign;
933
934 if ( a == 0 ) return 0;
935 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
936 return packFloat64( 1, 0x43E, 0 );
937 }
938 zSign = ( a < 0 );
939 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
940
941 }
942
943 #ifdef FLOATX80
944
945 /*----------------------------------------------------------------------------
946 | Returns the result of converting the 64-bit two's complement integer `a'
947 | to the extended double-precision floating-point format. The conversion
948 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
949 | Arithmetic.
950 *----------------------------------------------------------------------------*/
951
int64_to_floatx80(int64 a)952 floatx80 int64_to_floatx80( int64 a )
953 {
954 flag zSign;
955 uint64 absA;
956 int8 shiftCount;
957
958 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
959 zSign = ( a < 0 );
960 absA = zSign ? - a : a;
961 shiftCount = countLeadingZeros64( absA );
962 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
963
964 }
965
966 #endif
967
968 #ifdef FLOAT128
969
970 /*----------------------------------------------------------------------------
971 | Returns the result of converting the 64-bit two's complement integer `a' to
972 | the quadruple-precision floating-point format. The conversion is performed
973 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
974 *----------------------------------------------------------------------------*/
975
int64_to_float128(int64 a)976 float128 int64_to_float128( int64 a )
977 {
978 flag zSign;
979 uint64 absA;
980 int8 shiftCount;
981 int32 zExp;
982 bits64 zSig0, zSig1;
983
984 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
985 zSign = ( a < 0 );
986 absA = zSign ? - a : a;
987 shiftCount = countLeadingZeros64( absA ) + 49;
988 zExp = 0x406E - shiftCount;
989 if ( 64 <= shiftCount ) {
990 zSig1 = 0;
991 zSig0 = absA;
992 shiftCount -= 64;
993 }
994 else {
995 zSig1 = absA;
996 zSig0 = 0;
997 }
998 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
999 return packFloat128( zSign, zExp, zSig0, zSig1 );
1000
1001 }
1002
1003 #endif
1004
1005 /*----------------------------------------------------------------------------
1006 | Returns the result of converting the single-precision floating-point value
1007 | `a' to the 32-bit two's complement integer format. The conversion is
1008 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1009 | Arithmetic---which means in particular that the conversion is rounded
1010 | according to the current rounding mode. If `a' is a NaN, the largest
1011 | positive integer is returned. Otherwise, if the conversion overflows, the
1012 | largest integer with the same sign as `a' is returned.
1013 *----------------------------------------------------------------------------*/
1014
float32_to_int32(float32 a)1015 int32 float32_to_int32( float32 a )
1016 {
1017 flag aSign;
1018 int16 aExp, shiftCount;
1019 bits32 aSig;
1020 bits64 aSig64;
1021
1022 aSig = extractFloat32Frac( a );
1023 aExp = extractFloat32Exp( a );
1024 aSign = extractFloat32Sign( a );
1025 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1026 if ( aExp ) aSig |= 0x00800000;
1027 shiftCount = 0xAF - aExp;
1028 aSig64 = aSig;
1029 aSig64 <<= 32;
1030 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1031 return roundAndPackInt32( aSign, aSig64 );
1032
1033 }
1034
1035 /*----------------------------------------------------------------------------
1036 | Returns the result of converting the single-precision floating-point value
1037 | `a' to the 32-bit two's complement integer format. The conversion is
1038 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1039 | Arithmetic, except that the conversion is always rounded toward zero.
1040 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1041 | the conversion overflows, the largest integer with the same sign as `a' is
1042 | returned.
1043 *----------------------------------------------------------------------------*/
1044
float32_to_int32_round_to_zero(float32 a)1045 int32 float32_to_int32_round_to_zero( float32 a )
1046 {
1047 flag aSign;
1048 int16 aExp, shiftCount;
1049 bits32 aSig;
1050 int32 z;
1051
1052 aSig = extractFloat32Frac( a );
1053 aExp = extractFloat32Exp( a );
1054 aSign = extractFloat32Sign( a );
1055 shiftCount = aExp - 0x9E;
1056 if ( 0 <= shiftCount ) {
1057 if ( a != 0xCF000000 ) {
1058 float_raise( float_flag_invalid );
1059 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1060 }
1061 return (sbits32) 0x80000000;
1062 }
1063 else if ( aExp <= 0x7E ) {
1064 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1065 return 0;
1066 }
1067 aSig = ( aSig | 0x00800000 )<<8;
1068 z = aSig>>( - shiftCount );
1069 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1070 float_exception_flags |= float_flag_inexact;
1071 }
1072 if ( aSign ) z = - z;
1073 return z;
1074
1075 }
1076
1077 /*----------------------------------------------------------------------------
1078 | Returns the result of converting the single-precision floating-point value
1079 | `a' to the 64-bit two's complement integer format. The conversion is
1080 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1081 | Arithmetic---which means in particular that the conversion is rounded
1082 | according to the current rounding mode. If `a' is a NaN, the largest
1083 | positive integer is returned. Otherwise, if the conversion overflows, the
1084 | largest integer with the same sign as `a' is returned.
1085 *----------------------------------------------------------------------------*/
1086
float32_to_int64(float32 a)1087 int64 float32_to_int64( float32 a )
1088 {
1089 flag aSign;
1090 int16 aExp, shiftCount;
1091 bits32 aSig;
1092 bits64 aSig64, aSigExtra;
1093
1094 aSig = extractFloat32Frac( a );
1095 aExp = extractFloat32Exp( a );
1096 aSign = extractFloat32Sign( a );
1097 shiftCount = 0xBE - aExp;
1098 if ( shiftCount < 0 ) {
1099 float_raise( float_flag_invalid );
1100 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1101 return LIT64( 0x7FFFFFFFFFFFFFFF );
1102 }
1103 return (sbits64) LIT64( 0x8000000000000000 );
1104 }
1105 if ( aExp ) aSig |= 0x00800000;
1106 aSig64 = aSig;
1107 aSig64 <<= 40;
1108 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1109 return roundAndPackInt64( aSign, aSig64, aSigExtra );
1110
1111 }
1112
1113 /*----------------------------------------------------------------------------
1114 | Returns the result of converting the single-precision floating-point value
1115 | `a' to the 64-bit two's complement integer format. The conversion is
1116 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1117 | Arithmetic, except that the conversion is always rounded toward zero. If
1118 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1119 | conversion overflows, the largest integer with the same sign as `a' is
1120 | returned.
1121 *----------------------------------------------------------------------------*/
1122
float32_to_int64_round_to_zero(float32 a)1123 int64 float32_to_int64_round_to_zero( float32 a )
1124 {
1125 flag aSign;
1126 int16 aExp, shiftCount;
1127 bits32 aSig;
1128 bits64 aSig64;
1129 int64 z;
1130
1131 aSig = extractFloat32Frac( a );
1132 aExp = extractFloat32Exp( a );
1133 aSign = extractFloat32Sign( a );
1134 shiftCount = aExp - 0xBE;
1135 if ( 0 <= shiftCount ) {
1136 if ( a != 0xDF000000 ) {
1137 float_raise( float_flag_invalid );
1138 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1139 return LIT64( 0x7FFFFFFFFFFFFFFF );
1140 }
1141 }
1142 return (sbits64) LIT64( 0x8000000000000000 );
1143 }
1144 else if ( aExp <= 0x7E ) {
1145 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1146 return 0;
1147 }
1148 aSig64 = aSig | 0x00800000;
1149 aSig64 <<= 40;
1150 z = aSig64>>( - shiftCount );
1151 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1152 float_exception_flags |= float_flag_inexact;
1153 }
1154 if ( aSign ) z = - z;
1155 return z;
1156
1157 }
1158
1159 /*----------------------------------------------------------------------------
1160 | Returns the result of converting the single-precision floating-point value
1161 | `a' to the double-precision floating-point format. The conversion is
1162 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1163 | Arithmetic.
1164 *----------------------------------------------------------------------------*/
1165
float32_to_float64(float32 a)1166 float64 float32_to_float64( float32 a )
1167 {
1168 flag aSign;
1169 int16 aExp;
1170 bits32 aSig;
1171
1172 aSig = extractFloat32Frac( a );
1173 aExp = extractFloat32Exp( a );
1174 aSign = extractFloat32Sign( a );
1175 if ( aExp == 0xFF ) {
1176 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1177 return packFloat64( aSign, 0x7FF, 0 );
1178 }
1179 if ( aExp == 0 ) {
1180 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1181 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1182 --aExp;
1183 }
1184 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1185
1186 }
1187
1188 #ifdef FLOATX80
1189
1190 /*----------------------------------------------------------------------------
1191 | Returns the result of converting the single-precision floating-point value
1192 | `a' to the extended double-precision floating-point format. The conversion
1193 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1194 | Arithmetic.
1195 *----------------------------------------------------------------------------*/
1196
float32_to_floatx80(float32 a)1197 floatx80 float32_to_floatx80( float32 a )
1198 {
1199 flag aSign;
1200 int16 aExp;
1201 bits32 aSig;
1202
1203 aSig = extractFloat32Frac( a );
1204 aExp = extractFloat32Exp( a );
1205 aSign = extractFloat32Sign( a );
1206 if ( aExp == 0xFF ) {
1207 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1208 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1209 }
1210 if ( aExp == 0 ) {
1211 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1212 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1213 }
1214 aSig |= 0x00800000;
1215 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1216
1217 }
1218
1219 #endif
1220
1221 #ifdef FLOAT128
1222
1223 /*----------------------------------------------------------------------------
1224 | Returns the result of converting the single-precision floating-point value
1225 | `a' to the double-precision floating-point format. The conversion is
1226 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1227 | Arithmetic.
1228 *----------------------------------------------------------------------------*/
1229
float32_to_float128(float32 a)1230 float128 float32_to_float128( float32 a )
1231 {
1232 flag aSign;
1233 int16 aExp;
1234 bits32 aSig;
1235
1236 aSig = extractFloat32Frac( a );
1237 aExp = extractFloat32Exp( a );
1238 aSign = extractFloat32Sign( a );
1239 if ( aExp == 0xFF ) {
1240 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1241 return packFloat128( aSign, 0x7FFF, 0, 0 );
1242 }
1243 if ( aExp == 0 ) {
1244 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1245 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1246 --aExp;
1247 }
1248 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1249
1250 }
1251
1252 #endif
1253
1254 /*----------------------------------------------------------------------------
1255 | Rounds the single-precision floating-point value `a' to an integer, and
1256 | returns the result as a single-precision floating-point value. The
1257 | operation is performed according to the IEC/IEEE Standard for Binary
1258 | Floating-Point Arithmetic.
1259 *----------------------------------------------------------------------------*/
1260
float32_round_to_int(float32 a)1261 float32 float32_round_to_int( float32 a )
1262 {
1263 flag aSign;
1264 int16 aExp;
1265 bits32 lastBitMask, roundBitsMask;
1266 int8 roundingMode;
1267 float32 z;
1268
1269 aExp = extractFloat32Exp( a );
1270 if ( 0x96 <= aExp ) {
1271 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1272 return propagateFloat32NaN( a, a );
1273 }
1274 return a;
1275 }
1276 if ( aExp <= 0x7E ) {
1277 if ( (bits32) ( a<<1 ) == 0 ) return a;
1278 float_exception_flags |= float_flag_inexact;
1279 aSign = extractFloat32Sign( a );
1280 switch ( float_rounding_mode ) {
1281 case float_round_nearest_even:
1282 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1283 return packFloat32( aSign, 0x7F, 0 );
1284 }
1285 break;
1286 case float_round_down:
1287 return aSign ? 0xBF800000 : 0;
1288 case float_round_up:
1289 return aSign ? 0x80000000 : 0x3F800000;
1290 }
1291 return packFloat32( aSign, 0, 0 );
1292 }
1293 lastBitMask = 1;
1294 lastBitMask <<= 0x96 - aExp;
1295 roundBitsMask = lastBitMask - 1;
1296 z = a;
1297 roundingMode = float_rounding_mode;
1298 if ( roundingMode == float_round_nearest_even ) {
1299 z += lastBitMask>>1;
1300 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1301 }
1302 else if ( roundingMode != float_round_to_zero ) {
1303 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1304 z += roundBitsMask;
1305 }
1306 }
1307 z &= ~ roundBitsMask;
1308 if ( z != a ) float_exception_flags |= float_flag_inexact;
1309 return z;
1310
1311 }
1312
1313 /*----------------------------------------------------------------------------
1314 | Returns the result of adding the absolute values of the single-precision
1315 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1316 | before being returned. `zSign' is ignored if the result is a NaN.
1317 | The addition is performed according to the IEC/IEEE Standard for Binary
1318 | Floating-Point Arithmetic.
1319 *----------------------------------------------------------------------------*/
1320
addFloat32Sigs(float32 a,float32 b,flag zSign)1321 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1322 {
1323 int16 aExp, bExp, zExp;
1324 bits32 aSig, bSig, zSig;
1325 int16 expDiff;
1326
1327 aSig = extractFloat32Frac( a );
1328 aExp = extractFloat32Exp( a );
1329 bSig = extractFloat32Frac( b );
1330 bExp = extractFloat32Exp( b );
1331 expDiff = aExp - bExp;
1332 aSig <<= 6;
1333 bSig <<= 6;
1334 if ( 0 < expDiff ) {
1335 if ( aExp == 0xFF ) {
1336 if ( aSig ) return propagateFloat32NaN( a, b );
1337 return a;
1338 }
1339 if ( bExp == 0 ) {
1340 --expDiff;
1341 }
1342 else {
1343 bSig |= 0x20000000;
1344 }
1345 shift32RightJamming( bSig, expDiff, &bSig );
1346 zExp = aExp;
1347 }
1348 else if ( expDiff < 0 ) {
1349 if ( bExp == 0xFF ) {
1350 if ( bSig ) return propagateFloat32NaN( a, b );
1351 return packFloat32( zSign, 0xFF, 0 );
1352 }
1353 if ( aExp == 0 ) {
1354 ++expDiff;
1355 }
1356 else {
1357 aSig |= 0x20000000;
1358 }
1359 shift32RightJamming( aSig, - expDiff, &aSig );
1360 zExp = bExp;
1361 }
1362 else {
1363 if ( aExp == 0xFF ) {
1364 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1365 return a;
1366 }
1367 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1368 zSig = 0x40000000 + aSig + bSig;
1369 zExp = aExp;
1370 goto roundAndPack;
1371 }
1372 aSig |= 0x20000000;
1373 zSig = ( aSig + bSig )<<1;
1374 --zExp;
1375 if ( (sbits32) zSig < 0 ) {
1376 zSig = aSig + bSig;
1377 ++zExp;
1378 }
1379 roundAndPack:
1380 return roundAndPackFloat32( zSign, zExp, zSig );
1381
1382 }
1383
1384 /*----------------------------------------------------------------------------
1385 | Returns the result of subtracting the absolute values of the single-
1386 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1387 | difference is negated before being returned. `zSign' is ignored if the
1388 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1389 | Standard for Binary Floating-Point Arithmetic.
1390 *----------------------------------------------------------------------------*/
1391
subFloat32Sigs(float32 a,float32 b,flag zSign)1392 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1393 {
1394 int16 aExp, bExp, zExp;
1395 bits32 aSig, bSig, zSig;
1396 int16 expDiff;
1397
1398 aSig = extractFloat32Frac( a );
1399 aExp = extractFloat32Exp( a );
1400 bSig = extractFloat32Frac( b );
1401 bExp = extractFloat32Exp( b );
1402 expDiff = aExp - bExp;
1403 aSig <<= 7;
1404 bSig <<= 7;
1405 if ( 0 < expDiff ) goto aExpBigger;
1406 if ( expDiff < 0 ) goto bExpBigger;
1407 if ( aExp == 0xFF ) {
1408 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1409 float_raise( float_flag_invalid );
1410 return float32_default_nan;
1411 }
1412 if ( aExp == 0 ) {
1413 aExp = 1;
1414 bExp = 1;
1415 }
1416 if ( bSig < aSig ) goto aBigger;
1417 if ( aSig < bSig ) goto bBigger;
1418 return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1419 bExpBigger:
1420 if ( bExp == 0xFF ) {
1421 if ( bSig ) return propagateFloat32NaN( a, b );
1422 return packFloat32( zSign ^ 1, 0xFF, 0 );
1423 }
1424 if ( aExp == 0 ) {
1425 ++expDiff;
1426 }
1427 else {
1428 aSig |= 0x40000000;
1429 }
1430 shift32RightJamming( aSig, - expDiff, &aSig );
1431 bSig |= 0x40000000;
1432 bBigger:
1433 zSig = bSig - aSig;
1434 zExp = bExp;
1435 zSign ^= 1;
1436 goto normalizeRoundAndPack;
1437 aExpBigger:
1438 if ( aExp == 0xFF ) {
1439 if ( aSig ) return propagateFloat32NaN( a, b );
1440 return a;
1441 }
1442 if ( bExp == 0 ) {
1443 --expDiff;
1444 }
1445 else {
1446 bSig |= 0x40000000;
1447 }
1448 shift32RightJamming( bSig, expDiff, &bSig );
1449 aSig |= 0x40000000;
1450 aBigger:
1451 zSig = aSig - bSig;
1452 zExp = aExp;
1453 normalizeRoundAndPack:
1454 --zExp;
1455 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1456
1457 }
1458
1459 /*----------------------------------------------------------------------------
1460 | Returns the result of adding the single-precision floating-point values `a'
1461 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1462 | Binary Floating-Point Arithmetic.
1463 *----------------------------------------------------------------------------*/
1464
float32_add(float32 a,float32 b)1465 float32 float32_add( float32 a, float32 b )
1466 {
1467 flag aSign, bSign;
1468
1469 aSign = extractFloat32Sign( a );
1470 bSign = extractFloat32Sign( b );
1471 if ( aSign == bSign ) {
1472 return addFloat32Sigs( a, b, aSign );
1473 }
1474 else {
1475 return subFloat32Sigs( a, b, aSign );
1476 }
1477
1478 }
1479
1480 /*----------------------------------------------------------------------------
1481 | Returns the result of subtracting the single-precision floating-point values
1482 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1483 | for Binary Floating-Point Arithmetic.
1484 *----------------------------------------------------------------------------*/
1485
float32_sub(float32 a,float32 b)1486 float32 float32_sub( float32 a, float32 b )
1487 {
1488 flag aSign, bSign;
1489
1490 aSign = extractFloat32Sign( a );
1491 bSign = extractFloat32Sign( b );
1492 if ( aSign == bSign ) {
1493 return subFloat32Sigs( a, b, aSign );
1494 }
1495 else {
1496 return addFloat32Sigs( a, b, aSign );
1497 }
1498
1499 }
1500
1501 /*----------------------------------------------------------------------------
1502 | Returns the result of multiplying the single-precision floating-point values
1503 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1504 | for Binary Floating-Point Arithmetic.
1505 *----------------------------------------------------------------------------*/
1506
float32_mul(float32 a,float32 b)1507 float32 float32_mul( float32 a, float32 b )
1508 {
1509 flag aSign, bSign, zSign;
1510 int16 aExp, bExp, zExp;
1511 bits32 aSig, bSig;
1512 bits64 zSig64;
1513 bits32 zSig;
1514
1515 aSig = extractFloat32Frac( a );
1516 aExp = extractFloat32Exp( a );
1517 aSign = extractFloat32Sign( a );
1518 bSig = extractFloat32Frac( b );
1519 bExp = extractFloat32Exp( b );
1520 bSign = extractFloat32Sign( b );
1521 zSign = aSign ^ bSign;
1522 if ( aExp == 0xFF ) {
1523 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1524 return propagateFloat32NaN( a, b );
1525 }
1526 if ( ( bExp | bSig ) == 0 ) {
1527 float_raise( float_flag_invalid );
1528 return float32_default_nan;
1529 }
1530 return packFloat32( zSign, 0xFF, 0 );
1531 }
1532 if ( bExp == 0xFF ) {
1533 if ( bSig ) return propagateFloat32NaN( a, b );
1534 if ( ( aExp | aSig ) == 0 ) {
1535 float_raise( float_flag_invalid );
1536 return float32_default_nan;
1537 }
1538 return packFloat32( zSign, 0xFF, 0 );
1539 }
1540 if ( aExp == 0 ) {
1541 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1542 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1543 }
1544 if ( bExp == 0 ) {
1545 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1546 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1547 }
1548 zExp = aExp + bExp - 0x7F;
1549 aSig = ( aSig | 0x00800000 )<<7;
1550 bSig = ( bSig | 0x00800000 )<<8;
1551 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1552 zSig = zSig64;
1553 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1554 zSig <<= 1;
1555 --zExp;
1556 }
1557 return roundAndPackFloat32( zSign, zExp, zSig );
1558
1559 }
1560
1561 /*----------------------------------------------------------------------------
1562 | Returns the result of dividing the single-precision floating-point value `a'
1563 | by the corresponding value `b'. The operation is performed according to the
1564 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1565 *----------------------------------------------------------------------------*/
1566
float32_div(float32 a,float32 b)1567 float32 float32_div( float32 a, float32 b )
1568 {
1569 flag aSign, bSign, zSign;
1570 int16 aExp, bExp, zExp;
1571 bits32 aSig, bSig, zSig;
1572
1573 aSig = extractFloat32Frac( a );
1574 aExp = extractFloat32Exp( a );
1575 aSign = extractFloat32Sign( a );
1576 bSig = extractFloat32Frac( b );
1577 bExp = extractFloat32Exp( b );
1578 bSign = extractFloat32Sign( b );
1579 zSign = aSign ^ bSign;
1580 if ( aExp == 0xFF ) {
1581 if ( aSig ) return propagateFloat32NaN( a, b );
1582 if ( bExp == 0xFF ) {
1583 if ( bSig ) return propagateFloat32NaN( a, b );
1584 float_raise( float_flag_invalid );
1585 return float32_default_nan;
1586 }
1587 return packFloat32( zSign, 0xFF, 0 );
1588 }
1589 if ( bExp == 0xFF ) {
1590 if ( bSig ) return propagateFloat32NaN( a, b );
1591 return packFloat32( zSign, 0, 0 );
1592 }
1593 if ( bExp == 0 ) {
1594 if ( bSig == 0 ) {
1595 if ( ( aExp | aSig ) == 0 ) {
1596 float_raise( float_flag_invalid );
1597 return float32_default_nan;
1598 }
1599 float_raise( float_flag_divbyzero );
1600 return packFloat32( zSign, 0xFF, 0 );
1601 }
1602 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1603 }
1604 if ( aExp == 0 ) {
1605 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1606 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1607 }
1608 zExp = aExp - bExp + 0x7D;
1609 aSig = ( aSig | 0x00800000 )<<7;
1610 bSig = ( bSig | 0x00800000 )<<8;
1611 if ( bSig <= ( aSig + aSig ) ) {
1612 aSig >>= 1;
1613 ++zExp;
1614 }
1615 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1616 if ( ( zSig & 0x3F ) == 0 ) {
1617 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1618 }
1619 return roundAndPackFloat32( zSign, zExp, zSig );
1620
1621 }
1622
1623 /*----------------------------------------------------------------------------
1624 | Returns the remainder of the single-precision floating-point value `a'
1625 | with respect to the corresponding value `b'. The operation is performed
1626 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1627 *----------------------------------------------------------------------------*/
1628
float32_rem(float32 a,float32 b)1629 float32 float32_rem( float32 a, float32 b )
1630 {
1631 flag aSign, zSign;
1632 int16 aExp, bExp, expDiff;
1633 bits32 aSig, bSig;
1634 bits32 q;
1635 bits64 aSig64, bSig64, q64;
1636 bits32 alternateASig;
1637 sbits32 sigMean;
1638
1639 aSig = extractFloat32Frac( a );
1640 aExp = extractFloat32Exp( a );
1641 aSign = extractFloat32Sign( a );
1642 bSig = extractFloat32Frac( b );
1643 bExp = extractFloat32Exp( b );
1644 // bSign = extractFloat32Sign( b );
1645 if ( aExp == 0xFF ) {
1646 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1647 return propagateFloat32NaN( a, b );
1648 }
1649 float_raise( float_flag_invalid );
1650 return float32_default_nan;
1651 }
1652 if ( bExp == 0xFF ) {
1653 if ( bSig ) return propagateFloat32NaN( a, b );
1654 return a;
1655 }
1656 if ( bExp == 0 ) {
1657 if ( bSig == 0 ) {
1658 float_raise( float_flag_invalid );
1659 return float32_default_nan;
1660 }
1661 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1662 }
1663 if ( aExp == 0 ) {
1664 if ( aSig == 0 ) return a;
1665 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1666 }
1667 expDiff = aExp - bExp;
1668 aSig |= 0x00800000;
1669 bSig |= 0x00800000;
1670 if ( expDiff < 32 ) {
1671 aSig <<= 8;
1672 bSig <<= 8;
1673 if ( expDiff < 0 ) {
1674 if ( expDiff < -1 ) return a;
1675 aSig >>= 1;
1676 }
1677 q = ( bSig <= aSig );
1678 if ( q ) aSig -= bSig;
1679 if ( 0 < expDiff ) {
1680 q = ( ( (bits64) aSig )<<32 ) / bSig;
1681 q >>= 32 - expDiff;
1682 bSig >>= 2;
1683 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1684 }
1685 else {
1686 aSig >>= 2;
1687 bSig >>= 2;
1688 }
1689 }
1690 else {
1691 if ( bSig <= aSig ) aSig -= bSig;
1692 aSig64 = ( (bits64) aSig )<<40;
1693 bSig64 = ( (bits64) bSig )<<40;
1694 expDiff -= 64;
1695 while ( 0 < expDiff ) {
1696 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1697 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1698 aSig64 = - ( ( bSig * q64 )<<38 );
1699 expDiff -= 62;
1700 }
1701 expDiff += 64;
1702 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1703 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1704 q = q64>>( 64 - expDiff );
1705 bSig <<= 6;
1706 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1707 }
1708 do {
1709 alternateASig = aSig;
1710 ++q;
1711 aSig -= bSig;
1712 } while ( 0 <= (sbits32) aSig );
1713 sigMean = aSig + alternateASig;
1714 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1715 aSig = alternateASig;
1716 }
1717 zSign = ( (sbits32) aSig < 0 );
1718 if ( zSign ) aSig = - aSig;
1719 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1720
1721 }
1722
1723 /*----------------------------------------------------------------------------
1724 | Returns the square root of the single-precision floating-point value `a'.
1725 | The operation is performed according to the IEC/IEEE Standard for Binary
1726 | Floating-Point Arithmetic.
1727 *----------------------------------------------------------------------------*/
1728
float32_sqrt(float32 a)1729 float32 float32_sqrt( float32 a )
1730 {
1731 flag aSign;
1732 int16 aExp, zExp;
1733 bits32 aSig, zSig;
1734 bits64 rem, term;
1735
1736 aSig = extractFloat32Frac( a );
1737 aExp = extractFloat32Exp( a );
1738 aSign = extractFloat32Sign( a );
1739 if ( aExp == 0xFF ) {
1740 if ( aSig ) return propagateFloat32NaN( a, 0 );
1741 if ( ! aSign ) return a;
1742 float_raise( float_flag_invalid );
1743 return float32_default_nan;
1744 }
1745 if ( aSign ) {
1746 if ( ( aExp | aSig ) == 0 ) return a;
1747 float_raise( float_flag_invalid );
1748 return float32_default_nan;
1749 }
1750 if ( aExp == 0 ) {
1751 if ( aSig == 0 ) return 0;
1752 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1753 }
1754 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1755 aSig = ( aSig | 0x00800000 )<<8;
1756 zSig = estimateSqrt32( aExp, aSig ) + 2;
1757 if ( ( zSig & 0x7F ) <= 5 ) {
1758 if ( zSig < 2 ) {
1759 zSig = 0x7FFFFFFF;
1760 goto roundAndPack;
1761 }
1762 aSig >>= aExp & 1;
1763 term = ( (bits64) zSig ) * zSig;
1764 rem = ( ( (bits64) aSig )<<32 ) - term;
1765 while ( (sbits64) rem < 0 ) {
1766 --zSig;
1767 rem += ( ( (bits64) zSig )<<1 ) | 1;
1768 }
1769 zSig |= ( rem != 0 );
1770 }
1771 shift32RightJamming( zSig, 1, &zSig );
1772 roundAndPack:
1773 return roundAndPackFloat32( 0, zExp, zSig );
1774
1775 }
1776
1777 /*----------------------------------------------------------------------------
1778 | Returns 1 if the single-precision floating-point value `a' is equal to
1779 | the corresponding value `b', and 0 otherwise. The comparison is performed
1780 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1781 *----------------------------------------------------------------------------*/
1782
float32_eq(float32 a,float32 b)1783 flag float32_eq( float32 a, float32 b )
1784 {
1785 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1786 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1787 ) {
1788 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1789 float_raise( float_flag_invalid );
1790 }
1791 return 0;
1792 }
1793 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1794
1795 }
1796
1797 /*----------------------------------------------------------------------------
1798 | Returns 1 if the single-precision floating-point value `a' is less than
1799 | or equal to the corresponding value `b', and 0 otherwise. The comparison
1800 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1801 | Arithmetic.
1802 *----------------------------------------------------------------------------*/
1803
float32_le(float32 a,float32 b)1804 flag float32_le( float32 a, float32 b )
1805 {
1806 flag aSign, bSign;
1807
1808 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1809 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1810 ) {
1811 float_raise( float_flag_invalid );
1812 return 0;
1813 }
1814 aSign = extractFloat32Sign( a );
1815 bSign = extractFloat32Sign( b );
1816 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1817 return ( a == b ) || ( aSign ^ ( a < b ) );
1818
1819 }
1820
1821 /*----------------------------------------------------------------------------
1822 | Returns 1 if the single-precision floating-point value `a' is less than
1823 | the corresponding value `b', and 0 otherwise. The comparison is performed
1824 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1825 *----------------------------------------------------------------------------*/
1826
float32_lt(float32 a,float32 b)1827 flag float32_lt( float32 a, float32 b )
1828 {
1829 flag aSign, bSign;
1830
1831 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1832 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1833 ) {
1834 float_raise( float_flag_invalid );
1835 return 0;
1836 }
1837 aSign = extractFloat32Sign( a );
1838 bSign = extractFloat32Sign( b );
1839 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1840 return ( a != b ) && ( aSign ^ ( a < b ) );
1841
1842 }
1843
1844 /*----------------------------------------------------------------------------
1845 | Returns 1 if the single-precision floating-point value `a' is equal to
1846 | the corresponding value `b', and 0 otherwise. The invalid exception is
1847 | raised if either operand is a NaN. Otherwise, the comparison is performed
1848 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1849 *----------------------------------------------------------------------------*/
1850
float32_eq_signaling(float32 a,float32 b)1851 flag float32_eq_signaling( float32 a, float32 b )
1852 {
1853 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1854 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1855 ) {
1856 float_raise( float_flag_invalid );
1857 return 0;
1858 }
1859 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1860
1861 }
1862
1863 /*----------------------------------------------------------------------------
1864 | Returns 1 if the single-precision floating-point value `a' is less than or
1865 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1866 | cause an exception. Otherwise, the comparison is performed according to the
1867 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1868 *----------------------------------------------------------------------------*/
1869
float32_le_quiet(float32 a,float32 b)1870 flag float32_le_quiet( float32 a, float32 b )
1871 {
1872 flag aSign, bSign;
1873 // int16 aExp, bExp;
1874
1875 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1876 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1877 ) {
1878 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1879 float_raise( float_flag_invalid );
1880 }
1881 return 0;
1882 }
1883 aSign = extractFloat32Sign( a );
1884 bSign = extractFloat32Sign( b );
1885 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1886 return ( a == b ) || ( aSign ^ ( a < b ) );
1887
1888 }
1889
1890 /*----------------------------------------------------------------------------
1891 | Returns 1 if the single-precision floating-point value `a' is less than
1892 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1893 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
1894 | Standard for Binary Floating-Point Arithmetic.
1895 *----------------------------------------------------------------------------*/
1896
float32_lt_quiet(float32 a,float32 b)1897 flag float32_lt_quiet( float32 a, float32 b )
1898 {
1899 flag aSign, bSign;
1900
1901 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1902 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1903 ) {
1904 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1905 float_raise( float_flag_invalid );
1906 }
1907 return 0;
1908 }
1909 aSign = extractFloat32Sign( a );
1910 bSign = extractFloat32Sign( b );
1911 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1912 return ( a != b ) && ( aSign ^ ( a < b ) );
1913
1914 }
1915
1916 /*----------------------------------------------------------------------------
1917 | Returns the result of converting the double-precision floating-point value
1918 | `a' to the 32-bit two's complement integer format. The conversion is
1919 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1920 | Arithmetic---which means in particular that the conversion is rounded
1921 | according to the current rounding mode. If `a' is a NaN, the largest
1922 | positive integer is returned. Otherwise, if the conversion overflows, the
1923 | largest integer with the same sign as `a' is returned.
1924 *----------------------------------------------------------------------------*/
1925
float64_to_int32(float64 a)1926 int32 float64_to_int32( float64 a )
1927 {
1928 flag aSign;
1929 int16 aExp, shiftCount;
1930 bits64 aSig;
1931
1932 aSig = extractFloat64Frac( a );
1933 aExp = extractFloat64Exp( a );
1934 aSign = extractFloat64Sign( a );
1935 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1936 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1937 shiftCount = 0x42C - aExp;
1938 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1939 return roundAndPackInt32( aSign, aSig );
1940
1941 }
1942
1943 /*----------------------------------------------------------------------------
1944 | Returns the result of converting the double-precision floating-point value
1945 | `a' to the 32-bit two's complement integer format. The conversion is
1946 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1947 | Arithmetic, except that the conversion is always rounded toward zero.
1948 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1949 | the conversion overflows, the largest integer with the same sign as `a' is
1950 | returned.
1951 *----------------------------------------------------------------------------*/
1952
float64_to_int32_round_to_zero(float64 a)1953 int32 float64_to_int32_round_to_zero( float64 a )
1954 {
1955 flag aSign;
1956 int16 aExp, shiftCount;
1957 bits64 aSig, savedASig;
1958 int32 z;
1959
1960 aSig = extractFloat64Frac( a );
1961 aExp = extractFloat64Exp( a );
1962 aSign = extractFloat64Sign( a );
1963 if ( 0x41E < aExp ) {
1964 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1965 goto invalid;
1966 }
1967 else if ( aExp < 0x3FF ) {
1968 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
1969 return 0;
1970 }
1971 aSig |= LIT64( 0x0010000000000000 );
1972 shiftCount = 0x433 - aExp;
1973 savedASig = aSig;
1974 aSig >>= shiftCount;
1975 z = aSig;
1976 if ( aSign ) z = - z;
1977 if ( ( z < 0 ) ^ aSign ) {
1978 invalid:
1979 float_raise( float_flag_invalid );
1980 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
1981 }
1982 if ( ( aSig<<shiftCount ) != savedASig ) {
1983 float_exception_flags |= float_flag_inexact;
1984 }
1985 return z;
1986
1987 }
1988
1989 /*----------------------------------------------------------------------------
1990 | Returns the result of converting the double-precision floating-point value
1991 | `a' to the 64-bit two's complement integer format. The conversion is
1992 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1993 | Arithmetic---which means in particular that the conversion is rounded
1994 | according to the current rounding mode. If `a' is a NaN, the largest
1995 | positive integer is returned. Otherwise, if the conversion overflows, the
1996 | largest integer with the same sign as `a' is returned.
1997 *----------------------------------------------------------------------------*/
1998
float64_to_int64(float64 a)1999 int64 float64_to_int64( float64 a )
2000 {
2001 flag aSign;
2002 int16 aExp, shiftCount;
2003 bits64 aSig, aSigExtra;
2004
2005 aSig = extractFloat64Frac( a );
2006 aExp = extractFloat64Exp( a );
2007 aSign = extractFloat64Sign( a );
2008 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2009 shiftCount = 0x433 - aExp;
2010 if ( shiftCount <= 0 ) {
2011 if ( 0x43E < aExp ) {
2012 float_raise( float_flag_invalid );
2013 if ( ! aSign
2014 || ( ( aExp == 0x7FF )
2015 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2016 ) {
2017 return LIT64( 0x7FFFFFFFFFFFFFFF );
2018 }
2019 return (sbits64) LIT64( 0x8000000000000000 );
2020 }
2021 aSigExtra = 0;
2022 aSig <<= - shiftCount;
2023 }
2024 else {
2025 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2026 }
2027 return roundAndPackInt64( aSign, aSig, aSigExtra );
2028
2029 }
2030
2031 /*----------------------------------------------------------------------------
2032 | Returns the result of converting the double-precision floating-point value
2033 | `a' to the 64-bit two's complement integer format. The conversion is
2034 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2035 | Arithmetic, except that the conversion is always rounded toward zero.
2036 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2037 | the conversion overflows, the largest integer with the same sign as `a' is
2038 | returned.
2039 *----------------------------------------------------------------------------*/
2040
float64_to_int64_round_to_zero(float64 a)2041 int64 float64_to_int64_round_to_zero( float64 a )
2042 {
2043 flag aSign;
2044 int16 aExp, shiftCount;
2045 bits64 aSig;
2046 int64 z;
2047
2048 aSig = extractFloat64Frac( a );
2049 aExp = extractFloat64Exp( a );
2050 aSign = extractFloat64Sign( a );
2051 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2052 shiftCount = aExp - 0x433;
2053 if ( 0 <= shiftCount ) {
2054 if ( 0x43E <= aExp ) {
2055 if ( a != LIT64( 0xC3E0000000000000 ) ) {
2056 float_raise( float_flag_invalid );
2057 if ( ! aSign
2058 || ( ( aExp == 0x7FF )
2059 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2060 ) {
2061 return LIT64( 0x7FFFFFFFFFFFFFFF );
2062 }
2063 }
2064 return (sbits64) LIT64( 0x8000000000000000 );
2065 }
2066 z = aSig<<shiftCount;
2067 }
2068 else {
2069 if ( aExp < 0x3FE ) {
2070 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2071 return 0;
2072 }
2073 z = aSig>>( - shiftCount );
2074 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2075 float_exception_flags |= float_flag_inexact;
2076 }
2077 }
2078 if ( aSign ) z = - z;
2079 return z;
2080
2081 }
2082
2083 /*----------------------------------------------------------------------------
2084 | Returns the result of converting the double-precision floating-point value
2085 | `a' to the single-precision floating-point format. The conversion is
2086 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2087 | Arithmetic.
2088 *----------------------------------------------------------------------------*/
2089
float64_to_float32(float64 a)2090 float32 float64_to_float32( float64 a )
2091 {
2092 flag aSign;
2093 int16 aExp;
2094 bits64 aSig;
2095 bits32 zSig;
2096
2097 aSig = extractFloat64Frac( a );
2098 aExp = extractFloat64Exp( a );
2099 aSign = extractFloat64Sign( a );
2100 if ( aExp == 0x7FF ) {
2101 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2102 return packFloat32( aSign, 0xFF, 0 );
2103 }
2104 shift64RightJamming( aSig, 22, &aSig );
2105 zSig = aSig;
2106 if ( aExp || zSig ) {
2107 zSig |= 0x40000000;
2108 aExp -= 0x381;
2109 }
2110 return roundAndPackFloat32( aSign, aExp, zSig );
2111
2112 }
2113
2114 #ifdef FLOATX80
2115
2116 /*----------------------------------------------------------------------------
2117 | Returns the result of converting the double-precision floating-point value
2118 | `a' to the extended double-precision floating-point format. The conversion
2119 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2120 | Arithmetic.
2121 *----------------------------------------------------------------------------*/
2122
float64_to_floatx80(float64 a)2123 floatx80 float64_to_floatx80( float64 a )
2124 {
2125 flag aSign;
2126 int16 aExp;
2127 bits64 aSig;
2128
2129 aSig = extractFloat64Frac( a );
2130 aExp = extractFloat64Exp( a );
2131 aSign = extractFloat64Sign( a );
2132 if ( aExp == 0x7FF ) {
2133 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2134 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2135 }
2136 if ( aExp == 0 ) {
2137 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2138 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2139 }
2140 return
2141 packFloatx80(
2142 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2143
2144 }
2145
2146 #endif
2147
2148 #ifdef FLOAT128
2149
2150 /*----------------------------------------------------------------------------
2151 | Returns the result of converting the double-precision floating-point value
2152 | `a' to the quadruple-precision floating-point format. The conversion is
2153 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2154 | Arithmetic.
2155 *----------------------------------------------------------------------------*/
2156
float64_to_float128(float64 a)2157 float128 float64_to_float128( float64 a )
2158 {
2159 flag aSign;
2160 int16 aExp;
2161 bits64 aSig, zSig0, zSig1;
2162
2163 aSig = extractFloat64Frac( a );
2164 aExp = extractFloat64Exp( a );
2165 aSign = extractFloat64Sign( a );
2166 if ( aExp == 0x7FF ) {
2167 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2168 return packFloat128( aSign, 0x7FFF, 0, 0 );
2169 }
2170 if ( aExp == 0 ) {
2171 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2172 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2173 --aExp;
2174 }
2175 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2176 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2177
2178 }
2179
2180 #endif
2181
2182 /*----------------------------------------------------------------------------
2183 | Rounds the double-precision floating-point value `a' to an integer, and
2184 | returns the result as a double-precision floating-point value. The
2185 | operation is performed according to the IEC/IEEE Standard for Binary
2186 | Floating-Point Arithmetic.
2187 *----------------------------------------------------------------------------*/
2188
float64_round_to_int(float64 a)2189 float64 float64_round_to_int( float64 a )
2190 {
2191 flag aSign;
2192 int16 aExp;
2193 bits64 lastBitMask, roundBitsMask;
2194 int8 roundingMode;
2195 float64 z;
2196
2197 aExp = extractFloat64Exp( a );
2198 if ( 0x433 <= aExp ) {
2199 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2200 return propagateFloat64NaN( a, a );
2201 }
2202 return a;
2203 }
2204 if ( aExp < 0x3FF ) {
2205 if ( (bits64) ( a<<1 ) == 0 ) return a;
2206 float_exception_flags |= float_flag_inexact;
2207 aSign = extractFloat64Sign( a );
2208 switch ( float_rounding_mode ) {
2209 case float_round_nearest_even:
2210 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2211 return packFloat64( aSign, 0x3FF, 0 );
2212 }
2213 break;
2214 case float_round_down:
2215 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2216 case float_round_up:
2217 return
2218 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2219 }
2220 return packFloat64( aSign, 0, 0 );
2221 }
2222 lastBitMask = 1;
2223 lastBitMask <<= 0x433 - aExp;
2224 roundBitsMask = lastBitMask - 1;
2225 z = a;
2226 roundingMode = float_rounding_mode;
2227 if ( roundingMode == float_round_nearest_even ) {
2228 z += lastBitMask>>1;
2229 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2230 }
2231 else if ( roundingMode != float_round_to_zero ) {
2232 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2233 z += roundBitsMask;
2234 }
2235 }
2236 z &= ~ roundBitsMask;
2237 if ( z != a ) float_exception_flags |= float_flag_inexact;
2238 return z;
2239
2240 }
2241
2242 /*----------------------------------------------------------------------------
2243 | Returns the result of adding the absolute values of the double-precision
2244 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2245 | before being returned. `zSign' is ignored if the result is a NaN.
2246 | The addition is performed according to the IEC/IEEE Standard for Binary
2247 | Floating-Point Arithmetic.
2248 *----------------------------------------------------------------------------*/
2249
addFloat64Sigs(float64 a,float64 b,flag zSign)2250 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2251 {
2252 int16 aExp, bExp, zExp;
2253 bits64 aSig, bSig, zSig;
2254 int16 expDiff;
2255
2256 aSig = extractFloat64Frac( a );
2257 aExp = extractFloat64Exp( a );
2258 bSig = extractFloat64Frac( b );
2259 bExp = extractFloat64Exp( b );
2260 expDiff = aExp - bExp;
2261 aSig <<= 9;
2262 bSig <<= 9;
2263 if ( 0 < expDiff ) {
2264 if ( aExp == 0x7FF ) {
2265 if ( aSig ) return propagateFloat64NaN( a, b );
2266 return a;
2267 }
2268 if ( bExp == 0 ) {
2269 --expDiff;
2270 }
2271 else {
2272 bSig |= LIT64( 0x2000000000000000 );
2273 }
2274 shift64RightJamming( bSig, expDiff, &bSig );
2275 zExp = aExp;
2276 }
2277 else if ( expDiff < 0 ) {
2278 if ( bExp == 0x7FF ) {
2279 if ( bSig ) return propagateFloat64NaN( a, b );
2280 return packFloat64( zSign, 0x7FF, 0 );
2281 }
2282 if ( aExp == 0 ) {
2283 ++expDiff;
2284 }
2285 else {
2286 aSig |= LIT64( 0x2000000000000000 );
2287 }
2288 shift64RightJamming( aSig, - expDiff, &aSig );
2289 zExp = bExp;
2290 }
2291 else {
2292 if ( aExp == 0x7FF ) {
2293 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2294 return a;
2295 }
2296 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2297 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2298 zExp = aExp;
2299 goto roundAndPack;
2300 }
2301 aSig |= LIT64( 0x2000000000000000 );
2302 zSig = ( aSig + bSig )<<1;
2303 --zExp;
2304 if ( (sbits64) zSig < 0 ) {
2305 zSig = aSig + bSig;
2306 ++zExp;
2307 }
2308 roundAndPack:
2309 return roundAndPackFloat64( zSign, zExp, zSig );
2310
2311 }
2312
2313 /*----------------------------------------------------------------------------
2314 | Returns the result of subtracting the absolute values of the double-
2315 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2316 | difference is negated before being returned. `zSign' is ignored if the
2317 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2318 | Standard for Binary Floating-Point Arithmetic.
2319 *----------------------------------------------------------------------------*/
2320
subFloat64Sigs(float64 a,float64 b,flag zSign)2321 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2322 {
2323 int16 aExp, bExp, zExp;
2324 bits64 aSig, bSig, zSig;
2325 int16 expDiff;
2326
2327 aSig = extractFloat64Frac( a );
2328 aExp = extractFloat64Exp( a );
2329 bSig = extractFloat64Frac( b );
2330 bExp = extractFloat64Exp( b );
2331 expDiff = aExp - bExp;
2332 aSig <<= 10;
2333 bSig <<= 10;
2334 if ( 0 < expDiff ) goto aExpBigger;
2335 if ( expDiff < 0 ) goto bExpBigger;
2336 if ( aExp == 0x7FF ) {
2337 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2338 float_raise( float_flag_invalid );
2339 return float64_default_nan;
2340 }
2341 if ( aExp == 0 ) {
2342 aExp = 1;
2343 bExp = 1;
2344 }
2345 if ( bSig < aSig ) goto aBigger;
2346 if ( aSig < bSig ) goto bBigger;
2347 return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2348 bExpBigger:
2349 if ( bExp == 0x7FF ) {
2350 if ( bSig ) return propagateFloat64NaN( a, b );
2351 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2352 }
2353 if ( aExp == 0 ) {
2354 ++expDiff;
2355 }
2356 else {
2357 aSig |= LIT64( 0x4000000000000000 );
2358 }
2359 shift64RightJamming( aSig, - expDiff, &aSig );
2360 bSig |= LIT64( 0x4000000000000000 );
2361 bBigger:
2362 zSig = bSig - aSig;
2363 zExp = bExp;
2364 zSign ^= 1;
2365 goto normalizeRoundAndPack;
2366 aExpBigger:
2367 if ( aExp == 0x7FF ) {
2368 if ( aSig ) return propagateFloat64NaN( a, b );
2369 return a;
2370 }
2371 if ( bExp == 0 ) {
2372 --expDiff;
2373 }
2374 else {
2375 bSig |= LIT64( 0x4000000000000000 );
2376 }
2377 shift64RightJamming( bSig, expDiff, &bSig );
2378 aSig |= LIT64( 0x4000000000000000 );
2379 aBigger:
2380 zSig = aSig - bSig;
2381 zExp = aExp;
2382 normalizeRoundAndPack:
2383 --zExp;
2384 return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2385
2386 }
2387
2388 /*----------------------------------------------------------------------------
2389 | Returns the result of adding the double-precision floating-point values `a'
2390 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2391 | Binary Floating-Point Arithmetic.
2392 *----------------------------------------------------------------------------*/
2393
float64_add(float64 a,float64 b)2394 float64 float64_add( float64 a, float64 b )
2395 {
2396 flag aSign, bSign;
2397
2398 aSign = extractFloat64Sign( a );
2399 bSign = extractFloat64Sign( b );
2400 if ( aSign == bSign ) {
2401 return addFloat64Sigs( a, b, aSign );
2402 }
2403 else {
2404 return subFloat64Sigs( a, b, aSign );
2405 }
2406
2407 }
2408
2409 /*----------------------------------------------------------------------------
2410 | Returns the result of subtracting the double-precision floating-point values
2411 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2412 | for Binary Floating-Point Arithmetic.
2413 *----------------------------------------------------------------------------*/
2414
float64_sub(float64 a,float64 b)2415 float64 float64_sub( float64 a, float64 b )
2416 {
2417 flag aSign, bSign;
2418
2419 aSign = extractFloat64Sign( a );
2420 bSign = extractFloat64Sign( b );
2421 if ( aSign == bSign ) {
2422 return subFloat64Sigs( a, b, aSign );
2423 }
2424 else {
2425 return addFloat64Sigs( a, b, aSign );
2426 }
2427
2428 }
2429
2430 /*----------------------------------------------------------------------------
2431 | Returns the result of multiplying the double-precision floating-point values
2432 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2433 | for Binary Floating-Point Arithmetic.
2434 *----------------------------------------------------------------------------*/
2435
float64_mul(float64 a,float64 b)2436 float64 float64_mul( float64 a, float64 b )
2437 {
2438 flag aSign, bSign, zSign;
2439 int16 aExp, bExp, zExp;
2440 bits64 aSig, bSig, zSig0, zSig1;
2441
2442 aSig = extractFloat64Frac( a );
2443 aExp = extractFloat64Exp( a );
2444 aSign = extractFloat64Sign( a );
2445 bSig = extractFloat64Frac( b );
2446 bExp = extractFloat64Exp( b );
2447 bSign = extractFloat64Sign( b );
2448 zSign = aSign ^ bSign;
2449 if ( aExp == 0x7FF ) {
2450 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2451 return propagateFloat64NaN( a, b );
2452 }
2453 if ( ( bExp | bSig ) == 0 ) {
2454 float_raise( float_flag_invalid );
2455 return float64_default_nan;
2456 }
2457 return packFloat64( zSign, 0x7FF, 0 );
2458 }
2459 if ( bExp == 0x7FF ) {
2460 if ( bSig ) return propagateFloat64NaN( a, b );
2461 if ( ( aExp | aSig ) == 0 ) {
2462 float_raise( float_flag_invalid );
2463 return float64_default_nan;
2464 }
2465 return packFloat64( zSign, 0x7FF, 0 );
2466 }
2467 if ( aExp == 0 ) {
2468 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2469 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2470 }
2471 if ( bExp == 0 ) {
2472 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2473 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2474 }
2475 zExp = aExp + bExp - 0x3FF;
2476 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2477 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2478 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2479 zSig0 |= ( zSig1 != 0 );
2480 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2481 zSig0 <<= 1;
2482 --zExp;
2483 }
2484 return roundAndPackFloat64( zSign, zExp, zSig0 );
2485
2486 }
2487
2488 /*----------------------------------------------------------------------------
2489 | Returns the result of dividing the double-precision floating-point value `a'
2490 | by the corresponding value `b'. The operation is performed according to
2491 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2492 *----------------------------------------------------------------------------*/
2493
float64_div(float64 a,float64 b)2494 float64 float64_div( float64 a, float64 b )
2495 {
2496 flag aSign, bSign, zSign;
2497 int16 aExp, bExp, zExp;
2498 bits64 aSig, bSig, zSig;
2499 bits64 rem0, rem1;
2500 bits64 term0, term1;
2501
2502 aSig = extractFloat64Frac( a );
2503 aExp = extractFloat64Exp( a );
2504 aSign = extractFloat64Sign( a );
2505 bSig = extractFloat64Frac( b );
2506 bExp = extractFloat64Exp( b );
2507 bSign = extractFloat64Sign( b );
2508 zSign = aSign ^ bSign;
2509 if ( aExp == 0x7FF ) {
2510 if ( aSig ) return propagateFloat64NaN( a, b );
2511 if ( bExp == 0x7FF ) {
2512 if ( bSig ) return propagateFloat64NaN( a, b );
2513 float_raise( float_flag_invalid );
2514 return float64_default_nan;
2515 }
2516 return packFloat64( zSign, 0x7FF, 0 );
2517 }
2518 if ( bExp == 0x7FF ) {
2519 if ( bSig ) return propagateFloat64NaN( a, b );
2520 return packFloat64( zSign, 0, 0 );
2521 }
2522 if ( bExp == 0 ) {
2523 if ( bSig == 0 ) {
2524 if ( ( aExp | aSig ) == 0 ) {
2525 float_raise( float_flag_invalid );
2526 return float64_default_nan;
2527 }
2528 float_raise( float_flag_divbyzero );
2529 return packFloat64( zSign, 0x7FF, 0 );
2530 }
2531 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2532 }
2533 if ( aExp == 0 ) {
2534 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2535 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2536 }
2537 zExp = aExp - bExp + 0x3FD;
2538 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2539 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2540 if ( bSig <= ( aSig + aSig ) ) {
2541 aSig >>= 1;
2542 ++zExp;
2543 }
2544 zSig = estimateDiv128To64( aSig, 0, bSig );
2545 if ( ( zSig & 0x1FF ) <= 2 ) {
2546 mul64To128( bSig, zSig, &term0, &term1 );
2547 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2548 while ( (sbits64) rem0 < 0 ) {
2549 --zSig;
2550 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2551 }
2552 zSig |= ( rem1 != 0 );
2553 }
2554 return roundAndPackFloat64( zSign, zExp, zSig );
2555
2556 }
2557
2558 /*----------------------------------------------------------------------------
2559 | Returns the remainder of the double-precision floating-point value `a'
2560 | with respect to the corresponding value `b'. The operation is performed
2561 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2562 *----------------------------------------------------------------------------*/
2563
float64_rem(float64 a,float64 b)2564 float64 float64_rem( float64 a, float64 b )
2565 {
2566 flag aSign, zSign;
2567 int16 aExp, bExp, expDiff;
2568 bits64 aSig, bSig;
2569 bits64 q, alternateASig;
2570 sbits64 sigMean;
2571
2572 aSig = extractFloat64Frac( a );
2573 aExp = extractFloat64Exp( a );
2574 aSign = extractFloat64Sign( a );
2575 bSig = extractFloat64Frac( b );
2576 bExp = extractFloat64Exp( b );
2577 // bSign = extractFloat64Sign( b );
2578 if ( aExp == 0x7FF ) {
2579 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2580 return propagateFloat64NaN( a, b );
2581 }
2582 float_raise( float_flag_invalid );
2583 return float64_default_nan;
2584 }
2585 if ( bExp == 0x7FF ) {
2586 if ( bSig ) return propagateFloat64NaN( a, b );
2587 return a;
2588 }
2589 if ( bExp == 0 ) {
2590 if ( bSig == 0 ) {
2591 float_raise( float_flag_invalid );
2592 return float64_default_nan;
2593 }
2594 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2595 }
2596 if ( aExp == 0 ) {
2597 if ( aSig == 0 ) return a;
2598 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2599 }
2600 expDiff = aExp - bExp;
2601 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2602 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2603 if ( expDiff < 0 ) {
2604 if ( expDiff < -1 ) return a;
2605 aSig >>= 1;
2606 }
2607 q = ( bSig <= aSig );
2608 if ( q ) aSig -= bSig;
2609 expDiff -= 64;
2610 while ( 0 < expDiff ) {
2611 q = estimateDiv128To64( aSig, 0, bSig );
2612 q = ( 2 < q ) ? q - 2 : 0;
2613 aSig = - ( ( bSig>>2 ) * q );
2614 expDiff -= 62;
2615 }
2616 expDiff += 64;
2617 if ( 0 < expDiff ) {
2618 q = estimateDiv128To64( aSig, 0, bSig );
2619 q = ( 2 < q ) ? q - 2 : 0;
2620 q >>= 64 - expDiff;
2621 bSig >>= 2;
2622 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2623 }
2624 else {
2625 aSig >>= 2;
2626 bSig >>= 2;
2627 }
2628 do {
2629 alternateASig = aSig;
2630 ++q;
2631 aSig -= bSig;
2632 } while ( 0 <= (sbits64) aSig );
2633 sigMean = aSig + alternateASig;
2634 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2635 aSig = alternateASig;
2636 }
2637 zSign = ( (sbits64) aSig < 0 );
2638 if ( zSign ) aSig = - aSig;
2639 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
2640
2641 }
2642
2643 /*----------------------------------------------------------------------------
2644 | Returns the square root of the double-precision floating-point value `a'.
2645 | The operation is performed according to the IEC/IEEE Standard for Binary
2646 | Floating-Point Arithmetic.
2647 *----------------------------------------------------------------------------*/
2648
float64_sqrt(float64 a)2649 float64 float64_sqrt( float64 a )
2650 {
2651 flag aSign;
2652 int16 aExp, zExp;
2653 bits64 aSig, zSig, doubleZSig;
2654 bits64 rem0, rem1, term0, term1;
2655 // float64 z;
2656
2657 aSig = extractFloat64Frac( a );
2658 aExp = extractFloat64Exp( a );
2659 aSign = extractFloat64Sign( a );
2660 if ( aExp == 0x7FF ) {
2661 if ( aSig ) return propagateFloat64NaN( a, a );
2662 if ( ! aSign ) return a;
2663 float_raise( float_flag_invalid );
2664 return float64_default_nan;
2665 }
2666 if ( aSign ) {
2667 if ( ( aExp | aSig ) == 0 ) return a;
2668 float_raise( float_flag_invalid );
2669 return float64_default_nan;
2670 }
2671 if ( aExp == 0 ) {
2672 if ( aSig == 0 ) return 0;
2673 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2674 }
2675 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2676 aSig |= LIT64( 0x0010000000000000 );
2677 zSig = estimateSqrt32( aExp, aSig>>21 );
2678 aSig <<= 9 - ( aExp & 1 );
2679 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2680 if ( ( zSig & 0x1FF ) <= 5 ) {
2681 doubleZSig = zSig<<1;
2682 mul64To128( zSig, zSig, &term0, &term1 );
2683 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2684 while ( (sbits64) rem0 < 0 ) {
2685 --zSig;
2686 doubleZSig -= 2;
2687 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
2688 }
2689 zSig |= ( ( rem0 | rem1 ) != 0 );
2690 }
2691 return roundAndPackFloat64( 0, zExp, zSig );
2692
2693 }
2694
2695 /*----------------------------------------------------------------------------
2696 | Returns 1 if the double-precision floating-point value `a' is equal to the
2697 | corresponding value `b', and 0 otherwise. The comparison is performed
2698 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2699 *----------------------------------------------------------------------------*/
2700
float64_eq(float64 a,float64 b)2701 flag float64_eq( float64 a, float64 b )
2702 {
2703 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2704 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2705 ) {
2706 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2707 float_raise( float_flag_invalid );
2708 }
2709 return 0;
2710 }
2711 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2712
2713 }
2714
2715 /*----------------------------------------------------------------------------
2716 | Returns 1 if the double-precision floating-point value `a' is less than or
2717 | equal to the corresponding value `b', and 0 otherwise. The comparison is
2718 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2719 | Arithmetic.
2720 *----------------------------------------------------------------------------*/
2721
float64_le(float64 a,float64 b)2722 flag float64_le( float64 a, float64 b )
2723 {
2724 flag aSign, bSign;
2725
2726 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2727 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2728 ) {
2729 float_raise( float_flag_invalid );
2730 return 0;
2731 }
2732 aSign = extractFloat64Sign( a );
2733 bSign = extractFloat64Sign( b );
2734 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2735 return ( a == b ) || ( aSign ^ ( a < b ) );
2736
2737 }
2738
2739 /*----------------------------------------------------------------------------
2740 | Returns 1 if the double-precision floating-point value `a' is less than
2741 | the corresponding value `b', and 0 otherwise. The comparison is performed
2742 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2743 *----------------------------------------------------------------------------*/
2744
float64_lt(float64 a,float64 b)2745 flag float64_lt( float64 a, float64 b )
2746 {
2747 flag aSign, bSign;
2748
2749 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2750 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2751 ) {
2752 float_raise( float_flag_invalid );
2753 return 0;
2754 }
2755 aSign = extractFloat64Sign( a );
2756 bSign = extractFloat64Sign( b );
2757 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2758 return ( a != b ) && ( aSign ^ ( a < b ) );
2759
2760 }
2761
2762 /*----------------------------------------------------------------------------
2763 | Returns 1 if the double-precision floating-point value `a' is equal to the
2764 | corresponding value `b', and 0 otherwise. The invalid exception is raised
2765 | if either operand is a NaN. Otherwise, the comparison is performed
2766 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2767 *----------------------------------------------------------------------------*/
2768
float64_eq_signaling(float64 a,float64 b)2769 flag float64_eq_signaling( float64 a, float64 b )
2770 {
2771 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2772 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2773 ) {
2774 float_raise( float_flag_invalid );
2775 return 0;
2776 }
2777 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2778
2779 }
2780
2781 /*----------------------------------------------------------------------------
2782 | Returns 1 if the double-precision floating-point value `a' is less than or
2783 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2784 | cause an exception. Otherwise, the comparison is performed according to the
2785 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2786 *----------------------------------------------------------------------------*/
2787
float64_le_quiet(float64 a,float64 b)2788 flag float64_le_quiet( float64 a, float64 b )
2789 {
2790 flag aSign, bSign;
2791 // int16 aExp, bExp;
2792
2793 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2794 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2795 ) {
2796 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2797 float_raise( float_flag_invalid );
2798 }
2799 return 0;
2800 }
2801 aSign = extractFloat64Sign( a );
2802 bSign = extractFloat64Sign( b );
2803 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2804 return ( a == b ) || ( aSign ^ ( a < b ) );
2805
2806 }
2807
2808 /*----------------------------------------------------------------------------
2809 | Returns 1 if the double-precision floating-point value `a' is less than
2810 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2811 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2812 | Standard for Binary Floating-Point Arithmetic.
2813 *----------------------------------------------------------------------------*/
2814
float64_lt_quiet(float64 a,float64 b)2815 flag float64_lt_quiet( float64 a, float64 b )
2816 {
2817 flag aSign, bSign;
2818
2819 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2820 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2821 ) {
2822 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2823 float_raise( float_flag_invalid );
2824 }
2825 return 0;
2826 }
2827 aSign = extractFloat64Sign( a );
2828 bSign = extractFloat64Sign( b );
2829 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2830 return ( a != b ) && ( aSign ^ ( a < b ) );
2831
2832 }
2833
2834 #ifdef FLOATX80
2835
2836 /*----------------------------------------------------------------------------
2837 | Returns the result of converting the extended double-precision floating-
2838 | point value `a' to the 32-bit two's complement integer format. The
2839 | conversion is performed according to the IEC/IEEE Standard for Binary
2840 | Floating-Point Arithmetic---which means in particular that the conversion
2841 | is rounded according to the current rounding mode. If `a' is a NaN, the
2842 | largest positive integer is returned. Otherwise, if the conversion
2843 | overflows, the largest integer with the same sign as `a' is returned.
2844 *----------------------------------------------------------------------------*/
2845
floatx80_to_int32(floatx80 a)2846 int32 floatx80_to_int32( floatx80 a )
2847 {
2848 flag aSign;
2849 int32 aExp, shiftCount;
2850 bits64 aSig;
2851
2852 aSig = extractFloatx80Frac( a );
2853 aExp = extractFloatx80Exp( a );
2854 aSign = extractFloatx80Sign( a );
2855 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2856 shiftCount = 0x4037 - aExp;
2857 if ( shiftCount <= 0 ) shiftCount = 1;
2858 shift64RightJamming( aSig, shiftCount, &aSig );
2859 return roundAndPackInt32( aSign, aSig );
2860
2861 }
2862
2863 /*----------------------------------------------------------------------------
2864 | Returns the result of converting the extended double-precision floating-
2865 | point value `a' to the 32-bit two's complement integer format. The
2866 | conversion is performed according to the IEC/IEEE Standard for Binary
2867 | Floating-Point Arithmetic, except that the conversion is always rounded
2868 | toward zero. If `a' is a NaN, the largest positive integer is returned.
2869 | Otherwise, if the conversion overflows, the largest integer with the same
2870 | sign as `a' is returned.
2871 *----------------------------------------------------------------------------*/
2872
floatx80_to_int32_round_to_zero(floatx80 a)2873 int32 floatx80_to_int32_round_to_zero( floatx80 a )
2874 {
2875 flag aSign;
2876 int32 aExp, shiftCount;
2877 bits64 aSig, savedASig;
2878 int32 z;
2879
2880 aSig = extractFloatx80Frac( a );
2881 aExp = extractFloatx80Exp( a );
2882 aSign = extractFloatx80Sign( a );
2883 if ( 0x401E < aExp ) {
2884 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2885 goto invalid;
2886 }
2887 else if ( aExp < 0x3FFF ) {
2888 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2889 return 0;
2890 }
2891 shiftCount = 0x403E - aExp;
2892 savedASig = aSig;
2893 aSig >>= shiftCount;
2894 z = aSig;
2895 if ( aSign ) z = - z;
2896 if ( ( z < 0 ) ^ aSign ) {
2897 invalid:
2898 float_raise( float_flag_invalid );
2899 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2900 }
2901 if ( ( aSig<<shiftCount ) != savedASig ) {
2902 float_exception_flags |= float_flag_inexact;
2903 }
2904 return z;
2905
2906 }
2907
2908 /*----------------------------------------------------------------------------
2909 | Returns the result of converting the extended double-precision floating-
2910 | point value `a' to the 64-bit two's complement integer format. The
2911 | conversion is performed according to the IEC/IEEE Standard for Binary
2912 | Floating-Point Arithmetic---which means in particular that the conversion
2913 | is rounded according to the current rounding mode. If `a' is a NaN,
2914 | the largest positive integer is returned. Otherwise, if the conversion
2915 | overflows, the largest integer with the same sign as `a' is returned.
2916 *----------------------------------------------------------------------------*/
2917
floatx80_to_int64(floatx80 a)2918 int64 floatx80_to_int64( floatx80 a )
2919 {
2920 flag aSign;
2921 int32 aExp, shiftCount;
2922 bits64 aSig, aSigExtra;
2923
2924 aSig = extractFloatx80Frac( a );
2925 aExp = extractFloatx80Exp( a );
2926 aSign = extractFloatx80Sign( a );
2927 shiftCount = 0x403E - aExp;
2928 if ( shiftCount <= 0 ) {
2929 if ( shiftCount ) {
2930 float_raise( float_flag_invalid );
2931 if ( ! aSign
2932 || ( ( aExp == 0x7FFF )
2933 && ( aSig != LIT64( 0x8000000000000000 ) ) )
2934 ) {
2935 return LIT64( 0x7FFFFFFFFFFFFFFF );
2936 }
2937 return (sbits64) LIT64( 0x8000000000000000 );
2938 }
2939 aSigExtra = 0;
2940 }
2941 else {
2942 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2943 }
2944 return roundAndPackInt64( aSign, aSig, aSigExtra );
2945
2946 }
2947
2948 /*----------------------------------------------------------------------------
2949 | Returns the result of converting the extended double-precision floating-
2950 | point value `a' to the 64-bit two's complement integer format. The
2951 | conversion is performed according to the IEC/IEEE Standard for Binary
2952 | Floating-Point Arithmetic, except that the conversion is always rounded
2953 | toward zero. If `a' is a NaN, the largest positive integer is returned.
2954 | Otherwise, if the conversion overflows, the largest integer with the same
2955 | sign as `a' is returned.
2956 *----------------------------------------------------------------------------*/
2957
floatx80_to_int64_round_to_zero(floatx80 a)2958 int64 floatx80_to_int64_round_to_zero( floatx80 a )
2959 {
2960 flag aSign;
2961 int32 aExp, shiftCount;
2962 bits64 aSig;
2963 int64 z;
2964
2965 aSig = extractFloatx80Frac( a );
2966 aExp = extractFloatx80Exp( a );
2967 aSign = extractFloatx80Sign( a );
2968 shiftCount = aExp - 0x403E;
2969 if ( 0 <= shiftCount ) {
2970 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
2971 if ( ( a.high != 0xC03E ) || aSig ) {
2972 float_raise( float_flag_invalid );
2973 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
2974 return LIT64( 0x7FFFFFFFFFFFFFFF );
2975 }
2976 }
2977 return (sbits64) LIT64( 0x8000000000000000 );
2978 }
2979 else if ( aExp < 0x3FFF ) {
2980 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2981 return 0;
2982 }
2983 z = aSig>>( - shiftCount );
2984 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2985 float_exception_flags |= float_flag_inexact;
2986 }
2987 if ( aSign ) z = - z;
2988 return z;
2989
2990 }
2991
2992 /*----------------------------------------------------------------------------
2993 | Returns the result of converting the extended double-precision floating-
2994 | point value `a' to the single-precision floating-point format. The
2995 | conversion is performed according to the IEC/IEEE Standard for Binary
2996 | Floating-Point Arithmetic.
2997 *----------------------------------------------------------------------------*/
2998
floatx80_to_float32(floatx80 a)2999 float32 floatx80_to_float32( floatx80 a )
3000 {
3001 flag aSign;
3002 int32 aExp;
3003 bits64 aSig;
3004
3005 aSig = extractFloatx80Frac( a );
3006 aExp = extractFloatx80Exp( a );
3007 aSign = extractFloatx80Sign( a );
3008 if ( aExp == 0x7FFF ) {
3009 if ( (bits64) ( aSig<<1 ) ) {
3010 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3011 }
3012 return packFloat32( aSign, 0xFF, 0 );
3013 }
3014 shift64RightJamming( aSig, 33, &aSig );
3015 if ( aExp || aSig ) aExp -= 0x3F81;
3016 return roundAndPackFloat32( aSign, aExp, aSig );
3017
3018 }
3019
3020 /*----------------------------------------------------------------------------
3021 | Returns the result of converting the extended double-precision floating-
3022 | point value `a' to the double-precision floating-point format. The
3023 | conversion is performed according to the IEC/IEEE Standard for Binary
3024 | Floating-Point Arithmetic.
3025 *----------------------------------------------------------------------------*/
3026
floatx80_to_float64(floatx80 a)3027 float64 floatx80_to_float64( floatx80 a )
3028 {
3029 flag aSign;
3030 int32 aExp;
3031 bits64 aSig, zSig;
3032
3033 aSig = extractFloatx80Frac( a );
3034 aExp = extractFloatx80Exp( a );
3035 aSign = extractFloatx80Sign( a );
3036 if ( aExp == 0x7FFF ) {
3037 if ( (bits64) ( aSig<<1 ) ) {
3038 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3039 }
3040 return packFloat64( aSign, 0x7FF, 0 );
3041 }
3042 shift64RightJamming( aSig, 1, &zSig );
3043 if ( aExp || aSig ) aExp -= 0x3C01;
3044 return roundAndPackFloat64( aSign, aExp, zSig );
3045
3046 }
3047
3048 #ifdef FLOAT128
3049
3050 /*----------------------------------------------------------------------------
3051 | Returns the result of converting the extended double-precision floating-
3052 | point value `a' to the quadruple-precision floating-point format. The
3053 | conversion is performed according to the IEC/IEEE Standard for Binary
3054 | Floating-Point Arithmetic.
3055 *----------------------------------------------------------------------------*/
3056
floatx80_to_float128(floatx80 a)3057 float128 floatx80_to_float128( floatx80 a )
3058 {
3059 flag aSign;
3060 int16 aExp;
3061 bits64 aSig, zSig0, zSig1;
3062
3063 aSig = extractFloatx80Frac( a );
3064 aExp = extractFloatx80Exp( a );
3065 aSign = extractFloatx80Sign( a );
3066 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3067 return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3068 }
3069 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3070 return packFloat128( aSign, aExp, zSig0, zSig1 );
3071
3072 }
3073
3074 #endif
3075
3076 /*----------------------------------------------------------------------------
3077 | Rounds the extended double-precision floating-point value `a' to an integer,
3078 | and returns the result as an extended quadruple-precision floating-point
3079 | value. The operation is performed according to the IEC/IEEE Standard for
3080 | Binary Floating-Point Arithmetic.
3081 *----------------------------------------------------------------------------*/
3082
floatx80_round_to_int(floatx80 a)3083 floatx80 floatx80_round_to_int( floatx80 a )
3084 {
3085 flag aSign;
3086 int32 aExp;
3087 bits64 lastBitMask, roundBitsMask;
3088 int8 roundingMode;
3089 floatx80 z;
3090
3091 aExp = extractFloatx80Exp( a );
3092 if ( 0x403E <= aExp ) {
3093 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3094 return propagateFloatx80NaN( a, a );
3095 }
3096 return a;
3097 }
3098 if ( aExp < 0x3FFF ) {
3099 if ( ( aExp == 0 )
3100 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3101 return a;
3102 }
3103 float_exception_flags |= float_flag_inexact;
3104 aSign = extractFloatx80Sign( a );
3105 switch ( float_rounding_mode ) {
3106 case float_round_nearest_even:
3107 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3108 ) {
3109 return
3110 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3111 }
3112 break;
3113 case float_round_down:
3114 return
3115 aSign ?
3116 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3117 : packFloatx80( 0, 0, 0 );
3118 case float_round_up:
3119 return
3120 aSign ? packFloatx80( 1, 0, 0 )
3121 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3122 }
3123 return packFloatx80( aSign, 0, 0 );
3124 }
3125 lastBitMask = 1;
3126 lastBitMask <<= 0x403E - aExp;
3127 roundBitsMask = lastBitMask - 1;
3128 z = a;
3129 roundingMode = float_rounding_mode;
3130 if ( roundingMode == float_round_nearest_even ) {
3131 z.low += lastBitMask>>1;
3132 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3133 }
3134 else if ( roundingMode != float_round_to_zero ) {
3135 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3136 z.low += roundBitsMask;
3137 }
3138 }
3139 z.low &= ~ roundBitsMask;
3140 if ( z.low == 0 ) {
3141 ++z.high;
3142 z.low = LIT64( 0x8000000000000000 );
3143 }
3144 if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3145 return z;
3146
3147 }
3148
3149 /*----------------------------------------------------------------------------
3150 | Returns the result of adding the absolute values of the extended double-
3151 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3152 | negated before being returned. `zSign' is ignored if the result is a NaN.
3153 | The addition is performed according to the IEC/IEEE Standard for Binary
3154 | Floating-Point Arithmetic.
3155 *----------------------------------------------------------------------------*/
3156
addFloatx80Sigs(floatx80 a,floatx80 b,flag zSign)3157 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3158 {
3159 int32 aExp, bExp, zExp;
3160 bits64 aSig, bSig, zSig0, zSig1;
3161 int32 expDiff;
3162
3163 aSig = extractFloatx80Frac( a );
3164 aExp = extractFloatx80Exp( a );
3165 bSig = extractFloatx80Frac( b );
3166 bExp = extractFloatx80Exp( b );
3167 expDiff = aExp - bExp;
3168 if ( 0 < expDiff ) {
3169 if ( aExp == 0x7FFF ) {
3170 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3171 return a;
3172 }
3173 if ( bExp == 0 ) --expDiff;
3174 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3175 zExp = aExp;
3176 }
3177 else if ( expDiff < 0 ) {
3178 if ( bExp == 0x7FFF ) {
3179 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3180 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3181 }
3182 if ( aExp == 0 ) ++expDiff;
3183 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3184 zExp = bExp;
3185 }
3186 else {
3187 if ( aExp == 0x7FFF ) {
3188 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3189 return propagateFloatx80NaN( a, b );
3190 }
3191 return a;
3192 }
3193 zSig1 = 0;
3194 zSig0 = aSig + bSig;
3195 if ( aExp == 0 ) {
3196 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3197 goto roundAndPack;
3198 }
3199 zExp = aExp;
3200 goto shiftRight1;
3201 }
3202 zSig0 = aSig + bSig;
3203 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3204 shiftRight1:
3205 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3206 zSig0 |= LIT64( 0x8000000000000000 );
3207 ++zExp;
3208 roundAndPack:
3209 return
3210 roundAndPackFloatx80(
3211 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3212
3213 }
3214
3215 /*----------------------------------------------------------------------------
3216 | Returns the result of subtracting the absolute values of the extended
3217 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3218 | difference is negated before being returned. `zSign' is ignored if the
3219 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3220 | Standard for Binary Floating-Point Arithmetic.
3221 *----------------------------------------------------------------------------*/
3222
subFloatx80Sigs(floatx80 a,floatx80 b,flag zSign)3223 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3224 {
3225 int32 aExp, bExp, zExp;
3226 bits64 aSig, bSig, zSig0, zSig1;
3227 int32 expDiff;
3228 floatx80 z;
3229
3230 aSig = extractFloatx80Frac( a );
3231 aExp = extractFloatx80Exp( a );
3232 bSig = extractFloatx80Frac( b );
3233 bExp = extractFloatx80Exp( b );
3234 expDiff = aExp - bExp;
3235 if ( 0 < expDiff ) goto aExpBigger;
3236 if ( expDiff < 0 ) goto bExpBigger;
3237 if ( aExp == 0x7FFF ) {
3238 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3239 return propagateFloatx80NaN( a, b );
3240 }
3241 float_raise( float_flag_invalid );
3242 z.low = floatx80_default_nan_low;
3243 z.high = floatx80_default_nan_high;
3244 return z;
3245 }
3246 if ( aExp == 0 ) {
3247 aExp = 1;
3248 bExp = 1;
3249 }
3250 zSig1 = 0;
3251 if ( bSig < aSig ) goto aBigger;
3252 if ( aSig < bSig ) goto bBigger;
3253 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3254 bExpBigger:
3255 if ( bExp == 0x7FFF ) {
3256 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3257 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3258 }
3259 if ( aExp == 0 ) ++expDiff;
3260 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3261 bBigger:
3262 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3263 zExp = bExp;
3264 zSign ^= 1;
3265 goto normalizeRoundAndPack;
3266 aExpBigger:
3267 if ( aExp == 0x7FFF ) {
3268 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3269 return a;
3270 }
3271 if ( bExp == 0 ) --expDiff;
3272 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3273 aBigger:
3274 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3275 zExp = aExp;
3276 normalizeRoundAndPack:
3277 return
3278 normalizeRoundAndPackFloatx80(
3279 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3280
3281 }
3282
3283 /*----------------------------------------------------------------------------
3284 | Returns the result of adding the extended double-precision floating-point
3285 | values `a' and `b'. The operation is performed according to the IEC/IEEE
3286 | Standard for Binary Floating-Point Arithmetic.
3287 *----------------------------------------------------------------------------*/
3288
floatx80_add(floatx80 a,floatx80 b)3289 floatx80 floatx80_add( floatx80 a, floatx80 b )
3290 {
3291 flag aSign, bSign;
3292
3293 aSign = extractFloatx80Sign( a );
3294 bSign = extractFloatx80Sign( b );
3295 if ( aSign == bSign ) {
3296 return addFloatx80Sigs( a, b, aSign );
3297 }
3298 else {
3299 return subFloatx80Sigs( a, b, aSign );
3300 }
3301
3302 }
3303
3304 /*----------------------------------------------------------------------------
3305 | Returns the result of subtracting the extended double-precision floating-
3306 | point values `a' and `b'. The operation is performed according to the
3307 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3308 *----------------------------------------------------------------------------*/
3309
floatx80_sub(floatx80 a,floatx80 b)3310 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3311 {
3312 flag aSign, bSign;
3313
3314 aSign = extractFloatx80Sign( a );
3315 bSign = extractFloatx80Sign( b );
3316 if ( aSign == bSign ) {
3317 return subFloatx80Sigs( a, b, aSign );
3318 }
3319 else {
3320 return addFloatx80Sigs( a, b, aSign );
3321 }
3322
3323 }
3324
3325 /*----------------------------------------------------------------------------
3326 | Returns the result of multiplying the extended double-precision floating-
3327 | point values `a' and `b'. The operation is performed according to the
3328 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3329 *----------------------------------------------------------------------------*/
3330
floatx80_mul(floatx80 a,floatx80 b)3331 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3332 {
3333 flag aSign, bSign, zSign;
3334 int32 aExp, bExp, zExp;
3335 bits64 aSig, bSig, zSig0, zSig1;
3336 floatx80 z;
3337
3338 aSig = extractFloatx80Frac( a );
3339 aExp = extractFloatx80Exp( a );
3340 aSign = extractFloatx80Sign( a );
3341 bSig = extractFloatx80Frac( b );
3342 bExp = extractFloatx80Exp( b );
3343 bSign = extractFloatx80Sign( b );
3344 zSign = aSign ^ bSign;
3345 if ( aExp == 0x7FFF ) {
3346 if ( (bits64) ( aSig<<1 )
3347 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3348 return propagateFloatx80NaN( a, b );
3349 }
3350 if ( ( bExp | bSig ) == 0 ) goto invalid;
3351 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3352 }
3353 if ( bExp == 0x7FFF ) {
3354 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3355 if ( ( aExp | aSig ) == 0 ) {
3356 invalid:
3357 float_raise( float_flag_invalid );
3358 z.low = floatx80_default_nan_low;
3359 z.high = floatx80_default_nan_high;
3360 return z;
3361 }
3362 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3363 }
3364 if ( aExp == 0 ) {
3365 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3366 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3367 }
3368 if ( bExp == 0 ) {
3369 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3370 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3371 }
3372 zExp = aExp + bExp - 0x3FFE;
3373 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3374 if ( 0 < (sbits64) zSig0 ) {
3375 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3376 --zExp;
3377 }
3378 return
3379 roundAndPackFloatx80(
3380 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3381
3382 }
3383
3384 /*----------------------------------------------------------------------------
3385 | Returns the result of dividing the extended double-precision floating-point
3386 | value `a' by the corresponding value `b'. The operation is performed
3387 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3388 *----------------------------------------------------------------------------*/
3389
floatx80_div(floatx80 a,floatx80 b)3390 floatx80 floatx80_div( floatx80 a, floatx80 b )
3391 {
3392 flag aSign, bSign, zSign;
3393 int32 aExp, bExp, zExp;
3394 bits64 aSig, bSig, zSig0, zSig1;
3395 bits64 rem0, rem1, rem2, term0, term1, term2;
3396 floatx80 z;
3397
3398 aSig = extractFloatx80Frac( a );
3399 aExp = extractFloatx80Exp( a );
3400 aSign = extractFloatx80Sign( a );
3401 bSig = extractFloatx80Frac( b );
3402 bExp = extractFloatx80Exp( b );
3403 bSign = extractFloatx80Sign( b );
3404 zSign = aSign ^ bSign;
3405 if ( aExp == 0x7FFF ) {
3406 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3407 if ( bExp == 0x7FFF ) {
3408 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3409 goto invalid;
3410 }
3411 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3412 }
3413 if ( bExp == 0x7FFF ) {
3414 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3415 return packFloatx80( zSign, 0, 0 );
3416 }
3417 if ( bExp == 0 ) {
3418 if ( bSig == 0 ) {
3419 if ( ( aExp | aSig ) == 0 ) {
3420 invalid:
3421 float_raise( float_flag_invalid );
3422 z.low = floatx80_default_nan_low;
3423 z.high = floatx80_default_nan_high;
3424 return z;
3425 }
3426 float_raise( float_flag_divbyzero );
3427 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3428 }
3429 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3430 }
3431 if ( aExp == 0 ) {
3432 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3433 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3434 }
3435 zExp = aExp - bExp + 0x3FFE;
3436 rem1 = 0;
3437 if ( bSig <= aSig ) {
3438 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3439 ++zExp;
3440 }
3441 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3442 mul64To128( bSig, zSig0, &term0, &term1 );
3443 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3444 while ( (sbits64) rem0 < 0 ) {
3445 --zSig0;
3446 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3447 }
3448 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3449 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3450 mul64To128( bSig, zSig1, &term1, &term2 );
3451 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3452 while ( (sbits64) rem1 < 0 ) {
3453 --zSig1;
3454 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3455 }
3456 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3457 }
3458 return
3459 roundAndPackFloatx80(
3460 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3461
3462 }
3463
3464 /*----------------------------------------------------------------------------
3465 | Returns the remainder of the extended double-precision floating-point value
3466 | `a' with respect to the corresponding value `b'. The operation is performed
3467 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3468 *----------------------------------------------------------------------------*/
3469
floatx80_rem(floatx80 a,floatx80 b)3470 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3471 {
3472 flag aSign, zSign;
3473 int32 aExp, bExp, expDiff;
3474 bits64 aSig0, aSig1, bSig;
3475 bits64 q, term0, term1, alternateASig0, alternateASig1;
3476 floatx80 z;
3477
3478 aSig0 = extractFloatx80Frac( a );
3479 aExp = extractFloatx80Exp( a );
3480 aSign = extractFloatx80Sign( a );
3481 bSig = extractFloatx80Frac( b );
3482 bExp = extractFloatx80Exp( b );
3483 // bSign = extractFloatx80Sign( b );
3484 if ( aExp == 0x7FFF ) {
3485 if ( (bits64) ( aSig0<<1 )
3486 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3487 return propagateFloatx80NaN( a, b );
3488 }
3489 goto invalid;
3490 }
3491 if ( bExp == 0x7FFF ) {
3492 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3493 return a;
3494 }
3495 if ( bExp == 0 ) {
3496 if ( bSig == 0 ) {
3497 invalid:
3498 float_raise( float_flag_invalid );
3499 z.low = floatx80_default_nan_low;
3500 z.high = floatx80_default_nan_high;
3501 return z;
3502 }
3503 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3504 }
3505 if ( aExp == 0 ) {
3506 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3507 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3508 }
3509 bSig |= LIT64( 0x8000000000000000 );
3510 zSign = aSign;
3511 expDiff = aExp - bExp;
3512 aSig1 = 0;
3513 if ( expDiff < 0 ) {
3514 if ( expDiff < -1 ) return a;
3515 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3516 expDiff = 0;
3517 }
3518 q = ( bSig <= aSig0 );
3519 if ( q ) aSig0 -= bSig;
3520 expDiff -= 64;
3521 while ( 0 < expDiff ) {
3522 q = estimateDiv128To64( aSig0, aSig1, bSig );
3523 q = ( 2 < q ) ? q - 2 : 0;
3524 mul64To128( bSig, q, &term0, &term1 );
3525 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3526 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3527 expDiff -= 62;
3528 }
3529 expDiff += 64;
3530 if ( 0 < expDiff ) {
3531 q = estimateDiv128To64( aSig0, aSig1, bSig );
3532 q = ( 2 < q ) ? q - 2 : 0;
3533 q >>= 64 - expDiff;
3534 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3535 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3536 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3537 while ( le128( term0, term1, aSig0, aSig1 ) ) {
3538 ++q;
3539 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3540 }
3541 }
3542 else {
3543 term1 = 0;
3544 term0 = bSig;
3545 }
3546 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3547 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3548 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3549 && ( q & 1 ) )
3550 ) {
3551 aSig0 = alternateASig0;
3552 aSig1 = alternateASig1;
3553 zSign = ! zSign;
3554 }
3555 return
3556 normalizeRoundAndPackFloatx80(
3557 80, zSign, bExp + expDiff, aSig0, aSig1 );
3558
3559 }
3560
3561 /*----------------------------------------------------------------------------
3562 | Returns the square root of the extended double-precision floating-point
3563 | value `a'. The operation is performed according to the IEC/IEEE Standard
3564 | for Binary Floating-Point Arithmetic.
3565 *----------------------------------------------------------------------------*/
3566
floatx80_sqrt(floatx80 a)3567 floatx80 floatx80_sqrt( floatx80 a )
3568 {
3569 flag aSign;
3570 int32 aExp, zExp;
3571 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3572 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3573 floatx80 z;
3574
3575 aSig0 = extractFloatx80Frac( a );
3576 aExp = extractFloatx80Exp( a );
3577 aSign = extractFloatx80Sign( a );
3578 if ( aExp == 0x7FFF ) {
3579 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3580 if ( ! aSign ) return a;
3581 goto invalid;
3582 }
3583 if ( aSign ) {
3584 if ( ( aExp | aSig0 ) == 0 ) return a;
3585 invalid:
3586 float_raise( float_flag_invalid );
3587 z.low = floatx80_default_nan_low;
3588 z.high = floatx80_default_nan_high;
3589 return z;
3590 }
3591 if ( aExp == 0 ) {
3592 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3593 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3594 }
3595 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3596 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3597 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3598 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3599 doubleZSig0 = zSig0<<1;
3600 mul64To128( zSig0, zSig0, &term0, &term1 );
3601 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3602 while ( (sbits64) rem0 < 0 ) {
3603 --zSig0;
3604 doubleZSig0 -= 2;
3605 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3606 }
3607 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3608 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3609 if ( zSig1 == 0 ) zSig1 = 1;
3610 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3611 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3612 mul64To128( zSig1, zSig1, &term2, &term3 );
3613 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3614 while ( (sbits64) rem1 < 0 ) {
3615 --zSig1;
3616 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3617 term3 |= 1;
3618 term2 |= doubleZSig0;
3619 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3620 }
3621 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3622 }
3623 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3624 zSig0 |= doubleZSig0;
3625 return
3626 roundAndPackFloatx80(
3627 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
3628
3629 }
3630
3631 /*----------------------------------------------------------------------------
3632 | Returns 1 if the extended double-precision floating-point value `a' is
3633 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3634 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3635 | Arithmetic.
3636 *----------------------------------------------------------------------------*/
3637
floatx80_eq(floatx80 a,floatx80 b)3638 flag floatx80_eq( floatx80 a, floatx80 b )
3639 {
3640 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3641 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3642 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3643 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3644 ) {
3645 if ( floatx80_is_signaling_nan( a )
3646 || floatx80_is_signaling_nan( b ) ) {
3647 float_raise( float_flag_invalid );
3648 }
3649 return 0;
3650 }
3651 return
3652 ( a.low == b.low )
3653 && ( ( a.high == b.high )
3654 || ( ( a.low == 0 )
3655 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3656 );
3657
3658 }
3659
3660 /*----------------------------------------------------------------------------
3661 | Returns 1 if the extended double-precision floating-point value `a' is
3662 | less than or equal to the corresponding value `b', and 0 otherwise. The
3663 | comparison is performed according to the IEC/IEEE Standard for Binary
3664 | Floating-Point Arithmetic.
3665 *----------------------------------------------------------------------------*/
3666
floatx80_le(floatx80 a,floatx80 b)3667 flag floatx80_le( floatx80 a, floatx80 b )
3668 {
3669 flag aSign, bSign;
3670
3671 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3672 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3673 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3674 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3675 ) {
3676 float_raise( float_flag_invalid );
3677 return 0;
3678 }
3679 aSign = extractFloatx80Sign( a );
3680 bSign = extractFloatx80Sign( b );
3681 if ( aSign != bSign ) {
3682 return
3683 aSign
3684 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3685 == 0 );
3686 }
3687 return
3688 aSign ? le128( b.high, b.low, a.high, a.low )
3689 : le128( a.high, a.low, b.high, b.low );
3690
3691 }
3692
3693 /*----------------------------------------------------------------------------
3694 | Returns 1 if the extended double-precision floating-point value `a' is
3695 | less than the corresponding value `b', and 0 otherwise. The comparison
3696 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3697 | Arithmetic.
3698 *----------------------------------------------------------------------------*/
3699
floatx80_lt(floatx80 a,floatx80 b)3700 flag floatx80_lt( floatx80 a, floatx80 b )
3701 {
3702 flag aSign, bSign;
3703
3704 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3705 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3706 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3707 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3708 ) {
3709 float_raise( float_flag_invalid );
3710 return 0;
3711 }
3712 aSign = extractFloatx80Sign( a );
3713 bSign = extractFloatx80Sign( b );
3714 if ( aSign != bSign ) {
3715 return
3716 aSign
3717 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3718 != 0 );
3719 }
3720 return
3721 aSign ? lt128( b.high, b.low, a.high, a.low )
3722 : lt128( a.high, a.low, b.high, b.low );
3723
3724 }
3725
3726 /*----------------------------------------------------------------------------
3727 | Returns 1 if the extended double-precision floating-point value `a' is equal
3728 | to the corresponding value `b', and 0 otherwise. The invalid exception is
3729 | raised if either operand is a NaN. Otherwise, the comparison is performed
3730 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3731 *----------------------------------------------------------------------------*/
3732
floatx80_eq_signaling(floatx80 a,floatx80 b)3733 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3734 {
3735 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3736 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3737 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3738 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3739 ) {
3740 float_raise( float_flag_invalid );
3741 return 0;
3742 }
3743 return
3744 ( a.low == b.low )
3745 && ( ( a.high == b.high )
3746 || ( ( a.low == 0 )
3747 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3748 );
3749
3750 }
3751
3752 /*----------------------------------------------------------------------------
3753 | Returns 1 if the extended double-precision floating-point value `a' is less
3754 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
3755 | do not cause an exception. Otherwise, the comparison is performed according
3756 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3757 *----------------------------------------------------------------------------*/
3758
floatx80_le_quiet(floatx80 a,floatx80 b)3759 flag floatx80_le_quiet( floatx80 a, floatx80 b )
3760 {
3761 flag aSign, bSign;
3762
3763 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3764 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3765 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3766 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3767 ) {
3768 if ( floatx80_is_signaling_nan( a )
3769 || floatx80_is_signaling_nan( b ) ) {
3770 float_raise( float_flag_invalid );
3771 }
3772 return 0;
3773 }
3774 aSign = extractFloatx80Sign( a );
3775 bSign = extractFloatx80Sign( b );
3776 if ( aSign != bSign ) {
3777 return
3778 aSign
3779 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3780 == 0 );
3781 }
3782 return
3783 aSign ? le128( b.high, b.low, a.high, a.low )
3784 : le128( a.high, a.low, b.high, b.low );
3785
3786 }
3787
3788 /*----------------------------------------------------------------------------
3789 | Returns 1 if the extended double-precision floating-point value `a' is less
3790 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
3791 | an exception. Otherwise, the comparison is performed according to the
3792 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3793 *----------------------------------------------------------------------------*/
3794
floatx80_lt_quiet(floatx80 a,floatx80 b)3795 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3796 {
3797 flag aSign, bSign;
3798
3799 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3800 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3801 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3802 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3803 ) {
3804 if ( floatx80_is_signaling_nan( a )
3805 || floatx80_is_signaling_nan( b ) ) {
3806 float_raise( float_flag_invalid );
3807 }
3808 return 0;
3809 }
3810 aSign = extractFloatx80Sign( a );
3811 bSign = extractFloatx80Sign( b );
3812 if ( aSign != bSign ) {
3813 return
3814 aSign
3815 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3816 != 0 );
3817 }
3818 return
3819 aSign ? lt128( b.high, b.low, a.high, a.low )
3820 : lt128( a.high, a.low, b.high, b.low );
3821
3822 }
3823
3824 #endif
3825
3826 #ifdef FLOAT128
3827
3828 /*----------------------------------------------------------------------------
3829 | Returns the result of converting the quadruple-precision floating-point
3830 | value `a' to the 32-bit two's complement integer format. The conversion
3831 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3832 | Arithmetic---which means in particular that the conversion is rounded
3833 | according to the current rounding mode. If `a' is a NaN, the largest
3834 | positive integer is returned. Otherwise, if the conversion overflows, the
3835 | largest integer with the same sign as `a' is returned.
3836 *----------------------------------------------------------------------------*/
3837
float128_to_int32(float128 a)3838 int32 float128_to_int32( float128 a )
3839 {
3840 flag aSign;
3841 int32 aExp, shiftCount;
3842 bits64 aSig0, aSig1;
3843
3844 aSig1 = extractFloat128Frac1( a );
3845 aSig0 = extractFloat128Frac0( a );
3846 aExp = extractFloat128Exp( a );
3847 aSign = extractFloat128Sign( a );
3848 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
3849 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
3850 aSig0 |= ( aSig1 != 0 );
3851 shiftCount = 0x4028 - aExp;
3852 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
3853 return roundAndPackInt32( aSign, aSig0 );
3854
3855 }
3856
3857 /*----------------------------------------------------------------------------
3858 | Returns the result of converting the quadruple-precision floating-point
3859 | value `a' to the 32-bit two's complement integer format. The conversion
3860 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3861 | Arithmetic, except that the conversion is always rounded toward zero. If
3862 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
3863 | conversion overflows, the largest integer with the same sign as `a' is
3864 | returned.
3865 *----------------------------------------------------------------------------*/
3866
float128_to_int32_round_to_zero(float128 a)3867 int32 float128_to_int32_round_to_zero( float128 a )
3868 {
3869 flag aSign;
3870 int32 aExp, shiftCount;
3871 bits64 aSig0, aSig1, savedASig;
3872 int32 z;
3873
3874 aSig1 = extractFloat128Frac1( a );
3875 aSig0 = extractFloat128Frac0( a );
3876 aExp = extractFloat128Exp( a );
3877 aSign = extractFloat128Sign( a );
3878 aSig0 |= ( aSig1 != 0 );
3879 if ( 0x401E < aExp ) {
3880 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
3881 goto invalid;
3882 }
3883 else if ( aExp < 0x3FFF ) {
3884 if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
3885 return 0;
3886 }
3887 aSig0 |= LIT64( 0x0001000000000000 );
3888 shiftCount = 0x402F - aExp;
3889 savedASig = aSig0;
3890 aSig0 >>= shiftCount;
3891 z = aSig0;
3892 if ( aSign ) z = - z;
3893 if ( ( z < 0 ) ^ aSign ) {
3894 invalid:
3895 float_raise( float_flag_invalid );
3896 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3897 }
3898 if ( ( aSig0<<shiftCount ) != savedASig ) {
3899 float_exception_flags |= float_flag_inexact;
3900 }
3901 return z;
3902
3903 }
3904
3905 /*----------------------------------------------------------------------------
3906 | Returns the result of converting the quadruple-precision floating-point
3907 | value `a' to the 64-bit two's complement integer format. The conversion
3908 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3909 | Arithmetic---which means in particular that the conversion is rounded
3910 | according to the current rounding mode. If `a' is a NaN, the largest
3911 | positive integer is returned. Otherwise, if the conversion overflows, the
3912 | largest integer with the same sign as `a' is returned.
3913 *----------------------------------------------------------------------------*/
3914
float128_to_int64(float128 a)3915 int64 float128_to_int64( float128 a )
3916 {
3917 flag aSign;
3918 int32 aExp, shiftCount;
3919 bits64 aSig0, aSig1;
3920
3921 aSig1 = extractFloat128Frac1( a );
3922 aSig0 = extractFloat128Frac0( a );
3923 aExp = extractFloat128Exp( a );
3924 aSign = extractFloat128Sign( a );
3925 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
3926 shiftCount = 0x402F - aExp;
3927 if ( shiftCount <= 0 ) {
3928 if ( 0x403E < aExp ) {
3929 float_raise( float_flag_invalid );
3930 if ( ! aSign
3931 || ( ( aExp == 0x7FFF )
3932 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
3933 )
3934 ) {
3935 return LIT64( 0x7FFFFFFFFFFFFFFF );
3936 }
3937 return (sbits64) LIT64( 0x8000000000000000 );
3938 }
3939 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
3940 }
3941 else {
3942 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
3943 }
3944 return roundAndPackInt64( aSign, aSig0, aSig1 );
3945
3946 }
3947
3948 /*----------------------------------------------------------------------------
3949 | Returns the result of converting the quadruple-precision floating-point
3950 | value `a' to the 64-bit two's complement integer format. The conversion
3951 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3952 | Arithmetic, except that the conversion is always rounded toward zero.
3953 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3954 | the conversion overflows, the largest integer with the same sign as `a' is
3955 | returned.
3956 *----------------------------------------------------------------------------*/
3957
float128_to_int64_round_to_zero(float128 a)3958 int64 float128_to_int64_round_to_zero( float128 a )
3959 {
3960 flag aSign;
3961 int32 aExp, shiftCount;
3962 bits64 aSig0, aSig1;
3963 int64 z;
3964
3965 aSig1 = extractFloat128Frac1( a );
3966 aSig0 = extractFloat128Frac0( a );
3967 aExp = extractFloat128Exp( a );
3968 aSign = extractFloat128Sign( a );
3969 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
3970 shiftCount = aExp - 0x402F;
3971 if ( 0 < shiftCount ) {
3972 if ( 0x403E <= aExp ) {
3973 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
3974 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
3975 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
3976 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
3977 }
3978 else {
3979 float_raise( float_flag_invalid );
3980 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
3981 return LIT64( 0x7FFFFFFFFFFFFFFF );
3982 }
3983 }
3984 return (sbits64) LIT64( 0x8000000000000000 );
3985 }
3986 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
3987 if ( (bits64) ( aSig1<<shiftCount ) ) {
3988 float_exception_flags |= float_flag_inexact;
3989 }
3990 }
3991 else {
3992 if ( aExp < 0x3FFF ) {
3993 if ( aExp | aSig0 | aSig1 ) {
3994 float_exception_flags |= float_flag_inexact;
3995 }
3996 return 0;
3997 }
3998 z = aSig0>>( - shiftCount );
3999 if ( aSig1
4000 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4001 float_exception_flags |= float_flag_inexact;
4002 }
4003 }
4004 if ( aSign ) z = - z;
4005 return z;
4006
4007 }
4008
4009 /*----------------------------------------------------------------------------
4010 | Returns the result of converting the quadruple-precision floating-point
4011 | value `a' to the single-precision floating-point format. The conversion
4012 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4013 | Arithmetic.
4014 *----------------------------------------------------------------------------*/
4015
float128_to_float32(float128 a)4016 float32 float128_to_float32( float128 a )
4017 {
4018 flag aSign;
4019 int32 aExp;
4020 bits64 aSig0, aSig1;
4021 bits32 zSig;
4022
4023 aSig1 = extractFloat128Frac1( a );
4024 aSig0 = extractFloat128Frac0( a );
4025 aExp = extractFloat128Exp( a );
4026 aSign = extractFloat128Sign( a );
4027 if ( aExp == 0x7FFF ) {
4028 if ( aSig0 | aSig1 ) {
4029 return commonNaNToFloat32( float128ToCommonNaN( a ) );
4030 }
4031 return packFloat32( aSign, 0xFF, 0 );
4032 }
4033 aSig0 |= ( aSig1 != 0 );
4034 shift64RightJamming( aSig0, 18, &aSig0 );
4035 zSig = aSig0;
4036 if ( aExp || zSig ) {
4037 zSig |= 0x40000000;
4038 aExp -= 0x3F81;
4039 }
4040 return roundAndPackFloat32( aSign, aExp, zSig );
4041
4042 }
4043
4044 /*----------------------------------------------------------------------------
4045 | Returns the result of converting the quadruple-precision floating-point
4046 | value `a' to the double-precision floating-point format. The conversion
4047 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4048 | Arithmetic.
4049 *----------------------------------------------------------------------------*/
4050
float128_to_float64(float128 a)4051 float64 float128_to_float64( float128 a )
4052 {
4053 flag aSign;
4054 int32 aExp;
4055 bits64 aSig0, aSig1;
4056
4057 aSig1 = extractFloat128Frac1( a );
4058 aSig0 = extractFloat128Frac0( a );
4059 aExp = extractFloat128Exp( a );
4060 aSign = extractFloat128Sign( a );
4061 if ( aExp == 0x7FFF ) {
4062 if ( aSig0 | aSig1 ) {
4063 return commonNaNToFloat64( float128ToCommonNaN( a ) );
4064 }
4065 return packFloat64( aSign, 0x7FF, 0 );
4066 }
4067 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4068 aSig0 |= ( aSig1 != 0 );
4069 if ( aExp || aSig0 ) {
4070 aSig0 |= LIT64( 0x4000000000000000 );
4071 aExp -= 0x3C01;
4072 }
4073 return roundAndPackFloat64( aSign, aExp, aSig0 );
4074
4075 }
4076
4077 #ifdef FLOATX80
4078
4079 /*----------------------------------------------------------------------------
4080 | Returns the result of converting the quadruple-precision floating-point
4081 | value `a' to the extended double-precision floating-point format. The
4082 | conversion is performed according to the IEC/IEEE Standard for Binary
4083 | Floating-Point Arithmetic.
4084 *----------------------------------------------------------------------------*/
4085
float128_to_floatx80(float128 a)4086 floatx80 float128_to_floatx80( float128 a )
4087 {
4088 flag aSign;
4089 int32 aExp;
4090 bits64 aSig0, aSig1;
4091
4092 aSig1 = extractFloat128Frac1( a );
4093 aSig0 = extractFloat128Frac0( a );
4094 aExp = extractFloat128Exp( a );
4095 aSign = extractFloat128Sign( a );
4096 if ( aExp == 0x7FFF ) {
4097 if ( aSig0 | aSig1 ) {
4098 return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4099 }
4100 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4101 }
4102 if ( aExp == 0 ) {
4103 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4104 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4105 }
4106 else {
4107 aSig0 |= LIT64( 0x0001000000000000 );
4108 }
4109 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4110 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4111
4112 }
4113
4114 #endif
4115
4116 /*----------------------------------------------------------------------------
4117 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4118 | returns the result as a quadruple-precision floating-point value. The
4119 | operation is performed according to the IEC/IEEE Standard for Binary
4120 | Floating-Point Arithmetic.
4121 *----------------------------------------------------------------------------*/
4122
float128_round_to_int(float128 a)4123 float128 float128_round_to_int( float128 a )
4124 {
4125 flag aSign;
4126 int32 aExp;
4127 bits64 lastBitMask, roundBitsMask;
4128 int8 roundingMode;
4129 float128 z;
4130
4131 aExp = extractFloat128Exp( a );
4132 if ( 0x402F <= aExp ) {
4133 if ( 0x406F <= aExp ) {
4134 if ( ( aExp == 0x7FFF )
4135 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4136 ) {
4137 return propagateFloat128NaN( a, a );
4138 }
4139 return a;
4140 }
4141 lastBitMask = 1;
4142 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4143 roundBitsMask = lastBitMask - 1;
4144 z = a;
4145 roundingMode = float_rounding_mode;
4146 if ( roundingMode == float_round_nearest_even ) {
4147 if ( lastBitMask ) {
4148 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4149 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4150 }
4151 else {
4152 if ( (sbits64) z.low < 0 ) {
4153 ++z.high;
4154 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4155 }
4156 }
4157 }
4158 else if ( roundingMode != float_round_to_zero ) {
4159 if ( extractFloat128Sign( z )
4160 ^ ( roundingMode == float_round_up ) ) {
4161 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4162 }
4163 }
4164 z.low &= ~ roundBitsMask;
4165 }
4166 else {
4167 if ( aExp < 0x3FFF ) {
4168 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4169 float_exception_flags |= float_flag_inexact;
4170 aSign = extractFloat128Sign( a );
4171 switch ( float_rounding_mode ) {
4172 case float_round_nearest_even:
4173 if ( ( aExp == 0x3FFE )
4174 && ( extractFloat128Frac0( a )
4175 | extractFloat128Frac1( a ) )
4176 ) {
4177 return packFloat128( aSign, 0x3FFF, 0, 0 );
4178 }
4179 break;
4180 case float_round_down:
4181 return
4182 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4183 : packFloat128( 0, 0, 0, 0 );
4184 case float_round_up:
4185 return
4186 aSign ? packFloat128( 1, 0, 0, 0 )
4187 : packFloat128( 0, 0x3FFF, 0, 0 );
4188 }
4189 return packFloat128( aSign, 0, 0, 0 );
4190 }
4191 lastBitMask = 1;
4192 lastBitMask <<= 0x402F - aExp;
4193 roundBitsMask = lastBitMask - 1;
4194 z.low = 0;
4195 z.high = a.high;
4196 roundingMode = float_rounding_mode;
4197 if ( roundingMode == float_round_nearest_even ) {
4198 z.high += lastBitMask>>1;
4199 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4200 z.high &= ~ lastBitMask;
4201 }
4202 }
4203 else if ( roundingMode != float_round_to_zero ) {
4204 if ( extractFloat128Sign( z )
4205 ^ ( roundingMode == float_round_up ) ) {
4206 z.high |= ( a.low != 0 );
4207 z.high += roundBitsMask;
4208 }
4209 }
4210 z.high &= ~ roundBitsMask;
4211 }
4212 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4213 float_exception_flags |= float_flag_inexact;
4214 }
4215 return z;
4216
4217 }
4218
4219 /*----------------------------------------------------------------------------
4220 | Returns the result of adding the absolute values of the quadruple-precision
4221 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4222 | before being returned. `zSign' is ignored if the result is a NaN.
4223 | The addition is performed according to the IEC/IEEE Standard for Binary
4224 | Floating-Point Arithmetic.
4225 *----------------------------------------------------------------------------*/
4226
addFloat128Sigs(float128 a,float128 b,flag zSign)4227 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4228 {
4229 int32 aExp, bExp, zExp;
4230 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4231 int32 expDiff;
4232
4233 aSig1 = extractFloat128Frac1( a );
4234 aSig0 = extractFloat128Frac0( a );
4235 aExp = extractFloat128Exp( a );
4236 bSig1 = extractFloat128Frac1( b );
4237 bSig0 = extractFloat128Frac0( b );
4238 bExp = extractFloat128Exp( b );
4239 expDiff = aExp - bExp;
4240 if ( 0 < expDiff ) {
4241 if ( aExp == 0x7FFF ) {
4242 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4243 return a;
4244 }
4245 if ( bExp == 0 ) {
4246 --expDiff;
4247 }
4248 else {
4249 bSig0 |= LIT64( 0x0001000000000000 );
4250 }
4251 shift128ExtraRightJamming(
4252 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4253 zExp = aExp;
4254 }
4255 else if ( expDiff < 0 ) {
4256 if ( bExp == 0x7FFF ) {
4257 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4258 return packFloat128( zSign, 0x7FFF, 0, 0 );
4259 }
4260 if ( aExp == 0 ) {
4261 ++expDiff;
4262 }
4263 else {
4264 aSig0 |= LIT64( 0x0001000000000000 );
4265 }
4266 shift128ExtraRightJamming(
4267 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4268 zExp = bExp;
4269 }
4270 else {
4271 if ( aExp == 0x7FFF ) {
4272 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4273 return propagateFloat128NaN( a, b );
4274 }
4275 return a;
4276 }
4277 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4278 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4279 zSig2 = 0;
4280 zSig0 |= LIT64( 0x0002000000000000 );
4281 zExp = aExp;
4282 goto shiftRight1;
4283 }
4284 aSig0 |= LIT64( 0x0001000000000000 );
4285 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4286 --zExp;
4287 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4288 ++zExp;
4289 shiftRight1:
4290 shift128ExtraRightJamming(
4291 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4292 roundAndPack:
4293 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4294
4295 }
4296
4297 /*----------------------------------------------------------------------------
4298 | Returns the result of subtracting the absolute values of the quadruple-
4299 | precision floating-point values `a' and `b'. If `zSign' is 1, the
4300 | difference is negated before being returned. `zSign' is ignored if the
4301 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4302 | Standard for Binary Floating-Point Arithmetic.
4303 *----------------------------------------------------------------------------*/
4304
subFloat128Sigs(float128 a,float128 b,flag zSign)4305 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4306 {
4307 int32 aExp, bExp, zExp;
4308 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4309 int32 expDiff;
4310 float128 z;
4311
4312 aSig1 = extractFloat128Frac1( a );
4313 aSig0 = extractFloat128Frac0( a );
4314 aExp = extractFloat128Exp( a );
4315 bSig1 = extractFloat128Frac1( b );
4316 bSig0 = extractFloat128Frac0( b );
4317 bExp = extractFloat128Exp( b );
4318 expDiff = aExp - bExp;
4319 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4320 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4321 if ( 0 < expDiff ) goto aExpBigger;
4322 if ( expDiff < 0 ) goto bExpBigger;
4323 if ( aExp == 0x7FFF ) {
4324 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4325 return propagateFloat128NaN( a, b );
4326 }
4327 float_raise( float_flag_invalid );
4328 z.low = float128_default_nan_low;
4329 z.high = float128_default_nan_high;
4330 return z;
4331 }
4332 if ( aExp == 0 ) {
4333 aExp = 1;
4334 bExp = 1;
4335 }
4336 if ( bSig0 < aSig0 ) goto aBigger;
4337 if ( aSig0 < bSig0 ) goto bBigger;
4338 if ( bSig1 < aSig1 ) goto aBigger;
4339 if ( aSig1 < bSig1 ) goto bBigger;
4340 return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4341 bExpBigger:
4342 if ( bExp == 0x7FFF ) {
4343 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4344 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4345 }
4346 if ( aExp == 0 ) {
4347 ++expDiff;
4348 }
4349 else {
4350 aSig0 |= LIT64( 0x4000000000000000 );
4351 }
4352 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4353 bSig0 |= LIT64( 0x4000000000000000 );
4354 bBigger:
4355 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4356 zExp = bExp;
4357 zSign ^= 1;
4358 goto normalizeRoundAndPack;
4359 aExpBigger:
4360 if ( aExp == 0x7FFF ) {
4361 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4362 return a;
4363 }
4364 if ( bExp == 0 ) {
4365 --expDiff;
4366 }
4367 else {
4368 bSig0 |= LIT64( 0x4000000000000000 );
4369 }
4370 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4371 aSig0 |= LIT64( 0x4000000000000000 );
4372 aBigger:
4373 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4374 zExp = aExp;
4375 normalizeRoundAndPack:
4376 --zExp;
4377 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4378
4379 }
4380
4381 /*----------------------------------------------------------------------------
4382 | Returns the result of adding the quadruple-precision floating-point values
4383 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4384 | for Binary Floating-Point Arithmetic.
4385 *----------------------------------------------------------------------------*/
4386
float128_add(float128 a,float128 b)4387 float128 float128_add( float128 a, float128 b )
4388 {
4389 flag aSign, bSign;
4390
4391 aSign = extractFloat128Sign( a );
4392 bSign = extractFloat128Sign( b );
4393 if ( aSign == bSign ) {
4394 return addFloat128Sigs( a, b, aSign );
4395 }
4396 else {
4397 return subFloat128Sigs( a, b, aSign );
4398 }
4399
4400 }
4401
4402 /*----------------------------------------------------------------------------
4403 | Returns the result of subtracting the quadruple-precision floating-point
4404 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4405 | Standard for Binary Floating-Point Arithmetic.
4406 *----------------------------------------------------------------------------*/
4407
float128_sub(float128 a,float128 b)4408 float128 float128_sub( float128 a, float128 b )
4409 {
4410 flag aSign, bSign;
4411
4412 aSign = extractFloat128Sign( a );
4413 bSign = extractFloat128Sign( b );
4414 if ( aSign == bSign ) {
4415 return subFloat128Sigs( a, b, aSign );
4416 }
4417 else {
4418 return addFloat128Sigs( a, b, aSign );
4419 }
4420
4421 }
4422
4423 /*----------------------------------------------------------------------------
4424 | Returns the result of multiplying the quadruple-precision floating-point
4425 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4426 | Standard for Binary Floating-Point Arithmetic.
4427 *----------------------------------------------------------------------------*/
4428
float128_mul(float128 a,float128 b)4429 float128 float128_mul( float128 a, float128 b )
4430 {
4431 flag aSign, bSign, zSign;
4432 int32 aExp, bExp, zExp;
4433 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4434 float128 z;
4435
4436 aSig1 = extractFloat128Frac1( a );
4437 aSig0 = extractFloat128Frac0( a );
4438 aExp = extractFloat128Exp( a );
4439 aSign = extractFloat128Sign( a );
4440 bSig1 = extractFloat128Frac1( b );
4441 bSig0 = extractFloat128Frac0( b );
4442 bExp = extractFloat128Exp( b );
4443 bSign = extractFloat128Sign( b );
4444 zSign = aSign ^ bSign;
4445 if ( aExp == 0x7FFF ) {
4446 if ( ( aSig0 | aSig1 )
4447 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4448 return propagateFloat128NaN( a, b );
4449 }
4450 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4451 return packFloat128( zSign, 0x7FFF, 0, 0 );
4452 }
4453 if ( bExp == 0x7FFF ) {
4454 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4455 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4456 invalid:
4457 float_raise( float_flag_invalid );
4458 z.low = float128_default_nan_low;
4459 z.high = float128_default_nan_high;
4460 return z;
4461 }
4462 return packFloat128( zSign, 0x7FFF, 0, 0 );
4463 }
4464 if ( aExp == 0 ) {
4465 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4466 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4467 }
4468 if ( bExp == 0 ) {
4469 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4470 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4471 }
4472 zExp = aExp + bExp - 0x4000;
4473 aSig0 |= LIT64( 0x0001000000000000 );
4474 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4475 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4476 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4477 zSig2 |= ( zSig3 != 0 );
4478 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4479 shift128ExtraRightJamming(
4480 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4481 ++zExp;
4482 }
4483 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4484
4485 }
4486
4487 /*----------------------------------------------------------------------------
4488 | Returns the result of dividing the quadruple-precision floating-point value
4489 | `a' by the corresponding value `b'. The operation is performed according to
4490 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4491 *----------------------------------------------------------------------------*/
4492
float128_div(float128 a,float128 b)4493 float128 float128_div( float128 a, float128 b )
4494 {
4495 flag aSign, bSign, zSign;
4496 int32 aExp, bExp, zExp;
4497 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4498 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4499 float128 z;
4500
4501 aSig1 = extractFloat128Frac1( a );
4502 aSig0 = extractFloat128Frac0( a );
4503 aExp = extractFloat128Exp( a );
4504 aSign = extractFloat128Sign( a );
4505 bSig1 = extractFloat128Frac1( b );
4506 bSig0 = extractFloat128Frac0( b );
4507 bExp = extractFloat128Exp( b );
4508 bSign = extractFloat128Sign( b );
4509 zSign = aSign ^ bSign;
4510 if ( aExp == 0x7FFF ) {
4511 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4512 if ( bExp == 0x7FFF ) {
4513 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4514 goto invalid;
4515 }
4516 return packFloat128( zSign, 0x7FFF, 0, 0 );
4517 }
4518 if ( bExp == 0x7FFF ) {
4519 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4520 return packFloat128( zSign, 0, 0, 0 );
4521 }
4522 if ( bExp == 0 ) {
4523 if ( ( bSig0 | bSig1 ) == 0 ) {
4524 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4525 invalid:
4526 float_raise( float_flag_invalid );
4527 z.low = float128_default_nan_low;
4528 z.high = float128_default_nan_high;
4529 return z;
4530 }
4531 float_raise( float_flag_divbyzero );
4532 return packFloat128( zSign, 0x7FFF, 0, 0 );
4533 }
4534 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4535 }
4536 if ( aExp == 0 ) {
4537 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4538 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4539 }
4540 zExp = aExp - bExp + 0x3FFD;
4541 shortShift128Left(
4542 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4543 shortShift128Left(
4544 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4545 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4546 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4547 ++zExp;
4548 }
4549 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4550 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4551 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4552 while ( (sbits64) rem0 < 0 ) {
4553 --zSig0;
4554 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4555 }
4556 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4557 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4558 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4559 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4560 while ( (sbits64) rem1 < 0 ) {
4561 --zSig1;
4562 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4563 }
4564 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4565 }
4566 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4567 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4568
4569 }
4570
4571 /*----------------------------------------------------------------------------
4572 | Returns the remainder of the quadruple-precision floating-point value `a'
4573 | with respect to the corresponding value `b'. The operation is performed
4574 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4575 *----------------------------------------------------------------------------*/
4576
float128_rem(float128 a,float128 b)4577 float128 float128_rem( float128 a, float128 b )
4578 {
4579 flag aSign, zSign;
4580 int32 aExp, bExp, expDiff;
4581 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4582 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4583 sbits64 sigMean0;
4584 float128 z;
4585
4586 aSig1 = extractFloat128Frac1( a );
4587 aSig0 = extractFloat128Frac0( a );
4588 aExp = extractFloat128Exp( a );
4589 aSign = extractFloat128Sign( a );
4590 bSig1 = extractFloat128Frac1( b );
4591 bSig0 = extractFloat128Frac0( b );
4592 bExp = extractFloat128Exp( b );
4593 // bSign = extractFloat128Sign( b );
4594 if ( aExp == 0x7FFF ) {
4595 if ( ( aSig0 | aSig1 )
4596 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4597 return propagateFloat128NaN( a, b );
4598 }
4599 goto invalid;
4600 }
4601 if ( bExp == 0x7FFF ) {
4602 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4603 return a;
4604 }
4605 if ( bExp == 0 ) {
4606 if ( ( bSig0 | bSig1 ) == 0 ) {
4607 invalid:
4608 float_raise( float_flag_invalid );
4609 z.low = float128_default_nan_low;
4610 z.high = float128_default_nan_high;
4611 return z;
4612 }
4613 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4614 }
4615 if ( aExp == 0 ) {
4616 if ( ( aSig0 | aSig1 ) == 0 ) return a;
4617 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4618 }
4619 expDiff = aExp - bExp;
4620 if ( expDiff < -1 ) return a;
4621 shortShift128Left(
4622 aSig0 | LIT64( 0x0001000000000000 ),
4623 aSig1,
4624 15 - ( expDiff < 0 ),
4625 &aSig0,
4626 &aSig1
4627 );
4628 shortShift128Left(
4629 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4630 q = le128( bSig0, bSig1, aSig0, aSig1 );
4631 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4632 expDiff -= 64;
4633 while ( 0 < expDiff ) {
4634 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4635 q = ( 4 < q ) ? q - 4 : 0;
4636 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4637 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4638 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4639 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4640 expDiff -= 61;
4641 }
4642 if ( -64 < expDiff ) {
4643 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4644 q = ( 4 < q ) ? q - 4 : 0;
4645 q >>= - expDiff;
4646 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4647 expDiff += 52;
4648 if ( expDiff < 0 ) {
4649 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4650 }
4651 else {
4652 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4653 }
4654 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4655 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4656 }
4657 else {
4658 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4659 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4660 }
4661 do {
4662 alternateASig0 = aSig0;
4663 alternateASig1 = aSig1;
4664 ++q;
4665 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4666 } while ( 0 <= (sbits64) aSig0 );
4667 add128(
4668 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
4669 if ( ( sigMean0 < 0 )
4670 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4671 aSig0 = alternateASig0;
4672 aSig1 = alternateASig1;
4673 }
4674 zSign = ( (sbits64) aSig0 < 0 );
4675 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4676 return
4677 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
4678
4679 }
4680
4681 /*----------------------------------------------------------------------------
4682 | Returns the square root of the quadruple-precision floating-point value `a'.
4683 | The operation is performed according to the IEC/IEEE Standard for Binary
4684 | Floating-Point Arithmetic.
4685 *----------------------------------------------------------------------------*/
4686
float128_sqrt(float128 a)4687 float128 float128_sqrt( float128 a )
4688 {
4689 flag aSign;
4690 int32 aExp, zExp;
4691 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
4692 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4693 float128 z;
4694
4695 aSig1 = extractFloat128Frac1( a );
4696 aSig0 = extractFloat128Frac0( a );
4697 aExp = extractFloat128Exp( a );
4698 aSign = extractFloat128Sign( a );
4699 if ( aExp == 0x7FFF ) {
4700 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
4701 if ( ! aSign ) return a;
4702 goto invalid;
4703 }
4704 if ( aSign ) {
4705 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
4706 invalid:
4707 float_raise( float_flag_invalid );
4708 z.low = float128_default_nan_low;
4709 z.high = float128_default_nan_high;
4710 return z;
4711 }
4712 if ( aExp == 0 ) {
4713 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
4714 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4715 }
4716 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
4717 aSig0 |= LIT64( 0x0001000000000000 );
4718 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
4719 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
4720 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4721 doubleZSig0 = zSig0<<1;
4722 mul64To128( zSig0, zSig0, &term0, &term1 );
4723 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4724 while ( (sbits64) rem0 < 0 ) {
4725 --zSig0;
4726 doubleZSig0 -= 2;
4727 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4728 }
4729 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4730 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
4731 if ( zSig1 == 0 ) zSig1 = 1;
4732 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4733 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4734 mul64To128( zSig1, zSig1, &term2, &term3 );
4735 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4736 while ( (sbits64) rem1 < 0 ) {
4737 --zSig1;
4738 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4739 term3 |= 1;
4740 term2 |= doubleZSig0;
4741 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4742 }
4743 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4744 }
4745 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
4746 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
4747
4748 }
4749
4750 /*----------------------------------------------------------------------------
4751 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
4752 | the corresponding value `b', and 0 otherwise. The comparison is performed
4753 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4754 *----------------------------------------------------------------------------*/
4755
float128_eq(float128 a,float128 b)4756 flag float128_eq( float128 a, float128 b )
4757 {
4758 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
4759 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4760 || ( ( extractFloat128Exp( b ) == 0x7FFF )
4761 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4762 ) {
4763 if ( float128_is_signaling_nan( a )
4764 || float128_is_signaling_nan( b ) ) {
4765 float_raise( float_flag_invalid );
4766 }
4767 return 0;
4768 }
4769 return
4770 ( a.low == b.low )
4771 && ( ( a.high == b.high )
4772 || ( ( a.low == 0 )
4773 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
4774 );
4775
4776 }
4777
4778 /*----------------------------------------------------------------------------
4779 | Returns 1 if the quadruple-precision floating-point value `a' is less than
4780 | or equal to the corresponding value `b', and 0 otherwise. The comparison
4781 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4782 | Arithmetic.
4783 *----------------------------------------------------------------------------*/
4784
float128_le(float128 a,float128 b)4785 flag float128_le( float128 a, float128 b )
4786 {
4787 flag aSign, bSign;
4788
4789 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
4790 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4791 || ( ( extractFloat128Exp( b ) == 0x7FFF )
4792 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4793 ) {
4794 float_raise( float_flag_invalid );
4795 return 0;
4796 }
4797 aSign = extractFloat128Sign( a );
4798 bSign = extractFloat128Sign( b );
4799 if ( aSign != bSign ) {
4800 return
4801 aSign
4802 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4803 == 0 );
4804 }
4805 return
4806 aSign ? le128( b.high, b.low, a.high, a.low )
4807 : le128( a.high, a.low, b.high, b.low );
4808
4809 }
4810
4811 /*----------------------------------------------------------------------------
4812 | Returns 1 if the quadruple-precision floating-point value `a' is less than
4813 | the corresponding value `b', and 0 otherwise. The comparison is performed
4814 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4815 *----------------------------------------------------------------------------*/
4816
float128_lt(float128 a,float128 b)4817 flag float128_lt( float128 a, float128 b )
4818 {
4819 flag aSign, bSign;
4820
4821 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
4822 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4823 || ( ( extractFloat128Exp( b ) == 0x7FFF )
4824 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4825 ) {
4826 float_raise( float_flag_invalid );
4827 return 0;
4828 }
4829 aSign = extractFloat128Sign( a );
4830 bSign = extractFloat128Sign( b );
4831 if ( aSign != bSign ) {
4832 return
4833 aSign
4834 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4835 != 0 );
4836 }
4837 return
4838 aSign ? lt128( b.high, b.low, a.high, a.low )
4839 : lt128( a.high, a.low, b.high, b.low );
4840
4841 }
4842
4843 /*----------------------------------------------------------------------------
4844 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
4845 | the corresponding value `b', and 0 otherwise. The invalid exception is
4846 | raised if either operand is a NaN. Otherwise, the comparison is performed
4847 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4848 *----------------------------------------------------------------------------*/
4849
float128_eq_signaling(float128 a,float128 b)4850 flag float128_eq_signaling( float128 a, float128 b )
4851 {
4852 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
4853 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4854 || ( ( extractFloat128Exp( b ) == 0x7FFF )
4855 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4856 ) {
4857 float_raise( float_flag_invalid );
4858 return 0;
4859 }
4860 return
4861 ( a.low == b.low )
4862 && ( ( a.high == b.high )
4863 || ( ( a.low == 0 )
4864 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
4865 );
4866
4867 }
4868
4869 /*----------------------------------------------------------------------------
4870 | Returns 1 if the quadruple-precision floating-point value `a' is less than
4871 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4872 | cause an exception. Otherwise, the comparison is performed according to the
4873 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4874 *----------------------------------------------------------------------------*/
4875
float128_le_quiet(float128 a,float128 b)4876 flag float128_le_quiet( float128 a, float128 b )
4877 {
4878 flag aSign, bSign;
4879
4880 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
4881 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4882 || ( ( extractFloat128Exp( b ) == 0x7FFF )
4883 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4884 ) {
4885 if ( float128_is_signaling_nan( a )
4886 || float128_is_signaling_nan( b ) ) {
4887 float_raise( float_flag_invalid );
4888 }
4889 return 0;
4890 }
4891 aSign = extractFloat128Sign( a );
4892 bSign = extractFloat128Sign( b );
4893 if ( aSign != bSign ) {
4894 return
4895 aSign
4896 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4897 == 0 );
4898 }
4899 return
4900 aSign ? le128( b.high, b.low, a.high, a.low )
4901 : le128( a.high, a.low, b.high, b.low );
4902
4903 }
4904
4905 /*----------------------------------------------------------------------------
4906 | Returns 1 if the quadruple-precision floating-point value `a' is less than
4907 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4908 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4909 | Standard for Binary Floating-Point Arithmetic.
4910 *----------------------------------------------------------------------------*/
4911
float128_lt_quiet(float128 a,float128 b)4912 flag float128_lt_quiet( float128 a, float128 b )
4913 {
4914 flag aSign, bSign;
4915
4916 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
4917 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4918 || ( ( extractFloat128Exp( b ) == 0x7FFF )
4919 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4920 ) {
4921 if ( float128_is_signaling_nan( a )
4922 || float128_is_signaling_nan( b ) ) {
4923 float_raise( float_flag_invalid );
4924 }
4925 return 0;
4926 }
4927 aSign = extractFloat128Sign( a );
4928 bSign = extractFloat128Sign( b );
4929 if ( aSign != bSign ) {
4930 return
4931 aSign
4932 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4933 != 0 );
4934 }
4935 return
4936 aSign ? lt128( b.high, b.low, a.high, a.low )
4937 : lt128( a.high, a.low, b.high, b.low );
4938
4939 }
4940
4941 #endif
4942