1 /* Copyright (C) 2007-2013 Free Software Foundation, Inc.
2 
3 This file is part of GCC.
4 
5 GCC is free software; you can redistribute it and/or modify it under
6 the terms of the GNU General Public License as published by the Free
7 Software Foundation; either version 3, or (at your option) any later
8 version.
9 
10 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11 WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13 for more details.
14 
15 Under Section 7 of GPL version 3, you are granted additional
16 permissions described in the GCC Runtime Library Exception, version
17 3.1, as published by the Free Software Foundation.
18 
19 You should have received a copy of the GNU General Public License and
20 a copy of the GCC Runtime Library Exception along with this program;
21 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
22 <http://www.gnu.org/licenses/>.  */
23 
24 /*****************************************************************************
25  *    BID64 square root
26  *****************************************************************************
27  *
28  *  Algorithm description:
29  *
30  *  if(exponent_x is odd)
31  *     scale coefficient_x by 10, adjust exponent
32  *  - get lower estimate for number of digits in coefficient_x
33  *  - scale coefficient x to between 31 and 33 decimal digits
34  *  - in parallel, check for exact case and return if true
35  *  - get high part of result coefficient using double precision sqrt
36  *  - compute remainder and refine coefficient in one iteration (which
37  *                                 modifies it by at most 1)
38  *  - result exponent is easy to compute from the adjusted arg. exponent
39  *
40  ****************************************************************************/
41 
42 #include "bid_internal.h"
43 #include "bid_sqrt_macros.h"
44 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
45 #include <fenv.h>
46 
47 #define FE_ALL_FLAGS FE_INVALID|FE_DIVBYZERO|FE_OVERFLOW|FE_UNDERFLOW|FE_INEXACT
48 #endif
49 
50 extern double sqrt (double);
51 
52 #if DECIMAL_CALL_BY_REFERENCE
53 
54 void
bid64_sqrt(UINT64 * pres,UINT64 * px _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM _EXC_INFO_PARAM)55 bid64_sqrt (UINT64 * pres,
56 	    UINT64 *
57 	    px _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
58 	    _EXC_INFO_PARAM) {
59   UINT64 x;
60 #else
61 
62 UINT64
63 bid64_sqrt (UINT64 x _RND_MODE_PARAM _EXC_FLAGS_PARAM
64 	    _EXC_MASKS_PARAM _EXC_INFO_PARAM) {
65 #endif
66   UINT128 CA, CT;
67   UINT64 sign_x, coefficient_x;
68   UINT64 Q, Q2, A10, C4, R, R2, QE, res;
69   SINT64 D;
70   int_double t_scale;
71   int_float tempx;
72   double da, dq, da_h, da_l, dqe;
73   int exponent_x, exponent_q, bin_expon_cx;
74   int digits_x;
75   int scale;
76 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
77   fexcept_t binaryflags = 0;
78 #endif
79 
80 #if DECIMAL_CALL_BY_REFERENCE
81 #if !DECIMAL_GLOBAL_ROUNDING
82   _IDEC_round rnd_mode = *prnd_mode;
83 #endif
84   x = *px;
85 #endif
86 
87   // unpack arguments, check for NaN or Infinity
88   if (!unpack_BID64 (&sign_x, &exponent_x, &coefficient_x, x)) {
89     // x is Inf. or NaN or 0
90     if ((x & INFINITY_MASK64) == INFINITY_MASK64) {
91       res = coefficient_x;
92       if ((coefficient_x & SSNAN_MASK64) == SINFINITY_MASK64)	// -Infinity
93       {
94 	res = NAN_MASK64;
95 #ifdef SET_STATUS_FLAGS
96 	__set_status_flags (pfpsf, INVALID_EXCEPTION);
97 #endif
98       }
99 #ifdef SET_STATUS_FLAGS
100       if ((x & SNAN_MASK64) == SNAN_MASK64)	// sNaN
101 	__set_status_flags (pfpsf, INVALID_EXCEPTION);
102 #endif
103       BID_RETURN (res & QUIET_MASK64);
104     }
105     // x is 0
106     exponent_x = (exponent_x + DECIMAL_EXPONENT_BIAS) >> 1;
107     res = sign_x | (((UINT64) exponent_x) << 53);
108     BID_RETURN (res);
109   }
110   // x<0?
111   if (sign_x && coefficient_x) {
112     res = NAN_MASK64;
113 #ifdef SET_STATUS_FLAGS
114     __set_status_flags (pfpsf, INVALID_EXCEPTION);
115 #endif
116     BID_RETURN (res);
117   }
118 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
119   (void) fegetexceptflag (&binaryflags, FE_ALL_FLAGS);
120 #endif
121   //--- get number of bits in the coefficient of x ---
122   tempx.d = (float) coefficient_x;
123   bin_expon_cx = ((tempx.i >> 23) & 0xff) - 0x7f;
124   digits_x = estimate_decimal_digits[bin_expon_cx];
125   // add test for range
126   if (coefficient_x >= power10_index_binexp[bin_expon_cx])
127     digits_x++;
128 
129   A10 = coefficient_x;
130   if (exponent_x & 1) {
131     A10 = (A10 << 2) + A10;
132     A10 += A10;
133   }
134 
135   dqe = sqrt ((double) A10);
136   QE = (UINT32) dqe;
137   if (QE * QE == A10) {
138     res =
139       very_fast_get_BID64 (0, (exponent_x + DECIMAL_EXPONENT_BIAS) >> 1,
140 			   QE);
141 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
142     (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
143 #endif
144     BID_RETURN (res);
145   }
146   // if exponent is odd, scale coefficient by 10
147   scale = 31 - digits_x;
148   exponent_q = exponent_x - scale;
149   scale += (exponent_q & 1);	// exp. bias is even
150 
151   CT = power10_table_128[scale];
152   __mul_64x128_short (CA, coefficient_x, CT);
153 
154   // 2^64
155   t_scale.i = 0x43f0000000000000ull;
156   // convert CA to DP
157   da_h = CA.w[1];
158   da_l = CA.w[0];
159   da = da_h * t_scale.d + da_l;
160 
161   dq = sqrt (da);
162 
163   Q = (UINT64) dq;
164 
165   // get sign(sqrt(CA)-Q)
166   R = CA.w[0] - Q * Q;
167   R = ((SINT64) R) >> 63;
168   D = R + R + 1;
169 
170   exponent_q = (exponent_q + DECIMAL_EXPONENT_BIAS) >> 1;
171 
172 #ifdef SET_STATUS_FLAGS
173   __set_status_flags (pfpsf, INEXACT_EXCEPTION);
174 #endif
175 
176 #ifndef IEEE_ROUND_NEAREST
177 #ifndef IEEE_ROUND_NEAREST_TIES_AWAY
178   if (!((rnd_mode) & 3)) {
179 #endif
180 #endif
181 
182     // midpoint to check
183     Q2 = Q + Q + D;
184     C4 = CA.w[0] << 2;
185 
186     // get sign(-sqrt(CA)+Midpoint)
187     R2 = Q2 * Q2 - C4;
188     R2 = ((SINT64) R2) >> 63;
189 
190     // adjust Q if R!=R2
191     Q += (D & (R ^ R2));
192 #ifndef IEEE_ROUND_NEAREST
193 #ifndef IEEE_ROUND_NEAREST_TIES_AWAY
194   } else {
195     C4 = CA.w[0];
196     Q += D;
197     if ((SINT64) (Q * Q - C4) > 0)
198       Q--;
199     if (rnd_mode == ROUNDING_UP)
200       Q++;
201   }
202 #endif
203 #endif
204 
205   res = fast_get_BID64 (0, exponent_q, Q);
206 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
207   (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
208 #endif
209   BID_RETURN (res);
210 }
211 
212 
213 TYPE0_FUNCTION_ARG1 (UINT64, bid64q_sqrt, x)
214 
215      UINT256 M256, C4, C8;
216      UINT128 CX, CX2, A10, S2, T128, CS, CSM, CS2, C256, CS1,
217        mul_factor2_long = { {0x0ull, 0x0ull} }, QH, Tmp, TP128, Qh, Ql;
218 UINT64 sign_x, Carry, B10, res, mul_factor, mul_factor2 = 0x0ull, CS0;
219 SINT64 D;
220 int_float fx, f64;
221 int exponent_x, bin_expon_cx, done = 0;
222 int digits, scale, exponent_q = 0, exact = 1, amount, extra_digits;
223 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
224 fexcept_t binaryflags = 0;
225 #endif
226 
227 	// unpack arguments, check for NaN or Infinity
228 if (!unpack_BID128_value (&sign_x, &exponent_x, &CX, x)) {
229   res = CX.w[1];
230   // NaN ?
231   if ((x.w[1] & 0x7c00000000000000ull) == 0x7c00000000000000ull) {
232 #ifdef SET_STATUS_FLAGS
233     if ((x.w[1] & 0x7e00000000000000ull) == 0x7e00000000000000ull)	// sNaN
234       __set_status_flags (pfpsf, INVALID_EXCEPTION);
235 #endif
236     Tmp.w[1] = (CX.w[1] & 0x00003fffffffffffull);
237     Tmp.w[0] = CX.w[0];
238     TP128 = reciprocals10_128[18];
239     __mul_128x128_full (Qh, Ql, Tmp, TP128);
240     amount = recip_scale[18];
241     __shr_128 (Tmp, Qh, amount);
242     res = (CX.w[1] & 0xfc00000000000000ull) | Tmp.w[0];
243     BID_RETURN (res);
244   }
245   // x is Infinity?
246   if ((x.w[1] & 0x7800000000000000ull) == 0x7800000000000000ull) {
247     if (sign_x) {
248       // -Inf, return NaN
249       res = 0x7c00000000000000ull;
250 #ifdef SET_STATUS_FLAGS
251       __set_status_flags (pfpsf, INVALID_EXCEPTION);
252 #endif
253     }
254     BID_RETURN (res);
255   }
256   // x is 0 otherwise
257 
258   exponent_x =
259     ((exponent_x - DECIMAL_EXPONENT_BIAS_128) >> 1) +
260     DECIMAL_EXPONENT_BIAS;
261   if (exponent_x < 0)
262     exponent_x = 0;
263   if (exponent_x > DECIMAL_MAX_EXPON_64)
264     exponent_x = DECIMAL_MAX_EXPON_64;
265   //res= sign_x | (((UINT64)exponent_x)<<53);
266   res = get_BID64 (sign_x, exponent_x, 0, rnd_mode, pfpsf);
267   BID_RETURN (res);
268 }
269 if (sign_x) {
270   res = 0x7c00000000000000ull;
271 #ifdef SET_STATUS_FLAGS
272   __set_status_flags (pfpsf, INVALID_EXCEPTION);
273 #endif
274   BID_RETURN (res);
275 }
276 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
277 (void) fegetexceptflag (&binaryflags, FE_ALL_FLAGS);
278 #endif
279 
280 	   // 2^64
281 f64.i = 0x5f800000;
282 
283 	   // fx ~ CX
284 fx.d = (float) CX.w[1] * f64.d + (float) CX.w[0];
285 bin_expon_cx = ((fx.i >> 23) & 0xff) - 0x7f;
286 digits = estimate_decimal_digits[bin_expon_cx];
287 
288 A10 = CX;
289 if (exponent_x & 1) {
290   A10.w[1] = (CX.w[1] << 3) | (CX.w[0] >> 61);
291   A10.w[0] = CX.w[0] << 3;
292   CX2.w[1] = (CX.w[1] << 1) | (CX.w[0] >> 63);
293   CX2.w[0] = CX.w[0] << 1;
294   __add_128_128 (A10, A10, CX2);
295 }
296 
297 C256.w[1] = A10.w[1];
298 C256.w[0] = A10.w[0];
299 CS.w[0] = short_sqrt128 (A10);
300 CS.w[1] = 0;
301 mul_factor = 0;
302 	   // check for exact result
303 if (CS.w[0] < 10000000000000000ull) {
304   if (CS.w[0] * CS.w[0] == A10.w[0]) {
305     __sqr64_fast (S2, CS.w[0]);
306     if (S2.w[1] == A10.w[1])	// && S2.w[0]==A10.w[0])
307     {
308       res =
309 	get_BID64 (0,
310 		   ((exponent_x - DECIMAL_EXPONENT_BIAS_128) >> 1) +
311 		   DECIMAL_EXPONENT_BIAS, CS.w[0], rnd_mode, pfpsf);
312 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
313       (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
314 #endif
315       BID_RETURN (res);
316     }
317   }
318   if (CS.w[0] >= 1000000000000000ull) {
319     done = 1;
320     exponent_q = exponent_x;
321     C256.w[1] = A10.w[1];
322     C256.w[0] = A10.w[0];
323   }
324 #ifdef SET_STATUS_FLAGS
325   __set_status_flags (pfpsf, INEXACT_EXCEPTION);
326 #endif
327   exact = 0;
328 } else {
329   B10 = 0x3333333333333334ull;
330   __mul_64x64_to_128_full (CS2, CS.w[0], B10);
331   CS0 = CS2.w[1] >> 1;
332   if (CS.w[0] != ((CS0 << 3) + (CS0 << 1))) {
333 #ifdef SET_STATUS_FLAGS
334     __set_status_flags (pfpsf, INEXACT_EXCEPTION);
335 #endif
336     exact = 0;
337   }
338   done = 1;
339   CS.w[0] = CS0;
340   exponent_q = exponent_x + 2;
341   mul_factor = 10;
342   mul_factor2 = 100;
343   if (CS.w[0] >= 10000000000000000ull) {
344     __mul_64x64_to_128_full (CS2, CS.w[0], B10);
345     CS0 = CS2.w[1] >> 1;
346     if (CS.w[0] != ((CS0 << 3) + (CS0 << 1))) {
347 #ifdef SET_STATUS_FLAGS
348       __set_status_flags (pfpsf, INEXACT_EXCEPTION);
349 #endif
350       exact = 0;
351     }
352     exponent_q += 2;
353     CS.w[0] = CS0;
354     mul_factor = 100;
355     mul_factor2 = 10000;
356   }
357   if (exact) {
358     CS0 = CS.w[0] * mul_factor;
359     __sqr64_fast (CS1, CS0)
360       if ((CS1.w[0] != A10.w[0]) || (CS1.w[1] != A10.w[1])) {
361 #ifdef SET_STATUS_FLAGS
362       __set_status_flags (pfpsf, INEXACT_EXCEPTION);
363 #endif
364       exact = 0;
365     }
366   }
367 }
368 
369 if (!done) {
370   // get number of digits in CX
371   D = CX.w[1] - power10_index_binexp_128[bin_expon_cx].w[1];
372   if (D > 0
373       || (!D && CX.w[0] >= power10_index_binexp_128[bin_expon_cx].w[0]))
374     digits++;
375 
376   // if exponent is odd, scale coefficient by 10
377   scale = 31 - digits;
378   exponent_q = exponent_x - scale;
379   scale += (exponent_q & 1);	// exp. bias is even
380 
381   T128 = power10_table_128[scale];
382   __mul_128x128_low (C256, CX, T128);
383 
384 
385   CS.w[0] = short_sqrt128 (C256);
386 }
387    //printf("CS=%016I64x\n",CS.w[0]);
388 
389 exponent_q =
390   ((exponent_q - DECIMAL_EXPONENT_BIAS_128) >> 1) +
391   DECIMAL_EXPONENT_BIAS;
392 if ((exponent_q < 0) && (exponent_q + MAX_FORMAT_DIGITS >= 0)) {
393   extra_digits = -exponent_q;
394   exponent_q = 0;
395 
396   // get coeff*(2^M[extra_digits])/10^extra_digits
397   __mul_64x64_to_128 (QH, CS.w[0], reciprocals10_64[extra_digits]);
398 
399   // now get P/10^extra_digits: shift Q_high right by M[extra_digits]-128
400   amount = short_recip_scale[extra_digits];
401 
402   CS0 = QH.w[1] >> amount;
403 
404 #ifdef SET_STATUS_FLAGS
405   if (exact) {
406     if (CS.w[0] != CS0 * power10_table_128[extra_digits].w[0])
407       exact = 0;
408   }
409   if (!exact)
410     __set_status_flags (pfpsf, UNDERFLOW_EXCEPTION | INEXACT_EXCEPTION);
411 #endif
412 
413   CS.w[0] = CS0;
414   if (!mul_factor)
415     mul_factor = 1;
416   mul_factor *= power10_table_128[extra_digits].w[0];
417   __mul_64x64_to_128 (mul_factor2_long, mul_factor, mul_factor);
418   if (mul_factor2_long.w[1])
419     mul_factor2 = 0;
420   else
421     mul_factor2 = mul_factor2_long.w[1];
422 }
423 	   // 4*C256
424 C4.w[1] = (C256.w[1] << 2) | (C256.w[0] >> 62);
425 C4.w[0] = C256.w[0] << 2;
426 
427 #ifndef IEEE_ROUND_NEAREST
428 #ifndef IEEE_ROUND_NEAREST_TIES_AWAY
429 if (!((rnd_mode) & 3)) {
430 #endif
431 #endif
432   // compare to midpoints
433   CSM.w[0] = (CS.w[0] + CS.w[0]) | 1;
434   //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x\n",C4.w[1],C4.w[0],CSM.w[1],CSM.w[0], CS.w[0]);
435   if (mul_factor)
436     CSM.w[0] *= mul_factor;
437   // CSM^2
438   __mul_64x64_to_128 (M256, CSM.w[0], CSM.w[0]);
439   //__mul_128x128_to_256(M256, CSM, CSM);
440 
441   if (C4.w[1] > M256.w[1] ||
442       (C4.w[1] == M256.w[1] && C4.w[0] > M256.w[0])) {
443     // round up
444     CS.w[0]++;
445   } else {
446     C8.w[0] = CS.w[0] << 3;
447     C8.w[1] = 0;
448     if (mul_factor) {
449       if (mul_factor2) {
450 	__mul_64x64_to_128 (C8, C8.w[0], mul_factor2);
451       } else {
452 	__mul_64x128_low (C8, C8.w[0], mul_factor2_long);
453       }
454     }
455     // M256 - 8*CSM
456     __sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]);
457     M256.w[1] = M256.w[1] - C8.w[1] - Carry;
458 
459     // if CSM' > C256, round up
460     if (M256.w[1] > C4.w[1] ||
461 	(M256.w[1] == C4.w[1] && M256.w[0] > C4.w[0])) {
462       // round down
463       if (CS.w[0])
464 	CS.w[0]--;
465     }
466   }
467 #ifndef IEEE_ROUND_NEAREST
468 #ifndef IEEE_ROUND_NEAREST_TIES_AWAY
469 } else {
470   CS.w[0]++;
471   CSM.w[0] = CS.w[0];
472   C8.w[0] = CSM.w[0] << 1;
473   if (mul_factor)
474     CSM.w[0] *= mul_factor;
475   __mul_64x64_to_128 (M256, CSM.w[0], CSM.w[0]);
476   C8.w[1] = 0;
477   if (mul_factor) {
478     if (mul_factor2) {
479       __mul_64x64_to_128 (C8, C8.w[0], mul_factor2);
480     } else {
481       __mul_64x128_low (C8, C8.w[0], mul_factor2_long);
482     }
483   }
484   //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x\n",C256.w[1],C256.w[0],M256.w[1],M256.w[0], CS.w[0]);
485 
486   if (M256.w[1] > C256.w[1] ||
487       (M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0])) {
488     __sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]);
489     M256.w[1] = M256.w[1] - Carry - C8.w[1];
490     M256.w[0]++;
491     if (!M256.w[0]) {
492       M256.w[1]++;
493 
494     }
495 
496     if ((M256.w[1] > C256.w[1] ||
497 	 (M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0]))
498 	&& (CS.w[0] > 1)) {
499 
500       CS.w[0]--;
501 
502       if (CS.w[0] > 1) {
503 	__sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]);
504 	M256.w[1] = M256.w[1] - Carry - C8.w[1];
505 	M256.w[0]++;
506 	if (!M256.w[0]) {
507 	  M256.w[1]++;
508 	}
509 
510 	if (M256.w[1] > C256.w[1] ||
511 	    (M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0]))
512 	  CS.w[0]--;
513       }
514     }
515   }
516 
517   else {
518 				/*__add_carry_out(M256.w[0], Carry, M256.w[0], C8.w[0]);
519 				M256.w[1] = M256.w[1] + Carry + C8.w[1];
520 				M256.w[0]++;
521 				if(!M256.w[0])
522 				{
523 					M256.w[1]++;
524 				}
525 				CS.w[0]++;
526 			if(M256.w[1]<C256.w[1] ||
527 				(M256.w[1]==C256.w[1] && M256.w[0]<=C256.w[0]))
528 			{
529 				CS.w[0]++;
530 			}*/
531     CS.w[0]++;
532   }
533   //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x %d\n",C4.w[1],C4.w[0],M256.w[1],M256.w[0], CS.w[0], exact);
534   // RU?
535   if (((rnd_mode) != ROUNDING_UP) || exact) {
536     if (CS.w[0])
537       CS.w[0]--;
538   }
539 
540 }
541 #endif
542 #endif
543  //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x %d\n",C4.w[1],C4.w[0],M256.w[1],M256.w[0], CS.w[0], exact);
544 
545 res = get_BID64 (0, exponent_q, CS.w[0], rnd_mode, pfpsf);
546 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
547 (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
548 #endif
549 BID_RETURN (res);
550 
551 
552 }
553