/* Copyright (C) 2007-2020 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

#ifndef _SQRT_MACROS_H_
#define _SQRT_MACROS_H_

#define FENCE __fence
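/* FENCE is assumed to be an evaluation barrier: __fence is presumably
   defined elsewhere in the BID library so that the compiler cannot
   reassociate or reorder the marked floating-point operations.  The
   compensated sums below depend on the operations being performed
   exactly in the order written.  */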

#if DOUBLE_EXTENDED_ON

extern BINARY80 SQRT80 (BINARY80);

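/* short_sqrt128: integer square root of a 128-bit unsigned value, computed
   through the 80-bit extended type.  A10 = A10.w[1]*2^64 + A10.w[0] is
   folded into a BINARY80, SQRT80 takes its square root, and the result is
   converted back to a 64-bit integer (the conversion of A10 to BINARY80 may
   itself round, so the result is the integer square root up to that
   rounding).  Illustrative use with hypothetical values:

     UINT128 a;
     a.w[1] = 1;                      // a = 2^64
     a.w[0] = 0;
     UINT64 s = short_sqrt128 (a);    // s == 2^32 = 0x100000000  */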
__BID_INLINE__ UINT64
short_sqrt128 (UINT128 A10) {
  BINARY80 lx, ly, l64;
  int_float f64;

  // 2^64
  f64.i = 0x5f800000;
  l64 = (BINARY80) f64.d;
  lx = (BINARY80) A10.w[1] * l64 + (BINARY80) A10.w[0];
  ly = SQRT80 (lx);
  return (UINT64) ly;
}


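/* long_sqrt128: square root of a 256-bit value C256, delivering a 128-bit
   coefficient in *pCS.  C256 is folded into an 80-bit value lx, SQRT80 gives
   a first approximation S, and a correction E ~= (C256 - S^2) / (2*S),
   computed with compensated 80-bit arithmetic, refines the low 64 bits.  */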
__BID_INLINE__ void
long_sqrt128 (UINT128 * pCS, UINT256 C256) {
  UINT256 C4;
  UINT128 CS;
  UINT64 X;
  SINT64 SE;
  BINARY80 l64, lm64, l128, lxL, lx, ly, lS, lSH, lSL, lE, l3, l2,
    l1, l0, lp, lCl;
  int_float fx, f64, fm64;
  int *ple = (int *) &lx;

  // 2^64
  f64.i = 0x5f800000;
  l64 = (BINARY80) f64.d;

  l128 = l64 * l64;
  lx = l3 = (BINARY80) C256.w[3] * l64 * l128;
  l2 = (BINARY80) C256.w[2] * l128;
  lx = FENCE (lx + l2);
  l1 = (BINARY80) C256.w[1] * l64;
  lx = FENCE (lx + l1);
  l0 = (BINARY80) C256.w[0];
  lx = FENCE (lx + l0);
  // sqrt(C256)
  lS = SQRT80 (lx);

  // get coefficient
  // 2^(-64)
  fm64.i = 0x1f800000;
  lm64 = (BINARY80) fm64.d;
  CS.w[1] = (UINT64) (lS * lm64);
  CS.w[0] = (UINT64) (lS - (BINARY80) CS.w[1] * l64);

  ///////////////////////////////////////
  //  CAUTION!
  //  Little-endian code only; a
  //  big-endian variant is still needed.
  //////////////////////////////////////
  lSH = lS;
  *((UINT64 *) & lSH) &= 0xffffffff00000000ull;

  // correction for C256 rounding
  lCl = FENCE (l3 - lx);
  lCl = FENCE (lCl + l2);
  lCl = FENCE (lCl + l1);
  lCl = FENCE (lCl + l0);

  lSL = lS - lSH;

  //////////////////////////////////////////
  //   Watch for compiler re-ordering
  //
  /////////////////////////////////////////
  // C256-S^2
  lxL = FENCE (lx - lSH * lSH);
  lp = lSH * lSL;
  lp += lp;
  lxL = FENCE (lxL - lp);
  lSL *= lSL;
  lxL = FENCE (lxL - lSL);
  lCl += lxL;
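  // At this point lCl approximates the full remainder C256 - S^2: the steps
  // above compute lx - S^2 without cancellation error by splitting S into a
  // high half lSH and a low half lSL (so each partial product is exact), and
  // the earlier lCl sum carried the bits of C256 lost when folding it into lx.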

  // correction term
  lE = lCl / (lS + lS);
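  // Since (S + E)^2 = S^2 + 2*S*E + E^2 and E is tiny compared with S,
  // E ~= (C256 - S^2) / (2*S); lE is that Newton-style correction.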

  // get low part of coefficient
  X = CS.w[0];
  if (lCl >= 0) {
    SE = (SINT64) (lE);
    CS.w[0] += SE;
    if (CS.w[0] < X)
      CS.w[1]++;
  } else {
    SE = (SINT64) (-lE);
    CS.w[0] -= SE;
    if (CS.w[0] > X)
      CS.w[1]--;
  }

  pCS->w[0] = CS.w[0];
  pCS->w[1] = CS.w[1];
}

#else

extern double sqrt (double);

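/* Without an 80-bit type, the square root is seeded with a binary64
   reciprocal square root and refined in integer arithmetic.  short_sqrt128:
   RS = 1/sqrt(A10) is computed in double precision, A10*RS is formed as a
   192-bit product, and one correction step with eps = A10*RS^2 - 1 (so that
   sqrt(A10) ~= A10*RS*(1 - eps/2 + ...)) recovers an accurate 64-bit result,
   rounded to nearest at the end.  */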
__BID_INLINE__ UINT64
short_sqrt128 (UINT128 A10) {
  UINT256 ARS, ARS0, AE0, AE, S;

  UINT64 MY, ES, CY;
  double lx, l64;
  int_double f64, ly;
  int ey, k;

  // 2^64
  f64.i = 0x43f0000000000000ull;
  l64 = f64.d;
  lx = (double) A10.w[1] * l64 + (double) A10.w[0];
  ly.d = 1.0 / sqrt (lx);

  MY = (ly.i & 0x000fffffffffffffull) | 0x0010000000000000ull;
  ey = 0x3ff - (ly.i >> 52);
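  // MY is the 53-bit significand of ly (hidden bit restored) and ey the
  // negated unbiased exponent, so the reciprocal square root seed is
  // RS = 1/sqrt(A10) ~= MY * 2^(-ey-52).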

  // A10*RS^2
  __mul_64x128_to_192 (ARS0, MY, A10);
  __mul_64x192_to_256 (ARS, MY, ARS0);

  // shr by 2*ey+40, to get a 64-bit value
  k = (ey << 1) + 104 - 64;
  if (k >= 128) {
    if (k > 128)
      ES = (ARS.w[2] >> (k - 128)) | (ARS.w[3] << (192 - k));
    else
      ES = ARS.w[2];
  } else {
    if (k >= 64) {
      ARS.w[0] = ARS.w[1];
      ARS.w[1] = ARS.w[2];
      k -= 64;
    }
    if (k) {
      __shr_128 (ARS, ARS, k);
    }
    ES = ARS.w[0];
  }

  ES = ((SINT64) ES) >> 1;
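  // ES now holds eps/2 scaled by 2^64 (two's complement), where
  // eps = A10*RS^2 - 1.  The first-order term of
  // sqrt(A10) = A10*RS*(1 - eps/2 + 3/8*eps^2 - ...) is applied below.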

  if (((SINT64) ES) < 0) {
    ES = -ES;

    // A*RS*eps (scaled by 2^64)
    __mul_64x192_to_256 (AE0, ES, ARS0);

    AE.w[0] = AE0.w[1];
    AE.w[1] = AE0.w[2];
    AE.w[2] = AE0.w[3];

    __add_carry_out (S.w[0], CY, ARS0.w[0], AE.w[0]);
    __add_carry_in_out (S.w[1], CY, ARS0.w[1], AE.w[1], CY);
    S.w[2] = ARS0.w[2] + AE.w[2] + CY;
  } else {
    // A*RS*eps (scaled by 2^64)
    __mul_64x192_to_256 (AE0, ES, ARS0);

    AE.w[0] = AE0.w[1];
    AE.w[1] = AE0.w[2];
    AE.w[2] = AE0.w[3];

    __sub_borrow_out (S.w[0], CY, ARS0.w[0], AE.w[0]);
    __sub_borrow_in_out (S.w[1], CY, ARS0.w[1], AE.w[1], CY);
    S.w[2] = ARS0.w[2] - AE.w[2] - CY;
  }

  k = ey + 51;
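  // S approximates sqrt(A10) scaled by 2^(ey+52); shifting right by
  // k = ey+51 leaves 2*sqrt(A10), and the final (S+1)>>1 rounds to nearest.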

  if (k >= 64) {
    if (k >= 128) {
      S.w[0] = S.w[2];
      S.w[1] = 0;
      k -= 128;
    } else {
      S.w[0] = S.w[1];
      S.w[1] = S.w[2];
    }
    k -= 64;
  }
  if (k) {
    __shr_128 (S, S, k);
  }


  return (UINT64) ((S.w[0] + 1) >> 1);

}


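/* long_sqrt128 (binary64 path): the same reciprocal-square-root scheme as
   short_sqrt128 above, but eps is kept as a signed 128-bit quantity and the
   second-order term 3/8*eps^2 of the series
   sqrt(C256) = C256*RS*(1 - eps/2 + 3/8*eps^2 - ...) is added as well before
   the 128-bit coefficient is rounded and stored in *pCS.  */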
__BID_INLINE__ void
long_sqrt128 (UINT128 * pCS, UINT256 C256) {
  UINT512 ARS0, ARS;
  UINT256 ARS00, AE, AE2, S;
  UINT128 ES, ES2, ARS1;
  UINT64 ES32, CY, MY;
  double l64, l128, lx, l2, l1, l0;
  int_double f64, ly;
  int ey, k, k2;

  // 2^64
  f64.i = 0x43f0000000000000ull;
  l64 = f64.d;

  l128 = l64 * l64;
  lx = (double) C256.w[3] * l64 * l128;
  l2 = (double) C256.w[2] * l128;
  lx = FENCE (lx + l2);
  l1 = (double) C256.w[1] * l64;
  lx = FENCE (lx + l1);
  l0 = (double) C256.w[0];
  lx = FENCE (lx + l0);
  // sqrt(C256)
  ly.d = 1.0 / sqrt (lx);

  MY = (ly.i & 0x000fffffffffffffull) | 0x0010000000000000ull;
  ey = 0x3ff - (ly.i >> 52);

  // C256*RS^2, scaled by 2^(2*ey+104)
  __mul_64x256_to_320 (ARS0, MY, C256);
  __mul_64x320_to_384 (ARS, MY, ARS0);

  // shr by (2*ey+104)-128; that amount is expected to lie in (192, 256)
  // when the result is in [10^33, 10^34), so after skipping three 64-bit
  // words only a shift by k in (0, 64) remains; an additional signed shift
  // by 1 is applied at the same time (to get eps = eps0/2)
  k = (ey << 1) + 104 - 128 - 192;
  k2 = 64 - k;
  ES.w[0] = (ARS.w[3] >> (k + 1)) | (ARS.w[4] << (k2 - 1));
  ES.w[1] = (ARS.w[4] >> k) | (ARS.w[5] << k2);
  ES.w[1] = ((SINT64) ES.w[1]) >> 1;
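  // ES now holds eps/2 scaled by 2^128 (two's complement), where
  // eps = C256*RS^2 - 1.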

  // A*RS >> 192 (for error term computation)
  ARS1.w[0] = ARS0.w[3];
  ARS1.w[1] = ARS0.w[4];

  // A*RS>>64
  ARS00.w[0] = ARS0.w[1];
  ARS00.w[1] = ARS0.w[2];
  ARS00.w[2] = ARS0.w[3];
  ARS00.w[3] = ARS0.w[4];

  if (((SINT64) ES.w[1]) < 0) {
    ES.w[0] = -ES.w[0];
    ES.w[1] = -ES.w[1];
    if (ES.w[0])
      ES.w[1]--;

    // A*RS*eps
    __mul_128x128_to_256 (AE, ES, ARS1);

    __add_carry_out (S.w[0], CY, ARS00.w[0], AE.w[0]);
    __add_carry_in_out (S.w[1], CY, ARS00.w[1], AE.w[1], CY);
    __add_carry_in_out (S.w[2], CY, ARS00.w[2], AE.w[2], CY);
    S.w[3] = ARS00.w[3] + AE.w[3] + CY;
  } else {
    // A*RS*eps
    __mul_128x128_to_256 (AE, ES, ARS1);

    __sub_borrow_out (S.w[0], CY, ARS00.w[0], AE.w[0]);
    __sub_borrow_in_out (S.w[1], CY, ARS00.w[1], AE.w[1], CY);
    __sub_borrow_in_out (S.w[2], CY, ARS00.w[2], AE.w[2], CY);
    S.w[3] = ARS00.w[3] - AE.w[3] - CY;
  }

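  // Second-order term: with ES ~= eps/2, the factor computed below is
  // (3/2)*(eps/2)^2 = 3/8*eps^2, i.e. the next term of the series
  // sqrt(C256) = C256*RS*(1 - eps/2 + 3/8*eps^2 - ...).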
  // 3/2*eps^2, scaled by 2^128
  ES32 = ES.w[1] + (ES.w[1] >> 1);
  __mul_64x64_to_128 (ES2, ES32, ES.w[1]);
  // A*RS*3/2*eps^2
  __mul_128x128_to_256 (AE2, ES2, ARS1);

  // result, scaled by 2^(ey+52-64)
  __add_carry_out (S.w[0], CY, S.w[0], AE2.w[0]);
  __add_carry_in_out (S.w[1], CY, S.w[1], AE2.w[1], CY);
  __add_carry_in_out (S.w[2], CY, S.w[2], AE2.w[2], CY);
  S.w[3] = S.w[3] + AE2.w[3] + CY;

  // k in (0, 64)
  k = ey + 51 - 128;
  k2 = 64 - k;
  S.w[0] = (S.w[1] >> k) | (S.w[2] << k2);
  S.w[1] = (S.w[2] >> k) | (S.w[3] << k2);

  // round to nearest
  S.w[0]++;
  if (!S.w[0])
    S.w[1]++;

  pCS->w[0] = (S.w[1] << 63) | (S.w[0] >> 1);
  pCS->w[1] = S.w[1] >> 1;

}

#endif /* DOUBLE_EXTENDED_ON */
#endif /* _SQRT_MACROS_H_ */