1 /* Copyright (C) 2007-2016 Free Software Foundation, Inc.
2 
3 This file is part of GCC.
4 
5 GCC is free software; you can redistribute it and/or modify it under
6 the terms of the GNU General Public License as published by the Free
7 Software Foundation; either version 3, or (at your option) any later
8 version.
9 
10 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11 WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13 for more details.
14 
15 Under Section 7 of GPL version 3, you are granted additional
16 permissions described in the GCC Runtime Library Exception, version
17 3.1, as published by the Free Software Foundation.
18 
19 You should have received a copy of the GNU General Public License and
20 a copy of the GCC Runtime Library Exception along with this program;
21 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
22 <http://www.gnu.org/licenses/>.  */
23 
24 #ifndef _SQRT_MACROS_H_
25 #define _SQRT_MACROS_H_
26 
27 #define FENCE __fence
28 
29 #if DOUBLE_EXTENDED_ON
30 
31 extern BINARY80 SQRT80 (BINARY80);
32 
33 
34 __BID_INLINE__ UINT64
short_sqrt128(UINT128 A10)35 short_sqrt128 (UINT128 A10) {
36   BINARY80 lx, ly, l64;
37   int_float f64;
38 
39   // 2^64
40   f64.i = 0x5f800000;
41   l64 = (BINARY80) f64.d;
42   lx = (BINARY80) A10.w[1] * l64 + (BINARY80) A10.w[0];
43   ly = SQRT80 (lx);
44   return (UINT64) ly;
45 }
46 
47 
48 __BID_INLINE__ void
long_sqrt128(UINT128 * pCS,UINT256 C256)49 long_sqrt128 (UINT128 * pCS, UINT256 C256) {
50   UINT256 C4;
51   UINT128 CS;
52   UINT64 X;
53   SINT64 SE;
54   BINARY80 l64, lm64, l128, lxL, lx, ly, lS, lSH, lSL, lE, l3, l2,
55     l1, l0, lp, lCl;
56   int_float fx, f64, fm64;
57   int *ple = (int *) &lx;
58 
59   // 2^64
60   f64.i = 0x5f800000;
61   l64 = (BINARY80) f64.d;
62 
63   l128 = l64 * l64;
64   lx = l3 = (BINARY80) C256.w[3] * l64 * l128;
65   l2 = (BINARY80) C256.w[2] * l128;
66   lx = FENCE (lx + l2);
67   l1 = (BINARY80) C256.w[1] * l64;
68   lx = FENCE (lx + l1);
69   l0 = (BINARY80) C256.w[0];
70   lx = FENCE (lx + l0);
71   // sqrt(C256)
72   lS = SQRT80 (lx);
73 
74   // get coefficient
75   // 2^(-64)
76   fm64.i = 0x1f800000;
77   lm64 = (BINARY80) fm64.d;
78   CS.w[1] = (UINT64) (lS * lm64);
79   CS.w[0] = (UINT64) (lS - (BINARY80) CS.w[1] * l64);
80 
81   ///////////////////////////////////////
82   //  CAUTION!
83   //  little endian code only
84   //  add solution for big endian
85   //////////////////////////////////////
86   lSH = lS;
87   *((UINT64 *) & lSH) &= 0xffffffff00000000ull;
88 
89   // correction for C256 rounding
90   lCl = FENCE (l3 - lx);
91   lCl = FENCE (lCl + l2);
92   lCl = FENCE (lCl + l1);
93   lCl = FENCE (lCl + l0);
94 
95   lSL = lS - lSH;
96 
97   //////////////////////////////////////////
98   //   Watch for compiler re-ordering
99   //
100   /////////////////////////////////////////
101   // C256-S^2
102   lxL = FENCE (lx - lSH * lSH);
103   lp = lSH * lSL;
104   lp += lp;
105   lxL = FENCE (lxL - lp);
106   lSL *= lSL;
107   lxL = FENCE (lxL - lSL);
108   lCl += lxL;
109 
110   // correction term
111   lE = lCl / (lS + lS);
112 
113   // get low part of coefficient
114   X = CS.w[0];
115   if (lCl >= 0) {
116     SE = (SINT64) (lE);
117     CS.w[0] += SE;
118     if (CS.w[0] < X)
119       CS.w[1]++;
120   } else {
121     SE = (SINT64) (-lE);
122     CS.w[0] -= SE;
123     if (CS.w[0] > X)
124       CS.w[1]--;
125   }
126 
127   pCS->w[0] = CS.w[0];
128   pCS->w[1] = CS.w[1];
129 }
130 
131 #else
132 
133 extern double sqrt (double);
134 
/* Return the integer square root (as UINT64) of the 128-bit value A10,
   using only binary64 arithmetic: a double-precision reciprocal-sqrt
   estimate is refined with one correction step carried out in exact
   multi-word integer arithmetic.  */
__BID_INLINE__ UINT64
short_sqrt128 (UINT128 A10) {
  UINT256 ARS, ARS0, AE0, AE, S;

  UINT64 MY, ES, CY;
  double lx, l64;
  int_double f64, ly;
  int ey, k;

  // 2^64
  f64.i = 0x43f0000000000000ull;
  l64 = f64.d;
  /* lx ~= A10 as a double: A10.w[1]*2^64 + A10.w[0].  */
  lx = (double) A10.w[1] * l64 + (double) A10.w[0];
  /* Initial estimate RS ~= 1/sqrt(A10).  */
  ly.d = 1.0 / sqrt (lx);

  /* MY = 53-bit significand of RS with the implicit bit made explicit;
     ey = magnitude of RS's (negative) unbiased exponent.  */
  MY = (ly.i & 0x000fffffffffffffull) | 0x0010000000000000ull;
  ey = 0x3ff - (ly.i >> 52);

  // A10*RS^2
  __mul_64x128_to_192 (ARS0, MY, A10);
  __mul_64x192_to_256 (ARS, MY, ARS0);

  // shr by 2*ey+40, to get a 64-bit value
  /* ES below ends up as the scaled error term A10*RS^2 - 1 (eps).  */
  k = (ey << 1) + 104 - 64;
  if (k >= 128) {
    if (k > 128)
      ES = (ARS.w[2] >> (k - 128)) | (ARS.w[3] << (192 - k));
    else
      ES = ARS.w[2];
  } else {
    if (k >= 64) {
      /* Drop the low word, then finish with a shift of k-64.  */
      ARS.w[0] = ARS.w[1];
      ARS.w[1] = ARS.w[2];
      k -= 64;
    }
    if (k) {
      __shr_128 (ARS, ARS, k);
    }
    ES = ARS.w[0];
  }

  /* Arithmetic shift: halves eps while preserving its sign.  */
  ES = ((SINT64) ES) >> 1;

  if (((SINT64) ES) < 0) {
    ES = -ES;

    // A*RS*eps (scaled by 2^64)
    __mul_64x192_to_256 (AE0, ES, ARS0);

    /* AE = AE0 >> 64 (take words 1..3).  */
    AE.w[0] = AE0.w[1];
    AE.w[1] = AE0.w[2];
    AE.w[2] = AE0.w[3];

    /* eps < 0: corrected result S = A*RS + A*RS*|eps|.  */
    __add_carry_out (S.w[0], CY, ARS0.w[0], AE.w[0]);
    __add_carry_in_out (S.w[1], CY, ARS0.w[1], AE.w[1], CY);
    S.w[2] = ARS0.w[2] + AE.w[2] + CY;
  } else {
    // A*RS*eps (scaled by 2^64)
    __mul_64x192_to_256 (AE0, ES, ARS0);

    /* AE = AE0 >> 64 (take words 1..3).  */
    AE.w[0] = AE0.w[1];
    AE.w[1] = AE0.w[2];
    AE.w[2] = AE0.w[3];

    /* eps >= 0: corrected result S = A*RS - A*RS*eps.  */
    __sub_borrow_out (S.w[0], CY, ARS0.w[0], AE.w[0]);
    __sub_borrow_in_out (S.w[1], CY, ARS0.w[1], AE.w[1], CY);
    S.w[2] = ARS0.w[2] - AE.w[2] - CY;
  }

  /* Undo the scaling: shift right by ey+51 to land on integer sqrt.  */
  k = ey + 51;

  if (k >= 64) {
    if (k >= 128) {
      S.w[0] = S.w[2];
      S.w[1] = 0;
      k -= 128;
    } else {
      S.w[0] = S.w[1];
      S.w[1] = S.w[2];
    }
    k -= 64;
  }
  if (k) {
    __shr_128 (S, S, k);
  }

  /* Round to nearest: add the half-ulp guard bit, then drop it.  */
  return (UINT64) ((S.w[0] + 1) >> 1);

}
225 
226 
227 
/* Compute the square root of the 256-bit value C256 as a 128-bit
   coefficient in *pCS, rounded to nearest, using only binary64
   arithmetic: a double reciprocal-sqrt estimate is refined with
   first- and second-order correction terms computed in exact
   multi-word integer arithmetic.  */
__BID_INLINE__ void
long_sqrt128 (UINT128 * pCS, UINT256 C256) {
  UINT512 ARS0, ARS;
  UINT256 ARS00, AE, AE2, S;
  UINT128 ES, ES2, ARS1;
  UINT64 ES32, CY, MY;
  double l64, l128, lx, l2, l1, l0;
  int_double f64, ly;
  int ey, k, k2;

  // 2^64
  f64.i = 0x43f0000000000000ull;
  l64 = f64.d;

  l128 = l64 * l64;
  /* Fold the four words of C256 into lx, most significant first;
     FENCE forces the stated evaluation order.  */
  lx = (double) C256.w[3] * l64 * l128;
  l2 = (double) C256.w[2] * l128;
  lx = FENCE (lx + l2);
  l1 = (double) C256.w[1] * l64;
  lx = FENCE (lx + l1);
  l0 = (double) C256.w[0];
  lx = FENCE (lx + l0);
  // sqrt(C256)
  /* Initial estimate RS ~= 1/sqrt(C256).  */
  ly.d = 1.0 / sqrt (lx);

  /* MY = 53-bit significand of RS with implicit bit set;
     ey = magnitude of RS's (negative) unbiased exponent.  */
  MY = (ly.i & 0x000fffffffffffffull) | 0x0010000000000000ull;
  ey = 0x3ff - (ly.i >> 52);

  // A10*RS^2, scaled by 2^(2*ey+104)
  __mul_64x256_to_320 (ARS0, MY, C256);
  __mul_64x320_to_384 (ARS, MY, ARS0);

  // shr by k=(2*ey+104)-128
  // expect k is in the range (192, 256) if result in [10^33, 10^34)
  // apply an additional signed shift by 1 at the same time (to get eps=eps0/2)
  k = (ey << 1) + 104 - 128 - 192;
  k2 = 64 - k;
  /* ES = 128-bit scaled error term eps, sign-extended via w[1].  */
  ES.w[0] = (ARS.w[3] >> (k + 1)) | (ARS.w[4] << (k2 - 1));
  ES.w[1] = (ARS.w[4] >> k) | (ARS.w[5] << k2);
  ES.w[1] = ((SINT64) ES.w[1]) >> 1;

  // A*RS >> 192 (for error term computation)
  ARS1.w[0] = ARS0.w[3];
  ARS1.w[1] = ARS0.w[4];

  // A*RS>>64
  ARS00.w[0] = ARS0.w[1];
  ARS00.w[1] = ARS0.w[2];
  ARS00.w[2] = ARS0.w[3];
  ARS00.w[3] = ARS0.w[4];

  if (((SINT64) ES.w[1]) < 0) {
    /* eps < 0: negate the 128-bit ES (two's complement).  */
    ES.w[0] = -ES.w[0];
    ES.w[1] = -ES.w[1];
    if (ES.w[0])
      ES.w[1]--;

    // A*RS*eps
    __mul_128x128_to_256 (AE, ES, ARS1);

    /* S = A*RS + A*RS*|eps|, 256-bit add with carry chain.  */
    __add_carry_out (S.w[0], CY, ARS00.w[0], AE.w[0]);
    __add_carry_in_out (S.w[1], CY, ARS00.w[1], AE.w[1], CY);
    __add_carry_in_out (S.w[2], CY, ARS00.w[2], AE.w[2], CY);
    S.w[3] = ARS00.w[3] + AE.w[3] + CY;
  } else {
    // A*RS*eps
    __mul_128x128_to_256 (AE, ES, ARS1);

    /* S = A*RS - A*RS*eps, 256-bit subtract with borrow chain.  */
    __sub_borrow_out (S.w[0], CY, ARS00.w[0], AE.w[0]);
    __sub_borrow_in_out (S.w[1], CY, ARS00.w[1], AE.w[1], CY);
    __sub_borrow_in_out (S.w[2], CY, ARS00.w[2], AE.w[2], CY);
    S.w[3] = ARS00.w[3] - AE.w[3] - CY;
  }

  // 3/2*eps^2, scaled by 2^128
  ES32 = ES.w[1] + (ES.w[1] >> 1);
  __mul_64x64_to_128 (ES2, ES32, ES.w[1]);
  // A*RS*3/2*eps^2
  __mul_128x128_to_256 (AE2, ES2, ARS1);

  // result, scaled by 2^(ey+52-64)
  /* Add the second-order correction term.  */
  __add_carry_out (S.w[0], CY, S.w[0], AE2.w[0]);
  __add_carry_in_out (S.w[1], CY, S.w[1], AE2.w[1], CY);
  __add_carry_in_out (S.w[2], CY, S.w[2], AE2.w[2], CY);
  S.w[3] = S.w[3] + AE2.w[3] + CY;

  // k in (0, 64)
  /* Undo the scaling: shift the 256-bit S right by 128+k.  */
  k = ey + 51 - 128;
  k2 = 64 - k;
  S.w[0] = (S.w[1] >> k) | (S.w[2] << k2);
  S.w[1] = (S.w[2] >> k) | (S.w[3] << k2);

  // round to nearest
  /* Add the half-ulp guard bit with carry into the high word...  */
  S.w[0]++;
  if (!S.w[0])
    S.w[1]++;

  /* ...then drop the guard bit while re-packing into 128 bits.  */
  pCS->w[0] = (S.w[1] << 63) | (S.w[0] >> 1);
  pCS->w[1] = S.w[1] >> 1;

}
329 
330 #endif
331 #endif
332