1 /* mpfr_rec_sqrt -- inverse square root
2
3 Copyright 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
4 Contributed by the AriC and Caramel projects, INRIA.
5
6 This file is part of the GNU MPFR Library.
7
8 The GNU MPFR Library is free software; you can redistribute it and/or modify
9 it under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
12
13 The GNU MPFR Library is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with the GNU MPFR Library; see the file COPYING.LESSER. If not, see
20 http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
21 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
22
23 #include <stdio.h>
24 #include <stdlib.h>
25
26 #define MPFR_NEED_LONGLONG_H /* for umul_ppmm */
27 #include "mpfr-impl.h"
28
29 #define LIMB_SIZE(x) ((((x)-1)>>MPFR_LOG2_GMP_NUMB_BITS) + 1)
30
31 #define MPFR_COM_N(x,y,n) \
32 { \
33 mp_size_t i; \
34 for (i = 0; i < n; i++) \
35 *((x)+i) = ~*((y)+i); \
36 }
37
38 /* Put in X a p-bit approximation of 1/sqrt(A),
39 where X = {x, n}/B^n, n = ceil(p/GMP_NUMB_BITS),
40 A = 2^(1+as)*{a, an}/B^an, as is 0 or 1, an = ceil(ap/GMP_NUMB_BITS),
41 where B = 2^GMP_NUMB_BITS.
42
43 We have 1 <= A < 4 and 1/2 <= X < 1.
44
45 The error in the approximate result with respect to the true
46 value 1/sqrt(A) is bounded by 1 ulp(X), i.e., 2^{-p} since 1/2 <= X < 1.
47
48 Note: x and a are left-aligned, i.e., the most significant bit of
49 a[an-1] is set, and so is the most significant bit of the output x[n-1].
50
51 If p is not a multiple of GMP_NUMB_BITS, the extra low bits of the input
52 A are taken into account to compute the approximation of 1/sqrt(A), but
53 whether or not they are zero, the error between X and 1/sqrt(A) is bounded
54 by 1 ulp(X) [in precision p].
55 The extra low bits of the output X (if p is not a multiple of GMP_NUMB_BITS)
56 are set to 0.
57
58 Assumptions:
59 (1) A should be normalized, i.e., the most significant bit of a[an-1]
60 should be 1. If as=0, we have 1 <= A < 2; if as=1, we have 2 <= A < 4.
61 (2) p >= 12
62 (3) {a, an} and {x, n} should not overlap
63 (4) GMP_NUMB_BITS >= 12 and is even
64
65 Note: this routine is much more efficient when ap is small compared to p,
66 including the case where ap <= GMP_NUMB_BITS, thus it can be used to
67 implement an efficient mpfr_rec_sqrt_ui function.
68
69 References:
70 [1] Modern Computer Algebra, Richard Brent and Paul Zimmermann,
71 http://www.loria.fr/~zimmerma/mca/pub226.html
72 */
73 static void
mpfr_mpn_rec_sqrt(mpfr_limb_ptr x,mpfr_prec_t p,mpfr_limb_srcptr a,mpfr_prec_t ap,int as)74 mpfr_mpn_rec_sqrt (mpfr_limb_ptr x, mpfr_prec_t p,
75 mpfr_limb_srcptr a, mpfr_prec_t ap, int as)
76
77 {
78 /* the following T1 and T2 are bipartite tables giving initial
79 approximation for the inverse square root, with 13-bit input split in
80 5+4+4, and 11-bit output. More precisely, if 2048 <= i < 8192,
81 with i = a*2^8 + b*2^4 + c, we use for approximation of
82 2048/sqrt(i/2048) the value x = T1[16*(a-8)+b] + T2[16*(a-8)+c].
83 The largest error is obtained for i = 2054, where x = 2044,
84 and 2048/sqrt(i/2048) = 2045.006576...
85 */
86 static short int T1[384] = {
87 2040, 2033, 2025, 2017, 2009, 2002, 1994, 1987, 1980, 1972, 1965, 1958, 1951,
88 1944, 1938, 1931, /* a=8 */
89 1925, 1918, 1912, 1905, 1899, 1892, 1886, 1880, 1874, 1867, 1861, 1855, 1849,
90 1844, 1838, 1832, /* a=9 */
91 1827, 1821, 1815, 1810, 1804, 1799, 1793, 1788, 1783, 1777, 1772, 1767, 1762,
92 1757, 1752, 1747, /* a=10 */
93 1742, 1737, 1733, 1728, 1723, 1718, 1713, 1709, 1704, 1699, 1695, 1690, 1686,
94 1681, 1677, 1673, /* a=11 */
95 1669, 1664, 1660, 1656, 1652, 1647, 1643, 1639, 1635, 1631, 1627, 1623, 1619,
96 1615, 1611, 1607, /* a=12 */
97 1603, 1600, 1596, 1592, 1588, 1585, 1581, 1577, 1574, 1570, 1566, 1563, 1559,
98 1556, 1552, 1549, /* a=13 */
99 1545, 1542, 1538, 1535, 1532, 1528, 1525, 1522, 1518, 1515, 1512, 1509, 1505,
100 1502, 1499, 1496, /* a=14 */
101 1493, 1490, 1487, 1484, 1481, 1478, 1475, 1472, 1469, 1466, 1463, 1460, 1457,
102 1454, 1451, 1449, /* a=15 */
103 1446, 1443, 1440, 1438, 1435, 1432, 1429, 1427, 1424, 1421, 1419, 1416, 1413,
104 1411, 1408, 1405, /* a=16 */
105 1403, 1400, 1398, 1395, 1393, 1390, 1388, 1385, 1383, 1380, 1378, 1375, 1373,
106 1371, 1368, 1366, /* a=17 */
107 1363, 1360, 1358, 1356, 1353, 1351, 1349, 1346, 1344, 1342, 1340, 1337, 1335,
108 1333, 1331, 1329, /* a=18 */
109 1327, 1325, 1323, 1321, 1319, 1316, 1314, 1312, 1310, 1308, 1306, 1304, 1302,
110 1300, 1298, 1296, /* a=19 */
111 1294, 1292, 1290, 1288, 1286, 1284, 1282, 1280, 1278, 1276, 1274, 1272, 1270,
112 1268, 1266, 1265, /* a=20 */
113 1263, 1261, 1259, 1257, 1255, 1253, 1251, 1250, 1248, 1246, 1244, 1242, 1241,
114 1239, 1237, 1235, /* a=21 */
115 1234, 1232, 1230, 1229, 1227, 1225, 1223, 1222, 1220, 1218, 1217, 1215, 1213,
116 1212, 1210, 1208, /* a=22 */
117 1206, 1204, 1203, 1201, 1199, 1198, 1196, 1195, 1193, 1191, 1190, 1188, 1187,
118 1185, 1184, 1182, /* a=23 */
119 1181, 1180, 1178, 1177, 1175, 1174, 1172, 1171, 1169, 1168, 1166, 1165, 1163,
120 1162, 1160, 1159, /* a=24 */
121 1157, 1156, 1154, 1153, 1151, 1150, 1149, 1147, 1146, 1144, 1143, 1142, 1140,
122 1139, 1137, 1136, /* a=25 */
123 1135, 1133, 1132, 1131, 1129, 1128, 1127, 1125, 1124, 1123, 1121, 1120, 1119,
124 1117, 1116, 1115, /* a=26 */
125 1114, 1113, 1111, 1110, 1109, 1108, 1106, 1105, 1104, 1103, 1101, 1100, 1099,
126 1098, 1096, 1095, /* a=27 */
127 1093, 1092, 1091, 1090, 1089, 1087, 1086, 1085, 1084, 1083, 1081, 1080, 1079,
128 1078, 1077, 1076, /* a=28 */
129 1075, 1073, 1072, 1071, 1070, 1069, 1068, 1067, 1065, 1064, 1063, 1062, 1061,
130 1060, 1059, 1058, /* a=29 */
131 1057, 1056, 1055, 1054, 1052, 1051, 1050, 1049, 1048, 1047, 1046, 1045, 1044,
132 1043, 1042, 1041, /* a=30 */
133 1040, 1039, 1038, 1037, 1036, 1035, 1034, 1033, 1032, 1031, 1030, 1029, 1028,
134 1027, 1026, 1025 /* a=31 */
135 };
136 static unsigned char T2[384] = {
137 7, 7, 6, 6, 5, 5, 4, 4, 4, 3, 3, 2, 2, 1, 1, 0, /* a=8 */
138 6, 5, 5, 5, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 0, 0, /* a=9 */
139 5, 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, /* a=10 */
140 4, 4, 3, 3, 3, 3, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, /* a=11 */
141 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, /* a=12 */
142 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=13 */
143 3, 3, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, /* a=14 */
144 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=15 */
145 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=16 */
146 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=17 */
147 3, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, /* a=18 */
148 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=19 */
149 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, /* a=20 */
150 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=21 */
151 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=22 */
152 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* a=23 */
153 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=24 */
154 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=25 */
155 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=26 */
156 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=27 */
157 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=28 */
158 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=29 */
159 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=30 */
160 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* a=31 */
161 };
162 mp_size_t n = LIMB_SIZE(p); /* number of limbs of X */
163 mp_size_t an = LIMB_SIZE(ap); /* number of limbs of A */
164
165 /* A should be normalized */
166 MPFR_ASSERTD((a[an - 1] & MPFR_LIMB_HIGHBIT) != 0);
167 /* We should have enough bits in one limb and GMP_NUMB_BITS should be even.
168 Since that does not depend on MPFR, we always check this. */
169 MPFR_ASSERTN((GMP_NUMB_BITS >= 12) && ((GMP_NUMB_BITS & 1) == 0));
170 /* {a, an} and {x, n} should not overlap */
171 MPFR_ASSERTD((a + an <= x) || (x + n <= a));
172 MPFR_ASSERTD(p >= 11);
173
174 if (MPFR_UNLIKELY(an > n)) /* we can cut the input to n limbs */
175 {
176 a += an - n;
177 an = n;
178 }
179
180 if (p == 11) /* should happen only from recursive calls */
181 {
182 unsigned long i, ab, ac;
183 mp_limb_t t;
184
185 /* take the 12+as most significant bits of A */
186 i = a[an - 1] >> (GMP_NUMB_BITS - (12 + as));
187 /* if one wants faithful rounding for p=11, replace #if 0 by #if 1 */
188 ab = i >> 4;
189 ac = (ab & 0x3F0) | (i & 0x0F);
190 t = (mp_limb_t) T1[ab - 0x80] + (mp_limb_t) T2[ac - 0x80];
191 x[0] = t << (GMP_NUMB_BITS - p);
192 }
193 else /* p >= 12 */
194 {
195 mpfr_prec_t h, pl;
196 mpfr_limb_ptr r, s, t, u;
197 mp_size_t xn, rn, th, ln, tn, sn, ahn, un;
198 mp_limb_t neg, cy, cu;
199 MPFR_TMP_DECL(marker);
200
201 /* compared to Algorithm 3.9 of [1], we have {a, an} = A/2 if as=0,
202 and A/4 if as=1. */
203
204 /* h = max(11, ceil((p+3)/2)) is the bitsize of the recursive call */
205 h = (p < 18) ? 11 : (p >> 1) + 2;
206
207 xn = LIMB_SIZE(h); /* limb size of the recursive Xh */
208 rn = LIMB_SIZE(2 * h); /* a priori limb size of Xh^2 */
209 ln = n - xn; /* remaining limbs to be computed */
210
211 /* Since |Xh - A^{-1/2}| <= 2^{-h}, then by multiplying by Xh + A^{-1/2}
212 we get |Xh^2 - 1/A| <= 2^{-h+1}, thus |A*Xh^2 - 1| <= 2^{-h+3},
213 thus the h-3 most significant bits of t should be zero,
214 which is in fact h+1+as-3 because of the normalization of A.
215 This corresponds to th=floor((h+1+as-3)/GMP_NUMB_BITS) limbs.
216
217 More precisely we have |Xh^2 - 1/A| <= 2^{-h} * (Xh + A^{-1/2})
218 <= 2^{-h} * (2 A^{-1/2} + 2^{-h}) <= 2.001 * 2^{-h} * A^{-1/2}
219 since A < 4 and h >= 11, thus
220 |A*Xh^2 - 1| <= 2.001 * 2^{-h} * A^{1/2} <= 1.001 * 2^{2-h}.
221 This is sufficient to prove that the upper limb of {t,tn} below is
222 less that 0.501 * 2^GMP_NUMB_BITS, thus cu = 0 below.
223 */
224 th = (h + 1 + as - 3) >> MPFR_LOG2_GMP_NUMB_BITS;
225 tn = LIMB_SIZE(2 * h + 1 + as);
226
227 /* we need h+1+as bits of a */
228 ahn = LIMB_SIZE(h + 1 + as); /* number of high limbs of A
229 needed for the recursive call*/
230 if (MPFR_UNLIKELY(ahn > an))
231 ahn = an;
232 mpfr_mpn_rec_sqrt (x + ln, h, a + an - ahn, ahn * GMP_NUMB_BITS, as);
233 /* the most h significant bits of X are set, X has ceil(h/GMP_NUMB_BITS)
234 limbs, the low (-h) % GMP_NUMB_BITS bits are zero */
235
236 /* compared to Algorithm 3.9 of [1], we have {x+ln,xn} = X_h */
237
238 MPFR_TMP_MARK (marker);
239 /* first step: square X in r, result is exact */
240 un = xn + (tn - th);
241 /* We use the same temporary buffer to store r and u: r needs 2*xn
242 limbs where u needs xn+(tn-th) limbs. Since tn can store at least
243 2h bits, and th at most h bits, then tn-th can store at least h bits,
244 thus tn - th >= xn, and reserving the space for u is enough. */
245 MPFR_ASSERTD(2 * xn <= un);
246 u = r = MPFR_TMP_LIMBS_ALLOC (un);
247 if (2 * h <= GMP_NUMB_BITS) /* xn=rn=1, and since p <= 2h-3, n=1,
248 thus ln = 0 */
249 {
250 MPFR_ASSERTD(ln == 0);
251 cy = x[0] >> (GMP_NUMB_BITS >> 1);
252 r ++;
253 r[0] = cy * cy;
254 }
255 else if (xn == 1) /* xn=1, rn=2 */
256 umul_ppmm(r[1], r[0], x[ln], x[ln]);
257 else
258 {
259 mpn_mul_n (r, x + ln, x + ln, xn);
260 /* we have {r, 2*xn} = X_h^2 */
261 if (rn < 2 * xn)
262 r ++;
263 }
264 /* now the 2h most significant bits of {r, rn} contains X^2, r has rn
265 limbs, and the low (-2h) % GMP_NUMB_BITS bits are zero */
266
267 /* Second step: s <- A * (r^2), and truncate the low ap bits,
268 i.e., at weight 2^{-2h} (s is aligned to the low significant bits)
269 */
270 sn = an + rn;
271 s = MPFR_TMP_LIMBS_ALLOC (sn);
272 if (rn == 1) /* rn=1 implies n=1, since rn*GMP_NUMB_BITS >= 2h,
273 and 2h >= p+3 */
274 {
275 /* necessarily p <= GMP_NUMB_BITS-3: we can ignore the two low
276 bits from A */
277 /* since n=1, and we ensured an <= n, we also have an=1 */
278 MPFR_ASSERTD(an == 1);
279 umul_ppmm (s[1], s[0], r[0], a[0]);
280 }
281 else
282 {
283 /* we have p <= n * GMP_NUMB_BITS
284 2h <= rn * GMP_NUMB_BITS with p+3 <= 2h <= p+4
285 thus n <= rn <= n + 1 */
286 MPFR_ASSERTD(rn <= n + 1);
287 /* since we ensured an <= n, we have an <= rn */
288 MPFR_ASSERTD(an <= rn);
289 mpn_mul (s, r, rn, a, an);
290 /* s should be near B^sn/2^(1+as), thus s[sn-1] is either
291 100000... or 011111... if as=0, or
292 010000... or 001111... if as=1.
293 We ignore the bits of s after the first 2h+1+as ones.
294 We have {s, rn+an} = A*X_h^2/2 if as=0, A*X_h^2/4 if as=1. */
295 }
296
297 /* We ignore the bits of s after the first 2h+1+as ones: s has rn + an
298 limbs, where rn = LIMBS(2h), an=LIMBS(a), and tn = LIMBS(2h+1+as). */
299 t = s + sn - tn; /* pointer to low limb of the high part of t */
300 /* the upper h-3 bits of 1-t should be zero,
301 where 1 corresponds to the most significant bit of t[tn-1] if as=0,
302 and to the 2nd most significant bit of t[tn-1] if as=1 */
303
304 /* compute t <- 1 - t, which is B^tn - {t, tn+1},
305 with rounding toward -Inf, i.e., rounding the input t toward +Inf.
306 We could only modify the low tn - th limbs from t, but it gives only
307 a small speedup, and would make the code more complex.
308 */
309 neg = t[tn - 1] & (MPFR_LIMB_HIGHBIT >> as);
310 if (neg == 0) /* Ax^2 < 1: we have t = th + eps, where 0 <= eps < ulp(th)
311 is the part truncated above, thus 1 - t rounded to -Inf
312 is 1 - th - ulp(th) */
313 {
314 /* since the 1+as most significant bits of t are zero, set them
315 to 1 before the one-complement */
316 t[tn - 1] |= MPFR_LIMB_HIGHBIT | (MPFR_LIMB_HIGHBIT >> as);
317 MPFR_COM_N (t, t, tn);
318 /* we should add 1 here to get 1-th complement, and subtract 1 for
319 -ulp(th), thus we do nothing */
320 }
321 else /* negative case: we want 1 - t rounded toward -Inf, i.e.,
322 th + eps rounded toward +Inf, which is th + ulp(th):
323 we discard the bit corresponding to 1,
324 and we add 1 to the least significant bit of t */
325 {
326 t[tn - 1] ^= neg;
327 mpn_add_1 (t, t, tn, 1);
328 }
329 tn -= th; /* we know at least th = floor((h+1+as-3)/GMP_NUMB_LIMBS) of
330 the high limbs of {t, tn} are zero */
331
332 /* tn = rn - th, where rn * GMP_NUMB_BITS >= 2*h and
333 th * GMP_NUMB_BITS <= h+1+as-3, thus tn > 0 */
334 MPFR_ASSERTD(tn > 0);
335
336 /* u <- x * t, where {t, tn} contains at least h+3 bits,
337 and {x, xn} contains h bits, thus tn >= xn */
338 MPFR_ASSERTD(tn >= xn);
339 if (tn == 1) /* necessarily xn=1 */
340 umul_ppmm (u[1], u[0], t[0], x[ln]);
341 else
342 mpn_mul (u, t, tn, x + ln, xn);
343
344 /* we have {u, tn+xn} = T_l X_h/2 if as=0, T_l X_h/4 if as=1 */
345
346 /* we have already discarded the upper th high limbs of t, thus we only
347 have to consider the upper n - th limbs of u */
348 un = n - th; /* un cannot be zero, since p <= n*GMP_NUMB_BITS,
349 h = ceil((p+3)/2) <= (p+4)/2,
350 th*GMP_NUMB_BITS <= h-1 <= p/2+1,
351 thus (n-th)*GMP_NUMB_BITS >= p/2-1.
352 */
353 MPFR_ASSERTD(un > 0);
354 u += (tn + xn) - un; /* xn + tn - un = xn + (original_tn - th) - (n - th)
355 = xn + original_tn - n
356 = LIMBS(h) + LIMBS(2h+1+as) - LIMBS(p) > 0
357 since 2h >= p+3 */
358 MPFR_ASSERTD(tn + xn > un); /* will allow to access u[-1] below */
359
360 /* In case as=0, u contains |x*(1-Ax^2)/2|, which is exactly what we
361 need to add or subtract.
362 In case as=1, u contains |x*(1-Ax^2)/4|, thus we need to multiply
363 u by 2. */
364
365 if (as == 1)
366 /* shift on un+1 limbs to get most significant bit of u[-1] into
367 least significant bit of u[0] */
368 mpn_lshift (u - 1, u - 1, un + 1, 1);
369
370 /* now {u,un} represents U / 2 from Algorithm 3.9 */
371
372 pl = n * GMP_NUMB_BITS - p; /* low bits from x */
373 /* We want that the low pl bits are zero after rounding to nearest,
374 thus we round u to nearest at bit pl-1 of u[0] */
375 if (pl > 0)
376 {
377 cu = mpn_add_1 (u, u, un, u[0] & (MPFR_LIMB_ONE << (pl - 1)));
378 /* mask bits 0..pl-1 of u[0] */
379 u[0] &= ~MPFR_LIMB_MASK(pl);
380 }
381 else /* round bit is in u[-1] */
382 cu = mpn_add_1 (u, u, un, u[-1] >> (GMP_NUMB_BITS - 1));
383 MPFR_ASSERTN(cu == 0);
384
385 /* We already have filled {x + ln, xn = n - ln}, and we want to add or
386 subtract {u, un} at position x.
387 un = n - th, where th contains <= h+1+as-3<=h-1 bits
388 ln = n - xn, where xn contains >= h bits
389 thus un > ln.
390 Warning: ln might be zero.
391 */
392 MPFR_ASSERTD(un > ln);
393 /* we can have un = ln + 2, for example with GMP_NUMB_BITS=32 and
394 p=62, as=0, then h=33, n=2, th=0, xn=2, thus un=2 and ln=0. */
395 MPFR_ASSERTD(un == ln + 1 || un == ln + 2);
396 /* the high un-ln limbs of u will overlap the low part of {x+ln,xn},
397 we need to add or subtract the overlapping part {u + ln, un - ln} */
398 /* Warning! th may be 0, in which case the mpn_add_1 and mpn_sub_1
399 below (with size = th) mustn't be used. */
400 if (neg == 0)
401 {
402 if (ln > 0)
403 MPN_COPY (x, u, ln);
404 cy = mpn_add (x + ln, x + ln, xn, u + ln, un - ln);
405 /* cy is the carry at x + (ln + xn) = x + n */
406 }
407 else /* negative case */
408 {
409 /* subtract {u+ln, un-ln} from {x+ln,un} */
410 cy = mpn_sub (x + ln, x + ln, xn, u + ln, un - ln);
411 /* cy is the borrow at x + (ln + xn) = x + n */
412
413 /* cy cannot be non-zero, since the most significant bit of Xh is 1,
414 and the correction is bounded by 2^{-h+3} */
415 MPFR_ASSERTD(cy == 0);
416 if (ln > 0)
417 {
418 MPFR_COM_N (x, u, ln);
419 /* we must add one for the 2-complement ... */
420 cy = mpn_add_1 (x, x, n, MPFR_LIMB_ONE);
421 /* ... and subtract 1 at x[ln], where n = ln + xn */
422 cy -= mpn_sub_1 (x + ln, x + ln, xn, MPFR_LIMB_ONE);
423 }
424 }
425
426 /* cy can be 1 when A=1, i.e., {a, n} = B^n. In that case we should
427 have X = B^n, and setting X to 1-2^{-p} satisties the error bound
428 of 1 ulp. */
429 if (MPFR_UNLIKELY(cy != 0))
430 {
431 cy -= mpn_sub_1 (x, x, n, MPFR_LIMB_ONE << pl);
432 MPFR_ASSERTD(cy == 0);
433 }
434
435 MPFR_TMP_FREE (marker);
436 }
437 }
438
439 int
mpfr_rec_sqrt(mpfr_ptr r,mpfr_srcptr u,mpfr_rnd_t rnd_mode)440 mpfr_rec_sqrt (mpfr_ptr r, mpfr_srcptr u, mpfr_rnd_t rnd_mode)
441 {
442 mpfr_prec_t rp, up, wp;
443 mp_size_t rn, wn;
444 int s, cy, inex;
445 mpfr_limb_ptr x;
446 MPFR_TMP_DECL(marker);
447
448 MPFR_LOG_FUNC
449 (("x[%Pu]=%.*Rg rnd=%d", mpfr_get_prec (u), mpfr_log_prec, u, rnd_mode),
450 ("y[%Pu]=%.*Rg inexact=%d", mpfr_get_prec (r), mpfr_log_prec, r, inex));
451
452 /* special values */
453 if (MPFR_UNLIKELY(MPFR_IS_SINGULAR(u)))
454 {
455 if (MPFR_IS_NAN(u))
456 {
457 MPFR_SET_NAN(r);
458 MPFR_RET_NAN;
459 }
460 else if (MPFR_IS_ZERO(u)) /* 1/sqrt(+0) = 1/sqrt(-0) = +Inf */
461 {
462 /* 0+ or 0- */
463 MPFR_SET_INF(r);
464 MPFR_SET_POS(r);
465 mpfr_set_divby0 ();
466 MPFR_RET(0); /* Inf is exact */
467 }
468 else
469 {
470 MPFR_ASSERTD(MPFR_IS_INF(u));
471 /* 1/sqrt(-Inf) = NAN */
472 if (MPFR_IS_NEG(u))
473 {
474 MPFR_SET_NAN(r);
475 MPFR_RET_NAN;
476 }
477 /* 1/sqrt(+Inf) = +0 */
478 MPFR_SET_POS(r);
479 MPFR_SET_ZERO(r);
480 MPFR_RET(0);
481 }
482 }
483
484 /* if u < 0, 1/sqrt(u) is NaN */
485 if (MPFR_UNLIKELY(MPFR_IS_NEG(u)))
486 {
487 MPFR_SET_NAN(r);
488 MPFR_RET_NAN;
489 }
490
491 MPFR_SET_POS(r);
492
493 rp = MPFR_PREC(r); /* output precision */
494 up = MPFR_PREC(u); /* input precision */
495 wp = rp + 11; /* initial working precision */
496
497 /* Let u = U*2^e, where e = EXP(u), and 1/2 <= U < 1.
498 If e is even, we compute an approximation of X of (4U)^{-1/2},
499 and the result is X*2^(-(e-2)/2) [case s=1].
500 If e is odd, we compute an approximation of X of (2U)^{-1/2},
501 and the result is X*2^(-(e-1)/2) [case s=0]. */
502
503 /* parity of the exponent of u */
504 s = 1 - ((mpfr_uexp_t) MPFR_GET_EXP (u) & 1);
505
506 rn = LIMB_SIZE(rp);
507
508 /* for the first iteration, if rp + 11 fits into rn limbs, we round up
509 up to a full limb to maximize the chance of rounding, while avoiding
510 to allocate extra space */
511 wp = rp + 11;
512 if (wp < rn * GMP_NUMB_BITS)
513 wp = rn * GMP_NUMB_BITS;
514 for (;;)
515 {
516 MPFR_TMP_MARK (marker);
517 wn = LIMB_SIZE(wp);
518 if (r == u || wn > rn) /* out of place, i.e., we cannot write to r */
519 x = MPFR_TMP_LIMBS_ALLOC (wn);
520 else
521 x = MPFR_MANT(r);
522 mpfr_mpn_rec_sqrt (x, wp, MPFR_MANT(u), up, s);
523 /* If the input was not truncated, the error is at most one ulp;
524 if the input was truncated, the error is at most two ulps
525 (see algorithms.tex). */
526 if (MPFR_LIKELY (mpfr_round_p (x, wn, wp - (wp < up),
527 rp + (rnd_mode == MPFR_RNDN))))
528 break;
529
530 /* We detect only now the exact case where u=2^(2e), to avoid
531 slowing down the average case. This can happen only when the
532 mantissa is exactly 1/2 and the exponent is odd. */
533 if (s == 0 && mpfr_cmp_ui_2exp (u, 1, MPFR_EXP(u) - 1) == 0)
534 {
535 mpfr_prec_t pl = wn * GMP_NUMB_BITS - wp;
536
537 /* we should have x=111...111 */
538 mpn_add_1 (x, x, wn, MPFR_LIMB_ONE << pl);
539 x[wn - 1] = MPFR_LIMB_HIGHBIT;
540 s += 2;
541 break; /* go through */
542 }
543 MPFR_TMP_FREE(marker);
544
545 wp += GMP_NUMB_BITS;
546 }
547 cy = mpfr_round_raw (MPFR_MANT(r), x, wp, 0, rp, rnd_mode, &inex);
548 MPFR_EXP(r) = - (MPFR_EXP(u) - 1 - s) / 2;
549 if (MPFR_UNLIKELY(cy != 0))
550 {
551 MPFR_EXP(r) ++;
552 MPFR_MANT(r)[rn - 1] = MPFR_LIMB_HIGHBIT;
553 }
554 MPFR_TMP_FREE(marker);
555 return mpfr_check_range (r, inex, rnd_mode);
556 }
557