xref: /dragonfly/contrib/mpfr/src/rec_sqrt.c (revision 25a2db75)
1 /* mpfr_rec_sqrt -- inverse square root
2 
3 Copyright 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
4 Contributed by the AriC and Caramel projects, INRIA.
5 
6 This file is part of the GNU MPFR Library.
7 
8 The GNU MPFR Library is free software; you can redistribute it and/or modify
9 it under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
12 
13 The GNU MPFR Library is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16 License for more details.
17 
18 You should have received a copy of the GNU Lesser General Public License
19 along with the GNU MPFR Library; see the file COPYING.LESSER.  If not, see
20 http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
21 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
22 
23 #include <stdio.h>
24 #include <stdlib.h>
25 
26 #define MPFR_NEED_LONGLONG_H /* for umul_ppmm */
27 #include "mpfr-impl.h"
28 
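/* LIMB_SIZE(b) is the number of limbs needed to store b > 0 bits,
   i.e., ceil(b/GMP_NUMB_BITS). */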
29 #define LIMB_SIZE(x) ((((x)-1)>>MPFR_LOG2_GMP_NUMB_BITS) + 1)
30 
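/* MPFR_COM_N(x,y,n) sets the n-limb array {x, n} to the one's complement
   (bitwise NOT) of {y, n}. */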
31 #define MPFR_COM_N(x,y,n)                               \
32   {                                                     \
33     mp_size_t i;                                        \
34     for (i = 0; i < n; i++)                             \
35       *((x)+i) = ~*((y)+i);                             \
36   }
37 
38 /* Put in X a p-bit approximation of 1/sqrt(A),
39    where X = {x, n}/B^n, n = ceil(p/GMP_NUMB_BITS),
40    A = 2^(1+as)*{a, an}/B^an, as is 0 or 1, an = ceil(ap/GMP_NUMB_BITS),
41    where B = 2^GMP_NUMB_BITS.
42 
43    We have 1 <= A < 4 and 1/2 <= X < 1.
44 
45    The error in the approximate result with respect to the true
46    value 1/sqrt(A) is bounded by 1 ulp(X), i.e., 2^{-p} since 1/2 <= X < 1.
47 
48    Note: x and a are left-aligned, i.e., the most significant bit of
49    a[an-1] is set, and so is the most significant bit of the output x[n-1].
50 
51    If p is not a multiple of GMP_NUMB_BITS, the extra low bits of the input
52    A are taken into account to compute the approximation of 1/sqrt(A), but
53    whether or not they are zero, the error between X and 1/sqrt(A) is bounded
54    by 1 ulp(X) [in precision p].
55    The extra low bits of the output X (if p is not a multiple of GMP_NUMB_BITS)
56    are set to 0.
57 
58    Assumptions:
59    (1) A should be normalized, i.e., the most significant bit of a[an-1]
60        should be 1. If as=0, we have 1 <= A < 2; if as=1, we have 2 <= A < 4.
61    (2) p >= 12 for external calls (the internal recursive calls below may use p = 11)
62    (3) {a, an} and {x, n} should not overlap
63    (4) GMP_NUMB_BITS >= 12 and is even
64 
65    Note: this routine is much more efficient when ap is small compared to p,
66    including the case where ap <= GMP_NUMB_BITS, thus it can be used to
67    implement an efficient mpfr_rec_sqrt_ui function.
68 
69    References:
70    [1] Modern Computer Algebra, Richard Brent and Paul Zimmermann,
71    http://www.loria.fr/~zimmerma/mca/pub226.html
72 */
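/* Worked example of the representation above (illustration, assuming
   GMP_NUMB_BITS = 64 and ap = p = 64, so an = n = 1): with as = 1 and
   a[0] = 2^63 we have A = 2^(1+1) * (2^63/2^64) = 2, and the routine fills
   x[0] with a left-aligned 64-bit approximation of 1/sqrt(2) = 0.7071...,
   i.e., x[0] ~= 0.7071 * 2^64. */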
73 static void
74 mpfr_mpn_rec_sqrt (mpfr_limb_ptr x, mpfr_prec_t p,
75                    mpfr_limb_srcptr a, mpfr_prec_t ap, int as)
76 
77 {
78   /* the following T1 and T2 are bipartite tables giving an initial
79      approximation for the inverse square root, with a 13-bit input split
80      into 5+4+4, and an 11-bit output. More precisely, if 2048 <= i < 8192,
81      with i = a*2^8 + b*2^4 + c, we use for approximation of
82      2048/sqrt(i/2048) the value x = T1[16*(a-8)+b] + T2[16*(a-8)+c].
83      The largest error is obtained for i = 2054, where x = 2044,
84      and 2048/sqrt(i/2048) = 2045.006576...
85   */
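  /* Worked example of the formula above: i = 2054 splits as a = 8, b = 0,
     c = 6, hence x = T1[16*(8-8)+0] + T2[16*(8-8)+6] = 2040 + 4 = 2044,
     to be compared with 2048/sqrt(2054/2048) = 2045.006..., the largest-error
     case quoted above. */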
86   static short int T1[384] = {
87 2040, 2033, 2025, 2017, 2009, 2002, 1994, 1987, 1980, 1972, 1965, 1958, 1951,
88 1944, 1938, 1931, /* a=8 */
89 1925, 1918, 1912, 1905, 1899, 1892, 1886, 1880, 1874, 1867, 1861, 1855, 1849,
90 1844, 1838, 1832, /* a=9 */
91 1827, 1821, 1815, 1810, 1804, 1799, 1793, 1788, 1783, 1777, 1772, 1767, 1762,
92 1757, 1752, 1747, /* a=10 */
93 1742, 1737, 1733, 1728, 1723, 1718, 1713, 1709, 1704, 1699, 1695, 1690, 1686,
94 1681, 1677, 1673, /* a=11 */
95 1669, 1664, 1660, 1656, 1652, 1647, 1643, 1639, 1635, 1631, 1627, 1623, 1619,
96 1615, 1611, 1607, /* a=12 */
97 1603, 1600, 1596, 1592, 1588, 1585, 1581, 1577, 1574, 1570, 1566, 1563, 1559,
98 1556, 1552, 1549, /* a=13 */
99 1545, 1542, 1538, 1535, 1532, 1528, 1525, 1522, 1518, 1515, 1512, 1509, 1505,
100 1502, 1499, 1496, /* a=14 */
101 1493, 1490, 1487, 1484, 1481, 1478, 1475, 1472, 1469, 1466, 1463, 1460, 1457,
102 1454, 1451, 1449, /* a=15 */
103 1446, 1443, 1440, 1438, 1435, 1432, 1429, 1427, 1424, 1421, 1419, 1416, 1413,
104 1411, 1408, 1405, /* a=16 */
105 1403, 1400, 1398, 1395, 1393, 1390, 1388, 1385, 1383, 1380, 1378, 1375, 1373,
106 1371, 1368, 1366, /* a=17 */
107 1363, 1360, 1358, 1356, 1353, 1351, 1349, 1346, 1344, 1342, 1340, 1337, 1335,
108 1333, 1331, 1329, /* a=18 */
109 1327, 1325, 1323, 1321, 1319, 1316, 1314, 1312, 1310, 1308, 1306, 1304, 1302,
110 1300, 1298, 1296, /* a=19 */
111 1294, 1292, 1290, 1288, 1286, 1284, 1282, 1280, 1278, 1276, 1274, 1272, 1270,
112 1268, 1266, 1265, /* a=20 */
113 1263, 1261, 1259, 1257, 1255, 1253, 1251, 1250, 1248, 1246, 1244, 1242, 1241,
114 1239, 1237, 1235, /* a=21 */
115 1234, 1232, 1230, 1229, 1227, 1225, 1223, 1222, 1220, 1218, 1217, 1215, 1213,
116 1212, 1210, 1208, /* a=22 */
117 1206, 1204, 1203, 1201, 1199, 1198, 1196, 1195, 1193, 1191, 1190, 1188, 1187,
118 1185, 1184, 1182, /* a=23 */
119 1181, 1180, 1178, 1177, 1175, 1174, 1172, 1171, 1169, 1168, 1166, 1165, 1163,
120 1162, 1160, 1159, /* a=24 */
121 1157, 1156, 1154, 1153, 1151, 1150, 1149, 1147, 1146, 1144, 1143, 1142, 1140,
122 1139, 1137, 1136, /* a=25 */
123 1135, 1133, 1132, 1131, 1129, 1128, 1127, 1125, 1124, 1123, 1121, 1120, 1119,
124 1117, 1116, 1115, /* a=26 */
125 1114, 1113, 1111, 1110, 1109, 1108, 1106, 1105, 1104, 1103, 1101, 1100, 1099,
126 1098, 1096, 1095, /* a=27 */
127 1093, 1092, 1091, 1090, 1089, 1087, 1086, 1085, 1084, 1083, 1081, 1080, 1079,
128 1078, 1077, 1076, /* a=28 */
129 1075, 1073, 1072, 1071, 1070, 1069, 1068, 1067, 1065, 1064, 1063, 1062, 1061,
130 1060, 1059, 1058, /* a=29 */
131 1057, 1056, 1055, 1054, 1052, 1051, 1050, 1049, 1048, 1047, 1046, 1045, 1044,
132 1043, 1042, 1041, /* a=30 */
133 1040, 1039, 1038, 1037, 1036, 1035, 1034, 1033, 1032, 1031, 1030, 1029, 1028,
134 1027, 1026, 1025 /* a=31 */
135 };
136   static unsigned char T2[384] = {
137     7, 7, 6, 6, 5, 5, 4, 4, 4, 3, 3, 2, 2, 1, 1, 0, /* a=8 */
138     6, 5, 5, 5, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 0, 0, /* a=9 */
139     5, 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, /* a=10 */
140     4, 4, 3, 3, 3, 3, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, /* a=11 */
141     3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, /* a=12 */
142     3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=13 */
143     3, 3, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, /* a=14 */
144     2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=15 */
145     2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=16 */
146     2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=17 */
147     3, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, /* a=18 */
148     2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=19 */
149     1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, /* a=20 */
150     1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=21 */
151     1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=22 */
152     2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* a=23 */
153     1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=24 */
154     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=25 */
155     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=26 */
156     1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=27 */
157     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=28 */
158     1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=29 */
159     1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=30 */
160     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* a=31 */
161 };
162   mp_size_t n = LIMB_SIZE(p);   /* number of limbs of X */
163   mp_size_t an = LIMB_SIZE(ap); /* number of limbs of A */
164 
165   /* A should be normalized */
166   MPFR_ASSERTD((a[an - 1] & MPFR_LIMB_HIGHBIT) != 0);
167   /* We should have enough bits in one limb and GMP_NUMB_BITS should be even.
168      Since that does not depend on MPFR, we always check this. */
169   MPFR_ASSERTN((GMP_NUMB_BITS >= 12) && ((GMP_NUMB_BITS & 1) == 0));
170   /* {a, an} and {x, n} should not overlap */
171   MPFR_ASSERTD((a + an <= x) || (x + n <= a));
172   MPFR_ASSERTD(p >= 11);
173 
174   if (MPFR_UNLIKELY(an > n)) /* we can cut the input to n limbs */
175     {
176       a += an - n;
177       an = n;
178     }
179 
180   if (p == 11) /* should happen only from recursive calls */
181     {
182       unsigned long i, ab, ac;
183       mp_limb_t t;
184 
185       /* take the 12+as most significant bits of A */
186       i = a[an - 1] >> (GMP_NUMB_BITS - (12 + as));
187       /* if one wants faithful rounding for p=11, replace #if 0 by #if 1 */
188       ab = i >> 4;
189       ac = (ab & 0x3F0) | (i & 0x0F);
190       t = (mp_limb_t) T1[ab - 0x80] + (mp_limb_t) T2[ac - 0x80];
191       x[0] = t << (GMP_NUMB_BITS - p);
192     }
193   else /* p >= 12 */
194     {
195       mpfr_prec_t h, pl;
196       mpfr_limb_ptr r, s, t, u;
197       mp_size_t xn, rn, th, ln, tn, sn, ahn, un;
198       mp_limb_t neg, cy, cu;
199       MPFR_TMP_DECL(marker);
200 
201       /* compared to Algorithm 3.9 of [1], we have {a, an} = A/2 if as=0,
202          and A/4 if as=1. */
203 
204       /* h = max(11, ceil((p+3)/2)) is the bitsize of the recursive call */
205       h = (p < 18) ? 11 : (p >> 1) + 2;
206 
207       xn = LIMB_SIZE(h);       /* limb size of the recursive Xh */
208       rn = LIMB_SIZE(2 * h);   /* a priori limb size of Xh^2 */
209       ln = n - xn;             /* remaining limbs to be computed */
210 
211       /* Since |Xh - A^{-1/2}| <= 2^{-h}, then by multiplying by Xh + A^{-1/2}
212          we get |Xh^2 - 1/A| <= 2^{-h+1}, thus |A*Xh^2 - 1| <= 2^{-h+3},
213          thus the h-3 most significant bits of t should be zero,
214          which is in fact h+1+as-3 because of the normalization of A.
215          This corresponds to th=floor((h+1+as-3)/GMP_NUMB_BITS) limbs.
216 
217          More precisely we have |Xh^2 - 1/A| <= 2^{-h} * (Xh + A^{-1/2})
218          <= 2^{-h} * (2 A^{-1/2} + 2^{-h}) <= 2.001 * 2^{-h} * A^{-1/2}
219          since A < 4 and h >= 11, thus
220          |A*Xh^2 - 1| <= 2.001 * 2^{-h} * A^{1/2} <= 1.001 * 2^{2-h}.
221          This is sufficient to prove that the upper limb of {t,tn} below is
222          less than 0.501 * 2^GMP_NUMB_BITS, thus cu = 0 below.
223       */
224       th = (h + 1 + as - 3) >> MPFR_LOG2_GMP_NUMB_BITS;
225       tn = LIMB_SIZE(2 * h + 1 + as);
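      /* Illustration (assuming GMP_NUMB_BITS = 64): for p = 100 and as = 0,
         we get n = 2, h = 52, xn = 1, rn = 2, ln = 1, th = 0 and tn = 2. */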
226 
227       /* we need h+1+as bits of a */
228       ahn = LIMB_SIZE(h + 1 + as); /* number of high limbs of A
229                                       needed for the recursive call*/
230       if (MPFR_UNLIKELY(ahn > an))
231         ahn = an;
232       mpfr_mpn_rec_sqrt (x + ln, h, a + an - ahn, ahn * GMP_NUMB_BITS, as);
233       /* the h most significant bits of X are set, X has ceil(h/GMP_NUMB_BITS)
234          limbs, the low (-h) % GMP_NUMB_BITS bits are zero */
235 
236       /* compared to Algorithm 3.9 of [1], we have {x+ln,xn} = X_h */
237 
238       MPFR_TMP_MARK (marker);
239       /* first step: square X in r, result is exact */
240       un = xn + (tn - th);
241       /* We use the same temporary buffer to store r and u: r needs 2*xn
242          limbs where u needs xn+(tn-th) limbs. Since tn can store at least
243          2h bits, and th at most h bits, then tn-th can store at least h bits,
244          thus tn - th >= xn, and reserving the space for u is enough. */
245       MPFR_ASSERTD(2 * xn <= un);
246       u = r = MPFR_TMP_LIMBS_ALLOC (un);
247       if (2 * h <= GMP_NUMB_BITS) /* xn=rn=1, and since p <= 2h-3, n=1,
248                                      thus ln = 0 */
249         {
250           MPFR_ASSERTD(ln == 0);
251           cy = x[0] >> (GMP_NUMB_BITS >> 1);
252           r ++;
253           r[0] = cy * cy;
254         }
255       else if (xn == 1) /* xn=1, rn=2 */
256         umul_ppmm(r[1], r[0], x[ln], x[ln]);
257       else
258         {
259           mpn_mul_n (r, x + ln, x + ln, xn);
260           /* we have {r, 2*xn} = X_h^2 */
261           if (rn < 2 * xn)
262             r ++;
263         }
264       /* now the 2h most significant bits of {r, rn} contain X^2, r has rn
265          limbs, and the low (-2h) % GMP_NUMB_BITS bits are zero */
266 
267       /* Second step: s <- A * r = A * X_h^2, and truncate the low ap bits,
268          i.e., at weight 2^{-2h} (s is aligned to the low significant bits)
269        */
270       sn = an + rn;
271       s = MPFR_TMP_LIMBS_ALLOC (sn);
272       if (rn == 1) /* rn=1 implies n=1, since rn*GMP_NUMB_BITS >= 2h,
273                            and 2h >= p+3 */
274         {
275           /* necessarily p <= GMP_NUMB_BITS-3: we can ignore the two low
276              bits from A */
277           /* since n=1, and we ensured an <= n, we also have an=1 */
278           MPFR_ASSERTD(an == 1);
279           umul_ppmm (s[1], s[0], r[0], a[0]);
280         }
281       else
282         {
283           /* we have p <= n * GMP_NUMB_BITS
284              2h <= rn * GMP_NUMB_BITS with p+3 <= 2h <= p+4
285              thus n <= rn <= n + 1 */
286           MPFR_ASSERTD(rn <= n + 1);
287           /* since we ensured an <= n, we have an <= rn */
288           MPFR_ASSERTD(an <= rn);
289           mpn_mul (s, r, rn, a, an);
290           /* s should be near B^sn/2^(1+as), thus s[sn-1] is either
291              100000... or 011111... if as=0, or
292              010000... or 001111... if as=1.
293              We ignore the bits of s after the first 2h+1+as ones.
294              We have {s, rn+an} = A*X_h^2/2 if as=0, A*X_h^2/4 if as=1. */
295         }
296 
297       /* We ignore the bits of s after the first 2h+1+as ones: s has rn + an
298          limbs, where rn = LIMBS(2h), an=LIMBS(a), and tn = LIMBS(2h+1+as). */
299       t = s + sn - tn; /* pointer to low limb of the high part of s */
300       /* the upper h-3 bits of 1-t should be zero,
301          where 1 corresponds to the most significant bit of t[tn-1] if as=0,
302          and to the 2nd most significant bit of t[tn-1] if as=1 */
303 
304       /* compute t <- 1 - t, which is B^tn - {t, tn+1},
305          with rounding toward -Inf, i.e., rounding the input t toward +Inf.
306          We could only modify the low tn - th limbs from t, but it gives only
307          a small speedup, and would make the code more complex.
308       */
309       neg = t[tn - 1] & (MPFR_LIMB_HIGHBIT >> as);
310       if (neg == 0) /* Ax^2 < 1: we have t = th + eps, where 0 <= eps < ulp(th)
311                        is the part truncated above, thus 1 - t rounded to -Inf
312                        is 1 - th - ulp(th) */
313         {
314           /* since the 1+as most significant bits of t are zero, set them
315              to 1 before the one-complement */
316           t[tn - 1] |= MPFR_LIMB_HIGHBIT | (MPFR_LIMB_HIGHBIT >> as);
317           MPFR_COM_N (t, t, tn);
318           /* we should add 1 here to get 1-th complement, and subtract 1 for
319              -ulp(th), thus we do nothing */
320         }
321       else /* negative case: we want 1 - t rounded toward -Inf, i.e.,
322               th + eps rounded toward +Inf, which is th + ulp(th):
323               we discard the bit corresponding to 1,
324               and we add 1 to the least significant bit of t */
325         {
326           t[tn - 1] ^= neg;
327           mpn_add_1 (t, t, tn, 1);
328         }
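      /* (In the neg = 0 branch above, no extra +1/-1 is needed because, in
         limb arithmetic, B^tn - T - 1 is exactly the one's complement ~T of
         T = {t, tn}: the +1 of the two's complement and the -1 for -ulp(th)
         cancel.) */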
329       tn -= th; /* we know at least th = floor((h+1+as-3)/GMP_NUMB_BITS) of
330                    the high limbs of {t, tn} are zero */
331 
332       /* tn >= rn - th, where rn * GMP_NUMB_BITS >= 2*h and
333          th * GMP_NUMB_BITS <= h+1+as-3, thus tn > 0 */
334       MPFR_ASSERTD(tn > 0);
335 
336       /* u <- x * t, where {t, tn} contains at least h+3 bits,
337          and {x, xn} contains h bits, thus tn >= xn */
338       MPFR_ASSERTD(tn >= xn);
339       if (tn == 1) /* necessarily xn=1 */
340         umul_ppmm (u[1], u[0], t[0], x[ln]);
341       else
342         mpn_mul (u, t, tn, x + ln, xn);
343 
344       /* we have {u, tn+xn} = T_l X_h/2 if as=0, T_l X_h/4 if as=1 */
345 
346       /* we have already discarded the upper th high limbs of t, thus we only
347          have to consider the upper n - th limbs of u */
348       un = n - th; /* un cannot be zero, since p <= n*GMP_NUMB_BITS,
349                       h = ceil((p+3)/2) <= (p+4)/2,
350                       th*GMP_NUMB_BITS <= h-1 <= p/2+1,
351                       thus (n-th)*GMP_NUMB_BITS >= p/2-1.
352                    */
353       MPFR_ASSERTD(un > 0);
354       u += (tn + xn) - un; /* xn + tn - un = xn + (original_tn - th) - (n - th)
355                                            = xn + original_tn - n
356                               = LIMBS(h) + LIMBS(2h+1+as) - LIMBS(p) > 0
357                               since 2h >= p+3 */
358       MPFR_ASSERTD(tn + xn > un); /* will allow to access u[-1] below */
359 
360       /* In case as=0, u contains |x*(1-Ax^2)/2|, which is exactly what we
361          need to add or subtract.
362          In case as=1, u contains |x*(1-Ax^2)/4|, thus we need to multiply
363          u by 2. */
364 
365       if (as == 1)
366         /* shift on un+1 limbs to get most significant bit of u[-1] into
367            least significant bit of u[0] */
368         mpn_lshift (u - 1, u - 1, un + 1, 1);
369 
370       /* now {u,un} represents U / 2 from Algorithm 3.9 */
371 
372       pl = n * GMP_NUMB_BITS - p;       /* low bits from x */
373       /* We want the low pl bits to be zero after rounding to nearest,
374          thus we round u to nearest at bit pl-1 of u[0] */
375       if (pl > 0)
376         {
377           cu = mpn_add_1 (u, u, un, u[0] & (MPFR_LIMB_ONE << (pl - 1)));
378           /* mask bits 0..pl-1 of u[0] */
379           u[0] &= ~MPFR_LIMB_MASK(pl);
380         }
381       else /* round bit is in u[-1] */
382         cu = mpn_add_1 (u, u, un, u[-1] >> (GMP_NUMB_BITS - 1));
383       MPFR_ASSERTN(cu == 0);
384 
385       /* We already have filled {x + ln, xn = n - ln}, and we want to add or
386          subtract {u, un} at position x.
387          un = n - th, where th contains <= h+1+as-3<=h-1 bits
388          ln = n - xn, where xn contains >= h bits
389          thus un > ln.
390          Warning: ln might be zero.
391       */
392       MPFR_ASSERTD(un > ln);
393       /* we can have un = ln + 2, for example with GMP_NUMB_BITS=32 and
394          p=62, as=0, then h=33, n=2, th=0, xn=2, thus un=2 and ln=0. */
395       MPFR_ASSERTD(un == ln + 1 || un == ln + 2);
396       /* the high un-ln limbs of u will overlap the low part of {x+ln,xn},
397          we need to add or subtract the overlapping part {u + ln, un - ln} */
398       /* Warning! th may be 0, in which case the mpn_add_1 and mpn_sub_1
399          below (with size = th) mustn't be used. */
400       if (neg == 0)
401         {
402           if (ln > 0)
403             MPN_COPY (x, u, ln);
404           cy = mpn_add (x + ln, x + ln, xn, u + ln, un - ln);
405           /* cy is the carry at x + (ln + xn) = x + n */
406         }
407       else /* negative case */
408         {
409           /* subtract {u+ln, un-ln} from {x+ln,un} */
410           cy = mpn_sub (x + ln, x + ln, xn, u + ln, un - ln);
411           /* cy is the borrow at x + (ln + xn) = x + n */
412 
413           /* cy cannot be non-zero, since the most significant bit of Xh is 1,
414              and the correction is bounded by 2^{-h+3} */
415           MPFR_ASSERTD(cy == 0);
416           if (ln > 0)
417             {
418               MPFR_COM_N (x, u, ln);
419               /* we must add one for the 2-complement ... */
420               cy = mpn_add_1 (x, x, n, MPFR_LIMB_ONE);
421               /* ... and subtract 1 at x[ln], where n = ln + xn */
422               cy -= mpn_sub_1 (x + ln, x + ln, xn, MPFR_LIMB_ONE);
423             }
424         }
425 
426       /* cy can be 1 when A=1, i.e., {a, n} = B^n. In that case we should
427          have X = B^n, and setting X to 1-2^{-p} satisfies the error bound
428          of 1 ulp. */
429       if (MPFR_UNLIKELY(cy != 0))
430         {
431           cy -= mpn_sub_1 (x, x, n, MPFR_LIMB_ONE << pl);
432           MPFR_ASSERTD(cy == 0);
433         }
434 
435       MPFR_TMP_FREE (marker);
436     }
437 }
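
/* A minimal usage sketch of mpfr_mpn_rec_sqrt (illustration only, kept out of
   the build with #if 0, and assuming GMP_NUMB_BITS = 64 so that one limb holds
   both the input and the output): with as = 1 and {a,1}/B = 3/4 we have
   A = 2^(1+1) * 3/4 = 3, so the call below puts into x[0] a 24-bit,
   left-aligned approximation of 1/sqrt(3) = 0.57735026... */
#if 0
static void
example_rec_sqrt_of_three (void)
{
  mp_limb_t a[1], x[1];

  a[0] = MPFR_LIMB_HIGHBIT | (MPFR_LIMB_HIGHBIT >> 1); /* {a,1}/B = 3/4 */
  mpfr_mpn_rec_sqrt (x, 24, a, GMP_NUMB_BITS, 1);      /* as = 1 since A = 3 >= 2 */
  /* now |x[0]/2^64 - 1/sqrt(3)| <= 2^(-24), and the low 64-24 = 40 bits of
     x[0] are zero */
}
#endif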
438 
439 int
440 mpfr_rec_sqrt (mpfr_ptr r, mpfr_srcptr u, mpfr_rnd_t rnd_mode)
441 {
442   mpfr_prec_t rp, up, wp;
443   mp_size_t rn, wn;
444   int s, cy, inex;
445   mpfr_limb_ptr x;
446   MPFR_TMP_DECL(marker);
447 
448   MPFR_LOG_FUNC
449     (("x[%Pu]=%.*Rg rnd=%d", mpfr_get_prec (u), mpfr_log_prec, u, rnd_mode),
450      ("y[%Pu]=%.*Rg inexact=%d", mpfr_get_prec (r), mpfr_log_prec, r, inex));
451 
452   /* special values */
453   if (MPFR_UNLIKELY(MPFR_IS_SINGULAR(u)))
454     {
455       if (MPFR_IS_NAN(u))
456         {
457           MPFR_SET_NAN(r);
458           MPFR_RET_NAN;
459         }
460       else if (MPFR_IS_ZERO(u)) /* 1/sqrt(+0) = 1/sqrt(-0) = +Inf */
461         {
462           /* 0+ or 0- */
463           MPFR_SET_INF(r);
464           MPFR_SET_POS(r);
465           mpfr_set_divby0 ();
466           MPFR_RET(0); /* Inf is exact */
467         }
468       else
469         {
470           MPFR_ASSERTD(MPFR_IS_INF(u));
471           /* 1/sqrt(-Inf) = NAN */
472           if (MPFR_IS_NEG(u))
473             {
474               MPFR_SET_NAN(r);
475               MPFR_RET_NAN;
476             }
477           /* 1/sqrt(+Inf) = +0 */
478           MPFR_SET_POS(r);
479           MPFR_SET_ZERO(r);
480           MPFR_RET(0);
481         }
482     }
483 
484   /* if u < 0, 1/sqrt(u) is NaN */
485   if (MPFR_UNLIKELY(MPFR_IS_NEG(u)))
486     {
487       MPFR_SET_NAN(r);
488       MPFR_RET_NAN;
489     }
490 
491   MPFR_SET_POS(r);
492 
493   rp = MPFR_PREC(r); /* output precision */
494   up = MPFR_PREC(u); /* input precision */
495   wp = rp + 11;      /* initial working precision */
496 
497   /* Let u = U*2^e, where e = EXP(u), and 1/2 <= U < 1.
498      If e is even, we compute an approximation X of (4U)^{-1/2},
499      and the result is X*2^(-(e-2)/2) [case s=1].
500      If e is odd, we compute an approximation X of (2U)^{-1/2},
501      and the result is X*2^(-(e-1)/2) [case s=0]. */
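  /* For instance: u = 3 has U = 0.75 and e = 2 (even), so s = 1,
     X ~= (4*0.75)^{-1/2} = 1/sqrt(3) = 0.5773..., and the result is X*2^0;
     u = 1 has U = 0.5 and e = 1 (odd), so s = 0 and X = (2*0.5)^{-1/2} = 1. */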
502 
503   /* parity of the exponent of u */
504   s = 1 - ((mpfr_uexp_t) MPFR_GET_EXP (u) & 1);
505 
506   rn = LIMB_SIZE(rp);
507 
508   /* for the first iteration, if rp + 11 fits into rn limbs, we round wp
509      up to a full limb, to maximize the chance of correct rounding while
510      avoiding the allocation of extra space */
511   wp = rp + 11;
512   if (wp < rn * GMP_NUMB_BITS)
513     wp = rn * GMP_NUMB_BITS;
514   for (;;)
515     {
516       MPFR_TMP_MARK (marker);
517       wn = LIMB_SIZE(wp);
518       if (r == u || wn > rn) /* out of place, i.e., we cannot write to r */
519         x = MPFR_TMP_LIMBS_ALLOC (wn);
520       else
521         x = MPFR_MANT(r);
522       mpfr_mpn_rec_sqrt (x, wp, MPFR_MANT(u), up, s);
523       /* If the input was not truncated, the error is at most one ulp;
524          if the input was truncated, the error is at most two ulps
525          (see algorithms.tex). */
526       if (MPFR_LIKELY (mpfr_round_p (x, wn, wp - (wp < up),
527                                      rp + (rnd_mode == MPFR_RNDN))))
528         break;
529 
530       /* We detect only now the exact case where u=2^(2e), to avoid
531          slowing down the average case. This can happen only when the
532          mantissa is exactly 1/2 and the exponent is odd. */
533       if (s == 0 && mpfr_cmp_ui_2exp (u, 1, MPFR_EXP(u) - 1) == 0)
534         {
535           mpfr_prec_t pl = wn * GMP_NUMB_BITS - wp;
536 
537           /* we should have x=111...111 */
538           mpn_add_1 (x, x, wn, MPFR_LIMB_ONE << pl);
539           x[wn - 1] = MPFR_LIMB_HIGHBIT;
540           s += 2;
541           break; /* go through */
542         }
543       MPFR_TMP_FREE(marker);
544 
545       wp += GMP_NUMB_BITS;
546     }
547   cy = mpfr_round_raw (MPFR_MANT(r), x, wp, 0, rp, rnd_mode, &inex);
548   MPFR_EXP(r) = - (MPFR_EXP(u) - 1 - s) / 2;
549   if (MPFR_UNLIKELY(cy != 0))
550     {
551       MPFR_EXP(r) ++;
552       MPFR_MANT(r)[rn - 1] = MPFR_LIMB_HIGHBIT;
553     }
554   MPFR_TMP_FREE(marker);
555   return mpfr_check_range (r, inex, rnd_mode);
556 }
557