/* mpfr_rec_sqrt -- inverse square root

Copyright 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
Contributed by the AriC and Caramel projects, INRIA.

This file is part of the GNU MPFR Library.

The GNU MPFR Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MPFR Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MPFR Library; see the file COPYING.LESSER. If not, see
http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */

#include <stdio.h>
#include <stdlib.h>

#define MPFR_NEED_LONGLONG_H /* for umul_ppmm */
#include "mpfr-impl.h"

#define LIMB_SIZE(x) ((((x)-1)>>MPFR_LOG2_GMP_NUMB_BITS) + 1)

#define MPFR_COM_N(x,y,n) \
  { \
    mp_size_t i; \
    for (i = 0; i < n; i++) \
      *((x)+i) = ~*((y)+i); \
  }

/* Put in X a p-bit approximation of 1/sqrt(A),
   where X = {x, n}/B^n, n = ceil(p/GMP_NUMB_BITS),
   A = 2^(1+as)*{a, an}/B^an, as is 0 or 1, an = ceil(ap/GMP_NUMB_BITS),
   where B = 2^GMP_NUMB_BITS.

   We have 1 <= A < 4 and 1/2 <= X < 1.

   The error in the approximate result with respect to the true
   value 1/sqrt(A) is bounded by 1 ulp(X), i.e., 2^{-p} since 1/2 <= X < 1.

   Note: x and a are left-aligned, i.e., the most significant bit of
   a[an-1] is set, and so is the most significant bit of the output x[n-1].

   If p is not a multiple of GMP_NUMB_BITS, the extra low bits of the input
   A are taken into account to compute the approximation of 1/sqrt(A), but
   whether or not they are zero, the error between X and 1/sqrt(A) is bounded
   by 1 ulp(X) [in precision p].
   The extra low bits of the output X (if p is not a multiple of
   GMP_NUMB_BITS) are set to 0.

   Assumptions:
   (1) A should be normalized, i.e., the most significant bit of a[an-1]
       should be 1. If as=0, we have 1 <= A < 2; if as=1, we have 2 <= A < 4.
   (2) p >= 12
   (3) {a, an} and {x, n} should not overlap
   (4) GMP_NUMB_BITS >= 12 and is even

   Note: this routine is much more efficient when ap is small compared to p,
   including the case where ap <= GMP_NUMB_BITS, thus it can be used to
   implement an efficient mpfr_rec_sqrt_ui function.

   References:
   [1] Modern Computer Algebra, Richard Brent and Paul Zimmermann,
       http://www.loria.fr/~zimmerma/mca/pub226.html
*/
static void
mpfr_mpn_rec_sqrt (mpfr_limb_ptr x, mpfr_prec_t p,
                   mpfr_limb_srcptr a, mpfr_prec_t ap, int as)

{
  /* the following T1 and T2 are bipartite tables giving initial
     approximation for the inverse square root, with 13-bit input split in
     5+4+4, and 11-bit output. More precisely, if 2048 <= i < 8192,
     with i = a*2^8 + b*2^4 + c, we use for approximation of
     2048/sqrt(i/2048) the value x = T1[16*(a-8)+b] + T2[16*(a-8)+c].
     The largest error is obtained for i = 2054, where x = 2044,
     and 2048/sqrt(i/2048) = 2045.006576... */
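  /* Worked example of the table lookup described above (values taken from
     T1 and T2 below): for i = 2054 we have a = i >> 8 = 8,
     b = (i >> 4) & 0xF = 0, c = i & 0xF = 6, hence
     x = T1[16*(8-8)+0] + T2[16*(8-8)+6] = 2040 + 4 = 2044, which is the
     case of largest error mentioned above. */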
  static short int T1[384] = {
    2040, 2033, 2025, 2017, 2009, 2002, 1994, 1987, 1980, 1972, 1965, 1958, 1951,
    1944, 1938, 1931, /* a=8 */
    1925, 1918, 1912, 1905, 1899, 1892, 1886, 1880, 1874, 1867, 1861, 1855, 1849,
    1844, 1838, 1832, /* a=9 */
    1827, 1821, 1815, 1810, 1804, 1799, 1793, 1788, 1783, 1777, 1772, 1767, 1762,
    1757, 1752, 1747, /* a=10 */
    1742, 1737, 1733, 1728, 1723, 1718, 1713, 1709, 1704, 1699, 1695, 1690, 1686,
    1681, 1677, 1673, /* a=11 */
    1669, 1664, 1660, 1656, 1652, 1647, 1643, 1639, 1635, 1631, 1627, 1623, 1619,
    1615, 1611, 1607, /* a=12 */
    1603, 1600, 1596, 1592, 1588, 1585, 1581, 1577, 1574, 1570, 1566, 1563, 1559,
    1556, 1552, 1549, /* a=13 */
    1545, 1542, 1538, 1535, 1532, 1528, 1525, 1522, 1518, 1515, 1512, 1509, 1505,
    1502, 1499, 1496, /* a=14 */
    1493, 1490, 1487, 1484, 1481, 1478, 1475, 1472, 1469, 1466, 1463, 1460, 1457,
    1454, 1451, 1449, /* a=15 */
    1446, 1443, 1440, 1438, 1435, 1432, 1429, 1427, 1424, 1421, 1419, 1416, 1413,
    1411, 1408, 1405, /* a=16 */
    1403, 1400, 1398, 1395, 1393, 1390, 1388, 1385, 1383, 1380, 1378, 1375, 1373,
    1371, 1368, 1366, /* a=17 */
    1363, 1360, 1358, 1356, 1353, 1351, 1349, 1346, 1344, 1342, 1340, 1337, 1335,
    1333, 1331, 1329, /* a=18 */
    1327, 1325, 1323, 1321, 1319, 1316, 1314, 1312, 1310, 1308, 1306, 1304, 1302,
    1300, 1298, 1296, /* a=19 */
    1294, 1292, 1290, 1288, 1286, 1284, 1282, 1280, 1278, 1276, 1274, 1272, 1270,
    1268, 1266, 1265, /* a=20 */
    1263, 1261, 1259, 1257, 1255, 1253, 1251, 1250, 1248, 1246, 1244, 1242, 1241,
    1239, 1237, 1235, /* a=21 */
    1234, 1232, 1230, 1229, 1227, 1225, 1223, 1222, 1220, 1218, 1217, 1215, 1213,
    1212, 1210, 1208, /* a=22 */
    1206, 1204, 1203, 1201, 1199, 1198, 1196, 1195, 1193, 1191, 1190, 1188, 1187,
    1185, 1184, 1182, /* a=23 */
    1181, 1180, 1178, 1177, 1175, 1174, 1172, 1171, 1169, 1168, 1166, 1165, 1163,
    1162, 1160, 1159, /* a=24 */
    1157, 1156, 1154, 1153, 1151, 1150, 1149, 1147, 1146, 1144, 1143, 1142, 1140,
    1139, 1137, 1136, /* a=25 */
    1135, 1133, 1132, 1131, 1129, 1128, 1127, 1125, 1124, 1123, 1121, 1120, 1119,
    1117, 1116, 1115, /* a=26 */
    1114, 1113, 1111, 1110, 1109, 1108, 1106, 1105, 1104, 1103, 1101, 1100, 1099,
    1098, 1096, 1095, /* a=27 */
    1093, 1092, 1091, 1090, 1089, 1087, 1086, 1085, 1084, 1083, 1081, 1080, 1079,
    1078, 1077, 1076, /* a=28 */
    1075, 1073, 1072, 1071, 1070, 1069, 1068, 1067, 1065, 1064, 1063, 1062, 1061,
    1060, 1059, 1058, /* a=29 */
    1057, 1056, 1055, 1054, 1052, 1051, 1050, 1049, 1048, 1047, 1046, 1045, 1044,
    1043, 1042, 1041, /* a=30 */
    1040, 1039, 1038, 1037, 1036, 1035, 1034, 1033, 1032, 1031, 1030, 1029, 1028,
    1027, 1026, 1025 /* a=31 */
  };
  static unsigned char T2[384] = {
    7, 7, 6, 6, 5, 5, 4, 4, 4, 3, 3, 2, 2, 1, 1, 0, /* a=8 */
    6, 5, 5, 5, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 0, 0, /* a=9 */
    5, 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, /* a=10 */
    4, 4, 3, 3, 3, 3, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, /* a=11 */
    3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, /* a=12 */
    3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=13 */
    3, 3, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, /* a=14 */
    2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=15 */
    2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=16 */
    2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=17 */
    3, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, /* a=18 */
    2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=19 */
    1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, /* a=20 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=21 */
    1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=22 */
    2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* a=23 */
    1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=24 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=25 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=26 */
    1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=27 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=28 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=29 */
    1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=30 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* a=31 */
  };
  mp_size_t n = LIMB_SIZE(p);   /* number of limbs of X */
  mp_size_t an = LIMB_SIZE(ap); /* number of limbs of A */

  /* A should be normalized */
  MPFR_ASSERTD((a[an - 1] & MPFR_LIMB_HIGHBIT) != 0);
  /* We should have enough bits in one limb and GMP_NUMB_BITS should be even.
     Since that does not depend on MPFR, we always check this. */
  MPFR_ASSERTN((GMP_NUMB_BITS >= 12) && ((GMP_NUMB_BITS & 1) == 0));
  /* {a, an} and {x, n} should not overlap */
  MPFR_ASSERTD((a + an <= x) || (x + n <= a));
  MPFR_ASSERTD(p >= 11);

  if (MPFR_UNLIKELY(an > n)) /* we can cut the input to n limbs */
    {
      a += an - n;
      an = n;
    }

  if (p == 11) /* should happen only from recursive calls */
    {
      unsigned long i, ab, ac;
      mp_limb_t t;

      /* take the 12+as most significant bits of A */
      i = a[an - 1] >> (GMP_NUMB_BITS - (12 + as));
      /* if one wants faithful rounding for p=11, replace #if 0 by #if 1 */
      ab = i >> 4;
      ac = (ab & 0x3F0) | (i & 0x0F);
      t = (mp_limb_t) T1[ab - 0x80] + (mp_limb_t) T2[ac - 0x80];
      x[0] = t << (GMP_NUMB_BITS - p);
    }
  else /* p >= 12 */
    {
      mpfr_prec_t h, pl;
      mpfr_limb_ptr r, s, t, u;
      mp_size_t xn, rn, th, ln, tn, sn, ahn, un;
      mp_limb_t neg, cy, cu;
      MPFR_TMP_DECL(marker);

      /* compared to Algorithm 3.9 of [1], we have {a, an} = A/2 if as=0,
         and A/4 if as=1. */

      /* h = max(11, ceil((p+3)/2)) is the bitsize of the recursive call */
      h = (p < 18) ? 11 : (p >> 1) + 2;

      xn = LIMB_SIZE(h);     /* limb size of the recursive Xh */
      rn = LIMB_SIZE(2 * h); /* a priori limb size of Xh^2 */
      ln = n - xn;           /* remaining limbs to be computed */

      /* Since |Xh - A^{-1/2}| <= 2^{-h}, then by multiplying by Xh + A^{-1/2}
         we get |Xh^2 - 1/A| <= 2^{-h+1}, thus |A*Xh^2 - 1| <= 2^{-h+3},
         thus the h-3 most significant bits of t should be zero,
         which is in fact h+1+as-3 because of the normalization of A.
         This corresponds to th=floor((h+1+as-3)/GMP_NUMB_BITS) limbs.

         More precisely we have |Xh^2 - 1/A| <= 2^{-h} * (Xh + A^{-1/2})
         <= 2^{-h} * (2 A^{-1/2} + 2^{-h}) <= 2.001 * 2^{-h} * A^{-1/2}
         since A < 4 and h >= 11, thus
         |A*Xh^2 - 1| <= 2.001 * 2^{-h} * A^{1/2} <= 1.001 * 2^{2-h}.
         This is sufficient to prove that the upper limb of {t,tn} below is
         less than 0.501 * 2^GMP_NUMB_BITS, thus cu = 0 below.
      */
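      /* In terms of Algorithm 3.9 of [1], the steps below amount to one
         Newton-type correction for the inverse square root,
             X = Xh + Xh*(1 - A*Xh^2)/2,
         where {r, rn} will hold Xh^2, {t, tn} the residual 1 - A*Xh^2
         (suitably scaled according to as), and {u, un} the correction term
         Xh*(1 - A*Xh^2)/2, which is finally added to or subtracted from
         Xh. */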
      th = (h + 1 + as - 3) >> MPFR_LOG2_GMP_NUMB_BITS;
      tn = LIMB_SIZE(2 * h + 1 + as);

      /* we need h+1+as bits of a */
      ahn = LIMB_SIZE(h + 1 + as); /* number of high limbs of A
                                      needed for the recursive call */
      if (MPFR_UNLIKELY(ahn > an))
        ahn = an;
      mpfr_mpn_rec_sqrt (x + ln, h, a + an - ahn, ahn * GMP_NUMB_BITS, as);
      /* the h most significant bits of X are set, X has ceil(h/GMP_NUMB_BITS)
         limbs, the low (-h) % GMP_NUMB_BITS bits are zero */

      /* compared to Algorithm 3.9 of [1], we have {x+ln,xn} = X_h */

      MPFR_TMP_MARK (marker);
      /* first step: square X in r, result is exact */
      un = xn + (tn - th);
      /* We use the same temporary buffer to store r and u: r needs 2*xn
         limbs where u needs xn+(tn-th) limbs. Since tn can store at least
         2h bits, and th at most h bits, then tn-th can store at least h bits,
         thus tn - th >= xn, and reserving the space for u is enough. */
      MPFR_ASSERTD(2 * xn <= un);
      u = r = MPFR_TMP_LIMBS_ALLOC (un);
      if (2 * h <= GMP_NUMB_BITS) /* xn=rn=1, and since p <= 2h-3, n=1,
                                     thus ln = 0 */
        {
          MPFR_ASSERTD(ln == 0);
          cy = x[0] >> (GMP_NUMB_BITS >> 1);
          r ++;
          r[0] = cy * cy;
        }
      else if (xn == 1) /* xn=1, rn=2 */
        umul_ppmm(r[1], r[0], x[ln], x[ln]);
      else
        {
          mpn_mul_n (r, x + ln, x + ln, xn);
          /* we have {r, 2*xn} = X_h^2 */
          if (rn < 2 * xn)
            r ++;
        }
      /* now the 2h most significant bits of {r, rn} contain X^2, r has rn
         limbs, and the low (-2h) % GMP_NUMB_BITS bits are zero */

      /* Second step: s <- A * X_h^2, i.e., multiply {r, rn} by {a, an}, and
         truncate the low ap bits, i.e., at weight 2^{-2h} (s is aligned to
         the low significant bits) */
      sn = an + rn;
      s = MPFR_TMP_LIMBS_ALLOC (sn);
      if (rn == 1) /* rn=1 implies n=1, since rn*GMP_NUMB_BITS >= 2h,
                      and 2h >= p+3 */
        {
          /* necessarily p <= GMP_NUMB_BITS-3: we can ignore the two low
             bits from A */
          /* since n=1, and we ensured an <= n, we also have an=1 */
          MPFR_ASSERTD(an == 1);
          umul_ppmm (s[1], s[0], r[0], a[0]);
        }
      else
        {
          /* we have p <= n * GMP_NUMB_BITS,
             2h <= rn * GMP_NUMB_BITS with p+3 <= 2h <= p+4,
             thus n <= rn <= n + 1 */
          MPFR_ASSERTD(rn <= n + 1);
          /* since we ensured an <= n, we have an <= rn */
          MPFR_ASSERTD(an <= rn);
          mpn_mul (s, r, rn, a, an);
          /* s should be near B^sn/2^(1+as), thus s[sn-1] is either
             100000... or 011111... if as=0, or
             010000... or 001111... if as=1.
             We ignore the bits of s after the first 2h+1+as ones.
             We have {s, rn+an} = A*X_h^2/2 if as=0, A*X_h^2/4 if as=1. */
        }

      /* We ignore the bits of s after the first 2h+1+as ones: s has rn + an
         limbs, where rn = LIMBS(2h), an = LIMBS(a), and tn = LIMBS(2h+1+as). */
      t = s + sn - tn; /* pointer to low limb of the high part of t */
      /* the upper h-3 bits of 1-t should be zero,
         where 1 corresponds to the most significant bit of t[tn-1] if as=0,
         and to the 2nd most significant bit of t[tn-1] if as=1 */

      /* compute t <- 1 - t, which is B^tn - {t, tn+1},
         with rounding toward -Inf, i.e., rounding the input t toward +Inf.
         We could only modify the low tn - th limbs from t, but it gives only
         a small speedup, and would make the code more complex.
      */
      neg = t[tn - 1] & (MPFR_LIMB_HIGHBIT >> as);
      if (neg == 0) /* Ax^2 < 1: we have t = th + eps, where 0 <= eps < ulp(th)
                       is the part truncated above, thus 1 - t rounded to -Inf
                       is 1 - th - ulp(th) */
        {
          /* since the 1+as most significant bits of t are zero, set them
             to 1 before the one-complement */
          t[tn - 1] |= MPFR_LIMB_HIGHBIT | (MPFR_LIMB_HIGHBIT >> as);
          MPFR_COM_N (t, t, tn);
          /* we should add 1 here to get 1-th complement, and subtract 1 for
             -ulp(th), thus we do nothing */
        }
      else /* negative case: we want 1 - t rounded toward -Inf, i.e.,
              th + eps rounded toward +Inf, which is th + ulp(th):
              we discard the bit corresponding to 1,
              and we add 1 to the least significant bit of t */
        {
          t[tn - 1] ^= neg;
          mpn_add_1 (t, t, tn, 1);
        }
      tn -= th; /* we know at least th = floor((h+1+as-3)/GMP_NUMB_BITS) of
                   the high limbs of {t, tn} are zero */

      /* tn = rn - th, where rn * GMP_NUMB_BITS >= 2*h and
         th * GMP_NUMB_BITS <= h+1+as-3, thus tn > 0 */
      MPFR_ASSERTD(tn > 0);

      /* u <- x * t, where {t, tn} contains at least h+3 bits,
         and {x, xn} contains h bits, thus tn >= xn */
      MPFR_ASSERTD(tn >= xn);
      if (tn == 1) /* necessarily xn=1 */
        umul_ppmm (u[1], u[0], t[0], x[ln]);
      else
        mpn_mul (u, t, tn, x + ln, xn);

      /* we have {u, tn+xn} = T_l X_h/2 if as=0, T_l X_h/4 if as=1 */

      /* we have already discarded the upper th high limbs of t, thus we only
         have to consider the upper n - th limbs of u */
      un = n - th; /* un cannot be zero, since p <= n*GMP_NUMB_BITS,
                      h = ceil((p+3)/2) <= (p+4)/2,
                      th*GMP_NUMB_BITS <= h-1 <= p/2+1,
                      thus (n-th)*GMP_NUMB_BITS >= p/2-1.
                    */
      MPFR_ASSERTD(un > 0);
      u += (tn + xn) - un; /* xn + tn - un = xn + (original_tn - th) - (n - th)
                              = xn + original_tn - n
                              = LIMBS(h) + LIMBS(2h+1+as) - LIMBS(p) > 0
                              since 2h >= p+3 */
      MPFR_ASSERTD(tn + xn > un); /* will allow us to access u[-1] below */

      /* In case as=0, u contains |x*(1-Ax^2)/2|, which is exactly what we
         need to add or subtract.
         In case as=1, u contains |x*(1-Ax^2)/4|, thus we need to multiply
         u by 2. */

      if (as == 1)
        /* shift on un+1 limbs to get the most significant bit of u[-1] into
           the least significant bit of u[0] */
        mpn_lshift (u - 1, u - 1, un + 1, 1);

      /* now {u,un} represents U / 2 from Algorithm 3.9 */

      pl = n * GMP_NUMB_BITS - p; /* low bits from x */
      /* We want the low pl bits to be zero after rounding to nearest,
         thus we round u to nearest at bit pl-1 of u[0] */
      if (pl > 0)
        {
          cu = mpn_add_1 (u, u, un, u[0] & (MPFR_LIMB_ONE << (pl - 1)));
          /* mask bits 0..pl-1 of u[0] */
          u[0] &= ~MPFR_LIMB_MASK(pl);
        }
      else /* round bit is in u[-1] */
        cu = mpn_add_1 (u, u, un, u[-1] >> (GMP_NUMB_BITS - 1));
      MPFR_ASSERTN(cu == 0);

      /* We have already filled {x + ln, xn = n - ln}, and we want to add or
         subtract {u, un} at position x.
         un = n - th, where th contains <= h+1+as-3 <= h-1 bits,
         ln = n - xn, where xn contains >= h bits,
         thus un > ln.
         Warning: ln might be zero.
      */
      MPFR_ASSERTD(un > ln);
      /* we can have un = ln + 2, for example with GMP_NUMB_BITS=32 and
         p=62, as=0, then h=33, n=2, th=0, xn=2, thus un=2 and ln=0. */
      MPFR_ASSERTD(un == ln + 1 || un == ln + 2);
      /* the high un-ln limbs of u will overlap the low part of {x+ln,xn},
         we need to add or subtract the overlapping part {u + ln, un - ln} */
      /* Warning! th may be 0, in which case the mpn_add_1 and mpn_sub_1
         below (with size = th) mustn't be used. */
      if (neg == 0)
        {
          if (ln > 0)
            MPN_COPY (x, u, ln);
          cy = mpn_add (x + ln, x + ln, xn, u + ln, un - ln);
          /* cy is the carry at x + (ln + xn) = x + n */
        }
      else /* negative case */
        {
          /* subtract {u+ln, un-ln} from {x+ln, xn} */
          cy = mpn_sub (x + ln, x + ln, xn, u + ln, un - ln);
          /* cy is the borrow at x + (ln + xn) = x + n */

          /* cy cannot be non-zero, since the most significant bit of Xh is 1,
             and the correction is bounded by 2^{-h+3} */
          MPFR_ASSERTD(cy == 0);
          if (ln > 0)
            {
              MPFR_COM_N (x, u, ln);
              /* we must add one for the 2-complement ... */
              cy = mpn_add_1 (x, x, n, MPFR_LIMB_ONE);
              /* ... and subtract 1 at x[ln], where n = ln + xn */
              cy -= mpn_sub_1 (x + ln, x + ln, xn, MPFR_LIMB_ONE);
            }
        }

      /* cy can be 1 when A=1, i.e., {a, n} = B^n. In that case we should
         have X = B^n, and setting X to 1-2^{-p} satisfies the error bound
         of 1 ulp. */
      if (MPFR_UNLIKELY(cy != 0))
        {
          cy -= mpn_sub_1 (x, x, n, MPFR_LIMB_ONE << pl);
          MPFR_ASSERTD(cy == 0);
        }

      MPFR_TMP_FREE (marker);
    }
}

int
mpfr_rec_sqrt (mpfr_ptr r, mpfr_srcptr u, mpfr_rnd_t rnd_mode)
{
  mpfr_prec_t rp, up, wp;
  mp_size_t rn, wn;
  int s, cy, inex;
  mpfr_limb_ptr x;
  MPFR_TMP_DECL(marker);

  MPFR_LOG_FUNC
    (("x[%Pu]=%.*Rg rnd=%d", mpfr_get_prec (u), mpfr_log_prec, u, rnd_mode),
     ("y[%Pu]=%.*Rg inexact=%d", mpfr_get_prec (r), mpfr_log_prec, r, inex));

  /* special values */
  if (MPFR_UNLIKELY(MPFR_IS_SINGULAR(u)))
    {
      if (MPFR_IS_NAN(u))
        {
          MPFR_SET_NAN(r);
          MPFR_RET_NAN;
        }
      else if (MPFR_IS_ZERO(u)) /* 1/sqrt(+0) = 1/sqrt(-0) = +Inf */
        {
          /* 0+ or 0- */
          MPFR_SET_INF(r);
          MPFR_SET_POS(r);
          mpfr_set_divby0 ();
          MPFR_RET(0); /* Inf is exact */
        }
      else
        {
          MPFR_ASSERTD(MPFR_IS_INF(u));
          /* 1/sqrt(-Inf) = NaN */
          if (MPFR_IS_NEG(u))
            {
              MPFR_SET_NAN(r);
              MPFR_RET_NAN;
            }
          /* 1/sqrt(+Inf) = +0 */
          MPFR_SET_POS(r);
          MPFR_SET_ZERO(r);
          MPFR_RET(0);
        }
    }

  /* if u < 0, 1/sqrt(u) is NaN */
  if (MPFR_UNLIKELY(MPFR_IS_NEG(u)))
    {
      MPFR_SET_NAN(r);
      MPFR_RET_NAN;
    }

  MPFR_SET_POS(r);

  rp = MPFR_PREC(r); /* output precision */
  up = MPFR_PREC(u); /* input precision */
  wp = rp + 11;      /* initial working precision */

  /* Let u = U*2^e, where e = EXP(u), and 1/2 <= U < 1.
     If e is even, we compute an approximation X of (4U)^{-1/2},
     and the result is X*2^(-(e-2)/2) [case s=1].
     If e is odd, we compute an approximation X of (2U)^{-1/2},
     and the result is X*2^(-(e-1)/2) [case s=0].
  */
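  /* Indeed, 1/sqrt(u) = (2^(1+s)*U)^{-1/2} * 2^((1+s-e)/2): for e even (s=1)
     this gives (4U)^{-1/2} * 2^(-(e-2)/2) with 2 <= 4U < 4, and for e odd
     (s=0) it gives (2U)^{-1/2} * 2^(-(e-1)/2) with 1 <= 2U < 2, matching the
     normalization 1 <= A < 4 required by mpfr_mpn_rec_sqrt. */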
  /* parity of the exponent of u */
  s = 1 - ((mpfr_uexp_t) MPFR_GET_EXP (u) & 1);

  rn = LIMB_SIZE(rp);

  /* for the first iteration, if rp + 11 fits into rn limbs, we round wp up
     to a full limb to maximize the chance of rounding, while avoiding the
     allocation of extra space */
  wp = rp + 11;
  if (wp < rn * GMP_NUMB_BITS)
    wp = rn * GMP_NUMB_BITS;
  for (;;)
    {
      MPFR_TMP_MARK (marker);
      wn = LIMB_SIZE(wp);
      if (r == u || wn > rn) /* out of place, i.e., we cannot write to r */
        x = MPFR_TMP_LIMBS_ALLOC (wn);
      else
        x = MPFR_MANT(r);
      mpfr_mpn_rec_sqrt (x, wp, MPFR_MANT(u), up, s);
      /* If the input was not truncated, the error is at most one ulp;
         if the input was truncated, the error is at most two ulps
         (see algorithms.tex). */
      if (MPFR_LIKELY (mpfr_round_p (x, wn, wp - (wp < up),
                                     rp + (rnd_mode == MPFR_RNDN))))
        break;

      /* We detect only now the exact case where u=2^(2e), to avoid
         slowing down the average case. This can happen only when the
         mantissa is exactly 1/2 and the exponent is odd. */
      if (s == 0 && mpfr_cmp_ui_2exp (u, 1, MPFR_EXP(u) - 1) == 0)
        {
          mpfr_prec_t pl = wn * GMP_NUMB_BITS - wp;

          /* we should have x = 111...111 */
          mpn_add_1 (x, x, wn, MPFR_LIMB_ONE << pl);
          x[wn - 1] = MPFR_LIMB_HIGHBIT;
          s += 2;
          break; /* go through */
        }
      MPFR_TMP_FREE(marker);

      wp += GMP_NUMB_BITS;
    }
  cy = mpfr_round_raw (MPFR_MANT(r), x, wp, 0, rp, rnd_mode, &inex);
  MPFR_EXP(r) = - (MPFR_EXP(u) - 1 - s) / 2;
  if (MPFR_UNLIKELY(cy != 0))
    {
      MPFR_EXP(r) ++;
      MPFR_MANT(r)[rn - 1] = MPFR_LIMB_HIGHBIT;
    }
  MPFR_TMP_FREE(marker);
  return mpfr_check_range (r, inex, rnd_mode);
}
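/* Illustrative usage sketch (not part of this file): computing 1/sqrt(2)
   to 100 bits with the public mpfr_rec_sqrt interface defined above.

     #include <stdio.h>
     #include <mpfr.h>

     int
     main (void)
     {
       mpfr_t r, u;
       mpfr_init2 (r, 100);             // destination, 100-bit precision
       mpfr_init2 (u, 100);
       mpfr_set_ui (u, 2, MPFR_RNDN);   // u = 2
       mpfr_rec_sqrt (r, u, MPFR_RNDN); // r = 1/sqrt(2) rounded to nearest
       mpfr_printf ("%.30Rg\n", r);
       mpfr_clears (r, u, (mpfr_ptr) 0);
       return 0;
     }
*/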