1 /* mpn_sqrtrem -- square root and remainder 2 3 Contributed to the GNU project by Paul Zimmermann (most code) and 4 Torbjorn Granlund (mpn_sqrtrem1). 5 6 THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH A 7 MUTABLE INTERFACE. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED 8 INTERFACES. IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR 9 DISAPPEAR IN A FUTURE GMP RELEASE. 10 11 Copyright 1999, 2000, 2001, 2002, 2004, 2005, 2008, 2010 Free Software 12 Foundation, Inc. 13 14 This file is part of the GNU MP Library. 15 16 The GNU MP Library is free software; you can redistribute it and/or modify 17 it under the terms of the GNU Lesser General Public License as published by 18 the Free Software Foundation; either version 3 of the License, or (at your 19 option) any later version. 20 21 The GNU MP Library is distributed in the hope that it will be useful, but 22 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 23 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 24 License for more details. 25 26 You should have received a copy of the GNU Lesser General Public License 27 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ 28 29 30 /* See "Karatsuba Square Root", reference in gmp.texi. */ 31 32 33 #include <stdio.h> 34 #include <stdlib.h> 35 36 #include "gmp.h" 37 #include "gmp-impl.h" 38 #include "longlong.h" 39 40 static const unsigned short invsqrttab[384] = 41 { 42 0x1ff,0x1fd,0x1fb,0x1f9,0x1f7,0x1f5,0x1f3,0x1f2, /* sqrt(1/80)..sqrt(1/87) */ 43 0x1f0,0x1ee,0x1ec,0x1ea,0x1e9,0x1e7,0x1e5,0x1e4, /* sqrt(1/88)..sqrt(1/8f) */ 44 0x1e2,0x1e0,0x1df,0x1dd,0x1db,0x1da,0x1d8,0x1d7, /* sqrt(1/90)..sqrt(1/97) */ 45 0x1d5,0x1d4,0x1d2,0x1d1,0x1cf,0x1ce,0x1cc,0x1cb, /* sqrt(1/98)..sqrt(1/9f) */ 46 0x1c9,0x1c8,0x1c6,0x1c5,0x1c4,0x1c2,0x1c1,0x1c0, /* sqrt(1/a0)..sqrt(1/a7) */ 47 0x1be,0x1bd,0x1bc,0x1ba,0x1b9,0x1b8,0x1b7,0x1b5, /* sqrt(1/a8)..sqrt(1/af) */ 48 0x1b4,0x1b3,0x1b2,0x1b0,0x1af,0x1ae,0x1ad,0x1ac, /* sqrt(1/b0)..sqrt(1/b7) */ 49 0x1aa,0x1a9,0x1a8,0x1a7,0x1a6,0x1a5,0x1a4,0x1a3, /* sqrt(1/b8)..sqrt(1/bf) */ 50 0x1a2,0x1a0,0x19f,0x19e,0x19d,0x19c,0x19b,0x19a, /* sqrt(1/c0)..sqrt(1/c7) */ 51 0x199,0x198,0x197,0x196,0x195,0x194,0x193,0x192, /* sqrt(1/c8)..sqrt(1/cf) */ 52 0x191,0x190,0x18f,0x18e,0x18d,0x18c,0x18c,0x18b, /* sqrt(1/d0)..sqrt(1/d7) */ 53 0x18a,0x189,0x188,0x187,0x186,0x185,0x184,0x183, /* sqrt(1/d8)..sqrt(1/df) */ 54 0x183,0x182,0x181,0x180,0x17f,0x17e,0x17e,0x17d, /* sqrt(1/e0)..sqrt(1/e7) */ 55 0x17c,0x17b,0x17a,0x179,0x179,0x178,0x177,0x176, /* sqrt(1/e8)..sqrt(1/ef) */ 56 0x176,0x175,0x174,0x173,0x172,0x172,0x171,0x170, /* sqrt(1/f0)..sqrt(1/f7) */ 57 0x16f,0x16f,0x16e,0x16d,0x16d,0x16c,0x16b,0x16a, /* sqrt(1/f8)..sqrt(1/ff) */ 58 0x16a,0x169,0x168,0x168,0x167,0x166,0x166,0x165, /* sqrt(1/100)..sqrt(1/107) */ 59 0x164,0x164,0x163,0x162,0x162,0x161,0x160,0x160, /* sqrt(1/108)..sqrt(1/10f) */ 60 0x15f,0x15e,0x15e,0x15d,0x15c,0x15c,0x15b,0x15a, /* sqrt(1/110)..sqrt(1/117) */ 61 0x15a,0x159,0x159,0x158,0x157,0x157,0x156,0x156, /* sqrt(1/118)..sqrt(1/11f) */ 62 0x155,0x154,0x154,0x153,0x153,0x152,0x152,0x151, /* sqrt(1/120)..sqrt(1/127) */ 63 0x150,0x150,0x14f,0x14f,0x14e,0x14e,0x14d,0x14d, /* sqrt(1/128)..sqrt(1/12f) */ 64 0x14c,0x14b,0x14b,0x14a,0x14a,0x149,0x149,0x148, /* sqrt(1/130)..sqrt(1/137) */ 65 0x148,0x147,0x147,0x146,0x146,0x145,0x145,0x144, /* sqrt(1/138)..sqrt(1/13f) */ 66 0x144,0x143,0x143,0x142,0x142,0x141,0x141,0x140, /* sqrt(1/140)..sqrt(1/147) */ 67 0x140,0x13f,0x13f,0x13e,0x13e,0x13d,0x13d,0x13c, /* sqrt(1/148)..sqrt(1/14f) */ 68 0x13c,0x13b,0x13b,0x13a,0x13a,0x139,0x139,0x139, /* sqrt(1/150)..sqrt(1/157) */ 69 0x138,0x138,0x137,0x137,0x136,0x136,0x135,0x135, /* sqrt(1/158)..sqrt(1/15f) */ 70 0x135,0x134,0x134,0x133,0x133,0x132,0x132,0x132, /* sqrt(1/160)..sqrt(1/167) */ 71 0x131,0x131,0x130,0x130,0x12f,0x12f,0x12f,0x12e, /* sqrt(1/168)..sqrt(1/16f) */ 72 0x12e,0x12d,0x12d,0x12d,0x12c,0x12c,0x12b,0x12b, /* sqrt(1/170)..sqrt(1/177) */ 73 0x12b,0x12a,0x12a,0x129,0x129,0x129,0x128,0x128, /* sqrt(1/178)..sqrt(1/17f) */ 74 0x127,0x127,0x127,0x126,0x126,0x126,0x125,0x125, /* sqrt(1/180)..sqrt(1/187) */ 75 0x124,0x124,0x124,0x123,0x123,0x123,0x122,0x122, /* sqrt(1/188)..sqrt(1/18f) */ 76 0x121,0x121,0x121,0x120,0x120,0x120,0x11f,0x11f, /* sqrt(1/190)..sqrt(1/197) */ 77 0x11f,0x11e,0x11e,0x11e,0x11d,0x11d,0x11d,0x11c, /* sqrt(1/198)..sqrt(1/19f) */ 78 0x11c,0x11b,0x11b,0x11b,0x11a,0x11a,0x11a,0x119, /* sqrt(1/1a0)..sqrt(1/1a7) */ 79 0x119,0x119,0x118,0x118,0x118,0x118,0x117,0x117, /* sqrt(1/1a8)..sqrt(1/1af) */ 80 0x117,0x116,0x116,0x116,0x115,0x115,0x115,0x114, /* sqrt(1/1b0)..sqrt(1/1b7) */ 81 0x114,0x114,0x113,0x113,0x113,0x112,0x112,0x112, /* sqrt(1/1b8)..sqrt(1/1bf) */ 82 0x112,0x111,0x111,0x111,0x110,0x110,0x110,0x10f, /* sqrt(1/1c0)..sqrt(1/1c7) */ 83 0x10f,0x10f,0x10f,0x10e,0x10e,0x10e,0x10d,0x10d, /* sqrt(1/1c8)..sqrt(1/1cf) */ 84 0x10d,0x10c,0x10c,0x10c,0x10c,0x10b,0x10b,0x10b, /* sqrt(1/1d0)..sqrt(1/1d7) */ 85 0x10a,0x10a,0x10a,0x10a,0x109,0x109,0x109,0x109, /* sqrt(1/1d8)..sqrt(1/1df) */ 86 0x108,0x108,0x108,0x107,0x107,0x107,0x107,0x106, /* sqrt(1/1e0)..sqrt(1/1e7) */ 87 0x106,0x106,0x106,0x105,0x105,0x105,0x104,0x104, /* sqrt(1/1e8)..sqrt(1/1ef) */ 88 0x104,0x104,0x103,0x103,0x103,0x103,0x102,0x102, /* sqrt(1/1f0)..sqrt(1/1f7) */ 89 0x102,0x102,0x101,0x101,0x101,0x101,0x100,0x100 /* sqrt(1/1f8)..sqrt(1/1ff) */ 90 }; 91 92 /* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2. */ 93 94 #if GMP_NUMB_BITS > 32 95 #define MAGIC CNST_LIMB(0x10000000000) /* 0xffe7debbfc < MAGIC < 0x232b1850f410 */ 96 #else 97 #define MAGIC CNST_LIMB(0x100000) /* 0xfee6f < MAGIC < 0x29cbc8 */ 98 #endif 99 100 static mp_limb_t 101 mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0) 102 { 103 #if GMP_NUMB_BITS > 32 104 mp_limb_t a1; 105 #endif 106 mp_limb_t x0, t2, t, x2; 107 unsigned abits; 108 109 ASSERT_ALWAYS (GMP_NAIL_BITS == 0); 110 ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64); 111 ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2); 112 113 /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a), 114 since we can do the former without division. As part of the last 115 iteration convert from 1/sqrt(a) to sqrt(a). */ 116 117 abits = a0 >> (GMP_LIMB_BITS - 1 - 8); /* extract bits for table lookup */ 118 x0 = invsqrttab[abits - 0x80]; /* initial 1/sqrt(a) */ 119 120 /* x0 is now an 8 bits approximation of 1/sqrt(a0) */ 121 122 #if GMP_NUMB_BITS > 32 123 a1 = a0 >> (GMP_LIMB_BITS - 1 - 32); 124 t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16; 125 x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2)); 126 127 /* x0 is now an 16 bits approximation of 1/sqrt(a0) */ 128 129 t2 = x0 * (a0 >> (32-8)); 130 t = t2 >> 25; 131 t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8)); 132 x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15); 133 x0 >>= 32; 134 #else 135 t2 = x0 * (a0 >> (16-8)); 136 t = t2 >> 13; 137 t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8)); 138 x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7); 139 x0 >>= 16; 140 #endif 141 142 /* x0 is now a full limb approximation of sqrt(a0) */ 143 144 x2 = x0 * x0; 145 if (x2 + 2*x0 <= a0 - 1) 146 { 147 x2 += 2*x0 + 1; 148 x0++; 149 } 150 151 *rp = a0 - x2; 152 return x0; 153 } 154 155 156 #define Prec (GMP_NUMB_BITS >> 1) 157 158 /* same as mpn_sqrtrem, but for size=2 and {np, 2} normalized 159 return cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0] */ 160 static mp_limb_t 161 mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np) 162 { 163 mp_limb_t qhl, q, u, np0, sp0, rp0, q2; 164 int cc; 165 166 ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2); 167 168 np0 = np[0]; 169 sp0 = mpn_sqrtrem1 (rp, np[1]); 170 qhl = 0; 171 rp0 = rp[0]; 172 while (rp0 >= sp0) 173 { 174 qhl++; 175 rp0 -= sp0; 176 } 177 /* now rp0 < sp0 < 2^Prec */ 178 rp0 = (rp0 << Prec) + (np0 >> Prec); 179 u = 2 * sp0; 180 q = rp0 / u; 181 u = rp0 - q * u; 182 q += (qhl & 1) << (Prec - 1); 183 qhl >>= 1; /* if qhl=1, necessary q=0 as qhl*2^Prec + q <= 2^Prec */ 184 /* now we have (initial rp0)<<Prec + np0>>Prec = (qhl<<Prec + q) * (2sp0) + u */ 185 sp0 = ((sp0 + qhl) << Prec) + q; 186 cc = u >> Prec; 187 rp0 = ((u << Prec) & GMP_NUMB_MASK) + (np0 & (((mp_limb_t) 1 << Prec) - 1)); 188 /* subtract q * q or qhl*2^(2*Prec) from rp */ 189 q2 = q * q; 190 cc -= (rp0 < q2) + qhl; 191 rp0 -= q2; 192 /* now subtract 2*q*2^Prec + 2^(2*Prec) if qhl is set */ 193 if (cc < 0) 194 { 195 if (sp0 != 0) 196 { 197 rp0 += sp0; 198 cc += rp0 < sp0; 199 } 200 else 201 cc++; 202 --sp0; 203 rp0 += sp0; 204 cc += rp0 < sp0; 205 } 206 207 rp[0] = rp0; 208 sp[0] = sp0; 209 return cc; 210 } 211 212 /* writes in {sp, n} the square root (rounded towards zero) of {np, 2n}, 213 and in {np, n} the low n limbs of the remainder, returns the high 214 limb of the remainder (which is 0 or 1). 215 Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4 216 where B=2^GMP_NUMB_BITS. */ 217 static mp_limb_t 218 mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n) 219 { 220 mp_limb_t q; /* carry out of {sp, n} */ 221 int c, b; /* carry out of remainder */ 222 mp_size_t l, h; 223 224 ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2); 225 226 if (n == 1) 227 c = mpn_sqrtrem2 (sp, np, np); 228 else 229 { 230 l = n / 2; 231 h = n - l; 232 q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h); 233 if (q != 0) 234 mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h); 235 q += mpn_divrem (sp, 0, np + l, n, sp + l, h); 236 c = sp[0] & 1; 237 mpn_rshift (sp, sp, l, 1); 238 sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK; 239 q >>= 1; 240 if (c != 0) 241 c = mpn_add_n (np + l, np + l, sp + l, h); 242 mpn_sqr (np + n, sp, l); 243 b = q + mpn_sub_n (np, np, np + n, 2 * l); 244 c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b); 245 q = mpn_add_1 (sp + l, sp + l, h, q); 246 247 if (c < 0) 248 { 249 c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q; 250 c -= mpn_sub_1 (np, np, n, CNST_LIMB(1)); 251 q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1)); 252 } 253 } 254 255 return c; 256 } 257 258 259 mp_size_t 260 mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn) 261 { 262 mp_limb_t *tp, s0[1], cc, high, rl; 263 int c; 264 mp_size_t rn, tn; 265 TMP_DECL; 266 267 ASSERT (nn >= 0); 268 ASSERT_MPN (np, nn); 269 270 /* If OP is zero, both results are zero. */ 271 if (nn == 0) 272 return 0; 273 274 ASSERT (np[nn - 1] != 0); 275 ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn)); 276 ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn)); 277 ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn)); 278 279 high = np[nn - 1]; 280 if (nn == 1 && (high & GMP_NUMB_HIGHBIT)) 281 { 282 mp_limb_t r; 283 sp[0] = mpn_sqrtrem1 (&r, high); 284 if (rp != NULL) 285 rp[0] = r; 286 return r != 0; 287 } 288 count_leading_zeros (c, high); 289 c -= GMP_NAIL_BITS; 290 291 c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */ 292 tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */ 293 294 TMP_MARK; 295 if (nn % 2 != 0 || c > 0) 296 { 297 tp = TMP_ALLOC_LIMBS (2 * tn); 298 tp[0] = 0; /* needed only when 2*tn > nn, but saves a test */ 299 if (c != 0) 300 mpn_lshift (tp + 2 * tn - nn, np, nn, 2 * c); 301 else 302 MPN_COPY (tp + 2 * tn - nn, np, nn); 303 rl = mpn_dc_sqrtrem (sp, tp, tn); 304 /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2, 305 thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */ 306 c += (nn % 2) * GMP_NUMB_BITS / 2; /* c now represents k */ 307 s0[0] = sp[0] & (((mp_limb_t) 1 << c) - 1); /* S mod 2^k */ 308 rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]); /* R = R + 2*s0*S */ 309 cc = mpn_submul_1 (tp, s0, 1, s0[0]); 310 rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc; 311 mpn_rshift (sp, sp, tn, c); 312 tp[tn] = rl; 313 if (rp == NULL) 314 rp = tp; 315 c = c << 1; 316 if (c < GMP_NUMB_BITS) 317 tn++; 318 else 319 { 320 tp++; 321 c -= GMP_NUMB_BITS; 322 } 323 if (c != 0) 324 mpn_rshift (rp, tp, tn, c); 325 else 326 MPN_COPY_INCR (rp, tp, tn); 327 rn = tn; 328 } 329 else 330 { 331 if (rp == NULL) 332 rp = TMP_ALLOC_LIMBS (nn); 333 if (rp != np) 334 MPN_COPY (rp, np, nn); 335 rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn)); 336 } 337 338 MPN_NORMALIZE (rp, rn); 339 340 TMP_FREE; 341 return rn; 342 } 343