/* mpn_sqrtrem -- square root and remainder

   Contributed to the GNU project by Paul Zimmermann (most code) and
   Torbjorn Granlund (mpn_sqrtrem1).

   THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH A
   MUTABLE INTERFACE.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED
   INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR
   DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 1999, 2000, 2001, 2002, 2004, 2005, 2008, 2010 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/. */


/* See "Karatsuba Square Root", reference in gmp.texi. */


#include <stdio.h>
#include <stdlib.h>

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
39
/* Table of 9-bit initial approximations of 1/sqrt, used to seed the Newton
   iteration in mpn_sqrtrem1.  It is indexed by x - 0x80, where x is the
   9-bit value formed from the leading bits of the normalized operand
   (x is in [0x80, 0x1ff] since the operand's top two bits are not both
   zero).  Each entry appears to be a fixed-point approximation of
   0x100 * sqrt(0x200 / x) -- e.g. entry 0 (x = 0x80) is 0x1ff ~ 2*0x100,
   and the last entry (x = 0x1ff) is 0x100 -- matching the per-line
   sqrt(1/x) comments.  Do not alter entries without re-verifying the
   error bounds of the iteration that consumes them.  */
static const unsigned short invsqrttab[384] =
{
  0x1ff,0x1fd,0x1fb,0x1f9,0x1f7,0x1f5,0x1f3,0x1f2, /* sqrt(1/80)..sqrt(1/87) */
  0x1f0,0x1ee,0x1ec,0x1ea,0x1e9,0x1e7,0x1e5,0x1e4, /* sqrt(1/88)..sqrt(1/8f) */
  0x1e2,0x1e0,0x1df,0x1dd,0x1db,0x1da,0x1d8,0x1d7, /* sqrt(1/90)..sqrt(1/97) */
  0x1d5,0x1d4,0x1d2,0x1d1,0x1cf,0x1ce,0x1cc,0x1cb, /* sqrt(1/98)..sqrt(1/9f) */
  0x1c9,0x1c8,0x1c6,0x1c5,0x1c4,0x1c2,0x1c1,0x1c0, /* sqrt(1/a0)..sqrt(1/a7) */
  0x1be,0x1bd,0x1bc,0x1ba,0x1b9,0x1b8,0x1b7,0x1b5, /* sqrt(1/a8)..sqrt(1/af) */
  0x1b4,0x1b3,0x1b2,0x1b0,0x1af,0x1ae,0x1ad,0x1ac, /* sqrt(1/b0)..sqrt(1/b7) */
  0x1aa,0x1a9,0x1a8,0x1a7,0x1a6,0x1a5,0x1a4,0x1a3, /* sqrt(1/b8)..sqrt(1/bf) */
  0x1a2,0x1a0,0x19f,0x19e,0x19d,0x19c,0x19b,0x19a, /* sqrt(1/c0)..sqrt(1/c7) */
  0x199,0x198,0x197,0x196,0x195,0x194,0x193,0x192, /* sqrt(1/c8)..sqrt(1/cf) */
  0x191,0x190,0x18f,0x18e,0x18d,0x18c,0x18c,0x18b, /* sqrt(1/d0)..sqrt(1/d7) */
  0x18a,0x189,0x188,0x187,0x186,0x185,0x184,0x183, /* sqrt(1/d8)..sqrt(1/df) */
  0x183,0x182,0x181,0x180,0x17f,0x17e,0x17e,0x17d, /* sqrt(1/e0)..sqrt(1/e7) */
  0x17c,0x17b,0x17a,0x179,0x179,0x178,0x177,0x176, /* sqrt(1/e8)..sqrt(1/ef) */
  0x176,0x175,0x174,0x173,0x172,0x172,0x171,0x170, /* sqrt(1/f0)..sqrt(1/f7) */
  0x16f,0x16f,0x16e,0x16d,0x16d,0x16c,0x16b,0x16a, /* sqrt(1/f8)..sqrt(1/ff) */
  0x16a,0x169,0x168,0x168,0x167,0x166,0x166,0x165, /* sqrt(1/100)..sqrt(1/107) */
  0x164,0x164,0x163,0x162,0x162,0x161,0x160,0x160, /* sqrt(1/108)..sqrt(1/10f) */
  0x15f,0x15e,0x15e,0x15d,0x15c,0x15c,0x15b,0x15a, /* sqrt(1/110)..sqrt(1/117) */
  0x15a,0x159,0x159,0x158,0x157,0x157,0x156,0x156, /* sqrt(1/118)..sqrt(1/11f) */
  0x155,0x154,0x154,0x153,0x153,0x152,0x152,0x151, /* sqrt(1/120)..sqrt(1/127) */
  0x150,0x150,0x14f,0x14f,0x14e,0x14e,0x14d,0x14d, /* sqrt(1/128)..sqrt(1/12f) */
  0x14c,0x14b,0x14b,0x14a,0x14a,0x149,0x149,0x148, /* sqrt(1/130)..sqrt(1/137) */
  0x148,0x147,0x147,0x146,0x146,0x145,0x145,0x144, /* sqrt(1/138)..sqrt(1/13f) */
  0x144,0x143,0x143,0x142,0x142,0x141,0x141,0x140, /* sqrt(1/140)..sqrt(1/147) */
  0x140,0x13f,0x13f,0x13e,0x13e,0x13d,0x13d,0x13c, /* sqrt(1/148)..sqrt(1/14f) */
  0x13c,0x13b,0x13b,0x13a,0x13a,0x139,0x139,0x139, /* sqrt(1/150)..sqrt(1/157) */
  0x138,0x138,0x137,0x137,0x136,0x136,0x135,0x135, /* sqrt(1/158)..sqrt(1/15f) */
  0x135,0x134,0x134,0x133,0x133,0x132,0x132,0x132, /* sqrt(1/160)..sqrt(1/167) */
  0x131,0x131,0x130,0x130,0x12f,0x12f,0x12f,0x12e, /* sqrt(1/168)..sqrt(1/16f) */
  0x12e,0x12d,0x12d,0x12d,0x12c,0x12c,0x12b,0x12b, /* sqrt(1/170)..sqrt(1/177) */
  0x12b,0x12a,0x12a,0x129,0x129,0x129,0x128,0x128, /* sqrt(1/178)..sqrt(1/17f) */
  0x127,0x127,0x127,0x126,0x126,0x126,0x125,0x125, /* sqrt(1/180)..sqrt(1/187) */
  0x124,0x124,0x124,0x123,0x123,0x123,0x122,0x122, /* sqrt(1/188)..sqrt(1/18f) */
  0x121,0x121,0x121,0x120,0x120,0x120,0x11f,0x11f, /* sqrt(1/190)..sqrt(1/197) */
  0x11f,0x11e,0x11e,0x11e,0x11d,0x11d,0x11d,0x11c, /* sqrt(1/198)..sqrt(1/19f) */
  0x11c,0x11b,0x11b,0x11b,0x11a,0x11a,0x11a,0x119, /* sqrt(1/1a0)..sqrt(1/1a7) */
  0x119,0x119,0x118,0x118,0x118,0x118,0x117,0x117, /* sqrt(1/1a8)..sqrt(1/1af) */
  0x117,0x116,0x116,0x116,0x115,0x115,0x115,0x114, /* sqrt(1/1b0)..sqrt(1/1b7) */
  0x114,0x114,0x113,0x113,0x113,0x112,0x112,0x112, /* sqrt(1/1b8)..sqrt(1/1bf) */
  0x112,0x111,0x111,0x111,0x110,0x110,0x110,0x10f, /* sqrt(1/1c0)..sqrt(1/1c7) */
  0x10f,0x10f,0x10f,0x10e,0x10e,0x10e,0x10d,0x10d, /* sqrt(1/1c8)..sqrt(1/1cf) */
  0x10d,0x10c,0x10c,0x10c,0x10c,0x10b,0x10b,0x10b, /* sqrt(1/1d0)..sqrt(1/1d7) */
  0x10a,0x10a,0x10a,0x10a,0x109,0x109,0x109,0x109, /* sqrt(1/1d8)..sqrt(1/1df) */
  0x108,0x108,0x108,0x107,0x107,0x107,0x107,0x106, /* sqrt(1/1e0)..sqrt(1/1e7) */
  0x106,0x106,0x106,0x105,0x105,0x105,0x104,0x104, /* sqrt(1/1e8)..sqrt(1/1ef) */
  0x104,0x104,0x103,0x103,0x103,0x103,0x102,0x102, /* sqrt(1/1f0)..sqrt(1/1f7) */
  0x102,0x102,0x101,0x101,0x101,0x101,0x100,0x100  /* sqrt(1/1f8)..sqrt(1/1ff) */
};
91
/* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2. */

#if GMP_NUMB_BITS > 32
#define MAGIC CNST_LIMB(0x10000000000) /* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
#else
#define MAGIC CNST_LIMB(0x100000) /* 0xfee6f < MAGIC < 0x29cbc8 */
#endif

/* Return floor(sqrt(a0)) and store the remainder a0 - floor(sqrt(a0))^2 in
   *rp.  Requires a0 >= B/4 (top two bits not both zero, per the ASSERT),
   and a nail-free 32- or 64-bit limb.  NOTE(review): the constants below
   (0x30000 bias, the shift counts, MAGIC) are tuned to each other; do not
   change any of them in isolation.  */
static mp_limb_t
mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
{
#if GMP_NUMB_BITS > 32
  mp_limb_t a1;
#endif
  mp_limb_t x0, t2, t, x2;
  unsigned abits;

  ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
  ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64);
  ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);

  /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
     since we can do the former without division.  As part of the last
     iteration convert from 1/sqrt(a) to sqrt(a).  */

  abits = a0 >> (GMP_LIMB_BITS - 1 - 8); /* extract bits for table lookup */
  x0 = invsqrttab[abits - 0x80]; /* initial 1/sqrt(a) */

  /* x0 is now an 8 bits approximation of 1/sqrt(a0) */

#if GMP_NUMB_BITS > 32
  /* One fixed-point Newton step for 1/sqrt using the top 33 bits of a0:
     t ~ (3 - a*x0^2)/2 in scaled form; the signed right shifts implement
     the scaling (arithmetic shift of a negative correction is intended
     here -- NOTE(review): relies on implementation-defined sign-extending
     >> of mp_limb_signed_t, as GMP does throughout).  */
  a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
  t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16;
  x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));

  /* x0 is now an 16 bits approximation of 1/sqrt(a0) */

  /* Last iteration: multiply by (the high bits of) a0 to convert the
     1/sqrt approximation into a sqrt approximation, then apply one
     correction term computed from the residual a0 - t^2; MAGIC
     (bounds given at its definition) absorbs the truncation bias.  */
  t2 = x0 * (a0 >> (32-8));
  t = t2 >> 25;
  t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
  x0 >>= 32;
#else
  /* 32-bit limb: same final conversion/correction step with the
     shift counts halved accordingly.  */
  t2 = x0 * (a0 >> (16-8));
  t = t2 >> 13;
  t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
  x0 >>= 16;
#endif

  /* x0 is now a full limb approximation of sqrt(a0) */

  x2 = x0 * x0;
  /* The truncated approximation may be one too small.  Test
     (x0+1)^2 <= a0, rewritten as x2 + 2*x0 <= a0 - 1 to avoid limb
     overflow.  A single increment is performed -- presumably the error
     bound of the iteration guarantees x0 is off by at most 1 (the code
     makes no second correction).  */
  if (x2 + 2*x0 <= a0 - 1)
    {
      x2 += 2*x0 + 1;
      x0++;
    }

  *rp = a0 - x2;
  return x0;
}
154
155
/* Number of bits in half a limb -- the "digit" size of the schoolbook
   two-digit square root below.  */
#define Prec (GMP_NUMB_BITS >> 1)

/* same as mpn_sqrtrem, but for size=2 and {np, 2} normalized
   return cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0] */
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
{
  mp_limb_t qhl, q, u, np0, sp0, rp0, q2;
  int cc;

  ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);

  np0 = np[0];
  /* Square root of the high limb gives the high half-limb of the result
     (sp0 < 2^Prec after reduction below).  */
  sp0 = mpn_sqrtrem1 (rp, np[1]);
  qhl = 0;
  rp0 = rp[0];
  /* Reduce the remainder below sp0, counting the multiples removed in
     qhl.  A sqrt remainder satisfies r <= 2s, so this loop runs at most
     twice.  */
  while (rp0 >= sp0)
    {
      qhl++;
      rp0 -= sp0;
    }
  /* now rp0 < sp0 < 2^Prec */
  /* Bring down the next half-limb digit of the operand and divide by
     2*sp0 to obtain the low half of the root (schoolbook step).  */
  rp0 = (rp0 << Prec) + (np0 >> Prec);
  u = 2 * sp0;
  q = rp0 / u;
  u = rp0 - q * u;
  /* Fold the multiples counted in qhl back into the quotient: the low
     bit contributes 2^(Prec-1) to q, the rest stays as a carry.  */
  q += (qhl & 1) << (Prec - 1);
  qhl >>= 1; /* if qhl=1, necessary q=0 as qhl*2^Prec + q <= 2^Prec */
  /* now we have (initial rp0)<<Prec + np0>>Prec = (qhl<<Prec + q) * (2sp0) + u */
  sp0 = ((sp0 + qhl) << Prec) + q;
  cc = u >> Prec;
  rp0 = ((u << Prec) & GMP_NUMB_MASK) + (np0 & (((mp_limb_t) 1 << Prec) - 1));
  /* subtract q * q or qhl*2^(2*Prec) from rp */
  q2 = q * q;
  cc -= (rp0 < q2) + qhl;
  rp0 -= q2;
  /* now subtract 2*q*2^Prec + 2^(2*Prec) if qhl is set */
  /* If the tentative remainder went negative, decrement the root and add
     back 2*sp0 - 1 in two carry-tracked steps (sp0 before and after the
     decrement), restoring {np,2} = sp0^2 + cc*B + rp0.  */
  if (cc < 0)
    {
      if (sp0 != 0)
	{
	  rp0 += sp0;
	  cc += rp0 < sp0;
	}
      else
	cc++;
      --sp0;
      rp0 += sp0;
      cc += rp0 < sp0;
    }

  rp[0] = rp0;
  sp[0] = sp0;
  return cc;
}
211
/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
   and in {np, n} the low n limbs of the remainder, returns the high
   limb of the remainder (which is 0 or 1).
   Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
   where B=2^GMP_NUMB_BITS.

   Divide-and-conquer ("Karatsuba") square root, following the paper
   referenced in gmp.texi: split the root S = S1*B^l + S0; compute S1
   recursively from the high limbs, obtain S0 by dividing the remainder
   by 2*S1, then subtract S0^2 and correct if the remainder went
   negative.  */
static mp_limb_t
mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n)
{
  mp_limb_t q;			/* carry out of {sp, n} */
  int c, b;			/* carry out of remainder */
  mp_size_t l, h;

  ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);

  if (n == 1)
    c = mpn_sqrtrem2 (sp, np, np);
  else
    {
      l = n / 2;
      h = n - l;		/* h >= l, so the recursive operand stays normalized */
      /* Root of the high 2h limbs into {sp+l, h}; its remainder is left
	 in {np+2l, h} with high carry q.  */
      q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h);
      if (q != 0)
	mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h);
      /* Divide the remainder (with the next l operand limbs brought
	 down) by S1, giving (approximately) 2*S0 in {sp, l}.  */
      q += mpn_divrem (sp, 0, np + l, n, sp + l, h);
      c = sp[0] & 1;		/* low quotient bit, dropped by the halving below */
      mpn_rshift (sp, sp, l, 1);
      sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
      q >>= 1;
      if (c != 0)
	c = mpn_add_n (np + l, np + l, sp + l, h); /* compensate the dropped bit in the remainder */
      /* Subtract S0^2 from the remainder.  */
      mpn_sqr (np + n, sp, l);
      b = q + mpn_sub_n (np, np, np + n, 2 * l);
      c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);
      q = mpn_add_1 (sp + l, sp + l, h, q);

      /* Remainder went negative: decrement the root and add back
	 2*{sp, n} + 1 (with q tracking the root's high carry).  */
      if (c < 0)
	{
	  c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
	  c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
	  q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
	}
    }

  return c;
}
257
258
/* Public entry point: store in {sp, ceil(nn/2)} the truncated square root
   of {np, nn}, and -- unless rp is NULL -- the remainder in {rp, rn}.
   Returns rn, the limb size of the remainder (0 iff np is a perfect
   square).  rp may be NULL (remainder discarded) or alias np, per the
   overlap ASSERTs below.  The operand is shifted left by an even bit
   count 2c (and padded to an even limb count) so mpn_dc_sqrtrem sees a
   normalized input; root and remainder are scaled back afterwards.  */
mp_size_t
mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
{
  mp_limb_t *tp, s0[1], cc, high, rl;
  int c;
  mp_size_t rn, tn;
  TMP_DECL;

  ASSERT (nn >= 0);
  ASSERT_MPN (np, nn);

  /* If OP is zero, both results are zero. */
  if (nn == 0)
    return 0;

  ASSERT (np[nn - 1] != 0);
  ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
  ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
  ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));

  high = np[nn - 1];
  /* Single already-normalized limb: use the limb primitive directly.  */
  if (nn == 1 && (high & GMP_NUMB_HIGHBIT))
    {
      mp_limb_t r;
      sp[0] = mpn_sqrtrem1 (&r, high);
      if (rp != NULL)
	rp[0] = r;
      return r != 0;		/* remainder size is 0 or 1 limb */
    }
  count_leading_zeros (c, high);
  c -= GMP_NAIL_BITS;

  c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
  tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */

  TMP_MARK;
  if (nn % 2 != 0 || c > 0)
    {
      /* Operand needs normalizing (odd limb count and/or small high
	 limb): work on a shifted copy in tp, undoing the 2^(2k) scaling
	 of root and remainder afterwards.  */
      tp = TMP_ALLOC_LIMBS (2 * tn);
      tp[0] = 0; /* needed only when 2*tn > nn, but saves a test */
      if (c != 0)
	mpn_lshift (tp + 2 * tn - nn, np, nn, 2 * c);
      else
	MPN_COPY (tp + 2 * tn - nn, np, nn);
      rl = mpn_dc_sqrtrem (sp, tp, tn);
      /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
	 thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
      c += (nn % 2) * GMP_NUMB_BITS / 2; /* c now represents k */
      s0[0] = sp[0] & (((mp_limb_t) 1 << c) - 1); /* S mod 2^k */
      rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]); /* R = R + 2*s0*S */
      cc = mpn_submul_1 (tp, s0, 1, s0[0]); /* R = R - s0^2, borrow in cc */
      rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
      mpn_rshift (sp, sp, tn, c); /* sqrt(N) = S >> k */
      tp[tn] = rl;
      if (rp == NULL)
	rp = tp;
      /* Shift the remainder right by 2k bits: whole-limb part first
	 (advance tp), then the bit part via mpn_rshift.  */
      c = c << 1;
      if (c < GMP_NUMB_BITS)
	tn++;
      else
	{
	  tp++;
	  c -= GMP_NUMB_BITS;
	}
      if (c != 0)
	mpn_rshift (rp, tp, tn, c);
      else
	MPN_COPY_INCR (rp, tp, tn);
      rn = tn;
    }
  else
    {
      /* Already normalized (even limb count, high limb >= B/4): compute
	 in place, using rp (or a temporary if the caller passed NULL)
	 as the working remainder buffer.  */
      if (rp == NULL)
	rp = TMP_ALLOC_LIMBS (nn);
      if (rp != np)
	MPN_COPY (rp, np, nn);
      rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn)); /* high limb of R is 0 or 1 */
    }

  MPN_NORMALIZE (rp, rn);

  TMP_FREE;
  return rn;
}
343