/* mpn_sqrtrem -- square root and remainder

   Contributed to the GNU project by Paul Zimmermann (most code) and
   Torbjorn Granlund (mpn_sqrtrem1).

   THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH A
   MUTABLE INTERFACE.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED
   INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR
   DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 1999, 2000, 2001, 2002, 2004, 2005, 2008, 2010 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


/* See "Karatsuba Square Root", reference in gmp.texi.  */


#include <stdio.h>
#include <stdlib.h>

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

static const unsigned short invsqrttab[384] =
{
  0x1ff,0x1fd,0x1fb,0x1f9,0x1f7,0x1f5,0x1f3,0x1f2, /* sqrt(1/80)..sqrt(1/87) */
  0x1f0,0x1ee,0x1ec,0x1ea,0x1e9,0x1e7,0x1e5,0x1e4, /* sqrt(1/88)..sqrt(1/8f) */
  0x1e2,0x1e0,0x1df,0x1dd,0x1db,0x1da,0x1d8,0x1d7, /* sqrt(1/90)..sqrt(1/97) */
  0x1d5,0x1d4,0x1d2,0x1d1,0x1cf,0x1ce,0x1cc,0x1cb, /* sqrt(1/98)..sqrt(1/9f) */
  0x1c9,0x1c8,0x1c6,0x1c5,0x1c4,0x1c2,0x1c1,0x1c0, /* sqrt(1/a0)..sqrt(1/a7) */
  0x1be,0x1bd,0x1bc,0x1ba,0x1b9,0x1b8,0x1b7,0x1b5, /* sqrt(1/a8)..sqrt(1/af) */
  0x1b4,0x1b3,0x1b2,0x1b0,0x1af,0x1ae,0x1ad,0x1ac, /* sqrt(1/b0)..sqrt(1/b7) */
  0x1aa,0x1a9,0x1a8,0x1a7,0x1a6,0x1a5,0x1a4,0x1a3, /* sqrt(1/b8)..sqrt(1/bf) */
  0x1a2,0x1a0,0x19f,0x19e,0x19d,0x19c,0x19b,0x19a, /* sqrt(1/c0)..sqrt(1/c7) */
  0x199,0x198,0x197,0x196,0x195,0x194,0x193,0x192, /* sqrt(1/c8)..sqrt(1/cf) */
  0x191,0x190,0x18f,0x18e,0x18d,0x18c,0x18c,0x18b, /* sqrt(1/d0)..sqrt(1/d7) */
  0x18a,0x189,0x188,0x187,0x186,0x185,0x184,0x183, /* sqrt(1/d8)..sqrt(1/df) */
  0x183,0x182,0x181,0x180,0x17f,0x17e,0x17e,0x17d, /* sqrt(1/e0)..sqrt(1/e7) */
  0x17c,0x17b,0x17a,0x179,0x179,0x178,0x177,0x176, /* sqrt(1/e8)..sqrt(1/ef) */
  0x176,0x175,0x174,0x173,0x172,0x172,0x171,0x170, /* sqrt(1/f0)..sqrt(1/f7) */
  0x16f,0x16f,0x16e,0x16d,0x16d,0x16c,0x16b,0x16a, /* sqrt(1/f8)..sqrt(1/ff) */
  0x16a,0x169,0x168,0x168,0x167,0x166,0x166,0x165, /* sqrt(1/100)..sqrt(1/107) */
  0x164,0x164,0x163,0x162,0x162,0x161,0x160,0x160, /* sqrt(1/108)..sqrt(1/10f) */
  0x15f,0x15e,0x15e,0x15d,0x15c,0x15c,0x15b,0x15a, /* sqrt(1/110)..sqrt(1/117) */
  0x15a,0x159,0x159,0x158,0x157,0x157,0x156,0x156, /* sqrt(1/118)..sqrt(1/11f) */
  0x155,0x154,0x154,0x153,0x153,0x152,0x152,0x151, /* sqrt(1/120)..sqrt(1/127) */
  0x150,0x150,0x14f,0x14f,0x14e,0x14e,0x14d,0x14d, /* sqrt(1/128)..sqrt(1/12f) */
  0x14c,0x14b,0x14b,0x14a,0x14a,0x149,0x149,0x148, /* sqrt(1/130)..sqrt(1/137) */
  0x148,0x147,0x147,0x146,0x146,0x145,0x145,0x144, /* sqrt(1/138)..sqrt(1/13f) */
  0x144,0x143,0x143,0x142,0x142,0x141,0x141,0x140, /* sqrt(1/140)..sqrt(1/147) */
  0x140,0x13f,0x13f,0x13e,0x13e,0x13d,0x13d,0x13c, /* sqrt(1/148)..sqrt(1/14f) */
  0x13c,0x13b,0x13b,0x13a,0x13a,0x139,0x139,0x139, /* sqrt(1/150)..sqrt(1/157) */
  0x138,0x138,0x137,0x137,0x136,0x136,0x135,0x135, /* sqrt(1/158)..sqrt(1/15f) */
  0x135,0x134,0x134,0x133,0x133,0x132,0x132,0x132, /* sqrt(1/160)..sqrt(1/167) */
  0x131,0x131,0x130,0x130,0x12f,0x12f,0x12f,0x12e, /* sqrt(1/168)..sqrt(1/16f) */
  0x12e,0x12d,0x12d,0x12d,0x12c,0x12c,0x12b,0x12b, /* sqrt(1/170)..sqrt(1/177) */
  0x12b,0x12a,0x12a,0x129,0x129,0x129,0x128,0x128, /* sqrt(1/178)..sqrt(1/17f) */
  0x127,0x127,0x127,0x126,0x126,0x126,0x125,0x125, /* sqrt(1/180)..sqrt(1/187) */
  0x124,0x124,0x124,0x123,0x123,0x123,0x122,0x122, /* sqrt(1/188)..sqrt(1/18f) */
  0x121,0x121,0x121,0x120,0x120,0x120,0x11f,0x11f, /* sqrt(1/190)..sqrt(1/197) */
  0x11f,0x11e,0x11e,0x11e,0x11d,0x11d,0x11d,0x11c, /* sqrt(1/198)..sqrt(1/19f) */
  0x11c,0x11b,0x11b,0x11b,0x11a,0x11a,0x11a,0x119, /* sqrt(1/1a0)..sqrt(1/1a7) */
  0x119,0x119,0x118,0x118,0x118,0x118,0x117,0x117, /* sqrt(1/1a8)..sqrt(1/1af) */
  0x117,0x116,0x116,0x116,0x115,0x115,0x115,0x114, /* sqrt(1/1b0)..sqrt(1/1b7) */
  0x114,0x114,0x113,0x113,0x113,0x112,0x112,0x112, /* sqrt(1/1b8)..sqrt(1/1bf) */
  0x112,0x111,0x111,0x111,0x110,0x110,0x110,0x10f, /* sqrt(1/1c0)..sqrt(1/1c7) */
  0x10f,0x10f,0x10f,0x10e,0x10e,0x10e,0x10d,0x10d, /* sqrt(1/1c8)..sqrt(1/1cf) */
  0x10d,0x10c,0x10c,0x10c,0x10c,0x10b,0x10b,0x10b, /* sqrt(1/1d0)..sqrt(1/1d7) */
  0x10a,0x10a,0x10a,0x10a,0x109,0x109,0x109,0x109, /* sqrt(1/1d8)..sqrt(1/1df) */
  0x108,0x108,0x108,0x107,0x107,0x107,0x107,0x106, /* sqrt(1/1e0)..sqrt(1/1e7) */
  0x106,0x106,0x106,0x105,0x105,0x105,0x104,0x104, /* sqrt(1/1e8)..sqrt(1/1ef) */
  0x104,0x104,0x103,0x103,0x103,0x103,0x102,0x102, /* sqrt(1/1f0)..sqrt(1/1f7) */
  0x102,0x102,0x101,0x101,0x101,0x101,0x100,0x100  /* sqrt(1/1f8)..sqrt(1/1ff) */
};
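
/* Note on the table (my reading of the code below, not an authoritative
   spec): the index is the top nine bits i of the normalized operand
   (0x80 <= i <= 0x1ff, offset by -0x80), and each entry v appears to be
   roughly 2^8 * sqrt(2^9 / i), clamped to nine bits -- i.e. a fixed-point
   approximation of 1/sqrt(a) for a in [1/4, 1).  E.g. for i = 0x100,
   2^8 * sqrt(2) ~= 362 = 0x16a, which is the entry above.  */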

/* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2.  */

#if GMP_NUMB_BITS > 32
#define MAGIC CNST_LIMB(0x10000000000)	/* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
#else
#define MAGIC CNST_LIMB(0x100000)		/* 0xfee6f < MAGIC < 0x29cbc8 */
#endif
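
/* MAGIC is subtracted from the residual in the final Newton step below.
   As far as I can tell it biases the root estimate downward, so that the
   computed x0 is never too large and the single conditional increment at
   the end of mpn_sqrtrem1 suffices; the bracketed ranges above give the
   bounds within which that error analysis holds.  */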

static mp_limb_t
mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
{
#if GMP_NUMB_BITS > 32
  mp_limb_t a1;
#endif
  mp_limb_t x0, t2, t, x2;
  unsigned abits;

  ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
  ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64);
  ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);

  /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
     since we can do the former without division.  As part of the last
     iteration convert from 1/sqrt(a) to sqrt(a).  */
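  /* (The classical division-free Newton step for 1/sqrt(a) is
     x' = x*(3 - a*x^2)/2; the fixed-point arithmetic below computes
     scaled, truncated variants of this step at increasing precision.)  */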

  abits = a0 >> (GMP_LIMB_BITS - 1 - 8);	/* extract bits for table lookup */
  x0 = invsqrttab[abits - 0x80];		/* initial 1/sqrt(a) */

  /* x0 is now an 8-bit approximation of 1/sqrt(a0) */

#if GMP_NUMB_BITS > 32
  a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
  t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000  - a1 * x0 * x0) >> 16;
  x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));

  /* x0 is now a 16-bit approximation of 1/sqrt(a0) */

  t2 = x0 * (a0 >> (32-8));
  t = t2 >> 25;
  t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
  x0 >>= 32;
#else
  t2 = x0 * (a0 >> (16-8));
  t = t2 >> 13;
  t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
  x0 >>= 16;
#endif

  /* x0 is now a full limb approximation of sqrt(a0) */

  x2 = x0 * x0;
  if (x2 + 2*x0 <= a0 - 1)
    {
      x2 += 2*x0 + 1;
      x0++;
    }

  *rp = a0 - x2;
  return x0;
}
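
/* Usage sketch (hypothetical, for illustration): with 64-bit limbs,
   mpn_sqrtrem1 (&r, CNST_LIMB(1) << 62) returns 2^31 and sets r = 0,
   since (2^31)^2 = 2^62 exactly.  The precondition
   a0 >= GMP_NUMB_HIGHBIT / 2 means a caller with a smaller operand must
   first shift it left by an even number of bits and halve that shift on
   the root, as mpn_sqrtrem itself does below.  */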


#define Prec (GMP_NUMB_BITS >> 1)

/* Same as mpn_sqrtrem, but for size=2 and {np, 2} normalized;
   returns cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0].  */
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
{
  mp_limb_t qhl, q, u, np0, sp0, rp0, q2;
  int cc;

  ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);

  np0 = np[0];
  sp0 = mpn_sqrtrem1 (rp, np[1]);
  qhl = 0;
  rp0 = rp[0];
  while (rp0 >= sp0)
    {
      qhl++;
      rp0 -= sp0;
    }
  /* now rp0 < sp0 < 2^Prec */
  rp0 = (rp0 << Prec) + (np0 >> Prec);
  u = 2 * sp0;
  q = rp0 / u;
  u = rp0 - q * u;
  q += (qhl & 1) << (Prec - 1);
  qhl >>= 1; /* if qhl=1, then necessarily q=0, since qhl*2^Prec + q <= 2^Prec */
  /* now we have (initial rp0)<<Prec + np0>>Prec = (qhl<<Prec + q) * (2sp0) + u */
  sp0 = ((sp0 + qhl) << Prec) + q;
  cc = u >> Prec;
  rp0 = ((u << Prec) & GMP_NUMB_MASK) + (np0 & (((mp_limb_t) 1 << Prec) - 1));
  /* subtract q * q or qhl*2^(2*Prec) from rp */
  q2 = q * q;
  cc -= (rp0 < q2) + qhl;
  rp0 -= q2;
  /* now subtract 2*q*2^Prec + 2^(2*Prec) if qhl is set */
  if (cc < 0)
    {
      if (sp0 != 0)
	{
	  rp0 += sp0;
	  cc += rp0 < sp0;
	}
      else
	cc++;
      --sp0;
      rp0 += sp0;
      cc += rp0 < sp0;
    }

  rp[0] = rp0;
  sp[0] = sp0;
  return cc;
}
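
/* Worked example (hypothetical): with 32-bit limbs and the normalized
   input {np, 2} = {1, 0x40000000}, i.e. N = 2^62 + 1, mpn_sqrtrem2
   produces sp[0] = 0x80000000 (= 2^31), rp[0] = 1 and returns cc = 0,
   since 2^62 + 1 = (2^31)^2 + 1.  */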

/* Writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
   and in {np, n} the low n limbs of the remainder; returns the high
   limb of the remainder (which is 0 or 1).
   Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
   where B = 2^GMP_NUMB_BITS.  */
static mp_limb_t
mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n)
{
  mp_limb_t q;			/* carry out of {sp, n} */
  int c, b;			/* carry out of remainder */
  mp_size_t l, h;

  ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);

  if (n == 1)
    c = mpn_sqrtrem2 (sp, np, np);
  else
    {
      l = n / 2;
      h = n - l;
      q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h);
      if (q != 0)
	mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h);
      q += mpn_divrem (sp, 0, np + l, n, sp + l, h);
      c = sp[0] & 1;
      mpn_rshift (sp, sp, l, 1);
      sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
      q >>= 1;
      if (c != 0)
	c = mpn_add_n (np + l, np + l, sp + l, h);
      mpn_sqr (np + n, sp, l);
      b = q + mpn_sub_n (np, np, np + n, 2 * l);
      c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);
      q = mpn_add_1 (sp + l, sp + l, h, q);

      if (c < 0)
	{
	  c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
	  c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
	  q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
	}
    }

  return c;
}
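
/* Rough shape of the recursion above, following Zimmermann's "Karatsuba
   Square Root" (the paper cited in gmp.texi): recursively take the square
   root S1 of the high 2h limbs (h = ceil(n/2)); divide the remainder,
   extended by the next l = floor(n/2) limbs, by S1 and halve the quotient
   (equivalent to dividing by 2*S1) to get the low part of the root; then
   subtract the square of the low part from the remainder and, if it went
   negative, decrement the root S by 1 and add 2*S + 1 back, since
   (S+1)^2 - S^2 = 2*S + 1.  */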


mp_size_t
mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
{
  mp_limb_t *tp, s0[1], cc, high, rl;
  int c;
  mp_size_t rn, tn;
  TMP_DECL;

  ASSERT (nn >= 0);
  ASSERT_MPN (np, nn);

  /* If OP is zero, both results are zero.  */
  if (nn == 0)
    return 0;

  ASSERT (np[nn - 1] != 0);
  ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
  ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
  ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));

  high = np[nn - 1];
  if (nn == 1 && (high & GMP_NUMB_HIGHBIT))
    {
      mp_limb_t r;
      sp[0] = mpn_sqrtrem1 (&r, high);
      if (rp != NULL)
	rp[0] = r;
      return r != 0;
    }
  count_leading_zeros (c, high);
  c -= GMP_NAIL_BITS;

  c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
  tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */

  TMP_MARK;
  if (nn % 2 != 0 || c > 0)
    {
      tp = TMP_ALLOC_LIMBS (2 * tn);
      tp[0] = 0;	     /* needed only when 2*tn > nn, but saves a test */
      if (c != 0)
	mpn_lshift (tp + 2 * tn - nn, np, nn, 2 * c);
      else
	MPN_COPY (tp + 2 * tn - nn, np, nn);
      rl = mpn_dc_sqrtrem (sp, tp, tn);
      /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
	 thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
      c += (nn % 2) * GMP_NUMB_BITS / 2;		/* c now represents k */
      s0[0] = sp[0] & (((mp_limb_t) 1 << c) - 1);	/* S mod 2^k */
      rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]);	/* R = R + 2*s0*S */
      cc = mpn_submul_1 (tp, s0, 1, s0[0]);
      rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
      mpn_rshift (sp, sp, tn, c);
      tp[tn] = rl;
      if (rp == NULL)
	rp = tp;
      c = c << 1;
      if (c < GMP_NUMB_BITS)
	tn++;
      else
	{
	  tp++;
	  c -= GMP_NUMB_BITS;
	}
      if (c != 0)
	mpn_rshift (rp, tp, tn, c);
      else
	MPN_COPY_INCR (rp, tp, tn);
      rn = tn;
    }
  else
    {
      if (rp == NULL)
	rp = TMP_ALLOC_LIMBS (nn);
      if (rp != np)
	MPN_COPY (rp, np, nn);
      rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn));
    }

  MPN_NORMALIZE (rp, rn);

  TMP_FREE;
  return rn;
}
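
/* Usage sketch (a hypothetical standalone test, not part of the library;
   it assumes an internal build in which gmp-impl.h is available):

     #include <stdio.h>
     #include "gmp.h"
     #include "gmp-impl.h"

     int
     main (void)
     {
       mp_limb_t np[1] = { 5 };   // N = 5
       mp_limb_t sp[1], rp[1];    // root needs (nn+1)/2 limbs
       mp_size_t rn = mpn_sqrtrem (sp, rp, np, 1);
       // floor(sqrt(5)) = 2 with remainder 1, so sp[0] == 2,
       // rn == 1 and rp[0] == 1.
       printf ("s=%lu rn=%ld r=%lu\n", (unsigned long) sp[0],
               (long) rn, rn > 0 ? (unsigned long) rp[0] : 0UL);
       return 0;
     }
*/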
343