xref: /dragonfly/contrib/gmp/mpn/generic/sqrtrem.c (revision 86d7f5d3)
1*86d7f5d3SJohn Marino /* mpn_sqrtrem -- square root and remainder
2*86d7f5d3SJohn Marino 
3*86d7f5d3SJohn Marino    Contributed to the GNU project by Paul Zimmermann (most code) and
4*86d7f5d3SJohn Marino    Torbjorn Granlund (mpn_sqrtrem1).
5*86d7f5d3SJohn Marino 
6*86d7f5d3SJohn Marino    THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH A
7*86d7f5d3SJohn Marino    MUTABLE INTERFACE.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED
8*86d7f5d3SJohn Marino    INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR
9*86d7f5d3SJohn Marino    DISAPPEAR IN A FUTURE GMP RELEASE.
10*86d7f5d3SJohn Marino 
11*86d7f5d3SJohn Marino Copyright 1999, 2000, 2001, 2002, 2004, 2005, 2008, 2010 Free Software
12*86d7f5d3SJohn Marino Foundation, Inc.
13*86d7f5d3SJohn Marino 
14*86d7f5d3SJohn Marino This file is part of the GNU MP Library.
15*86d7f5d3SJohn Marino 
16*86d7f5d3SJohn Marino The GNU MP Library is free software; you can redistribute it and/or modify
17*86d7f5d3SJohn Marino it under the terms of the GNU Lesser General Public License as published by
18*86d7f5d3SJohn Marino the Free Software Foundation; either version 3 of the License, or (at your
19*86d7f5d3SJohn Marino option) any later version.
20*86d7f5d3SJohn Marino 
21*86d7f5d3SJohn Marino The GNU MP Library is distributed in the hope that it will be useful, but
22*86d7f5d3SJohn Marino WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
23*86d7f5d3SJohn Marino or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
24*86d7f5d3SJohn Marino License for more details.
25*86d7f5d3SJohn Marino 
26*86d7f5d3SJohn Marino You should have received a copy of the GNU Lesser General Public License
27*86d7f5d3SJohn Marino along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
28*86d7f5d3SJohn Marino 
29*86d7f5d3SJohn Marino 
30*86d7f5d3SJohn Marino /* See "Karatsuba Square Root", reference in gmp.texi.  */
31*86d7f5d3SJohn Marino 
32*86d7f5d3SJohn Marino 
33*86d7f5d3SJohn Marino #include <stdio.h>
34*86d7f5d3SJohn Marino #include <stdlib.h>
35*86d7f5d3SJohn Marino 
36*86d7f5d3SJohn Marino #include "gmp.h"
37*86d7f5d3SJohn Marino #include "gmp-impl.h"
38*86d7f5d3SJohn Marino #include "longlong.h"
39*86d7f5d3SJohn Marino 
40*86d7f5d3SJohn Marino static const unsigned short invsqrttab[384] =
41*86d7f5d3SJohn Marino {
42*86d7f5d3SJohn Marino   0x1ff,0x1fd,0x1fb,0x1f9,0x1f7,0x1f5,0x1f3,0x1f2, /* sqrt(1/80)..sqrt(1/87) */
43*86d7f5d3SJohn Marino   0x1f0,0x1ee,0x1ec,0x1ea,0x1e9,0x1e7,0x1e5,0x1e4, /* sqrt(1/88)..sqrt(1/8f) */
44*86d7f5d3SJohn Marino   0x1e2,0x1e0,0x1df,0x1dd,0x1db,0x1da,0x1d8,0x1d7, /* sqrt(1/90)..sqrt(1/97) */
45*86d7f5d3SJohn Marino   0x1d5,0x1d4,0x1d2,0x1d1,0x1cf,0x1ce,0x1cc,0x1cb, /* sqrt(1/98)..sqrt(1/9f) */
46*86d7f5d3SJohn Marino   0x1c9,0x1c8,0x1c6,0x1c5,0x1c4,0x1c2,0x1c1,0x1c0, /* sqrt(1/a0)..sqrt(1/a7) */
47*86d7f5d3SJohn Marino   0x1be,0x1bd,0x1bc,0x1ba,0x1b9,0x1b8,0x1b7,0x1b5, /* sqrt(1/a8)..sqrt(1/af) */
48*86d7f5d3SJohn Marino   0x1b4,0x1b3,0x1b2,0x1b0,0x1af,0x1ae,0x1ad,0x1ac, /* sqrt(1/b0)..sqrt(1/b7) */
49*86d7f5d3SJohn Marino   0x1aa,0x1a9,0x1a8,0x1a7,0x1a6,0x1a5,0x1a4,0x1a3, /* sqrt(1/b8)..sqrt(1/bf) */
50*86d7f5d3SJohn Marino   0x1a2,0x1a0,0x19f,0x19e,0x19d,0x19c,0x19b,0x19a, /* sqrt(1/c0)..sqrt(1/c7) */
51*86d7f5d3SJohn Marino   0x199,0x198,0x197,0x196,0x195,0x194,0x193,0x192, /* sqrt(1/c8)..sqrt(1/cf) */
52*86d7f5d3SJohn Marino   0x191,0x190,0x18f,0x18e,0x18d,0x18c,0x18c,0x18b, /* sqrt(1/d0)..sqrt(1/d7) */
53*86d7f5d3SJohn Marino   0x18a,0x189,0x188,0x187,0x186,0x185,0x184,0x183, /* sqrt(1/d8)..sqrt(1/df) */
54*86d7f5d3SJohn Marino   0x183,0x182,0x181,0x180,0x17f,0x17e,0x17e,0x17d, /* sqrt(1/e0)..sqrt(1/e7) */
55*86d7f5d3SJohn Marino   0x17c,0x17b,0x17a,0x179,0x179,0x178,0x177,0x176, /* sqrt(1/e8)..sqrt(1/ef) */
56*86d7f5d3SJohn Marino   0x176,0x175,0x174,0x173,0x172,0x172,0x171,0x170, /* sqrt(1/f0)..sqrt(1/f7) */
57*86d7f5d3SJohn Marino   0x16f,0x16f,0x16e,0x16d,0x16d,0x16c,0x16b,0x16a, /* sqrt(1/f8)..sqrt(1/ff) */
58*86d7f5d3SJohn Marino   0x16a,0x169,0x168,0x168,0x167,0x166,0x166,0x165, /* sqrt(1/100)..sqrt(1/107) */
59*86d7f5d3SJohn Marino   0x164,0x164,0x163,0x162,0x162,0x161,0x160,0x160, /* sqrt(1/108)..sqrt(1/10f) */
60*86d7f5d3SJohn Marino   0x15f,0x15e,0x15e,0x15d,0x15c,0x15c,0x15b,0x15a, /* sqrt(1/110)..sqrt(1/117) */
61*86d7f5d3SJohn Marino   0x15a,0x159,0x159,0x158,0x157,0x157,0x156,0x156, /* sqrt(1/118)..sqrt(1/11f) */
62*86d7f5d3SJohn Marino   0x155,0x154,0x154,0x153,0x153,0x152,0x152,0x151, /* sqrt(1/120)..sqrt(1/127) */
63*86d7f5d3SJohn Marino   0x150,0x150,0x14f,0x14f,0x14e,0x14e,0x14d,0x14d, /* sqrt(1/128)..sqrt(1/12f) */
64*86d7f5d3SJohn Marino   0x14c,0x14b,0x14b,0x14a,0x14a,0x149,0x149,0x148, /* sqrt(1/130)..sqrt(1/137) */
65*86d7f5d3SJohn Marino   0x148,0x147,0x147,0x146,0x146,0x145,0x145,0x144, /* sqrt(1/138)..sqrt(1/13f) */
66*86d7f5d3SJohn Marino   0x144,0x143,0x143,0x142,0x142,0x141,0x141,0x140, /* sqrt(1/140)..sqrt(1/147) */
67*86d7f5d3SJohn Marino   0x140,0x13f,0x13f,0x13e,0x13e,0x13d,0x13d,0x13c, /* sqrt(1/148)..sqrt(1/14f) */
68*86d7f5d3SJohn Marino   0x13c,0x13b,0x13b,0x13a,0x13a,0x139,0x139,0x139, /* sqrt(1/150)..sqrt(1/157) */
69*86d7f5d3SJohn Marino   0x138,0x138,0x137,0x137,0x136,0x136,0x135,0x135, /* sqrt(1/158)..sqrt(1/15f) */
70*86d7f5d3SJohn Marino   0x135,0x134,0x134,0x133,0x133,0x132,0x132,0x132, /* sqrt(1/160)..sqrt(1/167) */
71*86d7f5d3SJohn Marino   0x131,0x131,0x130,0x130,0x12f,0x12f,0x12f,0x12e, /* sqrt(1/168)..sqrt(1/16f) */
72*86d7f5d3SJohn Marino   0x12e,0x12d,0x12d,0x12d,0x12c,0x12c,0x12b,0x12b, /* sqrt(1/170)..sqrt(1/177) */
73*86d7f5d3SJohn Marino   0x12b,0x12a,0x12a,0x129,0x129,0x129,0x128,0x128, /* sqrt(1/178)..sqrt(1/17f) */
74*86d7f5d3SJohn Marino   0x127,0x127,0x127,0x126,0x126,0x126,0x125,0x125, /* sqrt(1/180)..sqrt(1/187) */
75*86d7f5d3SJohn Marino   0x124,0x124,0x124,0x123,0x123,0x123,0x122,0x122, /* sqrt(1/188)..sqrt(1/18f) */
76*86d7f5d3SJohn Marino   0x121,0x121,0x121,0x120,0x120,0x120,0x11f,0x11f, /* sqrt(1/190)..sqrt(1/197) */
77*86d7f5d3SJohn Marino   0x11f,0x11e,0x11e,0x11e,0x11d,0x11d,0x11d,0x11c, /* sqrt(1/198)..sqrt(1/19f) */
78*86d7f5d3SJohn Marino   0x11c,0x11b,0x11b,0x11b,0x11a,0x11a,0x11a,0x119, /* sqrt(1/1a0)..sqrt(1/1a7) */
79*86d7f5d3SJohn Marino   0x119,0x119,0x118,0x118,0x118,0x118,0x117,0x117, /* sqrt(1/1a8)..sqrt(1/1af) */
80*86d7f5d3SJohn Marino   0x117,0x116,0x116,0x116,0x115,0x115,0x115,0x114, /* sqrt(1/1b0)..sqrt(1/1b7) */
81*86d7f5d3SJohn Marino   0x114,0x114,0x113,0x113,0x113,0x112,0x112,0x112, /* sqrt(1/1b8)..sqrt(1/1bf) */
82*86d7f5d3SJohn Marino   0x112,0x111,0x111,0x111,0x110,0x110,0x110,0x10f, /* sqrt(1/1c0)..sqrt(1/1c7) */
83*86d7f5d3SJohn Marino   0x10f,0x10f,0x10f,0x10e,0x10e,0x10e,0x10d,0x10d, /* sqrt(1/1c8)..sqrt(1/1cf) */
84*86d7f5d3SJohn Marino   0x10d,0x10c,0x10c,0x10c,0x10c,0x10b,0x10b,0x10b, /* sqrt(1/1d0)..sqrt(1/1d7) */
85*86d7f5d3SJohn Marino   0x10a,0x10a,0x10a,0x10a,0x109,0x109,0x109,0x109, /* sqrt(1/1d8)..sqrt(1/1df) */
86*86d7f5d3SJohn Marino   0x108,0x108,0x108,0x107,0x107,0x107,0x107,0x106, /* sqrt(1/1e0)..sqrt(1/1e7) */
87*86d7f5d3SJohn Marino   0x106,0x106,0x106,0x105,0x105,0x105,0x104,0x104, /* sqrt(1/1e8)..sqrt(1/1ef) */
88*86d7f5d3SJohn Marino   0x104,0x104,0x103,0x103,0x103,0x103,0x102,0x102, /* sqrt(1/1f0)..sqrt(1/1f7) */
89*86d7f5d3SJohn Marino   0x102,0x102,0x101,0x101,0x101,0x101,0x100,0x100  /* sqrt(1/1f8)..sqrt(1/1ff) */
90*86d7f5d3SJohn Marino };
91*86d7f5d3SJohn Marino 
92*86d7f5d3SJohn Marino /* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2.  */
93*86d7f5d3SJohn Marino 
94*86d7f5d3SJohn Marino #if GMP_NUMB_BITS > 32
95*86d7f5d3SJohn Marino #define MAGIC CNST_LIMB(0x10000000000)	/* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
96*86d7f5d3SJohn Marino #else
97*86d7f5d3SJohn Marino #define MAGIC CNST_LIMB(0x100000)		/* 0xfee6f < MAGIC < 0x29cbc8 */
98*86d7f5d3SJohn Marino #endif
99*86d7f5d3SJohn Marino 
100*86d7f5d3SJohn Marino static mp_limb_t
mpn_sqrtrem1(mp_ptr rp,mp_limb_t a0)101*86d7f5d3SJohn Marino mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
102*86d7f5d3SJohn Marino {
103*86d7f5d3SJohn Marino #if GMP_NUMB_BITS > 32
104*86d7f5d3SJohn Marino   mp_limb_t a1;
105*86d7f5d3SJohn Marino #endif
106*86d7f5d3SJohn Marino   mp_limb_t x0, t2, t, x2;
107*86d7f5d3SJohn Marino   unsigned abits;
108*86d7f5d3SJohn Marino 
109*86d7f5d3SJohn Marino   ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
110*86d7f5d3SJohn Marino   ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64);
111*86d7f5d3SJohn Marino   ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);
112*86d7f5d3SJohn Marino 
113*86d7f5d3SJohn Marino   /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
114*86d7f5d3SJohn Marino      since we can do the former without division.  As part of the last
115*86d7f5d3SJohn Marino      iteration convert from 1/sqrt(a) to sqrt(a).  */
116*86d7f5d3SJohn Marino 
117*86d7f5d3SJohn Marino   abits = a0 >> (GMP_LIMB_BITS - 1 - 8);	/* extract bits for table lookup */
118*86d7f5d3SJohn Marino   x0 = invsqrttab[abits - 0x80];		/* initial 1/sqrt(a) */
119*86d7f5d3SJohn Marino 
120*86d7f5d3SJohn Marino   /* x0 is now an 8 bits approximation of 1/sqrt(a0) */
121*86d7f5d3SJohn Marino 
122*86d7f5d3SJohn Marino #if GMP_NUMB_BITS > 32
123*86d7f5d3SJohn Marino   a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
124*86d7f5d3SJohn Marino   t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000  - a1 * x0 * x0) >> 16;
125*86d7f5d3SJohn Marino   x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));
126*86d7f5d3SJohn Marino 
127*86d7f5d3SJohn Marino   /* x0 is now an 16 bits approximation of 1/sqrt(a0) */
128*86d7f5d3SJohn Marino 
129*86d7f5d3SJohn Marino   t2 = x0 * (a0 >> (32-8));
130*86d7f5d3SJohn Marino   t = t2 >> 25;
131*86d7f5d3SJohn Marino   t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
132*86d7f5d3SJohn Marino   x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
133*86d7f5d3SJohn Marino   x0 >>= 32;
134*86d7f5d3SJohn Marino #else
135*86d7f5d3SJohn Marino   t2 = x0 * (a0 >> (16-8));
136*86d7f5d3SJohn Marino   t = t2 >> 13;
137*86d7f5d3SJohn Marino   t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
138*86d7f5d3SJohn Marino   x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
139*86d7f5d3SJohn Marino   x0 >>= 16;
140*86d7f5d3SJohn Marino #endif
141*86d7f5d3SJohn Marino 
142*86d7f5d3SJohn Marino   /* x0 is now a full limb approximation of sqrt(a0) */
143*86d7f5d3SJohn Marino 
144*86d7f5d3SJohn Marino   x2 = x0 * x0;
145*86d7f5d3SJohn Marino   if (x2 + 2*x0 <= a0 - 1)
146*86d7f5d3SJohn Marino     {
147*86d7f5d3SJohn Marino       x2 += 2*x0 + 1;
148*86d7f5d3SJohn Marino       x0++;
149*86d7f5d3SJohn Marino     }
150*86d7f5d3SJohn Marino 
151*86d7f5d3SJohn Marino   *rp = a0 - x2;
152*86d7f5d3SJohn Marino   return x0;
153*86d7f5d3SJohn Marino }
154*86d7f5d3SJohn Marino 
155*86d7f5d3SJohn Marino 
156*86d7f5d3SJohn Marino #define Prec (GMP_NUMB_BITS >> 1)
157*86d7f5d3SJohn Marino 
158*86d7f5d3SJohn Marino /* same as mpn_sqrtrem, but for size=2 and {np, 2} normalized
159*86d7f5d3SJohn Marino    return cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0] */
160*86d7f5d3SJohn Marino static mp_limb_t
mpn_sqrtrem2(mp_ptr sp,mp_ptr rp,mp_srcptr np)161*86d7f5d3SJohn Marino mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
162*86d7f5d3SJohn Marino {
163*86d7f5d3SJohn Marino   mp_limb_t qhl, q, u, np0, sp0, rp0, q2;
164*86d7f5d3SJohn Marino   int cc;
165*86d7f5d3SJohn Marino 
166*86d7f5d3SJohn Marino   ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);
167*86d7f5d3SJohn Marino 
168*86d7f5d3SJohn Marino   np0 = np[0];
169*86d7f5d3SJohn Marino   sp0 = mpn_sqrtrem1 (rp, np[1]);
170*86d7f5d3SJohn Marino   qhl = 0;
171*86d7f5d3SJohn Marino   rp0 = rp[0];
172*86d7f5d3SJohn Marino   while (rp0 >= sp0)
173*86d7f5d3SJohn Marino     {
174*86d7f5d3SJohn Marino       qhl++;
175*86d7f5d3SJohn Marino       rp0 -= sp0;
176*86d7f5d3SJohn Marino     }
177*86d7f5d3SJohn Marino   /* now rp0 < sp0 < 2^Prec */
178*86d7f5d3SJohn Marino   rp0 = (rp0 << Prec) + (np0 >> Prec);
179*86d7f5d3SJohn Marino   u = 2 * sp0;
180*86d7f5d3SJohn Marino   q = rp0 / u;
181*86d7f5d3SJohn Marino   u = rp0 - q * u;
182*86d7f5d3SJohn Marino   q += (qhl & 1) << (Prec - 1);
183*86d7f5d3SJohn Marino   qhl >>= 1; /* if qhl=1, necessary q=0 as qhl*2^Prec + q <= 2^Prec */
184*86d7f5d3SJohn Marino   /* now we have (initial rp0)<<Prec + np0>>Prec = (qhl<<Prec + q) * (2sp0) + u */
185*86d7f5d3SJohn Marino   sp0 = ((sp0 + qhl) << Prec) + q;
186*86d7f5d3SJohn Marino   cc = u >> Prec;
187*86d7f5d3SJohn Marino   rp0 = ((u << Prec) & GMP_NUMB_MASK) + (np0 & (((mp_limb_t) 1 << Prec) - 1));
188*86d7f5d3SJohn Marino   /* subtract q * q or qhl*2^(2*Prec) from rp */
189*86d7f5d3SJohn Marino   q2 = q * q;
190*86d7f5d3SJohn Marino   cc -= (rp0 < q2) + qhl;
191*86d7f5d3SJohn Marino   rp0 -= q2;
192*86d7f5d3SJohn Marino   /* now subtract 2*q*2^Prec + 2^(2*Prec) if qhl is set */
193*86d7f5d3SJohn Marino   if (cc < 0)
194*86d7f5d3SJohn Marino     {
195*86d7f5d3SJohn Marino       if (sp0 != 0)
196*86d7f5d3SJohn Marino 	{
197*86d7f5d3SJohn Marino 	  rp0 += sp0;
198*86d7f5d3SJohn Marino 	  cc += rp0 < sp0;
199*86d7f5d3SJohn Marino 	}
200*86d7f5d3SJohn Marino       else
201*86d7f5d3SJohn Marino 	cc++;
202*86d7f5d3SJohn Marino       --sp0;
203*86d7f5d3SJohn Marino       rp0 += sp0;
204*86d7f5d3SJohn Marino       cc += rp0 < sp0;
205*86d7f5d3SJohn Marino     }
206*86d7f5d3SJohn Marino 
207*86d7f5d3SJohn Marino   rp[0] = rp0;
208*86d7f5d3SJohn Marino   sp[0] = sp0;
209*86d7f5d3SJohn Marino   return cc;
210*86d7f5d3SJohn Marino }
211*86d7f5d3SJohn Marino 
212*86d7f5d3SJohn Marino /* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
213*86d7f5d3SJohn Marino    and in {np, n} the low n limbs of the remainder, returns the high
214*86d7f5d3SJohn Marino    limb of the remainder (which is 0 or 1).
215*86d7f5d3SJohn Marino    Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
216*86d7f5d3SJohn Marino    where B=2^GMP_NUMB_BITS.  */
217*86d7f5d3SJohn Marino static mp_limb_t
mpn_dc_sqrtrem(mp_ptr sp,mp_ptr np,mp_size_t n)218*86d7f5d3SJohn Marino mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n)
219*86d7f5d3SJohn Marino {
220*86d7f5d3SJohn Marino   mp_limb_t q;			/* carry out of {sp, n} */
221*86d7f5d3SJohn Marino   int c, b;			/* carry out of remainder */
222*86d7f5d3SJohn Marino   mp_size_t l, h;
223*86d7f5d3SJohn Marino 
224*86d7f5d3SJohn Marino   ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);
225*86d7f5d3SJohn Marino 
226*86d7f5d3SJohn Marino   if (n == 1)
227*86d7f5d3SJohn Marino     c = mpn_sqrtrem2 (sp, np, np);
228*86d7f5d3SJohn Marino   else
229*86d7f5d3SJohn Marino     {
230*86d7f5d3SJohn Marino       l = n / 2;
231*86d7f5d3SJohn Marino       h = n - l;
232*86d7f5d3SJohn Marino       q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h);
233*86d7f5d3SJohn Marino       if (q != 0)
234*86d7f5d3SJohn Marino 	mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h);
235*86d7f5d3SJohn Marino       q += mpn_divrem (sp, 0, np + l, n, sp + l, h);
236*86d7f5d3SJohn Marino       c = sp[0] & 1;
237*86d7f5d3SJohn Marino       mpn_rshift (sp, sp, l, 1);
238*86d7f5d3SJohn Marino       sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
239*86d7f5d3SJohn Marino       q >>= 1;
240*86d7f5d3SJohn Marino       if (c != 0)
241*86d7f5d3SJohn Marino 	c = mpn_add_n (np + l, np + l, sp + l, h);
242*86d7f5d3SJohn Marino       mpn_sqr (np + n, sp, l);
243*86d7f5d3SJohn Marino       b = q + mpn_sub_n (np, np, np + n, 2 * l);
244*86d7f5d3SJohn Marino       c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);
245*86d7f5d3SJohn Marino       q = mpn_add_1 (sp + l, sp + l, h, q);
246*86d7f5d3SJohn Marino 
247*86d7f5d3SJohn Marino       if (c < 0)
248*86d7f5d3SJohn Marino 	{
249*86d7f5d3SJohn Marino 	  c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
250*86d7f5d3SJohn Marino 	  c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
251*86d7f5d3SJohn Marino 	  q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
252*86d7f5d3SJohn Marino 	}
253*86d7f5d3SJohn Marino     }
254*86d7f5d3SJohn Marino 
255*86d7f5d3SJohn Marino   return c;
256*86d7f5d3SJohn Marino }
257*86d7f5d3SJohn Marino 
258*86d7f5d3SJohn Marino 
259*86d7f5d3SJohn Marino mp_size_t
mpn_sqrtrem(mp_ptr sp,mp_ptr rp,mp_srcptr np,mp_size_t nn)260*86d7f5d3SJohn Marino mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
261*86d7f5d3SJohn Marino {
262*86d7f5d3SJohn Marino   mp_limb_t *tp, s0[1], cc, high, rl;
263*86d7f5d3SJohn Marino   int c;
264*86d7f5d3SJohn Marino   mp_size_t rn, tn;
265*86d7f5d3SJohn Marino   TMP_DECL;
266*86d7f5d3SJohn Marino 
267*86d7f5d3SJohn Marino   ASSERT (nn >= 0);
268*86d7f5d3SJohn Marino   ASSERT_MPN (np, nn);
269*86d7f5d3SJohn Marino 
270*86d7f5d3SJohn Marino   /* If OP is zero, both results are zero.  */
271*86d7f5d3SJohn Marino   if (nn == 0)
272*86d7f5d3SJohn Marino     return 0;
273*86d7f5d3SJohn Marino 
274*86d7f5d3SJohn Marino   ASSERT (np[nn - 1] != 0);
275*86d7f5d3SJohn Marino   ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
276*86d7f5d3SJohn Marino   ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
277*86d7f5d3SJohn Marino   ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));
278*86d7f5d3SJohn Marino 
279*86d7f5d3SJohn Marino   high = np[nn - 1];
280*86d7f5d3SJohn Marino   if (nn == 1 && (high & GMP_NUMB_HIGHBIT))
281*86d7f5d3SJohn Marino     {
282*86d7f5d3SJohn Marino       mp_limb_t r;
283*86d7f5d3SJohn Marino       sp[0] = mpn_sqrtrem1 (&r, high);
284*86d7f5d3SJohn Marino       if (rp != NULL)
285*86d7f5d3SJohn Marino 	rp[0] = r;
286*86d7f5d3SJohn Marino       return r != 0;
287*86d7f5d3SJohn Marino     }
288*86d7f5d3SJohn Marino   count_leading_zeros (c, high);
289*86d7f5d3SJohn Marino   c -= GMP_NAIL_BITS;
290*86d7f5d3SJohn Marino 
291*86d7f5d3SJohn Marino   c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
292*86d7f5d3SJohn Marino   tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */
293*86d7f5d3SJohn Marino 
294*86d7f5d3SJohn Marino   TMP_MARK;
295*86d7f5d3SJohn Marino   if (nn % 2 != 0 || c > 0)
296*86d7f5d3SJohn Marino     {
297*86d7f5d3SJohn Marino       tp = TMP_ALLOC_LIMBS (2 * tn);
298*86d7f5d3SJohn Marino       tp[0] = 0;	     /* needed only when 2*tn > nn, but saves a test */
299*86d7f5d3SJohn Marino       if (c != 0)
300*86d7f5d3SJohn Marino 	mpn_lshift (tp + 2 * tn - nn, np, nn, 2 * c);
301*86d7f5d3SJohn Marino       else
302*86d7f5d3SJohn Marino 	MPN_COPY (tp + 2 * tn - nn, np, nn);
303*86d7f5d3SJohn Marino       rl = mpn_dc_sqrtrem (sp, tp, tn);
304*86d7f5d3SJohn Marino       /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
305*86d7f5d3SJohn Marino 	 thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
306*86d7f5d3SJohn Marino       c += (nn % 2) * GMP_NUMB_BITS / 2;		/* c now represents k */
307*86d7f5d3SJohn Marino       s0[0] = sp[0] & (((mp_limb_t) 1 << c) - 1);	/* S mod 2^k */
308*86d7f5d3SJohn Marino       rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]);	/* R = R + 2*s0*S */
309*86d7f5d3SJohn Marino       cc = mpn_submul_1 (tp, s0, 1, s0[0]);
310*86d7f5d3SJohn Marino       rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
311*86d7f5d3SJohn Marino       mpn_rshift (sp, sp, tn, c);
312*86d7f5d3SJohn Marino       tp[tn] = rl;
313*86d7f5d3SJohn Marino       if (rp == NULL)
314*86d7f5d3SJohn Marino 	rp = tp;
315*86d7f5d3SJohn Marino       c = c << 1;
316*86d7f5d3SJohn Marino       if (c < GMP_NUMB_BITS)
317*86d7f5d3SJohn Marino 	tn++;
318*86d7f5d3SJohn Marino       else
319*86d7f5d3SJohn Marino 	{
320*86d7f5d3SJohn Marino 	  tp++;
321*86d7f5d3SJohn Marino 	  c -= GMP_NUMB_BITS;
322*86d7f5d3SJohn Marino 	}
323*86d7f5d3SJohn Marino       if (c != 0)
324*86d7f5d3SJohn Marino 	mpn_rshift (rp, tp, tn, c);
325*86d7f5d3SJohn Marino       else
326*86d7f5d3SJohn Marino 	MPN_COPY_INCR (rp, tp, tn);
327*86d7f5d3SJohn Marino       rn = tn;
328*86d7f5d3SJohn Marino     }
329*86d7f5d3SJohn Marino   else
330*86d7f5d3SJohn Marino     {
331*86d7f5d3SJohn Marino       if (rp == NULL)
332*86d7f5d3SJohn Marino 	rp = TMP_ALLOC_LIMBS (nn);
333*86d7f5d3SJohn Marino       if (rp != np)
334*86d7f5d3SJohn Marino 	MPN_COPY (rp, np, nn);
335*86d7f5d3SJohn Marino       rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn));
336*86d7f5d3SJohn Marino     }
337*86d7f5d3SJohn Marino 
338*86d7f5d3SJohn Marino   MPN_NORMALIZE (rp, rn);
339*86d7f5d3SJohn Marino 
340*86d7f5d3SJohn Marino   TMP_FREE;
341*86d7f5d3SJohn Marino   return rn;
342*86d7f5d3SJohn Marino }
343