/* mpn_sqrtrem -- square root and remainder

   Contributed to the GNU project by Paul Zimmermann (most code),
   Torbjorn Granlund (mpn_sqrtrem1) and Marco Bodrato (mpn_dc_sqrt).

   THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH MUTABLE
   INTERFACES.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
   IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A
   FUTURE GMP RELEASE.

Copyright 1999-2002, 2004, 2005, 2008, 2010, 2012, 2015, 2017 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


/* See "Karatsuba Square Root", reference in gmp.texi.  */


#include <stdio.h>
#include <stdlib.h>

#include "gmp-impl.h"
#include "longlong.h"
#define USE_DIVAPPR_Q 1
#define TRACE(x)

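/* Table for the initial approximation of 1/sqrt(a).  Empirically (not
   stated in the original comments), entry i approximates
   sqrt(2^25/abits) - 0x100 for abits = 0x80 + i, the nine high bits of
   the normalized operand, rounded down -- occasionally one below the
   floor -- apparently so that 0x100|entry slightly under-approximates.  */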
static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */
{
  0xff,0xfd,0xfb,0xf9,0xf7,0xf5,0xf3,0xf2, /* sqrt(1/80)..sqrt(1/87) */
  0xf0,0xee,0xec,0xea,0xe9,0xe7,0xe5,0xe4, /* sqrt(1/88)..sqrt(1/8f) */
  0xe2,0xe0,0xdf,0xdd,0xdb,0xda,0xd8,0xd7, /* sqrt(1/90)..sqrt(1/97) */
  0xd5,0xd4,0xd2,0xd1,0xcf,0xce,0xcc,0xcb, /* sqrt(1/98)..sqrt(1/9f) */
  0xc9,0xc8,0xc6,0xc5,0xc4,0xc2,0xc1,0xc0, /* sqrt(1/a0)..sqrt(1/a7) */
  0xbe,0xbd,0xbc,0xba,0xb9,0xb8,0xb7,0xb5, /* sqrt(1/a8)..sqrt(1/af) */
  0xb4,0xb3,0xb2,0xb0,0xaf,0xae,0xad,0xac, /* sqrt(1/b0)..sqrt(1/b7) */
  0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3, /* sqrt(1/b8)..sqrt(1/bf) */
  0xa2,0xa0,0x9f,0x9e,0x9d,0x9c,0x9b,0x9a, /* sqrt(1/c0)..sqrt(1/c7) */
  0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92, /* sqrt(1/c8)..sqrt(1/cf) */
  0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8c,0x8b, /* sqrt(1/d0)..sqrt(1/d7) */
  0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83, /* sqrt(1/d8)..sqrt(1/df) */
  0x83,0x82,0x81,0x80,0x7f,0x7e,0x7e,0x7d, /* sqrt(1/e0)..sqrt(1/e7) */
  0x7c,0x7b,0x7a,0x79,0x79,0x78,0x77,0x76, /* sqrt(1/e8)..sqrt(1/ef) */
  0x76,0x75,0x74,0x73,0x72,0x72,0x71,0x70, /* sqrt(1/f0)..sqrt(1/f7) */
  0x6f,0x6f,0x6e,0x6d,0x6d,0x6c,0x6b,0x6a, /* sqrt(1/f8)..sqrt(1/ff) */
  0x6a,0x69,0x68,0x68,0x67,0x66,0x66,0x65, /* sqrt(1/100)..sqrt(1/107) */
  0x64,0x64,0x63,0x62,0x62,0x61,0x60,0x60, /* sqrt(1/108)..sqrt(1/10f) */
  0x5f,0x5e,0x5e,0x5d,0x5c,0x5c,0x5b,0x5a, /* sqrt(1/110)..sqrt(1/117) */
  0x5a,0x59,0x59,0x58,0x57,0x57,0x56,0x56, /* sqrt(1/118)..sqrt(1/11f) */
  0x55,0x54,0x54,0x53,0x53,0x52,0x52,0x51, /* sqrt(1/120)..sqrt(1/127) */
  0x50,0x50,0x4f,0x4f,0x4e,0x4e,0x4d,0x4d, /* sqrt(1/128)..sqrt(1/12f) */
  0x4c,0x4b,0x4b,0x4a,0x4a,0x49,0x49,0x48, /* sqrt(1/130)..sqrt(1/137) */
  0x48,0x47,0x47,0x46,0x46,0x45,0x45,0x44, /* sqrt(1/138)..sqrt(1/13f) */
  0x44,0x43,0x43,0x42,0x42,0x41,0x41,0x40, /* sqrt(1/140)..sqrt(1/147) */
  0x40,0x3f,0x3f,0x3e,0x3e,0x3d,0x3d,0x3c, /* sqrt(1/148)..sqrt(1/14f) */
  0x3c,0x3b,0x3b,0x3a,0x3a,0x39,0x39,0x39, /* sqrt(1/150)..sqrt(1/157) */
  0x38,0x38,0x37,0x37,0x36,0x36,0x35,0x35, /* sqrt(1/158)..sqrt(1/15f) */
  0x35,0x34,0x34,0x33,0x33,0x32,0x32,0x32, /* sqrt(1/160)..sqrt(1/167) */
  0x31,0x31,0x30,0x30,0x2f,0x2f,0x2f,0x2e, /* sqrt(1/168)..sqrt(1/16f) */
  0x2e,0x2d,0x2d,0x2d,0x2c,0x2c,0x2b,0x2b, /* sqrt(1/170)..sqrt(1/177) */
  0x2b,0x2a,0x2a,0x29,0x29,0x29,0x28,0x28, /* sqrt(1/178)..sqrt(1/17f) */
  0x27,0x27,0x27,0x26,0x26,0x26,0x25,0x25, /* sqrt(1/180)..sqrt(1/187) */
  0x24,0x24,0x24,0x23,0x23,0x23,0x22,0x22, /* sqrt(1/188)..sqrt(1/18f) */
  0x21,0x21,0x21,0x20,0x20,0x20,0x1f,0x1f, /* sqrt(1/190)..sqrt(1/197) */
  0x1f,0x1e,0x1e,0x1e,0x1d,0x1d,0x1d,0x1c, /* sqrt(1/198)..sqrt(1/19f) */
  0x1c,0x1b,0x1b,0x1b,0x1a,0x1a,0x1a,0x19, /* sqrt(1/1a0)..sqrt(1/1a7) */
  0x19,0x19,0x18,0x18,0x18,0x18,0x17,0x17, /* sqrt(1/1a8)..sqrt(1/1af) */
  0x17,0x16,0x16,0x16,0x15,0x15,0x15,0x14, /* sqrt(1/1b0)..sqrt(1/1b7) */
  0x14,0x14,0x13,0x13,0x13,0x12,0x12,0x12, /* sqrt(1/1b8)..sqrt(1/1bf) */
  0x12,0x11,0x11,0x11,0x10,0x10,0x10,0x0f, /* sqrt(1/1c0)..sqrt(1/1c7) */
  0x0f,0x0f,0x0f,0x0e,0x0e,0x0e,0x0d,0x0d, /* sqrt(1/1c8)..sqrt(1/1cf) */
  0x0d,0x0c,0x0c,0x0c,0x0c,0x0b,0x0b,0x0b, /* sqrt(1/1d0)..sqrt(1/1d7) */
  0x0a,0x0a,0x0a,0x0a,0x09,0x09,0x09,0x09, /* sqrt(1/1d8)..sqrt(1/1df) */
  0x08,0x08,0x08,0x07,0x07,0x07,0x07,0x06, /* sqrt(1/1e0)..sqrt(1/1e7) */
  0x06,0x06,0x06,0x05,0x05,0x05,0x04,0x04, /* sqrt(1/1e8)..sqrt(1/1ef) */
  0x04,0x04,0x03,0x03,0x03,0x03,0x02,0x02, /* sqrt(1/1f0)..sqrt(1/1f7) */
  0x02,0x02,0x01,0x01,0x01,0x01,0x00,0x00  /* sqrt(1/1f8)..sqrt(1/1ff) */
};

/* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2.  */

#if GMP_NUMB_BITS > 32
#define MAGIC CNST_LIMB(0x10000000000)	/* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
#else
#define MAGIC CNST_LIMB(0x100000)	/* 0xfee6f < MAGIC < 0x29cbc8 */
#endif

static mp_limb_t
mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
{
#if GMP_NUMB_BITS > 32
  mp_limb_t a1;
#endif
  mp_limb_t x0, t2, t, x2;
  unsigned abits;

  ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
  ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64);
  ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);

  /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
     since we can do the former without division.  As part of the last
     iteration convert from 1/sqrt(a) to sqrt(a).  */

  abits = a0 >> (GMP_LIMB_BITS - 1 - 8);	/* extract bits for table lookup */
  x0 = 0x100 | invsqrttab[abits - 0x80];	/* initial 1/sqrt(a) */

  /* x0 is now an 8-bit approximation of 1/sqrt(a0) */

#if GMP_NUMB_BITS > 32
  a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
  t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16;
  x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));

  /* x0 is now a 16-bit approximation of 1/sqrt(a0) */

  t2 = x0 * (a0 >> (32-8));
  t = t2 >> 25;
  t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
  x0 >>= 32;
#else
  t2 = x0 * (a0 >> (16-8));
  t = t2 >> 13;
  t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
  x0 >>= 16;
#endif

  /* x0 is now a full limb approximation of sqrt(a0) */

  x2 = x0 * x0;
  if (x2 + 2*x0 <= a0 - 1)
    {
      x2 += 2*x0 + 1;
      x0++;
    }

  *rp = a0 - x2;
  return x0;
}
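
/* The fixed-point steps in mpn_sqrtrem1 above are Newton's iteration for
   1/sqrt(a), x' = x*(3 - a*x*x)/2, which roughly doubles the number of
   correct bits per step; the final step also multiplies by a to turn
   1/sqrt(a) into sqrt(a).  A floating-point sketch of the same scheme
   (illustration only, not used by the code):  */
#if 0
static double
invsqrt_newton_sketch (double a, double x)
{
  /* x is an initial approximation of 1/sqrt(a), e.g. from a table. */
  x = x * (3.0 - a * x * x) * 0.5;	/* one Newton step */
  x = x * (3.0 - a * x * x) * 0.5;	/* second step, doubling accuracy */
  return a * x;				/* a/sqrt(a) = sqrt(a) */
}
#endif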


#define Prec (GMP_NUMB_BITS >> 1)
#if ! defined(SQRTREM2_INPLACE)
#define SQRTREM2_INPLACE 0
#endif
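
/* Prec is the number of bits in half a limb: mpn_sqrtrem2 below builds the
   root a half limb at a time, taking the high half from mpn_sqrtrem1 on the
   high limb and the low half from a division -- one step of the Karatsuba
   square root recurrence with b = 2^Prec (see the sketch further below).  */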

/* same as mpn_sqrtrem, but for size=2 and {np, 2} normalized;
   returns cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0] */
#if SQRTREM2_INPLACE
#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp)
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp)
{
  mp_srcptr np = rp;
#else
#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp, rp)
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
{
#endif
  mp_limb_t q, u, np0, sp0, rp0, q2;
  int cc;

  ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);

  np0 = np[0];
  sp0 = mpn_sqrtrem1 (rp, np[1]);
  rp0 = rp[0];
  /* rp0 <= 2*sp0 < 2^(Prec + 1) */
  rp0 = (rp0 << (Prec - 1)) + (np0 >> (Prec + 1));
  q = rp0 / sp0;
  /* q <= 2^Prec; if q = 2^Prec, reduce the overestimate. */
  q -= q >> Prec;
  /* now we have q < 2^Prec */
  u = rp0 - q * sp0;
  /* now we have (rp[0]<<Prec + np0>>Prec)/2 = q * sp0 + u */
  sp0 = (sp0 << Prec) | q;
  cc = u >> (Prec - 1);
  rp0 = ((u << (Prec + 1)) & GMP_NUMB_MASK) + (np0 & ((CNST_LIMB (1) << (Prec + 1)) - 1));
  /* subtract q * q from rp */
  q2 = q * q;
  cc -= rp0 < q2;
  rp0 -= q2;
  if (cc < 0)
    {
      rp0 += sp0;
      cc += rp0 < sp0;
      --sp0;
      rp0 += sp0;
      cc += rp0 < sp0;
    }

  rp[0] = rp0;
  sp[0] = sp0;
  return cc;
}
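
/* Both mpn_sqrtrem2 above and mpn_dc_sqrtrem below follow the recurrence
   from the "Karatsuba Square Root" paper cited in gmp.texi: splitting the
   operand as n = a3*b^3 + a2*b^2 + a1*b + a0 with a3 >= b/4,
     (s1, r1) = SqrtRem (a3*b + a2)
     (q, u)   = DivRem (r1*b + a1, 2*s1)
     s = s1*b + q
     r = u*b + a0 - q^2
     if r < 0:  r += 2*s - 1;  s -= 1
   gives s = floor(sqrt(n)) and r = n - s^2.  A toy sketch of one step with
   b = 2^16 (illustration only; isqrt32 is a hypothetical 32-bit integer
   square root, and n must be normalized, n >= 2^62):  */
#if 0
#include <stdint.h>
extern uint32_t isqrt32 (uint32_t);	/* hypothetical helper */

static uint64_t
karatsuba_sqrtrem_sketch (uint64_t n, uint64_t *rem)
{
  const uint64_t b = (uint64_t) 1 << 16;
  uint64_t s1, r1, a1, q, u, s;
  int64_t r;

  s1 = isqrt32 ((uint32_t) (n >> 32));	/* root of the two high digits */
  r1 = (n >> 32) - s1 * s1;		/* and its remainder */
  a1 = (n >> 16) & (b - 1);
  q = (r1 * b + a1) / (2 * s1);		/* next digit of the root */
  u = (r1 * b + a1) % (2 * s1);
  s = s1 * b + q;
  r = (int64_t) (u * b + (n & (b - 1))) - (int64_t) (q * q);
  if (r < 0)				/* the digit q was one too large */
    {
      r += 2 * s - 1;
      s -= 1;
    }
  *rem = (uint64_t) r;
  return s;
}
#endif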

/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
   and in {np, n} the low n limbs of the remainder; returns the high
   limb of the remainder (which is 0 or 1).
   Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
   where B=2^GMP_NUMB_BITS.
   Needs a scratch of n/2+1 limbs. */
static mp_limb_t
mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n, mp_limb_t approx, mp_ptr scratch)
{
  mp_limb_t q;			/* carry out of {sp, n} */
  int c, b;			/* carry out of remainder */
  mp_size_t l, h;

  ASSERT (n > 1);
  ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);

  l = n / 2;
  h = n - l;
  if (h == 1)
    q = CALL_SQRTREM2_INPLACE (sp + l, np + 2 * l);
  else
    q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h, 0, scratch);
  if (q != 0)
    ASSERT_CARRY (mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h));
  TRACE(printf("tdiv_qr(,,,,%u,,%u) -> %u\n", (unsigned) n, (unsigned) h, (unsigned) (n - h + 1)));
  mpn_tdiv_qr (scratch, np + l, 0, np + l, n, sp + l, h);
  q += scratch[l];
  c = scratch[0] & 1;
  mpn_rshift (sp, scratch, l, 1);
  sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
  if (UNLIKELY ((sp[0] & approx) != 0)) /* (sp[0] & mask) > 1 */
    return 1; /* Remainder is non-zero */
  q >>= 1;
  if (c != 0)
    c = mpn_add_n (np + l, np + l, sp + l, h);
  TRACE(printf("sqr(,,%u)\n", (unsigned) l));
  mpn_sqr (np + n, sp, l);
  b = q + mpn_sub_n (np, np, np + n, 2 * l);
  c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);

  if (c < 0)
    {
      q = mpn_add_1 (sp + l, sp + l, h, q);
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
      c += mpn_addlsh1_n_ip1 (np, sp, n) + 2 * q;
#else
      c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
#endif
      c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
      q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
    }

  return c;
}

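/* Compute {qp, nn - dn + 1} as an approximation of the quotient
   {np, nn} / {dp, dn}, dispatching by divisor size among GMP's sbpi1,
   dcpi1 and mu divappr_q variants.  The approximate quotient may exceed
   the true one slightly; the caller (mpn_dc_sqrt) detects and corrects
   such overshoot.  */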
#if USE_DIVAPPR_Q
static void
mpn_divappr_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
{
  gmp_pi1_t inv;
  mp_limb_t qh;
  ASSERT (dn > 2);
  ASSERT (nn >= dn);
  ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);

  MPN_COPY (scratch, np, nn);
  invert_pi1 (inv, dp[dn-1], dp[dn-2]);
  if (BELOW_THRESHOLD (dn, DC_DIVAPPR_Q_THRESHOLD))
    qh = mpn_sbpi1_divappr_q (qp, scratch, nn, dp, dn, inv.inv32);
  else if (BELOW_THRESHOLD (dn, MU_DIVAPPR_Q_THRESHOLD))
    qh = mpn_dcpi1_divappr_q (qp, scratch, nn, dp, dn, &inv);
  else
    {
      mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0);
      TMP_DECL;
      TMP_MARK;
      /* Sadly, scratch is too small. */
      qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, TMP_ALLOC_LIMBS (itch));
      TMP_FREE;
    }
  qp [nn - dn] = qh;
}
#endif

/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n-odd};
   returns zero if the operand was a perfect square, one otherwise.
   Assumes {np, 2n-odd}*4^nsh is normalized, i.e. B > np[2n-1-odd]*4^nsh >= B/4
   where B=2^GMP_NUMB_BITS.
   THINK: In the odd case, three more (dummy) limbs are taken into account;
   when nsh is maximal, two limbs are discarded from the result of the
   division.  Too much?  Is a single dummy limb enough? */
static int
mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, unsigned nsh, unsigned odd)
{
  mp_limb_t q;			/* carry out of {sp, n} */
  int c;			/* carry out of remainder */
  mp_size_t l, h;
  mp_ptr qp, tp, scratch;
  TMP_DECL;
  TMP_MARK;

  ASSERT (np[2 * n - 1 - odd] != 0);
  ASSERT (n > 4);
  ASSERT (nsh < GMP_NUMB_BITS / 2);

  l = (n - 1) / 2;
  h = n - l;
  ASSERT (n >= l + 2 && l + 2 >= h && h > l && l >= 1 + odd);
  scratch = TMP_ALLOC_LIMBS (l + 2 * n + 5 - USE_DIVAPPR_Q); /* n + 2-USE_DIVAPPR_Q */
  tp = scratch + n + 2 - USE_DIVAPPR_Q; /* n + h + 1, but tp [-1] is writable */
  if (nsh != 0)
    {
      /* o is used to exactly set the lowest bits of the dividend, is it needed? */
      int o = l > (1 + odd);
      ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o - odd, n + h + 1 + o, 2 * nsh));
    }
  else
    MPN_COPY (tp, np + l - 1 - odd, n + h + 1);
  q = mpn_dc_sqrtrem (sp + l, tp + l + 1, h, 0, scratch);
  if (q != 0)
    ASSERT_CARRY (mpn_sub_n (tp + l + 1, tp + l + 1, sp + l, h));
  qp = tp + n + 1; /* l + 2 */
  TRACE(printf("div(appr)_q(,,%u,,%u) -> %u \n", (unsigned) n+1, (unsigned) h, (unsigned) (n + 1 - h + 1)));
#if USE_DIVAPPR_Q
  mpn_divappr_q (qp, tp, n + 1, sp + l, h, scratch);
#else
  mpn_div_q (qp, tp, n + 1, sp + l, h, scratch);
#endif
  q += qp [l + 1];
  c = 1;
  if (q > 1)
    {
      /* FIXME: if s!=0 we will shift later, a noop on this area. */
      MPN_FILL (sp, l, GMP_NUMB_MAX);
    }
  else
    {
      /* FIXME: if s!=0 we will shift again later, shift just once. */
      mpn_rshift (sp, qp + 1, l, 1);
      sp[l - 1] |= q << (GMP_NUMB_BITS - 1);
      if (((qp[0] >> (2 + USE_DIVAPPR_Q)) | /* < 3 + 4*USE_DIVAPPR_Q */
	   (qp[1] & (GMP_NUMB_MASK >> ((GMP_NUMB_BITS >> odd) - nsh - 1)))) == 0)
	{
	  mp_limb_t cy;
	  /* Approximation is not good enough: the extra limb (+ nsh bits)
	     is smaller than needed to absorb the possible error. */
	  /* {qp + 1, l + 1} equals 2*{sp, l} */
	  /* FIXME: use mullo or wrap-around, or directly evaluate
	     remainder with a single sqrmod_bnm1. */
	  TRACE(printf("mul(,,%u,,%u)\n", (unsigned) h, (unsigned) (l+1)));
	  ASSERT_NOCARRY (mpn_mul (scratch, sp + l, h, qp + 1, l + 1));
	  /* Compute the remainder of the previous mpn_div(appr)_q. */
	  cy = mpn_sub_n (tp + 1, tp + 1, scratch, h);
#if USE_DIVAPPR_Q || WANT_ASSERT
	  MPN_DECR_U (tp + 1 + h, l, cy);
#if USE_DIVAPPR_Q
	  ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) <= 0);
	  if (mpn_cmp (tp + 1 + h, scratch + h, l) < 0)
	    {
	      /* May happen only if div result was not exact. */
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
	      cy = mpn_addlsh1_n_ip1 (tp + 1, sp + l, h);
#else
	      cy = mpn_addmul_1 (tp + 1, sp + l, h, CNST_LIMB(2));
#endif
	      ASSERT_NOCARRY (mpn_add_1 (tp + 1 + h, tp + 1 + h, l, cy));
	      MPN_DECR_U (sp, l, 1);
	    }
	  /* Can the root be exact when a correction was needed?  We
	     did not find an example, but it depends on divappr
	     internals, and we cannot assume it is true in general... */
	  /* else */
#else /* WANT_ASSERT */
	  ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) == 0);
#endif
#endif
	  if (mpn_zero_p (tp + l + 1, h - l))
	    {
	      TRACE(printf("sqr(,,%u)\n", (unsigned) l));
	      mpn_sqr (scratch, sp, l);
	      c = mpn_cmp (tp + 1, scratch + l, l);
	      if (c == 0)
		{
		  if (nsh != 0)
		    {
		      mpn_lshift (tp, np, l, 2 * nsh);
		      np = tp;
		    }
		  c = mpn_cmp (np, scratch + odd, l - odd);
		}
	      if (c < 0)
		{
		  MPN_DECR_U (sp, l, 1);
		  c = 1;
		}
	    }
	}
    }
  TMP_FREE;

  if ((odd | nsh) != 0)
    mpn_rshift (sp, sp, n, nsh + (odd ? GMP_NUMB_BITS / 2 : 0));
  return c;
}


mp_size_t
mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
{
  mp_limb_t cc, high, rl;
  int c;
  mp_size_t rn, tn;
  TMP_DECL;

  ASSERT (nn > 0);
  ASSERT_MPN (np, nn);

  ASSERT (np[nn - 1] != 0);
  ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
  ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
  ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));

  high = np[nn - 1];
  if (high & (GMP_NUMB_HIGHBIT | (GMP_NUMB_HIGHBIT / 2)))
    c = 0;
  else
    {
      count_leading_zeros (c, high);
      c -= GMP_NAIL_BITS;

      c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
    }
  if (nn == 1) {
    if (c == 0)
      {
	sp[0] = mpn_sqrtrem1 (&rl, high);
	if (rp != NULL)
	  rp[0] = rl;
      }
    else
      {
	cc = mpn_sqrtrem1 (&rl, high << (2*c)) >> c;
	sp[0] = cc;
	if (rp != NULL)
	  rp[0] = rl = high - cc*cc;
      }
    return rl != 0;
  }
  if (nn == 2) {
    mp_limb_t tp [2];
    if (rp == NULL) rp = tp;
    if (c == 0)
      {
#if SQRTREM2_INPLACE
	rp[1] = high;
	rp[0] = np[0];
	cc = CALL_SQRTREM2_INPLACE (sp, rp);
#else
	cc = mpn_sqrtrem2 (sp, rp, np);
#endif
	rp[1] = cc;
	return ((rp[0] | cc) != 0) + cc;
      }
    else
      {
	rl = np[0];
	rp[1] = (high << (2*c)) | (rl >> (GMP_NUMB_BITS - 2*c));
	rp[0] = rl << (2*c);
	CALL_SQRTREM2_INPLACE (sp, rp);
	cc = sp[0] >>= c;	/* c != 0, the highest bit of the root cc is 0. */
	rp[0] = rl -= cc*cc;	/* Computed modulo 2^GMP_LIMB_BITS, because it's smaller. */
	return rl != 0;
      }
  }
  tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */

  if ((rp == NULL) && (nn > 8))
    return mpn_dc_sqrt (sp, np, tn, c, nn & 1);
  TMP_MARK;
  if (((nn & 1) | c) != 0)
    {
      mp_limb_t s0[1], mask;
      mp_ptr tp, scratch;
      TMP_ALLOC_LIMBS_2 (tp, 2 * tn, scratch, tn / 2 + 1);
      tp[0] = 0;	     /* needed only when 2*tn > nn, but saves a test */
      if (c != 0)
	mpn_lshift (tp + (nn & 1), np, nn, 2 * c);
      else
	MPN_COPY (tp + (nn & 1), np, nn);
      c += (nn & 1) ? GMP_NUMB_BITS / 2 : 0;		/* c now represents k */
      mask = (CNST_LIMB (1) << c) - 1;
      rl = mpn_dc_sqrtrem (sp, tp, tn, (rp == NULL) ? mask - 1 : 0, scratch);
      /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
	 thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
      s0[0] = sp[0] & mask;	/* S mod 2^k */
      rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]);	/* R = R + 2*s0*S */
      cc = mpn_submul_1 (tp, s0, 1, s0[0]);
      rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
      mpn_rshift (sp, sp, tn, c);
      tp[tn] = rl;
      if (rp == NULL)
	rp = tp;
      c = c << 1;
      if (c < GMP_NUMB_BITS)
	tn++;
      else
	{
	  tp++;
	  c -= GMP_NUMB_BITS;
	}
      if (c != 0)
	mpn_rshift (rp, tp, tn, c);
      else
	MPN_COPY_INCR (rp, tp, tn);
      rn = tn;
    }
  else
    {
      if (rp != np)
	{
	  if (rp == NULL) /* nn <= 8 */
	    rp = TMP_SALLOC_LIMBS (nn);
	  MPN_COPY (rp, np, nn);
	}
      rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn, 0, TMP_ALLOC_LIMBS(tn / 2 + 1)));
    }

  MPN_NORMALIZE (rp, rn);

  TMP_FREE;
  return rn;
}
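
/* A minimal usage sketch for the documented mpn_sqrtrem entry point
   (illustration only): sp must have room for ceil(nn/2) limbs and rp,
   when non-NULL, for nn limbs; the return value is the limb count of the
   remainder, which is zero iff the operand is a perfect square.  */
#if 0
#include <stdio.h>

int
main (void)
{
  mp_limb_t np[2] = { 123, 456 };	/* N = 456*B + 123, B = 2^GMP_NUMB_BITS */
  mp_limb_t sp[1], rp[2];
  mp_size_t rn = mpn_sqrtrem (sp, rp, np, 2);

  printf ("floor(sqrt(N)) = %lu, remainder limbs = %ld\n",
	  (unsigned long) sp[0], (long) rn);
  return 0;
}
#endif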