xref: /dragonfly/contrib/gmp/mpn/generic/rootrem.c (revision dcd37f7d)
1 /* mpn_rootrem(rootp,remp,ap,an,nth) -- Compute the nth root of {ap,an}, and
2    store the truncated integer part at rootp and the remainder at remp.
3 
4    Contributed by Paul Zimmermann (algorithm) and
5    Paul Zimmermann and Torbjorn Granlund (implementation).
6 
7    THE FUNCTIONS IN THIS FILE ARE INTERNAL, AND HAVE MUTABLE INTERFACES.  IT'S
8    ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT'S ALMOST
9    GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
10 
11 Copyright 2002, 2005, 2009 Free Software Foundation, Inc.
12 
13 This file is part of the GNU MP Library.
14 
15 The GNU MP Library is free software; you can redistribute it and/or modify
16 it under the terms of the GNU Lesser General Public License as published by
17 the Free Software Foundation; either version 3 of the License, or (at your
18 option) any later version.
19 
20 The GNU MP Library is distributed in the hope that it will be useful, but
21 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
22 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
23 License for more details.
24 
25 You should have received a copy of the GNU Lesser General Public License
26 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
27 
28 /* FIXME:
29    (a) Once there is a native mpn_tdiv_q function in GMP (division without
30        remainder), replace the quick-and-dirty implementation below by it.
31    (b) The implementation below is not optimal when remp == NULL, since the
32        complexity is M(n) where n is the input size, whereas it should be
33        only M(n/k) on average.
34 */
35 
36 #include <stdio.h>		/* for NULL */
37 
38 #include "gmp.h"
39 #include "gmp-impl.h"
40 #include "longlong.h"
41 
42 static mp_size_t mpn_rootrem_internal (mp_ptr, mp_ptr, mp_srcptr, mp_size_t,
43 				       mp_limb_t, int);
44 static void mpn_tdiv_q (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t,
45 			mp_srcptr, mp_size_t);
46 
47 #define MPN_RSHIFT(cy,rp,up,un,cnt) \
48   do {									\
49     if ((cnt) != 0)							\
50       cy = mpn_rshift (rp, up, un, cnt);				\
51     else								\
52       {									\
53 	MPN_COPY_INCR (rp, up, un);					\
54 	cy = 0;								\
55       }									\
56   } while (0)
57 
58 #define MPN_LSHIFT(cy,rp,up,un,cnt) \
59   do {									\
60     if ((cnt) != 0)							\
61       cy = mpn_lshift (rp, up, un, cnt);				\
62     else								\
63       {									\
64 	MPN_COPY_DECR (rp, up, un);					\
65 	cy = 0;								\
66       }									\
67   } while (0)
68 
69 
70 /* Put in {rootp, ceil(un/k)} the kth root of {up, un}, rounded toward zero.
71    If remp <> NULL, put in {remp, un} the remainder.
72    Return the size (in limbs) of the remainder if remp <> NULL,
73 	  or a non-zero value iff the remainder is non-zero when remp = NULL.
74    Assumes:
75    (a) up[un-1] is not zero
76    (b) rootp has at least space for ceil(un/k) limbs
77    (c) remp has at least space for un limbs (in case remp <> NULL)
78    (d) the operands do not overlap.
79 
80    The auxiliary memory usage is 3*un+2 if remp = NULL,
81    and 2*un+2 if remp <> NULL.  FIXME: This is an incorrect comment.
82 */
83 mp_size_t
84 mpn_rootrem (mp_ptr rootp, mp_ptr remp,
85 	     mp_srcptr up, mp_size_t un, mp_limb_t k)
86 {
87   ASSERT (un > 0);
88   ASSERT (up[un - 1] != 0);
89   ASSERT (k > 1);
90 
91   if ((remp == NULL) && (un / k > 2))
92     /* call mpn_rootrem recursively, padding {up,un} with k zero limbs,
93        which will produce an approximate root with one more limb,
94        so that in most cases we can conclude. */
95     {
96       mp_ptr sp, wp;
97       mp_size_t rn, sn, wn;
98       TMP_DECL;
99       TMP_MARK;
100       wn = un + k;
101       wp = TMP_ALLOC_LIMBS (wn); /* will contain the padded input */
102       sn = (un - 1) / k + 2; /* ceil(un/k) + 1 */
103       sp = TMP_ALLOC_LIMBS (sn); /* approximate root of padded input */
104       MPN_COPY (wp + k, up, un);
105       MPN_ZERO (wp, k);
106       rn = mpn_rootrem_internal (sp, NULL, wp, wn, k, 1);
107       /* the approximate root S = {sp,sn} is either the correct root of
108 	 {sp,sn}, or one too large. Thus unless the least significant limb
109 	 of S is 0 or 1, we can deduce the root of {up,un} is S truncated by
110 	 one limb. (In case sp[0]=1, we can deduce the root, but not decide
111 	 whether it is exact or not.) */
112       MPN_COPY (rootp, sp + 1, sn - 1);
113       TMP_FREE;
114       return rn;
115     }
116   else /* remp <> NULL */
117     {
118       return mpn_rootrem_internal (rootp, remp, up, un, k, 0);
119     }
120 }
121 
122 /* if approx is non-zero, does not compute the final remainder */
123 static mp_size_t
124 mpn_rootrem_internal (mp_ptr rootp, mp_ptr remp, mp_srcptr up, mp_size_t un,
125 		      mp_limb_t k, int approx)
126 {
127   mp_ptr qp, rp, sp, wp;
128   mp_size_t qn, rn, sn, wn, nl, bn;
129   mp_limb_t save, save2, cy;
130   unsigned long int unb; /* number of significant bits of {up,un} */
131   unsigned long int xnb; /* number of significant bits of the result */
132   unsigned int cnt;
133   unsigned long b, kk;
134   unsigned long sizes[GMP_NUMB_BITS + 1];
135   int ni, i;
136   int c;
137   int logk;
138   TMP_DECL;
139 
140   TMP_MARK;
141 
142   /* qp and wp need enough space to store S'^k where S' is an approximate
143      root. Since S' can be as large as S+2, the worst case is when S=2 and
144      S'=4. But then since we know the number of bits of S in advance, S'
145      can only be 3 at most. Similarly for S=4, then S' can be 6 at most.
146      So the worst case is S'/S=3/2, thus S'^k <= (3/2)^k * S^k. Since S^k
147      fits in un limbs, the number of extra limbs needed is bounded by
148      ceil(k*log2(3/2)/GMP_NUMB_BITS). */
149 #define EXTRA 2 + (mp_size_t) (0.585 * (double) k / (double) GMP_NUMB_BITS)
150   qp = TMP_ALLOC_LIMBS (un + EXTRA); /* will contain quotient and remainder
151 					of R/(k*S^(k-1)), and S^k */
152   if (remp == NULL)
153     rp = TMP_ALLOC_LIMBS (un);     /* will contain the remainder */
154   else
155     rp = remp;
156   sp = rootp;
157   wp = TMP_ALLOC_LIMBS (un + EXTRA); /* will contain S^(k-1), k*S^(k-1),
158 					and temporary for mpn_pow_1 */
159   count_leading_zeros (cnt, up[un - 1]);
160   unb = un * GMP_NUMB_BITS - cnt + GMP_NAIL_BITS;
161   /* unb is the number of bits of the input U */
162 
163   xnb = (unb - 1) / k + 1;	/* ceil (unb / k) */
164   /* xnb is the number of bits of the root R */
165 
166   if (xnb == 1) /* root is 1 */
167     {
168       if (remp == NULL)
169 	remp = rp;
170       mpn_sub_1 (remp, up, un, (mp_limb_t) 1);
171       MPN_NORMALIZE (remp, un);	/* There should be at most one zero limb,
172 				   if we demand u to be normalized  */
173       rootp[0] = 1;
174       TMP_FREE;
175       return un;
176     }
177 
178   /* We initialize the algorithm with a 1-bit approximation to zero: since we
179      know the root has exactly xnb bits, we write r0 = 2^(xnb-1), so that
180      r0^k = 2^(k*(xnb-1)), that we subtract to the input. */
181   kk = k * (xnb - 1);		/* number of truncated bits in the input */
182   rn = un - kk / GMP_NUMB_BITS; /* number of limbs of the non-truncated part */
183   MPN_RSHIFT (cy, rp, up + kk / GMP_NUMB_BITS, rn, kk % GMP_NUMB_BITS);
184   mpn_sub_1 (rp, rp, rn, 1);	/* subtract the initial approximation: since
185 				   the non-truncated part is less than 2^k, it
186 				   is <= k bits: rn <= ceil(k/GMP_NUMB_BITS) */
187   sp[0] = 1;			/* initial approximation */
188   sn = 1;			/* it has one limb */
189 
190   for (logk = 1; ((k - 1) >> logk) != 0; logk++)
191     ;
192   /* logk = ceil(log(k)/log(2)) */
193 
194   b = xnb - 1; /* number of remaining bits to determine in the kth root */
195   ni = 0;
196   while (b != 0)
197     {
198       /* invariant: here we want b+1 total bits for the kth root */
199       sizes[ni] = b;
200       /* if c is the new value of b, this means that we'll go from a root
201 	 of c+1 bits (say s') to a root of b+1 bits.
202 	 It is proved in the book "Modern Computer Arithmetic" from Brent
203 	 and Zimmermann, Chapter 1, that
204 	 if s' >= k*beta, then at most one correction is necessary.
205 	 Here beta = 2^(b-c), and s' >= 2^c, thus it suffices that
206 	 c >= ceil((b + log2(k))/2). */
207       b = (b + logk + 1) / 2;
208       if (b >= sizes[ni])
209 	b = sizes[ni] - 1;	/* add just one bit at a time */
210       ni++;
211     }
212   sizes[ni] = 0;
213   ASSERT_ALWAYS (ni < GMP_NUMB_BITS + 1);
214   /* We have sizes[0] = b > sizes[1] > ... > sizes[ni] = 0 with
215      sizes[i] <= 2 * sizes[i+1].
216      Newton iteration will first compute sizes[ni-1] extra bits,
217      then sizes[ni-2], ..., then sizes[0] = b. */
218 
219   wp[0] = 1; /* {sp,sn}^(k-1) = 1 */
220   wn = 1;
221   for (i = ni; i != 0; i--)
222     {
223       /* 1: loop invariant:
224 	 {sp, sn} is the current approximation of the root, which has
225 		  exactly 1 + sizes[ni] bits.
226 	 {rp, rn} is the current remainder
227 	 {wp, wn} = {sp, sn}^(k-1)
228 	 kk = number of truncated bits of the input
229       */
230       b = sizes[i - 1] - sizes[i]; /* number of bits to compute in that
231 				      iteration */
232 
233       /* Reinsert a low zero limb if we normalized away the entire remainder */
234       if (rn == 0)
235 	{
236 	  rp[0] = 0;
237 	  rn = 1;
238 	}
239 
240       /* first multiply the remainder by 2^b */
241       MPN_LSHIFT (cy, rp + b / GMP_NUMB_BITS, rp, rn, b % GMP_NUMB_BITS);
242       rn = rn + b / GMP_NUMB_BITS;
243       if (cy != 0)
244 	{
245 	  rp[rn] = cy;
246 	  rn++;
247 	}
248 
249       kk = kk - b;
250 
251       /* 2: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */
252 
253       /* Now insert bits [kk,kk+b-1] from the input U */
254       bn = b / GMP_NUMB_BITS; /* lowest limb from high part of rp[] */
255       save = rp[bn];
256       /* nl is the number of limbs in U which contain bits [kk,kk+b-1] */
257       nl = 1 + (kk + b - 1) / GMP_NUMB_BITS - (kk / GMP_NUMB_BITS);
258       /* nl  = 1 + floor((kk + b - 1) / GMP_NUMB_BITS)
259 		 - floor(kk / GMP_NUMB_BITS)
260 	     <= 1 + (kk + b - 1) / GMP_NUMB_BITS
261 		  - (kk - GMP_NUMB_BITS + 1) / GMP_NUMB_BITS
262 	     = 2 + (b - 2) / GMP_NUMB_BITS
263 	 thus since nl is an integer:
264 	 nl <= 2 + floor(b/GMP_NUMB_BITS) <= 2 + bn. */
265       /* we have to save rp[bn] up to rp[nl-1], i.e. 1 or 2 limbs */
266       if (nl - 1 > bn)
267 	save2 = rp[bn + 1];
268       MPN_RSHIFT (cy, rp, up + kk / GMP_NUMB_BITS, nl, kk % GMP_NUMB_BITS);
269       /* set to zero high bits of rp[bn] */
270       rp[bn] &= ((mp_limb_t) 1 << (b % GMP_NUMB_BITS)) - 1;
271       /* restore corresponding bits */
272       rp[bn] |= save;
273       if (nl - 1 > bn)
274 	rp[bn + 1] = save2; /* the low b bits go in rp[0..bn] only, since
275 			       they start by bit 0 in rp[0], so they use
276 			       at most ceil(b/GMP_NUMB_BITS) limbs */
277 
278       /* 3: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */
279 
280       /* compute {wp, wn} = k * {sp, sn}^(k-1) */
281       cy = mpn_mul_1 (wp, wp, wn, k);
282       wp[wn] = cy;
283       wn += cy != 0;
284 
285       /* 4: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */
286 
287       /* now divide {rp, rn} by {wp, wn} to get the low part of the root */
288       if (rn < wn)
289 	{
290 	  qn = 0;
291 	}
292       else
293 	{
294 	  mp_ptr tp;
295 	  qn = rn - wn; /* expected quotient size */
296 	  /* tp must have space for wn limbs.
297 	     The quotient needs rn-wn+1 limbs, thus quotient+remainder
298 	     need altogether rn+1 limbs. */
299 	  tp = qp + qn + 1;	/* put remainder in Q buffer */
300 	  mpn_tdiv_q (qp, tp, 0, rp, rn, wp, wn);
301 	  qn += qp[qn] != 0;
302 	}
303 
304       /* 5: current buffers: {sp,sn}, {qp,qn}.
305 	 Note: {rp,rn} is not needed any more since we'll compute it from
306 	 scratch at the end of the loop.
307        */
308 
309       /* Number of limbs used by b bits, when least significant bit is
310 	 aligned to least limb */
311       bn = (b - 1) / GMP_NUMB_BITS + 1;
312 
313       /* the quotient should be smaller than 2^b, since the previous
314 	 approximation was correctly rounded toward zero */
315       if (qn > bn || (qn == bn && (b % GMP_NUMB_BITS != 0) &&
316 		      qp[qn - 1] >= ((mp_limb_t) 1 << (b % GMP_NUMB_BITS))))
317 	{
318 	  qn = b / GMP_NUMB_BITS + 1; /* b+1 bits */
319 	  MPN_ZERO (qp, qn);
320 	  qp[qn - 1] = (mp_limb_t) 1 << (b % GMP_NUMB_BITS);
321 	  MPN_DECR_U (qp, qn, 1);
322 	  qn -= qp[qn - 1] == 0;
323 	}
324 
325       /* 6: current buffers: {sp,sn}, {qp,qn} */
326 
327       /* multiply the root approximation by 2^b */
328       MPN_LSHIFT (cy, sp + b / GMP_NUMB_BITS, sp, sn, b % GMP_NUMB_BITS);
329       sn = sn + b / GMP_NUMB_BITS;
330       if (cy != 0)
331 	{
332 	  sp[sn] = cy;
333 	  sn++;
334 	}
335 
336       /* 7: current buffers: {sp,sn}, {qp,qn} */
337 
338       ASSERT_ALWAYS (bn >= qn); /* this is ok since in the case qn > bn
339 				   above, q is set to 2^b-1, which has
340 				   exactly bn limbs */
341 
342       /* Combine sB and q to form sB + q.  */
343       save = sp[b / GMP_NUMB_BITS];
344       MPN_COPY (sp, qp, qn);
345       MPN_ZERO (sp + qn, bn - qn);
346       sp[b / GMP_NUMB_BITS] |= save;
347 
348       /* 8: current buffer: {sp,sn} */
349 
350       /* Since each iteration treats b bits from the root and thus k*b bits
351 	 from the input, and we already considered b bits from the input,
352 	 we now have to take another (k-1)*b bits from the input. */
353       kk -= (k - 1) * b; /* remaining input bits */
354       /* {rp, rn} = floor({up, un} / 2^kk) */
355       MPN_RSHIFT (cy, rp, up + kk / GMP_NUMB_BITS, un - kk / GMP_NUMB_BITS, kk % GMP_NUMB_BITS);
356       rn = un - kk / GMP_NUMB_BITS;
357       rn -= rp[rn - 1] == 0;
358 
359       /* 9: current buffers: {sp,sn}, {rp,rn} */
360 
361      for (c = 0;; c++)
362 	{
363 	  /* Compute S^k in {qp,qn}. */
364 	  if (i == 1)
365 	    {
366 	      /* Last iteration: we don't need W anymore. */
367 	      /* mpn_pow_1 requires that both qp and wp have enough space to
368 		 store the result {sp,sn}^k + 1 limb */
369 	      approx = approx && (sp[0] > 1);
370 	      qn = (approx == 0) ? mpn_pow_1 (qp, sp, sn, k, wp) : 0;
371 	    }
372 	  else
373 	    {
374 	      /* W <- S^(k-1) for the next iteration,
375 		 and S^k = W * S. */
376 	      wn = mpn_pow_1 (wp, sp, sn, k - 1, qp);
377 	      mpn_mul (qp, wp, wn, sp, sn);
378 	      qn = wn + sn;
379 	      qn -= qp[qn - 1] == 0;
380 	    }
381 
382 	  /* if S^k > floor(U/2^kk), the root approximation was too large */
383 	  if (qn > rn || (qn == rn && mpn_cmp (qp, rp, rn) > 0))
384 	    MPN_DECR_U (sp, sn, 1);
385 	  else
386 	    break;
387 	}
388 
389       /* 10: current buffers: {sp,sn}, {rp,rn}, {qp,qn}, {wp,wn} */
390 
391       ASSERT_ALWAYS (c <= 1);
392       ASSERT_ALWAYS (rn >= qn);
393 
394       /* R = R - Q = floor(U/2^kk) - S^k */
395       if ((i > 1) || (approx == 0))
396 	{
397 	  mpn_sub (rp, rp, rn, qp, qn);
398 	  MPN_NORMALIZE (rp, rn);
399 	}
400       /* otherwise we have rn > 0, thus the return value is ok */
401 
402       /* 11: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */
403     }
404 
405   TMP_FREE;
406   return rn;
407 }
408 
409 /* return the quotient Q = {np, nn} divided by {dp, dn} only */
410 static void
411 mpn_tdiv_q (mp_ptr qp, mp_ptr rp, mp_size_t qxn, mp_srcptr np, mp_size_t nn,
412 	    mp_srcptr dp, mp_size_t dn)
413 {
414   mp_size_t qn = nn - dn; /* expected quotient size is qn+1 */
415   mp_size_t cut;
416 
417   ASSERT_ALWAYS (qxn == 0);
418   if (dn <= qn + 3)
419     {
420       mpn_tdiv_qr (qp, rp, 0, np, nn, dp, dn);
421     }
422   else
423     {
424       mp_ptr tp;
425       TMP_DECL;
426       TMP_MARK;
427       tp = TMP_ALLOC_LIMBS (qn + 2);
428       cut = dn - (qn + 3);
429       /* perform a first division with divisor cut to dn-cut=qn+3 limbs
430 	 and dividend to nn-(cut-1) limbs, i.e. the quotient will be one
431 	 limb more than the final quotient.
432 	 The quotient will have qn+2 < dn-cut limbs,
433 	 and the remainder dn-cut = qn+3 limbs. */
434       mpn_tdiv_qr (tp, rp, 0, np + cut - 1, nn - cut + 1, dp + cut, dn - cut);
435       /* let Q' be the quotient of B * {np, nn} by {dp, dn} [qn+2 limbs]
436 	 and T  be the approximation of Q' computed above, where
437 	 B = 2^GMP_NUMB_BITS.
438 	 We have Q' <= T <= Q'+1, and since floor(Q'/B) = Q, we have
439 	 Q = floor(T/B), unless the last limb of T only consists of zeroes. */
440       if (tp[0] != 0)
441 	{
442 	  /* simply truncate one limb of T */
443 	  MPN_COPY (qp, tp + 1, qn + 1);
444 	}
445       else /* too bad: perform the expensive division */
446 	{
447 	  mpn_tdiv_qr (qp, rp, 0, np, nn, dp, dn);
448 	}
449       TMP_FREE;
450     }
451 }
452