/* mpn_rootrem(rootp,remp,ap,an,nth) -- Compute the nth root of {ap,an}, and
   store the truncated integer part at rootp and the remainder at remp.

   Contributed by Paul Zimmermann (algorithm) and
   Paul Zimmermann and Torbjorn Granlund (implementation).

   THE FUNCTIONS IN THIS FILE ARE INTERNAL, AND HAVE MUTABLE INTERFACES.  IT'S
   ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT'S ALMOST
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2002, 2005, 2009, 2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

/* FIXME:
     This implementation is not optimal when remp == NULL, since the complexity
     is M(n), whereas it should be M(n/k) on average.
*/

#include <stdio.h>		/* for NULL */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

static mp_size_t mpn_rootrem_internal (mp_ptr, mp_ptr, mp_srcptr, mp_size_t,
				       mp_limb_t, int);

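/* mpn_rshift and mpn_lshift require a shift count of at least 1 bit, so the
   two wrappers below fall back to a plain limb copy when cnt is 0.  */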
#define MPN_RSHIFT(cy,rp,up,un,cnt) \
  do {									\
    if ((cnt) != 0)							\
      cy = mpn_rshift (rp, up, un, cnt);				\
    else								\
      {									\
	MPN_COPY_INCR (rp, up, un);					\
	cy = 0;								\
      }									\
  } while (0)

#define MPN_LSHIFT(cy,rp,up,un,cnt) \
  do {									\
    if ((cnt) != 0)							\
      cy = mpn_lshift (rp, up, un, cnt);				\
    else								\
      {									\
	MPN_COPY_DECR (rp, up, un);					\
	cy = 0;								\
      }									\
  } while (0)


/* Put in {rootp, ceil(un/k)} the kth root of {up, un}, rounded toward zero.
   If remp <> NULL, put in {remp, un} the remainder.
   Return the size (in limbs) of the remainder if remp <> NULL,
	  or a non-zero value iff the remainder is non-zero when remp = NULL.
   Assumes:
   (a) up[un-1] is not zero
   (b) rootp has at least space for ceil(un/k) limbs
   (c) remp has at least space for un limbs (in case remp <> NULL)
   (d) the operands do not overlap.

   The auxiliary memory usage is 3*un+2 if remp = NULL,
   and 2*un+2 if remp <> NULL.  FIXME: This is an incorrect comment.
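
   Illustrative call (a sketch with made-up operand values, not part of the
   GMP sources): to compute the truncated cube root of the 2-limb number
   U = 123*2^GMP_NUMB_BITS, rootp needs ceil(2/3) = 1 limb and remp needs
   2 limbs, so one might write

     mp_limb_t u[2] = {0, 123};
     mp_limb_t root[1], rem[2];
     mp_size_t remn = mpn_rootrem (root, rem, u, 2, 3);

   after which root[0] = floor(U^(1/3)) and {rem, remn} holds U - root[0]^3.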
*/
mp_size_t
mpn_rootrem (mp_ptr rootp, mp_ptr remp,
	     mp_srcptr up, mp_size_t un, mp_limb_t k)
{
  ASSERT (un > 0);
  ASSERT (up[un - 1] != 0);
  ASSERT (k > 1);

  if ((remp == NULL) && (un / k > 2))
    /* Call mpn_rootrem recursively, padding {up,un} with k zero limbs,
       which produces an approximate root with one extra limb, so that in
       most cases we can deduce the exact root without a remainder. */
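    /* (Decimal analogy, for illustration only: floor(sqrt(10)) can be found
       by appending two zero digits, computing floor(sqrt(1000)) = 31, and
       dropping the last digit, which gives 3.) */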
    {
      mp_ptr sp, wp;
      mp_size_t rn, sn, wn;
      TMP_DECL;
      TMP_MARK;
      wn = un + k;
      wp = TMP_ALLOC_LIMBS (wn); /* will contain the padded input */
      sn = (un - 1) / k + 2; /* ceil(un/k) + 1 */
      sp = TMP_ALLOC_LIMBS (sn); /* approximate root of padded input */
      MPN_COPY (wp + k, up, un);
      MPN_ZERO (wp, k);
      rn = mpn_rootrem_internal (sp, NULL, wp, wn, k, 1);
      /* the approximate root S = {sp,sn} is either the correct root of the
	 padded input {wp,wn}, or one too large.  Thus unless the least
	 significant limb of S is 0 or 1, we can deduce that the root of
	 {up,un} is S truncated by one limb.  (In case sp[0]=1, we can deduce
	 the root, but not decide whether it is exact or not.) */
      MPN_COPY (rootp, sp + 1, sn - 1);
      TMP_FREE;
      return rn;
    }
  else /* remp <> NULL, or un / k <= 2 */
    {
      return mpn_rootrem_internal (rootp, remp, up, un, k, 0);
    }
}

/* if approx is non-zero, does not compute the final remainder */
static mp_size_t
mpn_rootrem_internal (mp_ptr rootp, mp_ptr remp, mp_srcptr up, mp_size_t un,
		      mp_limb_t k, int approx)
{
  mp_ptr qp, rp, sp, wp, scratch;
  mp_size_t qn, rn, sn, wn, nl, bn;
  mp_limb_t save, save2, cy;
  unsigned long int unb; /* number of significant bits of {up,un} */
  unsigned long int xnb; /* number of significant bits of the result */
  unsigned int cnt;
  unsigned long b, kk;
  unsigned long sizes[GMP_NUMB_BITS + 1];
  int ni, i;
  int c;
  int logk;
  TMP_DECL;

  TMP_MARK;

  /* qp and wp need enough space to store S'^k where S' is an approximate
     root. Since S' can be as large as S+2, the worst case is when S=2 and
     S'=4. But then since we know the number of bits of S in advance, S'
     can only be 3 at most. Similarly for S=4, then S' can be 6 at most.
     So the worst case is S'/S=3/2, thus S'^k <= (3/2)^k * S^k. Since S^k
     fits in un limbs, the number of extra limbs needed is bounded by
     ceil(k*log2(3/2)/GMP_NUMB_BITS). */
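  /* (For illustration: with GMP_NUMB_BITS = 64 and k = 1000, EXTRA below
     evaluates to 2 + (mp_size_t) (0.585 * 1000 / 64) = 2 + 9 = 11 limbs.) */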
#define EXTRA 2 + (mp_size_t) (0.585 * (double) k / (double) GMP_NUMB_BITS)
  qp = TMP_ALLOC_LIMBS (un + EXTRA); /* will contain quotient and remainder
					of R/(k*S^(k-1)), and S^k */
  if (remp == NULL)
    {
      rp = TMP_ALLOC_LIMBS (un + 1);     /* will contain the remainder */
      scratch = rp;			 /* used by mpn_div_q */
    }
  else
    {
      scratch = TMP_ALLOC_LIMBS (un + 1); /* used by mpn_div_q */
      rp = remp;
    }
  sp = rootp;
  wp = TMP_ALLOC_LIMBS (un + EXTRA); /* will contain S^(k-1), k*S^(k-1),
					and temporary for mpn_pow_1 */
  count_leading_zeros (cnt, up[un - 1]);
  unb = un * GMP_NUMB_BITS - cnt + GMP_NAIL_BITS;
  /* unb is the number of bits of the input U */

  xnb = (unb - 1) / k + 1;	/* ceil (unb / k) */
  /* xnb is the number of bits of the root R */
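  /* (Example, for illustration: a 100-bit input and k = 7 give
     xnb = (100 - 1) / 7 + 1 = 15, i.e. the 7th root has 15 bits.) */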

  if (xnb == 1) /* root is 1 */
    {
      if (remp == NULL)
	remp = rp;
      mpn_sub_1 (remp, up, un, (mp_limb_t) 1);
      MPN_NORMALIZE (remp, un);	/* There should be at most one zero limb,
				   if we demand u to be normalized  */
      rootp[0] = 1;
      TMP_FREE;
      return un;
    }

  /* We initialize the algorithm with a 1-bit approximation: since we know
     the root has exactly xnb bits, we take r0 = 2^(xnb-1) as the initial
     approximation, and subtract r0^k = 2^(k*(xnb-1)) from the input. */
  kk = k * (xnb - 1);		/* number of truncated bits in the input */
  rn = un - kk / GMP_NUMB_BITS; /* number of limbs of the non-truncated part */
  MPN_RSHIFT (cy, rp, up + kk / GMP_NUMB_BITS, rn, kk % GMP_NUMB_BITS);
  mpn_sub_1 (rp, rp, rn, 1);	/* subtract the initial approximation: since
				   the non-truncated part is less than 2^k, it
				   is <= k bits: rn <= ceil(k/GMP_NUMB_BITS) */
  sp[0] = 1;			/* initial approximation */
  sn = 1;			/* it has one limb */

  for (logk = 1; ((k - 1) >> logk) != 0; logk++)
    ;
  /* logk = ceil(log(k)/log(2)) */
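  /* (For example, k = 7 gives logk = 3, and k = 8 also gives logk = 3.) */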

  b = xnb - 1; /* number of remaining bits to determine in the kth root */
  ni = 0;
  while (b != 0)
    {
      /* invariant: here we want b+1 total bits for the kth root */
      sizes[ni] = b;
      /* if c is the new value of b, this means that we'll go from a root
	 of c+1 bits (say s') to a root of b+1 bits.
	 It is proved in the book "Modern Computer Arithmetic" by Brent
	 and Zimmermann, Chapter 1, that
	 if s' >= k*beta, then at most one correction is necessary.
	 Here beta = 2^(b-c), and s' >= 2^c, thus it suffices that
	 2^c >= k*2^(b-c), i.e. 2*c - b >= log2(k), i.e.
	 c >= ceil((b + log2(k))/2). */
      b = (b + logk + 1) / 2;
      if (b >= sizes[ni])
	b = sizes[ni] - 1;	/* add just one bit at a time */
      ni++;
    }
  sizes[ni] = 0;
  ASSERT_ALWAYS (ni < GMP_NUMB_BITS + 1);
  /* We have sizes[0] = xnb - 1 > sizes[1] > ... > sizes[ni] = 0, with
     sizes[i] <= 2 * sizes[i+1].
     Newton iteration will first compute a root of 1 + sizes[ni-1] bits,
     then 1 + sizes[ni-2] bits, ..., finally 1 + sizes[0] = xnb bits. */
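  /* (Worked example, for illustration: xnb = 30 and k = 3, hence logk = 2,
     give the schedule sizes[] = {29, 16, 9, 6, 4, 3, 2, 1, 0}.) */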

  wp[0] = 1; /* {sp,sn}^(k-1) = 1 */
  wn = 1;
  for (i = ni; i != 0; i--)
    {
      /* 1: loop invariant:
	 {sp, sn} is the current approximation of the root, which has
		  exactly 1 + sizes[i] bits.
	 {rp, rn} is the current remainder
	 {wp, wn} = {sp, sn}^(k-1)
	 kk = number of truncated bits of the input
      */
      b = sizes[i - 1] - sizes[i]; /* number of bits to compute in that
				      iteration */

      /* Reinsert a low zero limb if we normalized away the entire remainder */
      if (rn == 0)
	{
	  rp[0] = 0;
	  rn = 1;
	}

      /* first multiply the remainder by 2^b */
      MPN_LSHIFT (cy, rp + b / GMP_NUMB_BITS, rp, rn, b % GMP_NUMB_BITS);
      rn = rn + b / GMP_NUMB_BITS;
      if (cy != 0)
	{
	  rp[rn] = cy;
	  rn++;
	}

      kk = kk - b;

      /* 2: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */

      /* Now insert bits [kk,kk+b-1] from the input U */
      bn = b / GMP_NUMB_BITS; /* lowest limb from high part of rp[] */
      save = rp[bn];
      /* nl is the number of limbs in U which contain bits [kk,kk+b-1] */
      nl = 1 + (kk + b - 1) / GMP_NUMB_BITS - (kk / GMP_NUMB_BITS);
      /* nl  = 1 + floor((kk + b - 1) / GMP_NUMB_BITS)
		 - floor(kk / GMP_NUMB_BITS)
	     <= 1 + (kk + b - 1) / GMP_NUMB_BITS
		  - (kk - GMP_NUMB_BITS + 1) / GMP_NUMB_BITS
	     = 2 + (b - 2) / GMP_NUMB_BITS
	 thus since nl is an integer:
	 nl <= 2 + floor(b/GMP_NUMB_BITS) <= 2 + bn. */
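      /* (For illustration, with GMP_NUMB_BITS = 64, kk = 70 and b = 10:
	 nl = 1 + floor(79/64) - floor(70/64) = 1, and bn = 0, so indeed
	 nl <= 2 + bn.) */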
      /* we have to save rp[bn] up to rp[nl-1], i.e. 1 or 2 limbs */
      if (nl - 1 > bn)
	save2 = rp[bn + 1];
      MPN_RSHIFT (cy, rp, up + kk / GMP_NUMB_BITS, nl, kk % GMP_NUMB_BITS);
      /* set to zero high bits of rp[bn] */
      rp[bn] &= ((mp_limb_t) 1 << (b % GMP_NUMB_BITS)) - 1;
      /* restore corresponding bits */
      rp[bn] |= save;
      if (nl - 1 > bn)
	rp[bn + 1] = save2; /* the low b bits go in rp[0..bn] only, since
			       they start by bit 0 in rp[0], so they use
			       at most ceil(b/GMP_NUMB_BITS) limbs */

      /* 3: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */

      /* compute {wp, wn} = k * {sp, sn}^(k-1) */
      cy = mpn_mul_1 (wp, wp, wn, k);
      wp[wn] = cy;
      wn += cy != 0;

      /* 4: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */

      /* now divide {rp, rn} by {wp, wn} to get the low part of the root */
      if (rn < wn)
	{
	  qn = 0;
	}
      else
	{
	  qn = rn - wn; /* expected quotient size */
	  /* the quotient needs at most rn-wn+1 limbs, which fits in qp */
	  mpn_div_q (qp, rp, rn, wp, wn, scratch);
	  qn += qp[qn] != 0;
	}

      /* 5: current buffers: {sp,sn}, {qp,qn}.
	 Note: {rp,rn} is not needed any more since we'll compute it from
	 scratch at the end of the loop.
       */

      /* Number of limbs used by b bits, when least significant bit is
	 aligned to least limb */
      bn = (b - 1) / GMP_NUMB_BITS + 1;

      /* the quotient should be smaller than 2^b, since the previous
	 approximation was correctly rounded toward zero */
      if (qn > bn || (qn == bn && (b % GMP_NUMB_BITS != 0) &&
		      qp[qn - 1] >= ((mp_limb_t) 1 << (b % GMP_NUMB_BITS))))
	{
	  qn = b / GMP_NUMB_BITS + 1; /* b+1 bits */
	  MPN_ZERO (qp, qn);
	  qp[qn - 1] = (mp_limb_t) 1 << (b % GMP_NUMB_BITS);
	  MPN_DECR_U (qp, qn, 1);
	  qn -= qp[qn - 1] == 0;
	}

      /* 6: current buffers: {sp,sn}, {qp,qn} */

      /* multiply the root approximation by 2^b */
      MPN_LSHIFT (cy, sp + b / GMP_NUMB_BITS, sp, sn, b % GMP_NUMB_BITS);
      sn = sn + b / GMP_NUMB_BITS;
      if (cy != 0)
	{
	  sp[sn] = cy;
	  sn++;
	}

      /* 7: current buffers: {sp,sn}, {qp,qn} */

      ASSERT_ALWAYS (bn >= qn); /* this is ok since in the case qn > bn
				   above, q is set to 2^b-1, which has
				   exactly bn limbs */

      /* Combine sB and q to form sB + q.  */
      save = sp[b / GMP_NUMB_BITS];
      MPN_COPY (sp, qp, qn);
      MPN_ZERO (sp + qn, bn - qn);
      sp[b / GMP_NUMB_BITS] |= save;

      /* 8: current buffer: {sp,sn} */

      /* Since each iteration treats b bits from the root and thus k*b bits
	 from the input, and we already considered b bits from the input,
	 we now have to take another (k-1)*b bits from the input. */
      kk -= (k - 1) * b; /* remaining input bits */
      /* {rp, rn} = floor({up, un} / 2^kk) */
      MPN_RSHIFT (cy, rp, up + kk / GMP_NUMB_BITS, un - kk / GMP_NUMB_BITS, kk % GMP_NUMB_BITS);
      rn = un - kk / GMP_NUMB_BITS;
      rn -= rp[rn - 1] == 0;

      /* 9: current buffers: {sp,sn}, {rp,rn} */

      for (c = 0;; c++)
	{
	  /* Compute S^k in {qp,qn}. */
	  if (i == 1)
	    {
	      /* Last iteration: we don't need W anymore. */
	      /* mpn_pow_1 requires that both qp and wp have enough space to
		 store the result {sp,sn}^k + 1 limb */
	      approx = approx && (sp[0] > 1);
	      qn = (approx == 0) ? mpn_pow_1 (qp, sp, sn, k, wp) : 0;
	    }
	  else
	    {
	      /* W <- S^(k-1) for the next iteration,
		 and S^k = W * S. */
	      wn = mpn_pow_1 (wp, sp, sn, k - 1, qp);
	      mpn_mul (qp, wp, wn, sp, sn);
	      qn = wn + sn;
	      qn -= qp[qn - 1] == 0;
	    }

	  /* if S^k > floor(U/2^kk), the root approximation was too large */
	  if (qn > rn || (qn == rn && mpn_cmp (qp, rp, rn) > 0))
	    MPN_DECR_U (sp, sn, 1);
	  else
	    break;
	}

      /* 10: current buffers: {sp,sn}, {rp,rn}, {qp,qn}, {wp,wn} */

      ASSERT_ALWAYS (c <= 1);
      ASSERT_ALWAYS (rn >= qn);

      /* R = R - Q = floor(U/2^kk) - S^k */
      if ((i > 1) || (approx == 0))
	{
	  mpn_sub (rp, rp, rn, qp, qn);
	  MPN_NORMALIZE (rp, rn);
	}
      /* otherwise we have rn > 0, thus the return value is ok */

      /* 11: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */
    }

  TMP_FREE;
  return rn;
}
409