1 /* mpn_mu_divappr_q, mpn_preinv_mu_divappr_q.
2 
3    Compute Q = floor(N / D) + e.  N is nn limbs, D is dn limbs and must be
4    normalized, and Q must be nn-dn limbs, 0 <= e <= 4.  The requirement that Q
5    is nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us
6    to let N be unmodified during the operation.
7 
8    Contributed to the GNU project by Torbjorn Granlund.
9 
10    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE INTERFACE.  IT IS
11    ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS
12    ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP
13    RELEASE.
14 
15 Copyright 2005, 2006, 2007 Free Software Foundation, Inc.
16 
17 This file is part of the GNU MP Library.
18 
19 The GNU MP Library is free software; you can redistribute it and/or modify
20 it under the terms of the GNU Lesser General Public License as published by
21 the Free Software Foundation; either version 3 of the License, or (at your
22 option) any later version.
23 
24 The GNU MP Library is distributed in the hope that it will be useful, but
25 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
27 License for more details.
28 
29 You should have received a copy of the GNU Lesser General Public License
30 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
31 
32 
33 /* We use the "misunderstanding algorithm" (MUA), discovered by Paul Zimmermann
34    and Torbjorn Granlund when Torbjorn misunderstood Paul's explanation of
35    Jebelean's bidirectional exact division algorithm.
36 
37    The idea of this algorithm is to compute a smaller inverted value than used
38    in the standard Barrett algorithm, and thus save time in the Newton
39    iterations, and pay just a small price when using the inverted value for
40    developing quotient bits.
41 
42    Written by Torbjorn Granlund.  Paul Zimmermann suggested the use of the
43    "wrap around" trick.  Based on the GMP divexact code and inspired by code
44    contributed to GMP by Karl Hasselstroem.
45 */
46 
47 
48 /* CAUTION: This code and the code in mu_div_qr.c should be edited in lockstep.
49 
50  Things to work on:
51 
52   * Passing k isn't a great interface.  Either 'in' should be passed, or
53     determined by the code.
54 
55   * The current mpn_mu_div_qr_itch isn't exactly scientifically written.
56     Scratch space buffer overruns are not unlikely before some analysis is
57     applied.  Since scratch requirements are expected to change, such an
    analysis will have to wait until things settle.
59 
60   * This isn't optimal when the remainder isn't needed, since the final
61     multiplication could be made special and take O(1) time on average, in that
62     case.  This is particularly bad when qn << dn.  At some level, code as in
63     GMP 4 mpn_tdiv_qr should be used, effectively dividing the leading 2qn
64     dividend limbs by the qn divisor limbs.
65 
66   * This isn't optimal when the quotient isn't needed, as it might take a lot
67     of space.  The computation is always needed, though, so there is not time
68     to save with special code.
69 
70   * The itch/scratch scheme isn't perhaps such a good idea as it once seemed,
    demonstrated by the fact that the mpn_invert function's scratch needs means
72     that we need to keep a large allocation long after it is needed.  Things
73     are worse as mpn_mul_fft does not accept any scratch parameter, which means
74     we'll have a large memory hole while in mpn_mul_fft.  In general, a peak
75     scratch need in the beginning of a function isn't well-handled by the
76     itch/scratch scheme.
77 
78   * Some ideas from comments in divexact.c apply to this code too.
79 */
80 
/* the NOSTAT stuff handles properly the case where files are concatenated */
/* When STAT is defined, STAT(x) expands to x, enabling the statistics
   gathering code below; otherwise STAT(x) expands to nothing and NOSTAT is
   defined so that a later concatenated copy of this file stays disabled.  */
#ifdef NOSTAT
#undef STAT
#endif

#ifdef STAT
#undef STAT
#define STAT(x) x
#else
#define NOSTAT
#define STAT(x)
#endif
93 
94 #include <stdlib.h>		/* for NULL */
95 #include "gmp.h"
96 #include "gmp-impl.h"
97 
98 
99 /* In case k=0 (automatic choice), we distinguish 3 cases:
100    (a) dn < qn:         in = ceil(qn / ceil(qn/dn))
101    (b) dn/3 < qn <= dn: in = ceil(qn / 2)
102    (c) qn < dn/3:       in = qn
103    In all cases we have in <= dn.
104  */
105 mp_size_t
106 mpn_mu_divappr_q_choose_in (mp_size_t qn, mp_size_t dn, int k)
107 {
108   mp_size_t in;
109 
110   if (k == 0)
111     {
112       mp_size_t b;
113       if (qn > dn)
114 	{
115 	  /* Compute an inverse size that is a nice partition of the quotient.  */
116 	  b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
117 	  in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
118 	}
119       else if (3 * qn > dn)
120 	{
121 	  in = (qn - 1) / 2 + 1;	/* b = 2 */
122 	}
123       else
124 	{
125 	  in = (qn - 1) / 1 + 1;	/* b = 1 */
126 	}
127     }
128   else
129     {
130       mp_size_t xn;
131       xn = MIN (dn, qn);
132       in = (xn - 1) / k + 1;
133     }
134 
135   return in;
136 }
137 
/* Compute Q = floor(N/D) + e, 0 <= e <= 4, leaving N = {np,nn} unmodified.
   D = {dp,dn} must be normalized; the quotient is written to {qp,nn-dn}.
   Returns the high quotient limb (currently always 0; see the disabled qh
   code below).  Scratch space requirements are given by
   mpn_mu_divappr_q_itch, but see the FIXMEs about its accuracy.  */
mp_limb_t
mpn_mu_divappr_q (mp_ptr qp,
		  mp_ptr np,
		  mp_size_t nn,
		  mp_srcptr dp,
		  mp_size_t dn,
		  mp_ptr scratch)
{
  mp_size_t qn, in;
  mp_limb_t cy;
  mp_ptr ip, tp;

  /* FIXME: We should probably not handle tiny operands, but do it for now.  */
  if (dn == 1)
    {
      /* Single-limb divisor: plain schoolbook division into scratch, then
	 copy out so that np stays untouched.  */
      mpn_divrem_1 (scratch, 0L, np, nn, dp[0]);
      MPN_COPY (qp, scratch, nn - 1);
      return scratch[nn - 1];
    }

  qn = nn - dn;

#if 1
  /* If Q is smaller than D, truncate operands. */
  if (qn + 1 < dn)
    {
      /* Only the high qn+1 divisor limbs affect the approximate quotient;
	 shift both operand views up accordingly.  */
      np += dn - (qn + 1);
      nn -= dn - (qn + 1);
      dp += dn - (qn + 1);
      dn = qn + 1;

      /* Since D is cut here, we can have a carry in N'/D' even if we don't
	 have it for N/D.  */
      if (mpn_cmp (np + nn - (qn + 1), dp, qn + 1) >= 0)
	{ /* quotient is 111...111 */
	  mp_size_t i;
	  for (i = 0; i <= qn; i ++)
	    qp[i] = ~ (mp_limb_t) 0;
	  return 0;
	}
    }
#endif

  /* Compute the inverse size.  */
  in = mpn_mu_divappr_q_choose_in (qn, dn, 0);
  ASSERT (in <= dn);

#if 1
  /* This alternative inverse computation method gets slightly more accurate
     results.  FIXMEs: (1) Temp allocation needs not analysed (2) itch function
     not adapted (3) mpn_invert scratch needs not met.  */
  ip = scratch;
  tp = scratch + in + 1;

  /* compute an approximate inverse on (in+1) limbs */
  if (dn == in)
    {
      /* Whole divisor used: prepend a low limb of 1 so the inverted operand
	 is in+1 limbs, invert, then drop the low inverse limb.  */
      MPN_COPY (tp + 1, dp, in);
      tp[0] = 1;
      mpn_invert (ip, tp, in + 1, NULL);
      MPN_COPY_INCR (ip, ip + 1, in);
    }
  else
    {
      /* Truncated divisor: round the high in+1 limbs up by one ulp before
	 inverting, so the inverse is a lower bound.  */
      cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
      if (UNLIKELY (cy != 0))
	/* Rounded-up divisor overflowed, i.e. it was all ones; the inverse
	   is then zero (its implicit msb excepted).  */
	MPN_ZERO (ip, in);
      else
	{
	  mpn_invert (ip, tp, in + 1, NULL);
	  MPN_COPY_INCR (ip, ip + 1, in);
	}
    }
#else
  /* This older inverse computation method gets slightly worse results than the
     one above.  */
  ip = scratch;
  tp = scratch + in;

  /* Compute inverse of D to in+1 limbs, then round to 'in' limbs.  Ideally the
     inversion function should do this automatically.  */
  if (dn == in)
    {
      tp[in + 1] = 0;
      MPN_COPY (tp + in + 2, dp, in);
      mpn_invert (tp, tp + in + 1, in + 1, NULL);
    }
  else
    {
      mpn_invert (tp, dp + dn - (in + 1), in + 1, NULL);
    }
  cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
  if (UNLIKELY (cy != 0))
    MPN_ZERO (tp + 1, in);
  MPN_COPY (ip, tp + 1, in);
#endif

/* We can't really handle qh = 1 like this since we'd here clobber N, which is
   not allowed in the way we've defined this function's API.  */
#if 0
  qh = mpn_cmp (np + qn, dp, dn) >= 0;
  if (qh != 0)
    mpn_sub_n (np + qn, np + qn, dp, dn);
#endif

  /* Note that ip occupies the first 'in' scratch limbs, so the remaining
     scratch passed down starts at scratch + in.  */
  mpn_preinv_mu_divappr_q (qp, np, nn, dp, dn, ip, in, scratch + in);

/*  return qh; */
  return 0;
}
248 
/* Core of the approximate division: given a precomputed in-limb approximate
   inverse {ip,in} of the high divisor limbs (its most significant bit is
   implicit), develop the quotient {qp,nn-dn} block by block, high to low.
   N = {np,nn} is read only; the evolving partial remainder lives in the
   first dn scratch limbs.  Scratch must hold at least dn + max(dn+in, m)
   limbs — NOTE(review): exact requirement not pinned down here, see the
   itch-function FIXMEs.  */
void
mpn_preinv_mu_divappr_q (mp_ptr qp,
			 mp_ptr np,
			 mp_size_t nn,
			 mp_srcptr dp,
			 mp_size_t dn,
			 mp_srcptr ip,
			 mp_size_t in,
			 mp_ptr scratch)
{
  mp_ptr rp;		/* partial remainder, dn limbs */
  mp_size_t qn;
  mp_limb_t cy;
  mp_ptr tp;		/* product/temp area, after rp in scratch */
  mp_limb_t r;		/* remainder limb just above rp's dn limbs */

  qn = nn - dn;

  if (qn == 0)
    return;

  rp = scratch;
  tp = scratch + dn;

  /* Walk np and qp from the top down; each loop iteration produces the next
     'in' quotient limbs and consumes the next 'in' dividend limbs.  */
  np += qn;
  qp += qn;

  /* Initial partial remainder is the top dn dividend limbs.  */
  MPN_COPY (rp, np, dn);

  while (qn > 0)
    {
      if (qn < in)
	{
	  /* Fewer than 'in' quotient limbs remain: shrink the inverse to
	     its high qn limbs.  */
	  ip += in - qn;
	  in = qn;
	}
      np -= in;
      qp -= in;

      /* Compute the next block of quotient limbs by multiplying the inverse I
	 by the upper part of the partial remainder R.  */
      mpn_mul_n (tp, rp + dn - in, ip, in);		/* mulhi  */
      cy = mpn_add_n (qp, tp + in, rp + dn - in, in);	/* I's msb implicit */
      ASSERT_ALWAYS (cy == 0);			/* FIXME */

      qn -= in;
      if (qn == 0)
	break;

      /* Compute the product of the quotient block and the divisor D, to be
	 subtracted from the partial remainder combined with new limbs from the
	 dividend N.  We only really need the low dn limbs.  */
#if WANT_FFT
      if (ABOVE_THRESHOLD (dn, MUL_FFT_MODF_THRESHOLD))
	{
	  /* Use the wrap-around trick.  */
	  mp_size_t m, wn;
	  int k;

	  k = mpn_fft_best_k (dn + 1, 0);
	  m = mpn_fft_next_size (dn + 1, k);
	  wn = dn + in - m;			/* number of wrapped limbs */

	  /* Multiply mod B^m - 1; the high wn product limbs wrap onto the
	     low limbs and must be compensated for below.  */
	  mpn_mul_fft (tp, m, dp, dn, qp, in, k);

	  if (wn > 0)
	    {
	      /* Add back the wrapped limbs...  */
	      cy = mpn_add_n (tp, tp, rp + dn - wn, wn);
	      mpn_incr_u (tp + wn, cy);

	      /* ...and subtract 1 if the mod-(B^m-1) result underflowed,
		 detected by comparing against the known high limbs.  */
	      cy = mpn_cmp (rp + dn - in, tp + dn, m - dn) < 0;
	      mpn_decr_u (tp, cy);
	    }
	}
      else
#endif
	mpn_mul (tp, dp, dn, qp, in);		/* dn+in limbs, high 'in' cancels */

      /* High remainder limb before the subtraction below.  */
      r = rp[dn - in] - tp[dn];

      /* Subtract the product from the partial remainder combined with new
	 limbs from the dividend N, generating a new partial remainder R.  */
      if (dn != in)
	{
	  cy = mpn_sub_n (tp, np, tp, in);	/* get next 'in' limbs from N */
	  cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy);
	  MPN_COPY (rp, tp, dn);		/* FIXME: try to avoid this */
	}
      else
	{
	  cy = mpn_sub_n (rp, np, tp, in);	/* get next 'in' limbs from N */
	}

      STAT (int i; int err = 0;
	    static int errarr[5]; static int err_rec; static int tot);

      /* Check the remainder R and adjust the quotient as needed.  */
      r -= cy;
      while (r != 0)
	{
	  /* We loop 0 times with about 69% probability, 1 time with about 31%
	     probability, 2 times with about 0.6% probability, if inverse is
	     computed as recommended.  */
	  mpn_incr_u (qp, 1);
	  cy = mpn_sub_n (rp, rp, dp, dn);
	  r -= cy;
	  STAT (err++);
	}
      if (mpn_cmp (rp, dp, dn) >= 0)
	{
	  /* This is executed with about 76% probability.  */
	  mpn_incr_u (qp, 1);
	  cy = mpn_sub_n (rp, rp, dp, dn);
	  STAT (err++);
	}

      STAT (
	    tot++;
	    errarr[err]++;
	    if (err > err_rec)
	      err_rec = err;
	    if (tot % 0x10000 == 0)
	      {
		for (i = 0; i <= err_rec; i++)
		  printf ("  %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot);
		printf ("\n");
	      }
	    );
    }

  /* FIXME: We should perhaps be somewhat more elegant in our rounding of the
     quotient.  For now, just make sure the returned quotient is >= the real
     quotient.  */
  qn = nn - dn;
  cy = mpn_add_1 (qp, qp, qn, 3);
  if (cy != 0)
    {
      /* Adding 3 overflowed: clamp the quotient to all ones.  */
      MPN_ZERO (qp, qn);
      mpn_sub_1 (qp, qp, qn, 1);
    }
}
390 
391 mp_size_t
392 mpn_mu_divappr_q_itch (mp_size_t nn, mp_size_t dn, int mua_k)
393 {
394   mp_size_t qn, m;
395   int k;
396 
397   /* FIXME: This isn't very carefully written, and might grossly overestimate
398      the amount of scratch needed, and might perhaps also underestimate it,
399      leading to potential buffer overruns.  In particular k=0 might lead to
400      gross overestimates.  */
401 
402   if (dn == 1)
403     return nn;
404 
405   qn = nn - dn;
406   if (qn >= dn)
407     {
408       k = mpn_fft_best_k (dn + 1, 0);
409       m = mpn_fft_next_size (dn + 1, k);
410       return dn + (mua_k <= 1
411 		   ? 6 * dn
412 		   : m + 2 * dn);
413     }
414   else
415     {
416       k = mpn_fft_best_k (dn + 1, 0);
417       m = mpn_fft_next_size (dn + 1, k);
418       return dn + (mua_k <= 1
419 		   ? m + 4 * qn
420 		   : m + 2 * qn);
421     }
422 }
423