/* mpn_mu_div_qr, mpn_preinv_mu_div_qr.

   Compute Q = floor(N / D) and R = N - Q*D.  N is nn limbs, D is dn limbs
   and must be normalized, and Q must be nn-dn limbs.  The requirement that
   Q be nn-dn limbs (and not nn-dn+1 limbs) was put in place to allow N to
   be left unmodified during the operation.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE INTERFACE.  IT IS
   ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS
   ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP
   RELEASE.

Copyright 2005, 2006, 2007 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


/* We use the "misunderstanding algorithm" (MUA), discovered by Paul Zimmermann
   and Torbjorn Granlund when Torbjorn misunderstood Paul's explanation of
   Jebelean's bidirectional exact division algorithm.

   The idea of this algorithm is to compute a smaller inverted value than is
   used in the standard Barrett algorithm, thus saving time in the Newton
   iterations while paying only a small price when using the inverted value
   to develop quotient bits.

   Written by Torbjorn Granlund.  Paul Zimmermann suggested the use of the
   "wrap around" trick.  Based on the GMP divexact code and inspired by code
   contributed to GMP by Karl Hasselstroem.
*/
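
/* Overview of the scheme implemented below: instead of inverting the divisor
   to full precision as in the standard Barrett algorithm, an 'in'-limb
   inverse I of the high end of D is computed, with in <= dn chosen by
   mpn_mu_div_qr_choose_in.  The quotient is then developed in blocks of 'in'
   limbs: each block is obtained by a mulhi of I with the top limbs of the
   current partial remainder, after which the block times D is subtracted to
   form the next partial remainder.  */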


/* CAUTION: This code and the code in mu_divappr_q.c should be edited in lockstep.

 Things to work on:

  * Passing k isn't a great interface.  Either 'in' should be passed, or it
    should be determined by the code.

  * The current mpn_mu_div_qr_itch isn't exactly scientifically written.
    Scratch space buffer overruns are not unlikely before some analysis is
    applied.  Since the scratch requirements are expected to change, such an
    analysis will have to wait until things settle.

  * This isn't optimal when the remainder isn't needed, since the final
    multiplication could then be made special and take O(1) time on average.
    This is particularly bad when qn << dn.  At some level, code as in GMP 4
    mpn_tdiv_qr should be used, effectively dividing the leading 2qn dividend
    limbs by the qn divisor limbs.

  * This isn't optimal when the quotient isn't needed, as it might take a lot
    of space.  The computation is always needed, though, so there is no time
    to save with special code.

  * The itch/scratch scheme perhaps isn't as good an idea as it once seemed,
    as demonstrated by the fact that the mpn_inv function's scratch needs mean
    that we must keep a large allocation long after it is needed.  Things are
    made worse by mpn_mul_fft not accepting any scratch parameter, which means
    we'll have a large memory hole while in mpn_mul_fft.  In general, a peak
    scratch need at the beginning of a function isn't well handled by the
    itch/scratch scheme.

  * Some ideas from the comments in divexact.c apply to this code too.
*/

/* The NOSTAT logic properly handles the case where source files are
   concatenated.  */
#ifdef NOSTAT
#undef STAT
#endif

#ifdef STAT
#undef STAT
#define STAT(x) x
#else
#define NOSTAT
#define STAT(x)
#endif

#include <stdlib.h>		/* for NULL */
#include "gmp.h"
#include "gmp-impl.h"


/* In case k=0 (automatic choice), we distinguish 3 cases:
   (a) dn < qn:         in = ceil(qn / ceil(qn/dn))
   (b) dn/3 < qn <= dn: in = ceil(qn / 2)
   (c) qn < dn/3:       in = qn
   In all cases we have in <= dn.
 */
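/* Worked examples: qn = 10, dn = 4 falls in case (a), giving
   b = ceil(10/4) = 3 blocks and in = ceil(10/3) = 4; qn = 3, dn = 4 falls
   in case (b), giving in = ceil(3/2) = 2; qn = 1, dn = 4 falls in case (c),
   giving in = 1.  */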
mp_size_t
mpn_mu_div_qr_choose_in (mp_size_t qn, mp_size_t dn, int k)
{
  mp_size_t in;

  if (k == 0)
    {
      mp_size_t b;
      if (qn > dn)
	{
	  /* Compute an inverse size that is a nice partition of the quotient.  */
	  b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
	  in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
	}
      else if (3 * qn > dn)
	{
	  in = (qn - 1) / 2 + 1;	/* b = 2 */
	}
      else
	{
	  in = (qn - 1) / 1 + 1;	/* b = 1 */
	}
    }
  else
    {
      mp_size_t xn;
      xn = MIN (dn, qn);
      in = (xn - 1) / k + 1;
    }

  return in;
}

static mp_limb_t
mpn_mu_div_qr2 (mp_ptr qp,
		mp_ptr rp,
		mp_ptr np,
		mp_size_t nn,
		mp_srcptr dp,
		mp_size_t dn,
		mp_ptr scratch)
{
  mp_size_t qn, in;
  mp_limb_t cy;
  mp_ptr ip, tp;

  /* FIXME: We should probably not handle tiny operands, but do it for now.  */
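  /* With dn == 1 the quotient has nn limbs, one more than the nn-1 limbs of
     space that Q provides, so develop it in scratch and hand back the top
     limb as the high quotient limb.  */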
  if (dn == 1)
    {
      rp[0] = mpn_divrem_1 (scratch, 0L, np, nn, dp[0]);
      MPN_COPY (qp, scratch, nn - 1);
      return scratch[nn - 1];
    }

  qn = nn - dn;

  /* Compute the inverse size.  */
  in = mpn_mu_div_qr_choose_in (qn, dn, 0);
  ASSERT (in <= dn);

#if 1
  /* This alternative inverse computation method gets slightly more accurate
     results.  FIXMEs: (1) temp allocation needs not analysed, (2) itch
     function not adapted, (3) mpn_invert scratch needs not met.  */
  ip = scratch;
  tp = scratch + in + 1;

  /* Compute an approximate inverse on in+1 limbs.  */
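  /* Both branches below make the value being inverted slightly larger than
     the corresponding part of D (a low limb of 1 is appended when in == dn,
     and 1 is added to the top in+1 limbs otherwise), so that the computed
     inverse errs on the low side; this matches the adjustment code in
     mpn_preinv_mu_div_qr, which only ever increments the quotient.  If the
     addition carries out, the top limbs of D were all ones and the inverse
     is exactly B^in (B the limb base); since the inverse's most significant
     bit is implicit (see "I's msb implicit" below), its explicit limbs are
     then all zero.  */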
  if (dn == in)
    {
      MPN_COPY (tp + 1, dp, in);
      tp[0] = 1;
      mpn_invert (ip, tp, in + 1, NULL);
      MPN_COPY_INCR (ip, ip + 1, in);
    }
  else
    {
      cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
      if (UNLIKELY (cy != 0))
	MPN_ZERO (ip, in);
      else
	{
	  mpn_invert (ip, tp, in + 1, NULL);
	  MPN_COPY_INCR (ip, ip + 1, in);
	}
    }
#else
  /* This older inverse computation method gets slightly worse results than
     the one above.  */
  ip = scratch;
  tp = scratch + in;

  /* Compute the inverse of D to in+1 limbs, then round to 'in' limbs.
     Ideally the inversion function should do this automatically.  */
  if (dn == in)
    {
      tp[in + 1] = 0;
      MPN_COPY (tp + in + 2, dp, in);
      mpn_invert (tp, tp + in + 1, in + 1, NULL);
    }
  else
    {
      mpn_invert (tp, dp + dn - (in + 1), in + 1, NULL);
    }
  cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
  if (UNLIKELY (cy != 0))
    MPN_ZERO (tp + 1, in);
  MPN_COPY (ip, tp + 1, in);
#endif

/* We can't handle qh = 1 like this, since we would clobber N here, and this
   function's API does not allow N to be modified.  */
#if 0
  qh = mpn_cmp (np + qn, dp, dn) >= 0;
  if (qh != 0)
    mpn_sub_n (np + qn, np + qn, dp, dn);
#endif

  mpn_preinv_mu_div_qr (qp, rp, np, nn, dp, dn, ip, in, scratch + in);

/*  return qh; */
  return 0;
}

void
mpn_preinv_mu_div_qr (mp_ptr qp,
		      mp_ptr rp,
		      mp_ptr np,
		      mp_size_t nn,
		      mp_srcptr dp,
		      mp_size_t dn,
		      mp_srcptr ip,
		      mp_size_t in,
		      mp_ptr scratch)
{
  mp_size_t qn;
  mp_limb_t cy;
  mp_ptr tp;
  mp_limb_t r;

  qn = nn - dn;

  if (qn == 0)
    {
      MPN_COPY (rp, np, dn);
      return;
    }

  tp = scratch;

  np += qn;
  qp += qn;

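  /* The partial remainder R lives in rp and starts out as the top dn limbs
     of N; each iteration of the loop below then pulls in the next 'in'
     dividend limbs.  */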
  MPN_COPY (rp, np, dn);

  while (qn > 0)
    {
      if (qn < in)
	{
	  ip += in - qn;
	  in = qn;
	}
      np -= in;
      qp -= in;

      /* Compute the next block of quotient limbs by multiplying the inverse I
	 by the upper part of the partial remainder R.  */
      mpn_mul_n (tp, rp + dn - in, ip, in);		/* mulhi  */
      cy = mpn_add_n (qp, tp + in, rp + dn - in, in);	/* I's msb implicit */
      ASSERT_ALWAYS (cy == 0);			/* FIXME */

      /* Compute the product of the quotient block and the divisor D, to be
	 subtracted from the partial remainder combined with new limbs from the
	 dividend N.  We only really need the low dn limbs.  */
#if WANT_FFT
      if (ABOVE_THRESHOLD (dn, MUL_FFT_MODF_THRESHOLD))
	{
	  /* Use the wrap-around trick.  */
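	  /* mpn_mul_fft computes the product mod B^m+1 (B the limb base),
	     so the top wn = dn+in-m limbs of the full dn+in limb product
	     wrap around and are subtracted from its low limbs.  Since the
	     high 'in' product limbs agree with the top of the partial
	     remainder (see the "high 'in' cancels" comment below), adding
	     back the corresponding limbs of R undoes the wrap; the
	     compare/decrement that follows fixes up a possible borrow.  */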
	  mp_size_t m, wn;
	  int k;

	  k = mpn_fft_best_k (dn + 1, 0);
	  m = mpn_fft_next_size (dn + 1, k);
	  wn = dn + in - m;			/* number of wrapped limbs */

	  mpn_mul_fft (tp, m, dp, dn, qp, in, k);

	  if (wn > 0)
	    {
	      cy = mpn_add_n (tp, tp, rp + dn - wn, wn);
	      mpn_incr_u (tp + wn, cy);

	      cy = mpn_cmp (rp + dn - in, tp + dn, m - dn) < 0;
	      mpn_decr_u (tp, cy);
	    }
	}
      else
#endif
	mpn_mul (tp, dp, dn, qp, in);		/* dn+in limbs, high 'in' cancels */

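      /* The high 'in' limbs of the product cancel against the top 'in'
	 limbs of R, except possibly at the lowest of them, since the
	 quotient block may be slightly too small.  r captures that
	 difference; combined with the borrow from the subtraction below, it
	 acts as the carry limb of the new partial remainder and is brought
	 to zero by the adjustment loop further down.  */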
      r = rp[dn - in] - tp[dn];

      /* Subtract the product from the partial remainder combined with new
	 limbs from the dividend N, generating a new partial remainder R.  */
      if (dn != in)
	{
	  cy = mpn_sub_n (tp, np, tp, in);	/* get next 'in' limbs from N */
	  cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy);
	  MPN_COPY (rp, tp, dn);		/* FIXME: try to avoid this */
	}
      else
	{
	  cy = mpn_sub_n (rp, np, tp, in);	/* get next 'in' limbs from N */
	}

      STAT (int i; int err = 0;
	    static int errarr[5]; static int err_rec; static int tot);

      /* Check the remainder R and adjust the quotient as needed.  */
      r -= cy;
      while (r != 0)
	{
	  /* We loop 0 times with about 69% probability, 1 time with about
	     31% probability, and 2 times with about 0.6% probability, if
	     the inverse is computed as recommended.  */
	  mpn_incr_u (qp, 1);
	  cy = mpn_sub_n (rp, rp, dp, dn);
	  r -= cy;
	  STAT (err++);
	}
      if (mpn_cmp (rp, dp, dn) >= 0)
	{
	  /* This is executed with about 76% probability.  */
	  mpn_incr_u (qp, 1);
	  cy = mpn_sub_n (rp, rp, dp, dn);
	  STAT (err++);
	}

      STAT (
	    tot++;
	    errarr[err]++;
	    if (err > err_rec)
	      err_rec = err;
	    if (tot % 0x10000 == 0)
	      {
		for (i = 0; i <= err_rec; i++)
		  printf ("  %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot);
		printf ("\n");
	      }
	    );

      qn -= in;
    }
}

#define THRES 100		/* FIXME: somewhat arbitrary */

#ifdef CHECK
#undef THRES
#define THRES 1
#endif
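
/* Sketch of the intended call pattern (hypothetical caller code; an actual
   GMP caller would typically obtain the scratch space with TMP_ALLOC_LIMBS
   from gmp-impl.h):

     mp_ptr scratch = TMP_ALLOC_LIMBS (mpn_mu_div_qr_itch (nn, dn, 0));
     mpn_mu_div_qr (qp, rp, np, nn, dp, dn, scratch);

   where qp provides nn-dn limbs for Q and rp provides dn limbs for R.  The
   return value is the high quotient limb; see the FIXME at the end of the
   function.  */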

mp_limb_t
mpn_mu_div_qr (mp_ptr qp,
	       mp_ptr rp,
	       mp_ptr np,
	       mp_size_t nn,
	       mp_srcptr dp,
	       mp_size_t dn,
	       mp_ptr scratch)
{
  mp_size_t qn;

  qn = nn - dn;
  if (qn + THRES < dn)
    {
      /* |______________|________|   dividend				  nn
		|_______|________|   divisor				  dn

		|______|	     quotient (prel)			  qn

		 |_______________|   quotient * ignored-part-of(divisor)  dn-1
      */

      mp_limb_t cy, x;

      if (mpn_cmp (np + nn - (qn + 1), dp + dn - (qn + 1), qn + 1) >= 0)
	{
	  /* The quotient is 111...111; this rare case could be optimized at
	     some point.  */
	  mpn_mu_div_qr2 (qp, rp, np, nn, dp, dn, scratch);
	  return 0;
	}

      /* Compute a preliminary quotient and a partial remainder by dividing the
	 most significant limbs of each operand.  */
      mpn_mu_div_qr2 (qp, rp + nn - (2 * qn + 1),
		      np + nn - (2 * qn + 1), 2 * qn + 1,
		      dp + dn - (qn + 1), qn + 1,
		      scratch);

      /* Multiply the quotient by the divisor limbs ignored above.  */
      if (dn - (qn + 1) > qn)
	mpn_mul (scratch, dp, dn - (qn + 1), qp, qn);  /* prod is dn-1 limbs */
      else
	mpn_mul (scratch, qp, qn, dp, dn - (qn + 1));  /* prod is dn-1 limbs */

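      /* Subtract the product from the low dividend limbs and propagate the
	 borrow through the preliminary remainder; if the borrow out of the
	 top remainder limb underflows, the preliminary quotient was one too
	 large, so add D back and decrement it.  */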
      cy = mpn_sub_n (rp, np, scratch, nn - (2 * qn + 1));
      cy = mpn_sub_nc (rp + nn - (2 * qn + 1),
		       rp + nn - (2 * qn + 1),
		       scratch + nn - (2 * qn + 1),
		       qn, cy);
      x = rp[dn - 1];
      rp[dn - 1] = x - cy;
      if (cy > x)
	{
	  mpn_decr_u (qp, 1);
	  mpn_add_n (rp, rp, dp, dn);
	}
    }
  else
    {
      return mpn_mu_div_qr2 (qp, rp, np, nn, dp, dn, scratch);
    }

  return 0;			/* FIXME */
}

mp_size_t
mpn_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, int mua_k)
{
  mp_size_t qn, m;
  int k;

  /* FIXME: This isn't very carefully written, and might grossly overestimate
     the amount of scratch needed, and might perhaps also underestimate it,
     leading to potential buffer overruns.  In particular, k=0 might lead to
     gross overestimates.  */

  if (dn == 1)
    return nn;

  qn = nn - dn;
  if (qn >= dn)
    {
      k = mpn_fft_best_k (dn + 1, 0);
      m = mpn_fft_next_size (dn + 1, k);
      return (mua_k <= 1
	      ? 6 * dn
	      : m + 2 * dn);
    }
  else
    {
      k = mpn_fft_best_k (dn + 1, 0);
      m = mpn_fft_next_size (dn + 1, k);
      return (mua_k <= 1
	      ? m + 4 * qn
	      : m + 2 * qn);
    }
}
464