/* mpn_mu_divappr_q, mpn_preinv_mu_divappr_q.

   Compute Q = floor(N / D) + e.  N is nn limbs, D is dn limbs and must be
   normalized, and Q must be nn-dn limbs, 0 <= e <= 4.  The requirement that Q
   is nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us
   to let N be unmodified during the operation.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 2005, 2006, 2007, 2009, 2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


/*
   The idea of the algorithm used herein is to compute a smaller inverted value
   than used in the standard Barrett algorithm, and thus save time in the
   Newton iterations, and pay just a small price when using the inverted value
   for developing quotient bits.  This algorithm was presented at ICMS 2006.
*/
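
/* In outline (an informal sketch, derived from the code below): write B for
   the limb base 2^GMP_NUMB_BITS.  Rather than a full qn-limb Barrett inverse,
   an 'in'-limb approximate inverse I of the high limbs of D is used, with
   in <= dn, such that B^in + I ~= B^(2in) / D_hi where D_hi is the high 'in'
   limbs of D; the leading B^in term of the inverse is kept implicit.  The
   quotient is then developed in blocks of 'in' limbs, each block computed as

     q = floor (R_hi * (B^in + I) / B^in),

   where R_hi is the high 'in' limbs of the current dn-limb partial remainder
   R.  R is then updated to R * B^in + (next 'in' limbs of N) - q * D, and q
   is adjusted by the few unit increments the approximation may require.  */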

/* CAUTION: This code and the code in mu_div_qr.c should be edited in sync.

 Things to work on:

  * The itch/scratch scheme isn't perhaps such a good idea as it once seemed,
    demonstrated by the fact that the mpn_invertappr function's scratch needs
    mean that we need to keep a large allocation long after it is needed.
    Things are worse as mpn_mul_fft does not accept any scratch parameter,
    which means we'll have a large memory hole while in mpn_mul_fft.  In
    general, a peak scratch need in the beginning of a function isn't
    well-handled by the itch/scratch scheme.
*/

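/* When compiled with -DSTAT, the STAT(x) blocks below are compiled in; they
   gather and periodically print statistics on how many adjustment steps the
   quotient blocks need.  By default STAT(x) expands to nothing.  */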
#ifdef STAT
#undef STAT
#define STAT(x) x
#else
#define STAT(x)
#endif

#include <stdlib.h>		/* for NULL */
#include "gmp.h"
#include "gmp-impl.h"


mp_limb_t
mpn_mu_divappr_q (mp_ptr qp,
		  mp_srcptr np,
		  mp_size_t nn,
		  mp_srcptr dp,
		  mp_size_t dn,
		  mp_ptr scratch)
{
  mp_size_t qn, in;
  mp_limb_t cy, qh;
  mp_ptr ip, tp;

  ASSERT (dn > 1);

  qn = nn - dn;

  /* If Q is smaller than D, truncate operands. */
  if (qn + 1 < dn)
    {
      np += dn - (qn + 1);
      nn -= dn - (qn + 1);
      dp += dn - (qn + 1);
      dn = qn + 1;
    }

  /* Compute the inverse size.  */
  in = mpn_mu_divappr_q_choose_in (qn, dn, 0);
  ASSERT (in <= dn);

#if 1
  /* This alternative inverse computation method gets slightly more accurate
     results.  FIXMEs: (1) Temp allocation needs not analysed (2) itch function
     not adapted (3) mpn_invertappr scratch needs not met.  */
  ip = scratch;
  tp = scratch + in + 1;

  /* compute an approximate inverse on (in+1) limbs */
  if (dn == in)
    {
      MPN_COPY (tp + 1, dp, in);
      tp[0] = 1;
      mpn_invertappr (ip, tp, in + 1, NULL);
      MPN_COPY_INCR (ip, ip + 1, in);
    }
  else
    {
      cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
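      /* Adding 1 to the truncated divisor makes the computed inverse a lower
	 bound.  If the addition carries out, D's top in+1 limbs are all ones,
	 and the effective inverse is exactly B^in: a zero I with the implicit
	 msb (cf. "I's msb implicit" below).  */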
      if (UNLIKELY (cy != 0))
	MPN_ZERO (ip, in);
      else
	{
	  mpn_invertappr (ip, tp, in + 1, NULL);
	  MPN_COPY_INCR (ip, ip + 1, in);
	}
    }
#else
  /* This older inverse computation method gets slightly worse results than the
     one above.  */
  ip = scratch;
  tp = scratch + in;

  /* Compute inverse of D to in+1 limbs, then round to 'in' limbs.  Ideally the
     inversion function should do this automatically.  */
  if (dn == in)
    {
      tp[in + 1] = 0;
      MPN_COPY (tp + in + 2, dp, in);
      mpn_invertappr (tp, tp + in + 1, in + 1, NULL);
    }
  else
    {
      mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL);
    }
  cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
  if (UNLIKELY (cy != 0))
    MPN_ZERO (tp + 1, in);
  MPN_COPY (ip, tp + 1, in);
#endif

  qh = mpn_preinv_mu_divappr_q (qp, np, nn, dp, dn, ip, in, scratch + in);

  return qh;
}

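/* Usage sketch (illustrative only; these functions are internal and their
   interfaces may change).  Divide an nn-limb {np,nn} by a normalized dn-limb
   {dp,dn} (dn > 1, high bit of dp[dn-1] set), producing an nn-dn limb
   approximate quotient at qp plus the returned high limb qh:

     mp_limb_t qh;
     mp_ptr scratch;
     TMP_DECL;

     TMP_MARK;
     scratch = TMP_ALLOC_LIMBS (mpn_mu_divappr_q_itch (nn, dn, 0));
     qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, scratch);
     TMP_FREE;

   TMP_DECL, TMP_MARK, TMP_ALLOC_LIMBS and TMP_FREE are the usual gmp-impl.h
   temporary allocation macros.  */
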
mp_limb_t
mpn_preinv_mu_divappr_q (mp_ptr qp,
			 mp_srcptr np,
			 mp_size_t nn,
			 mp_srcptr dp,
			 mp_size_t dn,
			 mp_srcptr ip,
			 mp_size_t in,
			 mp_ptr scratch)
{
  mp_size_t qn;
  mp_limb_t cy, cx, qh;
  mp_limb_t r;
  mp_size_t tn, wn;

#define rp           scratch
#define tp           (scratch + dn)
#define scratch_out  (scratch + dn + tn)

  qn = nn - dn;

  np += qn;
  qp += qn;

  qh = mpn_cmp (np, dp, dn) >= 0;
  if (qh != 0)
    mpn_sub_n (rp, np, dp, dn);
  else
    MPN_COPY (rp, np, dn);

  if (qn == 0)
    return qh;			/* Degenerate use.  Should we allow this? */

  while (qn > 0)
    {
      if (qn < in)
	{
	  ip += in - qn;
	  in = qn;
	}
      np -= in;
      qp -= in;

      /* Compute the next block of quotient limbs by multiplying the inverse I
	 by the upper part of the partial remainder R.  */
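      /* With the implicit msb of I restored by the addition below, this is
	 q = floor (R_hi * (B^in + I) / B^in); see the outline at the top of
	 the file.  */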
      mpn_mul_n (tp, rp + dn - in, ip, in);		/* mulhi  */
      cy = mpn_add_n (qp, tp + in, rp + dn - in, in);	/* I's msb implicit */
      ASSERT_ALWAYS (cy == 0);

      qn -= in;
      if (qn == 0)
	break;

      /* Compute the product of the quotient block and the divisor D, to be
	 subtracted from the partial remainder combined with new limbs from the
	 dividend N.  We only really need the low dn limbs.  */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, in);		/* dn+in limbs, high 'in' cancels */
      else
	{
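	  /* Compute D * Q mod (B^tn - 1), with tn >= dn + 1.  The true
	     product has dn + in limbs, so its top wn limbs wrap around and
	     get added into the low part.  Since the high limbs of D * Q
	     essentially duplicate the top of the partial remainder (they
	     cancel in the subtraction below), the wrapped contribution can
	     be removed again using rp, with a possible off-by-one fixed up
	     by the cx/cy comparison.  */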
	  tn = mpn_mulmod_bnm1_next_size (dn + 1);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	  wn = dn + in - tn;			/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      cy = mpn_sub_n (tp, tp, rp + dn - wn, wn);
	      cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy);
	      cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0;
	      ASSERT_ALWAYS (cx >= cy);
	      mpn_incr_u (tp, cx - cy);
	    }
	}

      r = rp[dn - in] - tp[dn];

      /* Subtract the product from the partial remainder combined with new
	 limbs from the dividend N, generating a new partial remainder R.  */
      if (dn != in)
	{
	  cy = mpn_sub_n (tp, np, tp, in);	/* get next 'in' limbs from N */
	  cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy);
	  MPN_COPY (rp, tp, dn);		/* FIXME: try to avoid this */
	}
      else
	{
	  cy = mpn_sub_n (rp, np, tp, in);	/* get next 'in' limbs from N */
	}

      STAT (int i; int err = 0;
	    static int errarr[5]; static int err_rec; static int tot);

      /* Check the remainder R and adjust the quotient as needed.  */
      r -= cy;
      while (r != 0)
	{
	  /* We loop 0 times with about 69% probability, 1 time with about 31%
	     probability, 2 times with about 0.6% probability, if inverse is
	     computed as recommended.  */
	  mpn_incr_u (qp, 1);
	  cy = mpn_sub_n (rp, rp, dp, dn);
	  r -= cy;
	  STAT (err++);
	}
      if (mpn_cmp (rp, dp, dn) >= 0)
	{
	  /* This is executed with about 76% probability.  */
	  mpn_incr_u (qp, 1);
	  cy = mpn_sub_n (rp, rp, dp, dn);
	  STAT (err++);
	}

      STAT (
	    tot++;
	    errarr[err]++;
	    if (err > err_rec)
	      err_rec = err;
	    if (tot % 0x10000 == 0)
	      {
		for (i = 0; i <= err_rec; i++)
		  printf ("  %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot);
		printf ("\n");
	      }
	    );
    }

  /* FIXME: We should perhaps be somewhat more elegant in our rounding of the
     quotient.  For now, just make sure the returned quotient is >= the real
     quotient; add 3 with saturating arithmetic.  */
  qn = nn - dn;
  cy = mpn_add_1 (qp, qp, qn, 3);
  if (cy != 0)
    {
      if (qh != 0)
	{
	  /* Return a quotient of just 1-bits, with qh set.  */
	  mp_size_t i;
	  for (i = 0; i < qn; i++)
	    qp[i] = GMP_NUMB_MAX;
	}
      else
	{
	  /* Propagate carry into qh.  */
	  qh = 1;
	}
    }

  return qh;
}

/* In case k=0 (automatic choice), we distinguish 3 cases:
   (a) dn < qn:         in = ceil(qn / ceil(qn/dn))
   (b) dn/3 < qn <= dn: in = ceil(qn / 2)
   (c) qn < dn/3:       in = qn
   In all cases we have in <= dn.
 */
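/* For example (k = 0): qn = 100, dn = 30 is case (a), giving b =
   ceil(100/30) = 4 blocks and in = ceil(100/4) = 25; qn = 16, dn = 30 is
   case (b), giving in = ceil(16/2) = 8; qn = 5, dn = 30 is case (c),
   giving in = 5.  */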
mp_size_t
mpn_mu_divappr_q_choose_in (mp_size_t qn, mp_size_t dn, int k)
{
  mp_size_t in;

  if (k == 0)
    {
      mp_size_t b;
      if (qn > dn)
	{
	  /* Compute an inverse size that is a nice partition of the quotient.  */
	  b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
	  in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
	}
      else if (3 * qn > dn)
	{
	  in = (qn - 1) / 2 + 1;	/* b = 2 */
	}
      else
	{
	  in = (qn - 1) / 1 + 1;	/* b = 1 */
	}
    }
  else
    {
      mp_size_t xn;
      xn = MIN (dn, qn);
      in = (xn - 1) / k + 1;
    }

  return in;
}

mp_size_t
mpn_mu_divappr_q_itch (mp_size_t nn, mp_size_t dn, int mua_k)
{
  mp_size_t qn, in, itch_local, itch_out;

  qn = nn - dn;
  if (qn + 1 < dn)
    {
      dn = qn + 1;
    }
  in = mpn_mu_divappr_q_choose_in (qn, dn, mua_k);

  itch_local = mpn_mulmod_bnm1_next_size (dn + 1);
  itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in);
  return in + dn + itch_local + itch_out;
}
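
/* The scratch block is laid out as: 'in' limbs for the approximate inverse,
   dn limbs for the partial remainder (rp), itch_local limbs for the product
   (tp), and itch_out limbs of scratch for mpn_mulmod_bnm1.  NB: as the
   FIXMEs in mpn_mu_divappr_q note, this has not been re-analysed for the
   preferred (#if 1) inverse computation.  */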