/* mpn_mu_divappr_q, mpn_preinv_mu_divappr_q.

   Compute Q = floor(N / D) + e.  N is nn limbs, D is dn limbs and must be
   normalized, and Q must be nn-dn limbs, 0 <= e <= 4.  The requirement that Q
   is nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us
   to let N be unmodified during the operation.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 2005-2007, 2009, 2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


/*
   The idea of the algorithm used herein is to compute a smaller inverted value
   than used in the standard Barrett algorithm, and thus save time in the
   Newton iterations, and pay just a small price when using the inverted value
   for developing quotient bits.  This algorithm was presented at ICMS 2006.
*/
/* CAUTION: This code and the code in mu_div_qr.c should be edited in sync.

 Things to work on:

  * The itch/scratch scheme isn't perhaps such a good idea as it once seemed,
    demonstrated by the fact that the mpn_invertappr function's scratch needs
    mean that we need to keep a large allocation long after it is needed.
    Things are worse as mpn_mul_fft does not accept any scratch parameter,
    which means we'll have a large memory hole while in mpn_mul_fft.  In
    general, a peak scratch need in the beginning of a function isn't
    well-handled by the itch/scratch scheme.
*/
63 #ifdef STAT
64 #undef STAT
65 #define STAT(x) x
66 #else
67 #define STAT(x)
68 #endif
69 
70 #include <stdlib.h>		/* for NULL */
71 #include "gmp.h"
72 #include "gmp-impl.h"
73 
74 
75 mp_limb_t
mpn_mu_divappr_q(mp_ptr qp,mp_srcptr np,mp_size_t nn,mp_srcptr dp,mp_size_t dn,mp_ptr scratch)76 mpn_mu_divappr_q (mp_ptr qp,
77 		  mp_srcptr np,
78 		  mp_size_t nn,
79 		  mp_srcptr dp,
80 		  mp_size_t dn,
81 		  mp_ptr scratch)
82 {
83   mp_size_t qn, in;
84   mp_limb_t cy, qh;
85   mp_ptr ip, tp;
86 
87   ASSERT (dn > 1);
88 
89   qn = nn - dn;
90 
91   /* If Q is smaller than D, truncate operands. */
92   if (qn + 1 < dn)
93     {
94       np += dn - (qn + 1);
95       nn -= dn - (qn + 1);
96       dp += dn - (qn + 1);
97       dn = qn + 1;
98     }
99 
100   /* Compute the inverse size.  */
101   in = mpn_mu_divappr_q_choose_in (qn, dn, 0);
102   ASSERT (in <= dn);
103 
104 #if 1
105   /* This alternative inverse computation method gets slightly more accurate
106      results.  FIXMEs: (1) Temp allocation needs not analysed (2) itch function
107      not adapted (3) mpn_invertappr scratch needs not met.  */
108   ip = scratch;
109   tp = scratch + in + 1;
110 
111   /* compute an approximate inverse on (in+1) limbs */
112   if (dn == in)
113     {
114       MPN_COPY (tp + 1, dp, in);
115       tp[0] = 1;
116       mpn_invertappr (ip, tp, in + 1, NULL);
117       MPN_COPY_INCR (ip, ip + 1, in);
118     }
119   else
120     {
121       cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
122       if (UNLIKELY (cy != 0))
123 	MPN_ZERO (ip, in);
124       else
125 	{
126 	  mpn_invertappr (ip, tp, in + 1, NULL);
127 	  MPN_COPY_INCR (ip, ip + 1, in);
128 	}
129     }
130 #else
131   /* This older inverse computation method gets slightly worse results than the
132      one above.  */
133   ip = scratch;
134   tp = scratch + in;
135 
136   /* Compute inverse of D to in+1 limbs, then round to 'in' limbs.  Ideally the
137      inversion function should do this automatically.  */
138   if (dn == in)
139     {
140       tp[in + 1] = 0;
141       MPN_COPY (tp + in + 2, dp, in);
142       mpn_invertappr (tp, tp + in + 1, in + 1, NULL);
143     }
144   else
145     {
146       mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL);
147     }
148   cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
149   if (UNLIKELY (cy != 0))
150     MPN_ZERO (tp + 1, in);
151   MPN_COPY (ip, tp + 1, in);
152 #endif
153 
154   qh = mpn_preinv_mu_divappr_q (qp, np, nn, dp, dn, ip, in, scratch + in);
155 
156   return qh;
157 }
158 
159 mp_limb_t
mpn_preinv_mu_divappr_q(mp_ptr qp,mp_srcptr np,mp_size_t nn,mp_srcptr dp,mp_size_t dn,mp_srcptr ip,mp_size_t in,mp_ptr scratch)160 mpn_preinv_mu_divappr_q (mp_ptr qp,
161 			 mp_srcptr np,
162 			 mp_size_t nn,
163 			 mp_srcptr dp,
164 			 mp_size_t dn,
165 			 mp_srcptr ip,
166 			 mp_size_t in,
167 			 mp_ptr scratch)
168 {
169   mp_size_t qn;
170   mp_limb_t cy, cx, qh;
171   mp_limb_t r;
172   mp_size_t tn, wn;
173 
174 #define rp           scratch
175 #define tp           (scratch + dn)
176 #define scratch_out  (scratch + dn + tn)
177 
178   qn = nn - dn;
179 
180   np += qn;
181   qp += qn;
182 
183   qh = mpn_cmp (np, dp, dn) >= 0;
184   if (qh != 0)
185     mpn_sub_n (rp, np, dp, dn);
186   else
187     MPN_COPY (rp, np, dn);
188 
189   if (qn == 0)
190     return qh;			/* Degenerate use.  Should we allow this? */
191 
192   while (qn > 0)
193     {
194       if (qn < in)
195 	{
196 	  ip += in - qn;
197 	  in = qn;
198 	}
199       np -= in;
200       qp -= in;
201 
202       /* Compute the next block of quotient limbs by multiplying the inverse I
203 	 by the upper part of the partial remainder R.  */
204       mpn_mul_n (tp, rp + dn - in, ip, in);		/* mulhi  */
205       cy = mpn_add_n (qp, tp + in, rp + dn - in, in);	/* I's msb implicit */
206       ASSERT_ALWAYS (cy == 0);
207 
208       qn -= in;
209       if (qn == 0)
210 	break;
211 
212       /* Compute the product of the quotient block and the divisor D, to be
213 	 subtracted from the partial remainder combined with new limbs from the
214 	 dividend N.  We only really need the low dn limbs.  */
215 
216       if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
217 	mpn_mul (tp, dp, dn, qp, in);		/* dn+in limbs, high 'in' cancels */
218       else
219 	{
220 	  tn = mpn_mulmod_bnm1_next_size (dn + 1);
221 	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
222 	  wn = dn + in - tn;			/* number of wrapped limbs */
223 	  if (wn > 0)
224 	    {
225 	      cy = mpn_sub_n (tp, tp, rp + dn - wn, wn);
226 	      cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy);
227 	      cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0;
228 	      ASSERT_ALWAYS (cx >= cy);
229 	      mpn_incr_u (tp, cx - cy);
230 	    }
231 	}
232 
233       r = rp[dn - in] - tp[dn];
234 
235       /* Subtract the product from the partial remainder combined with new
236 	 limbs from the dividend N, generating a new partial remainder R.  */
237       if (dn != in)
238 	{
239 	  cy = mpn_sub_n (tp, np, tp, in);	/* get next 'in' limbs from N */
240 	  cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy);
241 	  MPN_COPY (rp, tp, dn);		/* FIXME: try to avoid this */
242 	}
243       else
244 	{
245 	  cy = mpn_sub_n (rp, np, tp, in);	/* get next 'in' limbs from N */
246 	}
247 
248       STAT (int i; int err = 0;
249 	    static int errarr[5]; static int err_rec; static int tot);
250 
251       /* Check the remainder R and adjust the quotient as needed.  */
252       r -= cy;
253       while (r != 0)
254 	{
255 	  /* We loop 0 times with about 69% probability, 1 time with about 31%
256 	     probability, 2 times with about 0.6% probability, if inverse is
257 	     computed as recommended.  */
258 	  mpn_incr_u (qp, 1);
259 	  cy = mpn_sub_n (rp, rp, dp, dn);
260 	  r -= cy;
261 	  STAT (err++);
262 	}
263       if (mpn_cmp (rp, dp, dn) >= 0)
264 	{
265 	  /* This is executed with about 76% probability.  */
266 	  mpn_incr_u (qp, 1);
267 	  cy = mpn_sub_n (rp, rp, dp, dn);
268 	  STAT (err++);
269 	}
270 
271       STAT (
272 	    tot++;
273 	    errarr[err]++;
274 	    if (err > err_rec)
275 	      err_rec = err;
276 	    if (tot % 0x10000 == 0)
277 	      {
278 		for (i = 0; i <= err_rec; i++)
279 		  printf ("  %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot);
280 		printf ("\n");
281 	      }
282 	    );
283     }
284 
285   /* FIXME: We should perhaps be somewhat more elegant in our rounding of the
286      quotient.  For now, just make sure the returned quotient is >= the real
287      quotient; add 3 with saturating arithmetic.  */
288   qn = nn - dn;
289   cy += mpn_add_1 (qp, qp, qn, 3);
290   if (cy != 0)
291     {
292       if (qh != 0)
293 	{
294 	  /* Return a quotient of just 1-bits, with qh set.  */
295 	  mp_size_t i;
296 	  for (i = 0; i < qn; i++)
297 	    qp[i] = GMP_NUMB_MAX;
298 	}
299       else
300 	{
301 	  /* Propagate carry into qh.  */
302 	  qh = 1;
303 	}
304     }
305 
306   return qh;
307 }
308 
309 /* In case k=0 (automatic choice), we distinguish 3 cases:
310    (a) dn < qn:         in = ceil(qn / ceil(qn/dn))
311    (b) dn/3 < qn <= dn: in = ceil(qn / 2)
312    (c) qn < dn/3:       in = qn
313    In all cases we have in <= dn.
314  */
315 mp_size_t
mpn_mu_divappr_q_choose_in(mp_size_t qn,mp_size_t dn,int k)316 mpn_mu_divappr_q_choose_in (mp_size_t qn, mp_size_t dn, int k)
317 {
318   mp_size_t in;
319 
320   if (k == 0)
321     {
322       mp_size_t b;
323       if (qn > dn)
324 	{
325 	  /* Compute an inverse size that is a nice partition of the quotient.  */
326 	  b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
327 	  in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
328 	}
329       else if (3 * qn > dn)
330 	{
331 	  in = (qn - 1) / 2 + 1;	/* b = 2 */
332 	}
333       else
334 	{
335 	  in = (qn - 1) / 1 + 1;	/* b = 1 */
336 	}
337     }
338   else
339     {
340       mp_size_t xn;
341       xn = MIN (dn, qn);
342       in = (xn - 1) / k + 1;
343     }
344 
345   return in;
346 }
347 
348 mp_size_t
mpn_mu_divappr_q_itch(mp_size_t nn,mp_size_t dn,int mua_k)349 mpn_mu_divappr_q_itch (mp_size_t nn, mp_size_t dn, int mua_k)
350 {
351   mp_size_t qn, in, itch_local, itch_out;
352 
353   qn = nn - dn;
354   if (qn + 1 < dn)
355     {
356       dn = qn + 1;
357     }
358   in = mpn_mu_divappr_q_choose_in (qn, dn, mua_k);
359 
360   itch_local = mpn_mulmod_bnm1_next_size (dn + 1);
361   itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in);
362   return in + dn + itch_local + itch_out;
363 }
364