xref: /dragonfly/contrib/gmp/mpn/generic/mu_div_q.c (revision 37de577a)
1 /* mpn_mu_div_q.
2 
3    Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
4 
5    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
6    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
7    GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
8 
9 Copyright 2005, 2006, 2007, 2009, 2010 Free Software Foundation, Inc.
10 
11 This file is part of the GNU MP Library.
12 
13 The GNU MP Library is free software; you can redistribute it and/or modify
14 it under the terms of the GNU Lesser General Public License as published by
15 the Free Software Foundation; either version 3 of the License, or (at your
16 option) any later version.
17 
18 The GNU MP Library is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
20 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
21 License for more details.
22 
23 You should have received a copy of the GNU Lesser General Public License
24 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
25 
26 
27 /*
28    The idea of the algorithm used herein is to compute a smaller inverted value
29    than used in the standard Barrett algorithm, and thus save time in the
30    Newton iterations, and pay just a small price when using the inverted value
31    for developing quotient bits.  This algorithm was presented at ICMS 2006.
32 */
33 
34 /*
35   Things to work on:
36 
37   1. This is a rudimentary implementation of mpn_mu_div_q.  The algorithm is
38      probably close to optimal, except when mpn_mu_divappr_q fails.
39 
40      An alternative which could be considered for much simpler code for the
41      complex qn>=dn arm would be to allocate a temporary nn+1 limb buffer, then
42      simply call mpn_mu_divappr_q.  Such a temporary allocation is
43      unfortunately very large.
44 
45   2. We used to fall back to mpn_mu_div_qr when we detect a possible
46      mpn_mu_divappr_q rounding problem, now we multiply and compare.
47      Unfortunately, since mpn_mu_divappr_q does not return the partial
48      remainder, this also doesn't become optimal.  A mpn_mu_divappr_qr could
49      solve that.
50 
51   3. The allocations done here should be made from the scratch area, which
52      then would need to be amended.
53 */
54 
55 #include <stdlib.h>		/* for NULL */
56 #include "gmp.h"
57 #include "gmp-impl.h"
58 
59 
60 mp_limb_t
61 mpn_mu_div_q (mp_ptr qp,
62 	      mp_srcptr np, mp_size_t nn,
63 	      mp_srcptr dp, mp_size_t dn,
64 	      mp_ptr scratch)
65 {
66   mp_ptr tp, rp, ip, this_ip;
67   mp_size_t qn, in, this_in;
68   mp_limb_t cy, qh;
69   TMP_DECL;
70 
71   TMP_MARK;
72 
73   qn = nn - dn;
74 
75   tp = TMP_BALLOC_LIMBS (qn + 1);
76 
77   if (qn >= dn)			/* nn >= 2*dn + 1 */
78     {
79       /* Find max inverse size needed by the two preinv calls.  FIXME: This is
80 	 not optimal, it underestimates the invariance.  */
81       if (dn != qn)
82 	{
83 	  mp_size_t in1, in2;
84 
85 	  in1 = mpn_mu_div_qr_choose_in (qn - dn, dn, 0);
86 	  in2 = mpn_mu_divappr_q_choose_in (dn + 1, dn, 0);
87 	  in = MAX (in1, in2);
88 	}
89       else
90 	{
91 	  in = mpn_mu_divappr_q_choose_in (dn + 1, dn, 0);
92 	}
93 
94       ip = TMP_BALLOC_LIMBS (in + 1);
95 
96       if (dn == in)
97 	{
98 	  MPN_COPY (scratch + 1, dp, in);
99 	  scratch[0] = 1;
100 	  mpn_invertappr (ip, scratch, in + 1, NULL);
101 	  MPN_COPY_INCR (ip, ip + 1, in);
102 	}
103       else
104 	{
105 	  cy = mpn_add_1 (scratch, dp + dn - (in + 1), in + 1, 1);
106 	  if (UNLIKELY (cy != 0))
107 	    MPN_ZERO (ip, in);
108 	  else
109 	    {
110 	      mpn_invertappr (ip, scratch, in + 1, NULL);
111 	      MPN_COPY_INCR (ip, ip + 1, in);
112 	    }
113 	}
114 
115        /* |_______________________|   dividend
116 			 |________|   divisor  */
117       rp = TMP_BALLOC_LIMBS (2 * dn + 1);
118 
119       this_in = mpn_mu_div_qr_choose_in (qn - dn, dn, 0);
120       this_ip = ip + in - this_in;
121       qh = mpn_preinv_mu_div_qr (tp + dn + 1, rp + dn + 1, np + dn, qn, dp, dn,
122 				 this_ip, this_in, scratch);
123 
124       MPN_COPY (rp + 1, np, dn);
125       rp[0] = 0;
126       this_in = mpn_mu_divappr_q_choose_in (dn + 1, dn, 0);
127       this_ip = ip + in - this_in;
128       cy = mpn_preinv_mu_divappr_q (tp, rp, 2 * dn + 1, dp, dn,
129 				    this_ip, this_in, scratch);
130 
131       if (UNLIKELY (cy != 0))
132 	{
133 	  /* Since the partial remainder fed to mpn_preinv_mu_divappr_q was
134 	     canonically reduced, replace the returned value of B^(qn-dn)+eps
135 	     by the largest possible value.  */
136 	  mp_size_t i;
137 	  for (i = 0; i < dn + 1; i++)
138 	    tp[i] = GMP_NUMB_MAX;
139 	}
140 
141       /* The max error of mpn_mu_divappr_q is +4.  If the low quotient limb is
142 	 greater than the max error, we cannot trust the quotient.  */
143       if (tp[0] > 4)
144 	{
145 	  MPN_COPY (qp, tp + 1, qn);
146 	}
147       else
148 	{
149 	  mp_limb_t cy;
150 	  mp_ptr pp;
151 
152 	  /* FIXME: can we use already allocated space? */
153 	  pp = TMP_BALLOC_LIMBS (nn);
154 	  mpn_mul (pp, tp + 1, qn, dp, dn);
155 
156 	  cy = (qh != 0) ? mpn_add_n (pp + qn, pp + qn, dp, dn) : 0;
157 
158 	  if (cy || mpn_cmp (pp, np, nn) > 0) /* At most is wrong by one, no cycle. */
159 	    qh -= mpn_sub_1 (qp, tp + 1, qn, 1);
160 	  else /* Same as above */
161 	    MPN_COPY (qp, tp + 1, qn);
162 	}
163     }
164   else
165     {
166        /* |_______________________|   dividend
167 		 |________________|   divisor  */
168 
169       /* FIXME: When nn = 2dn-1, qn becomes dn-1, and the numerator size passed
170 	 here becomes 2dn, i.e., more than nn.  This shouldn't hurt, since only
171 	 the most significant dn-1 limbs will actually be read, but it is not
172 	 pretty.  */
173 
174       qh = mpn_mu_divappr_q (tp, np + nn - (2 * qn + 2), 2 * qn + 2,
175 			     dp + dn - (qn + 1), qn + 1, scratch);
176 
177       /* The max error of mpn_mu_divappr_q is +4, but we get an additional
178          error from the divisor truncation.  */
179       if (tp[0] > 6)
180 	{
181 	  MPN_COPY (qp, tp + 1, qn);
182 	}
183       else
184 	{
185 	  mp_limb_t cy;
186 
187 	  /* FIXME: a shorter product should be enough; we may use already
188 	     allocated space... */
189 	  rp = TMP_BALLOC_LIMBS (nn);
190 	  mpn_mul (rp, dp, dn, tp + 1, qn);
191 
192 	  cy = (qh != 0) ? mpn_add_n (rp + qn, rp + qn, dp, dn) : 0;
193 
194 	  if (cy || mpn_cmp (rp, np, nn) > 0) /* At most is wrong by one, no cycle. */
195 	    qh -= mpn_sub_1 (qp, tp + 1, qn, 1);
196 	  else /* Same as above */
197 	    MPN_COPY (qp, tp + 1, qn);
198 	}
199     }
200 
201   TMP_FREE;
202   return qh;
203 }
204 
205 mp_size_t
206 mpn_mu_div_q_itch (mp_size_t nn, mp_size_t dn, int mua_k)
207 {
208   mp_size_t qn, itch1, itch2;
209 
210   qn = nn - dn;
211   if (qn >= dn)
212     {
213       itch1 = mpn_mu_div_qr_itch (qn, dn, mua_k);
214       itch2 = mpn_mu_divappr_q_itch (2 * dn + 1, dn, mua_k);
215       return MAX (itch1, itch2);
216     }
217   else
218     {
219       itch1 = mpn_mu_divappr_q_itch (2 * qn + 2, qn + 1, mua_k);
220       return itch1;
221     }
222 }
223