/* mpn_mu_bdiv_q(qp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^nn,
   storing the result in {qp,nn}.  Overlap allowed between Q and N; all other
   overlap disallowed.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE INTERFACE.  IT IS
   ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS
   ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP
   RELEASE.

Copyright 2005, 2006, 2007 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


/* We use the "misunderstanding algorithm" (MU), discovered by Paul Zimmermann
   and Torbjorn Granlund when Torbjorn misunderstood Paul's explanation of
   Jebelean's bidirectional exact division algorithm.

   The idea of this algorithm is to compute a smaller inverted value than is
   used in the standard Barrett algorithm, and thus save time in the Newton
   iterations, paying just a small price when using the inverted value to
   develop quotient bits.

   Written by Torbjorn Granlund.  Paul Zimmermann suggested the use of the
   "wrap around" trick.
*/

#include "gmp.h"
#include "gmp-impl.h"


/* N = {np,nn}
   D = {dp,dn}

   Requirements: N >= D
                 D >= 1
                 N mod D = 0
                 D odd
                 dn >= 2
                 nn >= 2
                 scratch space as determined by mpn_mu_bdiv_q_itch(nn,dn).

   Write quotient to Q = {qp,nn}.

   FIXME: When iterating, perhaps do the small step before loop, not after.
   FIXME: Try to avoid the scalar divisions when computing inverse size.
   FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible.  In
          particular, when dn==in, tp and rp could use the same space.
   FIXME: Trim final quotient calculation to qn limbs of precision.
*/
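
/* Illustrative usage sketch (not part of this file): callers are expected to
   supply scratch space sized by mpn_mu_bdiv_q_itch, for instance via the
   gmp-impl.h TMP allocation macros, with qp, np, nn, dp, dn meeting the
   requirements above.

       mp_ptr scratch;
       TMP_DECL;

       TMP_MARK;
       scratch = TMP_ALLOC_LIMBS (mpn_mu_bdiv_q_itch (nn, dn));
       mpn_mu_bdiv_q (qp, np, nn, dp, dn, scratch);
       TMP_FREE;
*/
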
void
mpn_mu_bdiv_q (mp_ptr qp,
               mp_srcptr np, mp_size_t nn,
               mp_srcptr dp, mp_size_t dn,
               mp_ptr scratch)
{
  mp_ptr ip;
  mp_ptr rp;
  mp_size_t qn;
  mp_size_t in;

  qn = nn;

  ASSERT (dn >= 2);
  ASSERT (qn >= 2);

  if (qn > dn)
    {
      mp_size_t b;
      mp_ptr tp;
      mp_limb_t cy;
      int k;
      mp_size_t m, wn;
      mp_size_t i;

      /* |_______________________|   dividend
                        |________|   divisor  */

      /* Compute an inverse size that is a nice partition of the quotient.  */
      b = (qn - 1) / dn + 1;    /* ceil(qn/dn), number of blocks */
      in = (qn - 1) / b + 1;    /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
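
      /* For example (illustrative numbers): qn = 10 and dn = 7 give b = 2
         and in = 5, so the quotient is developed in two 5-limb chunks using
         a 5-limb inverse; by construction in <= dn.  */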

      /* Some notes on allocation:

         When in == dn, R dies when mpn_mullow_n returns; if in < dn, only the
         low in limbs of R die at that point.  We could save memory by letting
         T live just under R and letting the upper part of T expand into R.
         These changes should reduce itch to perhaps 3dn.
       */

      ip = scratch;                     /* in limbs */
      rp = scratch + in;                /* dn limbs */
      tp = scratch + in + dn;           /* dn + in limbs FIXME: mpn_fft_next_size */
      scratch += in;                    /* Roughly 2in+1 limbs */

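      /* Compute I = D^{-1} mod B^in with mpn_binvert; D is odd, so the
         inverse exists, and only the low `in' limbs of D are used.  */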
      mpn_binvert (ip, dp, in, scratch);

      cy = 0;

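      /* The partial remainder R starts out as the low dn dividend limbs;
         the first `in' quotient limbs are R * I mod B^in.  */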
      MPN_COPY (rp, np, dn);
      np += dn;
      mpn_mullow_n (qp, rp, ip, in);
      qn -= in;

      if (ABOVE_THRESHOLD (dn, MUL_FFT_MODF_THRESHOLD))
        {
          k = mpn_fft_best_k (dn, 0);
          m = mpn_fft_next_size (dn, k);
          wn = dn + in - m;                     /* number of wrapped limbs */
          ASSERT_ALWAYS (wn >= 0);              /* could handle this below */
        }

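      /* Main loop: multiply the chunk of quotient limbs just produced by D,
         subtract the relevant part of the product from the partial remainder
         (absorbing the next `in' dividend limbs), then develop the next `in'
         quotient limbs with mpn_mullow_n.  */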
      while (qn > in)
        {
#if WANT_FFT
          if (ABOVE_THRESHOLD (dn, MUL_FFT_MODF_THRESHOLD))
            {
              /* The two multiplicands are dn and 'in' limbs, with dn >= in.
                 The relevant part of the result will typically partially wrap,
                 and that part will come out as subtracted to the right.  The
                 unwrapped part, m-in limbs at the high end of tp, is the lower
                 part of the sought product.  The wrapped part, at the low end
                 of tp, will be subtracted from the low part of the partial
                 remainder; we undo that operation with another subtraction. */
              int c0;

              mpn_mul_fft (tp, m, dp, dn, qp, in, k);

              c0 = mpn_sub_n (tp + m, rp, tp, wn);

              for (i = wn; c0 != 0 && i < in; i++)
                c0 = tp[i] == GMP_NUMB_MASK;
              mpn_incr_u (tp + in, c0);
            }
          else
#endif
            mpn_mul (tp, dp, dn, qp, in);       /* mulhi, need tp[dn+in-1...in] */
          qp += in;
          if (dn != in)
            {
              /* Subtract tp[dn-1...in] from partial remainder.  */
              cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
              if (cy == 2)
                {
                  mpn_incr_u (tp + dn, 1);
                  cy = 1;
                }
            }
          /* Subtract tp[dn+in-1...dn] from dividend.  */
          cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
          np += in;
          mpn_mullow_n (qp, rp, ip, in);
          qn -= in;
        }

      /* Generate last qn limbs.
         FIXME: It should be possible to limit precision here, since qn is
         typically somewhat smaller than dn.  No big gains expected.  */
#if WANT_FFT
      if (ABOVE_THRESHOLD (dn, MUL_FFT_MODF_THRESHOLD))
        {
          int c0;

          mpn_mul_fft (tp, m, dp, dn, qp, in, k);

          c0 = mpn_sub_n (tp + m, rp, tp, wn);

          for (i = wn; c0 != 0 && i < in; i++)
            c0 = tp[i] == GMP_NUMB_MASK;
          mpn_incr_u (tp + in, c0);
        }
      else
#endif
        mpn_mul (tp, dp, dn, qp, in);           /* mulhi, need tp[qn+in-1...in] */
      qp += in;
      if (dn != in)
        {
          cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
          if (cy == 2)
            {
              mpn_incr_u (tp + dn, 1);
              cy = 1;
            }
        }

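      /* Fold the remaining dividend limbs into the partial remainder, then
         produce the final qn quotient limbs.  */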
      mpn_sub_nc (rp + dn - in, np, tp + dn, qn - (dn - in), cy);
      mpn_mullow_n (qp, rp, ip, qn);
    }
  else
    {
      /* |_______________________|   dividend
                |________________|   divisor  */

      /* Compute half-sized inverse.  */
      in = qn - (qn >> 1);
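      /* in = ceil(qn/2).  For example (illustrative): qn = 9 gives in = 5,
         so 5 low quotient limbs come from one mullow against the inverse and
         the remaining 4 from a second, shorter mullow on the reduced
         remainder.  */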

      ip = scratch;                     /* ceil(qn/2) limbs */
      rp = scratch + in;                /* ceil(qn/2)+qn limbs */
      scratch += in;                    /* 2*ceil(qn/2)+2 */

      mpn_binvert (ip, dp, in, scratch);

      mpn_mullow_n (qp, np, ip, in);            /* low `in' quotient limbs */
#if WANT_FFT
      if (ABOVE_THRESHOLD (qn, MUL_FFT_MODF_THRESHOLD))
        {
          int k;
          mp_size_t m;

          k = mpn_fft_best_k (qn, 0);
          m = mpn_fft_next_size (qn, k);
          mpn_mul_fft (rp, m, dp, qn, qp, in, k);
          if (mpn_cmp (np, rp, in) < 0)
            mpn_incr_u (rp + in, 1);
        }
      else
#endif
        mpn_mul (rp, dp, qn, qp, in);           /* mulhigh */

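      /* rp[in..qn-1] now holds the corresponding part of D * Qlow; subtracting
         it from the dividend limbs np[in..qn-1] leaves the reduced remainder,
         and the high qn-in quotient limbs follow as another mullow against
         the inverse.  */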
      mpn_sub_n (rp, np + in, rp + in, qn - in);
      mpn_mullow_n (qp + in, rp, ip, qn - in);  /* high qn-in quotient limbs */
    }
}

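/* Return the number of scratch limbs mpn_mu_bdiv_q needs for an nn-limb
   dividend and a dn-limb divisor; the FIXME notes below indicate the bounds
   still need adjustment for the FFT wrap-around path (mpn_fft_next_size).  */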
mp_size_t
mpn_mu_bdiv_q_itch (mp_size_t nn, mp_size_t dn)
{
  mp_size_t qn;

  qn = nn;

  if (qn > dn)
    {
      return 4 * dn;            /* FIXME FIXME FIXME need mpn_fft_next_size */
    }
  else
    {
      return 2 * qn + 1 + 2;    /* FIXME FIXME FIXME need mpn_fft_next_size */
    }
}