/* mpn_mu_bdiv_qr(qp,rp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^qn,
   where qn = nn-dn, storing the result in {qp,qn}.  Overlap allowed between Q
   and N; all other overlap disallowed.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 2005-2007, 2009, 2010, 2012 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */
38 
39 
/*
   The idea of the algorithm used herein is to compute a smaller inverted value
   than used in the standard Barrett algorithm, and thus save time in the
   Newton iterations, and pay just a small price when using the inverted value
   for developing quotient bits.  This algorithm was presented at ICMS 2006.
*/

#include "gmp.h"
#include "gmp-impl.h"
49 
50 
/* N = {np,nn}
   D = {dp,dn}

   Requirements: N >= D
		 D >= 1
		 D odd
		 dn >= 2
		 nn >= 2
		 scratch space as determined by mpn_mu_bdiv_qr_itch(nn,dn).

   Write quotient to Q = {qp,nn-dn}.

   FIXME: When iterating, perhaps do the small step before loop, not after.
   FIXME: Try to avoid the scalar divisions when computing inverse size.
   FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible.  In
	  particular, when dn==in, tp and rp could use the same space.
*/
68 mp_limb_t
mpn_mu_bdiv_qr(mp_ptr qp,mp_ptr rp,mp_srcptr np,mp_size_t nn,mp_srcptr dp,mp_size_t dn,mp_ptr scratch)69 mpn_mu_bdiv_qr (mp_ptr qp,
70 		mp_ptr rp,
71 		mp_srcptr np, mp_size_t nn,
72 		mp_srcptr dp, mp_size_t dn,
73 		mp_ptr scratch)
74 {
75   mp_size_t qn;
76   mp_size_t in;
77   mp_limb_t cy, c0;
78   mp_size_t tn, wn;
79 
80   qn = nn - dn;
81 
82   ASSERT (dn >= 2);
83   ASSERT (qn >= 2);
84 
85   if (qn > dn)
86     {
87       mp_size_t b;
88 
89       /* |_______________________|   dividend
90 			|________|   divisor  */
91 
92 #define ip           scratch		/* in */
93 #define tp           (scratch + in)	/* dn+in or next_size(dn) or rest >= binvert_itch(in) */
94 #define scratch_out  (scratch + in + tn)/* mulmod_bnm1_itch(next_size(dn)) */
95 
96       /* Compute an inverse size that is a nice partition of the quotient.  */
97       b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
98       in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
99 
100       /* Some notes on allocation:
101 
102 	 When in = dn, R dies when mpn_mullo returns, if in < dn the low in
103 	 limbs of R dies at that point.  We could save memory by letting T live
104 	 just under R, and let the upper part of T expand into R. These changes
105 	 should reduce itch to perhaps 3dn.
106        */
107 
108       mpn_binvert (ip, dp, in, tp);
109 
110       MPN_COPY (rp, np, dn);
111       np += dn;
112       cy = 0;
113 
114       while (qn > in)
115 	{
116 	  mpn_mullo_n (qp, rp, ip, in);
117 
118 	  if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
119 	    mpn_mul (tp, dp, dn, qp, in);	/* mulhi, need tp[dn+in-1...in] */
120 	  else
121 	    {
122 	      tn = mpn_mulmod_bnm1_next_size (dn);
123 	      mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
124 	      wn = dn + in - tn;		/* number of wrapped limbs */
125 	      if (wn > 0)
126 		{
127 		  c0 = mpn_sub_n (tp + tn, tp, rp, wn);
128 		  mpn_decr_u (tp + wn, c0);
129 		}
130 	    }
131 
132 	  qp += in;
133 	  qn -= in;
134 
135 	  if (dn != in)
136 	    {
137 	      /* Subtract tp[dn-1...in] from partial remainder.  */
138 	      cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
139 	      if (cy == 2)
140 		{
141 		  mpn_incr_u (tp + dn, 1);
142 		  cy = 1;
143 		}
144 	    }
145 	  /* Subtract tp[dn+in-1...dn] from dividend.  */
146 	  cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
147 	  np += in;
148 	}
149 
150       /* Generate last qn limbs.  */
151       mpn_mullo_n (qp, rp, ip, qn);
152 
153       if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
154 	mpn_mul (tp, dp, dn, qp, qn);		/* mulhi, need tp[qn+in-1...in] */
155       else
156 	{
157 	  tn = mpn_mulmod_bnm1_next_size (dn);
158 	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out);
159 	  wn = dn + qn - tn;			/* number of wrapped limbs */
160 	  if (wn > 0)
161 	    {
162 	      c0 = mpn_sub_n (tp + tn, tp, rp, wn);
163 	      mpn_decr_u (tp + wn, c0);
164 	    }
165 	}
166 
167       if (dn != qn)
168 	{
169 	  cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn);
170 	  if (cy == 2)
171 	    {
172 	      mpn_incr_u (tp + dn, 1);
173 	      cy = 1;
174 	    }
175 	}
176       return mpn_sub_nc (rp + dn - qn, np, tp + dn, qn, cy);
177 
178 #undef ip
179 #undef tp
180 #undef scratch_out
181     }
182   else
183     {
184       /* |_______________________|   dividend
185 		|________________|   divisor  */
186 
187 #define ip           scratch		/* in */
188 #define tp           (scratch + in)	/* dn+in or next_size(dn) or rest >= binvert_itch(in) */
189 #define scratch_out  (scratch + in + tn)/* mulmod_bnm1_itch(next_size(dn)) */
190 
191       /* Compute half-sized inverse.  */
192       in = qn - (qn >> 1);
193 
194       mpn_binvert (ip, dp, in, tp);
195 
196       mpn_mullo_n (qp, np, ip, in);		/* low `in' quotient limbs */
197 
198       if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
199 	mpn_mul (tp, dp, dn, qp, in);		/* mulhigh */
200       else
201 	{
202 	  tn = mpn_mulmod_bnm1_next_size (dn);
203 	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
204 	  wn = dn + in - tn;			/* number of wrapped limbs */
205 	  if (wn > 0)
206 	    {
207 	      c0 = mpn_sub_n (tp + tn, tp, np, wn);
208 	      mpn_decr_u (tp + wn, c0);
209 	    }
210 	}
211 
212       qp += in;
213       qn -= in;
214 
215       cy = mpn_sub_n (rp, np + in, tp + in, dn);
216       mpn_mullo_n (qp, rp, ip, qn);		/* high qn quotient limbs */
217 
218       if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
219 	mpn_mul (tp, dp, dn, qp, qn);		/* mulhigh */
220       else
221 	{
222 	  tn = mpn_mulmod_bnm1_next_size (dn);
223 	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out);
224 	  wn = dn + qn - tn;			/* number of wrapped limbs */
225 	  if (wn > 0)
226 	    {
227 	      c0 = mpn_sub_n (tp + tn, tp, rp, wn);
228 	      mpn_decr_u (tp + wn, c0);
229 	    }
230 	}
231 
232       cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn);
233       if (cy == 2)
234 	{
235 	  mpn_incr_u (tp + dn, 1);
236 	  cy = 1;
237 	}
238       return mpn_sub_nc (rp + dn - qn, np + dn + in, tp + dn, qn, cy);
239 
240 #undef ip
241 #undef tp
242 #undef scratch_out
243     }
244 }
245 
246 mp_size_t
mpn_mu_bdiv_qr_itch(mp_size_t nn,mp_size_t dn)247 mpn_mu_bdiv_qr_itch (mp_size_t nn, mp_size_t dn)
248 {
249   mp_size_t qn, in, tn, itch_binvert, itch_out, itches;
250   mp_size_t b;
251 
252   qn = nn - dn;
253 
254   if (qn > dn)
255     {
256       b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
257       in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
258       if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
259 	{
260 	  tn = dn + in;
261 	  itch_out = 0;
262 	}
263       else
264 	{
265 	  tn = mpn_mulmod_bnm1_next_size (dn);
266 	  itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
267 	}
268       itch_binvert = mpn_binvert_itch (in);
269       itches = tn + itch_out;
270       return in + MAX (itches, itch_binvert);
271     }
272   else
273     {
274       in = qn - (qn >> 1);
275       if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
276 	{
277 	  tn = dn + in;
278 	  itch_out = 0;
279 	}
280       else
281 	{
282 	  tn = mpn_mulmod_bnm1_next_size (dn);
283 	  itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
284 	}
285     }
286   itch_binvert = mpn_binvert_itch (in);
287   itches = tn + itch_out;
288   return in + MAX (itches, itch_binvert);
289 }
290