1 /* mpn_mu_bdiv_q(qp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^nn.
2    storing the result in {qp,nn}.  Overlap allowed between Q and N; all other
3    overlap disallowed.
4 
5    Contributed to the GNU project by Torbjorn Granlund.
6 
7    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
8    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
9    GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
10 
11 Copyright 2005-2007, 2009, 2010 Free Software Foundation, Inc.
12 
13 This file is part of the GNU MP Library.
14 
15 The GNU MP Library is free software; you can redistribute it and/or modify
16 it under the terms of either:
17 
18   * the GNU Lesser General Public License as published by the Free
19     Software Foundation; either version 3 of the License, or (at your
20     option) any later version.
21 
22 or
23 
24   * the GNU General Public License as published by the Free Software
25     Foundation; either version 2 of the License, or (at your option) any
26     later version.
27 
28 or both in parallel, as here.
29 
30 The GNU MP Library is distributed in the hope that it will be useful, but
31 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
32 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
33 for more details.
34 
35 You should have received copies of the GNU General Public License and the
36 GNU Lesser General Public License along with the GNU MP Library.  If not,
37 see https://www.gnu.org/licenses/.  */
38 
39 
40 /*
41    The idea of the algorithm used herein is to compute a smaller inverted value
42    than used in the standard Barrett algorithm, and thus save time in the
43    Newton iterations, and pay just a small price when using the inverted value
44    for developing quotient bits.  This algorithm was presented at ICMS 2006.
45 */
46 
47 #include "gmp.h"
48 #include "gmp-impl.h"
49 
50 
51 /* N = {np,nn}
52    D = {dp,dn}
53 
54    Requirements: N >= D
55 		 D >= 1
56 		 D odd
57 		 dn >= 2
58 		 nn >= 2
59 		 scratch space as determined by mpn_mu_bdiv_q_itch(nn,dn).
60 
61    Write quotient to Q = {qp,nn}.
62 
63    FIXME: When iterating, perhaps do the small step before loop, not after.
64    FIXME: Try to avoid the scalar divisions when computing inverse size.
65    FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible.  In
66 	  particular, when dn==in, tp and rp could use the same space.
67    FIXME: Trim final quotient calculation to qn limbs of precision.
68 */
69 void
mpn_mu_bdiv_q(mp_ptr qp,mp_srcptr np,mp_size_t nn,mp_srcptr dp,mp_size_t dn,mp_ptr scratch)70 mpn_mu_bdiv_q (mp_ptr qp,
71 	       mp_srcptr np, mp_size_t nn,
72 	       mp_srcptr dp, mp_size_t dn,
73 	       mp_ptr scratch)
74 {
75   mp_size_t qn;
76   mp_size_t in;
77   int cy, c0;
78   mp_size_t tn, wn;
79 
80   qn = nn;
81 
82   ASSERT (dn >= 2);
83   ASSERT (qn >= 2);
84 
85   if (qn > dn)
86     {
87       mp_size_t b;
88 
89       /* |_______________________|   dividend
90 			|________|   divisor  */
91 
92 #define ip           scratch			/* in */
93 #define rp           (scratch + in)		/* dn or rest >= binvert_itch(in) */
94 #define tp           (scratch + in + dn)	/* dn+in or next_size(dn) */
95 #define scratch_out  (scratch + in + dn + tn)	/* mulmod_bnm1_itch(next_size(dn)) */
96 
97       /* Compute an inverse size that is a nice partition of the quotient.  */
98       b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
99       in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
100 
101       /* Some notes on allocation:
102 
103 	 When in = dn, R dies when mpn_mullo returns, if in < dn the low in
104 	 limbs of R dies at that point.  We could save memory by letting T live
105 	 just under R, and let the upper part of T expand into R. These changes
106 	 should reduce itch to perhaps 3dn.
107        */
108 
109       mpn_binvert (ip, dp, in, rp);
110 
111       cy = 0;
112 
113       MPN_COPY (rp, np, dn);
114       np += dn;
115       mpn_mullo_n (qp, rp, ip, in);
116       qn -= in;
117 
118       while (qn > in)
119 	{
120 	  if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
121 	    mpn_mul (tp, dp, dn, qp, in);	/* mulhi, need tp[dn+in-1...in] */
122 	  else
123 	    {
124 	      tn = mpn_mulmod_bnm1_next_size (dn);
125 	      mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
126 	      wn = dn + in - tn;		/* number of wrapped limbs */
127 	      if (wn > 0)
128 		{
129 		  c0 = mpn_sub_n (tp + tn, tp, rp, wn);
130 		  mpn_decr_u (tp + wn, c0);
131 		}
132 	    }
133 
134 	  qp += in;
135 	  if (dn != in)
136 	    {
137 	      /* Subtract tp[dn-1...in] from partial remainder.  */
138 	      cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
139 	      if (cy == 2)
140 		{
141 		  mpn_incr_u (tp + dn, 1);
142 		  cy = 1;
143 		}
144 	    }
145 	  /* Subtract tp[dn+in-1...dn] from dividend.  */
146 	  cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
147 	  np += in;
148 	  mpn_mullo_n (qp, rp, ip, in);
149 	  qn -= in;
150 	}
151 
152       /* Generate last qn limbs.
153 	 FIXME: It should be possible to limit precision here, since qn is
154 	 typically somewhat smaller than dn.  No big gains expected.  */
155 
156       if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
157 	mpn_mul (tp, dp, dn, qp, in);		/* mulhi, need tp[qn+in-1...in] */
158       else
159 	{
160 	  tn = mpn_mulmod_bnm1_next_size (dn);
161 	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
162 	  wn = dn + in - tn;			/* number of wrapped limbs */
163 	  if (wn > 0)
164 	    {
165 	      c0 = mpn_sub_n (tp + tn, tp, rp, wn);
166 	      mpn_decr_u (tp + wn, c0);
167 	    }
168 	}
169 
170       qp += in;
171       if (dn != in)
172 	{
173 	  cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
174 	  if (cy == 2)
175 	    {
176 	      mpn_incr_u (tp + dn, 1);
177 	      cy = 1;
178 	    }
179 	}
180 
181       mpn_sub_nc (rp + dn - in, np, tp + dn, qn - (dn - in), cy);
182       mpn_mullo_n (qp, rp, ip, qn);
183 
184 #undef ip
185 #undef rp
186 #undef tp
187 #undef scratch_out
188    }
189   else
190     {
191       /* |_______________________|   dividend
192 		|________________|   divisor  */
193 
194 #define ip           scratch		/* in */
195 #define tp           (scratch + in)	/* qn+in or next_size(qn) or rest >= binvert_itch(in) */
196 #define scratch_out  (scratch + in + tn)/* mulmod_bnm1_itch(next_size(qn)) */
197 
198       /* Compute half-sized inverse.  */
199       in = qn - (qn >> 1);
200 
201       mpn_binvert (ip, dp, in, tp);
202 
203       mpn_mullo_n (qp, np, ip, in);		/* low `in' quotient limbs */
204 
205       if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
206 	mpn_mul (tp, dp, qn, qp, in);		/* mulhigh */
207       else
208 	{
209 	  tn = mpn_mulmod_bnm1_next_size (qn);
210 	  mpn_mulmod_bnm1 (tp, tn, dp, qn, qp, in, scratch_out);
211 	  wn = qn + in - tn;			/* number of wrapped limbs */
212 	  if (wn > 0)
213 	    {
214 	      c0 = mpn_cmp (tp, np, wn) < 0;
215 	      mpn_decr_u (tp + wn, c0);
216 	    }
217 	}
218 
219       mpn_sub_n (tp, np + in, tp + in, qn - in);
220       mpn_mullo_n (qp + in, tp, ip, qn - in);	/* high qn-in quotient limbs */
221 
222 #undef ip
223 #undef tp
224 #undef scratch_out
225     }
226 }
227 
228 mp_size_t
mpn_mu_bdiv_q_itch(mp_size_t nn,mp_size_t dn)229 mpn_mu_bdiv_q_itch (mp_size_t nn, mp_size_t dn)
230 {
231   mp_size_t qn, in, tn, itch_binvert, itch_out, itches;
232   mp_size_t b;
233 
234   qn = nn;
235 
236   if (qn > dn)
237     {
238       b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
239       in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
240       if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
241 	{
242 	  tn = dn + in;
243 	  itch_out = 0;
244 	}
245       else
246 	{
247 	  tn = mpn_mulmod_bnm1_next_size (dn);
248 	  itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
249 	}
250       itch_binvert = mpn_binvert_itch (in);
251       itches = dn + tn + itch_out;
252       return in + MAX (itches, itch_binvert);
253     }
254   else
255     {
256       in = qn - (qn >> 1);
257       if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
258 	{
259 	  tn = qn + in;
260 	  itch_out = 0;
261 	}
262       else
263 	{
264 	  tn = mpn_mulmod_bnm1_next_size (qn);
265 	  itch_out = mpn_mulmod_bnm1_itch (tn, qn, in);
266 	}
267       itch_binvert = mpn_binvert_itch (in);
268       itches = tn + itch_out;
269       return in + MAX (itches, itch_binvert);
270     }
271 }
272