/* mpn_mu_bdiv_q(qp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^nn,
   storing the result in {qp,nn}.  Overlap allowed between Q and N; all other
   overlap disallowed.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 2005-2007, 2009, 2010, 2017 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


/*
   The idea of the algorithm used herein is to compute a smaller inverted value
   than used in the standard Barrett algorithm, and thus save time in the
   Newton iterations, and pay just a small price when using the inverted value
   for developing quotient bits.  This algorithm was presented at ICMS 2006.
*/
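
/* Background: for odd D the 2-adic (Hensel) quotient Q = N * D^{-1} mod B^nn
   always exists, since an odd number is invertible modulo any power of
   B = 2^GMP_NUMB_BITS.  When D divides N exactly, Q coincides with the
   ordinary quotient N/D, which is what makes the bdiv routines suitable for
   exact division.  A small worked instance: with B = 256, nn = 1, N = 42,
   D = 7, we have D^{-1} = 183 mod 256 (7 * 183 = 1281 = 5 * 256 + 1), and
   Q = 42 * 183 mod 256 = 6 = N/D.  */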

#include "gmp-impl.h"


/* N = {np,nn}
   D = {dp,dn}

   Requirements: N >= D
		 D >= 1
		 D odd
		 dn >= 2
		 nn >= 2
		 scratch space as determined by mpn_mu_bdiv_q_itch(nn,dn).

   Write quotient to Q = {qp,nn}.

   FIXME: When iterating, perhaps do the small step before loop, not after.
   FIXME: Try to avoid the scalar divisions when computing inverse size.
   FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible.  In
	  particular, when dn==in, tp and rp could use the same space.
   FIXME: Trim final quotient calculation to qn limbs of precision.
*/
static void
mpn_mu_bdiv_q_old (mp_ptr qp,
	       mp_srcptr np, mp_size_t nn,
	       mp_srcptr dp, mp_size_t dn,
	       mp_ptr scratch)
{
  mp_size_t qn;
  mp_size_t in;
  int cy, c0;
  mp_size_t tn, wn;

  qn = nn;

  ASSERT (dn >= 2);
  ASSERT (qn >= 2);

  if (qn > dn)
    {
      mp_size_t b;

      /* |_______________________|   dividend
			|________|   divisor  */

#define ip           scratch			/* in */
#define rp           (scratch + in)		/* dn or rest >= binvert_itch(in) */
#define tp           (scratch + in + dn)	/* dn+in or next_size(dn) */
#define scratch_out  (scratch + in + dn + tn)	/* mulmod_bnm1_itch(next_size(dn)) */

      /* Compute an inverse size that is a nice partition of the quotient.  */
      b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
      in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
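      /* Worked example: qn = 10, dn = 4 gives b = ceil(10/4) = 3 blocks and
	 in = ceil(10/3) = 4, so the ten quotient limbs are developed in
	 chunks of at most in = 4 limbs, with in <= dn as required below.  */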

      /* Some notes on allocation:

	 When in = dn, R dies when mpn_mullo returns; if in < dn, the low in
	 limbs of R die at that point.  We could save memory by letting T live
	 just under R, and letting the upper part of T expand into R.  These
	 changes should reduce itch to perhaps 3*dn.
       */

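      /* Compute the in-limb 2-adic inverse of D, ip = D^{-1} mod B^in; rp is
	 free at this point and serves as the scratch area mpn_binvert needs.  */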
      mpn_binvert (ip, dp, in, rp);

      cy = 0;

      MPN_COPY (rp, np, dn);
      np += dn;
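      /* Develop the first in quotient limbs: Q0 = R * D^{-1} mod B^in.  */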
      mpn_mullo_n (qp, rp, ip, in);
      qn -= in;

      while (qn > in)
	{
	  if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	    mpn_mul (tp, dp, dn, qp, in);	/* mulhi, need tp[dn+in-1...in] */
	  else
	    {
	      tn = mpn_mulmod_bnm1_next_size (dn);
	      mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	      wn = dn + in - tn;		/* number of wrapped limbs */
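	      /* mpn_mulmod_bnm1 computed D*Q0 mod (B^tn-1), so the high wn
		 limbs of the dn+in limb product wrapped around into the low
		 limbs.  Since D*Q0 == R (mod B^in) by construction, and
		 wn <= in, the true low limbs are those of rp; recover the
		 high limbs by subtracting.  */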
	      if (wn > 0)
		{
		  c0 = mpn_sub_n (tp + tn, tp, rp, wn);
		  mpn_decr_u (tp + wn, c0);
		}
	    }

	  qp += in;
	  if (dn != in)
	    {
	      /* Subtract tp[dn-1...in] from partial remainder.  */
	      cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
	      if (cy == 2)
		{
		  mpn_incr_u (tp + dn, 1);
		  cy = 1;
		}
	    }
	  /* Subtract tp[dn+in-1...dn] from dividend.  */
	  cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
	  np += in;
	  mpn_mullo_n (qp, rp, ip, in);
	  qn -= in;
	}

      /* Generate last qn limbs.
	 FIXME: It should be possible to limit precision here, since qn is
	 typically somewhat smaller than dn.  No big gains expected.  */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, in);		/* mulhi, need tp[qn+in-1...in] */
      else
	{
	  tn = mpn_mulmod_bnm1_next_size (dn);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	  wn = dn + in - tn;			/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      c0 = mpn_sub_n (tp + tn, tp, rp, wn);
	      mpn_decr_u (tp + wn, c0);
	    }
	}

      qp += in;
      if (dn != in)
	{
	  cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
	  if (cy == 2)
	    {
	      mpn_incr_u (tp + dn, 1);
	      cy = 1;
	    }
	}

      mpn_sub_nc (rp + dn - in, np, tp + dn, qn - (dn - in), cy);
      mpn_mullo_n (qp, rp, ip, qn);

#undef ip
#undef rp
#undef tp
#undef scratch_out
    }
  else
    {
      /* |_______________________|   dividend
		|________________|   divisor  */

#define ip           scratch		/* in */
#define tp           (scratch + in)	/* qn+in or next_size(qn) or rest >= binvert_itch(in) */
#define scratch_out  (scratch + in + tn)/* mulmod_bnm1_itch(next_size(qn)) */

      /* Compute half-sized inverse.  */
      in = qn - (qn >> 1);
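      /* in = ceil(qn/2), so one mullo of size in and one of size qn - in
	 below together produce all qn quotient limbs.  */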

      mpn_binvert (ip, dp, in, tp);

      mpn_mullo_n (qp, np, ip, in);		/* low `in' quotient limbs */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, qn, qp, in);		/* mulhigh */
      else
	{
	  tn = mpn_mulmod_bnm1_next_size (qn);
	  mpn_mulmod_bnm1 (tp, tn, dp, qn, qp, in, scratch_out);
	  wn = qn + in - tn;			/* number of wrapped limbs */
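	  /* Wrap-around fixup, analogous to the loop above: here the true
	     low limbs of D*Q0 are those of N (D*Q0 == N mod B^in), and only
	     tp[in..qn-1] is used afterwards, so it suffices to detect a
	     carry out of the wrapped low part (tp < np over wn limbs) and
	     cancel its propagation into limb wn.  */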
	  if (wn > 0)
	    {
	      c0 = mpn_cmp (tp, np, wn) < 0;
	      mpn_decr_u (tp + wn, c0);
	    }
	}

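      /* The high part of N minus the high part of D*Q0 is the reduced
	 numerator; one more mullo by the inverse develops the remaining
	 qn - in quotient limbs.  */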
      mpn_sub_n (tp, np + in, tp + in, qn - in);
      mpn_mullo_n (qp + in, tp, ip, qn - in);	/* high qn-in quotient limbs */

#undef ip
#undef tp
#undef scratch_out
    }
}

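/* Note: the surrounding bdiv internals were reworked to return the negated
   quotient, -N/D mod B^nn; presumably this wrapper adapts the older code
   above, which computes N/D, to that convention by negating its result.  */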
void
mpn_mu_bdiv_q (mp_ptr qp,
	       mp_srcptr np, mp_size_t nn,
	       mp_srcptr dp, mp_size_t dn,
	       mp_ptr scratch)
{
  mpn_mu_bdiv_q_old (qp, np, nn, dp, dn, scratch);
  mpn_neg (qp, qp, nn);
}
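
/* Usage sketch (illustrative, not part of the library): scratch is sized
   with mpn_mu_bdiv_q_itch and is typically obtained through the gmp-impl.h
   TMP allocation macros, roughly as follows.

     mp_size_t itch = mpn_mu_bdiv_q_itch (nn, dn);
     mp_ptr scratch;
     TMP_DECL;

     TMP_MARK;
     scratch = TMP_ALLOC_LIMBS (itch);
     mpn_mu_bdiv_q (qp, np, nn, dp, dn, scratch);
     TMP_FREE;
*/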

mp_size_t
mpn_mu_bdiv_q_itch (mp_size_t nn, mp_size_t dn)
{
  mp_size_t qn, in, tn, itch_binvert, itch_out, itches;
  mp_size_t b;

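  /* Sanity check on the tuning: the mu variant is only reached for operands
     above the dc threshold, an assumption the size computations below appear
     to rely on.  */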
  ASSERT_ALWAYS (DC_BDIV_Q_THRESHOLD < MU_BDIV_Q_THRESHOLD);

  qn = nn;

  if (qn > dn)
    {
      b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
      in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	{
	  tn = dn + in;
	  itch_out = 0;
	}
      else
	{
	  tn = mpn_mulmod_bnm1_next_size (dn);
	  itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
	}
      itches = dn + tn + itch_out;
    }
  else
    {
      in = qn - (qn >> 1);
      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	{
	  tn = qn + in;
	  itch_out = 0;
	}
      else
	{
	  tn = mpn_mulmod_bnm1_next_size (qn);
	  itch_out = mpn_mulmod_bnm1_itch (tn, qn, in);
	}
      itches = tn + itch_out;
    }

  itch_binvert = mpn_binvert_itch (in);
  return in + MAX (itches, itch_binvert);
}