/* mpn_mu_bdiv_q(qp,np,nn,dp,dn,scratch) -- Compute {np,nn} / {dp,dn} mod B^nn,
   storing the result in {qp,nn}.  Overlap allowed between Q and N; all other
   overlap disallowed.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 2005, 2006, 2007, 2009, 2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


/*
   The idea of the algorithm used herein is to compute a smaller inverted
   value than is used in the standard Barrett algorithm, and thus save time in
   the Newton iterations, paying only a small price when using the inverted
   value for developing quotient bits.  This algorithm was presented at ICMS
   2006.
*/
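
/* Note that this is Hensel (2-adic) division: D must be odd and is therefore
   invertible mod B^nn, and the quotient Q written to {qp,nn} satisfies
   Q*D == N (mod B^nn), i.e. Q = N * D^{-1} mod B^nn.  */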

#include "gmp.h"
#include "gmp-impl.h"


/* N = {np,nn}
   D = {dp,dn}

   Requirements: N >= D
		 D >= 1
		 D odd
		 dn >= 2
		 nn >= 2
		 scratch space as determined by mpn_mu_bdiv_q_itch(nn,dn).

   Write quotient to Q = {qp,nn}.

   FIXME: When iterating, perhaps do the small step before the loop, not after.
   FIXME: Try to avoid the scalar divisions when computing inverse size.
   FIXME: Trim allocation for the (qn > dn) case; 3*dn might be possible.  In
	  particular, when dn==in, tp and rp could use the same space.
   FIXME: Trim final quotient calculation to qn limbs of precision.
*/
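
/* A minimal usage sketch, illustrative only and therefore not compiled: it
   assumes the caller has already checked the requirements above, and it uses
   plain malloc for the scratch area where callers inside GMP would use
   TMP_ALLOC.  The function name is hypothetical.  */
#if 0
#include <stdlib.h>

static void
example_mu_bdiv_q (mp_ptr qp, mp_srcptr np, mp_size_t nn,
		   mp_srcptr dp, mp_size_t dn)
{
  /* Requires N >= D (hence nn >= dn), dn >= 2, nn >= 2, and dp[0] odd.  */
  mp_size_t itch = mpn_mu_bdiv_q_itch (nn, dn);
  mp_ptr scratch = malloc (itch * sizeof (mp_limb_t));
  mpn_mu_bdiv_q (qp, np, nn, dp, dn, scratch);	/* Q = N/D mod B^nn */
  free (scratch);
}
#endif
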
void
mpn_mu_bdiv_q (mp_ptr qp,
	       mp_srcptr np, mp_size_t nn,
	       mp_srcptr dp, mp_size_t dn,
	       mp_ptr scratch)
{
  mp_size_t qn;
  mp_size_t in;
  int cy, c0;
  mp_size_t tn, wn;

  qn = nn;

  ASSERT (dn >= 2);
  ASSERT (qn >= 2);

  if (qn > dn)
    {
      mp_size_t b;

      /* |_______________________|   dividend
			|________|   divisor  */

#define ip           scratch			/* in */
#define rp           (scratch + in)		/* dn or rest >= binvert_itch(in) */
#define tp           (scratch + in + dn)	/* dn+in or next_size(dn) */
#define scratch_out  (scratch + in + dn + tn)	/* mulmod_bnm1_itch(next_size(dn)) */

      /* Compute an inverse size that is a nice partition of the quotient.  */
      b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
      in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
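      /* Example: nn = qn = 10, dn = 4 gives b = ceil(10/4) = 3 blocks and
	 in = ceil(10/3) = 4; the quotient is then developed in blocks of at
	 most in limbs, with in <= dn by construction.  */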

      /* Some notes on allocation:

	 When in = dn, R dies when mpn_mullo returns; if in < dn, the low in
	 limbs of R die at that point.  We could save memory by letting T live
	 just under R and letting the upper part of T expand into R.  These
	 changes should reduce the itch to perhaps 3*dn.
       */

      mpn_binvert (ip, dp, in, rp);

      cy = 0;

      MPN_COPY (rp, np, dn);
      np += dn;
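      /* Lowest in quotient limbs: Q = R * I mod B^in, I = 1/D mod B^in.  */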
      mpn_mullo_n (qp, rp, ip, in);
      qn -= in;

      while (qn > in)
	{
	  if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	    mpn_mul (tp, dp, dn, qp, in);	/* mulhi, need tp[dn+in-1...in] */
	  else
	    {
	      tn = mpn_mulmod_bnm1_next_size (dn);
	      mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	      wn = dn + in - tn;		/* number of wrapped limbs */
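	      /* The mulmod_bnm1 call computed D * Q mod (B^tn-1), folding
		 the wn topmost product limbs into the low limbs.  The low
		 in limbs of the true product equal {rp,in} (since Q = R*I
		 mod B^in and I = 1/D mod B^in), and wn <= in, so subtracting
		 {rp,wn} recovers the folded limbs at tp+tn, with mpn_decr_u
		 propagating the borrow.  */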
	      if (wn > 0)
		{
		  c0 = mpn_sub_n (tp + tn, tp, rp, wn);
		  mpn_decr_u (tp + wn, c0);
		}
	    }

	  qp += in;
	  if (dn != in)
	    {
	      /* Subtract tp[dn-1...in] from partial remainder.  */
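	      /* cy holds the borrow from the previous iteration; together
		 with the borrow from this subtraction it can reach 2, which
		 mpn_sub_nc below cannot accept, so convert one unit into an
		 increment of tp[dn], where it gets subtracted back out.  */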
	      cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
	      if (cy == 2)
		{
		  mpn_incr_u (tp + dn, 1);
		  cy = 1;
		}
	    }
	  /* Subtract tp[dn+in-1...dn] from dividend.  */
	  cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
	  np += in;
	  mpn_mullo_n (qp, rp, ip, in);
	  qn -= in;
	}

      /* Generate last qn limbs.
	 FIXME: It should be possible to limit precision here, since qn is
	 typically somewhat smaller than dn.  No big gains expected.  */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, in);		/* mulhi, need tp[qn+in-1...in] */
      else
	{
	  tn = mpn_mulmod_bnm1_next_size (dn);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	  wn = dn + in - tn;			/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      c0 = mpn_sub_n (tp + tn, tp, rp, wn);
	      mpn_decr_u (tp + wn, c0);
	    }
	}

      qp += in;
      if (dn != in)
	{
	  cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
	  if (cy == 2)
	    {
	      mpn_incr_u (tp + dn, 1);
	      cy = 1;
	    }
	}

      mpn_sub_nc (rp + dn - in, np, tp + dn, qn - (dn - in), cy);
      mpn_mullo_n (qp, rp, ip, qn);

#undef ip
#undef rp
#undef tp
#undef scratch_out
    }
  else
    {
      /* |_______________________|   dividend
		|________________|   divisor  */

#define ip           scratch		/* in */
#define tp           (scratch + in)	/* qn+in or next_size(qn) or rest >= binvert_itch(in) */
#define scratch_out  (scratch + in + tn)	/* mulmod_bnm1_itch(next_size(qn)) */

      /* Compute half-sized inverse.  */
      in = qn - (qn >> 1);
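      /* in = ceil(qn/2); e.g., qn = 9 gives in = 5.  The low in quotient
	 limbs come from a mullo with this half-sized inverse, the high
	 qn-in (<= in) limbs from a single correction step below.  */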

      mpn_binvert (ip, dp, in, tp);

      mpn_mullo_n (qp, np, ip, in);		/* low `in' quotient limbs */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, qn, qp, in);		/* mulhigh */
      else
	{
	  tn = mpn_mulmod_bnm1_next_size (qn);
	  mpn_mulmod_bnm1 (tp, tn, dp, qn, qp, in, scratch_out);
	  wn = qn + in - tn;			/* number of wrapped limbs */
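	  /* Here the low wn limbs of the true product D*Q equal {np,wn},
	     so the comparison tells whether the fold generated a carry
	     into limb wn; if it did, take it back out.  */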
	  if (wn > 0)
	    {
	      c0 = mpn_cmp (tp, np, wn) < 0;
	      mpn_decr_u (tp + wn, c0);
	    }
	}

      mpn_sub_n (tp, np + in, tp + in, qn - in);
      mpn_mullo_n (qp + in, tp, ip, qn - in);	/* high qn-in quotient limbs */

#undef ip
#undef tp
#undef scratch_out
    }
}

mp_size_t
mpn_mu_bdiv_q_itch (mp_size_t nn, mp_size_t dn)
{
  mp_size_t qn, in, tn, itch_binvert, itch_out, itches;
  mp_size_t b;

  qn = nn;

  if (qn > dn)
    {
      b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
      in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
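      /* Keep this partition in sync with the one computed in mpn_mu_bdiv_q
	 above; the scratch layout ip | rp | tp | scratch_out (or ip followed
	 by the binvert scratch) must match.  */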
      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	{
	  tn = dn + in;
	  itch_out = 0;
	}
      else
	{
	  tn = mpn_mulmod_bnm1_next_size (dn);
	  itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
	}
      itch_binvert = mpn_binvert_itch (in);
      itches = dn + tn + itch_out;
      return in + MAX (itches, itch_binvert);
    }
  else
    {
      in = qn - (qn >> 1);
      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	{
	  tn = qn + in;
	  itch_out = 0;
	}
      else
	{
	  tn = mpn_mulmod_bnm1_next_size (qn);
	  itch_out = mpn_mulmod_bnm1_itch (tn, qn, in);
	}
      itch_binvert = mpn_binvert_itch (in);
      itches = tn + itch_out;
      return in + MAX (itches, itch_binvert);
    }
}