1 /* mpn_mu_bdiv_qr(qp,rp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^qn,
2 where qn = nn-dn, storing the result in {qp,qn}. Overlap allowed between Q
3 and N; all other overlap disallowed.
4
5 Contributed to the GNU project by Torbjorn Granlund.
6
7 THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
8 SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
9 GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
10
11 Copyright 2005-2007, 2009, 2010, 2012 Free Software Foundation, Inc.
12
13 This file is part of the GNU MP Library.
14
15 The GNU MP Library is free software; you can redistribute it and/or modify
16 it under the terms of either:
17
18 * the GNU Lesser General Public License as published by the Free
19 Software Foundation; either version 3 of the License, or (at your
20 option) any later version.
21
22 or
23
24 * the GNU General Public License as published by the Free Software
25 Foundation; either version 2 of the License, or (at your option) any
26 later version.
27
28 or both in parallel, as here.
29
30 The GNU MP Library is distributed in the hope that it will be useful, but
31 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
32 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
33 for more details.
34
35 You should have received copies of the GNU General Public License and the
36 GNU Lesser General Public License along with the GNU MP Library. If not,
37 see https://www.gnu.org/licenses/. */
38
39
40 /*
41 The idea of the algorithm used herein is to compute a smaller inverted value
42 than used in the standard Barrett algorithm, and thus save time in the
43 Newton iterations, and pay just a small price when using the inverted value
44 for developing quotient bits. This algorithm was presented at ICMS 2006.
45 */
46
47 #include "gmp.h"
48 #include "gmp-impl.h"
49
50
51 /* N = {np,nn}
52 D = {dp,dn}
53
54 Requirements: N >= D
55 D >= 1
56 D odd
57 dn >= 2
58 nn >= 2
59 scratch space as determined by mpn_mu_bdiv_qr_itch(nn,dn).
60
61 Write quotient to Q = {qp,nn-dn}.
62
63 FIXME: When iterating, perhaps do the small step before loop, not after.
64 FIXME: Try to avoid the scalar divisions when computing inverse size.
65 FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible. In
66 particular, when dn==in, tp and rp could use the same space.
67 */
/* Compute {qp,qn} = {np,nn} / {dp,dn} mod B^qn (qn = nn - dn), leaving the
   high part of the partial remainder in {rp,dn}.  Returns the borrow out of
   the final remainder subtraction.  Scratch layout is described by the
   ip/tp/scratch_out macros in each branch.  */
mp_limb_t
mpn_mu_bdiv_qr (mp_ptr qp,
		mp_ptr rp,
		mp_srcptr np, mp_size_t nn,
		mp_srcptr dp, mp_size_t dn,
		mp_ptr scratch)
{
  mp_size_t qn;			/* quotient size, nn - dn */
  mp_size_t in;			/* size of the (partial) inverse of D */
  mp_limb_t cy, c0;
  mp_size_t tn, wn;

  qn = nn - dn;

  ASSERT (dn >= 2);
  ASSERT (qn >= 2);

  if (qn > dn)
    {
      mp_size_t b;

      /* |_______________________|   dividend
			|________|   divisor  */

#define ip           scratch		/* in */
#define tp           (scratch + in)	/* dn+in or next_size(dn) or rest >= binvert_itch(in) */
#define scratch_out  (scratch + in + tn)/* mulmod_bnm1_itch(next_size(dn)) */

      /* Compute an inverse size that is a nice partition of the quotient.  */
      b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
      in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */

      /* Some notes on allocation:

	 When in = dn, R dies when mpn_mullo returns, if in < dn the low in
	 limbs of R dies at that point.  We could save memory by letting T live
	 just under R, and let the upper part of T expand into R. These changes
	 should reduce itch to perhaps 3dn.
       */

      /* Invert the low `in' limbs of D mod B^in; tp doubles as binvert
	 scratch here (the itch function reserves enough space for it).  */
      mpn_binvert (ip, dp, in, tp);

      MPN_COPY (rp, np, dn);
      np += dn;
      cy = 0;

      /* Develop the quotient `in' limbs at a time, updating the partial
	 remainder window {rp,dn} after each block.  */
      while (qn > in)
	{
	  /* Next `in' quotient limbs: Q_i = R * D^{-1} mod B^in.  */
	  mpn_mullo_n (qp, rp, ip, in);

	  if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	    mpn_mul (tp, dp, dn, qp, in);	/* mulhi, need tp[dn+in-1...in] */
	  else
	    {
	      /* Compute D * Q_i mod B^tn-1; limbs that wrapped around must
		 be compensated by subtracting the known low product part
		 (equal to the low limbs of R, since Q_i*D == R mod B^in).  */
	      tn = mpn_mulmod_bnm1_next_size (dn);
	      mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	      wn = dn + in - tn;		/* number of wrapped limbs */
	      if (wn > 0)
		{
		  c0 = mpn_sub_n (tp + tn, tp, rp, wn);
		  mpn_decr_u (tp + wn, c0);
		}
	    }

	  qp += in;
	  qn -= in;

	  if (dn != in)
	    {
	      /* Subtract tp[dn-1...in] from partial remainder.  */
	      cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
	      if (cy == 2)
		{
		  /* Two accumulated borrows; fold one into tp so a single
		     borrow can be propagated by the mpn_sub_nc below.  */
		  mpn_incr_u (tp + dn, 1);
		  cy = 1;
		}
	    }
	  /* Subtract tp[dn+in-1...dn] from dividend.  */
	  cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
	  np += in;
	}

      /* Generate last qn limbs.  */
      mpn_mullo_n (qp, rp, ip, qn);

      if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, qn);		/* mulhi, need tp[qn+in-1...in] */
      else
	{
	  tn = mpn_mulmod_bnm1_next_size (dn);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out);
	  wn = dn + qn - tn;		/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      c0 = mpn_sub_n (tp + tn, tp, rp, wn);
	      mpn_decr_u (tp + wn, c0);
	    }
	}

      if (dn != qn)
	{
	  cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn);
	  if (cy == 2)
	    {
	      /* Same borrow folding as in the loop above.  */
	      mpn_incr_u (tp + dn, 1);
	      cy = 1;
	    }
	}
      /* Final subtraction; the returned borrow is the caller's carry.  */
      return mpn_sub_nc (rp + dn - qn, np, tp + dn, qn, cy);

#undef ip
#undef tp
#undef scratch_out
    }
  else
    {
      /* |_______________________|   dividend
		 |________________|   divisor  */

#define ip           scratch		/* in */
#define tp           (scratch + in)	/* dn+in or next_size(dn) or rest >= binvert_itch(in) */
#define scratch_out  (scratch + in + tn)/* mulmod_bnm1_itch(next_size(dn)) */

      /* Compute half-sized inverse.  */
      in = qn - (qn >> 1);

      mpn_binvert (ip, dp, in, tp);

      mpn_mullo_n (qp, np, ip, in);		/* low `in' quotient limbs */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, in);		/* mulhigh */
      else
	{
	  /* Wrap-around correction, as in the qn > dn branch; here the low
	     product limbs equal the low limbs of N.  */
	  tn = mpn_mulmod_bnm1_next_size (dn);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	  wn = dn + in - tn;		/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      c0 = mpn_sub_n (tp + tn, tp, np, wn);
	      mpn_decr_u (tp + wn, c0);
	    }
	}

      qp += in;
      qn -= in;

      /* Partial remainder after the low `in' quotient limbs.  */
      cy = mpn_sub_n (rp, np + in, tp + in, dn);
      mpn_mullo_n (qp, rp, ip, qn);		/* high qn quotient limbs */

      if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, qn);		/* mulhigh */
      else
	{
	  tn = mpn_mulmod_bnm1_next_size (dn);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out);
	  wn = dn + qn - tn;		/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      c0 = mpn_sub_n (tp + tn, tp, rp, wn);
	      mpn_decr_u (tp + wn, c0);
	    }
	}

      cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn);
      if (cy == 2)
	{
	  /* Fold the second borrow into tp, as in the other branch.  */
	  mpn_incr_u (tp + dn, 1);
	  cy = 1;
	}
      /* Final subtraction; the returned borrow is the caller's carry.  */
      return mpn_sub_nc (rp + dn - qn, np + dn + in, tp + dn, qn, cy);

#undef ip
#undef tp
#undef scratch_out
    }
}
245
246 mp_size_t
mpn_mu_bdiv_qr_itch(mp_size_t nn,mp_size_t dn)247 mpn_mu_bdiv_qr_itch (mp_size_t nn, mp_size_t dn)
248 {
249 mp_size_t qn, in, tn, itch_binvert, itch_out, itches;
250 mp_size_t b;
251
252 qn = nn - dn;
253
254 if (qn > dn)
255 {
256 b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */
257 in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
258 if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
259 {
260 tn = dn + in;
261 itch_out = 0;
262 }
263 else
264 {
265 tn = mpn_mulmod_bnm1_next_size (dn);
266 itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
267 }
268 itch_binvert = mpn_binvert_itch (in);
269 itches = tn + itch_out;
270 return in + MAX (itches, itch_binvert);
271 }
272 else
273 {
274 in = qn - (qn >> 1);
275 if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
276 {
277 tn = dn + in;
278 itch_out = 0;
279 }
280 else
281 {
282 tn = mpn_mulmod_bnm1_next_size (dn);
283 itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
284 }
285 }
286 itch_binvert = mpn_binvert_itch (in);
287 itches = tn + itch_out;
288 return in + MAX (itches, itch_binvert);
289 }
290