/* mpn_mu_bdiv_q(qp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^nn,
   storing the result in {qp,nn}.  Overlap allowed between Q and N; all other
   overlap disallowed.
4
5 Contributed to the GNU project by Torbjorn Granlund.
6
7 THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
8 SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
9 GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
10
11 Copyright 2005-2007, 2009, 2010, 2017 Free Software Foundation, Inc.
12
13 This file is part of the GNU MP Library.
14
15 The GNU MP Library is free software; you can redistribute it and/or modify
16 it under the terms of either:
17
18 * the GNU Lesser General Public License as published by the Free
19 Software Foundation; either version 3 of the License, or (at your
20 option) any later version.
21
22 or
23
24 * the GNU General Public License as published by the Free Software
25 Foundation; either version 2 of the License, or (at your option) any
26 later version.
27
28 or both in parallel, as here.
29
30 The GNU MP Library is distributed in the hope that it will be useful, but
31 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
32 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
33 for more details.
34
35 You should have received copies of the GNU General Public License and the
36 GNU Lesser General Public License along with the GNU MP Library. If not,
37 see https://www.gnu.org/licenses/. */
38
39
40 /*
41 The idea of the algorithm used herein is to compute a smaller inverted value
42 than used in the standard Barrett algorithm, and thus save time in the
43 Newton iterations, and pay just a small price when using the inverted value
44 for developing quotient bits. This algorithm was presented at ICMS 2006.
45 */
46
47 #include "gmp-impl.h"
48
49
50 /* N = {np,nn}
51 D = {dp,dn}
52
53 Requirements: N >= D
54 D >= 1
55 D odd
56 dn >= 2
57 nn >= 2
58 scratch space as determined by mpn_mu_bdiv_q_itch(nn,dn).
59
60 Write quotient to Q = {qp,nn}.
61
62 FIXME: When iterating, perhaps do the small step before loop, not after.
63 FIXME: Try to avoid the scalar divisions when computing inverse size.
64 FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible. In
65 particular, when dn==in, tp and rp could use the same space.
66 FIXME: Trim final quotient calculation to qn limbs of precision.
67 */
/* Core block-wise Barrett bdiv: compute {np,nn} / {dp,dn} mod B^nn into
   {qp,nn}, developing the quotient `in' limbs at a time with an in-limb
   inverse of D mod B^in (see the file-top contract and algorithm note).
   `scratch' must hold mpn_mu_bdiv_q_itch (nn, dn) limbs.
   NOTE(review): the exported wrapper negates this routine's output, so the
   public interface appears to use the negated-quotient convention --
   confirm against callers.  */
static void
mpn_mu_bdiv_q_old (mp_ptr qp,
		   mp_srcptr np, mp_size_t nn,
		   mp_srcptr dp, mp_size_t dn,
		   mp_ptr scratch)
{
  mp_size_t qn;
  mp_size_t in;
  int cy, c0;
  mp_size_t tn, wn;

  qn = nn;

  ASSERT (dn >= 2);
  ASSERT (qn >= 2);

  if (qn > dn)
    {
      mp_size_t b;

      /* |_______________________|   dividend
			|________|   divisor  */

#define ip           scratch		/* in */
#define rp           (scratch + in)	/* dn or rest >= binvert_itch(in) */
#define tp           (scratch + in + dn)	/* dn+in or next_size(dn) */
#define scratch_out  (scratch + in + dn + tn) /* mulmod_bnm1_itch(next_size(dn)) */

      /* Compute an inverse size that is a nice partition of the quotient.  */
      b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
      in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */

      /* Some notes on allocation:

	 When in = dn, R dies when mpn_mullo returns, if in < dn the low in
	 limbs of R dies at that point.  We could save memory by letting T live
	 just under R, and let the upper part of T expand into R.  These changes
	 should reduce itch to perhaps 3dn.
       */

      /* Compute I = D^{-1} mod B^in; rp doubles as binvert scratch here,
	 before it takes on its role as partial remainder.  */
      mpn_binvert (ip, dp, in, rp);

      cy = 0;

      /* R <- low dn limbs of N; each iteration consumes `in' more limbs
	 of N and produces `in' quotient limbs.  */
      MPN_COPY (rp, np, dn);
      np += dn;
      mpn_mullo_n (qp, rp, ip, in);	/* first `in' quotient limbs */
      qn -= in;

      while (qn > in)
	{
	  if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	    mpn_mul (tp, dp, dn, qp, in);	/* mulhi, need tp[dn+in-1...in] */
	  else
	    {
	      /* Wrap-around trick: compute D*q mod B^tn-1; the product's
		 top wn limbs wrapped onto the low limbs.  Since the low
		 `in' limbs of D*q must equal the corresponding limbs of R,
		 subtracting R over the wrapped range recovers the true
		 high part in tp[tn...].  */
	      tn = mpn_mulmod_bnm1_next_size (dn);
	      mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	      wn = dn + in - tn;		/* number of wrapped limbs */
	      if (wn > 0)
		{
		  c0 = mpn_sub_n (tp + tn, tp, rp, wn);
		  mpn_decr_u (tp + wn, c0);	/* propagate the borrow */
		}
	    }

	  qp += in;
	  if (dn != in)
	    {
	      /* Subtract tp[dn-1...in] from partial remainder.  */
	      cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
	      if (cy == 2)
		{
		  /* Two accumulated borrows cannot be carried in a single
		     flag; fold one of them into T's high part instead.  */
		  mpn_incr_u (tp + dn, 1);
		  cy = 1;
		}
	    }
	  /* Subtract tp[dn+in-1...dn] from dividend.  */
	  cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
	  np += in;
	  mpn_mullo_n (qp, rp, ip, in);	/* next `in' quotient limbs */
	  qn -= in;
	}

      /* Generate last qn limbs.
	 FIXME: It should be possible to limit precision here, since qn is
	 typically somewhat smaller than dn.  No big gains expected.  */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, in);	/* mulhi, need tp[qn+in-1...in] */
      else
	{
	  /* Same wrap-around correction as in the main loop.  */
	  tn = mpn_mulmod_bnm1_next_size (dn);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	  wn = dn + in - tn;		/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      c0 = mpn_sub_n (tp + tn, tp, rp, wn);
	      mpn_decr_u (tp + wn, c0);
	    }
	}

      qp += in;
      if (dn != in)
	{
	  cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
	  if (cy == 2)
	    {
	      mpn_incr_u (tp + dn, 1);
	      cy = 1;
	    }
	}

      /* Final, possibly partial, remainder update and the last (short)
	 mullo producing the remaining qn quotient limbs.  */
      mpn_sub_nc (rp + dn - in, np, tp + dn, qn - (dn - in), cy);
      mpn_mullo_n (qp, rp, ip, qn);

#undef ip
#undef rp
#undef tp
#undef scratch_out
    }
  else
    {
      /* |_______________________|   dividend
	 |________________|   divisor  */

#define ip           scratch		/* in */
#define tp           (scratch + in)	/* qn+in or next_size(qn) or rest >= binvert_itch(in) */
#define scratch_out  (scratch + in + tn)/* mulmod_bnm1_itch(next_size(qn)) */

      /* Compute half-sized inverse.  */
      in = qn - (qn >> 1);		/* ceil(qn/2) */

      mpn_binvert (ip, dp, in, tp);

      mpn_mullo_n (qp, np, ip, in);		/* low `in' quotient limbs */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, qn, qp, in);		/* mulhigh */
      else
	{
	  /* Wrap-around variant; here the known low part is N itself, so a
	     comparison suffices to detect whether a borrow wrapped.  */
	  tn = mpn_mulmod_bnm1_next_size (qn);
	  mpn_mulmod_bnm1 (tp, tn, dp, qn, qp, in, scratch_out);
	  wn = qn + in - tn;			/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      c0 = mpn_cmp (tp, np, wn) < 0;
	      mpn_decr_u (tp + wn, c0);
	    }
	}

      mpn_sub_n (tp, np + in, tp + in, qn - in);
      mpn_mullo_n (qp + in, tp, ip, qn - in);	/* high qn-in quotient limbs */

#undef ip
#undef tp
#undef scratch_out
    }
}
226
/* Public entry point: run the original bdiv-q routine, then negate the
   nn-limb result in place.  NOTE(review): the negation converts the old
   routine's N/D mod B^nn into -N/D mod B^nn, i.e. the negated-quotient
   sign convention of the newer bdiv interface -- the file-top comment
   predates this and should be confirmed against callers.  */
void
mpn_mu_bdiv_q (mp_ptr qp,
	       mp_srcptr np, mp_size_t nn,
	       mp_srcptr dp, mp_size_t dn,
	       mp_ptr scratch)
{
  mpn_mu_bdiv_q_old (qp, np, nn, dp, dn, scratch);
  mpn_neg (qp, qp, nn);
}
236
237 mp_size_t
mpn_mu_bdiv_q_itch(mp_size_t nn,mp_size_t dn)238 mpn_mu_bdiv_q_itch (mp_size_t nn, mp_size_t dn)
239 {
240 mp_size_t qn, in, tn, itch_binvert, itch_out, itches;
241 mp_size_t b;
242
243 ASSERT_ALWAYS (DC_BDIV_Q_THRESHOLD < MU_BDIV_Q_THRESHOLD);
244
245 qn = nn;
246
247 if (qn > dn)
248 {
249 b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */
250 in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
251 if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
252 {
253 tn = dn + in;
254 itch_out = 0;
255 }
256 else
257 {
258 tn = mpn_mulmod_bnm1_next_size (dn);
259 itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
260 }
261 itches = dn + tn + itch_out;
262 }
263 else
264 {
265 in = qn - (qn >> 1);
266 if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
267 {
268 tn = qn + in;
269 itch_out = 0;
270 }
271 else
272 {
273 tn = mpn_mulmod_bnm1_next_size (qn);
274 itch_out = mpn_mulmod_bnm1_itch (tn, qn, in);
275 }
276 itches = tn + itch_out;
277 }
278
279 itch_binvert = mpn_binvert_itch (in);
280 return in + MAX (itches, itch_binvert);
281 }
282