1 /* mpn_mul -- Multiply two natural numbers.
2
3 Contributed to the GNU project by Torbjorn Granlund.
4
5 Copyright 1991, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002, 2003, 2005,
6 2006, 2007, 2009, 2010 Free Software Foundation, Inc.
7
8 This file is part of the GNU MP Library.
9
10 The GNU MP Library is free software; you can redistribute it and/or modify
11 it under the terms of the GNU Lesser General Public License as published by
12 the Free Software Foundation; either version 3 of the License, or (at your
13 option) any later version.
14
15 The GNU MP Library is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
17 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
18 License for more details.
19
20 You should have received a copy of the GNU Lesser General Public License
21 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
22
23 #include "gmp.h"
24 #include "gmp-impl.h"
25
26
27 #ifndef MUL_BASECASE_MAX_UN
28 #define MUL_BASECASE_MAX_UN 500
29 #endif
30
31 #define TOOM33_OK(an,bn) (6 + 2 * an < 3 * bn)
32 #define TOOM44_OK(an,bn) (12 + 3 * an < 4 * bn)
33
34 /* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v
35 (pointed to by VP, with VN limbs), and store the result at PRODP. The
36 result is UN + VN limbs. Return the most significant limb of the result.
37
38 NOTE: The space pointed to by PRODP is overwritten before finished with U
39 and V, so overlap is an error.
40
41 Argument constraints:
42 1. UN >= VN.
43 2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from
44 the multiplier and the multiplicand. */
45
46 /*
47 * The cutoff lines in the toomX2 and toomX3 code are now exactly between the
48 ideal lines of the surrounding algorithms. Is that optimal?
49
50 * The toomX3 code now uses a structure similar to the one of toomX2, except
51 that it loops longer in the unbalanced case. The result is that the
52 remaining area might have un < vn. Should we fix the toomX2 code in a
53 similar way?
54
55 * The toomX3 code is used for the largest non-FFT unbalanced operands. It
56 therefore calls mpn_mul recursively for certain cases.
57
58 * Allocate static temp space using THRESHOLD variables (except for toom44
59 when !WANT_FFT). That way, we can typically have no TMP_ALLOC at all.
60
61 * We sort ToomX2 algorithms together, assuming the toom22, toom32, toom42
62 have the same vn threshold. This is not true, we should actually use
63 mul_basecase for slightly larger operands for toom32 than for toom22, and
64 even larger for toom42.
65
66 * That problem is even more prevalent for toomX3. We therefore use special
67 THRESHOLD variables there.
68
69 * Is our ITCH allocation correct?
70 */
71
72 #define ITCH (16*vn + 100)
73
74 mp_limb_t
mpn_mul(mp_ptr prodp,mp_srcptr up,mp_size_t un,mp_srcptr vp,mp_size_t vn)75 mpn_mul (mp_ptr prodp,
76 mp_srcptr up, mp_size_t un,
77 mp_srcptr vp, mp_size_t vn)
78 {
79 ASSERT (un >= vn);
80 ASSERT (vn >= 1);
81 ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un));
82 ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn));
83
84 if (un == vn)
85 {
86 if (up == vp)
87 mpn_sqr (prodp, up, un);
88 else
89 mpn_mul_n (prodp, up, vp, un);
90 }
91 else if (vn < MUL_TOOM22_THRESHOLD)
92 { /* plain schoolbook multiplication */
93
94 /* Unless un is very large, or else if have an applicable mpn_mul_N,
95 perform basecase multiply directly. */
96 if (un <= MUL_BASECASE_MAX_UN
97 #if HAVE_NATIVE_mpn_mul_2
98 || vn <= 2
99 #else
100 || vn == 1
101 #endif
102 )
103 mpn_mul_basecase (prodp, up, un, vp, vn);
104 else
105 {
106 /* We have un >> MUL_BASECASE_MAX_UN > vn. For better memory
107 locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply
108 these pieces with the vp[] operand. After each such partial
109 multiplication (but the last) we copy the most significant vn
110 limbs into a temporary buffer since that part would otherwise be
111 overwritten by the next multiplication. After the next
112 multiplication, we add it back. This illustrates the situation:
113
114 -->vn<--
115 | |<------- un ------->|
116 _____________________|
117 X /|
118 /XX__________________/ |
119 _____________________ |
120 X / |
121 /XX__________________/ |
122 _____________________ |
123 / / |
124 /____________________/ |
125 ==================================================================
126
127 The parts marked with X are the parts whose sums are copied into
128 the temporary buffer. */
129
130 mp_limb_t tp[MUL_TOOM22_THRESHOLD_LIMIT];
131 mp_limb_t cy;
132 ASSERT (MUL_TOOM22_THRESHOLD <= MUL_TOOM22_THRESHOLD_LIMIT);
133
134 mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
135 prodp += MUL_BASECASE_MAX_UN;
136 MPN_COPY (tp, prodp, vn); /* preserve high triangle */
137 up += MUL_BASECASE_MAX_UN;
138 un -= MUL_BASECASE_MAX_UN;
139 while (un > MUL_BASECASE_MAX_UN)
140 {
141 mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
142 cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
143 mpn_incr_u (prodp + vn, cy);
144 prodp += MUL_BASECASE_MAX_UN;
145 MPN_COPY (tp, prodp, vn); /* preserve high triangle */
146 up += MUL_BASECASE_MAX_UN;
147 un -= MUL_BASECASE_MAX_UN;
148 }
149 if (un > vn)
150 {
151 mpn_mul_basecase (prodp, up, un, vp, vn);
152 }
153 else
154 {
155 ASSERT (un > 0);
156 mpn_mul_basecase (prodp, vp, vn, up, un);
157 }
158 cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
159 mpn_incr_u (prodp + vn, cy);
160 }
161 }
162 else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD))
163 {
164 /* Use ToomX2 variants */
165 mp_ptr scratch;
166 TMP_SDECL; TMP_SMARK;
167
168 scratch = TMP_SALLOC_LIMBS (ITCH);
169
170 /* FIXME: This condition (repeated in the loop below) leaves from a vn*vn
171 square to a (3vn-1)*vn rectangle. Leaving such a rectangle is hardly
172 wise; we would get better balance by slightly moving the bound. We
173 will sometimes end up with un < vn, like the the X3 arm below. */
174 if (un >= 3 * vn)
175 {
176 mp_limb_t cy;
177 mp_ptr ws;
178
179 /* The maximum ws usage is for the mpn_mul result. */
180 ws = TMP_SALLOC_LIMBS (4 * vn);
181
182 mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
183 un -= 2 * vn;
184 up += 2 * vn;
185 prodp += 2 * vn;
186
187 while (un >= 3 * vn)
188 {
189 mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
190 un -= 2 * vn;
191 up += 2 * vn;
192 cy = mpn_add_n (prodp, prodp, ws, vn);
193 MPN_COPY (prodp + vn, ws + vn, 2 * vn);
194 mpn_incr_u (prodp + vn, cy);
195 prodp += 2 * vn;
196 }
197
198 /* vn <= un < 3vn */
199
200 if (4 * un < 5 * vn)
201 mpn_toom22_mul (ws, up, un, vp, vn, scratch);
202 else if (4 * un < 7 * vn)
203 mpn_toom32_mul (ws, up, un, vp, vn, scratch);
204 else
205 mpn_toom42_mul (ws, up, un, vp, vn, scratch);
206
207 cy = mpn_add_n (prodp, prodp, ws, vn);
208 MPN_COPY (prodp + vn, ws + vn, un);
209 mpn_incr_u (prodp + vn, cy);
210 }
211 else
212 {
213 if (4 * un < 5 * vn)
214 mpn_toom22_mul (prodp, up, un, vp, vn, scratch);
215 else if (4 * un < 7 * vn)
216 mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
217 else
218 mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
219 }
220 TMP_SFREE;
221 }
222 else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) ||
223 BELOW_THRESHOLD (3 * vn, MUL_FFT_THRESHOLD))
224 {
225 /* Handle the largest operands that are not in the FFT range. The 2nd
226 condition makes very unbalanced operands avoid the FFT code (except
227 perhaps as coefficient products of the Toom code. */
228
229 if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD) || !TOOM44_OK (un, vn))
230 {
231 /* Use ToomX3 variants */
232 mp_ptr scratch;
233 TMP_SDECL; TMP_SMARK;
234
235 scratch = TMP_SALLOC_LIMBS (ITCH);
236
237 if (2 * un >= 5 * vn)
238 {
239 mp_limb_t cy;
240 mp_ptr ws;
241
242 /* The maximum ws usage is for the mpn_mul result. */
243 ws = TMP_SALLOC_LIMBS (7 * vn >> 1);
244
245 if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
246 mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
247 else
248 mpn_toom63_mul (prodp, up, 2 * vn, vp, vn, scratch);
249 un -= 2 * vn;
250 up += 2 * vn;
251 prodp += 2 * vn;
252
253 while (2 * un >= 5 * vn) /* un >= 2.5vn */
254 {
255 if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
256 mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
257 else
258 mpn_toom63_mul (ws, up, 2 * vn, vp, vn, scratch);
259 un -= 2 * vn;
260 up += 2 * vn;
261 cy = mpn_add_n (prodp, prodp, ws, vn);
262 MPN_COPY (prodp + vn, ws + vn, 2 * vn);
263 mpn_incr_u (prodp + vn, cy);
264 prodp += 2 * vn;
265 }
266
267 /* vn / 2 <= un < 2.5vn */
268
269 if (un < vn)
270 mpn_mul (ws, vp, vn, up, un);
271 else
272 mpn_mul (ws, up, un, vp, vn);
273
274 cy = mpn_add_n (prodp, prodp, ws, vn);
275 MPN_COPY (prodp + vn, ws + vn, un);
276 mpn_incr_u (prodp + vn, cy);
277 }
278 else
279 {
280 if (6 * un < 7 * vn)
281 mpn_toom33_mul (prodp, up, un, vp, vn, scratch);
282 else if (2 * un < 3 * vn)
283 {
284 if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM43_THRESHOLD))
285 mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
286 else
287 mpn_toom43_mul (prodp, up, un, vp, vn, scratch);
288 }
289 else if (6 * un < 11 * vn)
290 {
291 if (4 * un < 7 * vn)
292 {
293 if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM53_THRESHOLD))
294 mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
295 else
296 mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
297 }
298 else
299 {
300 if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM53_THRESHOLD))
301 mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
302 else
303 mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
304 }
305 }
306 else
307 {
308 if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
309 mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
310 else
311 mpn_toom63_mul (prodp, up, un, vp, vn, scratch);
312 }
313 }
314 TMP_SFREE;
315 }
316 else
317 {
318 mp_ptr scratch;
319 TMP_DECL; TMP_MARK;
320
321 if (BELOW_THRESHOLD (vn, MUL_TOOM6H_THRESHOLD))
322 {
323 scratch = TMP_ALLOC_LIMBS (mpn_toom44_mul_itch (un, vn));
324 mpn_toom44_mul (prodp, up, un, vp, vn, scratch);
325 }
326 else if (BELOW_THRESHOLD (vn, MUL_TOOM8H_THRESHOLD))
327 {
328 scratch = TMP_ALLOC_LIMBS (mpn_toom6h_mul_itch (un, vn));
329 mpn_toom6h_mul (prodp, up, un, vp, vn, scratch);
330 }
331 else
332 {
333 scratch = TMP_ALLOC_LIMBS (mpn_toom8h_mul_itch (un, vn));
334 mpn_toom8h_mul (prodp, up, un, vp, vn, scratch);
335 }
336 TMP_FREE;
337 }
338 }
339 else
340 {
341 if (un >= 8 * vn)
342 {
343 mp_limb_t cy;
344 mp_ptr ws;
345 TMP_DECL; TMP_MARK;
346
347 /* The maximum ws usage is for the mpn_mul result. */
348 ws = TMP_BALLOC_LIMBS (9 * vn >> 1);
349
350 mpn_fft_mul (prodp, up, 3 * vn, vp, vn);
351 un -= 3 * vn;
352 up += 3 * vn;
353 prodp += 3 * vn;
354
355 while (2 * un >= 7 * vn) /* un >= 3.5vn */
356 {
357 mpn_fft_mul (ws, up, 3 * vn, vp, vn);
358 un -= 3 * vn;
359 up += 3 * vn;
360 cy = mpn_add_n (prodp, prodp, ws, vn);
361 MPN_COPY (prodp + vn, ws + vn, 3 * vn);
362 mpn_incr_u (prodp + vn, cy);
363 prodp += 3 * vn;
364 }
365
366 /* vn / 2 <= un < 3.5vn */
367
368 if (un < vn)
369 mpn_mul (ws, vp, vn, up, un);
370 else
371 mpn_mul (ws, up, un, vp, vn);
372
373 cy = mpn_add_n (prodp, prodp, ws, vn);
374 MPN_COPY (prodp + vn, ws + vn, un);
375 mpn_incr_u (prodp + vn, cy);
376
377 TMP_FREE;
378 }
379 else
380 mpn_fft_mul (prodp, up, un, vp, vn);
381 }
382
383 return prodp[un + vn - 1]; /* historic */
384 }
385