1 /* mpn_mul -- Multiply two natural numbers.
2
3 Contributed to the GNU project by Torbjorn Granlund.
4
5 Copyright 1991, 1993, 1994, 1996, 1997, 1999-2003, 2005-2007, 2009, 2010, 2012
6 Free Software Foundation, Inc.
7
8 This file is part of the GNU MP Library.
9
10 The GNU MP Library is free software; you can redistribute it and/or modify
11 it under the terms of either:
12
13 * the GNU Lesser General Public License as published by the Free
14 Software Foundation; either version 3 of the License, or (at your
15 option) any later version.
16
17 or
18
19 * the GNU General Public License as published by the Free Software
20 Foundation; either version 2 of the License, or (at your option) any
21 later version.
22
23 or both in parallel, as here.
24
25 The GNU MP Library is distributed in the hope that it will be useful, but
26 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
27 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
28 for more details.
29
30 You should have received copies of the GNU General Public License and the
31 GNU Lesser General Public License along with the GNU MP Library. If not,
32 see https://www.gnu.org/licenses/. */
33
34 #include "gmp.h"
35 #include "gmp-impl.h"
36
37
38 #ifndef MUL_BASECASE_MAX_UN
39 #define MUL_BASECASE_MAX_UN 500
40 #endif
41
42 /* Areas where the different toom algorithms can be called (extracted
43 from the t-toom*.c files, and ignoring small constant offsets):
44
45 1/6 1/5 1/4 4/13 1/3 3/8 2/5 5/11 1/2 3/5 2/3 3/4 4/5 1 vn/un
46 4/7 6/7
47 6/11
48 |--------------------| toom22 (small)
49 || toom22 (large)
50 |xxxx| toom22 called
51 |-------------------------------------| toom32
52 |xxxxxxxxxxxxxxxx| | toom32 called
53 |------------| toom33
54 |x| toom33 called
55 |---------------------------------| | toom42
56 |xxxxxxxxxxxxxxxxxxxxxxxx| | toom42 called
57 |--------------------| toom43
58 |xxxxxxxxxx| toom43 called
59 |-----------------------------| toom52 (unused)
60 |--------| toom44
61 |xxxxxxxx| toom44 called
62 |--------------------| | toom53
63 |xxxxxx| toom53 called
64 |-------------------------| toom62 (unused)
65 |----------------| toom54 (unused)
66 |--------------------| toom63
67 |xxxxxxxxx| | toom63 called
68 |---------------------------------| toom6h
69 |xxxxxxxx| toom6h called
70 |-------------------------| toom8h (32 bit)
71 |------------------------------------------| toom8h (64 bit)
72 |xxxxxxxx| toom8h called
73 */
74
75 #define TOOM33_OK(an,bn) (6 + 2 * an < 3 * bn)
76 #define TOOM44_OK(an,bn) (12 + 3 * an < 4 * bn)
77
78 /* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v
79 (pointed to by VP, with VN limbs), and store the result at PRODP. The
80 result is UN + VN limbs. Return the most significant limb of the result.
81
82 NOTE: The space pointed to by PRODP is overwritten before finished with U
83 and V, so overlap is an error.
84
85 Argument constraints:
86 1. UN >= VN.
87 2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from
88 the multiplier and the multiplicand. */
89
90 /*
91 * The cutoff lines in the toomX2 and toomX3 code are now exactly between the
92 ideal lines of the surrounding algorithms. Is that optimal?
93
94 * The toomX3 code now uses a structure similar to the one of toomX2, except
95 that it loops longer in the unbalanced case. The result is that the
96 remaining area might have un < vn. Should we fix the toomX2 code in a
97 similar way?
98
99 * The toomX3 code is used for the largest non-FFT unbalanced operands. It
100 therefore calls mpn_mul recursively for certain cases.
101
102 * Allocate static temp space using THRESHOLD variables (except for toom44
103 when !WANT_FFT). That way, we can typically have no TMP_ALLOC at all.
104
105 * We sort ToomX2 algorithms together, assuming the toom22, toom32, toom42
106 have the same vn threshold. This is not true, we should actually use
107 mul_basecase for slightly larger operands for toom32 than for toom22, and
108 even larger for toom42.
109
110 * That problem is even more prevalent for toomX3. We therefore use special
111 THRESHOLD variables there.
112
113 * Is our ITCH allocation correct?
114 */
115
116 #define ITCH (16*vn + 100)
117
118 mp_limb_t
mpn_mul(mp_ptr prodp,mp_srcptr up,mp_size_t un,mp_srcptr vp,mp_size_t vn)119 mpn_mul (mp_ptr prodp,
120 mp_srcptr up, mp_size_t un,
121 mp_srcptr vp, mp_size_t vn)
122 {
123 ASSERT (un >= vn);
124 ASSERT (vn >= 1);
125 ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un));
126 ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn));
127
128 if (un == vn)
129 {
130 if (up == vp)
131 mpn_sqr (prodp, up, un);
132 else
133 mpn_mul_n (prodp, up, vp, un);
134 }
135 else if (vn < MUL_TOOM22_THRESHOLD)
136 { /* plain schoolbook multiplication */
137
138 /* Unless un is very large, or else if have an applicable mpn_mul_N,
139 perform basecase multiply directly. */
140 if (un <= MUL_BASECASE_MAX_UN
141 #if HAVE_NATIVE_mpn_mul_2
142 || vn <= 2
143 #else
144 || vn == 1
145 #endif
146 )
147 mpn_mul_basecase (prodp, up, un, vp, vn);
148 else
149 {
150 /* We have un >> MUL_BASECASE_MAX_UN > vn. For better memory
151 locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply
152 these pieces with the vp[] operand. After each such partial
153 multiplication (but the last) we copy the most significant vn
154 limbs into a temporary buffer since that part would otherwise be
155 overwritten by the next multiplication. After the next
156 multiplication, we add it back. This illustrates the situation:
157
158 -->vn<--
159 | |<------- un ------->|
160 _____________________|
161 X /|
162 /XX__________________/ |
163 _____________________ |
164 X / |
165 /XX__________________/ |
166 _____________________ |
167 / / |
168 /____________________/ |
169 ==================================================================
170
171 The parts marked with X are the parts whose sums are copied into
172 the temporary buffer. */
173
174 mp_limb_t tp[MUL_TOOM22_THRESHOLD_LIMIT];
175 mp_limb_t cy;
176 ASSERT (MUL_TOOM22_THRESHOLD <= MUL_TOOM22_THRESHOLD_LIMIT);
177
178 mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
179 prodp += MUL_BASECASE_MAX_UN;
180 MPN_COPY (tp, prodp, vn); /* preserve high triangle */
181 up += MUL_BASECASE_MAX_UN;
182 un -= MUL_BASECASE_MAX_UN;
183 while (un > MUL_BASECASE_MAX_UN)
184 {
185 mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
186 cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
187 mpn_incr_u (prodp + vn, cy);
188 prodp += MUL_BASECASE_MAX_UN;
189 MPN_COPY (tp, prodp, vn); /* preserve high triangle */
190 up += MUL_BASECASE_MAX_UN;
191 un -= MUL_BASECASE_MAX_UN;
192 }
193 if (un > vn)
194 {
195 mpn_mul_basecase (prodp, up, un, vp, vn);
196 }
197 else
198 {
199 ASSERT (un > 0);
200 mpn_mul_basecase (prodp, vp, vn, up, un);
201 }
202 cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
203 mpn_incr_u (prodp + vn, cy);
204 }
205 }
206 else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD))
207 {
208 /* Use ToomX2 variants */
209 mp_ptr scratch;
210 TMP_SDECL; TMP_SMARK;
211
212 scratch = TMP_SALLOC_LIMBS (ITCH);
213
214 /* FIXME: This condition (repeated in the loop below) leaves from a vn*vn
215 square to a (3vn-1)*vn rectangle. Leaving such a rectangle is hardly
216 wise; we would get better balance by slightly moving the bound. We
217 will sometimes end up with un < vn, like in the X3 arm below. */
218 if (un >= 3 * vn)
219 {
220 mp_limb_t cy;
221 mp_ptr ws;
222
223 /* The maximum ws usage is for the mpn_mul result. */
224 ws = TMP_SALLOC_LIMBS (4 * vn);
225
226 mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
227 un -= 2 * vn;
228 up += 2 * vn;
229 prodp += 2 * vn;
230
231 while (un >= 3 * vn)
232 {
233 mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
234 un -= 2 * vn;
235 up += 2 * vn;
236 cy = mpn_add_n (prodp, prodp, ws, vn);
237 MPN_COPY (prodp + vn, ws + vn, 2 * vn);
238 mpn_incr_u (prodp + vn, cy);
239 prodp += 2 * vn;
240 }
241
242 /* vn <= un < 3vn */
243
244 if (4 * un < 5 * vn)
245 mpn_toom22_mul (ws, up, un, vp, vn, scratch);
246 else if (4 * un < 7 * vn)
247 mpn_toom32_mul (ws, up, un, vp, vn, scratch);
248 else
249 mpn_toom42_mul (ws, up, un, vp, vn, scratch);
250
251 cy = mpn_add_n (prodp, prodp, ws, vn);
252 MPN_COPY (prodp + vn, ws + vn, un);
253 mpn_incr_u (prodp + vn, cy);
254 }
255 else
256 {
257 if (4 * un < 5 * vn)
258 mpn_toom22_mul (prodp, up, un, vp, vn, scratch);
259 else if (4 * un < 7 * vn)
260 mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
261 else
262 mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
263 }
264 TMP_SFREE;
265 }
266 else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) ||
267 BELOW_THRESHOLD (3 * vn, MUL_FFT_THRESHOLD))
268 {
269 /* Handle the largest operands that are not in the FFT range. The 2nd
270 condition makes very unbalanced operands avoid the FFT code (except
271 perhaps as coefficient products of the Toom code. */
272
273 if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD) || !TOOM44_OK (un, vn))
274 {
275 /* Use ToomX3 variants */
276 mp_ptr scratch;
277 TMP_SDECL; TMP_SMARK;
278
279 scratch = TMP_SALLOC_LIMBS (ITCH);
280
281 if (2 * un >= 5 * vn)
282 {
283 mp_limb_t cy;
284 mp_ptr ws;
285
286 /* The maximum ws usage is for the mpn_mul result. */
287 ws = TMP_SALLOC_LIMBS (7 * vn >> 1);
288
289 if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
290 mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
291 else
292 mpn_toom63_mul (prodp, up, 2 * vn, vp, vn, scratch);
293 un -= 2 * vn;
294 up += 2 * vn;
295 prodp += 2 * vn;
296
297 while (2 * un >= 5 * vn) /* un >= 2.5vn */
298 {
299 if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
300 mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
301 else
302 mpn_toom63_mul (ws, up, 2 * vn, vp, vn, scratch);
303 un -= 2 * vn;
304 up += 2 * vn;
305 cy = mpn_add_n (prodp, prodp, ws, vn);
306 MPN_COPY (prodp + vn, ws + vn, 2 * vn);
307 mpn_incr_u (prodp + vn, cy);
308 prodp += 2 * vn;
309 }
310
311 /* vn / 2 <= un < 2.5vn */
312
313 if (un < vn)
314 mpn_mul (ws, vp, vn, up, un);
315 else
316 mpn_mul (ws, up, un, vp, vn);
317
318 cy = mpn_add_n (prodp, prodp, ws, vn);
319 MPN_COPY (prodp + vn, ws + vn, un);
320 mpn_incr_u (prodp + vn, cy);
321 }
322 else
323 {
324 if (6 * un < 7 * vn)
325 mpn_toom33_mul (prodp, up, un, vp, vn, scratch);
326 else if (2 * un < 3 * vn)
327 {
328 if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM43_THRESHOLD))
329 mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
330 else
331 mpn_toom43_mul (prodp, up, un, vp, vn, scratch);
332 }
333 else if (6 * un < 11 * vn)
334 {
335 if (4 * un < 7 * vn)
336 {
337 if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM53_THRESHOLD))
338 mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
339 else
340 mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
341 }
342 else
343 {
344 if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM53_THRESHOLD))
345 mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
346 else
347 mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
348 }
349 }
350 else
351 {
352 if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
353 mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
354 else
355 mpn_toom63_mul (prodp, up, un, vp, vn, scratch);
356 }
357 }
358 TMP_SFREE;
359 }
360 else
361 {
362 mp_ptr scratch;
363 TMP_DECL; TMP_MARK;
364
365 if (BELOW_THRESHOLD (vn, MUL_TOOM6H_THRESHOLD))
366 {
367 scratch = TMP_ALLOC_LIMBS (mpn_toom44_mul_itch (un, vn));
368 mpn_toom44_mul (prodp, up, un, vp, vn, scratch);
369 }
370 else if (BELOW_THRESHOLD (vn, MUL_TOOM8H_THRESHOLD))
371 {
372 scratch = TMP_ALLOC_LIMBS (mpn_toom6h_mul_itch (un, vn));
373 mpn_toom6h_mul (prodp, up, un, vp, vn, scratch);
374 }
375 else
376 {
377 scratch = TMP_ALLOC_LIMBS (mpn_toom8h_mul_itch (un, vn));
378 mpn_toom8h_mul (prodp, up, un, vp, vn, scratch);
379 }
380 TMP_FREE;
381 }
382 }
383 else
384 {
385 if (un >= 8 * vn)
386 {
387 mp_limb_t cy;
388 mp_ptr ws;
389 TMP_DECL; TMP_MARK;
390
391 /* The maximum ws usage is for the mpn_mul result. */
392 ws = TMP_BALLOC_LIMBS (9 * vn >> 1);
393
394 mpn_fft_mul (prodp, up, 3 * vn, vp, vn);
395 un -= 3 * vn;
396 up += 3 * vn;
397 prodp += 3 * vn;
398
399 while (2 * un >= 7 * vn) /* un >= 3.5vn */
400 {
401 mpn_fft_mul (ws, up, 3 * vn, vp, vn);
402 un -= 3 * vn;
403 up += 3 * vn;
404 cy = mpn_add_n (prodp, prodp, ws, vn);
405 MPN_COPY (prodp + vn, ws + vn, 3 * vn);
406 mpn_incr_u (prodp + vn, cy);
407 prodp += 3 * vn;
408 }
409
410 /* vn / 2 <= un < 3.5vn */
411
412 if (un < vn)
413 mpn_mul (ws, vp, vn, up, un);
414 else
415 mpn_mul (ws, up, un, vp, vn);
416
417 cy = mpn_add_n (prodp, prodp, ws, vn);
418 MPN_COPY (prodp + vn, ws + vn, un);
419 mpn_incr_u (prodp + vn, cy);
420
421 TMP_FREE;
422 }
423 else
424 mpn_fft_mul (prodp, up, un, vp, vn);
425 }
426
427 return prodp[un + vn - 1]; /* historic */
428 }
429