1 /* mpn_mul -- Multiply two natural numbers.
2 
3    Contributed to the GNU project by Torbjorn Granlund.
4 
5 Copyright 1991, 1993, 1994, 1996, 1997, 1999-2003, 2005-2007, 2009, 2010, 2012
6 Free Software Foundation, Inc.
7 
8 This file is part of the GNU MP Library.
9 
10 The GNU MP Library is free software; you can redistribute it and/or modify
11 it under the terms of either:
12 
13   * the GNU Lesser General Public License as published by the Free
14     Software Foundation; either version 3 of the License, or (at your
15     option) any later version.
16 
17 or
18 
19   * the GNU General Public License as published by the Free Software
20     Foundation; either version 2 of the License, or (at your option) any
21     later version.
22 
23 or both in parallel, as here.
24 
25 The GNU MP Library is distributed in the hope that it will be useful, but
26 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
27 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
28 for more details.
29 
30 You should have received copies of the GNU General Public License and the
31 GNU Lesser General Public License along with the GNU MP Library.  If not,
32 see https://www.gnu.org/licenses/.  */
33 
34 #include "gmp.h"
35 #include "gmp-impl.h"
36 
37 
38 #ifndef MUL_BASECASE_MAX_UN
39 #define MUL_BASECASE_MAX_UN 500
40 #endif
41 
42 /* Areas where the different toom algorithms can be called (extracted
43    from the t-toom*.c files, and ignoring small constant offsets):
44 
45    1/6  1/5 1/4 4/13 1/3 3/8 2/5 5/11 1/2 3/5 2/3 3/4 4/5   1 vn/un
46                                         4/7              6/7
47 				       6/11
48                                        |--------------------| toom22 (small)
49                                                            || toom22 (large)
50                                                        |xxxx| toom22 called
51                       |-------------------------------------| toom32
52                                          |xxxxxxxxxxxxxxxx| | toom32 called
53                                                |------------| toom33
54                                                           |x| toom33 called
55              |---------------------------------|            | toom42
56 	              |xxxxxxxxxxxxxxxxxxxxxxxx|            | toom42 called
57                                        |--------------------| toom43
58                                                |xxxxxxxxxx|   toom43 called
59          |-----------------------------|                      toom52 (unused)
60                                                    |--------| toom44
61 						   |xxxxxxxx| toom44 called
62                               |--------------------|        | toom53
63                                         |xxxxxx|              toom53 called
64     |-------------------------|                               toom62 (unused)
65                                            |----------------| toom54 (unused)
66                       |--------------------|                  toom63
67 	                      |xxxxxxxxx|                   | toom63 called
68                           |---------------------------------| toom6h
69 						   |xxxxxxxx| toom6h called
70                                   |-------------------------| toom8h (32 bit)
71                  |------------------------------------------| toom8h (64 bit)
72 						   |xxxxxxxx| toom8h called
73 */
74 
75 #define TOOM33_OK(an,bn) (6 + 2 * an < 3 * bn)
76 #define TOOM44_OK(an,bn) (12 + 3 * an < 4 * bn)
77 
78 /* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v
79    (pointed to by VP, with VN limbs), and store the result at PRODP.  The
80    result is UN + VN limbs.  Return the most significant limb of the result.
81 
82    NOTE: The space pointed to by PRODP is overwritten before finished with U
83    and V, so overlap is an error.
84 
85    Argument constraints:
86    1. UN >= VN.
87    2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from
88       the multiplier and the multiplicand.  */
89 
90 /*
91   * The cutoff lines in the toomX2 and toomX3 code are now exactly between the
92     ideal lines of the surrounding algorithms.  Is that optimal?
93 
94   * The toomX3 code now uses a structure similar to the one of toomX2, except
95     that it loops longer in the unbalanced case.  The result is that the
96     remaining area might have un < vn.  Should we fix the toomX2 code in a
97     similar way?
98 
99   * The toomX3 code is used for the largest non-FFT unbalanced operands.  It
100     therefore calls mpn_mul recursively for certain cases.
101 
102   * Allocate static temp space using THRESHOLD variables (except for toom44
103     when !WANT_FFT).  That way, we can typically have no TMP_ALLOC at all.
104 
105   * We sort ToomX2 algorithms together, assuming the toom22, toom32, toom42
106     have the same vn threshold.  This is not true, we should actually use
107     mul_basecase for slightly larger operands for toom32 than for toom22, and
108     even larger for toom42.
109 
110   * That problem is even more prevalent for toomX3.  We therefore use special
111     THRESHOLD variables there.
112 
113   * Is our ITCH allocation correct?
114 */
115 
116 #define ITCH (16*vn + 100)
117 
118 mp_limb_t
mpn_mul(mp_ptr prodp,mp_srcptr up,mp_size_t un,mp_srcptr vp,mp_size_t vn)119 mpn_mul (mp_ptr prodp,
120 	 mp_srcptr up, mp_size_t un,
121 	 mp_srcptr vp, mp_size_t vn)
122 {
123   ASSERT (un >= vn);
124   ASSERT (vn >= 1);
125   ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un));
126   ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn));
127 
128   if (un == vn)
129     {
130       if (up == vp)
131 	mpn_sqr (prodp, up, un);
132       else
133 	mpn_mul_n (prodp, up, vp, un);
134     }
135   else if (vn < MUL_TOOM22_THRESHOLD)
136     { /* plain schoolbook multiplication */
137 
138       /* Unless un is very large, or else if have an applicable mpn_mul_N,
139 	 perform basecase multiply directly.  */
140       if (un <= MUL_BASECASE_MAX_UN
141 #if HAVE_NATIVE_mpn_mul_2
142 	  || vn <= 2
143 #else
144 	  || vn == 1
145 #endif
146 	  )
147 	mpn_mul_basecase (prodp, up, un, vp, vn);
148       else
149 	{
150 	  /* We have un >> MUL_BASECASE_MAX_UN > vn.  For better memory
151 	     locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply
152 	     these pieces with the vp[] operand.  After each such partial
153 	     multiplication (but the last) we copy the most significant vn
154 	     limbs into a temporary buffer since that part would otherwise be
155 	     overwritten by the next multiplication.  After the next
156 	     multiplication, we add it back.  This illustrates the situation:
157 
158                                                     -->vn<--
159                                                       |  |<------- un ------->|
160                                                          _____________________|
161                                                         X                    /|
162                                                       /XX__________________/  |
163                                     _____________________                     |
164                                    X                    /                     |
165                                  /XX__________________/                       |
166                _____________________                                          |
167               /                    /                                          |
168             /____________________/                                            |
169 	    ==================================================================
170 
171 	    The parts marked with X are the parts whose sums are copied into
172 	    the temporary buffer.  */
173 
174 	  mp_limb_t tp[MUL_TOOM22_THRESHOLD_LIMIT];
175 	  mp_limb_t cy;
176 	  ASSERT (MUL_TOOM22_THRESHOLD <= MUL_TOOM22_THRESHOLD_LIMIT);
177 
178 	  mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
179 	  prodp += MUL_BASECASE_MAX_UN;
180 	  MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
181 	  up += MUL_BASECASE_MAX_UN;
182 	  un -= MUL_BASECASE_MAX_UN;
183 	  while (un > MUL_BASECASE_MAX_UN)
184 	    {
185 	      mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
186 	      cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
187 	      mpn_incr_u (prodp + vn, cy);
188 	      prodp += MUL_BASECASE_MAX_UN;
189 	      MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
190 	      up += MUL_BASECASE_MAX_UN;
191 	      un -= MUL_BASECASE_MAX_UN;
192 	    }
193 	  if (un > vn)
194 	    {
195 	      mpn_mul_basecase (prodp, up, un, vp, vn);
196 	    }
197 	  else
198 	    {
199 	      ASSERT (un > 0);
200 	      mpn_mul_basecase (prodp, vp, vn, up, un);
201 	    }
202 	  cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
203 	  mpn_incr_u (prodp + vn, cy);
204 	}
205     }
206   else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD))
207     {
208       /* Use ToomX2 variants */
209       mp_ptr scratch;
210       TMP_SDECL; TMP_SMARK;
211 
212       scratch = TMP_SALLOC_LIMBS (ITCH);
213 
214       /* FIXME: This condition (repeated in the loop below) leaves from a vn*vn
215 	 square to a (3vn-1)*vn rectangle.  Leaving such a rectangle is hardly
216 	 wise; we would get better balance by slightly moving the bound.  We
217 	 will sometimes end up with un < vn, like in the X3 arm below.  */
218       if (un >= 3 * vn)
219 	{
220 	  mp_limb_t cy;
221 	  mp_ptr ws;
222 
223 	  /* The maximum ws usage is for the mpn_mul result.  */
224 	  ws = TMP_SALLOC_LIMBS (4 * vn);
225 
226 	  mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
227 	  un -= 2 * vn;
228 	  up += 2 * vn;
229 	  prodp += 2 * vn;
230 
231 	  while (un >= 3 * vn)
232 	    {
233 	      mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
234 	      un -= 2 * vn;
235 	      up += 2 * vn;
236 	      cy = mpn_add_n (prodp, prodp, ws, vn);
237 	      MPN_COPY (prodp + vn, ws + vn, 2 * vn);
238 	      mpn_incr_u (prodp + vn, cy);
239 	      prodp += 2 * vn;
240 	    }
241 
242 	  /* vn <= un < 3vn */
243 
244 	  if (4 * un < 5 * vn)
245 	    mpn_toom22_mul (ws, up, un, vp, vn, scratch);
246 	  else if (4 * un < 7 * vn)
247 	    mpn_toom32_mul (ws, up, un, vp, vn, scratch);
248 	  else
249 	    mpn_toom42_mul (ws, up, un, vp, vn, scratch);
250 
251 	  cy = mpn_add_n (prodp, prodp, ws, vn);
252 	  MPN_COPY (prodp + vn, ws + vn, un);
253 	  mpn_incr_u (prodp + vn, cy);
254 	}
255       else
256 	{
257 	  if (4 * un < 5 * vn)
258 	    mpn_toom22_mul (prodp, up, un, vp, vn, scratch);
259 	  else if (4 * un < 7 * vn)
260 	    mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
261 	  else
262 	    mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
263 	}
264       TMP_SFREE;
265     }
266   else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) ||
267 	   BELOW_THRESHOLD (3 * vn, MUL_FFT_THRESHOLD))
268     {
269       /* Handle the largest operands that are not in the FFT range.  The 2nd
270 	 condition makes very unbalanced operands avoid the FFT code (except
271 	 perhaps as coefficient products of the Toom code.  */
272 
273       if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD) || !TOOM44_OK (un, vn))
274 	{
275 	  /* Use ToomX3 variants */
276 	  mp_ptr scratch;
277 	  TMP_SDECL; TMP_SMARK;
278 
279 	  scratch = TMP_SALLOC_LIMBS (ITCH);
280 
281 	  if (2 * un >= 5 * vn)
282 	    {
283 	      mp_limb_t cy;
284 	      mp_ptr ws;
285 
286 	      /* The maximum ws usage is for the mpn_mul result.  */
287 	      ws = TMP_SALLOC_LIMBS (7 * vn >> 1);
288 
289 	      if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
290 		mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
291 	      else
292 		mpn_toom63_mul (prodp, up, 2 * vn, vp, vn, scratch);
293 	      un -= 2 * vn;
294 	      up += 2 * vn;
295 	      prodp += 2 * vn;
296 
297 	      while (2 * un >= 5 * vn)	/* un >= 2.5vn */
298 		{
299 		  if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
300 		    mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
301 		  else
302 		    mpn_toom63_mul (ws, up, 2 * vn, vp, vn, scratch);
303 		  un -= 2 * vn;
304 		  up += 2 * vn;
305 		  cy = mpn_add_n (prodp, prodp, ws, vn);
306 		  MPN_COPY (prodp + vn, ws + vn, 2 * vn);
307 		  mpn_incr_u (prodp + vn, cy);
308 		  prodp += 2 * vn;
309 		}
310 
311 	      /* vn / 2 <= un < 2.5vn */
312 
313 	      if (un < vn)
314 		mpn_mul (ws, vp, vn, up, un);
315 	      else
316 		mpn_mul (ws, up, un, vp, vn);
317 
318 	      cy = mpn_add_n (prodp, prodp, ws, vn);
319 	      MPN_COPY (prodp + vn, ws + vn, un);
320 	      mpn_incr_u (prodp + vn, cy);
321 	    }
322 	  else
323 	    {
324 	      if (6 * un < 7 * vn)
325 		mpn_toom33_mul (prodp, up, un, vp, vn, scratch);
326 	      else if (2 * un < 3 * vn)
327 		{
328 		  if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM43_THRESHOLD))
329 		    mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
330 		  else
331 		    mpn_toom43_mul (prodp, up, un, vp, vn, scratch);
332 		}
333 	      else if (6 * un < 11 * vn)
334 		{
335 		  if (4 * un < 7 * vn)
336 		    {
337 		      if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM53_THRESHOLD))
338 			mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
339 		      else
340 			mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
341 		    }
342 		  else
343 		    {
344 		      if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM53_THRESHOLD))
345 			mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
346 		      else
347 			mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
348 		    }
349 		}
350 	      else
351 		{
352 		  if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
353 		    mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
354 		  else
355 		    mpn_toom63_mul (prodp, up, un, vp, vn, scratch);
356 		}
357 	    }
358 	  TMP_SFREE;
359 	}
360       else
361 	{
362 	  mp_ptr scratch;
363 	  TMP_DECL; TMP_MARK;
364 
365 	  if (BELOW_THRESHOLD (vn, MUL_TOOM6H_THRESHOLD))
366 	    {
367 	      scratch = TMP_ALLOC_LIMBS (mpn_toom44_mul_itch (un, vn));
368 	      mpn_toom44_mul (prodp, up, un, vp, vn, scratch);
369 	    }
370 	  else if (BELOW_THRESHOLD (vn, MUL_TOOM8H_THRESHOLD))
371 	    {
372 	      scratch = TMP_ALLOC_LIMBS (mpn_toom6h_mul_itch (un, vn));
373 	      mpn_toom6h_mul (prodp, up, un, vp, vn, scratch);
374 	    }
375 	  else
376 	    {
377 	      scratch = TMP_ALLOC_LIMBS (mpn_toom8h_mul_itch (un, vn));
378 	      mpn_toom8h_mul (prodp, up, un, vp, vn, scratch);
379 	    }
380 	  TMP_FREE;
381 	}
382     }
383   else
384     {
385       if (un >= 8 * vn)
386 	{
387 	  mp_limb_t cy;
388 	  mp_ptr ws;
389 	  TMP_DECL; TMP_MARK;
390 
391 	  /* The maximum ws usage is for the mpn_mul result.  */
392 	  ws = TMP_BALLOC_LIMBS (9 * vn >> 1);
393 
394 	  mpn_fft_mul (prodp, up, 3 * vn, vp, vn);
395 	  un -= 3 * vn;
396 	  up += 3 * vn;
397 	  prodp += 3 * vn;
398 
399 	  while (2 * un >= 7 * vn)	/* un >= 3.5vn  */
400 	    {
401 	      mpn_fft_mul (ws, up, 3 * vn, vp, vn);
402 	      un -= 3 * vn;
403 	      up += 3 * vn;
404 	      cy = mpn_add_n (prodp, prodp, ws, vn);
405 	      MPN_COPY (prodp + vn, ws + vn, 3 * vn);
406 	      mpn_incr_u (prodp + vn, cy);
407 	      prodp += 3 * vn;
408 	    }
409 
410 	  /* vn / 2 <= un < 3.5vn */
411 
412 	  if (un < vn)
413 	    mpn_mul (ws, vp, vn, up, un);
414 	  else
415 	    mpn_mul (ws, up, un, vp, vn);
416 
417 	  cy = mpn_add_n (prodp, prodp, ws, vn);
418 	  MPN_COPY (prodp + vn, ws + vn, un);
419 	  mpn_incr_u (prodp + vn, cy);
420 
421 	  TMP_FREE;
422 	}
423       else
424 	mpn_fft_mul (prodp, up, un, vp, vn);
425     }
426 
427   return prodp[un + vn - 1];	/* historic */
428 }
429