xref: /dragonfly/contrib/gmp/mpn/generic/mul.c (revision 0ca59c34)
1 /* mpn_mul -- Multiply two natural numbers.
2 
3    Contributed to the GNU project by Torbjorn Granlund.
4 
5 Copyright 1991, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002, 2003, 2005,
6 2006, 2007, 2009, 2010 Free Software Foundation, Inc.
7 
8 This file is part of the GNU MP Library.
9 
10 The GNU MP Library is free software; you can redistribute it and/or modify
11 it under the terms of the GNU Lesser General Public License as published by
12 the Free Software Foundation; either version 3 of the License, or (at your
13 option) any later version.
14 
15 The GNU MP Library is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
17 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
18 License for more details.
19 
20 You should have received a copy of the GNU Lesser General Public License
21 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
22 
23 #include "gmp.h"
24 #include "gmp-impl.h"
25 
26 
27 #ifndef MUL_BASECASE_MAX_UN
28 #define MUL_BASECASE_MAX_UN 500
29 #endif
30 
31 #define TOOM33_OK(an,bn) (6 + 2 * an < 3 * bn)
32 #define TOOM44_OK(an,bn) (12 + 3 * an < 4 * bn)
33 
34 /* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v
35    (pointed to by VP, with VN limbs), and store the result at PRODP.  The
36    result is UN + VN limbs.  Return the most significant limb of the result.
37 
38    NOTE: The space pointed to by PRODP is overwritten before finished with U
39    and V, so overlap is an error.
40 
41    Argument constraints:
42    1. UN >= VN.
43    2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from
44       the multiplier and the multiplicand.  */
45 
46 /*
47   * The cutoff lines in the toomX2 and toomX3 code are now exactly between the
48     ideal lines of the surrounding algorithms.  Is that optimal?
49 
50   * The toomX3 code now uses a structure similar to the one of toomX2, except
51     that it loops longer in the unbalanced case.  The result is that the
52     remaining area might have un < vn.  Should we fix the toomX2 code in a
53     similar way?
54 
55   * The toomX3 code is used for the largest non-FFT unbalanced operands.  It
56     therefore calls mpn_mul recursively for certain cases.
57 
58   * Allocate static temp space using THRESHOLD variables (except for toom44
59     when !WANT_FFT).  That way, we can typically have no TMP_ALLOC at all.
60 
61   * We sort ToomX2 algorithms together, assuming the toom22, toom32, toom42
62     have the same vn threshold.  This is not true, we should actually use
63     mul_basecase for slightly larger operands for toom32 than for toom22, and
64     even larger for toom42.
65 
66   * That problem is even more prevalent for toomX3.  We therefore use special
67     THRESHOLD variables there.
68 
69   * Is our ITCH allocation correct?
70 */
71 
72 #define ITCH (16*vn + 100)
73 
74 mp_limb_t
75 mpn_mul (mp_ptr prodp,
76 	 mp_srcptr up, mp_size_t un,
77 	 mp_srcptr vp, mp_size_t vn)
78 {
79   ASSERT (un >= vn);
80   ASSERT (vn >= 1);
81   ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un));
82   ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn));
83 
84   if (un == vn)
85     {
86       if (up == vp)
87 	mpn_sqr (prodp, up, un);
88       else
89 	mpn_mul_n (prodp, up, vp, un);
90     }
91   else if (vn < MUL_TOOM22_THRESHOLD)
92     { /* plain schoolbook multiplication */
93 
94       /* Unless un is very large, or else if have an applicable mpn_mul_N,
95 	 perform basecase multiply directly.  */
96       if (un <= MUL_BASECASE_MAX_UN
97 #if HAVE_NATIVE_mpn_mul_2
98 	  || vn <= 2
99 #else
100 	  || vn == 1
101 #endif
102 	  )
103 	mpn_mul_basecase (prodp, up, un, vp, vn);
104       else
105 	{
106 	  /* We have un >> MUL_BASECASE_MAX_UN > vn.  For better memory
107 	     locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply
108 	     these pieces with the vp[] operand.  After each such partial
109 	     multiplication (but the last) we copy the most significant vn
110 	     limbs into a temporary buffer since that part would otherwise be
111 	     overwritten by the next multiplication.  After the next
112 	     multiplication, we add it back.  This illustrates the situation:
113 
114                                                     -->vn<--
115                                                       |  |<------- un ------->|
116                                                          _____________________|
117                                                         X                    /|
118                                                       /XX__________________/  |
119                                     _____________________                     |
120                                    X                    /                     |
121                                  /XX__________________/                       |
122                _____________________                                          |
123               /                    /                                          |
124             /____________________/                                            |
125 	    ==================================================================
126 
127 	    The parts marked with X are the parts whose sums are copied into
128 	    the temporary buffer.  */
129 
130 	  mp_limb_t tp[MUL_TOOM22_THRESHOLD_LIMIT];
131 	  mp_limb_t cy;
132 	  ASSERT (MUL_TOOM22_THRESHOLD <= MUL_TOOM22_THRESHOLD_LIMIT);
133 
134 	  mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
135 	  prodp += MUL_BASECASE_MAX_UN;
136 	  MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
137 	  up += MUL_BASECASE_MAX_UN;
138 	  un -= MUL_BASECASE_MAX_UN;
139 	  while (un > MUL_BASECASE_MAX_UN)
140 	    {
141 	      mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
142 	      cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
143 	      mpn_incr_u (prodp + vn, cy);
144 	      prodp += MUL_BASECASE_MAX_UN;
145 	      MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
146 	      up += MUL_BASECASE_MAX_UN;
147 	      un -= MUL_BASECASE_MAX_UN;
148 	    }
149 	  if (un > vn)
150 	    {
151 	      mpn_mul_basecase (prodp, up, un, vp, vn);
152 	    }
153 	  else
154 	    {
155 	      ASSERT (un > 0);
156 	      mpn_mul_basecase (prodp, vp, vn, up, un);
157 	    }
158 	  cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
159 	  mpn_incr_u (prodp + vn, cy);
160 	}
161     }
162   else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD))
163     {
164       /* Use ToomX2 variants */
165       mp_ptr scratch;
166       TMP_SDECL; TMP_SMARK;
167 
168       scratch = TMP_SALLOC_LIMBS (ITCH);
169 
170       /* FIXME: This condition (repeated in the loop below) leaves from a vn*vn
171 	 square to a (3vn-1)*vn rectangle.  Leaving such a rectangle is hardly
172 	 wise; we would get better balance by slightly moving the bound.  We
173 	 will sometimes end up with un < vn, like the the X3 arm below.  */
174       if (un >= 3 * vn)
175 	{
176 	  mp_limb_t cy;
177 	  mp_ptr ws;
178 
179 	  /* The maximum ws usage is for the mpn_mul result.  */
180 	  ws = TMP_SALLOC_LIMBS (4 * vn);
181 
182 	  mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
183 	  un -= 2 * vn;
184 	  up += 2 * vn;
185 	  prodp += 2 * vn;
186 
187 	  while (un >= 3 * vn)
188 	    {
189 	      mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
190 	      un -= 2 * vn;
191 	      up += 2 * vn;
192 	      cy = mpn_add_n (prodp, prodp, ws, vn);
193 	      MPN_COPY (prodp + vn, ws + vn, 2 * vn);
194 	      mpn_incr_u (prodp + vn, cy);
195 	      prodp += 2 * vn;
196 	    }
197 
198 	  /* vn <= un < 3vn */
199 
200 	  if (4 * un < 5 * vn)
201 	    mpn_toom22_mul (ws, up, un, vp, vn, scratch);
202 	  else if (4 * un < 7 * vn)
203 	    mpn_toom32_mul (ws, up, un, vp, vn, scratch);
204 	  else
205 	    mpn_toom42_mul (ws, up, un, vp, vn, scratch);
206 
207 	  cy = mpn_add_n (prodp, prodp, ws, vn);
208 	  MPN_COPY (prodp + vn, ws + vn, un);
209 	  mpn_incr_u (prodp + vn, cy);
210 	}
211       else
212 	{
213 	  if (4 * un < 5 * vn)
214 	    mpn_toom22_mul (prodp, up, un, vp, vn, scratch);
215 	  else if (4 * un < 7 * vn)
216 	    mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
217 	  else
218 	    mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
219 	}
220       TMP_SFREE;
221     }
222   else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) ||
223 	   BELOW_THRESHOLD (3 * vn, MUL_FFT_THRESHOLD))
224     {
225       /* Handle the largest operands that are not in the FFT range.  The 2nd
226 	 condition makes very unbalanced operands avoid the FFT code (except
227 	 perhaps as coefficient products of the Toom code.  */
228 
229       if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD) || !TOOM44_OK (un, vn))
230 	{
231 	  /* Use ToomX3 variants */
232 	  mp_ptr scratch;
233 	  TMP_SDECL; TMP_SMARK;
234 
235 	  scratch = TMP_SALLOC_LIMBS (ITCH);
236 
237 	  if (2 * un >= 5 * vn)
238 	    {
239 	      mp_limb_t cy;
240 	      mp_ptr ws;
241 
242 	      /* The maximum ws usage is for the mpn_mul result.  */
243 	      ws = TMP_SALLOC_LIMBS (7 * vn >> 1);
244 
245 	      if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
246 		mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
247 	      else
248 		mpn_toom63_mul (prodp, up, 2 * vn, vp, vn, scratch);
249 	      un -= 2 * vn;
250 	      up += 2 * vn;
251 	      prodp += 2 * vn;
252 
253 	      while (2 * un >= 5 * vn)	/* un >= 2.5vn */
254 		{
255 		  if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
256 		    mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
257 		  else
258 		    mpn_toom63_mul (ws, up, 2 * vn, vp, vn, scratch);
259 		  un -= 2 * vn;
260 		  up += 2 * vn;
261 		  cy = mpn_add_n (prodp, prodp, ws, vn);
262 		  MPN_COPY (prodp + vn, ws + vn, 2 * vn);
263 		  mpn_incr_u (prodp + vn, cy);
264 		  prodp += 2 * vn;
265 		}
266 
267 	      /* vn / 2 <= un < 2.5vn */
268 
269 	      if (un < vn)
270 		mpn_mul (ws, vp, vn, up, un);
271 	      else
272 		mpn_mul (ws, up, un, vp, vn);
273 
274 	      cy = mpn_add_n (prodp, prodp, ws, vn);
275 	      MPN_COPY (prodp + vn, ws + vn, un);
276 	      mpn_incr_u (prodp + vn, cy);
277 	    }
278 	  else
279 	    {
280 	      if (6 * un < 7 * vn)
281 		mpn_toom33_mul (prodp, up, un, vp, vn, scratch);
282 	      else if (2 * un < 3 * vn)
283 		{
284 		  if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM43_THRESHOLD))
285 		    mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
286 		  else
287 		    mpn_toom43_mul (prodp, up, un, vp, vn, scratch);
288 		}
289 	      else if (6 * un < 11 * vn)
290 		{
291 		  if (4 * un < 7 * vn)
292 		    {
293 		      if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM53_THRESHOLD))
294 			mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
295 		      else
296 			mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
297 		    }
298 		  else
299 		    {
300 		      if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM53_THRESHOLD))
301 			mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
302 		      else
303 			mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
304 		    }
305 		}
306 	      else
307 		{
308 		  if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
309 		    mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
310 		  else
311 		    mpn_toom63_mul (prodp, up, un, vp, vn, scratch);
312 		}
313 	    }
314 	  TMP_SFREE;
315 	}
316       else
317 	{
318 	  mp_ptr scratch;
319 	  TMP_DECL; TMP_MARK;
320 
321 	  if (BELOW_THRESHOLD (vn, MUL_TOOM6H_THRESHOLD))
322 	    {
323 	      scratch = TMP_ALLOC_LIMBS (mpn_toom44_mul_itch (un, vn));
324 	      mpn_toom44_mul (prodp, up, un, vp, vn, scratch);
325 	    }
326 	  else if (BELOW_THRESHOLD (vn, MUL_TOOM8H_THRESHOLD))
327 	    {
328 	      scratch = TMP_ALLOC_LIMBS (mpn_toom6h_mul_itch (un, vn));
329 	      mpn_toom6h_mul (prodp, up, un, vp, vn, scratch);
330 	    }
331 	  else
332 	    {
333 	      scratch = TMP_ALLOC_LIMBS (mpn_toom8h_mul_itch (un, vn));
334 	      mpn_toom8h_mul (prodp, up, un, vp, vn, scratch);
335 	    }
336 	  TMP_FREE;
337 	}
338     }
339   else
340     {
341       if (un >= 8 * vn)
342 	{
343 	  mp_limb_t cy;
344 	  mp_ptr ws;
345 	  TMP_DECL; TMP_MARK;
346 
347 	  /* The maximum ws usage is for the mpn_mul result.  */
348 	  ws = TMP_BALLOC_LIMBS (9 * vn >> 1);
349 
350 	  mpn_fft_mul (prodp, up, 3 * vn, vp, vn);
351 	  un -= 3 * vn;
352 	  up += 3 * vn;
353 	  prodp += 3 * vn;
354 
355 	  while (2 * un >= 7 * vn)	/* un >= 3.5vn  */
356 	    {
357 	      mpn_fft_mul (ws, up, 3 * vn, vp, vn);
358 	      un -= 3 * vn;
359 	      up += 3 * vn;
360 	      cy = mpn_add_n (prodp, prodp, ws, vn);
361 	      MPN_COPY (prodp + vn, ws + vn, 3 * vn);
362 	      mpn_incr_u (prodp + vn, cy);
363 	      prodp += 3 * vn;
364 	    }
365 
366 	  /* vn / 2 <= un < 3.5vn */
367 
368 	  if (un < vn)
369 	    mpn_mul (ws, vp, vn, up, un);
370 	  else
371 	    mpn_mul (ws, up, un, vp, vn);
372 
373 	  cy = mpn_add_n (prodp, prodp, ws, vn);
374 	  MPN_COPY (prodp + vn, ws + vn, un);
375 	  mpn_incr_u (prodp + vn, cy);
376 
377 	  TMP_FREE;
378 	}
379       else
380 	mpn_fft_mul (prodp, up, un, vp, vn);
381     }
382 
383   return prodp[un + vn - 1];	/* historic */
384 }
385