xref: /dragonfly/contrib/gmp/mpn/generic/mul_n.c (revision dcd37f7d)
1 /* mpn_mul_n and helper function -- Multiply/square natural numbers.
2 
3    THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul_n) ARE
4    INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY SAFE TO REACH THEM THROUGH
5    DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE
6    OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
7 
8 Copyright 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
9 2005 Free Software Foundation, Inc.
10 
11 This file is part of the GNU MP Library.
12 
13 The GNU MP Library is free software; you can redistribute it and/or modify
14 it under the terms of the GNU Lesser General Public License as published by
15 the Free Software Foundation; either version 3 of the License, or (at your
16 option) any later version.
17 
18 The GNU MP Library is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
20 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
21 License for more details.
22 
23 You should have received a copy of the GNU Lesser General Public License
24 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
25 
26 #include "gmp.h"
27 #include "gmp-impl.h"
28 #include "longlong.h"
29 
30 
31 /* Multiplies using 3 half-sized mults and so on recursively.
32  * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
33  * No overlap of p[...] with a[...] or b[...].
34  * ws is workspace.
35  */
36 
37 void
38 mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
39 {
40   mp_limb_t w, w0, w1;
41   mp_size_t n2;
42   mp_srcptr x, y;
43   mp_size_t i;
44   int sign;
45 
46   n2 = n >> 1;
47   ASSERT (n2 > 0);
48 
49   if ((n & 1) != 0)
50     {
51       /* Odd length. */
52       mp_size_t n1, n3, nm1;
53 
54       n3 = n - n2;
55 
56       sign = 0;
57       w = a[n2];
58       if (w != 0)
59 	w -= mpn_sub_n (p, a, a + n3, n2);
60       else
61 	{
62 	  i = n2;
63 	  do
64 	    {
65 	      --i;
66 	      w0 = a[i];
67 	      w1 = a[n3 + i];
68 	    }
69 	  while (w0 == w1 && i != 0);
70 	  if (w0 < w1)
71 	    {
72 	      x = a + n3;
73 	      y = a;
74 	      sign = ~0;
75 	    }
76 	  else
77 	    {
78 	      x = a;
79 	      y = a + n3;
80 	    }
81 	  mpn_sub_n (p, x, y, n2);
82 	}
83       p[n2] = w;
84 
85       w = b[n2];
86       if (w != 0)
87 	w -= mpn_sub_n (p + n3, b, b + n3, n2);
88       else
89 	{
90 	  i = n2;
91 	  do
92 	    {
93 	      --i;
94 	      w0 = b[i];
95 	      w1 = b[n3 + i];
96 	    }
97 	  while (w0 == w1 && i != 0);
98 	  if (w0 < w1)
99 	    {
100 	      x = b + n3;
101 	      y = b;
102 	      sign = ~sign;
103 	    }
104 	  else
105 	    {
106 	      x = b;
107 	      y = b + n3;
108 	    }
109 	  mpn_sub_n (p + n3, x, y, n2);
110 	}
111       p[n] = w;
112 
113       n1 = n + 1;
114       if (n2 < MUL_KARATSUBA_THRESHOLD)
115 	{
116 	  if (n3 < MUL_KARATSUBA_THRESHOLD)
117 	    {
118 	      mpn_mul_basecase (ws, p, n3, p + n3, n3);
119 	      mpn_mul_basecase (p, a, n3, b, n3);
120 	    }
121 	  else
122 	    {
123 	      mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
124 	      mpn_kara_mul_n (p, a, b, n3, ws + n1);
125 	    }
126 	  mpn_mul_basecase (p + n1, a + n3, n2, b + n3, n2);
127 	}
128       else
129 	{
130 	  mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
131 	  mpn_kara_mul_n (p, a, b, n3, ws + n1);
132 	  mpn_kara_mul_n (p + n1, a + n3, b + n3, n2, ws + n1);
133 	}
134 
135       if (sign)
136 	mpn_add_n (ws, p, ws, n1);
137       else
138 	mpn_sub_n (ws, p, ws, n1);
139 
140       nm1 = n - 1;
141       if (mpn_add_n (ws, p + n1, ws, nm1))
142 	{
143 	  mp_limb_t x = (ws[nm1] + 1) & GMP_NUMB_MASK;
144 	  ws[nm1] = x;
145 	  if (x == 0)
146 	    ws[n] = (ws[n] + 1) & GMP_NUMB_MASK;
147 	}
148       if (mpn_add_n (p + n3, p + n3, ws, n1))
149 	{
150 	  mpn_incr_u (p + n1 + n3, 1);
151 	}
152     }
153   else
154     {
155       /* Even length. */
156       i = n2;
157       do
158 	{
159 	  --i;
160 	  w0 = a[i];
161 	  w1 = a[n2 + i];
162 	}
163       while (w0 == w1 && i != 0);
164       sign = 0;
165       if (w0 < w1)
166 	{
167 	  x = a + n2;
168 	  y = a;
169 	  sign = ~0;
170 	}
171       else
172 	{
173 	  x = a;
174 	  y = a + n2;
175 	}
176       mpn_sub_n (p, x, y, n2);
177 
178       i = n2;
179       do
180 	{
181 	  --i;
182 	  w0 = b[i];
183 	  w1 = b[n2 + i];
184 	}
185       while (w0 == w1 && i != 0);
186       if (w0 < w1)
187 	{
188 	  x = b + n2;
189 	  y = b;
190 	  sign = ~sign;
191 	}
192       else
193 	{
194 	  x = b;
195 	  y = b + n2;
196 	}
197       mpn_sub_n (p + n2, x, y, n2);
198 
199       /* Pointwise products. */
200       if (n2 < MUL_KARATSUBA_THRESHOLD)
201 	{
202 	  mpn_mul_basecase (ws, p, n2, p + n2, n2);
203 	  mpn_mul_basecase (p, a, n2, b, n2);
204 	  mpn_mul_basecase (p + n, a + n2, n2, b + n2, n2);
205 	}
206       else
207 	{
208 	  mpn_kara_mul_n (ws, p, p + n2, n2, ws + n);
209 	  mpn_kara_mul_n (p, a, b, n2, ws + n);
210 	  mpn_kara_mul_n (p + n, a + n2, b + n2, n2, ws + n);
211 	}
212 
213       /* Interpolate. */
214       if (sign)
215 	w = mpn_add_n (ws, p, ws, n);
216       else
217 	w = -mpn_sub_n (ws, p, ws, n);
218       w += mpn_add_n (ws, p + n, ws, n);
219       w += mpn_add_n (p + n2, p + n2, ws, n);
220       MPN_INCR_U (p + n2 + n, 2 * n - (n2 + n), w);
221     }
222 }
223 
224 void
225 mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
226 {
227   mp_limb_t w, w0, w1;
228   mp_size_t n2;
229   mp_srcptr x, y;
230   mp_size_t i;
231 
232   n2 = n >> 1;
233   ASSERT (n2 > 0);
234 
235   if ((n & 1) != 0)
236     {
237       /* Odd length. */
238       mp_size_t n1, n3, nm1;
239 
240       n3 = n - n2;
241 
242       w = a[n2];
243       if (w != 0)
244 	w -= mpn_sub_n (p, a, a + n3, n2);
245       else
246 	{
247 	  i = n2;
248 	  do
249 	    {
250 	      --i;
251 	      w0 = a[i];
252 	      w1 = a[n3 + i];
253 	    }
254 	  while (w0 == w1 && i != 0);
255 	  if (w0 < w1)
256 	    {
257 	      x = a + n3;
258 	      y = a;
259 	    }
260 	  else
261 	    {
262 	      x = a;
263 	      y = a + n3;
264 	    }
265 	  mpn_sub_n (p, x, y, n2);
266 	}
267       p[n2] = w;
268 
269       n1 = n + 1;
270 
271       /* n2 is always either n3 or n3-1 so maybe the two sets of tests here
272 	 could be combined.  But that's not important, since the tests will
273 	 take a miniscule amount of time compared to the function calls.  */
274       if (BELOW_THRESHOLD (n3, SQR_BASECASE_THRESHOLD))
275 	{
276 	  mpn_mul_basecase (ws, p, n3, p, n3);
277 	  mpn_mul_basecase (p,  a, n3, a, n3);
278 	}
279       else if (BELOW_THRESHOLD (n3, SQR_KARATSUBA_THRESHOLD))
280 	{
281 	  mpn_sqr_basecase (ws, p, n3);
282 	  mpn_sqr_basecase (p,  a, n3);
283 	}
284       else
285 	{
286 	  mpn_kara_sqr_n   (ws, p, n3, ws + n1);	 /* (x-y)^2 */
287 	  mpn_kara_sqr_n   (p,  a, n3, ws + n1);	 /* x^2	    */
288 	}
289       if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD))
290 	mpn_mul_basecase (p + n1, a + n3, n2, a + n3, n2);
291       else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD))
292 	mpn_sqr_basecase (p + n1, a + n3, n2);
293       else
294 	mpn_kara_sqr_n   (p + n1, a + n3, n2, ws + n1);	 /* y^2	    */
295 
296       /* Since x^2+y^2-(x-y)^2 = 2xy >= 0 there's no need to track the
297 	 borrow from mpn_sub_n.	 If it occurs then it'll be cancelled by a
298 	 carry from ws[n].  Further, since 2xy fits in n1 limbs there won't
299 	 be any carry out of ws[n] other than cancelling that borrow. */
300 
301       mpn_sub_n (ws, p, ws, n1);	     /* x^2-(x-y)^2 */
302 
303       nm1 = n - 1;
304       if (mpn_add_n (ws, p + n1, ws, nm1))   /* x^2+y^2-(x-y)^2 = 2xy */
305 	{
306 	  mp_limb_t x = (ws[nm1] + 1) & GMP_NUMB_MASK;
307 	  ws[nm1] = x;
308 	  if (x == 0)
309 	    ws[n] = (ws[n] + 1) & GMP_NUMB_MASK;
310 	}
311       if (mpn_add_n (p + n3, p + n3, ws, n1))
312 	{
313 	  mpn_incr_u (p + n1 + n3, 1);
314 	}
315     }
316   else
317     {
318       /* Even length. */
319       i = n2;
320       do
321 	{
322 	  --i;
323 	  w0 = a[i];
324 	  w1 = a[n2 + i];
325 	}
326       while (w0 == w1 && i != 0);
327       if (w0 < w1)
328 	{
329 	  x = a + n2;
330 	  y = a;
331 	}
332       else
333 	{
334 	  x = a;
335 	  y = a + n2;
336 	}
337       mpn_sub_n (p, x, y, n2);
338 
339       /* Pointwise products. */
340       if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD))
341 	{
342 	  mpn_mul_basecase (ws,    p,      n2, p,      n2);
343 	  mpn_mul_basecase (p,     a,      n2, a,      n2);
344 	  mpn_mul_basecase (p + n, a + n2, n2, a + n2, n2);
345 	}
346       else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD))
347 	{
348 	  mpn_sqr_basecase (ws,    p,      n2);
349 	  mpn_sqr_basecase (p,     a,      n2);
350 	  mpn_sqr_basecase (p + n, a + n2, n2);
351 	}
352       else
353 	{
354 	  mpn_kara_sqr_n (ws,    p,      n2, ws + n);
355 	  mpn_kara_sqr_n (p,     a,      n2, ws + n);
356 	  mpn_kara_sqr_n (p + n, a + n2, n2, ws + n);
357 	}
358 
359       /* Interpolate. */
360       w = -mpn_sub_n (ws, p, ws, n);
361       w += mpn_add_n (ws, p + n, ws, n);
362       w += mpn_add_n (p + n2, p + n2, ws, n);
363       MPN_INCR_U (p + n2 + n, 2 * n - (n2 + n), w);
364     }
365 }
366 
367 /******************************************************************************
368  *                                                                            *
369  *              Toom 3-way multiplication and squaring                        *
370  *                                                                            *
371  *****************************************************************************/
372 
373 /* Starts from:
374    {v0,2k}    (stored in {c,2k})
375    {vm1,2k+1} (which sign is sa, and absolute value is stored in {vm1,2k+1})
376    {v1,2k+1}  (stored in {c+2k,2k+1})
377    {v2,2k+1}
378    {vinf,twor}  (stored in {c+4k,twor}, except the first limb, saved in vinf0)
379 
380    ws is temporary space, and should have at least twor limbs.
381 
382    put in {c, 2n} where n = 2k+twor the value of {v0,2k} (already in place)
383    + B^k * {tm1, 2k+1}
384    + B^(2k) * {t1, 2k+1}
385    + B^(3k) * {t2, 2k+1}
386    + B^(4k) * {vinf,twor} (high twor-1 limbs already in place)
387    where {t1, 2k+1} = ({v1, 2k+1} + sa * {vm1, 2k+1}- 2*{v0,2k})/2-*{vinf,twor}
388 	 {t2, 2k+1} = (3*({v1,2k+1}-{v0,2k})-sa*{vm1,2k+1}+{v2,2k+1})/6-2*{vinf,twor}
389 	 {tm1,2k+1} = ({v1,2k+1}-sa*{vm1,2k+1}/2-{t2,2k+1}
390 
391    Exact sequence described in a comment in mpn_toom3_mul_n.
392    mpn_toom3_mul_n() and mpn_toom3_sqr_n() implement steps 1-2.
393    mpn_toom_interpolate_5pts() implements steps 3-4.
394 
395    Reference: What About Toom-Cook Matrices Optimality? Marco Bodrato
396    and Alberto Zanoni, October 19, 2006, http://bodrato.it/papers/#CIVV2006
397 
398    ************* saved note ****************
399    Think about:
400 
401    The evaluated point a-b+c stands a good chance of having a zero carry
402    limb, a+b+c would have a 1/4 chance, and 4*a+2*b+c a 1/8 chance, roughly.
403    Perhaps this could be tested and stripped.  Doing so before recursing
404    would be better than stripping at the start of mpn_toom3_mul_n/sqr_n,
405    since then the recursion could be based on the new size.  Although in
406    truth the kara vs toom3 crossover is never so exact that one limb either
407    way makes a difference.
408 
409    A small value like 1 or 2 for the carry could perhaps also be handled
410    with an add_n or addlsh1_n.  Would that be faster than an extra limb on a
411    (recursed) multiply/square?
412 */
413 
414 #define TOOM3_MUL_REC(p, a, b, n, ws) \
415   do {								\
416     if (MUL_TOOM3_THRESHOLD / 3 < MUL_KARATSUBA_THRESHOLD	\
417 	&& BELOW_THRESHOLD (n, MUL_KARATSUBA_THRESHOLD))	\
418       mpn_mul_basecase (p, a, n, b, n);				\
419     else if (BELOW_THRESHOLD (n, MUL_TOOM3_THRESHOLD))		\
420       mpn_kara_mul_n (p, a, b, n, ws);				\
421     else							\
422       mpn_toom3_mul_n (p, a, b, n, ws);				\
423   } while (0)
424 
425 #define TOOM3_SQR_REC(p, a, n, ws)				\
426   do {								\
427     if (SQR_TOOM3_THRESHOLD / 3 < SQR_BASECASE_THRESHOLD	\
428 	&& BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))		\
429       mpn_mul_basecase (p, a, n, a, n);				\
430     else if (SQR_TOOM3_THRESHOLD / 3 < SQR_KARATSUBA_THRESHOLD	\
431 	&& BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD))	\
432       mpn_sqr_basecase (p, a, n);				\
433     else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))		\
434       mpn_kara_sqr_n (p, a, n, ws);				\
435     else							\
436       mpn_toom3_sqr_n (p, a, n, ws);				\
437   } while (0)
438 
439 /* The necessary temporary space T(n) satisfies T(n)=0 for n < THRESHOLD,
440    and T(n) <= max(2n+2, 6k+3, 4k+3+T(k+1)) otherwise, where k = ceil(n/3).
441 
442    Assuming T(n) >= 2n, 6k+3 <= 4k+3+T(k+1).
443    Similarly, 2n+2 <= 6k+2 <= 4k+3+T(k+1).
444 
445    With T(n) = 2n+S(n), this simplifies to S(n) <= 9 + S(k+1).
446    Since THRESHOLD >= 17, we have n/(k+1) >= 19/8
447    thus S(n) <= S(n/(19/8)) + 9 thus S(n) <= 9*log(n)/log(19/8) <= 8*log2(n).
448 */
449 
450 void
451 mpn_toom3_mul_n (mp_ptr c, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr t)
452 {
453   mp_size_t k, k1, kk1, r, twok, twor;
454   mp_limb_t cy, cc, saved, vinf0;
455   mp_ptr trec;
456   int sa, sb;
457   mp_ptr c1, c2, c3, c4, c5;
458 
459   ASSERT(GMP_NUMB_BITS >= 6);
460   ASSERT(n >= 17); /* so that r <> 0 and 5k+3 <= 2n */
461 
462   /*
463   The algorithm is the following:
464 
465   0. k = ceil(n/3), r = n - 2k, B = 2^(GMP_NUMB_BITS), t = B^k
466   1. split a and b in three parts each a0, a1, a2 and b0, b1, b2
467      with a0, a1, b0, b1 of k limbs, and a2, b2 of r limbs
468   2. Evaluation: vm1 may be negative, the other can not.
469      v0   <- a0*b0
470      v1   <- (a0+a1+a2)*(b0+b1+b2)
471      v2   <- (a0+2*a1+4*a2)*(b0+2*b1+4*b2)
472      vm1  <- (a0-a1+a2)*(b0-b1+b2)
473      vinf <- a2*b2
474   3. Interpolation: every result is positive, all divisions are exact
475      t2   <- (v2 - vm1)/3
476      tm1  <- (v1 - vm1)/2
477      t1   <- (v1 - v0)
478      t2   <- (t2 - t1)/2
479      t1   <- (t1 - tm1 - vinf)
480      t2   <- (t2 - 2*vinf)
481      tm1  <- (tm1 - t2)
482   4. result is c0+c1*t+c2*t^2+c3*t^3+c4*t^4 where
483      c0   <- v0
484      c1   <- tm1
485      c2   <- t1
486      c3   <- t2
487      c4   <- vinf
488   */
489 
490   k = (n + 2) / 3; /* ceil(n/3) */
491   twok = 2 * k;
492   k1 = k + 1;
493   kk1 = k + k1;
494   r = n - twok;   /* last chunk */
495   twor = 2 * r;
496 
497   c1 = c + k;
498   c2 = c1 + k;
499   c3 = c2 + k;
500   c4 = c3 + k;
501   c5 = c4 + k;
502 
503   trec = t + 4 * k + 3; /* trec = v2 + (2k+2) */
504 
505   /* put a0+a2 in {c, k+1}, and b0+b2 in {c+4k+2, k+1};
506      put a0+a1+a2 in {t, k+1} and b0+b1+b2 in {t+k+1,k+1}
507      [????requires 5k+3 <= 2n, ie. n >= 9] */
508   cy = mpn_add_n (c,      a, a + twok, r);
509   cc = mpn_add_n (c4 + 2, b, b + twok, r);
510   if (r < k)
511     {
512       __GMPN_ADD_1 (cy, c + r,      a + r, k - r, cy);
513       __GMPN_ADD_1 (cc, c4 + 2 + r, b + r, k - r, cc);
514     }
515 
516   /* Put in {t, k+1} the sum
517    * (a_0+a_2) - stored in {c, k+1} -
518    * +
519    * a_1       - stored in {a+k, k} */
520   t[k] = (c1[0] = cy) + mpn_add_n (t, c, a + k, k);
521   /*          ^              ^
522    * carry of a_0 + a_2    carry of (a_0+a_2) + a_1
523 
524    */
525 
526   /* Put in {t+k+1, k+1} the sum of the two values
527    * (b_0+b_2) - stored in {c1+1, k+1} -
528    * +
529    * b_1       - stored in {b+k, k} */
530   t[kk1] = (c5[3] = cc) + mpn_add_n (t + k1, c4 + 2, b + k, k);
531   /*          ^              ^
532    * carry of b_0 + b_2    carry of (b_0+b_2) + b_1 */
533 
534 #define v2 (t+2*k+1)
535 
536   /* compute v1 := (a0+a1+a2)*(b0+b1+b2) in {t, 2k+1};
537      since v1 < 9*B^(2k), v1 uses only 2k+1 words if GMP_NUMB_BITS >= 4 */
538   TOOM3_MUL_REC (c2, t, t + k1, k1, trec);
539 
540   /*   c         c2    c4                 t
541      {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
542 		 v1                                            */
543 
544   /* put |a0-a1+a2| in {c, k+1} and |b0-b1+b2| in {c+4k+2,k+1} */
545   /* (They're already there, actually)                         */
546 
547   /* sa = sign(a0-a1+a2) */
548   sa   = (cy != 0) ? 1 : mpn_cmp (c, a + k, k);
549   c[k] = (sa >= 0) ? cy - mpn_sub_n (c, c, a + k, k)
550 		   : mpn_sub_n (c, a + k, c, k);
551 
552   sb    = (cc != 0) ? 1 : mpn_cmp (c4 + 2, b + k, k);
553   c5[2] = (sb >= 0) ? cc - mpn_sub_n (c4 + 2, c4 + 2, b + k, k)
554 		    : mpn_sub_n (c4 + 2, b + k, c4 + 2, k);
555   sa *= sb; /* sign of vm1 */
556 
557   /* compute vm1 := (a0-a1+a2)*(b0-b1+b2) in {t, 2k+1};
558      since |vm1| < 4*B^(2k), vm1 uses only 2k+1 limbs */
559   TOOM3_MUL_REC (t, c, c4 + 2, k1, trec);
560 
561   /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
562 		v1                      vm1
563   */
564 
565   /* compute a0+2a1+4a2 in {c, k+1} and b0+2b1+4b2 in {c+4k+2, k+1}
566      [requires 5k+3 <= 2n, i.e. n >= 17] */
567 #ifdef HAVE_NATIVE_mpn_addlsh1_n
568   c1[0] = mpn_addlsh1_n (c, a + k, a + twok, r);
569   c5[2] = mpn_addlsh1_n (c4 + 2, b + k, b + twok, r);
570   if (r < k)
571     {
572       __GMPN_ADD_1 (c1[0], c + r, a + k + r, k - r, c1[0]);
573       __GMPN_ADD_1 (c5[2], c4 + 2 + r, b + k + r, k - r, c5[2]);
574     }
575   c1[0] = 2 * c1[0] + mpn_addlsh1_n (c, a, c, k);
576   c5[2] = 2 * c5[2] + mpn_addlsh1_n (c4 + 2, b, c4 + 2, k);
577 #else
578   c[r] = mpn_lshift (c, a + twok, r, 1);
579   c4[r + 2] = mpn_lshift (c4 + 2, b + twok, r, 1);
580   if (r < k)
581     {
582       MPN_ZERO(c + r + 1, k - r);
583       MPN_ZERO(c4 + r + 3, k - r);
584     }
585   c1[0] += mpn_add_n (c, c, a + k, k);
586   c5[2] += mpn_add_n (c4 + 2, c4 + 2, b + k, k);
587   mpn_lshift (c, c, k1, 1);
588   mpn_lshift (c4 + 2, c4 + 2, k1, 1);
589   c1[0] += mpn_add_n (c, c, a, k);
590   c5[2] += mpn_add_n (c4 + 2, c4 + 2, b, k);
591 #endif
592 
593   /* compute v2 := (a0+2a1+4a2)*(b0+2b1+4b2) in {t+2k+1, 2k+1}
594      v2 < 49*B^k so v2 uses at most 2k+1 limbs if GMP_NUMB_BITS >= 6 */
595   TOOM3_MUL_REC (v2, c, c4 + 2, k1, trec);
596 
597   /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
598 		v1                      vm1         v2
599   */
600 
601   /* compute v0 := a0*b0 in {c, 2k} */
602   TOOM3_MUL_REC (c, a, b, k, trec);
603 
604   /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
605        v0       v1                      vm1       v2                   */
606 
607   /* compute vinf := a2*b2 in {t+4k+2, 2r}: in {c4, 2r} */
608 
609   saved = c4[0];              /* Remember v1's highest byte (will be overwritten). */
610   TOOM3_MUL_REC (c4, a + twok, b + twok, r, trec);           /* Overwrites c4[0].  */
611   vinf0 = c4[0];              /* Remember vinf's lowest byte (will be overwritten).*/
612   c4[0] = saved;              /* Overwriting. Now v1 value is correct.             */
613 
614   /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
615        v0       v1       vinf[1..]      vm1       v2               */
616 
617   mpn_toom_interpolate_5pts (c, v2, t, k, 2*r, sa, vinf0, trec);
618 
619 #undef v2
620 }
621 
622 void
623 mpn_toom3_sqr_n (mp_ptr c, mp_srcptr a, mp_size_t n, mp_ptr t)
624 {
625   mp_size_t k, k1, kk1, r, twok, twor;
626   mp_limb_t cy, saved, vinf0;
627   mp_ptr trec;
628   int sa;
629   mp_ptr c1, c2, c3, c4;
630 
631   ASSERT(GMP_NUMB_BITS >= 6);
632   ASSERT(n >= 17); /* so that r <> 0 and 5k+3 <= 2n */
633 
634   /* the algorithm is the same as mpn_toom3_mul_n, with b=a */
635 
636   k = (n + 2) / 3; /* ceil(n/3) */
637   twok = 2 * k;
638   k1 = k + 1;
639   kk1 = k + k1;
640   r = n - twok;   /* last chunk */
641   twor = 2 * r;
642 
643   c1 = c + k;
644   c2 = c1 + k;
645   c3 = c2 + k;
646   c4 = c3 + k;
647 
648   trec = t + 4 * k + 3; /* trec = v2 + (2k+2) */
649 
650   cy = mpn_add_n (c, a, a + twok, r);
651   if (r < k)
652     __GMPN_ADD_1 (cy, c + r, a + r, k - r, cy);
653   t[k] = (c1[0] = cy) + mpn_add_n (t, c, a + k, k);
654 
655 #define v2 (t+2*k+1)
656 
657   TOOM3_SQR_REC (c2, t, k1, trec);
658 
659   sa = (cy != 0) ? 1 : mpn_cmp (c, a + k, k);
660   c[k] = (sa >= 0) ? cy - mpn_sub_n (c, c, a + k, k)
661     : mpn_sub_n (c, a + k, c, k);
662 
663   TOOM3_SQR_REC (t, c, k1, trec);
664 
665 #ifdef HAVE_NATIVE_mpn_addlsh1_n
666   c1[0] = mpn_addlsh1_n (c, a + k, a + twok, r);
667   if (r < k)
668     __GMPN_ADD_1 (c1[0], c + r, a + k + r, k - r, c1[0]);
669   c1[0] = 2 * c1[0] + mpn_addlsh1_n (c, a, c, k);
670 #else
671   c[r] = mpn_lshift (c, a + twok, r, 1);
672   if (r < k)
673     MPN_ZERO(c + r + 1, k - r);
674   c1[0] += mpn_add_n (c, c, a + k, k);
675   mpn_lshift (c, c, k1, 1);
676   c1[0] += mpn_add_n (c, c, a, k);
677 #endif
678 
679   TOOM3_SQR_REC (v2, c, k1, trec);
680 
681   TOOM3_SQR_REC (c, a, k, trec);
682 
683   saved = c4[0];
684   TOOM3_SQR_REC (c4, a + twok, r, trec);
685   vinf0 = c4[0];
686   c4[0] = saved;
687 
688   mpn_toom_interpolate_5pts (c, v2, t, k, 2*r,  1, vinf0, trec);
689 
690 #undef v2
691 }
692 
693 void
694 mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n)
695 {
696   ASSERT (n >= 1);
697   ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n));
698   ASSERT (! MPN_OVERLAP_P (p, 2 * n, b, n));
699 
700   if (BELOW_THRESHOLD (n, MUL_KARATSUBA_THRESHOLD))
701     {
702       mpn_mul_basecase (p, a, n, b, n);
703     }
704   else if (BELOW_THRESHOLD (n, MUL_TOOM3_THRESHOLD))
705     {
706       /* Allocate workspace of fixed size on stack: fast! */
707       mp_limb_t ws[MPN_KARA_MUL_N_TSIZE (MUL_TOOM3_THRESHOLD_LIMIT-1)];
708       ASSERT (MUL_TOOM3_THRESHOLD <= MUL_TOOM3_THRESHOLD_LIMIT);
709       mpn_kara_mul_n (p, a, b, n, ws);
710     }
711   else if (BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD))
712     {
713       mp_ptr ws;
714       TMP_SDECL;
715       TMP_SMARK;
716       ws = TMP_SALLOC_LIMBS (MPN_TOOM3_MUL_N_TSIZE (n));
717       mpn_toom3_mul_n (p, a, b, n, ws);
718       TMP_SFREE;
719     }
720 #if WANT_FFT || TUNE_PROGRAM_BUILD
721   else if (BELOW_THRESHOLD (n, MUL_FFT_THRESHOLD))
722 #else
723   else if (BELOW_THRESHOLD (n, MPN_TOOM44_MAX_N))
724 #endif
725     {
726       mp_ptr ws;
727       TMP_SDECL;
728       TMP_SMARK;
729       ws = TMP_SALLOC_LIMBS (mpn_toom44_mul_itch (n, n));
730       mpn_toom44_mul (p, a, n, b, n, ws);
731       TMP_SFREE;
732     }
733   else
734 #if WANT_FFT || TUNE_PROGRAM_BUILD
735     {
736       /* The current FFT code allocates its own space.  That should probably
737 	 change.  */
738       mpn_mul_fft_full (p, a, n, b, n);
739     }
740 #else
741     {
742       /* Toom4 for large operands.  */
743       mp_ptr ws;
744       TMP_DECL;
745       TMP_MARK;
746       ws = TMP_BALLOC_LIMBS (mpn_toom44_mul_itch (n, n));
747       mpn_toom44_mul (p, a, n, b, n, ws);
748       TMP_FREE;
749     }
750 #endif
751 }
752 
753 void
754 mpn_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n)
755 {
756   ASSERT (n >= 1);
757   ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n));
758 
759 #if 0
760   /* FIXME: Can this be removed? */
761   if (n == 0)
762     return;
763 #endif
764 
765   if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
766     { /* mul_basecase is faster than sqr_basecase on small sizes sometimes */
767       mpn_mul_basecase (p, a, n, a, n);
768     }
769   else if (BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD))
770     {
771       mpn_sqr_basecase (p, a, n);
772     }
773   else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))
774     {
775       /* Allocate workspace of fixed size on stack: fast! */
776       mp_limb_t ws[MPN_KARA_SQR_N_TSIZE (SQR_TOOM3_THRESHOLD_LIMIT-1)];
777       ASSERT (SQR_TOOM3_THRESHOLD <= SQR_TOOM3_THRESHOLD_LIMIT);
778       mpn_kara_sqr_n (p, a, n, ws);
779     }
780   else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD))
781     {
782       mp_ptr ws;
783       TMP_SDECL;
784       TMP_SMARK;
785       ws = TMP_SALLOC_LIMBS (MPN_TOOM3_SQR_N_TSIZE (n));
786       mpn_toom3_sqr_n (p, a, n, ws);
787       TMP_SFREE;
788     }
789 #if WANT_FFT || TUNE_PROGRAM_BUILD
790   else if (BELOW_THRESHOLD (n, SQR_FFT_THRESHOLD))
791 #else
792   else if (BELOW_THRESHOLD (n, MPN_TOOM44_MAX_N))
793 #endif
794     {
795       mp_ptr ws;
796       TMP_SDECL;
797       TMP_SMARK;
798       ws = TMP_SALLOC_LIMBS (mpn_toom4_sqr_itch (n));
799       mpn_toom4_sqr (p, a, n, ws);
800       TMP_SFREE;
801     }
802   else
803 #if WANT_FFT || TUNE_PROGRAM_BUILD
804     {
805       /* The current FFT code allocates its own space.  That should probably
806 	 change.  */
807       mpn_mul_fft_full (p, a, n, a, n);
808     }
809 #else
810     {
811       /* Toom4 for large operands.  */
812       mp_ptr ws;
813       TMP_DECL;
814       TMP_MARK;
815       ws = TMP_BALLOC_LIMBS (mpn_toom4_sqr_itch (n));
816       mpn_toom4_sqr (p, a, n, ws);
817       TMP_FREE;
818     }
819 #endif
820 }
821