14a1767b4Smrg /* mpn_toom42_mul -- Multiply {ap,an} and {bp,bn} where an is nominally twice
24a1767b4Smrg    as large as bn.  Or more accurately, (3/2)bn < an < 4bn.
34a1767b4Smrg 
44a1767b4Smrg    Contributed to the GNU project by Torbjorn Granlund.
54a1767b4Smrg    Additional improvements by Marco Bodrato.
64a1767b4Smrg 
74a1767b4Smrg    The idea of applying toom to unbalanced multiplication is due to Marco
84a1767b4Smrg    Bodrato and Alberto Zanoni.
94a1767b4Smrg 
104a1767b4Smrg    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
114a1767b4Smrg    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
124a1767b4Smrg    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
134a1767b4Smrg 
14*f81b1c5bSmrg Copyright 2006-2008, 2012, 2014 Free Software Foundation, Inc.
154a1767b4Smrg 
164a1767b4Smrg This file is part of the GNU MP Library.
174a1767b4Smrg 
184a1767b4Smrg The GNU MP Library is free software; you can redistribute it and/or modify
19*f81b1c5bSmrg it under the terms of either:
20*f81b1c5bSmrg 
21*f81b1c5bSmrg   * the GNU Lesser General Public License as published by the Free
22*f81b1c5bSmrg     Software Foundation; either version 3 of the License, or (at your
234a1767b4Smrg     option) any later version.
244a1767b4Smrg 
25*f81b1c5bSmrg or
26*f81b1c5bSmrg 
27*f81b1c5bSmrg   * the GNU General Public License as published by the Free Software
28*f81b1c5bSmrg     Foundation; either version 2 of the License, or (at your option) any
29*f81b1c5bSmrg     later version.
30*f81b1c5bSmrg 
31*f81b1c5bSmrg or both in parallel, as here.
32*f81b1c5bSmrg 
334a1767b4Smrg The GNU MP Library is distributed in the hope that it will be useful, but
344a1767b4Smrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
35*f81b1c5bSmrg or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
36*f81b1c5bSmrg for more details.
374a1767b4Smrg 
38*f81b1c5bSmrg You should have received copies of the GNU General Public License and the
39*f81b1c5bSmrg GNU Lesser General Public License along with the GNU MP Library.  If not,
40*f81b1c5bSmrg see https://www.gnu.org/licenses/.  */
414a1767b4Smrg 
424a1767b4Smrg 
434a1767b4Smrg #include "gmp-impl.h"
444a1767b4Smrg 
454a1767b4Smrg /* Evaluate in: -1, 0, +1, +2, +inf
464a1767b4Smrg 
474a1767b4Smrg   <-s-><--n--><--n--><--n-->
484a1767b4Smrg    ___ ______ ______ ______
494a1767b4Smrg   |a3_|___a2_|___a1_|___a0_|
504a1767b4Smrg 	       |_b1_|___b0_|
514a1767b4Smrg 	       <-t--><--n-->
524a1767b4Smrg 
534a1767b4Smrg   v0  =  a0             * b0      #   A(0)*B(0)
544a1767b4Smrg   v1  = (a0+ a1+ a2+ a3)*(b0+ b1) #   A(1)*B(1)      ah  <= 3  bh <= 1
554a1767b4Smrg   vm1 = (a0- a1+ a2- a3)*(b0- b1) #  A(-1)*B(-1)    |ah| <= 1  bh  = 0
564a1767b4Smrg   v2  = (a0+2a1+4a2+8a3)*(b0+2b1) #   A(2)*B(2)      ah  <= 14 bh <= 2
574a1767b4Smrg   vinf=              a3 *     b1  # A(inf)*B(inf)
584a1767b4Smrg */
594a1767b4Smrg 
/* Recursive balanced multiply: {p, 2n} = {a, n} * {b, n}.  The ws
   (workspace) argument is currently ignored -- it is accepted so call
   sites need not change if this is later retargeted to a primitive
   that requires scratch space.  */
#define TOOM42_MUL_N_REC(p, a, b, n, ws)				\
  do {									\
    mpn_mul_n (p, a, b, n);						\
  } while (0)
644a1767b4Smrg 
void
mpn_toom42_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn,
		mp_ptr scratch)
{
  mp_size_t n, s, t;		/* n = block size; s = size of a3; t = size of b1 */
  int vm1_neg;			/* nonzero iff vm1 = A(-1)*B(-1) is negative */
  mp_limb_t cy, vinf0;
  mp_ptr a0_a2;			/* scratch for the dgr3 +/-1 evaluation (aliases pp) */
  mp_ptr as1, asm1, as2;	/* A evaluated at +1, -1, +2; n+1 limbs each */
  mp_ptr bs1, bsm1, bs2;	/* B evaluated at +1, -1, +2 */
  mp_ptr tmp;
  TMP_DECL;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)
#define b0  bp
#define b1  (bp + n)

  /* Block size: ceil(an/4) when A dominates, else ceil(bn/2), so that
     A splits into 3 full blocks plus a partial a3, and B into one full
     block plus a partial b1.  */
  n = an >= 2 * bn ? (an + 3) >> 2 : (bn + 1) >> 1;

  s = an - 3 * n;
  t = bn - n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  TMP_MARK;

  /* Carve six evaluation buffers out of one allocation; bsm1 needs only
     n limbs since |B(-1)| fits in n limbs (bh = 0, see header comment).  */
  tmp = TMP_ALLOC_LIMBS (6 * n + 5);
  as1  = tmp; tmp += n + 1;
  asm1 = tmp; tmp += n + 1;
  as2  = tmp; tmp += n + 1;
  bs1  = tmp; tmp += n + 1;
  bsm1 = tmp; tmp += n;
  bs2  = tmp; tmp += n + 1;

  a0_a2 = pp;			/* pp is free at this point; reuse it as scratch */

  /* Compute as1 and asm1.  Bit 0 of the return flags a negative A(-1).  */
  vm1_neg = mpn_toom_eval_dgr3_pm1 (as1, asm1, ap, n, s, a0_a2) & 1;

  /* Compute as2 = A(2) = a0 + 2 a1 + 4 a2 + 8 a3 by Horner:
     ((a3*2 + a2)*2 + a1)*2 + a0, doubling the running carry in step.  */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (as2, a2, a3, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a2 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
#else
  cy  = mpn_lshift (as2, a3, s, 1);
  cy += mpn_add_n (as2, a2, as2, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a2 + s, n - s, cy);
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy += mpn_add_n (as2, a1, as2, n);
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy += mpn_add_n (as2, a0, as2, n);
#endif
  as2[n] = cy;

  /* Compute bs1 = B(1) = b0 + b1 and bsm1 = |B(-1)| = |b0 - b1|,
     flipping vm1_neg when b0 < b1.  */
  if (t == n)
    {
#if HAVE_NATIVE_mpn_add_n_sub_n
      if (mpn_cmp (b0, b1, n) < 0)
	{
	  cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n);
	  vm1_neg ^= 1;
	}
      else
	{
	  cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n);
	}
      bs1[n] = cy >> 1;		/* add carry is in bit 1 of the return value */
#else
      bs1[n] = mpn_add_n (bs1, b0, b1, n);

      if (mpn_cmp (b0, b1, n) < 0)
	{
	  mpn_sub_n (bsm1, b1, b0, n);
	  vm1_neg ^= 1;
	}
      else
	{
	  mpn_sub_n (bsm1, b0, b1, n);
	}
#endif
    }
  else
    {
      /* b1 is shorter than b0 (t < n); b0 < b1 is possible only when
	 b0's high n-t limbs are all zero.  */
      bs1[n] = mpn_add (bs1, b0, n, b1, t);

      if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
	{
	  mpn_sub_n (bsm1, b1, b0, t);
	  MPN_ZERO (bsm1 + t, n - t);
	  vm1_neg ^= 1;
	}
      else
	{
	  mpn_sub (bsm1, b0, n, b1, t);
	}
    }

  /* Compute bs2, recycling bs1. bs2=bs1+b1  */
  mpn_add (bs2, bs1, n + 1, b1, t);

  /* High-limb bounds; see the evaluation table in the header comment.  */
  ASSERT (as1[n] <= 3);
  ASSERT (bs1[n] <= 1);
  ASSERT (asm1[n] <= 1);
/*ASSERT (bsm1[n] == 0);*/
  ASSERT (as2[n] <= 14);
  ASSERT (bs2[n] <= 2);

  /* Product-point layout: v0, v1, vinf live in pp (v1 overlaps vinf's
     first limb); vm1 and v2 live in caller-supplied scratch.  */
#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 4 * n)			/* s+t */
#define vm1   scratch				/* 2n+1 */
#define v2    (scratch + 2 * n + 1)		/* 2n+2 */
#define scratch_out	scratch + 4 * n + 4	/* Currently unused. */

  /* vm1, 2n+1 limbs */
  TOOM42_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
  cy = 0;
  if (asm1[n] != 0)
    cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n);	/* add asm1[n] * bsm1 = bsm1 */
  vm1[2 * n] = cy;

  TOOM42_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out);	/* v2, 2n+1 limbs */

  /* vinf, s+t limbs; mpn_mul requires first operand not shorter */
  if (s > t)  mpn_mul (vinf, a3, s, b1, t);
  else        mpn_mul (vinf, b1, t, a3, s);

  vinf0 = vinf[0];				/* v1 overlaps with this */

  /* v1, 2n+1 limbs: low 2n limbs from the n-limb product, then add the
     cross terms as1[n]*bs1 and bs1[n]*as1 plus the top-limb product.  */
  TOOM42_MUL_N_REC (v1, as1, bs1, n, scratch_out);
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else if (as1[n] == 3)
    {
      cy = 3 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(3));
    }
  else
    cy = 0;
  if (bs1[n] != 0)
    cy += mpn_add_n (v1 + n, v1 + n, as1, n);
  v1[2 * n] = cy;

  /* v0 last, since a0_a2 (== pp) was used as scratch above.  */
  TOOM42_MUL_N_REC (v0, ap, bp, n, scratch_out);	/* v0, 2n limbs */

  /* Recombine the five point values into the 4n+... limb product.  */
  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);

  TMP_FREE;
}
235