14a1767b4Smrg /* mpn_toom42_mul -- Multiply {ap,an} and {bp,bn} where an is nominally twice
24a1767b4Smrg as large as bn. Or more accurately, (3/2)bn < an < 4bn.
34a1767b4Smrg
44a1767b4Smrg Contributed to the GNU project by Torbjorn Granlund.
54a1767b4Smrg Additional improvements by Marco Bodrato.
64a1767b4Smrg
74a1767b4Smrg The idea of applying toom to unbalanced multiplication is due to Marco
84a1767b4Smrg Bodrato and Alberto Zanoni.
94a1767b4Smrg
104a1767b4Smrg THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
114a1767b4Smrg SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
124a1767b4Smrg GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
134a1767b4Smrg
14*f81b1c5bSmrg Copyright 2006-2008, 2012, 2014 Free Software Foundation, Inc.
154a1767b4Smrg
164a1767b4Smrg This file is part of the GNU MP Library.
174a1767b4Smrg
184a1767b4Smrg The GNU MP Library is free software; you can redistribute it and/or modify
19*f81b1c5bSmrg it under the terms of either:
20*f81b1c5bSmrg
21*f81b1c5bSmrg * the GNU Lesser General Public License as published by the Free
22*f81b1c5bSmrg Software Foundation; either version 3 of the License, or (at your
234a1767b4Smrg option) any later version.
244a1767b4Smrg
25*f81b1c5bSmrg or
26*f81b1c5bSmrg
27*f81b1c5bSmrg * the GNU General Public License as published by the Free Software
28*f81b1c5bSmrg Foundation; either version 2 of the License, or (at your option) any
29*f81b1c5bSmrg later version.
30*f81b1c5bSmrg
31*f81b1c5bSmrg or both in parallel, as here.
32*f81b1c5bSmrg
334a1767b4Smrg The GNU MP Library is distributed in the hope that it will be useful, but
344a1767b4Smrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
35*f81b1c5bSmrg or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
36*f81b1c5bSmrg for more details.
374a1767b4Smrg
38*f81b1c5bSmrg You should have received copies of the GNU General Public License and the
39*f81b1c5bSmrg GNU Lesser General Public License along with the GNU MP Library. If not,
40*f81b1c5bSmrg see https://www.gnu.org/licenses/. */
414a1767b4Smrg
424a1767b4Smrg
434a1767b4Smrg #include "gmp-impl.h"
444a1767b4Smrg
454a1767b4Smrg /* Evaluate in: -1, 0, +1, +2, +inf
464a1767b4Smrg
474a1767b4Smrg <-s-><--n--><--n--><--n-->
484a1767b4Smrg ___ ______ ______ ______
494a1767b4Smrg |a3_|___a2_|___a1_|___a0_|
504a1767b4Smrg |_b1_|___b0_|
514a1767b4Smrg <-t--><--n-->
524a1767b4Smrg
534a1767b4Smrg v0 = a0 * b0 # A(0)*B(0)
544a1767b4Smrg v1 = (a0+ a1+ a2+ a3)*(b0+ b1) # A(1)*B(1) ah <= 3 bh <= 1
554a1767b4Smrg vm1 = (a0- a1+ a2- a3)*(b0- b1) # A(-1)*B(-1) |ah| <= 1 bh = 0
564a1767b4Smrg v2 = (a0+2a1+4a2+8a3)*(b0+2b1) # A(2)*B(2) ah <= 14 bh <= 2
574a1767b4Smrg vinf= a3 * b1 # A(inf)*B(inf)
584a1767b4Smrg */
594a1767b4Smrg
604a1767b4Smrg #define TOOM42_MUL_N_REC(p, a, b, n, ws) \
614a1767b4Smrg do { \
624a1767b4Smrg mpn_mul_n (p, a, b, n); \
634a1767b4Smrg } while (0)
644a1767b4Smrg
654a1767b4Smrg void
mpn_toom42_mul(mp_ptr pp,mp_srcptr ap,mp_size_t an,mp_srcptr bp,mp_size_t bn,mp_ptr scratch)664a1767b4Smrg mpn_toom42_mul (mp_ptr pp,
674a1767b4Smrg mp_srcptr ap, mp_size_t an,
684a1767b4Smrg mp_srcptr bp, mp_size_t bn,
694a1767b4Smrg mp_ptr scratch)
704a1767b4Smrg {
714a1767b4Smrg mp_size_t n, s, t;
724a1767b4Smrg int vm1_neg;
734a1767b4Smrg mp_limb_t cy, vinf0;
74d25e02daSmrg mp_ptr a0_a2;
754a1767b4Smrg mp_ptr as1, asm1, as2;
764a1767b4Smrg mp_ptr bs1, bsm1, bs2;
77*f81b1c5bSmrg mp_ptr tmp;
784a1767b4Smrg TMP_DECL;
794a1767b4Smrg
804a1767b4Smrg #define a0 ap
814a1767b4Smrg #define a1 (ap + n)
824a1767b4Smrg #define a2 (ap + 2*n)
834a1767b4Smrg #define a3 (ap + 3*n)
844a1767b4Smrg #define b0 bp
854a1767b4Smrg #define b1 (bp + n)
864a1767b4Smrg
874a1767b4Smrg n = an >= 2 * bn ? (an + 3) >> 2 : (bn + 1) >> 1;
884a1767b4Smrg
894a1767b4Smrg s = an - 3 * n;
904a1767b4Smrg t = bn - n;
914a1767b4Smrg
924a1767b4Smrg ASSERT (0 < s && s <= n);
934a1767b4Smrg ASSERT (0 < t && t <= n);
944a1767b4Smrg
954a1767b4Smrg TMP_MARK;
964a1767b4Smrg
97*f81b1c5bSmrg tmp = TMP_ALLOC_LIMBS (6 * n + 5);
98*f81b1c5bSmrg as1 = tmp; tmp += n + 1;
99*f81b1c5bSmrg asm1 = tmp; tmp += n + 1;
100*f81b1c5bSmrg as2 = tmp; tmp += n + 1;
101*f81b1c5bSmrg bs1 = tmp; tmp += n + 1;
102*f81b1c5bSmrg bsm1 = tmp; tmp += n;
103*f81b1c5bSmrg bs2 = tmp; tmp += n + 1;
1044a1767b4Smrg
1054a1767b4Smrg a0_a2 = pp;
1064a1767b4Smrg
1074a1767b4Smrg /* Compute as1 and asm1. */
1084a1767b4Smrg vm1_neg = mpn_toom_eval_dgr3_pm1 (as1, asm1, ap, n, s, a0_a2) & 1;
1094a1767b4Smrg
1104a1767b4Smrg /* Compute as2. */
1114a1767b4Smrg #if HAVE_NATIVE_mpn_addlsh1_n
1124a1767b4Smrg cy = mpn_addlsh1_n (as2, a2, a3, s);
1134a1767b4Smrg if (s != n)
1144a1767b4Smrg cy = mpn_add_1 (as2 + s, a2 + s, n - s, cy);
1154a1767b4Smrg cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n);
1164a1767b4Smrg cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
1174a1767b4Smrg #else
1184a1767b4Smrg cy = mpn_lshift (as2, a3, s, 1);
1194a1767b4Smrg cy += mpn_add_n (as2, a2, as2, s);
1204a1767b4Smrg if (s != n)
1214a1767b4Smrg cy = mpn_add_1 (as2 + s, a2 + s, n - s, cy);
1224a1767b4Smrg cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
1234a1767b4Smrg cy += mpn_add_n (as2, a1, as2, n);
1244a1767b4Smrg cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
1254a1767b4Smrg cy += mpn_add_n (as2, a0, as2, n);
1264a1767b4Smrg #endif
1274a1767b4Smrg as2[n] = cy;
1284a1767b4Smrg
1294a1767b4Smrg /* Compute bs1 and bsm1. */
1304a1767b4Smrg if (t == n)
1314a1767b4Smrg {
1324a1767b4Smrg #if HAVE_NATIVE_mpn_add_n_sub_n
1334a1767b4Smrg if (mpn_cmp (b0, b1, n) < 0)
1344a1767b4Smrg {
1354a1767b4Smrg cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n);
1364a1767b4Smrg vm1_neg ^= 1;
1374a1767b4Smrg }
1384a1767b4Smrg else
1394a1767b4Smrg {
1404a1767b4Smrg cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n);
1414a1767b4Smrg }
1424a1767b4Smrg bs1[n] = cy >> 1;
1434a1767b4Smrg #else
1444a1767b4Smrg bs1[n] = mpn_add_n (bs1, b0, b1, n);
1454a1767b4Smrg
1464a1767b4Smrg if (mpn_cmp (b0, b1, n) < 0)
1474a1767b4Smrg {
1484a1767b4Smrg mpn_sub_n (bsm1, b1, b0, n);
1494a1767b4Smrg vm1_neg ^= 1;
1504a1767b4Smrg }
1514a1767b4Smrg else
1524a1767b4Smrg {
1534a1767b4Smrg mpn_sub_n (bsm1, b0, b1, n);
1544a1767b4Smrg }
1554a1767b4Smrg #endif
1564a1767b4Smrg }
1574a1767b4Smrg else
1584a1767b4Smrg {
1594a1767b4Smrg bs1[n] = mpn_add (bs1, b0, n, b1, t);
1604a1767b4Smrg
1614a1767b4Smrg if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
1624a1767b4Smrg {
1634a1767b4Smrg mpn_sub_n (bsm1, b1, b0, t);
1644a1767b4Smrg MPN_ZERO (bsm1 + t, n - t);
1654a1767b4Smrg vm1_neg ^= 1;
1664a1767b4Smrg }
1674a1767b4Smrg else
1684a1767b4Smrg {
1694a1767b4Smrg mpn_sub (bsm1, b0, n, b1, t);
1704a1767b4Smrg }
1714a1767b4Smrg }
1724a1767b4Smrg
1734a1767b4Smrg /* Compute bs2, recycling bs1. bs2=bs1+b1 */
1744a1767b4Smrg mpn_add (bs2, bs1, n + 1, b1, t);
1754a1767b4Smrg
1764a1767b4Smrg ASSERT (as1[n] <= 3);
1774a1767b4Smrg ASSERT (bs1[n] <= 1);
1784a1767b4Smrg ASSERT (asm1[n] <= 1);
1794a1767b4Smrg /*ASSERT (bsm1[n] == 0);*/
1804a1767b4Smrg ASSERT (as2[n] <= 14);
1814a1767b4Smrg ASSERT (bs2[n] <= 2);
1824a1767b4Smrg
1834a1767b4Smrg #define v0 pp /* 2n */
1844a1767b4Smrg #define v1 (pp + 2 * n) /* 2n+1 */
1854a1767b4Smrg #define vinf (pp + 4 * n) /* s+t */
1864a1767b4Smrg #define vm1 scratch /* 2n+1 */
1874a1767b4Smrg #define v2 (scratch + 2 * n + 1) /* 2n+2 */
1884a1767b4Smrg #define scratch_out scratch + 4 * n + 4 /* Currently unused. */
1894a1767b4Smrg
1904a1767b4Smrg /* vm1, 2n+1 limbs */
1914a1767b4Smrg TOOM42_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
1924a1767b4Smrg cy = 0;
1934a1767b4Smrg if (asm1[n] != 0)
1944a1767b4Smrg cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
1954a1767b4Smrg vm1[2 * n] = cy;
1964a1767b4Smrg
1974a1767b4Smrg TOOM42_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out); /* v2, 2n+1 limbs */
1984a1767b4Smrg
1994a1767b4Smrg /* vinf, s+t limbs */
2004a1767b4Smrg if (s > t) mpn_mul (vinf, a3, s, b1, t);
2014a1767b4Smrg else mpn_mul (vinf, b1, t, a3, s);
2024a1767b4Smrg
2034a1767b4Smrg vinf0 = vinf[0]; /* v1 overlaps with this */
2044a1767b4Smrg
2054a1767b4Smrg /* v1, 2n+1 limbs */
2064a1767b4Smrg TOOM42_MUL_N_REC (v1, as1, bs1, n, scratch_out);
2074a1767b4Smrg if (as1[n] == 1)
2084a1767b4Smrg {
2094a1767b4Smrg cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
2104a1767b4Smrg }
2114a1767b4Smrg else if (as1[n] == 2)
2124a1767b4Smrg {
2134a1767b4Smrg #if HAVE_NATIVE_mpn_addlsh1_n
2144a1767b4Smrg cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
2154a1767b4Smrg #else
2164a1767b4Smrg cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
2174a1767b4Smrg #endif
2184a1767b4Smrg }
2194a1767b4Smrg else if (as1[n] == 3)
2204a1767b4Smrg {
2214a1767b4Smrg cy = 3 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(3));
2224a1767b4Smrg }
2234a1767b4Smrg else
2244a1767b4Smrg cy = 0;
2254a1767b4Smrg if (bs1[n] != 0)
2264a1767b4Smrg cy += mpn_add_n (v1 + n, v1 + n, as1, n);
2274a1767b4Smrg v1[2 * n] = cy;
2284a1767b4Smrg
2294a1767b4Smrg TOOM42_MUL_N_REC (v0, ap, bp, n, scratch_out); /* v0, 2n limbs */
2304a1767b4Smrg
2314a1767b4Smrg mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
2324a1767b4Smrg
2334a1767b4Smrg TMP_FREE;
2344a1767b4Smrg }
235