14a1767b4Smrg /* Implementation of the squaring algorithm with Toom-Cook 6.5-way.
24a1767b4Smrg 
34a1767b4Smrg    Contributed to the GNU project by Marco Bodrato.
44a1767b4Smrg 
54a1767b4Smrg    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
64a1767b4Smrg    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
74a1767b4Smrg    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
84a1767b4Smrg 
94a1767b4Smrg Copyright 2009 Free Software Foundation, Inc.
104a1767b4Smrg 
114a1767b4Smrg This file is part of the GNU MP Library.
124a1767b4Smrg 
134a1767b4Smrg The GNU MP Library is free software; you can redistribute it and/or modify
14*f81b1c5bSmrg it under the terms of either:
15*f81b1c5bSmrg 
16*f81b1c5bSmrg   * the GNU Lesser General Public License as published by the Free
17*f81b1c5bSmrg     Software Foundation; either version 3 of the License, or (at your
184a1767b4Smrg     option) any later version.
194a1767b4Smrg 
20*f81b1c5bSmrg or
21*f81b1c5bSmrg 
22*f81b1c5bSmrg   * the GNU General Public License as published by the Free Software
23*f81b1c5bSmrg     Foundation; either version 2 of the License, or (at your option) any
24*f81b1c5bSmrg     later version.
25*f81b1c5bSmrg 
26*f81b1c5bSmrg or both in parallel, as here.
27*f81b1c5bSmrg 
284a1767b4Smrg The GNU MP Library is distributed in the hope that it will be useful, but
294a1767b4Smrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
30*f81b1c5bSmrg or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
31*f81b1c5bSmrg for more details.
324a1767b4Smrg 
33*f81b1c5bSmrg You should have received copies of the GNU General Public License and the
34*f81b1c5bSmrg GNU Lesser General Public License along with the GNU MP Library.  If not,
35*f81b1c5bSmrg see https://www.gnu.org/licenses/.  */
364a1767b4Smrg 
374a1767b4Smrg 
384a1767b4Smrg #include "gmp-impl.h"
394a1767b4Smrg 
404a1767b4Smrg 
414a1767b4Smrg #if GMP_NUMB_BITS < 21
424a1767b4Smrg #error Not implemented.
434a1767b4Smrg #endif
444a1767b4Smrg 
454a1767b4Smrg 
464a1767b4Smrg #if TUNE_PROGRAM_BUILD
474a1767b4Smrg #define MAYBE_sqr_basecase 1
484a1767b4Smrg #define MAYBE_sqr_above_basecase   1
494a1767b4Smrg #define MAYBE_sqr_toom2   1
504a1767b4Smrg #define MAYBE_sqr_above_toom2   1
514a1767b4Smrg #define MAYBE_sqr_toom3   1
524a1767b4Smrg #define MAYBE_sqr_above_toom3   1
534a1767b4Smrg #define MAYBE_sqr_above_toom4   1
544a1767b4Smrg #else
554a1767b4Smrg #ifdef  SQR_TOOM8_THRESHOLD
564a1767b4Smrg #define SQR_TOOM6_MAX ((SQR_TOOM8_THRESHOLD+6*2-1+5)/6)
574a1767b4Smrg #else
584a1767b4Smrg #define SQR_TOOM6_MAX					\
594a1767b4Smrg   ((SQR_FFT_THRESHOLD <= MP_SIZE_T_MAX - (6*2-1+5)) ?	\
604a1767b4Smrg    ((SQR_FFT_THRESHOLD+6*2-1+5)/6)			\
614a1767b4Smrg    : MP_SIZE_T_MAX )
624a1767b4Smrg #endif
634a1767b4Smrg #define MAYBE_sqr_basecase					\
644a1767b4Smrg   (SQR_TOOM6_THRESHOLD < 6 * SQR_TOOM2_THRESHOLD)
654a1767b4Smrg #define MAYBE_sqr_above_basecase				\
664a1767b4Smrg   (SQR_TOOM6_MAX >=  SQR_TOOM2_THRESHOLD)
674a1767b4Smrg #define MAYBE_sqr_toom2						\
684a1767b4Smrg   (SQR_TOOM6_THRESHOLD < 6 * SQR_TOOM3_THRESHOLD)
694a1767b4Smrg #define MAYBE_sqr_above_toom2					\
704a1767b4Smrg   (SQR_TOOM6_MAX >= SQR_TOOM3_THRESHOLD)
714a1767b4Smrg #define MAYBE_sqr_toom3						\
724a1767b4Smrg   (SQR_TOOM6_THRESHOLD < 6 * SQR_TOOM4_THRESHOLD)
734a1767b4Smrg #define MAYBE_sqr_above_toom3					\
744a1767b4Smrg   (SQR_TOOM6_MAX >= SQR_TOOM4_THRESHOLD)
754a1767b4Smrg #define MAYBE_sqr_above_toom4					\
764a1767b4Smrg   (SQR_TOOM6_MAX >= SQR_TOOM6_THRESHOLD)
774a1767b4Smrg #endif
784a1767b4Smrg 
794a1767b4Smrg #define TOOM6_SQR_REC(p, a, n, ws)					\
804a1767b4Smrg   do {									\
814a1767b4Smrg     if (MAYBE_sqr_basecase && ( !MAYBE_sqr_above_basecase		\
824a1767b4Smrg 	|| BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD)))			\
834a1767b4Smrg       mpn_sqr_basecase (p, a, n);					\
844a1767b4Smrg     else if (MAYBE_sqr_toom2 && ( !MAYBE_sqr_above_toom2		\
854a1767b4Smrg 	     || BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD)))		\
864a1767b4Smrg       mpn_toom2_sqr (p, a, n, ws);					\
874a1767b4Smrg     else if (MAYBE_sqr_toom3 && ( !MAYBE_sqr_above_toom3		\
884a1767b4Smrg 	     || BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD)))		\
894a1767b4Smrg       mpn_toom3_sqr (p, a, n, ws);					\
904a1767b4Smrg     else if (! MAYBE_sqr_above_toom4					\
914a1767b4Smrg 	     || BELOW_THRESHOLD (n, SQR_TOOM6_THRESHOLD))		\
924a1767b4Smrg       mpn_toom4_sqr (p, a, n, ws);					\
934a1767b4Smrg     else								\
944a1767b4Smrg       mpn_toom6_sqr (p, a, n, ws);					\
954a1767b4Smrg   } while (0)
964a1767b4Smrg 
974a1767b4Smrg void
mpn_toom6_sqr(mp_ptr pp,mp_srcptr ap,mp_size_t an,mp_ptr scratch)984a1767b4Smrg mpn_toom6_sqr  (mp_ptr pp, mp_srcptr ap, mp_size_t an, mp_ptr scratch)
994a1767b4Smrg {
1004a1767b4Smrg   mp_size_t n, s;
1014a1767b4Smrg 
1024a1767b4Smrg   /***************************** decomposition *******************************/
1034a1767b4Smrg 
1044a1767b4Smrg   ASSERT( an >= 18 );
1054a1767b4Smrg 
1064a1767b4Smrg   n = 1 + (an - 1) / (size_t) 6;
1074a1767b4Smrg 
1084a1767b4Smrg   s = an - 5 * n;
1094a1767b4Smrg 
1104a1767b4Smrg   ASSERT (0 < s && s <= n);
1114a1767b4Smrg 
1124a1767b4Smrg #define   r4    (pp + 3 * n)			/* 3n+1 */
1134a1767b4Smrg #define   r2    (pp + 7 * n)			/* 3n+1 */
1144a1767b4Smrg #define   r0    (pp +11 * n)			/* s+t <= 2*n */
1154a1767b4Smrg #define   r5    (scratch)			/* 3n+1 */
1164a1767b4Smrg #define   r3    (scratch + 3 * n + 1)		/* 3n+1 */
1174a1767b4Smrg #define   r1    (scratch + 6 * n + 2)		/* 3n+1 */
1184a1767b4Smrg #define   v0    (pp + 7 * n)			/* n+1 */
1194a1767b4Smrg #define   v2    (pp + 9 * n+2)			/* n+1 */
1204a1767b4Smrg #define   wse   (scratch + 9 * n + 3)		/* 3n+1 */
1214a1767b4Smrg 
1224a1767b4Smrg   /* Alloc also 3n+1 limbs for ws... toom_interpolate_12pts may
1234a1767b4Smrg      need all of them, when DO_mpn_sublsh_n usea a scratch  */
1244a1767b4Smrg /*   if (scratch== NULL) */
1254a1767b4Smrg /*     scratch = TMP_SALLOC_LIMBS (12 * n + 6); */
1264a1767b4Smrg 
1274a1767b4Smrg   /********************** evaluation and recursive calls *********************/
1284a1767b4Smrg   /* $\pm1/2$ */
1294a1767b4Smrg   mpn_toom_eval_pm2rexp (v2, v0, 5, ap, n, s, 1, pp);
1304a1767b4Smrg   TOOM6_SQR_REC(pp, v0, n + 1, wse); /* A(-1/2)*B(-1/2)*2^. */
1314a1767b4Smrg   TOOM6_SQR_REC(r5, v2, n + 1, wse); /* A(+1/2)*B(+1/2)*2^. */
1324a1767b4Smrg   mpn_toom_couple_handling (r5, 2 * n + 1, pp, 0, n, 1, 0);
1334a1767b4Smrg 
1344a1767b4Smrg   /* $\pm1$ */
1354a1767b4Smrg   mpn_toom_eval_pm1 (v2, v0, 5, ap, n, s,    pp);
1364a1767b4Smrg   TOOM6_SQR_REC(pp, v0, n + 1, wse); /* A(-1)*B(-1) */
1374a1767b4Smrg   TOOM6_SQR_REC(r3, v2, n + 1, wse); /* A(1)*B(1) */
1384a1767b4Smrg   mpn_toom_couple_handling (r3, 2 * n + 1, pp, 0, n, 0, 0);
1394a1767b4Smrg 
1404a1767b4Smrg   /* $\pm4$ */
1414a1767b4Smrg   mpn_toom_eval_pm2exp (v2, v0, 5, ap, n, s, 2, pp);
1424a1767b4Smrg   TOOM6_SQR_REC(pp, v0, n + 1, wse); /* A(-4)*B(-4) */
1434a1767b4Smrg   TOOM6_SQR_REC(r1, v2, n + 1, wse); /* A(+4)*B(+4) */
1444a1767b4Smrg   mpn_toom_couple_handling (r1, 2 * n + 1, pp, 0, n, 2, 4);
1454a1767b4Smrg 
1464a1767b4Smrg   /* $\pm1/4$ */
1474a1767b4Smrg   mpn_toom_eval_pm2rexp (v2, v0, 5, ap, n, s, 2, pp);
1484a1767b4Smrg   TOOM6_SQR_REC(pp, v0, n + 1, wse); /* A(-1/4)*B(-1/4)*4^. */
1494a1767b4Smrg   TOOM6_SQR_REC(r4, v2, n + 1, wse); /* A(+1/4)*B(+1/4)*4^. */
1504a1767b4Smrg   mpn_toom_couple_handling (r4, 2 * n + 1, pp, 0, n, 2, 0);
1514a1767b4Smrg 
1524a1767b4Smrg   /* $\pm2$ */
1534a1767b4Smrg   mpn_toom_eval_pm2 (v2, v0, 5, ap, n, s, pp);
1544a1767b4Smrg   TOOM6_SQR_REC(pp, v0, n + 1, wse); /* A(-2)*B(-2) */
1554a1767b4Smrg   TOOM6_SQR_REC(r2, v2, n + 1, wse); /* A(+2)*B(+2) */
1564a1767b4Smrg   mpn_toom_couple_handling (r2, 2 * n + 1, pp, 0, n, 1, 2);
1574a1767b4Smrg 
1584a1767b4Smrg #undef v0
1594a1767b4Smrg #undef v2
1604a1767b4Smrg 
1614a1767b4Smrg   /* A(0)*B(0) */
1624a1767b4Smrg   TOOM6_SQR_REC(pp, ap, n, wse);
1634a1767b4Smrg 
1644a1767b4Smrg   mpn_toom_interpolate_12pts (pp, r1, r3, r5, n, 2 * s, 0, wse);
1654a1767b4Smrg 
1664a1767b4Smrg #undef r0
1674a1767b4Smrg #undef r1
1684a1767b4Smrg #undef r2
1694a1767b4Smrg #undef r3
1704a1767b4Smrg #undef r4
1714a1767b4Smrg #undef r5
1724a1767b4Smrg 
1734a1767b4Smrg }
1744a1767b4Smrg #undef TOOM6_SQR_REC
1754a1767b4Smrg #undef MAYBE_sqr_basecase
1764a1767b4Smrg #undef MAYBE_sqr_above_basecase
1774a1767b4Smrg #undef MAYBE_sqr_toom2
1784a1767b4Smrg #undef MAYBE_sqr_above_toom2
1794a1767b4Smrg #undef MAYBE_sqr_toom3
1804a1767b4Smrg #undef MAYBE_sqr_above_toom3
1814a1767b4Smrg #undef MAYBE_sqr_above_toom4
182