154028e53SJohn Marino /* mpn_toom_interpolate_8pts -- Interpolate for toom54, 63, 72.
254028e53SJohn Marino 
354028e53SJohn Marino    Contributed to the GNU project by Marco Bodrato.
454028e53SJohn Marino 
554028e53SJohn Marino    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
654028e53SJohn Marino    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
754028e53SJohn Marino    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
854028e53SJohn Marino 
954028e53SJohn Marino Copyright 2009 Free Software Foundation, Inc.
1054028e53SJohn Marino 
1154028e53SJohn Marino This file is part of the GNU MP Library.
1254028e53SJohn Marino 
1354028e53SJohn Marino The GNU MP Library is free software; you can redistribute it and/or modify
1454028e53SJohn Marino it under the terms of the GNU Lesser General Public License as published by
1554028e53SJohn Marino the Free Software Foundation; either version 3 of the License, or (at your
1654028e53SJohn Marino option) any later version.
1754028e53SJohn Marino 
1854028e53SJohn Marino The GNU MP Library is distributed in the hope that it will be useful, but
1954028e53SJohn Marino WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
2054028e53SJohn Marino or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
2154028e53SJohn Marino License for more details.
2254028e53SJohn Marino 
2354028e53SJohn Marino You should have received a copy of the GNU Lesser General Public License
2454028e53SJohn Marino along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
2554028e53SJohn Marino 
2654028e53SJohn Marino #include "gmp.h"
2754028e53SJohn Marino #include "gmp-impl.h"
2854028e53SJohn Marino 
2954028e53SJohn Marino #define BINVERT_3 MODLIMB_INVERSE_3
3054028e53SJohn Marino 
3154028e53SJohn Marino #define BINVERT_15 \
3254028e53SJohn Marino   ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 4)) / 15) * 14 * 16 & GMP_NUMB_MAX) + 15)
3354028e53SJohn Marino 
3454028e53SJohn Marino #define BINVERT_45 ((BINVERT_15 * BINVERT_3) & GMP_NUMB_MASK)
3554028e53SJohn Marino 
3654028e53SJohn Marino #ifndef mpn_divexact_by3
3754028e53SJohn Marino #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
3854028e53SJohn Marino #define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0)
3954028e53SJohn Marino #else
4054028e53SJohn Marino #define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)
4154028e53SJohn Marino #endif
4254028e53SJohn Marino #endif
4354028e53SJohn Marino 
4454028e53SJohn Marino #ifndef mpn_divexact_by45
4554028e53SJohn Marino #if GMP_NUMB_BITS % 12 == 0
4654028e53SJohn Marino #define mpn_divexact_by45(dst,src,size) \
4754028e53SJohn Marino   (63 & 19 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 45)))
4854028e53SJohn Marino #else
4954028e53SJohn Marino #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
5054028e53SJohn Marino #define mpn_divexact_by45(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,45,BINVERT_45,0)
5154028e53SJohn Marino #else
5254028e53SJohn Marino #define mpn_divexact_by45(dst,src,size) mpn_divexact_1(dst,src,size,45)
5354028e53SJohn Marino #endif
5454028e53SJohn Marino #endif
5554028e53SJohn Marino #endif
5654028e53SJohn Marino 
5754028e53SJohn Marino #if HAVE_NATIVE_mpn_sublsh_n
5854028e53SJohn Marino #define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n (dst,src,n,s)
5954028e53SJohn Marino #else
6054028e53SJohn Marino static mp_limb_t
DO_mpn_sublsh_n(mp_ptr dst,mp_srcptr src,mp_size_t n,unsigned int s,mp_ptr ws)6154028e53SJohn Marino DO_mpn_sublsh_n (mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
6254028e53SJohn Marino {
6354028e53SJohn Marino #if USE_MUL_1
6454028e53SJohn Marino   return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
6554028e53SJohn Marino #else
6654028e53SJohn Marino   mp_limb_t __cy;
6754028e53SJohn Marino   __cy = mpn_lshift (ws,src,n,s);
6854028e53SJohn Marino   return    __cy + mpn_sub_n (dst,dst,ws,n);
6954028e53SJohn Marino #endif
7054028e53SJohn Marino }
7154028e53SJohn Marino #endif
7254028e53SJohn Marino 
7354028e53SJohn Marino 
7454028e53SJohn Marino #if HAVE_NATIVE_mpn_subrsh
7554028e53SJohn Marino #define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh (dst,nd,src,ns,s)
7654028e53SJohn Marino #else
7754028e53SJohn Marino /* This is not a correct definition, it assumes no carry */
7854028e53SJohn Marino #define DO_mpn_subrsh(dst,nd,src,ns,s,ws)				\
7954028e53SJohn Marino do {									\
8054028e53SJohn Marino   mp_limb_t __cy;							\
8154028e53SJohn Marino   MPN_DECR_U (dst, nd, src[0] >> s);					\
8254028e53SJohn Marino   __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws);	\
8354028e53SJohn Marino   MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);				\
8454028e53SJohn Marino } while (0)
8554028e53SJohn Marino #endif
8654028e53SJohn Marino 
8754028e53SJohn Marino /* Interpolation for Toom-4.5 (or Toom-4), using the evaluation
8854028e53SJohn Marino    points: infinity(4.5 only), 4, -4, 2, -2, 1, -1, 0. More precisely,
8954028e53SJohn Marino    we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of
9054028e53SJohn Marino    degree 7 (or 6), given the 8 (rsp. 7) values:
9154028e53SJohn Marino 
9254028e53SJohn Marino      r1 = limit at infinity of f(x) / x^7,
9354028e53SJohn Marino      r2 = f(4),
9454028e53SJohn Marino      r3 = f(-4),
9554028e53SJohn Marino      r4 = f(2),
9654028e53SJohn Marino      r5 = f(-2),
9754028e53SJohn Marino      r6 = f(1),
9854028e53SJohn Marino      r7 = f(-1),
9954028e53SJohn Marino      r8 = f(0).
10054028e53SJohn Marino 
10154028e53SJohn Marino    All couples of the form f(n),f(-n) must be already mixed with
10254028e53SJohn Marino    toom_couple_handling(f(n),...,f(-n),...)
10354028e53SJohn Marino 
10454028e53SJohn Marino    The result is stored in {pp, spt + 7*n (or 6*n)}.
10554028e53SJohn Marino    At entry, r8 is stored at {pp, 2n},
10654028e53SJohn Marino    r5 is stored at {pp + 3n, 3n + 1}.
10754028e53SJohn Marino 
10854028e53SJohn Marino    The other values are 2n+... limbs each (with most significant limbs small).
10954028e53SJohn Marino 
11054028e53SJohn Marino    All intermediate results are positive.
11154028e53SJohn Marino    Inputs are destroyed.
11254028e53SJohn Marino */
11354028e53SJohn Marino 
11454028e53SJohn Marino void
mpn_toom_interpolate_8pts(mp_ptr pp,mp_size_t n,mp_ptr r3,mp_ptr r7,mp_size_t spt,mp_ptr ws)11554028e53SJohn Marino mpn_toom_interpolate_8pts (mp_ptr pp, mp_size_t n,
11654028e53SJohn Marino 			   mp_ptr r3, mp_ptr r7,
11754028e53SJohn Marino 			   mp_size_t spt, mp_ptr ws)
11854028e53SJohn Marino {
11954028e53SJohn Marino   mp_limb_signed_t cy;
12054028e53SJohn Marino   mp_ptr r5, r1;
12154028e53SJohn Marino   r5 = (pp + 3 * n);			/* 3n+1 */
12254028e53SJohn Marino   r1 = (pp + 7 * n);			/* spt */
12354028e53SJohn Marino 
12454028e53SJohn Marino   /******************************* interpolation *****************************/
12554028e53SJohn Marino 
12654028e53SJohn Marino   DO_mpn_subrsh(r3+n, 2 * n + 1, pp, 2 * n, 4, ws);
12754028e53SJohn Marino   cy = DO_mpn_sublsh_n (r3, r1, spt, 12, ws);
12854028e53SJohn Marino   MPN_DECR_U (r3 + spt, 3 * n + 1 - spt, cy);
12954028e53SJohn Marino 
13054028e53SJohn Marino   DO_mpn_subrsh(r5+n, 2 * n + 1, pp, 2 * n, 2, ws);
13154028e53SJohn Marino   cy = DO_mpn_sublsh_n (r5, r1, spt, 6, ws);
13254028e53SJohn Marino   MPN_DECR_U (r5 + spt, 3 * n + 1 - spt, cy);
13354028e53SJohn Marino 
13454028e53SJohn Marino   r7[3*n] -= mpn_sub_n (r7+n, r7+n, pp, 2 * n);
13554028e53SJohn Marino   cy = mpn_sub_n (r7, r7, r1, spt);
13654028e53SJohn Marino   MPN_DECR_U (r7 + spt, 3 * n + 1 - spt, cy);
13754028e53SJohn Marino 
13854028e53SJohn Marino   ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, 3 * n + 1));
13954028e53SJohn Marino   ASSERT_NOCARRY(mpn_rshift(r3, r3, 3 * n + 1, 2));
14054028e53SJohn Marino 
14154028e53SJohn Marino   ASSERT_NOCARRY(mpn_sub_n (r5, r5, r7, 3 * n + 1));
14254028e53SJohn Marino 
14354028e53SJohn Marino   ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, 3 * n + 1));
14454028e53SJohn Marino 
14554028e53SJohn Marino   mpn_divexact_by45 (r3, r3, 3 * n + 1);
14654028e53SJohn Marino 
14754028e53SJohn Marino   ASSERT_NOCARRY(mpn_divexact_by3 (r5, r5, 3 * n + 1));
14854028e53SJohn Marino 
14954028e53SJohn Marino   ASSERT_NOCARRY(DO_mpn_sublsh_n (r5, r3, 3 * n + 1, 2, ws));
15054028e53SJohn Marino 
15154028e53SJohn Marino   /* last interpolation steps... */
15254028e53SJohn Marino   /* ... are mixed with recomposition */
15354028e53SJohn Marino 
15454028e53SJohn Marino   /***************************** recomposition *******************************/
15554028e53SJohn Marino   /*
15654028e53SJohn Marino     pp[] prior to operations:
15754028e53SJohn Marino      |_H r1|_L r1|____||_H r5|_M_r5|_L r5|_____|_H r8|_L r8|pp
15854028e53SJohn Marino 
15954028e53SJohn Marino     summation scheme for remaining operations:
16054028e53SJohn Marino      |____8|n___7|n___6|n___5|n___4|n___3|n___2|n____|n____|pp
16154028e53SJohn Marino      |_H r1|_L r1|____||_H*r5|_M r5|_L r5|_____|_H_r8|_L r8|pp
16254028e53SJohn Marino 	  ||_H r3|_M r3|_L*r3|
16354028e53SJohn Marino 				  ||_H_r7|_M_r7|_L_r7|
16454028e53SJohn Marino 		      ||-H r3|-M r3|-L*r3|
16554028e53SJohn Marino 				  ||-H*r5|-M_r5|-L_r5|
16654028e53SJohn Marino   */
16754028e53SJohn Marino 
16854028e53SJohn Marino   cy = mpn_add_n (pp + n, pp + n, r7, n); /* Hr8+Lr7-Lr5 */
16954028e53SJohn Marino   cy-= mpn_sub_n (pp + n, pp + n, r5, n);
17054028e53SJohn Marino   if (0 > cy)
17154028e53SJohn Marino     MPN_DECR_U (r7 + n, 2*n + 1, 1);
17254028e53SJohn Marino   else
17354028e53SJohn Marino     MPN_INCR_U (r7 + n, 2*n + 1, cy);
17454028e53SJohn Marino 
17554028e53SJohn Marino   cy = mpn_sub_n (pp + 2*n, r7 + n, r5 + n, n); /* Mr7-Mr5 */
17654028e53SJohn Marino   MPN_DECR_U (r7 + 2*n, n + 1, cy);
17754028e53SJohn Marino 
17854028e53SJohn Marino   cy = mpn_add_n (pp + 3*n, r5, r7+ 2*n, n+1); /* Hr7+Lr5 */
17954028e53SJohn Marino   r5[3*n]+= mpn_add_n (r5 + 2*n, r5 + 2*n, r3, n); /* Hr5+Lr3 */
18054028e53SJohn Marino   cy-= mpn_sub_n (pp + 3*n, pp + 3*n, r5 + 2*n, n+1); /* Hr7-Hr5+Lr5-Lr3 */
18154028e53SJohn Marino   if (UNLIKELY(0 > cy))
18254028e53SJohn Marino     MPN_DECR_U (r5 + n + 1, 2*n, 1);
18354028e53SJohn Marino   else
18454028e53SJohn Marino     MPN_INCR_U (r5 + n + 1, 2*n, cy);
18554028e53SJohn Marino 
18654028e53SJohn Marino   ASSERT_NOCARRY(mpn_sub_n(pp + 4*n, r5 + n, r3 + n, 2*n +1)); /* Mr5-Mr3,Hr5-Hr3 */
18754028e53SJohn Marino 
18854028e53SJohn Marino   cy = mpn_add_1 (pp + 6*n, r3 + n, n, pp[6*n]);
18954028e53SJohn Marino   MPN_INCR_U (r3 + 2*n, n + 1, cy);
19054028e53SJohn Marino   cy = r3[3*n] + mpn_add_n (pp + 7*n, pp + 7*n, r3 + 2*n, n);
191*d2d4b659SJohn Marino   if (LIKELY(spt != n))
19254028e53SJohn Marino     MPN_INCR_U (pp + 8*n, spt - n, cy);
193*d2d4b659SJohn Marino   else
194*d2d4b659SJohn Marino     ASSERT (cy == 0);
19554028e53SJohn Marino }
196