/* mpn_toom_interpolate_7pts -- Interpolate for toom44, 53, 62.

   Contributed to the GNU project by Niels Möller.
   Improvements by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006, 2007, 2009 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"

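/* BINVERT_N is a multiplicative inverse of N modulo 2^GMP_NUMB_BITS, as
   expected for the precomputed-inverse argument of mpn_pi1_bdiv_q_1 in
   the divexact macros below.  */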
#define BINVERT_3 MODLIMB_INVERSE_3

#define BINVERT_9 \
  ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)

#define BINVERT_15 \
  ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 4)) / 15) * 14 * 16 & GMP_NUMB_MAX) + 15)

/* For the various mpn_divexact_byN here, fall back to using either
   mpn_pi1_bdiv_q_1 or mpn_divexact_1.  The former has less overhead and is
   much faster if it is native.  For now, since mpn_divexact_1 is native on
   several platforms where mpn_pi1_bdiv_q_1 does not yet exist, do not use
   mpn_pi1_bdiv_q_1 unconditionally.  FIXME.  */

/* For odd divisors, mpn_divexact_1 works fine with two's complement. */
#ifndef mpn_divexact_by3
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0)
#else
#define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)
#endif
#endif

#ifndef mpn_divexact_by9
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by9(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,9,BINVERT_9,0)
#else
#define mpn_divexact_by9(dst,src,size) mpn_divexact_1(dst,src,size,9)
#endif
#endif

#ifndef mpn_divexact_by15
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by15(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,15,BINVERT_15,0)
#else
#define mpn_divexact_by15(dst,src,size) mpn_divexact_1(dst,src,size,15)
#endif
#endif

/* Interpolation for toom4, using the evaluation points 0, infinity,
   1, -1, 2, -2, 1/2. More precisely, we want to compute
   f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 6, given the
   seven values

     w0 = f(0),
     w1 = f(-2),
     w2 = f(1),
     w3 = f(-1),
     w4 = f(2),
     w5 = 64 * f(1/2),
     w6 = limit at infinity of f(x) / x^6.

   The result is 6*n + w6n limbs. At entry, w0 is stored at { rp, 2n },
   w2 is stored at { rp + 2n, 2n + 1 }, and w6 is stored at { rp + 6n,
   w6n }. The other values are 2n + 1 limbs each (with most
   significant limbs small). f(-1) and f(-2) may be negative, with the
   signs determined by the flag bits. Inputs are destroyed.

   Needs (2*n + 1) limbs of temporary storage.
*/
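
/* For reference, writing f(x) = a0 + a1*x + ... + a6*x^6, the seven
   values above expand as

     w0 = a0
     w1 = a0 - 2 a1 + 4 a2 - 8 a3 + 16 a4 - 32 a5 + 64 a6
     w2 = a0 +   a1 +   a2 +   a3 +    a4 +    a5 +    a6
     w3 = a0 -   a1 +   a2 -   a3 +    a4 -    a5 +    a6
     w4 = a0 + 2 a1 + 4 a2 + 8 a3 + 16 a4 + 32 a5 + 64 a6
     w5 = 64 a0 + 32 a1 + 16 a2 + 8 a3 + 4 a4 + 2 a5 + a6
     w6 = a6

   and the interpolation below recovers a0, ..., a6, which are then
   recombined at radix 2^(GMP_NUMB_BITS * n).  */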

void
mpn_toom_interpolate_7pts (mp_ptr rp, mp_size_t n, enum toom7_flags flags,
                           mp_ptr w1, mp_ptr w3, mp_ptr w4, mp_ptr w5,
                           mp_size_t w6n, mp_ptr tp)
{
  mp_size_t m;
  mp_limb_t cy;

  m = 2*n + 1;
#define w0 rp
#define w2 (rp + 2*n)
#define w6 (rp + 6*n)

  ASSERT (w6n > 0);
  ASSERT (w6n <= 2*n);

  /* Using formulas similar to Marco Bodrato's

     W5 = W5 + W4
     W1 =(W4 - W1)/2
     W4 = W4 - W0
     W4 =(W4 - W1)/4 - W6*16
     W3 =(W2 - W3)/2
     W2 = W2 - W3

     W5 = W5 - W2*65      May be negative.
     W2 = W2 - W6 - W0
     W5 =(W5 + W2*45)/2   Now >= 0 again.
     W4 =(W4 - W2)/3
     W2 = W2 - W4

     W1 = W5 - W1         May be negative.
     W5 =(W5 - W3*8)/9
     W3 = W3 - W5
     W1 =(W1/15 + W5)/2   Now >= 0 again.
     W5 = W5 - W1

     where W0 = f(0), W1 = f(-2), W2 = f(1), W3 = f(-1),
           W4 = f(2), W5 = 64 * f(1/2), W6 = f(oo).

     Note that most intermediate results are positive; the ones that
     may be negative are represented in two's complement. We must
     never shift right a value that may be negative, since that would
     invalidate the sign bit. On the other hand, divexact by odd
     numbers works fine with two's complement.
  */
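  /* W5 = W5 + W4 */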
  mpn_add_n (w5, w5, w4, m);
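
  /* W1 = (W4 - W1) / 2, with the sign of the stored w1 given by
     toom7_w1_neg.  */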
  if (flags & toom7_w1_neg)
    {
#ifdef HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (w1, w1, w4, m);
#else
      mpn_add_n (w1, w1, w4, m);  ASSERT (!(w1[0] & 1));
      mpn_rshift (w1, w1, m, 1);
#endif
    }
  else
    {
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (w1, w4, w1, m);
#else
      mpn_sub_n (w1, w4, w1, m);  ASSERT (!(w1[0] & 1));
      mpn_rshift (w1, w1, m, 1);
#endif
    }
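
  /* W4 = ((W4 - W0) - W1) / 4 - 16 * W6 */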
  mpn_sub (w4, w4, m, w0, 2*n);
  mpn_sub_n (w4, w4, w1, m);  ASSERT (!(w4[0] & 3));
  mpn_rshift (w4, w4, m, 2); /* w4>=0 */

  tp[w6n] = mpn_lshift (tp, w6, w6n, 4);
  mpn_sub (w4, w4, m, tp, w6n+1);

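  /* W3 = (W2 - W3) / 2, with the sign of the stored w3 given by
     toom7_w3_neg.  */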
  if (flags & toom7_w3_neg)
    {
#ifdef HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (w3, w3, w2, m);
#else
      mpn_add_n (w3, w3, w2, m);  ASSERT (!(w3[0] & 1));
      mpn_rshift (w3, w3, m, 1);
#endif
    }
  else
    {
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (w3, w2, w3, m);
#else
      mpn_sub_n (w3, w2, w3, m);  ASSERT (!(w3[0] & 1));
      mpn_rshift (w3, w3, m, 1);
#endif
    }

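  /* W2 = W2 - W3 */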
  mpn_sub_n (w2, w2, w3, m);

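  /* W5 = W5 - 65 * W2 (may be negative), then W2 = W2 - W6 - W0 */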
  mpn_submul_1 (w5, w2, m, 65);
  mpn_sub (w2, w2, m, w6, w6n);
  mpn_sub (w2, w2, m, w0, 2*n);

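  /* W5 = (W5 + 45 * W2) / 2, now >= 0 again, and the subtraction
     part of W4 = (W4 - W2) / 3 */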
  mpn_addmul_1 (w5, w2, m, 45);  ASSERT (!(w5[0] & 1));
  mpn_rshift (w5, w5, m, 1);
  mpn_sub_n (w4, w4, w2, m);

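  /* Complete W4 = (W4 - W2) / 3, then W2 = W2 - W4 */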
  mpn_divexact_by3 (w4, w4, m);
  mpn_sub_n (w2, w2, w4, m);

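  /* W1 = W5 - W1 (may be negative), W5 = (W5 - 8 * W3) / 9,
     W3 = W3 - W5 */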
  mpn_sub_n (w1, w5, w1, m);
  mpn_lshift (tp, w3, m, 3);
  mpn_sub_n (w5, w5, tp, m);
  mpn_divexact_by9 (w5, w5, m);
  mpn_sub_n (w3, w3, w5, m);

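  /* W1 = (W1 / 15 + W5) / 2, now >= 0 again, then W5 = W5 - W1 */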
  mpn_divexact_by15 (w1, w1, m);
  mpn_add_n (w1, w1, w5, m);  ASSERT (!(w1[0] & 1));
  mpn_rshift (w1, w1, m, 1); /* w1>=0 now */
  mpn_sub_n (w5, w5, w1, m);

  /* These bounds are valid for the 4x4 polynomial product of toom44,
   * and they are conservative for toom53 and toom62. */
  ASSERT (w1[2*n] < 2);
  ASSERT (w2[2*n] < 3);
  ASSERT (w3[2*n] < 4);
  ASSERT (w4[2*n] < 3);
  ASSERT (w5[2*n] < 2);

  /* Addition chain. Note carries and the 2n'th limbs that need to be
   * added in.
   *
   * Special care is needed for w2[2n] and the corresponding carry,
   * since the "simple" way of adding it all together would overwrite
   * the limb at w2[2*n] and rp[4*n] (same location) with the sum of
   * the high half of w3 and the low half of w4.
   *
   *         7    6    5    4    3    2    1    0
   *    |    |    |    |    |    |    |    |    |
   *                  ||w3 (2n+1)|
   *             ||w4 (2n+1)|
   *        ||w5 (2n+1)|        ||w1 (2n+1)|
   *  + | w6 (w6n)|        ||w2 (2n+1)| w0 (2n) |  (share storage with r)
   *  -----------------------------------------------
   *  r |    |    |    |    |    |    |    |    |
   *        c7   c6   c5   c4   c3                 Carries to propagate
   */

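  /* w1 is added in at offset n.  For w3, w4 and w5, only the low n
     limbs are added in place; each high half first absorbs the 2n'th
     limb and carry left over from the previous step and is then used
     in the following addition.  */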
  cy = mpn_add_n (rp + n, rp + n, w1, m);
  MPN_INCR_U (w2 + n + 1, n, cy);
  cy = mpn_add_n (rp + 3*n, rp + 3*n, w3, n);
  MPN_INCR_U (w3 + n, n + 1, w2[2*n] + cy);
  cy = mpn_add_n (rp + 4*n, w3 + n, w4, n);
  MPN_INCR_U (w4 + n, n + 1, w3[2*n] + cy);
  cy = mpn_add_n (rp + 5*n, w4 + n, w5, n);
  MPN_INCR_U (w5 + n, n + 1, w4[2*n] + cy);
  if (w6n > n + 1)
    ASSERT_NOCARRY (mpn_add (rp + 6*n, rp + 6*n, w6n, w5 + n, n + 1));
  else
    {
      ASSERT_NOCARRY (mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, w6n));
#if WANT_ASSERT
      {
        mp_size_t i;
        for (i = w6n; i <= n; i++)
          ASSERT (w5[n + i] == 0);
      }
#endif
    }
}