1 /* mpn_toom_interpolate_7pts -- Interpolate for toom44, 53, 62.
2 
3    Contributed to the GNU project by Niels Möller.
4    Improvements by Marco Bodrato.
5 
6    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
7    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
8    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9 
10 Copyright 2006, 2007, 2009 Free Software Foundation, Inc.
11 
12 This file is part of the GNU MP Library.
13 
14 The GNU MP Library is free software; you can redistribute it and/or modify
15 it under the terms of either:
16 
17   * the GNU Lesser General Public License as published by the Free
18     Software Foundation; either version 3 of the License, or (at your
19     option) any later version.
20 
21 or
22 
23   * the GNU General Public License as published by the Free Software
24     Foundation; either version 2 of the License, or (at your option) any
25     later version.
26 
27 or both in parallel, as here.
28 
29 The GNU MP Library is distributed in the hope that it will be useful, but
30 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
31 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
32 for more details.
33 
34 You should have received copies of the GNU General Public License and the
35 GNU Lesser General Public License along with the GNU MP Library.  If not,
36 see https://www.gnu.org/licenses/.  */
37 
38 #include "gmp.h"
39 #include "gmp-impl.h"
40 
41 #define BINVERT_3 MODLIMB_INVERSE_3
42 
43 #define BINVERT_9 \
44   ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
45 
46 #define BINVERT_15 \
47   ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 4)) / 15) * 14 * 16 & GMP_NUMB_MAX) + 15)
48 
49 /* For the various mpn_divexact_byN here, fall back to using either
50    mpn_pi1_bdiv_q_1 or mpn_divexact_1.  The former has less overhead and is
51    many faster if it is native.  For now, since mpn_divexact_1 is native on
52    several platforms where mpn_pi1_bdiv_q_1 does not yet exist, do not use
53    mpn_pi1_bdiv_q_1 unconditionally.  FIXME.  */
54 
55 /* For odd divisors, mpn_divexact_1 works fine with two's complement. */
56 #ifndef mpn_divexact_by3
57 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
58 #define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0)
59 #else
60 #define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)
61 #endif
62 #endif
63 
64 #ifndef mpn_divexact_by9
65 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
66 #define mpn_divexact_by9(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,9,BINVERT_9,0)
67 #else
68 #define mpn_divexact_by9(dst,src,size) mpn_divexact_1(dst,src,size,9)
69 #endif
70 #endif
71 
72 #ifndef mpn_divexact_by15
73 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
74 #define mpn_divexact_by15(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,15,BINVERT_15,0)
75 #else
76 #define mpn_divexact_by15(dst,src,size) mpn_divexact_1(dst,src,size,15)
77 #endif
78 #endif
79 
80 /* Interpolation for toom4, using the evaluation points 0, infinity,
81    1, -1, 2, -2, 1/2. More precisely, we want to compute
82    f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 6, given the
83    seven values
84 
85      w0 = f(0),
86      w1 = f(-2),
87      w2 = f(1),
88      w3 = f(-1),
89      w4 = f(2)
90      w5 = 64 * f(1/2)
91      w6 = limit at infinity of f(x) / x^6,
92 
93    The result is 6*n + w6n limbs. At entry, w0 is stored at {rp, 2n },
94    w2 is stored at { rp + 2n, 2n+1 }, and w6 is stored at { rp + 6n,
95    w6n }. The other values are 2n + 1 limbs each (with most
96    significant limbs small). f(-1) and f(-1/2) may be negative, signs
97    determined by the flag bits. Inputs are destroyed.
98 
99    Needs (2*n + 1) limbs of temporary storage.
100 */
101 
102 void
mpn_toom_interpolate_7pts(mp_ptr rp,mp_size_t n,enum toom7_flags flags,mp_ptr w1,mp_ptr w3,mp_ptr w4,mp_ptr w5,mp_size_t w6n,mp_ptr tp)103 mpn_toom_interpolate_7pts (mp_ptr rp, mp_size_t n, enum toom7_flags flags,
104 			   mp_ptr w1, mp_ptr w3, mp_ptr w4, mp_ptr w5,
105 			   mp_size_t w6n, mp_ptr tp)
106 {
107   mp_size_t m;
108   mp_limb_t cy;
109 
110   m = 2*n + 1;
111 #define w0 rp
112 #define w2 (rp + 2*n)
113 #define w6 (rp + 6*n)
114 
115   ASSERT (w6n > 0);
116   ASSERT (w6n <= 2*n);
117 
118   /* Using formulas similar to Marco Bodrato's
119 
120      W5 = W5 + W4
121      W1 =(W4 - W1)/2
122      W4 = W4 - W0
123      W4 =(W4 - W1)/4 - W6*16
124      W3 =(W2 - W3)/2
125      W2 = W2 - W3
126 
127      W5 = W5 - W2*65      May be negative.
128      W2 = W2 - W6 - W0
129      W5 =(W5 + W2*45)/2   Now >= 0 again.
130      W4 =(W4 - W2)/3
131      W2 = W2 - W4
132 
133      W1 = W5 - W1         May be negative.
134      W5 =(W5 - W3*8)/9
135      W3 = W3 - W5
136      W1 =(W1/15 + W5)/2   Now >= 0 again.
137      W5 = W5 - W1
138 
139      where W0 = f(0), W1 = f(-2), W2 = f(1), W3 = f(-1),
140 	   W4 = f(2), W5 = f(1/2), W6 = f(oo),
141 
142      Note that most intermediate results are positive; the ones that
143      may be negative are represented in two's complement. We must
144      never shift right a value that may be negative, since that would
145      invalidate the sign bit. On the other hand, divexact by odd
146      numbers work fine with two's complement.
147   */
148 
149   mpn_add_n (w5, w5, w4, m);
150   if (flags & toom7_w1_neg)
151     {
152 #ifdef HAVE_NATIVE_mpn_rsh1add_n
153       mpn_rsh1add_n (w1, w1, w4, m);
154 #else
155       mpn_add_n (w1, w1, w4, m);  ASSERT (!(w1[0] & 1));
156       mpn_rshift (w1, w1, m, 1);
157 #endif
158     }
159   else
160     {
161 #ifdef HAVE_NATIVE_mpn_rsh1sub_n
162       mpn_rsh1sub_n (w1, w4, w1, m);
163 #else
164       mpn_sub_n (w1, w4, w1, m);  ASSERT (!(w1[0] & 1));
165       mpn_rshift (w1, w1, m, 1);
166 #endif
167     }
168   mpn_sub (w4, w4, m, w0, 2*n);
169   mpn_sub_n (w4, w4, w1, m);  ASSERT (!(w4[0] & 3));
170   mpn_rshift (w4, w4, m, 2); /* w4>=0 */
171 
172   tp[w6n] = mpn_lshift (tp, w6, w6n, 4);
173   mpn_sub (w4, w4, m, tp, w6n+1);
174 
175   if (flags & toom7_w3_neg)
176     {
177 #ifdef HAVE_NATIVE_mpn_rsh1add_n
178       mpn_rsh1add_n (w3, w3, w2, m);
179 #else
180       mpn_add_n (w3, w3, w2, m);  ASSERT (!(w3[0] & 1));
181       mpn_rshift (w3, w3, m, 1);
182 #endif
183     }
184   else
185     {
186 #ifdef HAVE_NATIVE_mpn_rsh1sub_n
187       mpn_rsh1sub_n (w3, w2, w3, m);
188 #else
189       mpn_sub_n (w3, w2, w3, m);  ASSERT (!(w3[0] & 1));
190       mpn_rshift (w3, w3, m, 1);
191 #endif
192     }
193 
194   mpn_sub_n (w2, w2, w3, m);
195 
196   mpn_submul_1 (w5, w2, m, 65);
197   mpn_sub (w2, w2, m, w6, w6n);
198   mpn_sub (w2, w2, m, w0, 2*n);
199 
200   mpn_addmul_1 (w5, w2, m, 45);  ASSERT (!(w5[0] & 1));
201   mpn_rshift (w5, w5, m, 1);
202   mpn_sub_n (w4, w4, w2, m);
203 
204   mpn_divexact_by3 (w4, w4, m);
205   mpn_sub_n (w2, w2, w4, m);
206 
207   mpn_sub_n (w1, w5, w1, m);
208   mpn_lshift (tp, w3, m, 3);
209   mpn_sub_n (w5, w5, tp, m);
210   mpn_divexact_by9 (w5, w5, m);
211   mpn_sub_n (w3, w3, w5, m);
212 
213   mpn_divexact_by15 (w1, w1, m);
214   mpn_add_n (w1, w1, w5, m);  ASSERT (!(w1[0] & 1));
215   mpn_rshift (w1, w1, m, 1); /* w1>=0 now */
216   mpn_sub_n (w5, w5, w1, m);
217 
218   /* These bounds are valid for the 4x4 polynomial product of toom44,
219    * and they are conservative for toom53 and toom62. */
220   ASSERT (w1[2*n] < 2);
221   ASSERT (w2[2*n] < 3);
222   ASSERT (w3[2*n] < 4);
223   ASSERT (w4[2*n] < 3);
224   ASSERT (w5[2*n] < 2);
225 
226   /* Addition chain. Note carries and the 2n'th limbs that need to be
227    * added in.
228    *
229    * Special care is needed for w2[2n] and the corresponding carry,
230    * since the "simple" way of adding it all together would overwrite
231    * the limb at wp[2*n] and rp[4*n] (same location) with the sum of
232    * the high half of w3 and the low half of w4.
233    *
234    *         7    6    5    4    3    2    1    0
235    *    |    |    |    |    |    |    |    |    |
236    *                  ||w3 (2n+1)|
237    *             ||w4 (2n+1)|
238    *        ||w5 (2n+1)|        ||w1 (2n+1)|
239    *  + | w6 (w6n)|        ||w2 (2n+1)| w0 (2n) |  (share storage with r)
240    *  -----------------------------------------------
241    *  r |    |    |    |    |    |    |    |    |
242    *        c7   c6   c5   c4   c3                 Carries to propagate
243    */
244 
245   cy = mpn_add_n (rp + n, rp + n, w1, m);
246   MPN_INCR_U (w2 + n + 1, n , cy);
247   cy = mpn_add_n (rp + 3*n, rp + 3*n, w3, n);
248   MPN_INCR_U (w3 + n, n + 1, w2[2*n] + cy);
249   cy = mpn_add_n (rp + 4*n, w3 + n, w4, n);
250   MPN_INCR_U (w4 + n, n + 1, w3[2*n] + cy);
251   cy = mpn_add_n (rp + 5*n, w4 + n, w5, n);
252   MPN_INCR_U (w5 + n, n + 1, w4[2*n] + cy);
253   if (w6n > n + 1)
254     ASSERT_NOCARRY (mpn_add (rp + 6*n, rp + 6*n, w6n, w5 + n, n + 1));
255   else
256     {
257       ASSERT_NOCARRY (mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, w6n));
258 #if WANT_ASSERT
259       {
260 	mp_size_t i;
261 	for (i = w6n; i <= n; i++)
262 	  ASSERT (w5[n + i] == 0);
263       }
264 #endif
265     }
266 }
267