1 /* Interpolation for the algorithm Toom-Cook 6.5-way.
2
3 Contributed to the GNU project by Marco Bodrato.
4
5 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
6 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
7 GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8
9 Copyright 2009, 2010, 2012 Free Software Foundation, Inc.
10
11 This file is part of the GNU MP Library.
12
13 The GNU MP Library is free software; you can redistribute it and/or modify
14 it under the terms of either:
15
16 * the GNU Lesser General Public License as published by the Free
17 Software Foundation; either version 3 of the License, or (at your
18 option) any later version.
19
20 or
21
22 * the GNU General Public License as published by the Free Software
23 Foundation; either version 2 of the License, or (at your option) any
24 later version.
25
26 or both in parallel, as here.
27
28 The GNU MP Library is distributed in the hope that it will be useful, but
29 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
30 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
31 for more details.
32
33 You should have received copies of the GNU General Public License and the
34 GNU Lesser General Public License along with the GNU MP Library. If not,
35 see https://www.gnu.org/licenses/. */
36
37
38 #include "gmp.h"
39 #include "gmp-impl.h"
40
41
42 #if HAVE_NATIVE_mpn_sublsh_n
43 #define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
44 #else
45 static mp_limb_t
DO_mpn_sublsh_n(mp_ptr dst,mp_srcptr src,mp_size_t n,unsigned int s,mp_ptr ws)46 DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
47 {
48 #if USE_MUL_1 && 0
49 return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
50 #else
51 mp_limb_t __cy;
52 __cy = mpn_lshift(ws,src,n,s);
53 return __cy + mpn_sub_n(dst,dst,ws,n);
54 #endif
55 }
56 #endif
57
58 #if HAVE_NATIVE_mpn_addlsh_n
59 #define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
60 #else
61 static mp_limb_t
DO_mpn_addlsh_n(mp_ptr dst,mp_srcptr src,mp_size_t n,unsigned int s,mp_ptr ws)62 DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
63 {
64 #if USE_MUL_1 && 0
65 return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
66 #else
67 mp_limb_t __cy;
68 __cy = mpn_lshift(ws,src,n,s);
69 return __cy + mpn_add_n(dst,dst,ws,n);
70 #endif
71 }
72 #endif
73
74 #if HAVE_NATIVE_mpn_subrsh
75 #define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
76 #else
77 /* FIXME: This is not a correct definition, it assumes no carry */
78 #define DO_mpn_subrsh(dst,nd,src,ns,s,ws) \
79 do { \
80 mp_limb_t __cy; \
81 MPN_DECR_U (dst, nd, src[0] >> s); \
82 __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws); \
83 MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy); \
84 } while (0)
85 #endif
86
87
88 #if GMP_NUMB_BITS < 21
89 #error Not implemented: Both sublsh_n(,,,20) should be corrected.
90 #endif
91
92 #if GMP_NUMB_BITS < 16
93 #error Not implemented: divexact_by42525 needs splitting.
94 #endif
95
96 #if GMP_NUMB_BITS < 12
97 #error Not implemented: Hard to adapt...
98 #endif
99
100 /* FIXME: tuneup should decide the best variant */
101 #ifndef AORSMUL_FASTER_AORS_AORSLSH
102 #define AORSMUL_FASTER_AORS_AORSLSH 1
103 #endif
104 #ifndef AORSMUL_FASTER_AORS_2AORSLSH
105 #define AORSMUL_FASTER_AORS_2AORSLSH 1
106 #endif
107 #ifndef AORSMUL_FASTER_2AORSLSH
108 #define AORSMUL_FASTER_2AORSLSH 1
109 #endif
110 #ifndef AORSMUL_FASTER_3AORSLSH
111 #define AORSMUL_FASTER_3AORSLSH 1
112 #endif
113
114 #define BINVERT_9 \
115 ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
116
117 #define BINVERT_255 \
118 (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))
119
120 /* FIXME: find some more general expressions for 2835^-1, 42525^-1 */
121 #if GMP_LIMB_BITS == 32
122 #define BINVERT_2835 (GMP_NUMB_MASK & CNST_LIMB(0x53E3771B))
123 #define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0x9F314C35))
124 #else
125 #if GMP_LIMB_BITS == 64
126 #define BINVERT_2835 (GMP_NUMB_MASK & CNST_LIMB(0x938CC70553E3771B))
127 #define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0xE7B40D449F314C35))
128 #endif
129 #endif
130
131 #ifndef mpn_divexact_by255
132 #if GMP_NUMB_BITS % 8 == 0
133 #define mpn_divexact_by255(dst,src,size) \
134 (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
135 #else
136 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
137 #define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
138 #else
139 #define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
140 #endif
141 #endif
142 #endif
143
144 #ifndef mpn_divexact_by9x4
145 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
146 #define mpn_divexact_by9x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,2)
147 #else
148 #define mpn_divexact_by9x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<2)
149 #endif
150 #endif
151
152 #ifndef mpn_divexact_by42525
153 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
154 #define mpn_divexact_by42525(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,0)
155 #else
156 #define mpn_divexact_by42525(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525))
157 #endif
158 #endif
159
160 #ifndef mpn_divexact_by2835x4
161 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
162 #define mpn_divexact_by2835x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,2)
163 #else
164 #define mpn_divexact_by2835x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<2)
165 #endif
166 #endif
167
168 /* Interpolation for Toom-6.5 (or Toom-6), using the evaluation
169 points: infinity(6.5 only), +-4, +-2, +-1, +-1/4, +-1/2, 0. More precisely,
170 we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of
171 degree 11 (or 10), given the 12 (rsp. 11) values:
172
173 r0 = limit at infinity of f(x) / x^7,
174 r1 = f(4),f(-4),
175 r2 = f(2),f(-2),
176 r3 = f(1),f(-1),
177 r4 = f(1/4),f(-1/4),
178 r5 = f(1/2),f(-1/2),
179 r6 = f(0).
180
181 All couples of the form f(n),f(-n) must be already mixed with
182 toom_couple_handling(f(n),...,f(-n),...)
183
184 The result is stored in {pp, spt + 7*n (or 6*n)}.
185 At entry, r6 is stored at {pp, 2n},
186 r4 is stored at {pp + 3n, 3n + 1}.
187 r2 is stored at {pp + 7n, 3n + 1}.
188 r0 is stored at {pp +11n, spt}.
189
190 The other values are 3n+1 limbs each (with most significant limbs small).
191
192 Negative intermediate results are stored two-complemented.
193 Inputs are destroyed.
194 */
195
196 void
mpn_toom_interpolate_12pts(mp_ptr pp,mp_ptr r1,mp_ptr r3,mp_ptr r5,mp_size_t n,mp_size_t spt,int half,mp_ptr wsi)197 mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5,
198 mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
199 {
200 mp_limb_t cy;
201 mp_size_t n3;
202 mp_size_t n3p1;
203 n3 = 3 * n;
204 n3p1 = n3 + 1;
205
206 #define r4 (pp + n3) /* 3n+1 */
207 #define r2 (pp + 7 * n) /* 3n+1 */
208 #define r0 (pp +11 * n) /* s+t <= 2*n */
209
210 /******************************* interpolation *****************************/
211 if (half != 0) {
212 cy = mpn_sub_n (r3, r3, r0, spt);
213 MPN_DECR_U (r3 + spt, n3p1 - spt, cy);
214
215 cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi);
216 MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
217 DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi);
218
219 cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi);
220 MPN_DECR_U (r1 + spt, n3p1 - spt, cy);
221 DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi);
222 };
223
224 r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi);
225 DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi);
226
227 #if HAVE_NATIVE_mpn_add_n_sub_n
228 mpn_add_n_sub_n (r1, r4, r4, r1, n3p1);
229 #else
230 ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1));
231 mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */
232 MP_PTR_SWAP(r1, wsi);
233 #endif
234
235 r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi);
236 DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi);
237
238 #if HAVE_NATIVE_mpn_add_n_sub_n
239 mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
240 #else
241 mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
242 ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
243 MP_PTR_SWAP(r5, wsi);
244 #endif
245
246 r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n);
247
248 #if AORSMUL_FASTER_AORS_AORSLSH
249 mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */
250 #else
251 mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */
252 DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */
253 #endif
254 /* A division by 2835x4 follows. Warning: the operand can be negative! */
255 mpn_divexact_by2835x4(r4, r4, n3p1);
256 if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
257 r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));
258
259 #if AORSMUL_FASTER_2AORSLSH
260 mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */
261 #else
262 DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */
263 DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */
264 #endif
265 mpn_divexact_by255(r5, r5, n3p1);
266
267 ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi));
268
269 #if AORSMUL_FASTER_3AORSLSH
270 ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100));
271 #else
272 ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi));
273 ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi));
274 ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi));
275 #endif
276 ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi));
277 mpn_divexact_by42525(r1, r1, n3p1);
278
279 #if AORSMUL_FASTER_AORS_2AORSLSH
280 ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225));
281 #else
282 ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1));
283 ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi));
284 ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi));
285 #endif
286 mpn_divexact_by9x4(r2, r2, n3p1);
287
288 ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1));
289
290 mpn_sub_n (r4, r2, r4, n3p1);
291 ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1));
292 ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1));
293
294 mpn_add_n (r5, r5, r1, n3p1);
295 ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
296
297 /* last interpolation steps... */
298 ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
299 ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1));
300 /* ... could be mixed with recomposition
301 ||H-r5|M-r5|L-r5| ||H-r1|M-r1|L-r1|
302 */
303
304 /***************************** recomposition *******************************/
305 /*
306 pp[] prior to operations:
307 |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
308
309 summation scheme for remaining operations:
310 |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
311 |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
312 ||H r1|M r1|L r1| ||H r3|M r3|L r3| ||H_r5|M_r5|L_r5|
313 */
314
315 cy = mpn_add_n (pp + n, pp + n, r5, n);
316 cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy);
317 #if HAVE_NATIVE_mpn_add_nc
318 cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy);
319 #else
320 MPN_INCR_U (r5 + 2 * n, n + 1, cy);
321 cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n);
322 #endif
323 MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy);
324
325 pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n);
326 cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]);
327 #if HAVE_NATIVE_mpn_add_nc
328 cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy);
329 #else
330 MPN_INCR_U (r3 + 2 * n, n + 1, cy);
331 cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n);
332 #endif
333 MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);
334
335 pp[10*n]+=mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n);
336 if (half) {
337 cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]);
338 #if HAVE_NATIVE_mpn_add_nc
339 if (LIKELY (spt > n)) {
340 cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy);
341 MPN_INCR_U (pp + 4 * n3, spt - n, cy);
342 } else {
343 ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy));
344 }
345 #else
346 MPN_INCR_U (r1 + 2 * n, n + 1, cy);
347 if (LIKELY (spt > n)) {
348 cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n);
349 MPN_INCR_U (pp + 4 * n3, spt - n, cy);
350 } else {
351 ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt));
352 }
353 #endif
354 } else {
355 ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n]));
356 }
357
358 #undef r0
359 #undef r2
360 #undef r4
361 }
362