1 /* Interpolation for the algorithm Toom-Cook 6.5-way.
2 
3    Contributed to the GNU project by Marco Bodrato.
4 
5    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
6    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
7    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8 
9 Copyright 2009, 2010, 2012 Free Software Foundation, Inc.
10 
11 This file is part of the GNU MP Library.
12 
13 The GNU MP Library is free software; you can redistribute it and/or modify
14 it under the terms of either:
15 
16   * the GNU Lesser General Public License as published by the Free
17     Software Foundation; either version 3 of the License, or (at your
18     option) any later version.
19 
20 or
21 
22   * the GNU General Public License as published by the Free Software
23     Foundation; either version 2 of the License, or (at your option) any
24     later version.
25 
26 or both in parallel, as here.
27 
28 The GNU MP Library is distributed in the hope that it will be useful, but
29 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
30 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
31 for more details.
32 
33 You should have received copies of the GNU General Public License and the
34 GNU Lesser General Public License along with the GNU MP Library.  If not,
35 see https://www.gnu.org/licenses/.  */
36 
37 
38 #include "gmp.h"
39 #include "gmp-impl.h"
40 
41 
42 #if HAVE_NATIVE_mpn_sublsh_n
43 #define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
44 #else
45 static mp_limb_t
DO_mpn_sublsh_n(mp_ptr dst,mp_srcptr src,mp_size_t n,unsigned int s,mp_ptr ws)46 DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
47 {
48 #if USE_MUL_1 && 0
49   return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
50 #else
51   mp_limb_t __cy;
52   __cy = mpn_lshift(ws,src,n,s);
53   return    __cy + mpn_sub_n(dst,dst,ws,n);
54 #endif
55 }
56 #endif
57 
58 #if HAVE_NATIVE_mpn_addlsh_n
59 #define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
60 #else
61 static mp_limb_t
DO_mpn_addlsh_n(mp_ptr dst,mp_srcptr src,mp_size_t n,unsigned int s,mp_ptr ws)62 DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
63 {
64 #if USE_MUL_1 && 0
65   return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
66 #else
67   mp_limb_t __cy;
68   __cy = mpn_lshift(ws,src,n,s);
69   return    __cy + mpn_add_n(dst,dst,ws,n);
70 #endif
71 }
72 #endif
73 
74 #if HAVE_NATIVE_mpn_subrsh
75 #define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
76 #else
77 /* FIXME: This is not a correct definition, it assumes no carry */
78 #define DO_mpn_subrsh(dst,nd,src,ns,s,ws)				\
79 do {									\
80   mp_limb_t __cy;							\
81   MPN_DECR_U (dst, nd, src[0] >> s);					\
82   __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws);	\
83   MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);				\
84 } while (0)
85 #endif
86 
87 
88 #if GMP_NUMB_BITS < 21
89 #error Not implemented: Both sublsh_n(,,,20) should be corrected.
90 #endif
91 
92 #if GMP_NUMB_BITS < 16
93 #error Not implemented: divexact_by42525 needs splitting.
94 #endif
95 
96 #if GMP_NUMB_BITS < 12
97 #error Not implemented: Hard to adapt...
98 #endif
99 
100 /* FIXME: tuneup should decide the best variant */
101 #ifndef AORSMUL_FASTER_AORS_AORSLSH
102 #define AORSMUL_FASTER_AORS_AORSLSH 1
103 #endif
104 #ifndef AORSMUL_FASTER_AORS_2AORSLSH
105 #define AORSMUL_FASTER_AORS_2AORSLSH 1
106 #endif
107 #ifndef AORSMUL_FASTER_2AORSLSH
108 #define AORSMUL_FASTER_2AORSLSH 1
109 #endif
110 #ifndef AORSMUL_FASTER_3AORSLSH
111 #define AORSMUL_FASTER_3AORSLSH 1
112 #endif
113 
114 #define BINVERT_9 \
115   ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
116 
117 #define BINVERT_255 \
118   (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))
119 
120   /* FIXME: find some more general expressions for 2835^-1, 42525^-1 */
121 #if GMP_LIMB_BITS == 32
122 #define BINVERT_2835  (GMP_NUMB_MASK &		CNST_LIMB(0x53E3771B))
123 #define BINVERT_42525 (GMP_NUMB_MASK &		CNST_LIMB(0x9F314C35))
124 #else
125 #if GMP_LIMB_BITS == 64
126 #define BINVERT_2835  (GMP_NUMB_MASK &	CNST_LIMB(0x938CC70553E3771B))
127 #define BINVERT_42525 (GMP_NUMB_MASK &	CNST_LIMB(0xE7B40D449F314C35))
128 #endif
129 #endif
130 
131 #ifndef mpn_divexact_by255
132 #if GMP_NUMB_BITS % 8 == 0
133 #define mpn_divexact_by255(dst,src,size) \
134   (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
135 #else
136 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
137 #define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
138 #else
139 #define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
140 #endif
141 #endif
142 #endif
143 
144 #ifndef mpn_divexact_by9x4
145 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
146 #define mpn_divexact_by9x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,2)
147 #else
148 #define mpn_divexact_by9x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<2)
149 #endif
150 #endif
151 
152 #ifndef mpn_divexact_by42525
153 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
154 #define mpn_divexact_by42525(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,0)
155 #else
156 #define mpn_divexact_by42525(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525))
157 #endif
158 #endif
159 
160 #ifndef mpn_divexact_by2835x4
161 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
162 #define mpn_divexact_by2835x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,2)
163 #else
164 #define mpn_divexact_by2835x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<2)
165 #endif
166 #endif
167 
168 /* Interpolation for Toom-6.5 (or Toom-6), using the evaluation
169    points: infinity(6.5 only), +-4, +-2, +-1, +-1/4, +-1/2, 0. More precisely,
170    we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of
171    degree 11 (or 10), given the 12 (rsp. 11) values:
172 
173      r0 = limit at infinity of f(x) / x^7,
174      r1 = f(4),f(-4),
175      r2 = f(2),f(-2),
176      r3 = f(1),f(-1),
177      r4 = f(1/4),f(-1/4),
178      r5 = f(1/2),f(-1/2),
179      r6 = f(0).
180 
181    All couples of the form f(n),f(-n) must be already mixed with
182    toom_couple_handling(f(n),...,f(-n),...)
183 
184    The result is stored in {pp, spt + 7*n (or 6*n)}.
185    At entry, r6 is stored at {pp, 2n},
186    r4 is stored at {pp + 3n, 3n + 1}.
187    r2 is stored at {pp + 7n, 3n + 1}.
188    r0 is stored at {pp +11n, spt}.
189 
190    The other values are 3n+1 limbs each (with most significant limbs small).
191 
192    Negative intermediate results are stored two-complemented.
193    Inputs are destroyed.
194 */
195 
196 void
mpn_toom_interpolate_12pts(mp_ptr pp,mp_ptr r1,mp_ptr r3,mp_ptr r5,mp_size_t n,mp_size_t spt,int half,mp_ptr wsi)197 mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5,
198 			mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
199 {
200   mp_limb_t cy;
201   mp_size_t n3;
202   mp_size_t n3p1;
203   n3 = 3 * n;
204   n3p1 = n3 + 1;
205 
206 #define   r4    (pp + n3)			/* 3n+1 */
207 #define   r2    (pp + 7 * n)			/* 3n+1 */
208 #define   r0    (pp +11 * n)			/* s+t <= 2*n */
209 
210   /******************************* interpolation *****************************/
211   if (half != 0) {
212     cy = mpn_sub_n (r3, r3, r0, spt);
213     MPN_DECR_U (r3 + spt, n3p1 - spt, cy);
214 
215     cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi);
216     MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
217     DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi);
218 
219     cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi);
220     MPN_DECR_U (r1 + spt, n3p1 - spt, cy);
221     DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi);
222   };
223 
224   r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi);
225   DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi);
226 
227 #if HAVE_NATIVE_mpn_add_n_sub_n
228   mpn_add_n_sub_n (r1, r4, r4, r1, n3p1);
229 #else
230   ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1));
231   mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */
232   MP_PTR_SWAP(r1, wsi);
233 #endif
234 
235   r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi);
236   DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi);
237 
238 #if HAVE_NATIVE_mpn_add_n_sub_n
239   mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
240 #else
241   mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
242   ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
243   MP_PTR_SWAP(r5, wsi);
244 #endif
245 
246   r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n);
247 
248 #if AORSMUL_FASTER_AORS_AORSLSH
249   mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */
250 #else
251   mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */
252   DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */
253 #endif
254   /* A division by 2835x4 follows. Warning: the operand can be negative! */
255   mpn_divexact_by2835x4(r4, r4, n3p1);
256   if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
257     r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));
258 
259 #if AORSMUL_FASTER_2AORSLSH
260   mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */
261 #else
262   DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */
263   DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */
264 #endif
265   mpn_divexact_by255(r5, r5, n3p1);
266 
267   ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi));
268 
269 #if AORSMUL_FASTER_3AORSLSH
270   ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100));
271 #else
272   ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi));
273   ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi));
274   ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi));
275 #endif
276   ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi));
277   mpn_divexact_by42525(r1, r1, n3p1);
278 
279 #if AORSMUL_FASTER_AORS_2AORSLSH
280   ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225));
281 #else
282   ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1));
283   ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi));
284   ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi));
285 #endif
286   mpn_divexact_by9x4(r2, r2, n3p1);
287 
288   ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1));
289 
290   mpn_sub_n (r4, r2, r4, n3p1);
291   ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1));
292   ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1));
293 
294   mpn_add_n (r5, r5, r1, n3p1);
295   ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
296 
297   /* last interpolation steps... */
298   ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
299   ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1));
300   /* ... could be mixed with recomposition
301 	||H-r5|M-r5|L-r5|   ||H-r1|M-r1|L-r1|
302   */
303 
304   /***************************** recomposition *******************************/
305   /*
306     pp[] prior to operations:
307     |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
308 
309     summation scheme for remaining operations:
310     |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
311     |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
312 	||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|
313   */
314 
315   cy = mpn_add_n (pp + n, pp + n, r5, n);
316   cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy);
317 #if HAVE_NATIVE_mpn_add_nc
318   cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy);
319 #else
320   MPN_INCR_U (r5 + 2 * n, n + 1, cy);
321   cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n);
322 #endif
323   MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy);
324 
325   pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n);
326   cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]);
327 #if HAVE_NATIVE_mpn_add_nc
328   cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy);
329 #else
330   MPN_INCR_U (r3 + 2 * n, n + 1, cy);
331   cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n);
332 #endif
333   MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);
334 
335   pp[10*n]+=mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n);
336   if (half) {
337     cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]);
338 #if HAVE_NATIVE_mpn_add_nc
339     if (LIKELY (spt > n)) {
340       cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy);
341       MPN_INCR_U (pp + 4 * n3, spt - n, cy);
342     } else {
343       ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy));
344     }
345 #else
346     MPN_INCR_U (r1 + 2 * n, n + 1, cy);
347     if (LIKELY (spt > n)) {
348       cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n);
349       MPN_INCR_U (pp + 4 * n3, spt - n, cy);
350     } else {
351       ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt));
352     }
353 #endif
354   } else {
355     ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n]));
356   }
357 
358 #undef   r0
359 #undef   r2
360 #undef   r4
361 }
362