1 /* Interpolaton for the algorithm Toom-Cook 6.5-way.
2 
3    Contributed to the GNU project by Marco Bodrato.
4 
5    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
6    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
7    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8 
9 Copyright 2009, 2010 Free Software Foundation, Inc.
10 
11 This file is part of the GNU MP Library.
12 
13 The GNU MP Library is free software; you can redistribute it and/or modify
14 it under the terms of the GNU Lesser General Public License as published by
15 the Free Software Foundation; either version 3 of the License, or (at your
16 option) any later version.
17 
18 The GNU MP Library is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
20 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
21 License for more details.
22 
23 You should have received a copy of the GNU Lesser General Public License
24 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
25 
26 
27 #include "gmp.h"
28 #include "gmp-impl.h"
29 
30 
31 #if HAVE_NATIVE_mpn_sublsh_n
32 #define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
33 #else
34 static mp_limb_t
35 DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
36 {
37 #if USE_MUL_1 && 0
38   return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
39 #else
40   mp_limb_t __cy;
41   __cy = mpn_lshift(ws,src,n,s);
42   return    __cy + mpn_sub_n(dst,dst,ws,n);
43 #endif
44 }
45 #endif
46 
47 #if HAVE_NATIVE_mpn_addlsh_n
48 #define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
49 #else
50 static mp_limb_t
51 DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
52 {
53 #if USE_MUL_1 && 0
54   return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
55 #else
56   mp_limb_t __cy;
57   __cy = mpn_lshift(ws,src,n,s);
58   return    __cy + mpn_add_n(dst,dst,ws,n);
59 #endif
60 }
61 #endif
62 
63 #if HAVE_NATIVE_mpn_subrsh
64 #define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
65 #else
66 /* FIXME: This is not a correct definition, it assumes no carry */
67 #define DO_mpn_subrsh(dst,nd,src,ns,s,ws)				\
68 do {									\
69   mp_limb_t __cy;							\
70   MPN_DECR_U (dst, nd, src[0] >> s);					\
71   __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws);	\
72   MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);				\
73 } while (0)
74 #endif
75 
76 
77 #if GMP_NUMB_BITS < 21
78 #error Not implemented: Both sublsh_n(,,,20) should be corrected.
79 #endif
80 
81 #if GMP_NUMB_BITS < 16
82 #error Not implemented: divexact_by42525 needs splitting.
83 #endif
84 
85 #if GMP_NUMB_BITS < 12
86 #error Not implemented: Hard to adapt...
87 #endif
88 
89 /* FIXME: tuneup should decide the best variant */
90 #ifndef AORSMUL_FASTER_AORS_AORSLSH
91 #define AORSMUL_FASTER_AORS_AORSLSH 1
92 #endif
93 #ifndef AORSMUL_FASTER_AORS_2AORSLSH
94 #define AORSMUL_FASTER_AORS_2AORSLSH 1
95 #endif
96 #ifndef AORSMUL_FASTER_2AORSLSH
97 #define AORSMUL_FASTER_2AORSLSH 1
98 #endif
99 #ifndef AORSMUL_FASTER_3AORSLSH
100 #define AORSMUL_FASTER_3AORSLSH 1
101 #endif
102 
103 #define BINVERT_9 \
104   ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
105 
106 #define BINVERT_255 \
107   (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))
108 
109   /* FIXME: find some more general expressions for 2835^-1, 42525^-1 */
110 #if GMP_LIMB_BITS == 32
111 #define BINVERT_2835  (GMP_NUMB_MASK &		CNST_LIMB(0x53E3771B))
112 #define BINVERT_42525 (GMP_NUMB_MASK &		CNST_LIMB(0x9F314C35))
113 #else
114 #if GMP_LIMB_BITS == 64
115 #define BINVERT_2835  (GMP_NUMB_MASK &	CNST_LIMB(0x938CC70553E3771B))
116 #define BINVERT_42525 (GMP_NUMB_MASK &	CNST_LIMB(0xE7B40D449F314C35))
117 #endif
118 #endif
119 
120 #ifndef mpn_divexact_by255
121 #if GMP_NUMB_BITS % 8 == 0
122 #define mpn_divexact_by255(dst,src,size) \
123   (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
124 #else
125 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
126 #define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
127 #else
128 #define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
129 #endif
130 #endif
131 #endif
132 
133 #ifndef mpn_divexact_by9x4
134 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
135 #define mpn_divexact_by9x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,2)
136 #else
137 #define mpn_divexact_by9x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<2)
138 #endif
139 #endif
140 
141 #ifndef mpn_divexact_by42525
142 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
143 #define mpn_divexact_by42525(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,0)
144 #else
145 #define mpn_divexact_by42525(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525))
146 #endif
147 #endif
148 
149 #ifndef mpn_divexact_by2835x4
150 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
151 #define mpn_divexact_by2835x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,2)
152 #else
153 #define mpn_divexact_by2835x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<2)
154 #endif
155 #endif
156 
157 /* Interpolation for Toom-6.5 (or Toom-6), using the evaluation
158    points: infinity(6.5 only), +-4, +-2, +-1, +-1/4, +-1/2, 0. More precisely,
159    we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of
160    degree 11 (or 10), given the 12 (rsp. 11) values:
161 
162      r0 = limit at infinity of f(x) / x^7,
163      r1 = f(4),f(-4),
164      r2 = f(2),f(-2),
165      r3 = f(1),f(-1),
166      r4 = f(1/4),f(-1/4),
167      r5 = f(1/2),f(-1/2),
168      r6 = f(0).
169 
170    All couples of the form f(n),f(-n) must be already mixed with
171    toom_couple_handling(f(n),...,f(-n),...)
172 
173    The result is stored in {pp, spt + 7*n (or 6*n)}.
174    At entry, r6 is stored at {pp, 2n},
175    r4 is stored at {pp + 3n, 3n + 1}.
176    r2 is stored at {pp + 7n, 3n + 1}.
177    r0 is stored at {pp +11n, spt}.
178 
179    The other values are 3n+1 limbs each (with most significant limbs small).
180 
181    Negative intermediate results are stored two-complemented.
182    Inputs are destroyed.
183 */
184 
185 void
186 mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5,
187 			mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
188 {
189   mp_limb_t cy;
190   mp_size_t n3;
191   mp_size_t n3p1;
192   n3 = 3 * n;
193   n3p1 = n3 + 1;
194 
195 #define   r4    (pp + n3)			/* 3n+1 */
196 #define   r2    (pp + 7 * n)			/* 3n+1 */
197 #define   r0    (pp +11 * n)			/* s+t <= 2*n */
198 
199   /******************************* interpolation *****************************/
200   if (half != 0) {
201     cy = mpn_sub_n (r3, r3, r0, spt);
202     MPN_DECR_U (r3 + spt, n3p1 - spt, cy);
203 
204     cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi);
205     MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
206     DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi);
207 
208     cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi);
209     MPN_DECR_U (r1 + spt, n3p1 - spt, cy);
210     DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi);
211   };
212 
213   r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi);
214   DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi);
215 
216 #if HAVE_NATIVE_mpn_add_n_sub_n
217   mpn_add_n_sub_n (r1, r4, r4, r1, n3p1);
218 #else
219   ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1));
220   mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */
221   MP_PTR_SWAP(r1, wsi);
222 #endif
223 
224   r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi);
225   DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi);
226 
227 #if HAVE_NATIVE_mpn_add_n_sub_n
228   mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
229 #else
230   mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
231   ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
232   MP_PTR_SWAP(r5, wsi);
233 #endif
234 
235   r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n);
236 
237 #if AORSMUL_FASTER_AORS_AORSLSH
238   mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */
239 #else
240   mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */
241   DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */
242 #endif
243   /* A division by 2835x4 followsi. Warning: the operand can be negative! */
244   mpn_divexact_by2835x4(r4, r4, n3p1);
245   if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
246     r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));
247 
248 #if AORSMUL_FASTER_2AORSLSH
249   mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */
250 #else
251   DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */
252   DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */
253 #endif
254   mpn_divexact_by255(r5, r5, n3p1);
255 
256   ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi));
257 
258 #if AORSMUL_FASTER_3AORSLSH
259   ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100));
260 #else
261   ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi));
262   ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi));
263   ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi));
264 #endif
265   ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi));
266   mpn_divexact_by42525(r1, r1, n3p1);
267 
268 #if AORSMUL_FASTER_AORS_2AORSLSH
269   ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225));
270 #else
271   ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1));
272   ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi));
273   ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi));
274 #endif
275   mpn_divexact_by9x4(r2, r2, n3p1);
276 
277   ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1));
278 
279   mpn_sub_n (r4, r2, r4, n3p1);
280   ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1));
281   ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1));
282 
283   mpn_add_n (r5, r5, r1, n3p1);
284   ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
285 
286   /* last interpolation steps... */
287   ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
288   ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1));
289   /* ... could be mixed with recomposition
290 	||H-r5|M-r5|L-r5|   ||H-r1|M-r1|L-r1|
291   */
292 
293   /***************************** recomposition *******************************/
294   /*
295     pp[] prior to operations:
296     |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
297 
298     summation scheme for remaining operations:
299     |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
300     |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
301 	||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|
302   */
303 
304   cy = mpn_add_n (pp + n, pp + n, r5, n);
305   cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy);
306 #if HAVE_NATIVE_mpn_add_nc
307   cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy);
308 #else
309   MPN_INCR_U (r5 + 2 * n, n + 1, cy);
310   cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n);
311 #endif
312   MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy);
313 
314   pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n);
315   cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]);
316 #if HAVE_NATIVE_mpn_add_nc
317   cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy);
318 #else
319   MPN_INCR_U (r3 + 2 * n, n + 1, cy);
320   cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n);
321 #endif
322   MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);
323 
324   pp[10*n]+=mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n);
325   if (half) {
326     cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]);
327 #if HAVE_NATIVE_mpn_add_nc
328     if (LIKELY (spt > n)) {
329       cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy);
330       MPN_INCR_U (pp + 4 * n3, spt - n, cy);
331     } else {
332       ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy));
333     }
334 #else
335     MPN_INCR_U (r1 + 2 * n, n + 1, cy);
336     if (LIKELY (spt > n)) {
337       cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n);
338       MPN_INCR_U (pp + 4 * n3, spt - n, cy);
339     } else {
340       ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt));
341     }
342 #endif
343   } else {
344     ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n]));
345   }
346 
347 #undef   r0
348 #undef   r2
349 #undef   r4
350 }
351