/* Interpolation for the algorithm Toom-Cook 8.5-way.
2 
3    Contributed to the GNU project by Marco Bodrato.
4 
5    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
6    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
7    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8 
9 Copyright 2009, 2010, 2012 Free Software Foundation, Inc.
10 
11 This file is part of the GNU MP Library.
12 
13 The GNU MP Library is free software; you can redistribute it and/or modify
14 it under the terms of the GNU Lesser General Public License as published by
15 the Free Software Foundation; either version 3 of the License, or (at your
16 option) any later version.
17 
18 The GNU MP Library is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
20 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
21 License for more details.
22 
23 You should have received a copy of the GNU Lesser General Public License
24 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
25 
26 
27 #include "mpir.h"
28 #include "gmp-impl.h"
29 
30 #if GMP_NUMB_BITS < 29
31 #error Not implemented: Both sublsh_n(,,,28) should be corrected; r2 and r5 need one more LIMB.
32 #endif
33 
34 #if GMP_NUMB_BITS < 28
35 #error Not implemented: divexact_by188513325 and _by182712915 will not work.
36 #endif
37 
38 
39 #if HAVE_NATIVE_mpn_sublsh_n
40 #define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
41 #else
42 static mp_limb_t
DO_mpn_sublsh_n(mp_ptr dst,mp_srcptr src,mp_size_t n,unsigned int s,mp_ptr ws)43 DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
44 {
45 #if USE_MUL_1 && 0
46   return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
47 #else
48   mp_limb_t __cy;
49   __cy = mpn_lshift(ws,src,n,s);
50   return    __cy + mpn_sub_n(dst,dst,ws,n);
51 #endif
52 }
53 #endif
54 
55 #if HAVE_NATIVE_mpn_addlsh_n
56 #define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
57 #else
58 static mp_limb_t
DO_mpn_addlsh_n(mp_ptr dst,mp_srcptr src,mp_size_t n,unsigned int s,mp_ptr ws)59 DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
60 {
61 #if USE_MUL_1 && 0
62   return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
63 #else
64   mp_limb_t __cy;
65   __cy = mpn_lshift(ws,src,n,s);
66   return    __cy + mpn_add_n(dst,dst,ws,n);
67 #endif
68 }
69 #endif
70 
#if HAVE_NATIVE_mpn_subrsh
#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
#else
/* FIXME: This is not a correct definition, it assumes no carry */
/* {dst,nd} -= {src,ns} >> s   (ns <= nd, 0 < s < GMP_NUMB_BITS).
   The low s bits of src[0] are discarded.  Implemented by first
   subtracting the shifted low limb, then subtracting the remaining
   ns-1 limbs pre-shifted left by GMP_NUMB_BITS - s, and finally
   propagating the resulting borrow into the high part of dst.
   ws is scratch passed through to DO_mpn_sublsh_n.  */
#define DO_mpn_subrsh(dst,nd,src,ns,s,ws)				\
do {									\
  mp_limb_t __cy;							\
  MPN_DECR_U (dst, nd, src[0] >> s);					\
  __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws);	\
  MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);				\
} while (0)
#endif
83 
84 
/* FIXME: tuneup should decide the best variant */
/* These knobs choose, for each elimination step below, between a
   single mpn_addmul_1/mpn_submul_1 call (value 1) and an equivalent
   sequence of plain add/sub plus shifted add/sub operations
   (value 0).  The default assumes the mul_1 variants are faster.  */
#ifndef AORSMUL_FASTER_AORS_AORSLSH
#define AORSMUL_FASTER_AORS_AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_AORS_2AORSLSH
#define AORSMUL_FASTER_AORS_2AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_2AORSLSH
#define AORSMUL_FASTER_2AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_3AORSLSH
#define AORSMUL_FASTER_3AORSLSH 1
#endif

/* The interpolation needs a shift by 42 bits (see the uses of
   42 - CORRECTION_BITS below).  When limbs are narrower than 43 bits
   that shift cannot be done in one mpn_lshift, so it is split into a
   displacement of one whole limb (BIT_CORRECTION) plus a shift
   reduced by CORRECTION_BITS = GMP_NUMB_BITS.  */
#if GMP_NUMB_BITS < 43
#define BIT_CORRECTION 1
#define CORRECTION_BITS GMP_NUMB_BITS
#else
#define BIT_CORRECTION 0
#define CORRECTION_BITS 0
#endif
106 
/* 2-adic inverses of the odd divisors appearing in the exact
   divisions below; each BINVERT_x is the precomputed `inverse'
   argument expected by mpn_pi1_bdiv_q_1 when dividing exactly
   by x.  */
#define BINVERT_9 \
  ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)

#define BINVERT_255 \
  (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))

  /* FIXME: find some more general expressions for inverses */
#if GMP_LIMB_BITS == 32
#define BINVERT_2835  (GMP_NUMB_MASK &		CNST_LIMB(0x53E3771B))
#define BINVERT_42525 (GMP_NUMB_MASK &		CNST_LIMB(0x9F314C35))
#define BINVERT_182712915 (GMP_NUMB_MASK &	CNST_LIMB(0x550659DB))
#define BINVERT_188513325 (GMP_NUMB_MASK &	CNST_LIMB(0xFBC333A5))
#define BINVERT_255x182712915L (GMP_NUMB_MASK &	CNST_LIMB(0x6FC4CB25))
#define BINVERT_255x188513325L (GMP_NUMB_MASK &	CNST_LIMB(0x6864275B))
#if GMP_NAIL_BITS == 0
#define BINVERT_255x182712915H CNST_LIMB(0x1B649A07)
#define BINVERT_255x188513325H CNST_LIMB(0x06DB993A)
#else /* GMP_NAIL_BITS != 0 */
#define BINVERT_255x182712915H \
  (GMP_NUMB_MASK & CNST_LIMB((0x1B649A07<<GMP_NAIL_BITS) | (0x6FC4CB25>>GMP_NUMB_BITS)))
#define BINVERT_255x188513325H \
  (GMP_NUMB_MASK & CNST_LIMB((0x06DB993A<<GMP_NAIL_BITS) | (0x6864275B>>GMP_NUMB_BITS)))
#endif
#else
#if GMP_LIMB_BITS == 64
#define BINVERT_2835  (GMP_NUMB_MASK &	CNST_LIMB(0x938CC70553E3771B))
#define BINVERT_42525 (GMP_NUMB_MASK &	CNST_LIMB(0xE7B40D449F314C35))
#define BINVERT_255x182712915  (GMP_NUMB_MASK &	CNST_LIMB(0x1B649A076FC4CB25))
#define BINVERT_255x188513325  (GMP_NUMB_MASK &	CNST_LIMB(0x06DB993A6864275B))
#endif
#endif
138 
/* Exact-division helpers for the small divisors used by the
   interpolation.  Each macro prefers a native mpn_pi1_bdiv_q_1
   (Hensel division with the precomputed BINVERT_* inverse and a
   trailing-zero count for the power-of-two factor) and falls back
   to mpn_divexact_1 on the full divisor.  */
#ifndef mpn_divexact_by255
#if GMP_NUMB_BITS % 8 == 0
#define mpn_divexact_by255(dst,src,size) \
  (255 & 1 * mpn_divexact_byfobm1(dst, src, size, 255, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
#else
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
#else
#define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
#endif
#endif
#endif

/* dst = src / (255 * 4), exactly.  */
#ifndef mpn_divexact_by255x4
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by255x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,2)
#else
#define mpn_divexact_by255x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255)<<2)
#endif
#endif

/* dst = src / (9 * 16), exactly.  */
#ifndef mpn_divexact_by9x16
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by9x16(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,4)
#else
#define mpn_divexact_by9x16(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<4)
#endif
#endif

/* dst = src / (42525 * 16), exactly.  */
#ifndef mpn_divexact_by42525x16
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
#define mpn_divexact_by42525x16(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,4)
#else
#define mpn_divexact_by42525x16(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525)<<4)
#endif
#endif

/* dst = src / (2835 * 64), exactly.  */
#ifndef mpn_divexact_by2835x64
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
#define mpn_divexact_by2835x64(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,6)
#else
#define mpn_divexact_by2835x64(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<6)
#endif
#endif
183 
/* Exact division by the two large composite divisors 255*182712915
   and 255*188513325.  When the numb is too narrow to hold the full
   product (GMP_NUMB_BITS < 36) the division is done in two passes,
   first by the large odd factor and then by 255; otherwise a single
   pass with the combined divisor and inverse is used.  */
#ifndef  mpn_divexact_by255x182712915
#if GMP_NUMB_BITS < 36
#if HAVE_NATIVE_mpn_bdiv_q_2_pi2 && defined(BINVERT_255x182712915H)
/* FIXME: use mpn_bdiv_q_2_pi2 */
#endif
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_182712915)
#define mpn_divexact_by255x182712915(dst,src,size)				\
  do {										\
    mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(182712915),BINVERT_182712915,0);	\
    mpn_divexact_by255(dst,dst,size);						\
  } while(0)
#else
#define mpn_divexact_by255x182712915(dst,src,size)	\
  do {							\
    mpn_divexact_1(dst,src,size,CNST_LIMB(182712915));	\
    mpn_divexact_by255(dst,dst,size);			\
  } while(0)
#endif
#else /* GMP_NUMB_BITS > 35 */
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_255x182712915)
#define mpn_divexact_by255x182712915(dst,src,size) \
  mpn_pi1_bdiv_q_1(dst,src,size,255*CNST_LIMB(182712915),BINVERT_255x182712915,0)
#else
#define mpn_divexact_by255x182712915(dst,src,size) mpn_divexact_1(dst,src,size,255*CNST_LIMB(182712915))
#endif
#endif /* GMP_NUMB_BITS >?< 36 */
#endif

#ifndef  mpn_divexact_by255x188513325
#if GMP_NUMB_BITS < 36
#if HAVE_NATIVE_mpn_bdiv_q_1_pi2 && defined(BINVERT_255x188513325H)
/* FIXME: use mpn_bdiv_q_1_pi2 */
#endif
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_188513325)
#define mpn_divexact_by255x188513325(dst,src,size)			\
  do {									\
    mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(188513325),BINVERT_188513325,0);	\
    mpn_divexact_by255(dst,dst,size);					\
  } while(0)
#else
#define mpn_divexact_by255x188513325(dst,src,size)	\
  do {							\
    mpn_divexact_1(dst,src,size,CNST_LIMB(188513325));	\
    mpn_divexact_by255(dst,dst,size);			\
  } while(0)
#endif
#else /* GMP_NUMB_BITS > 35 */
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_255x188513325)
#define mpn_divexact_by255x188513325(dst,src,size) \
  mpn_pi1_bdiv_q_1(dst,src,size,255*CNST_LIMB(188513325),BINVERT_255x188513325,0)
#else
#define mpn_divexact_by255x188513325(dst,src,size) mpn_divexact_1(dst,src,size,255*CNST_LIMB(188513325))
#endif
#endif /* GMP_NUMB_BITS >?< 36 */
#endif
239 
240 /* Interpolation for Toom-8.5 (or Toom-8), using the evaluation
241    points: infinity(8.5 only), +-8, +-4, +-2, +-1, +-1/4, +-1/2,
242    +-1/8, 0. More precisely, we want to compute
243    f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 15 (or
   14), given the 16 (resp. 15) values:
245 
246      r0 = limit at infinity of f(x) / x^7,
247      r1 = f(8),f(-8),
248      r2 = f(4),f(-4),
249      r3 = f(2),f(-2),
250      r4 = f(1),f(-1),
251      r5 = f(1/4),f(-1/4),
252      r6 = f(1/2),f(-1/2),
253      r7 = f(1/8),f(-1/8),
254      r8 = f(0).
255 
256    All couples of the form f(n),f(-n) must be already mixed with
257    toom_couple_handling(f(n),...,f(-n),...)
258 
259    The result is stored in {pp, spt + 7*n (or 8*n)}.
260    At entry, r8 is stored at {pp, 2n},
261    r6 is stored at {pp + 3n, 3n + 1}.
262    r4 is stored at {pp + 7n, 3n + 1}.
263    r2 is stored at {pp +11n, 3n + 1}.
264    r0 is stored at {pp +15n, spt}.
265 
266    The other values are 3n+1 limbs each (with most significant limbs small).
267 
   Negative intermediate results are stored in two's complement form.
269    Inputs are destroyed.
270 */
271 
void
mpn_toom_interpolate_16pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5, mp_ptr r7,
			mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
{
  mp_limb_t cy;
  mp_size_t n3;
  mp_size_t n3p1;
  n3 = 3 * n;
  n3p1 = n3 + 1;

  /* The even-indexed rows live inside pp itself at the offsets given
     in the block comment above; the odd-indexed rows are passed as
     separate buffers.  */
#define   r6    (pp + n3)			/* 3n+1 */
#define   r4    (pp + 7 * n)			/* 3n+1 */
#define   r2    (pp +11 * n)			/* 3n+1 */
#define   r0    (pp +15 * n)			/* s+t <= 2*n */

  ASSERT( spt <= 2 * n );
  /******************************* interpolation *****************************/
  if( half != 0) {
    /* Toom-8.5 only: subtract the contribution of r0 ({r0,spt}) from
       the other rows.  r0 enters r4 unshifted, r3/r2/r1 shifted left
       by 14/28/42 bits, and r6/r5/r7 shifted right by 2/4/6 bits.  */
    cy = mpn_sub_n (r4, r4, r0, spt);
    MPN_DECR_U (r4 + spt, n3p1 - spt, cy);

    cy = DO_mpn_sublsh_n (r3, r0, spt, 14, wsi);
    MPN_DECR_U (r3 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh(r6, n3p1, r0, spt, 2, wsi);

    cy = DO_mpn_sublsh_n (r2, r0, spt, 28, wsi);
    MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh(r5, n3p1, r0, spt, 4, wsi);

    /* The 42-bit shift may have to be split into a one-limb
       displacement (BIT_CORRECTION) plus a reduced bit shift when
       limbs are narrower than 43 bits.  */
    cy = DO_mpn_sublsh_n (r1 + BIT_CORRECTION, r0, spt, 42 - CORRECTION_BITS, wsi);
    //MPN_DECR_U (r1 + spt + BIT_CORRECTION, n3p1 - spt - BIT_CORRECTION, cy);
#if BIT_CORRECTION
    cy = mpn_sub_1 (r1 + spt + BIT_CORRECTION, r1 + spt + BIT_CORRECTION,
                    n3p1 - spt - BIT_CORRECTION, cy);
    ASSERT (BIT_CORRECTION > 0 || cy == 0);
    /* FIXME: assumes r7[n3p1] is writable (it is if r5 follows). */
    /* Save the limb just past r7 and plant a guard value so the
       one-limb-wider subrsh below can borrow into it; the saved limb
       is restored afterwards.  */
    cy = r7[n3p1];
    r7[n3p1] = 0x80;
#else
    MPN_DECR_U (r1 + spt + BIT_CORRECTION, n3p1 - spt - BIT_CORRECTION, cy);
#endif
    DO_mpn_subrsh(r7, n3p1 + BIT_CORRECTION, r0, spt, 6, wsi);
#if BIT_CORRECTION
    /* FIXME: assumes r7[n3p1] is writable. */
    ASSERT ( BIT_CORRECTION > 0 || r7[n3p1] == 0x80 );
    r7[n3p1] = cy;
#endif
  };

  /* Subtract the contribution of r8 = f(0) ({pp,2n}), at limb offset
     n: from r5 shifted left 28 bits, from r2 shifted right 4 bits.  */
  r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 28, wsi);
  DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 4, wsi);

  /* r2 <- r2 + r5, r5 <- r5 - r2 (the difference may be negative).  */
#if HAVE_NATIVE_mpn_sumdiff_n
  mpn_sumdiff_n (r2, wsi, r5, r2, n3p1);
  MP_PTR_SWAP(r5,wsi);
#else
  mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
  ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
  MP_PTR_SWAP(r5, wsi);
#endif

  /* Likewise for r8: from r6 shifted left 14 bits, from r3 shifted
     right 2 bits, both at limb offset n.  */
  r6[n3] -= DO_mpn_sublsh_n (r6 + n, pp, 2 * n, 14, wsi);
  DO_mpn_subrsh(r3 + n, 2 * n + 1, pp, 2 * n, 2, wsi);

  /* r3 <- r3 + r6, r6 <- r6 - r3 (the difference may be negative).  */
#if HAVE_NATIVE_mpn_sumdiff_n
  mpn_sumdiff_n (wsi, r6, r6, r3, n3p1);
  MP_PTR_SWAP(r3, wsi);
#else
  ASSERT_NOCARRY(mpn_add_n (wsi, r3, r6, n3p1));
  mpn_sub_n (r6, r6, r3, n3p1); /* can be negative */
  MP_PTR_SWAP(r3, wsi);
#endif

  /* Likewise for r8: from r7 shifted left 42 bits (split as above),
     from r1 shifted right 6 bits.  */
  r7[n3] -= DO_mpn_sublsh_n (r7 + n + BIT_CORRECTION, pp, 2 * n, 42 - CORRECTION_BITS, wsi)
	    * (1-BIT_CORRECTION); /* if BIT_CORRECTION != 0, discard the carry. */
#if BIT_CORRECTION
  /* Inlined right-shift-by-6 subtraction of {pp,2n} from r1 + n, with
     explicit borrow propagation into the top two limbs.  */
  MPN_DECR_U (r1 + n, 2 * n + 1, pp[0] >> 6);
  cy = DO_mpn_sublsh_n (r1 + n, pp + 1, 2 * n - 1, GMP_NUMB_BITS - 6, wsi);
  cy = mpn_sub_1(r1 + 3 * n - 1, r1 + 3 * n - 1, 2, cy);
  ASSERT ( BIT_CORRECTION > 0 || cy != 0 );
#else
  DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 6, wsi);
#endif

  /* r1 <- r1 + r7, r7 <- r7 - r1 (the difference may be negative).  */
#if HAVE_NATIVE_mpn_sumdiff_n
  mpn_sumdiff_n (r1, wsi, r7, r1, n3p1);
  MP_PTR_SWAP(r7, wsi);
#else
  mpn_sub_n (wsi, r7, r1, n3p1); /* can be negative */
  mpn_add_n (r1, r1, r7, n3p1);  /* if BIT_CORRECTION != 0, can give a carry. */
  MP_PTR_SWAP(r7, wsi);
#endif

  /* Subtract {pp,2n} (r8) from r4 at limb offset n.  */
  r4[n3] -= mpn_sub_n (r4+n, r4+n, pp, 2 * n);

  /* Linear elimination between rows.  The multiplier constants
     (1028, 1300, 1052688, ...) are the entries of the Toom-8.5
     interpolation sequence; intermediate operands may be negative
     (two's complement).  */
#if AORSMUL_FASTER_2AORSLSH
  mpn_submul_1 (r5, r6, n3p1, 1028); /* can be negative */
#else
  DO_mpn_sublsh_n (r5, r6, n3p1, 2, wsi); /* can be negative */
  DO_mpn_sublsh_n (r5, r6, n3p1,10, wsi); /* can be negative */
#endif

  mpn_submul_1 (r7, r5, n3p1, 1300); /* can be negative */
#if AORSMUL_FASTER_3AORSLSH
  mpn_submul_1 (r7, r6, n3p1, 1052688); /* can be negative */
#else
  DO_mpn_sublsh_n (r7, r6, n3p1, 4, wsi); /* can be negative */
  DO_mpn_sublsh_n (r7, r6, n3p1,12, wsi); /* can be negative */
  DO_mpn_sublsh_n (r7, r6, n3p1,20, wsi); /* can be negative */
#endif
  mpn_divexact_by255x188513325(r7, r7, n3p1);

  mpn_submul_1 (r5, r7, n3p1, 12567555); /* can be negative */
  /* A division by 2835x64 follows. Warning: the operand can be negative! */
  mpn_divexact_by2835x64(r5, r5, n3p1);
  /* If the quotient is negative, sign-extend the bits of the top limb.  */
  if ((r5[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-7))) != 0)
    r5[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-6));

#if AORSMUL_FASTER_AORS_AORSLSH
  mpn_submul_1 (r6, r7, n3p1, 4095); /* can be negative */
#else
  mpn_add_n (r6, r6, r7, n3p1); /* can give a carry */
  DO_mpn_sublsh_n (r6, r7, n3p1, 12, wsi); /* can be negative */
#endif
#if AORSMUL_FASTER_2AORSLSH
  mpn_addmul_1 (r6, r5, n3p1, 240); /* can be negative */
#else
  DO_mpn_addlsh_n (r6, r5, n3p1, 8, wsi); /* can give a carry */
  DO_mpn_sublsh_n (r6, r5, n3p1, 4, wsi); /* can be negative */
#endif
  /* A division by 255x4 follows. Warning: the operand can be negative! */
  mpn_divexact_by255x4(r6, r6, n3p1);
  /* If the quotient is negative, sign-extend the bits of the top limb.  */
  if ((r6[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
    r6[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));

  ASSERT_NOCARRY(DO_mpn_sublsh_n (r3, r4, n3p1, 7, wsi));

  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r4, n3p1, 13, wsi));
  ASSERT_NOCARRY(mpn_submul_1 (r2, r3, n3p1, 400));

  /* If GMP_NUMB_BITS < 42 next operations on r1 can give a carry!*/
  DO_mpn_sublsh_n (r1, r4, n3p1, 19, wsi);
  mpn_submul_1 (r1, r2, n3p1, 1428);
  mpn_submul_1 (r1, r3, n3p1, 112896);
  mpn_divexact_by255x182712915(r1, r1, n3p1);

  ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 15181425));
  mpn_divexact_by42525x16(r2, r2, n3p1);

#if AORSMUL_FASTER_AORS_2AORSLSH
  ASSERT_NOCARRY(mpn_submul_1 (r3, r1, n3p1, 3969));
#else
  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
  ASSERT_NOCARRY(DO_mpn_addlsh_n (r3, r1, n3p1, 7, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r3, r1, n3p1, 12, wsi));
#endif
  ASSERT_NOCARRY(mpn_submul_1 (r3, r2, n3p1, 900));
  mpn_divexact_by9x16(r3, r3, n3p1);

  /* r4 <- r4 - r1 - r2 - r3.  */
  ASSERT_NOCARRY(mpn_sub_n (r4, r4, r1, n3p1));
  ASSERT_NOCARRY(mpn_sub_n (r4, r4, r3, n3p1));
  ASSERT_NOCARRY(mpn_sub_n (r4, r4, r2, n3p1));

  /* Final pairwise steps:
     r6 <- (r6 + r2)/2 then r2 <- r2 - r6;
     r5 <- (r3 - r5)/2 then r3 <- r3 - r5;
     r7 <- (r7 + r1)/2 then r1 <- r1 - r7.  */
  mpn_add_n (r6, r2, r6, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r6, r6, n3p1, 1));
  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r6, n3p1));

  mpn_sub_n (r5, r3, r5, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, n3p1));

  mpn_add_n (r7, r1, r7, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r7, r7, n3p1, 1));
  ASSERT_NOCARRY(mpn_sub_n (r1, r1, r7, n3p1));

  /* last interpolation steps... */
  /* ... could be mixed with recomposition
	||H-r7|M-r7|L-r7|   ||H-r5|M-r5|L-r5|
  */

  /***************************** recomposition *******************************/
  /*
    pp[] prior to operations:
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|___||H r6|M r6|L r6|____|H_r8|L r8|pp

    summation scheme for remaining operations:
    |__16|n_15|n_14|n_13|n_12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|___||H r6|M r6|L r6|____|H_r8|L r8|pp
	||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|   ||H r7|M r7|L r7|
  */

  /* Add r7 into pp at limb offset n, propagating the carry upward.  */
  cy = mpn_add_n (pp + n, pp + n, r7, n);
  cy = mpn_add_1 (pp + 2 * n, r7 + n, n, cy);
#if HAVE_NATIVE_mpn_add_nc
  cy = r7[n3] + mpn_add_nc(pp + n3, pp + n3, r7 + 2 * n, n, cy);
#else
  MPN_INCR_U (r7 + 2 * n, n + 1, cy);
  cy = r7[n3] + mpn_add_n (pp + n3, pp + n3, r7 + 2 * n, n);
#endif
  MPN_INCR_U (pp + 4 * n, 2 * n + 1, cy);

  /* Add r5 into pp at limb offset 5n.  */
  pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r5, n);
  cy = mpn_add_1 (pp + 2 * n3, r5 + n, n, pp[2 * n3]);
#if HAVE_NATIVE_mpn_add_nc
  cy = r5[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r5 + 2 * n, n, cy);
#else
  MPN_INCR_U (r5 + 2 * n, n + 1, cy);
  cy = r5[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r5 + 2 * n, n);
#endif
  MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);

  /* Add r3 into pp at limb offset 9n.  */
  pp[10 * n]+= mpn_add_n (pp + 9 * n, pp + 9 * n, r3, n);
  cy = mpn_add_1 (pp + 10 * n, r3 + n, n, pp[10 * n]);
#if HAVE_NATIVE_mpn_add_nc
  cy = r3[n3] + mpn_add_nc(pp +11 * n, pp +11 * n, r3 + 2 * n, n, cy);
#else
  MPN_INCR_U (r3 + 2 * n, n + 1, cy);
  cy = r3[n3] + mpn_add_n (pp +11 * n, pp +11 * n, r3 + 2 * n, n);
#endif
  MPN_INCR_U (pp +12 * n, 2 * n + 1, cy);

  /* Add r1 into pp at limb offset 13n.  For half != 0 the result may
     extend up to spt limbs past pp + 15n; for half == 0 the addition
     must not carry past pp + 14n + spt (ASSERT_NOCARRY).  */
  pp[14 * n]+=mpn_add_n (pp +13 * n, pp +13 * n, r1, n);
  if ( half ) {
    cy = mpn_add_1 (pp + 14 * n, r1 + n, n, pp[14 * n]);
#if HAVE_NATIVE_mpn_add_nc
    if(LIKELY(spt > n)) {
      cy = r1[n3] + mpn_add_nc(pp + 15 * n, pp + 15 * n, r1 + 2 * n, n, cy);
      MPN_INCR_U (pp + 16 * n, spt - n, cy);
    } else {
      ASSERT_NOCARRY(mpn_add_nc(pp + 15 * n, pp + 15 * n, r1 + 2 * n, spt, cy));
    }
#else
    MPN_INCR_U (r1 + 2 * n, n + 1, cy);
    if(LIKELY(spt > n)) {
      cy = r1[n3] + mpn_add_n (pp + 15 * n, pp + 15 * n, r1 + 2 * n, n);
      MPN_INCR_U (pp + 16 * n, spt - n, cy);
    } else {
      ASSERT_NOCARRY(mpn_add_n (pp + 15 * n, pp + 15 * n, r1 + 2 * n, spt));
    }
#endif
  } else {
    ASSERT_NOCARRY(mpn_add_1 (pp + 14 * n, r1 + n, spt, pp[14 * n]));
  }

#undef   r0
#undef   r2
#undef   r4
#undef   r6
}
521