/* Interpolation for the algorithm Toom-Cook 8.5-way.

   Contributed to the GNU project by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2009, 2010, 2012 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


#include "gmp.h"
#include "gmp-impl.h"

#if GMP_NUMB_BITS < 29
#error Not implemented: Both sublsh_n(,,,28) should be corrected; r2 and r5 need one more LIMB.
#endif

#if GMP_NUMB_BITS < 28
#error Not implemented: divexact_by188513325 and _by182712915 will not work.
#endif


#if HAVE_NATIVE_mpn_sublsh_n
#define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
#else
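/* Fallback: {dst, n} -= {src, n} << s, using {ws, n} as scratch; the return
   value (the bits shifted out of the top plus the borrow of the subtraction)
   must be propagated into the limbs above {dst, n}.  */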
static mp_limb_t
DO_mpn_sublsh_n (mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
{
#if USE_MUL_1 && 0
  return mpn_submul_1 (dst, src, n, CNST_LIMB(1) << s);
#else
  mp_limb_t __cy;
  __cy = mpn_lshift (ws, src, n, s);
  return __cy + mpn_sub_n (dst, dst, ws, n);
#endif
}
#endif

#if HAVE_NATIVE_mpn_addlsh_n
#define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
#else
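/* Fallback: {dst, n} += {src, n} << s, using {ws, n} as scratch; the return
   value is the carry to propagate into the limbs above {dst, n}.  */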
static mp_limb_t
DO_mpn_addlsh_n (mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
{
#if USE_MUL_1 && 0
  return mpn_addmul_1 (dst, src, n, CNST_LIMB(1) << s);
#else
  mp_limb_t __cy;
  __cy = mpn_lshift (ws, src, n, s);
  return __cy + mpn_add_n (dst, dst, ws, n);
#endif
}
#endif

#if HAVE_NATIVE_mpn_subrsh
#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
#else
/* FIXME: This is not a correct definition, it assumes no carry */
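/* {dst, nd} -= {src, ns} >> s, with nd >= ns and 0 < s < GMP_NUMB_BITS: the
   low limb contributes src[0] >> s, the remaining limbs enter shifted left
   by GMP_NUMB_BITS - s.  */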
#define DO_mpn_subrsh(dst,nd,src,ns,s,ws)				\
do {									\
  mp_limb_t __cy;							\
  MPN_DECR_U (dst, nd, src[0] >> s);					\
  __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws);	\
  MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);				\
} while (0)
#endif


/* FIXME: tuneup should decide the best variant */
#ifndef AORSMUL_FASTER_AORS_AORSLSH
#define AORSMUL_FASTER_AORS_AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_AORS_2AORSLSH
#define AORSMUL_FASTER_AORS_2AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_2AORSLSH
#define AORSMUL_FASTER_2AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_3AORSLSH
#define AORSMUL_FASTER_3AORSLSH 1
#endif
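/* When set, these flags select a single addmul_1/submul_1 by a constant over
   an equivalent chain of additions/subtractions and shift-and-add/sub steps;
   the constants used below decompose into the corresponding shifts (see the
   notes at each use, e.g. 1028 = 2^10 + 2^2).  */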

#if GMP_NUMB_BITS < 43
#define BIT_CORRECTION 1
#define CORRECTION_BITS GMP_NUMB_BITS
#else
#define BIT_CORRECTION 0
#define CORRECTION_BITS 0
#endif
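/* The interpolation below needs shifts by 42 bits; when GMP_NUMB_BITS < 43 a
   limb cannot hold them, so they are applied as a whole-limb offset
   (BIT_CORRECTION) plus a shift reduced by CORRECTION_BITS = GMP_NUMB_BITS.  */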

#define BINVERT_9 \
  ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)

#define BINVERT_255 \
  (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))
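/* BINVERT_d is the inverse of d modulo 2^GMP_NUMB_BITS, i.e.
   d * BINVERT_d == 1 (mod 2^GMP_NUMB_BITS); it is what mpn_pi1_bdiv_q_1
   needs to perform an exact division by d using multiplications.  */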

  /* FIXME: find some more general expressions for inverses */
#if GMP_LIMB_BITS == 32
#define BINVERT_2835  (GMP_NUMB_MASK &		CNST_LIMB(0x53E3771B))
#define BINVERT_42525 (GMP_NUMB_MASK &		CNST_LIMB(0x9F314C35))
#define BINVERT_182712915 (GMP_NUMB_MASK &	CNST_LIMB(0x550659DB))
#define BINVERT_188513325 (GMP_NUMB_MASK &	CNST_LIMB(0xFBC333A5))
#define BINVERT_255x182712915L (GMP_NUMB_MASK &	CNST_LIMB(0x6FC4CB25))
#define BINVERT_255x188513325L (GMP_NUMB_MASK &	CNST_LIMB(0x6864275B))
#if GMP_NAIL_BITS == 0
#define BINVERT_255x182712915H CNST_LIMB(0x1B649A07)
#define BINVERT_255x188513325H CNST_LIMB(0x06DB993A)
#else /* GMP_NAIL_BITS != 0 */
#define BINVERT_255x182712915H \
  (GMP_NUMB_MASK & CNST_LIMB((0x1B649A07<<GMP_NAIL_BITS) | (0x6FC4CB25>>GMP_NUMB_BITS)))
#define BINVERT_255x188513325H \
  (GMP_NUMB_MASK & CNST_LIMB((0x06DB993A<<GMP_NAIL_BITS) | (0x6864275B>>GMP_NUMB_BITS)))
#endif
#else
#if GMP_LIMB_BITS == 64
#define BINVERT_2835  (GMP_NUMB_MASK &	CNST_LIMB(0x938CC70553E3771B))
#define BINVERT_42525 (GMP_NUMB_MASK &	CNST_LIMB(0xE7B40D449F314C35))
#define BINVERT_255x182712915  (GMP_NUMB_MASK &	CNST_LIMB(0x1B649A076FC4CB25))
#define BINVERT_255x188513325  (GMP_NUMB_MASK &	CNST_LIMB(0x06DB993A6864275B))
#endif
#endif
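/* The 32-bit ...L/...H pairs are the low and high halves of the two-limb
   inverses of 255*182712915 and 255*188513325 given above for 64 bits.  */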

#ifndef mpn_divexact_by255
#if GMP_NUMB_BITS % 8 == 0
#define mpn_divexact_by255(dst,src,size) \
  (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
#else
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
#else
#define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
#endif
#endif
#endif

#ifndef mpn_divexact_by255x4
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by255x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,2)
#else
#define mpn_divexact_by255x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255)<<2)
#endif
#endif

#ifndef mpn_divexact_by9x16
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by9x16(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,4)
#else
#define mpn_divexact_by9x16(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<4)
#endif
#endif

#ifndef mpn_divexact_by42525x16
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
#define mpn_divexact_by42525x16(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,4)
#else
#define mpn_divexact_by42525x16(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525)<<4)
#endif
#endif

#ifndef mpn_divexact_by2835x64
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
#define mpn_divexact_by2835x64(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,6)
#else
#define mpn_divexact_by2835x64(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<6)
#endif
#endif

#ifndef  mpn_divexact_by255x182712915
#if GMP_NUMB_BITS < 36
#if HAVE_NATIVE_mpn_bdiv_q_2_pi2 && defined(BINVERT_255x182712915H)
/* FIXME: use mpn_bdiv_q_2_pi2 */
#endif
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_182712915)
#define mpn_divexact_by255x182712915(dst,src,size)				\
  do {										\
    mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(182712915),BINVERT_182712915,0);	\
    mpn_divexact_by255(dst,dst,size);						\
  } while(0)
#else
#define mpn_divexact_by255x182712915(dst,src,size)	\
  do {							\
    mpn_divexact_1(dst,src,size,CNST_LIMB(182712915));	\
    mpn_divexact_by255(dst,dst,size);			\
  } while(0)
#endif
#else /* GMP_NUMB_BITS > 35 */
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_255x182712915)
#define mpn_divexact_by255x182712915(dst,src,size) \
  mpn_pi1_bdiv_q_1(dst,src,size,255*CNST_LIMB(182712915),BINVERT_255x182712915,0)
#else
#define mpn_divexact_by255x182712915(dst,src,size) mpn_divexact_1(dst,src,size,255*CNST_LIMB(182712915))
#endif
#endif /* GMP_NUMB_BITS >?< 36 */
#endif

#ifndef  mpn_divexact_by255x188513325
#if GMP_NUMB_BITS < 36
#if HAVE_NATIVE_mpn_bdiv_q_1_pi2 && defined(BINVERT_255x188513325H)
/* FIXME: use mpn_bdiv_q_1_pi2 */
#endif
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_188513325)
#define mpn_divexact_by255x188513325(dst,src,size)			\
  do {									\
    mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(188513325),BINVERT_188513325,0);	\
    mpn_divexact_by255(dst,dst,size);					\
  } while(0)
#else
#define mpn_divexact_by255x188513325(dst,src,size)	\
  do {							\
    mpn_divexact_1(dst,src,size,CNST_LIMB(188513325));	\
    mpn_divexact_by255(dst,dst,size);			\
  } while(0)
#endif
#else /* GMP_NUMB_BITS > 35 */
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_255x188513325)
#define mpn_divexact_by255x188513325(dst,src,size) \
  mpn_pi1_bdiv_q_1(dst,src,size,255*CNST_LIMB(188513325),BINVERT_255x188513325,0)
#else
#define mpn_divexact_by255x188513325(dst,src,size) mpn_divexact_1(dst,src,size,255*CNST_LIMB(188513325))
#endif
#endif /* GMP_NUMB_BITS >?< 36 */
#endif
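
/* Illustration (a sketch, assuming GMP_NUMB_BITS == 64 and no nails): an
   exact division by an odd d is a multiplication by its inverse modulo
   2^64, so whenever 255 divides a single limb x,
     q = x * BINVERT_255;
   satisfies q * 255 == x.  The x4, x16 and x64 variants fold the extra
   power-of-two factor into the shift argument of mpn_pi1_bdiv_q_1.  */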

/* Interpolation for Toom-8.5 (or Toom-8), using the evaluation
   points: infinity(8.5 only), +-8, +-4, +-2, +-1, +-1/4, +-1/2,
   +-1/8, 0. More precisely, we want to compute
   f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 15 (or
   14), given the 16 (resp. 15) values:

     r0 = limit at infinity of f(x) / x^15,
     r1 = f(8),f(-8),
     r2 = f(4),f(-4),
     r3 = f(2),f(-2),
     r4 = f(1),f(-1),
     r5 = f(1/4),f(-1/4),
     r6 = f(1/2),f(-1/2),
     r7 = f(1/8),f(-1/8),
     r8 = f(0).

   Each pair of values f(n),f(-n) must already have been mixed with
   toom_couple_handling(f(n),...,f(-n),...).

   The result is stored in {pp, spt + 15*n}.
   At entry, r8 is stored at {pp, 2n},
   r6 is stored at {pp + 3n, 3n + 1}.
   r4 is stored at {pp + 7n, 3n + 1}.
   r2 is stored at {pp +11n, 3n + 1}.
   r0 is stored at {pp +15n, spt}.

   The other values are 3n+1 limbs each (with most significant limbs small).

   Negative intermediate results are stored in two's complement.
   Inputs are destroyed.
*/
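
/* With B = 2^GMP_NUMB_BITS, the interpolation recovers the coefficients a_i
   of f, and the recomposition evaluates
     f(B^n) = a_0 + a_1 * B^n + ... + a_15 * B^(15*n),
   i.e. it adds the coefficients into pp at n-limb offsets.  */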

void
mpn_toom_interpolate_16pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5, mp_ptr r7,
			    mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
{
  mp_limb_t cy;
  mp_size_t n3;
  mp_size_t n3p1;
  n3 = 3 * n;
  n3p1 = n3 + 1;

#define   r6    (pp + n3)			/* 3n+1 */
#define   r4    (pp + 7 * n)			/* 3n+1 */
#define   r2    (pp +11 * n)			/* 3n+1 */
#define   r0    (pp +15 * n)			/* s+t <= 2*n */

  ASSERT( spt <= 2 * n );
  /******************************* interpolation *****************************/
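  /* For the 8.5 case, start by removing the contribution of the leading
     coefficient (r0) from the evaluations at the eight non-zero points.  */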
  if (half != 0) {
    cy = mpn_sub_n (r4, r4, r0, spt);
    MPN_DECR_U (r4 + spt, n3p1 - spt, cy);

    cy = DO_mpn_sublsh_n (r3, r0, spt, 14, wsi);
    MPN_DECR_U (r3 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh(r6, n3p1, r0, spt, 2, wsi);

    cy = DO_mpn_sublsh_n (r2, r0, spt, 28, wsi);
    MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh(r5, n3p1, r0, spt, 4, wsi);

    cy = DO_mpn_sublsh_n (r1 + BIT_CORRECTION, r0, spt, 42 - CORRECTION_BITS, wsi);
#if BIT_CORRECTION
    cy = mpn_sub_1 (r1 + spt + BIT_CORRECTION, r1 + spt + BIT_CORRECTION,
		    n3p1 - spt - BIT_CORRECTION, cy);
    ASSERT (BIT_CORRECTION > 0 || cy == 0);
    /* FIXME: assumes r7[n3p1] is writable (it is if r5 follows). */
    cy = r7[n3p1];
    r7[n3p1] = 0x80;
#else
    MPN_DECR_U (r1 + spt + BIT_CORRECTION, n3p1 - spt - BIT_CORRECTION, cy);
#endif
    DO_mpn_subrsh(r7, n3p1 + BIT_CORRECTION, r0, spt, 6, wsi);
#if BIT_CORRECTION
    /* FIXME: assumes r7[n3p1] is writable. */
    ASSERT ( BIT_CORRECTION > 0 || r7[n3p1] == 0x80 );
    r7[n3p1] = cy;
#endif
  }

  r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 28, wsi);
  DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 4, wsi);

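  /* (r2, r5) <- (r2 + r5, r5 - r2); the difference can be negative.  */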
#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
#else
  mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
  ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
  MP_PTR_SWAP(r5, wsi);
#endif

  r6[n3] -= DO_mpn_sublsh_n (r6 + n, pp, 2 * n, 14, wsi);
  DO_mpn_subrsh(r3 + n, 2 * n + 1, pp, 2 * n, 2, wsi);

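  /* (r3, r6) <- (r3 + r6, r6 - r3); the difference can be negative.  */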
#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r3, r6, r6, r3, n3p1);
#else
  ASSERT_NOCARRY(mpn_add_n (wsi, r3, r6, n3p1));
  mpn_sub_n (r6, r6, r3, n3p1); /* can be negative */
  MP_PTR_SWAP(r3, wsi);
#endif

  cy = DO_mpn_sublsh_n (r7 + n + BIT_CORRECTION, pp, 2 * n, 42 - CORRECTION_BITS, wsi);
#if BIT_CORRECTION
  MPN_DECR_U (r1 + n, 2 * n + 1, pp[0] >> 6);
  cy = DO_mpn_sublsh_n (r1 + n, pp + 1, 2 * n - 1, GMP_NUMB_BITS - 6, wsi);
  cy = mpn_sub_1(r1 + 3 * n - 1, r1 + 3 * n - 1, 2, cy);
  ASSERT ( BIT_CORRECTION > 0 || cy != 0 );
#else
  r7[n3] -= cy;
  DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 6, wsi);
#endif

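  /* (r1, r7) <- (r1 + r7, r7 - r1); the difference can be negative.  */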
#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r1, r7, r7, r1, n3p1);
#else
  mpn_sub_n (wsi, r7, r1, n3p1); /* can be negative */
  mpn_add_n (r1, r1, r7, n3p1);  /* if BIT_CORRECTION != 0, can give a carry. */
  MP_PTR_SWAP(r7, wsi);
#endif

  r4[n3] -= mpn_sub_n (r4 + n, r4 + n, pp, 2 * n);

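  /* 1028 = 2^10 + 2^2, matching the two shift-and-subtract steps of the
     alternative below.  */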
#if AORSMUL_FASTER_2AORSLSH
  mpn_submul_1 (r5, r6, n3p1, 1028); /* can be negative */
#else
  DO_mpn_sublsh_n (r5, r6, n3p1, 2, wsi); /* can be negative */
  DO_mpn_sublsh_n (r5, r6, n3p1,10, wsi); /* can be negative */
#endif

  mpn_submul_1 (r7, r5, n3p1, 1300); /* can be negative */
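  /* 1052688 = 2^20 + 2^12 + 2^4 */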
#if AORSMUL_FASTER_3AORSLSH
  mpn_submul_1 (r7, r6, n3p1, 1052688); /* can be negative */
#else
  DO_mpn_sublsh_n (r7, r6, n3p1, 4, wsi); /* can be negative */
  DO_mpn_sublsh_n (r7, r6, n3p1,12, wsi); /* can be negative */
  DO_mpn_sublsh_n (r7, r6, n3p1,20, wsi); /* can be negative */
#endif
  mpn_divexact_by255x188513325(r7, r7, n3p1);

  mpn_submul_1 (r5, r7, n3p1, 12567555); /* can be negative */
  /* A division by 2835x64 follows. Warning: the operand can be negative! */
  mpn_divexact_by2835x64(r5, r5, n3p1);
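  /* Sign-extend the quotient: after the shift by 6, the top 6 bits of the
     most significant limb are not set for a negative value; restore them.  */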
  if ((r5[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS - 7))) != 0)
    r5[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS - 6));

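  /* 4095 = 2^12 - 1 and 240 = 2^8 - 2^4, matching the shift-based
     alternatives below.  */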
#if AORSMUL_FASTER_AORS_AORSLSH
  mpn_submul_1 (r6, r7, n3p1, 4095); /* can be negative */
#else
  mpn_add_n (r6, r6, r7, n3p1); /* can give a carry */
  DO_mpn_sublsh_n (r6, r7, n3p1, 12, wsi); /* can be negative */
#endif
#if AORSMUL_FASTER_2AORSLSH
  mpn_addmul_1 (r6, r5, n3p1, 240); /* can be negative */
#else
  DO_mpn_addlsh_n (r6, r5, n3p1, 8, wsi); /* can give a carry */
  DO_mpn_sublsh_n (r6, r5, n3p1, 4, wsi); /* can be negative */
#endif
  /* A division by 255x4 follows. Warning: the operand can be negative! */
  mpn_divexact_by255x4(r6, r6, n3p1);
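  /* Sign-extend as above: here the shift is by 2, so the top 2 bits of the
     most significant limb need to be restored for a negative value.  */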
  if ((r6[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS - 3))) != 0)
    r6[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS - 2));

  ASSERT_NOCARRY(DO_mpn_sublsh_n (r3, r4, n3p1, 7, wsi));

  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r4, n3p1, 13, wsi));
  ASSERT_NOCARRY(mpn_submul_1 (r2, r3, n3p1, 400));

  /* If GMP_NUMB_BITS < 42, the next operations on r1 can give a carry! */
  DO_mpn_sublsh_n (r1, r4, n3p1, 19, wsi);
  mpn_submul_1 (r1, r2, n3p1, 1428);
  mpn_submul_1 (r1, r3, n3p1, 112896);
  mpn_divexact_by255x182712915(r1, r1, n3p1);

  ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 15181425));
  mpn_divexact_by42525x16(r2, r2, n3p1);

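  /* 3969 = 2^12 - 2^7 + 1, matching the three steps of the alternative
     below.  */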
#if AORSMUL_FASTER_AORS_2AORSLSH
  ASSERT_NOCARRY(mpn_submul_1 (r3, r1, n3p1, 3969));
#else
  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
  ASSERT_NOCARRY(DO_mpn_addlsh_n (r3, r1, n3p1, 7, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r3, r1, n3p1, 12, wsi));
#endif
  ASSERT_NOCARRY(mpn_submul_1 (r3, r2, n3p1, 900));
  mpn_divexact_by9x16(r3, r3, n3p1);

  ASSERT_NOCARRY(mpn_sub_n (r4, r4, r1, n3p1));
  ASSERT_NOCARRY(mpn_sub_n (r4, r4, r3, n3p1));
  ASSERT_NOCARRY(mpn_sub_n (r4, r4, r2, n3p1));

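  /* Recover the individual coefficients from each remaining (sum,
     difference) pair, e.g. r6 <- (r2 + r6)/2 then r2 <- r2 - r6, and
     similarly for (r3, r5) and (r1, r7).  */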
  mpn_add_n (r6, r2, r6, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r6, r6, n3p1, 1));
  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r6, n3p1));

  mpn_sub_n (r5, r3, r5, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, n3p1));

  mpn_add_n (r7, r1, r7, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r7, r7, n3p1, 1));
  ASSERT_NOCARRY(mpn_sub_n (r1, r1, r7, n3p1));

  /* last interpolation steps... */
  /* ... could be mixed with recomposition
	||H-r7|M-r7|L-r7|   ||H-r5|M-r5|L-r5|
  */

  /***************************** recomposition *******************************/
  /*
    pp[] prior to operations:
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|___||H r6|M r6|L r6|____|H_r8|L r8|pp

    summation scheme for remaining operations:
    |__16|n_15|n_14|n_13|n_12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|___||H r6|M r6|L r6|____|H_r8|L r8|pp
	||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|   ||H r7|M r7|L r7|
  */
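
  /* Each odd-indexed rj ({rj, 3n+1} limbs plus a possible top carry) is
     added in at pp + (15 - 2*j) * n: r7 at pp + n, r5 at pp + 5n, r3 at
     pp + 9n, r1 at pp + 13n.  */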

  cy = mpn_add_n (pp + n, pp + n, r7, n);
  cy = mpn_add_1 (pp + 2 * n, r7 + n, n, cy);
#if HAVE_NATIVE_mpn_add_nc
  cy = r7[n3] + mpn_add_nc(pp + n3, pp + n3, r7 + 2 * n, n, cy);
#else
  MPN_INCR_U (r7 + 2 * n, n + 1, cy);
  cy = r7[n3] + mpn_add_n (pp + n3, pp + n3, r7 + 2 * n, n);
#endif
  MPN_INCR_U (pp + 4 * n, 2 * n + 1, cy);

  pp[2 * n3] += mpn_add_n (pp + 5 * n, pp + 5 * n, r5, n);
  cy = mpn_add_1 (pp + 2 * n3, r5 + n, n, pp[2 * n3]);
#if HAVE_NATIVE_mpn_add_nc
  cy = r5[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r5 + 2 * n, n, cy);
#else
  MPN_INCR_U (r5 + 2 * n, n + 1, cy);
  cy = r5[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r5 + 2 * n, n);
#endif
  MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);

  pp[10 * n] += mpn_add_n (pp + 9 * n, pp + 9 * n, r3, n);
  cy = mpn_add_1 (pp + 10 * n, r3 + n, n, pp[10 * n]);
#if HAVE_NATIVE_mpn_add_nc
  cy = r3[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r3 + 2 * n, n, cy);
#else
  MPN_INCR_U (r3 + 2 * n, n + 1, cy);
  cy = r3[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r3 + 2 * n, n);
#endif
  MPN_INCR_U (pp + 12 * n, 2 * n + 1, cy);

  pp[14 * n] += mpn_add_n (pp + 13 * n, pp + 13 * n, r1, n);
  if (half) {
    cy = mpn_add_1 (pp + 14 * n, r1 + n, n, pp[14 * n]);
#if HAVE_NATIVE_mpn_add_nc
    if (LIKELY (spt > n)) {
      cy = r1[n3] + mpn_add_nc(pp + 15 * n, pp + 15 * n, r1 + 2 * n, n, cy);
      MPN_INCR_U (pp + 16 * n, spt - n, cy);
    } else {
      ASSERT_NOCARRY(mpn_add_nc(pp + 15 * n, pp + 15 * n, r1 + 2 * n, spt, cy));
    }
#else
    MPN_INCR_U (r1 + 2 * n, n + 1, cy);
    if (LIKELY (spt > n)) {
      cy = r1[n3] + mpn_add_n (pp + 15 * n, pp + 15 * n, r1 + 2 * n, n);
      MPN_INCR_U (pp + 16 * n, spt - n, cy);
    } else {
      ASSERT_NOCARRY(mpn_add_n (pp + 15 * n, pp + 15 * n, r1 + 2 * n, spt));
    }
#endif
  } else {
    ASSERT_NOCARRY(mpn_add_1 (pp + 14 * n, r1 + n, spt, pp[14 * n]));
  }

#undef   r0
#undef   r2
#undef   r4
#undef   r6
}