1 /* Interpolation for the algorithm Toom-Cook 8.5-way.
2 
3    Contributed to the GNU project by Marco Bodrato.
4 
5    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
6    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
7    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8 
9 Copyright 2009, 2010, 2012, 2015, 2020 Free Software Foundation, Inc.
10 
11 This file is part of the GNU MP Library.
12 
13 The GNU MP Library is free software; you can redistribute it and/or modify
14 it under the terms of either:
15 
16   * the GNU Lesser General Public License as published by the Free
17     Software Foundation; either version 3 of the License, or (at your
18     option) any later version.
19 
20 or
21 
22   * the GNU General Public License as published by the Free Software
23     Foundation; either version 2 of the License, or (at your option) any
24     later version.
25 
26 or both in parallel, as here.
27 
28 The GNU MP Library is distributed in the hope that it will be useful, but
29 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
30 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
31 for more details.
32 
33 You should have received copies of the GNU General Public License and the
34 GNU Lesser General Public License along with the GNU MP Library.  If not,
35 see https://www.gnu.org/licenses/.  */
36 
37 
38 #include "gmp-impl.h"
39 
40 
41 #if GMP_NUMB_BITS < 29
42 #error Not implemented: Both sublsh_n(,,,28) should be corrected; r2 and r5 need one more LIMB.
43 #endif
44 
45 #if GMP_NUMB_BITS < 28
46 #error Not implemented: divexact_by188513325 and _by182712915 will not work.
47 #endif
48 
49 
50 /* FIXME: tuneup should decide the best variant */
51 #ifndef AORSMUL_FASTER_AORS_AORSLSH
52 #define AORSMUL_FASTER_AORS_AORSLSH 1
53 #endif
54 #ifndef AORSMUL_FASTER_AORS_2AORSLSH
55 #define AORSMUL_FASTER_AORS_2AORSLSH 1
56 #endif
57 #ifndef AORSMUL_FASTER_2AORSLSH
58 #define AORSMUL_FASTER_2AORSLSH 1
59 #endif
60 #ifndef AORSMUL_FASTER_3AORSLSH
61 #define AORSMUL_FASTER_3AORSLSH 1
62 #endif
63 
64 
65 #if HAVE_NATIVE_mpn_sublsh_n
66 #define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
67 #else
68 static mp_limb_t
DO_mpn_sublsh_n(mp_ptr dst,mp_srcptr src,mp_size_t n,unsigned int s,mp_ptr ws)69 DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
70 {
71 #if USE_MUL_1 && 0
72   return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
73 #else
74   mp_limb_t __cy;
75   __cy = mpn_lshift(ws,src,n,s);
76   return    __cy + mpn_sub_n(dst,dst,ws,n);
77 #endif
78 }
79 #endif
80 
81 #if HAVE_NATIVE_mpn_addlsh_n
82 #define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
83 #else
84 #if !defined (AORSMUL_FASTER_2AORSLSH) && !defined (AORSMUL_FASTER_AORS_2AORSLSH)
85 static mp_limb_t
DO_mpn_addlsh_n(mp_ptr dst,mp_srcptr src,mp_size_t n,unsigned int s,mp_ptr ws)86 DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
87 {
88 #if USE_MUL_1 && 0
89   return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
90 #else
91   mp_limb_t __cy;
92   __cy = mpn_lshift(ws,src,n,s);
93   return    __cy + mpn_add_n(dst,dst,ws,n);
94 #endif
95 }
96 #endif
97 #endif
98 
99 #if HAVE_NATIVE_mpn_subrsh
100 #define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
101 #else
102 /* FIXME: This is not a correct definition, it assumes no carry */
103 #define DO_mpn_subrsh(dst,nd,src,ns,s,ws)				\
104 do {									\
105   mp_limb_t __cy;							\
106   MPN_DECR_U (dst, nd, src[0] >> s);					\
107   __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws);	\
108   MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);				\
109 } while (0)
110 #endif
111 
112 
113 #if GMP_NUMB_BITS < 43
114 #define BIT_CORRECTION 1
115 #define CORRECTION_BITS GMP_NUMB_BITS
116 #else
117 #define BIT_CORRECTION 0
118 #define CORRECTION_BITS 0
119 #endif
120 
121 #define BINVERT_9 \
122   ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
123 
124 #define BINVERT_255 \
125   (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))
126 
127   /* FIXME: find some more general expressions for inverses */
128 #if GMP_LIMB_BITS == 32
129 #define BINVERT_2835  (GMP_NUMB_MASK &		CNST_LIMB(0x53E3771B))
130 #define BINVERT_42525 (GMP_NUMB_MASK &		CNST_LIMB(0x9F314C35))
131 #define BINVERT_182712915 (GMP_NUMB_MASK &	CNST_LIMB(0x550659DB))
132 #define BINVERT_188513325 (GMP_NUMB_MASK &	CNST_LIMB(0xFBC333A5))
133 #define BINVERT_255x182712915L (GMP_NUMB_MASK &	CNST_LIMB(0x6FC4CB25))
134 #define BINVERT_255x188513325L (GMP_NUMB_MASK &	CNST_LIMB(0x6864275B))
135 #if GMP_NAIL_BITS == 0
136 #define BINVERT_255x182712915H CNST_LIMB(0x1B649A07)
137 #define BINVERT_255x188513325H CNST_LIMB(0x06DB993A)
138 #else /* GMP_NAIL_BITS != 0 */
139 #define BINVERT_255x182712915H \
140   (GMP_NUMB_MASK & CNST_LIMB((0x1B649A07<<GMP_NAIL_BITS) | (0x6FC4CB25>>GMP_NUMB_BITS)))
141 #define BINVERT_255x188513325H \
142   (GMP_NUMB_MASK & CNST_LIMB((0x06DB993A<<GMP_NAIL_BITS) | (0x6864275B>>GMP_NUMB_BITS)))
143 #endif
144 #else
145 #if GMP_LIMB_BITS == 64
146 #define BINVERT_2835  (GMP_NUMB_MASK &	CNST_LIMB(0x938CC70553E3771B))
147 #define BINVERT_42525 (GMP_NUMB_MASK &	CNST_LIMB(0xE7B40D449F314C35))
148 #define BINVERT_255x182712915  (GMP_NUMB_MASK &	CNST_LIMB(0x1B649A076FC4CB25))
149 #define BINVERT_255x188513325  (GMP_NUMB_MASK &	CNST_LIMB(0x06DB993A6864275B))
150 #endif
151 #endif
152 
153 #ifndef mpn_divexact_by255
154 #if GMP_NUMB_BITS % 8 == 0
155 #define mpn_divexact_by255(dst,src,size) \
156   (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
157 #else
158 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
159 #define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
160 #else
161 #define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
162 #endif
163 #endif
164 #endif
165 
166 #ifndef mpn_divexact_by255x4
167 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
168 #define mpn_divexact_by255x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,2)
169 #else
170 #define mpn_divexact_by255x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255)<<2)
171 #endif
172 #endif
173 
174 #ifndef mpn_divexact_by9x16
175 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
176 #define mpn_divexact_by9x16(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,4)
177 #else
178 #define mpn_divexact_by9x16(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<4)
179 #endif
180 #endif
181 
182 #ifndef mpn_divexact_by42525x16
183 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
184 #define mpn_divexact_by42525x16(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,4)
185 #else
186 #define mpn_divexact_by42525x16(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525)<<4)
187 #endif
188 #endif
189 
190 #ifndef mpn_divexact_by2835x64
191 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
192 #define mpn_divexact_by2835x64(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,6)
193 #else
194 #define mpn_divexact_by2835x64(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<6)
195 #endif
196 #endif
197 
198 #ifndef  mpn_divexact_by255x182712915
199 #if GMP_NUMB_BITS < 36
200 #if HAVE_NATIVE_mpn_bdiv_q_2_pi2 && defined(BINVERT_255x182712915H)
201 /* FIXME: use mpn_bdiv_q_2_pi2 */
202 #endif
203 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_182712915)
204 #define mpn_divexact_by255x182712915(dst,src,size)				\
205   do {										\
206     mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(182712915),BINVERT_182712915,0);	\
207     mpn_divexact_by255(dst,dst,size);						\
208   } while(0)
209 #else
210 #define mpn_divexact_by255x182712915(dst,src,size)	\
211   do {							\
212     mpn_divexact_1(dst,src,size,CNST_LIMB(182712915));	\
213     mpn_divexact_by255(dst,dst,size);			\
214   } while(0)
215 #endif
216 #else /* GMP_NUMB_BITS > 35 */
217 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_255x182712915)
218 #define mpn_divexact_by255x182712915(dst,src,size) \
219   mpn_pi1_bdiv_q_1(dst,src,size,255*CNST_LIMB(182712915),BINVERT_255x182712915,0)
220 #else
221 #define mpn_divexact_by255x182712915(dst,src,size) mpn_divexact_1(dst,src,size,255*CNST_LIMB(182712915))
222 #endif
223 #endif /* GMP_NUMB_BITS >?< 36 */
224 #endif
225 
226 #ifndef  mpn_divexact_by255x188513325
227 #if GMP_NUMB_BITS < 36
228 #if HAVE_NATIVE_mpn_bdiv_q_1_pi2 && defined(BINVERT_255x188513325H)
229 /* FIXME: use mpn_bdiv_q_1_pi2 */
230 #endif
231 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_188513325)
232 #define mpn_divexact_by255x188513325(dst,src,size)			\
233   do {									\
234     mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(188513325),BINVERT_188513325,0);	\
235     mpn_divexact_by255(dst,dst,size);					\
236   } while(0)
237 #else
238 #define mpn_divexact_by255x188513325(dst,src,size)	\
239   do {							\
240     mpn_divexact_1(dst,src,size,CNST_LIMB(188513325));	\
241     mpn_divexact_by255(dst,dst,size);			\
242   } while(0)
243 #endif
244 #else /* GMP_NUMB_BITS > 35 */
245 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_255x188513325)
246 #define mpn_divexact_by255x188513325(dst,src,size) \
247   mpn_pi1_bdiv_q_1(dst,src,size,255*CNST_LIMB(188513325),BINVERT_255x188513325,0)
248 #else
249 #define mpn_divexact_by255x188513325(dst,src,size) mpn_divexact_1(dst,src,size,255*CNST_LIMB(188513325))
250 #endif
251 #endif /* GMP_NUMB_BITS >?< 36 */
252 #endif
253 
254 /* Interpolation for Toom-8.5 (or Toom-8), using the evaluation
255    points: infinity(8.5 only), +-8, +-4, +-2, +-1, +-1/4, +-1/2,
256    +-1/8, 0. More precisely, we want to compute
257    f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 15 (or
258    14), given the 16 (rsp. 15) values:
259 
260      r0 = limit at infinity of f(x) / x^15,
261      r1 = f(8),f(-8),
262      r2 = f(4),f(-4),
263      r3 = f(2),f(-2),
264      r4 = f(1),f(-1),
265      r5 = f(1/4),f(-1/4),
266      r6 = f(1/2),f(-1/2),
267      r7 = f(1/8),f(-1/8),
268      r8 = f(0).
269 
270    All couples of the form f(n),f(-n) must be already mixed with
271    toom_couple_handling(f(n),...,f(-n),...)
272 
273    The result is stored in {pp, spt + 7*n (or 8*n)}.
274    At entry, r8 is stored at {pp, 2n},
275    r6 is stored at {pp + 3n, 3n + 1}.
276    r4 is stored at {pp + 7n, 3n + 1}.
277    r2 is stored at {pp +11n, 3n + 1}.
278    r0 is stored at {pp +15n, spt}.
279 
280    The other values are 3n+1 limbs each (with most significant limbs small).
281 
282    Negative intermediate results are stored two-complemented.
283    Inputs are destroyed.
284 */
285 
286 void
mpn_toom_interpolate_16pts(mp_ptr pp,mp_ptr r1,mp_ptr r3,mp_ptr r5,mp_ptr r7,mp_size_t n,mp_size_t spt,int half,mp_ptr wsi)287 mpn_toom_interpolate_16pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5, mp_ptr r7,
288 			mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
289 {
290   mp_limb_t cy;
291   mp_size_t n3;
292   mp_size_t n3p1;
293   n3 = 3 * n;
294   n3p1 = n3 + 1;
295 
296 #define   r6    (pp + n3)			/* 3n+1 */
297 #define   r4    (pp + 7 * n)			/* 3n+1 */
298 #define   r2    (pp +11 * n)			/* 3n+1 */
299 #define   r0    (pp +15 * n)			/* s+t <= 2*n */
300 
301   ASSERT( spt <= 2 * n );
302   /******************************* interpolation *****************************/
303   if( half != 0) {
304     cy = mpn_sub_n (r4, r4, r0, spt);
305     MPN_DECR_U (r4 + spt, n3p1 - spt, cy);
306 
307     cy = DO_mpn_sublsh_n (r3, r0, spt, 14, wsi);
308     MPN_DECR_U (r3 + spt, n3p1 - spt, cy);
309     DO_mpn_subrsh(r6, n3p1, r0, spt, 2, wsi);
310 
311     cy = DO_mpn_sublsh_n (r2, r0, spt, 28, wsi);
312     MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
313     DO_mpn_subrsh(r5, n3p1, r0, spt, 4, wsi);
314 
315     cy = DO_mpn_sublsh_n (r1 + BIT_CORRECTION, r0, spt, 42 - CORRECTION_BITS, wsi);
316 #if BIT_CORRECTION
317     cy = mpn_sub_1 (r1 + spt + BIT_CORRECTION, r1 + spt + BIT_CORRECTION,
318 		    n3p1 - spt - BIT_CORRECTION, cy);
319     ASSERT (BIT_CORRECTION > 0 || cy == 0);
320     /* FIXME: assumes r7[n3p1] is writable (it is if r5 follows). */
321     cy = r7[n3p1];
322     r7[n3p1] = 0x80;
323 #else
324     MPN_DECR_U (r1 + spt + BIT_CORRECTION, n3p1 - spt - BIT_CORRECTION, cy);
325 #endif
326     DO_mpn_subrsh(r7, n3p1 + BIT_CORRECTION, r0, spt, 6, wsi);
327 #if BIT_CORRECTION
328     /* FIXME: assumes r7[n3p1] is writable. */
329     ASSERT ( BIT_CORRECTION > 0 || r7[n3p1] == 0x80 );
330     r7[n3p1] = cy;
331 #endif
332   };
333 
334   r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 28, wsi);
335   DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 4, wsi);
336 
337 #if HAVE_NATIVE_mpn_add_n_sub_n
338   mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
339 #else
340   mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
341   ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
342   MP_PTR_SWAP(r5, wsi);
343 #endif
344 
345   r6[n3] -= DO_mpn_sublsh_n (r6 + n, pp, 2 * n, 14, wsi);
346   DO_mpn_subrsh(r3 + n, 2 * n + 1, pp, 2 * n, 2, wsi);
347 
348 #if HAVE_NATIVE_mpn_add_n_sub_n
349   mpn_add_n_sub_n (r3, r6, r6, r3, n3p1);
350 #else
351   ASSERT_NOCARRY(mpn_add_n (wsi, r3, r6, n3p1));
352   mpn_sub_n (r6, r6, r3, n3p1); /* can be negative */
353   MP_PTR_SWAP(r3, wsi);
354 #endif
355 
356   cy = DO_mpn_sublsh_n (r7 + n + BIT_CORRECTION, pp, 2 * n, 42 - CORRECTION_BITS, wsi);
357 #if BIT_CORRECTION
358   MPN_DECR_U (r1 + n, 2 * n + 1, pp[0] >> 6);
359   cy = DO_mpn_sublsh_n (r1 + n, pp + 1, 2 * n - 1, GMP_NUMB_BITS - 6, wsi);
360   cy = mpn_sub_1(r1 + 3 * n - 1, r1 + 3 * n - 1, 2, cy);
361   ASSERT ( BIT_CORRECTION > 0 || cy != 0 );
362 #else
363   r7[n3] -= cy;
364   DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 6, wsi);
365 #endif
366 
367 #if HAVE_NATIVE_mpn_add_n_sub_n
368   mpn_add_n_sub_n (r1, r7, r7, r1, n3p1);
369 #else
370   mpn_sub_n (wsi, r7, r1, n3p1); /* can be negative */
371   mpn_add_n (r1, r1, r7, n3p1);  /* if BIT_CORRECTION != 0, can give a carry. */
372   MP_PTR_SWAP(r7, wsi);
373 #endif
374 
375   r4[n3] -= mpn_sub_n (r4+n, r4+n, pp, 2 * n);
376 
377 #if AORSMUL_FASTER_2AORSLSH
378   mpn_submul_1 (r5, r6, n3p1, 1028); /* can be negative */
379 #else
380   DO_mpn_sublsh_n (r5, r6, n3p1, 2, wsi); /* can be negative */
381   DO_mpn_sublsh_n (r5, r6, n3p1,10, wsi); /* can be negative */
382 #endif
383 
384   mpn_submul_1 (r7, r5, n3p1, 1300); /* can be negative */
385 #if AORSMUL_FASTER_3AORSLSH
386   mpn_submul_1 (r7, r6, n3p1, 1052688); /* can be negative */
387 #else
388   DO_mpn_sublsh_n (r7, r6, n3p1, 4, wsi); /* can be negative */
389   DO_mpn_sublsh_n (r7, r6, n3p1,12, wsi); /* can be negative */
390   DO_mpn_sublsh_n (r7, r6, n3p1,20, wsi); /* can be negative */
391 #endif
392   mpn_divexact_by255x188513325(r7, r7, n3p1);
393 
394   mpn_submul_1 (r5, r7, n3p1, 12567555); /* can be negative */
395   /* A division by 2835x64 follows. Warning: the operand can be negative! */
396   mpn_divexact_by2835x64(r5, r5, n3p1);
397   if ((r5[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-7))) != 0)
398     r5[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-6));
399 
400 #if AORSMUL_FASTER_AORS_AORSLSH
401   mpn_submul_1 (r6, r7, n3p1, 4095); /* can be negative */
402 #else
403   mpn_add_n (r6, r6, r7, n3p1); /* can give a carry */
404   DO_mpn_sublsh_n (r6, r7, n3p1, 12, wsi); /* can be negative */
405 #endif
406 #if AORSMUL_FASTER_2AORSLSH
407   mpn_addmul_1 (r6, r5, n3p1, 240); /* can be negative */
408 #else
409   DO_mpn_addlsh_n (r6, r5, n3p1, 8, wsi); /* can give a carry */
410   DO_mpn_sublsh_n (r6, r5, n3p1, 4, wsi); /* can be negative */
411 #endif
412   /* A division by 255x4 follows. Warning: the operand can be negative! */
413   mpn_divexact_by255x4(r6, r6, n3p1);
414   if ((r6[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
415     r6[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));
416 
417   ASSERT_NOCARRY(DO_mpn_sublsh_n (r3, r4, n3p1, 7, wsi));
418 
419   ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r4, n3p1, 13, wsi));
420   ASSERT_NOCARRY(mpn_submul_1 (r2, r3, n3p1, 400));
421 
422   /* If GMP_NUMB_BITS < 42 next operations on r1 can give a carry!*/
423   DO_mpn_sublsh_n (r1, r4, n3p1, 19, wsi);
424   mpn_submul_1 (r1, r2, n3p1, 1428);
425   mpn_submul_1 (r1, r3, n3p1, 112896);
426   mpn_divexact_by255x182712915(r1, r1, n3p1);
427 
428   ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 15181425));
429   mpn_divexact_by42525x16(r2, r2, n3p1);
430 
431 #if AORSMUL_FASTER_AORS_2AORSLSH
432   ASSERT_NOCARRY(mpn_submul_1 (r3, r1, n3p1, 3969));
433 #else
434   ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
435   ASSERT_NOCARRY(DO_mpn_addlsh_n (r3, r1, n3p1, 7, wsi));
436   ASSERT_NOCARRY(DO_mpn_sublsh_n (r3, r1, n3p1, 12, wsi));
437 #endif
438   ASSERT_NOCARRY(mpn_submul_1 (r3, r2, n3p1, 900));
439   mpn_divexact_by9x16(r3, r3, n3p1);
440 
441   ASSERT_NOCARRY(mpn_sub_n (r4, r4, r1, n3p1));
442   ASSERT_NOCARRY(mpn_sub_n (r4, r4, r3, n3p1));
443   ASSERT_NOCARRY(mpn_sub_n (r4, r4, r2, n3p1));
444 
445 #ifdef HAVE_NATIVE_mpn_rsh1add_n
446   mpn_rsh1add_n (r6, r2, r6, n3p1);
447   r6 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
448 #else
449   mpn_add_n (r6, r2, r6, n3p1);
450   ASSERT_NOCARRY(mpn_rshift(r6, r6, n3p1, 1));
451 #endif
452   ASSERT_NOCARRY(mpn_sub_n (r2, r2, r6, n3p1));
453 
454 #ifdef HAVE_NATIVE_mpn_rsh1sub_n
455   mpn_rsh1sub_n (r5, r3, r5, n3p1);
456   r5 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
457 #else
458   mpn_sub_n (r5, r3, r5, n3p1);
459   ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
460 #endif
461   ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, n3p1));
462 
463 #ifdef HAVE_NATIVE_mpn_rsh1add_n
464   mpn_rsh1add_n (r7, r1, r7, n3p1);
465   r7 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
466 #else
467   mpn_add_n (r7, r1, r7, n3p1);
468   ASSERT_NOCARRY(mpn_rshift(r7, r7, n3p1, 1));
469 #endif
470   ASSERT_NOCARRY(mpn_sub_n (r1, r1, r7, n3p1));
471 
472   /* last interpolation steps... */
473   /* ... could be mixed with recomposition
474 	||H-r7|M-r7|L-r7|   ||H-r5|M-r5|L-r5|
475   */
476 
477   /***************************** recomposition *******************************/
478   /*
479     pp[] prior to operations:
480     |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|___||H r6|M r6|L r6|____|H_r8|L r8|pp
481 
482     summation scheme for remaining operations:
483     |__16|n_15|n_14|n_13|n_12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
484     |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|___||H r6|M r6|L r6|____|H_r8|L r8|pp
485 	||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|   ||H r7|M r7|L r7|
486   */
487 
488   cy = mpn_add_n (pp + n, pp + n, r7, n);
489   cy = mpn_add_1 (pp + 2 * n, r7 + n, n, cy);
490 #if HAVE_NATIVE_mpn_add_nc
491   cy = r7[n3] + mpn_add_nc(pp + n3, pp + n3, r7 + 2 * n, n, cy);
492 #else
493   MPN_INCR_U (r7 + 2 * n, n + 1, cy);
494   cy = r7[n3] + mpn_add_n (pp + n3, pp + n3, r7 + 2 * n, n);
495 #endif
496   MPN_INCR_U (pp + 4 * n, 2 * n + 1, cy);
497 
498   pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r5, n);
499   cy = mpn_add_1 (pp + 2 * n3, r5 + n, n, pp[2 * n3]);
500 #if HAVE_NATIVE_mpn_add_nc
501   cy = r5[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r5 + 2 * n, n, cy);
502 #else
503   MPN_INCR_U (r5 + 2 * n, n + 1, cy);
504   cy = r5[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r5 + 2 * n, n);
505 #endif
506   MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);
507 
508   pp[10 * n]+= mpn_add_n (pp + 9 * n, pp + 9 * n, r3, n);
509   cy = mpn_add_1 (pp + 10 * n, r3 + n, n, pp[10 * n]);
510 #if HAVE_NATIVE_mpn_add_nc
511   cy = r3[n3] + mpn_add_nc(pp +11 * n, pp +11 * n, r3 + 2 * n, n, cy);
512 #else
513   MPN_INCR_U (r3 + 2 * n, n + 1, cy);
514   cy = r3[n3] + mpn_add_n (pp +11 * n, pp +11 * n, r3 + 2 * n, n);
515 #endif
516   MPN_INCR_U (pp +12 * n, 2 * n + 1, cy);
517 
518   pp[14 * n]+=mpn_add_n (pp +13 * n, pp +13 * n, r1, n);
519   if ( half ) {
520     cy = mpn_add_1 (pp + 14 * n, r1 + n, n, pp[14 * n]);
521 #if HAVE_NATIVE_mpn_add_nc
522     if(LIKELY(spt > n)) {
523       cy = r1[n3] + mpn_add_nc(pp + 15 * n, pp + 15 * n, r1 + 2 * n, n, cy);
524       MPN_INCR_U (pp + 16 * n, spt - n, cy);
525     } else {
526       ASSERT_NOCARRY(mpn_add_nc(pp + 15 * n, pp + 15 * n, r1 + 2 * n, spt, cy));
527     }
528 #else
529     MPN_INCR_U (r1 + 2 * n, n + 1, cy);
530     if(LIKELY(spt > n)) {
531       cy = r1[n3] + mpn_add_n (pp + 15 * n, pp + 15 * n, r1 + 2 * n, n);
532       MPN_INCR_U (pp + 16 * n, spt - n, cy);
533     } else {
534       ASSERT_NOCARRY(mpn_add_n (pp + 15 * n, pp + 15 * n, r1 + 2 * n, spt));
535     }
536 #endif
537   } else {
538     ASSERT_NOCARRY(mpn_add_1 (pp + 14 * n, r1 + n, spt, pp[14 * n]));
539   }
540 
541 #undef   r0
542 #undef   r2
543 #undef   r4
544 #undef   r6
545 }
546