/* mpn_toom4_mul_n -- Internal routine to multiply two natural numbers
   of length n.

   THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
*/

/* Implementation of the Bodrato-Zanoni algorithm for Toom-Cook 4-way.

Copyright 2001, 2002, 2004, 2005, 2006 Free Software Foundation, Inc.
Copyright 2009 William Hart

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */

/*
   This implementation is based on that of Paul Zimmermann, which is available
   for mpz_t's at http://www.loria.fr/~zimmerma/software/toom4.c
*/

#include "mpir.h"
#include "gmp-impl.h"
#include "longlong.h"

void
mpn_toom4_mul_n (mp_ptr rp, mp_srcptr up,
                 mp_srcptr vp, mp_size_t n);

/* The tc4_* helpers below work on (limb pointer, signed size) pairs: the
   absolute value of the size is the number of limbs and its sign is the
   sign of the integer, with a size of 0 denoting zero. */

void _tc4_add(mp_ptr rp, mp_size_t * rn, mp_srcptr r1, mp_size_t r1n,
                                                 mp_srcptr r2, mp_size_t r2n)
{
   mp_limb_t cy;
   mp_size_t s1 = ABS(r1n);
   mp_size_t s2 = ABS(r2n);

   if (!s1)
   {
      *rn = 0;
   } else if (!s2)
   {
      if (rp != r1) MPN_COPY(rp, r1, s1);
      *rn = r1n;
   } else if ((r1n ^ r2n) >= 0)
   {
      *rn = r1n;
      cy = mpn_add(rp, r1, s1, r2, s2);
      if (cy)
      {
         rp[s1] = cy;
         if ((*rn) < 0) (*rn)--;
         else (*rn)++;
      }
   } else
   {
      mp_size_t ct;
      if (s1 != s2) ct = 1;
      else MPN_CMP(ct, r1, r2, s1);

      if (!ct) *rn = 0;
      else if (ct > 0)
      {
         mpn_sub(rp, r1, s1, r2, s2);
         *rn = s1;
         MPN_NORMALIZE(rp, (*rn));
         if (r1n < 0) *rn = -(*rn);
      }
      else
      {
         mpn_sub_n(rp, r2, r1, s1);
         *rn = s1;
         MPN_NORMALIZE(rp, (*rn));
         if (r1n > 0) *rn = -(*rn);
      }
   }
}

void tc4_add(mp_ptr rp, mp_size_t * rn, mp_srcptr r1, mp_size_t r1n,
                                                 mp_srcptr r2, mp_size_t r2n)
{
   mp_size_t s1 = ABS(r1n);
   mp_size_t s2 = ABS(r2n);

   if (s1 < s2) _tc4_add(rp, rn, r2, r2n, r1, r1n);
   else _tc4_add(rp, rn, r1, r1n, r2, r2n);
}

void _tc4_add_unsigned(mp_ptr rp, mp_size_t * rn, mp_srcptr r1,
                                   mp_size_t r1n, mp_srcptr r2, mp_size_t r2n)
{
   mp_limb_t cy;
   mp_size_t s1 = r1n;
   mp_size_t s2 = r2n;

   if (!s2)
   {
      if (!s1) *rn = 0;
      else
      {
         if (rp != r1) MPN_COPY(rp, r1, s1);
         *rn = r1n;
      }
   } else
   {
      *rn = r1n;
      cy = mpn_add(rp, r1, s1, r2, s2);
      if (cy)
      {
         rp[s1] = cy;
         if ((*rn) < 0) (*rn)--;
         else (*rn)++;
      }
   }
}

void tc4_add_unsigned(mp_ptr rp, mp_size_t * rn, mp_srcptr r1,
                                    mp_size_t r1n, mp_srcptr r2, mp_size_t r2n)
{
   if (r1n < r2n) _tc4_add_unsigned(rp, rn, r2, r2n, r1, r1n);
   else _tc4_add_unsigned(rp, rn, r1, r1n, r2, r2n);
}

void tc4_sub(mp_ptr rp, mp_size_t * rn, mp_srcptr r1, mp_size_t r1n,
                                                   mp_srcptr r2, mp_size_t r2n)
{
   tc4_add(rp, rn, r1, r1n, r2, -r2n);
}

void tc4_lshift(mp_ptr rp, mp_size_t * rn, mp_srcptr xp,
                                                  mp_size_t xn, mp_size_t bits)
{
   if (xn == 0) *rn = 0;
   else
   {
      mp_size_t xu = ABS(xn);
      mp_limb_t msl = mpn_lshift(rp, xp, xu, bits);
      if (msl)
      {
         rp[xu] = msl;
         *rn = (xn >= 0 ? xn + 1 : xn - 1);
      } else
         *rn = xn;
   }
}

void tc4_rshift_inplace(mp_ptr rp, mp_size_t * rn, mp_size_t bits)
{
   if (*rn)
   {
      if ((*rn) > 0)
      {
         mpn_rshift(rp, rp, *rn, bits);
         if (rp[(*rn) - 1] == CNST_LIMB(0)) (*rn)--;
      } else
      {
         mpn_rshift(rp, rp, -(*rn), bits);
         if (rp[-(*rn) - 1] == CNST_LIMB(0)) (*rn)++;
      }
   }
}

void tc4_addlsh1_unsigned(mp_ptr rp, mp_size_t * rn, mp_srcptr xp, mp_size_t xn)
{
   if (xn)
   {
      if (xn >= *rn)
      {
         mp_limb_t cy;
         if (xn > *rn) MPN_ZERO(rp + *rn, xn - *rn);
#if HAVE_NATIVE_mpn_addlsh1_n
         cy = mpn_addlsh1_n(rp, rp, xp, xn);
#else
         cy = mpn_add_n(rp, rp, xp, xn);
         cy += mpn_add_n(rp, rp, xp, xn);
#endif
         if (cy)
         {
            rp[xn] = cy;
            *rn = xn + 1;
         } else *rn = xn;
      } else
      {
         mp_limb_t cy;
#if HAVE_NATIVE_mpn_addlsh1_n
         cy = mpn_addlsh1_n(rp, rp, xp, xn);
#else
         cy = mpn_add_n(rp, rp, xp, xn);
         cy += mpn_add_n(rp, rp, xp, xn);
#endif
         if (cy) cy = mpn_add_1(rp + xn, rp + xn, *rn - xn, cy);
         if (cy)
         {
            rp[*rn] = cy;
            (*rn)++;
         }
      }
   }
}

void tc4_divexact_ui(mp_ptr rp, mp_size_t * rn, mp_ptr x, mp_size_t xn, mp_limb_t c)
{
  mp_size_t abs_size;
  if (xn == 0)
    {
      *rn = 0;
      return;
    }
  abs_size = ABS (xn);

  MPN_DIVREM_OR_DIVEXACT_1 (rp, x, abs_size, c);
  abs_size -= (rp[abs_size-1] == 0);
  *rn = (xn >= 0 ? abs_size : -abs_size);
}

void tc4_divexact_by3(mp_ptr rp, mp_size_t * rn, mp_ptr x, mp_size_t xn)
{
   if (xn)
   {
      mp_size_t xu = ABS(xn);
      mpn_divexact_by3(rp, x, xu);
      if (xn > 0)
      {
         if (rp[xu - 1] == CNST_LIMB(0)) *rn = xn - 1;
         else *rn = xn;
      } else
      {
         if (rp[xu - 1] == CNST_LIMB(0)) *rn = xn + 1;
         else *rn = xn;
      }
   } else *rn = 0;
}

void tc4_divexact_by15(mp_ptr rp, mp_size_t * rn, mp_ptr x, mp_size_t xn)
{
   if (xn)
   {
      mp_size_t xu = ABS(xn);
      mpn_divexact_byfobm1(rp, x, xu, CNST_LIMB(15), CNST_LIMB((~0)/15)); /* works for 32 and 64 bits */
      if (xn > 0)
      {
         if (rp[xu - 1] == CNST_LIMB(0)) *rn = xn - 1;
         else *rn = xn;
      } else
      {
         if (rp[xu - 1] == CNST_LIMB(0)) *rn = xn + 1;
         else *rn = xn;
      }
   } else *rn = 0;
}

#if HAVE_NATIVE_mpn_mul_1c
#define MPN_MUL_1C(cout, dst, src, size, n, cin)        \
  do {                                                  \
    (cout) = mpn_mul_1c (dst, src, size, n, cin);       \
  } while (0)
#else
#define MPN_MUL_1C(cout, dst, src, size, n, cin)        \
  do {                                                  \
    mp_limb_t __cy;                                     \
    __cy = mpn_mul_1 (dst, src, size, n);               \
    (cout) = __cy + mpn_add_1 (dst, dst, size, cin);    \
  } while (0)
#endif

void tc4_addmul_1(mp_ptr wp, mp_size_t * wn, mp_srcptr xp, mp_size_t xn, mp_limb_t y)
{
  mp_size_t  sign, wu, xu, ws, new_wn, min_size, dsize;
  mp_limb_t  cy;

  /* w unaffected if x==0 or y==0 */
  if (xn == 0 || y == 0)
    return;

  sign = xn;
  xu = ABS (xn);

  ws = *wn;
  if (*wn == 0)
  {
      /* nothing to add to, just set x*y, "sign" gives the sign */
      cy = mpn_mul_1 (wp, xp, xu, y);
      if (cy)
      {
         wp[xu] = cy;
         xu = xu + 1;
      }
      *wn = (sign >= 0 ? xu : -xu);
      return;
  }

  sign ^= *wn;
  wu = ABS (*wn);

  new_wn = MAX (wu, xu);
  min_size = MIN (wu, xu);

  if (sign >= 0)
  {
      /* addmul of absolute values */

      cy = mpn_addmul_1 (wp, xp, min_size, y);

      dsize = xu - wu;
#if HAVE_NATIVE_mpn_mul_1c
      if (dsize > 0)
        cy = mpn_mul_1c (wp + min_size, xp + min_size, dsize, y, cy);
      else if (dsize < 0)
      {
          dsize = -dsize;
          cy = mpn_add_1 (wp + min_size, wp + min_size, dsize, cy);
      }
#else
      if (dsize != 0)
      {
          mp_limb_t cy2;
          if (dsize > 0)
            cy2 = mpn_mul_1 (wp + min_size, xp + min_size, dsize, y);
          else
          {
              dsize = -dsize;
              cy2 = 0;
          }
          cy = cy2 + mpn_add_1 (wp + min_size, wp + min_size, dsize, cy);
      }
#endif

      if (cy)
      {
         wp[dsize + min_size] = cy;
         new_wn ++;
      }
  } else
  {
      /* submul of absolute values */

      cy = mpn_submul_1 (wp, xp, min_size, y);
      if (wu >= xu)
      {
          /* if w bigger than x, then propagate borrow through it */
          if (wu != xu)
            cy = mpn_sub_1 (wp + xu, wp + xu, wu - xu, cy);

          if (cy != 0)
          {
              /* Borrow out of w, take twos complement negative to get
                 absolute value, flip sign of w.  */
              wp[new_wn] = ~-cy;  /* extra limb is 0-cy */
              mpn_not (wp, new_wn);
              new_wn++;
              MPN_INCR_U (wp, new_wn, CNST_LIMB(1));
              ws = -*wn;
          }
      } else /* wu < xu */
      {
          /* x bigger than w, so want x*y-w.  Submul has given w-x*y, so
             take twos complement and use an mpn_mul_1 for the rest.  */

          mp_limb_t  cy2;

          /* -(-cy*b^n + w-x*y) = (cy-1)*b^n + ~(w-x*y) + 1 */
          mpn_not (wp, wu);
          cy += mpn_add_1 (wp, wp, wu, CNST_LIMB(1));
          cy -= 1;

          /* If cy-1 == -1 then hold that -1 for later.  mpn_submul_1 never
             returns cy==MP_LIMB_T_MAX so that value always indicates a -1. */
          cy2 = (cy == MP_LIMB_T_MAX);
          cy += cy2;
          MPN_MUL_1C (cy, wp + wu, xp + wu, xu - wu, y, cy);
          wp[new_wn] = cy;
          new_wn += (cy != 0);

          /* Apply any -1 from above.  The value at wp+wu is non-zero
             because y!=0 and the high limb of x will be non-zero.  */
          if (cy2)
            MPN_DECR_U (wp+wu, new_wn - wu, CNST_LIMB(1));

          ws = -*wn;
      }

      /* submul can produce high zero limbs due to cancellation, whether w
         or x has more limbs */
      MPN_NORMALIZE (wp, new_wn);
  }

  *wn = (ws >= 0 ? new_wn : -new_wn);

  ASSERT (new_wn == 0 || wp[new_wn - 1] != 0);
}

void tc4_submul_1(mp_ptr wp, mp_size_t * wn, mp_srcptr x, mp_size_t xn, mp_limb_t y)
{
   tc4_addmul_1(wp, wn, x, -xn, y);
}

void tc4_copy (mp_ptr yp, mp_size_t * yn, mp_size_t offset, mp_srcptr xp, mp_size_t xn)
{
  mp_size_t yu = ABS(*yn);
  mp_size_t xu = ABS(xn);
  mp_limb_t cy = 0;

  if (xn == 0)
    return;

  if (offset < yu) /* low part of x overlaps with y */
  {
      if (offset + xu <= yu) /* x entirely inside y */
      {
          cy = mpn_add_n (yp + offset, yp + offset, xp, xu);
          if (offset + xu < yu)
            cy = mpn_add_1 (yp + offset + xu, yp + offset + xu,
                            yu - (offset + xu), cy);
      } else
        cy = mpn_add_n (yp + offset, yp + offset, xp, yu - offset);
      /* now cy is the carry at yp + yu */
      if (xu + offset > yu) /* high part of x exceeds y */
      {
          MPN_COPY (yp + yu, xp + yu - offset, xu + offset - yu);
          cy = mpn_add_1 (yp + yu, yp + yu, xu + offset - yu, cy);
          yu = xu + offset;
      }
      /* now cy is the carry at yp + yn */
      if (cy)
        yp[yu++] = cy;
      MPN_NORMALIZE(yp, yu);
      *yn = yu;
  } else /* x does not overlap */
  {
      if (offset > yu)
        MPN_ZERO (yp + yu, offset - yu);
      MPN_COPY (yp + offset, xp, xu);
      *yn = offset + xu;
  }
}

#define MUL_TC4_UNSIGNED(r3xx, n3xx, r1xx, n1xx, r2xx, n2xx) \
   do \
   { \
      if ((n1xx != 0) && (n2xx != 0)) \
      { mp_size_t len; \
         if (n1xx == n2xx) \
         { \
            if (n1xx > MUL_TOOM4_THRESHOLD) mpn_toom4_mul_n(r3xx, r1xx, r2xx, n1xx); \
            else mpn_mul_n(r3xx, r1xx, r2xx, n1xx); \
         } else if (n1xx > n2xx) \
            mpn_mul(r3xx, r1xx, n1xx, r2xx, n2xx); \
         else \
            mpn_mul(r3xx, r2xx, n2xx, r1xx, n1xx); \
         len = n1xx + n2xx; \
         MPN_NORMALIZE(r3xx, len); \
         n3xx = len; \
      } else \
         n3xx = 0; \
   } while (0)

#define MUL_TC4(r3xx, n3xx, r1xx, n1xx, r2xx, n2xx) \
   do \
   { \
      mp_size_t sign = n1xx ^ n2xx; \
      mp_size_t un1 = ABS(n1xx); \
      mp_size_t un2 = ABS(n2xx); \
      MUL_TC4_UNSIGNED(r3xx, n3xx, r1xx, un1, r2xx, un2); \
      if (sign < 0) n3xx = -n3xx; \
   } while (0)

#define SQR_TC4_UNSIGNED(r3xx, n3xx, r1xx, n1xx) \
   do \
   { \
      if (n1xx != 0) \
      { mp_size_t len; \
         if (n1xx > MUL_TOOM4_THRESHOLD) mpn_toom4_sqr_n(r3xx, r1xx, n1xx); \
         else mpn_sqr(r3xx, r1xx, n1xx); \
         len = 2*n1xx; \
         MPN_NORMALIZE(r3xx, len); \
         n3xx = len; \
      } else \
         n3xx = 0; \
   } while (0)

#define SQR_TC4(r3xx, n3xx, r1xx, n1xx) \
   do \
   { \
      mp_size_t un1 = ABS(n1xx); \
      SQR_TC4_UNSIGNED(r3xx, n3xx, r1xx, un1); \
   } while (0)

#define TC4_NORM(rxx, nxx, sxx) \
   do \
   { \
      nxx = sxx; \
      MPN_NORMALIZE(rxx, nxx); \
   } while(0)

/* Zero out limbs to end of integer */
#define TC4_DENORM(rxx, nxx, sxx) \
   do { \
      MPN_ZERO(rxx + ABS(nxx), sxx - ABS(nxx)); \
   } while (0)

/* Two's complement divexact by power of 2 */
#define TC4_DIVEXACT_2EXP(rxx, nxx, sxx) \
   do { \
      mp_limb_t sign = (LIMB_HIGHBIT_TO_MASK(rxx[nxx-1]) << (GMP_LIMB_BITS - sxx)); \
      mpn_rshift(rxx, rxx, nxx, sxx); \
      rxx[nxx-1] |= sign; \
   } while (0)

#if HAVE_NATIVE_mpn_rshift1
#define TC4_RSHIFT1(rxx, nxx) \
   do { \
      mp_limb_t sign = (LIMB_HIGHBIT_TO_MASK(rxx[nxx-1]) << (GMP_LIMB_BITS - 1)); \
      mpn_half(rxx, nxx); \
      rxx[nxx-1] |= sign; \
   } while (0)
#else
#define TC4_RSHIFT1(rxx, nxx) \
   do { \
      mp_limb_t sign = (LIMB_HIGHBIT_TO_MASK(rxx[nxx-1]) << (GMP_LIMB_BITS - 1)); \
      mpn_rshift(rxx, rxx, nxx, 1); \
      rxx[nxx-1] |= sign; \
   } while (0)
#endif

#define r1 (tp)
#define r2 (tp + t4)
#define r4 (tp + 2*t4)
#define r6 (tp + 3*t4)

#define r3 (rp + 4*sn)
#define r5 (rp + 2*sn)
#define r7 (rp)

/* Multiply {up, n} by {vp, n} and write the result to
   {rp, 2n}.

   Note that rp gets 2n limbs stored, even if the actual result
   only needs 2n - 1.
*/
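
/* Illustrative usage sketch (guarded out of the build): for operands of
   equal length above the Toom-4 threshold this routine computes the same
   product as mpn_mul_n, so a hypothetical self-test could compare the two.
   The fixed size 512 and the helper name are assumptions made here for the
   example only. */
#if 0
static void toom4_mul_selftest(void)
{
   mp_size_t n = 512;                       /* assumed to exceed MUL_TOOM4_THRESHOLD */
   mp_limb_t xp[512], yp[512];
   mp_limb_t res_toom[1024], res_ref[1024]; /* a product of two n-limb numbers needs 2n limbs */

   mpn_random(xp, n);
   mpn_random(yp, n);

   mpn_toom4_mul_n(res_toom, xp, yp, n);    /* Toom-4 product */
   mpn_mul_n(res_ref, xp, yp, n);           /* reference product */

   ASSERT_ALWAYS(mpn_cmp(res_toom, res_ref, 2*n) == 0);
}
#endif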

#define mpn_clearit(rxx, nxx) \
  do { \
     mp_size_t ind = 0; \
     for ( ; ind < nxx; ind++) \
        (rxx)[ind] = CNST_LIMB(0); \
  } while (0)

void
mpn_toom4_mul_n (mp_ptr rp, mp_srcptr up,
                 mp_srcptr vp, mp_size_t n)
{
  mp_size_t ind;
  mp_limb_t cy, cy2, r30, r31;
  mp_ptr tp;
  mp_size_t sn, n1, n2, n3, n4, n5, n6, n7, n8, rpn, t4, h1;
  TMP_DECL;

  sn = (n + 3) / 4;

  h1 = n - 3*sn;

#define a0 (up)
#define a1 (up + sn)
#define a2 (up + 2*sn)
#define a3 (up + 3*sn)
#define b0 (vp)
#define b1 (vp + sn)
#define b2 (vp + 2*sn)
#define b3 (vp + 3*sn)

   t4 = 2*sn+2; /* allows mult of 2 integers of sn + 1 limbs */

   TMP_MARK;

   tp = TMP_ALLOC_LIMBS(4*t4 + 5*(sn + 1));

#define u2 (tp + 4*t4)
#define u3 (tp + 4*t4 + (sn+1))
#define u4 (tp + 4*t4 + 2*(sn+1))
#define u5 (tp + 4*t4 + 3*(sn+1))
#define u6 (tp + 4*t4 + 4*(sn+1))

   u6[sn] = mpn_add(u6, a1, sn, a3, h1);
   u5[sn] = mpn_add_n(u5, a2, a0, sn);
   mpn_add_n(u3, u5, u6, sn + 1);
   n4 = sn + 1;
   if (mpn_cmp(u5, u6, sn + 1) >= 0)
      mpn_sub_n(u4, u5, u6, sn + 1);
   else
   {
      mpn_sub_n(u4, u6, u5, sn + 1);
      n4 = -n4;
   }

   u6[sn] = mpn_add(u6, b1, sn, b3, h1);
   u5[sn] = mpn_add_n(u5, b2, b0, sn);
   mpn_add_n(r2, u5, u6, sn + 1);
   n5 = sn + 1;
   if (mpn_cmp(u5, u6, sn + 1) >= 0)
      mpn_sub_n(u5, u5, u6, sn + 1);
   else
   {
      mpn_sub_n(u5, u6, u5, sn + 1);
      n5 = -n5;
   }

   MUL_TC4_UNSIGNED(r3, n3, u3, sn + 1, r2, sn + 1); /* 1 */
   MUL_TC4(r4, n4, u4, n4, u5, n5); /* -1 */

#if HAVE_NATIVE_mpn_addlsh_n
   r1[sn] = mpn_addlsh_n(r1, a2, a0, sn, 2);
   mpn_lshift(r1, r1, sn + 1, 1);
   cy = mpn_addlsh_n(r2, a3, a1, h1, 2);
#else
   r1[sn] = mpn_lshift(r1, a2, sn, 1);
   MPN_COPY(r2, a3, h1);
   r1[sn] += mpn_addmul_1(r1, a0, sn, 8);
   cy = mpn_addmul_1(r2, a1, h1, 4);
#endif
   if (sn > h1)
   {
      cy2 = mpn_lshift(r2 + h1, a1 + h1, sn - h1, 2);
      cy = cy2 + mpn_add_1(r2 + h1, r2 + h1, sn - h1, cy);
   }
   r2[sn] = cy;
   mpn_add_n(u5, r1, r2, sn + 1);
   n6 = sn + 1;
   if (mpn_cmp(r1, r2, sn + 1) >= 0)
      mpn_sub_n(u6, r1, r2, sn + 1);
   else
   {
      mpn_sub_n(u6, r2, r1, sn + 1);
      n6 = -n6;
   }

#if HAVE_NATIVE_mpn_addlsh_n
   r1[sn] = mpn_addlsh_n(r1, b2, b0, sn, 2);
   mpn_lshift(r1, r1, sn + 1, 1);
   cy = mpn_addlsh_n(r2, b3, b1, h1, 2);
#else
   r1[sn] = mpn_lshift(r1, b2, sn, 1);
   MPN_COPY(r2, b3, h1);
   r1[sn] += mpn_addmul_1(r1, b0, sn, 8);
   cy = mpn_addmul_1(r2, b1, h1, 4);
#endif
   if (sn > h1)
   {
      cy2 = mpn_lshift(r2 + h1, b1 + h1, sn - h1, 2);
      cy = cy2 + mpn_add_1(r2 + h1, r2 + h1, sn - h1, cy);
   }
   r2[sn] = cy;
   mpn_add_n(u2, r1, r2, sn + 1);
   n8 = sn + 1;
   if (mpn_cmp(r1, r2, sn + 1) >= 0)
      mpn_sub_n(r2, r1, r2, sn + 1);
   else
   {
      mpn_sub_n(r2, r2, r1, sn + 1);
      n8 = -n8;
   }

   r30 = r3[0];
   r31 = r3[1];
   MUL_TC4_UNSIGNED(r5, n5, u5, sn + 1, u2, sn + 1); /* 1/2 */
   MUL_TC4(r6, n6, u6, n6, r2, n8); /* -1/2 */
   r3[1] = r31;

#if HAVE_NATIVE_mpn_addlsh1_n
   cy = mpn_addlsh1_n(u2, a2, a3, h1);
   if (sn > h1)
      cy = mpn_add_1(u2 + h1, a2 + h1, sn - h1, cy);
   u2[sn] = cy;
   u2[sn] = 2*u2[sn] + mpn_addlsh1_n(u2, a1, u2, sn);
   u2[sn] = 2*u2[sn] + mpn_addlsh1_n(u2, a0, u2, sn);
#else
   MPN_COPY(u2, a0, sn);
   u2[sn] = mpn_addmul_1(u2, a1, sn, 2);
   u2[sn] += mpn_addmul_1(u2, a2, sn, 4);
   cy = mpn_addmul_1(u2, a3, h1, 8);
   if (sn > h1) cy = mpn_add_1(u2 + h1, u2 + h1, sn - h1, cy);
   u2[sn] += cy;
#endif

#if HAVE_NATIVE_mpn_addlsh1_n
   cy = mpn_addlsh1_n(r1, b2, b3, h1);
   if (sn > h1)
      cy = mpn_add_1(r1 + h1, b2 + h1, sn - h1, cy);
   r1[sn] = cy;
   r1[sn] = 2*r1[sn] + mpn_addlsh1_n(r1, b1, r1, sn);
   r1[sn] = 2*r1[sn] + mpn_addlsh1_n(r1, b0, r1, sn);
#else
   MPN_COPY(r1, b0, sn);
   r1[sn] = mpn_addmul_1(r1, b1, sn, 2);
   r1[sn] += mpn_addmul_1(r1, b2, sn, 4);
   cy = mpn_addmul_1(r1, b3, h1, 8);
   if (sn > h1) cy = mpn_add_1(r1 + h1, r1 + h1, sn - h1, cy);
   r1[sn] += cy;
#endif

   MUL_TC4_UNSIGNED(r2, n2, u2, sn + 1, r1, sn + 1); /* 2 */

   MUL_TC4_UNSIGNED(r1, n1, a3, h1, b3, h1); /* oo */
   MUL_TC4_UNSIGNED(r7, n7, a0, sn, b0, sn); /* 0 */

   TC4_DENORM(r1, n1, t4 - 1);

/* rp        rp1          rp2           rp3          rp4           rp5         rp6           rp7
<----------- r7-----------><------------r5-------------->
                                                       <-------------r3------------->

              <-------------r6------------->                        < -----------r2------------>{           }
                                         <-------------r4-------------->         <--------------r1---->
*/

   mpn_toom4_interpolate(rp, &rpn, sn, tp, t4 - 1, n4, n6, r30);

   if (rpn != 2*n)
   {
      MPN_ZERO((rp + rpn), 2*n - rpn);
   }

   TMP_FREE;
}

/* Square {up, n} and write the result to {rp, 2n}.

   Note that rp gets 2n limbs stored, even if the actual result
   only needs 2n - 1.
*/
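
/* Illustrative check (guarded out of the build): squaring must agree with
   multiplying an operand by itself, so a hypothetical sanity test could
   compare the two internal entry points. The scratch areas are assumed to
   hold 2n limbs each. */
#if 0
static void toom4_sqr_selftest(mp_ptr sq, mp_ptr prod, mp_srcptr xp, mp_size_t n)
{
   mpn_toom4_sqr_n(sq, xp, n);        /* Toom-4 square */
   mpn_toom4_mul_n(prod, xp, xp, n);  /* same value via the mul entry point */
   ASSERT_ALWAYS(mpn_cmp(sq, prod, 2*n) == 0);
}
#endif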

void
mpn_toom4_sqr_n (mp_ptr rp, mp_srcptr up, mp_size_t n)
{
  mp_size_t len1, ind;
  mp_limb_t cy, r30, r31;
  mp_ptr tp;
  mp_size_t a0n, a1n, a2n, a3n, sn, n1, n2, n3, n4, n5, n6, n7, n8, n9, rpn, t4;

  len1 = n;
  ASSERT (n >= 1);

  MPN_NORMALIZE(up, len1);

  sn = (n - 1) / 4 + 1;

  /* a0 - a3 are defined in mpn_toom4_mul_n above */

   TC4_NORM(a0, a0n, sn);
   TC4_NORM(a1, a1n, sn);
   TC4_NORM(a2, a2n, sn);
   TC4_NORM(a3, a3n, n - 3*sn);

   t4 = 2*sn+2; /* allows mult of 2 integers of sn + 1 limbs */

   tp = __GMP_ALLOCATE_FUNC_LIMBS(4*t4 + 4*(sn + 1));

   tc4_add_unsigned(u5, &n5, a3, a3n, a1, a1n);
   tc4_add_unsigned(u4, &n4, a2, a2n, a0, a0n);
   tc4_add_unsigned(u2, &n2, u4, n4, u5, n5);
   tc4_sub(u3, &n3, u4, n4, u5, n5);

   SQR_TC4(r4, n4, u3, n3);
   SQR_TC4_UNSIGNED(r3, n3, u2, n2);

   tc4_lshift(r1, &n1, a0, a0n, 3);
   tc4_addlsh1_unsigned(r1, &n1, a2, a2n);
   tc4_lshift(r2, &n8, a1, a1n, 2);
   tc4_add(r2, &n8, r2, n8, a3, a3n);
   tc4_add(u4, &n9, r1, n1, r2, n8);
   tc4_sub(u5, &n5, r1, n1, r2, n8);

   r30 = r3[0];
   if (!n3) r30 = CNST_LIMB(0);
   r31 = r3[1];
   SQR_TC4(r6, n6, u5, n5);
   SQR_TC4_UNSIGNED(r5, n5, u4, n9);
   r3[1] = r31;

   tc4_lshift(u2, &n8, a3, a3n, 3);
   tc4_addmul_1(u2, &n8, a2, a2n, 4);
   tc4_addlsh1_unsigned(u2, &n8, a1, a1n);
   tc4_add(u2, &n8, u2, n8, a0, a0n);

   SQR_TC4_UNSIGNED(r2, n2, u2, n8);
   SQR_TC4_UNSIGNED(r1, n1, a3, a3n);
   SQR_TC4_UNSIGNED(r7, n7, a0, a0n);

   TC4_DENORM(r1, n1,  t4 - 1);
   TC4_DENORM(r2, n2,  t4 - 1);
   if (n3)
     TC4_DENORM(r3, n3,  t4 - 1);
   else {
     /* MPN_ZERO defeats gcc 4.1.2 here, hence the explicit for loop */
     for (ind = 1 ; ind < t4 - 1; ind++)
        (r3)[ind] = CNST_LIMB(0);
   }
   TC4_DENORM(r4, n4,  t4 - 1);
   TC4_DENORM(r5, n5,  t4 - 1);
   TC4_DENORM(r6, n6,  t4 - 1);
   TC4_DENORM(r7, n7,  t4 - 2); /* we treat r7 differently (it cannot exceed t4-2 in length) */

/* rp        rp1          rp2           rp3          rp4           rp5         rp6           rp7
<----------- r7-----------><------------r5-------------->
                                                       <-------------r3------------->

              <-------------r6------------->                        < -----------r2------------>{           }
                                         <-------------r4-------------->         <--------------r1---->
*/

   mpn_toom4_interpolate(rp, &rpn, sn, tp, t4 - 1, n4, n6, r30);

   if (rpn != 2*n)
   {
      MPN_ZERO((rp + rpn), 2*n - rpn);
   }

   __GMP_FREE_FUNC_LIMBS (tp, 4*t4 + 4*(sn+1));
}

/*
   Toom-4 interpolation. Given the values
   p(oo), p(2), p(1), p(-1), 2^6*p(1/2), 2^6*p(-1/2), p(0)
   of a polynomial p(x) with 7 coefficients, recover the coefficients and
   evaluate p at 2^(sn*B), where B is the number of bits per limb. The
   result is written to rp and its final length in limbs is returned in rpn.
   The 4th and 6th values (p(-1) and 2^6*p(-1/2)) may be negative; if so,
   n4 and n6 respectively must be passed as negative sizes.
   To save space, r3, r5 and r7 are passed in place in the output rp;
   the other r's are stored separately in the scratch space tp.
   The low limb of r3 is passed in r30, since its slot in rp is
   overwritten by the high limb of r5.

rp          rp1          rp2           rp3          rp4           rp5         rp6           rp7
<----------- r7-----------><------------r5-------------->
                                                      <-------------r3------------->

   We assume that r1 is stored at tp, r2 at (tp + t4), r4 at (tp + 2*t4)
   and r6 at (tp + 3*t4). Each of these r's has t4 = s4 + 1 limbs allocated.
*/
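
/* For reference (a sketch of the linear system; the names c0..c6 below do
   not appear in the code): writing p(x) = c0 + c1*x + ... + c6*x^6, the
   seven supplied values are, with the stated scaling,

      p(0)        =  c0
      p(1)        =  c0 +  c1  +  c2  +  c3 +  c4  +  c5  +  c6
      p(-1)       =  c0 -  c1  +  c2  -  c3 +  c4  -  c5  +  c6
      p(2)        =  c0 + 2*c1 + 4*c2 + 8*c3 + 16*c4 + 32*c5 + 64*c6
      2^6*p(1/2)  = 64*c0 + 32*c1 + 16*c2 + 8*c3 + 4*c4 + 2*c5 + c6
      2^6*p(-1/2) = 64*c0 - 32*c1 + 16*c2 - 8*c3 + 4*c4 - 2*c5 + c6
      p(oo)       =  c6

   The code below recovers c0..c6 from these values using only shifts, small
   scalar multiplications, and exact divisions by 3 and 15. */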
void mpn_toom4_interpolate(mp_ptr rp, mp_size_t * rpn, mp_size_t sn,
                           mp_ptr tp, mp_size_t s4, mp_size_t n4, mp_size_t n6, mp_limb_t r30)
{
   mp_size_t n1, n2, n3, n5, n7, t4;
   mp_limb_t saved, saved2, cy;

   t4 = s4 + 1;

   mpn_add_n(r2, r2, r5, s4);

   if (n6 < 0)
      mpn_add_n(r6, r5, r6, s4);
   else
      mpn_sub_n(r6, r5, r6, s4);
   /* r6 is now in twos complement format */

   saved = r3[0];
   r3[0] = r30;
   if (n4 < 0)
      mpn_add_n(r4, r3, r4, s4);
   else
      mpn_sub_n(r4, r3, r4, s4);
   r3[0] = saved;
   /* r4 is now in twos complement format */

   mpn_sub_n(r5, r5, r1, s4);

#if HAVE_NATIVE_mpn_sublsh_n
   r5[s4-1] -= mpn_sublsh_n(r5, r5, r7, s4-1, 6);
#else
   r5[s4-1] -= mpn_submul_1(r5, r7, s4-1, 64);
#endif

   TC4_RSHIFT1(r4, s4);

   saved = r3[0];
   r3[0] = r30;
   mpn_sub_n(r3, r3, r4, s4);
   r30 = r3[0];
   r3[0] = saved;

   mpn_double(r5, s4);

   mpn_sub_n(r5, r5, r6, s4);

   saved = r3[0];
   r3[0] = r30;
   mpn_submul_1(r2, r3, s4, 65);
   r3[0] = saved;

   saved2 = r7[s4-1];
   r7[s4-1] = CNST_LIMB(0); /* r7 is always positive so no sign extend needed */
   saved = r3[0];
   r3[0] = r30;
#if HAVE_NATIVE_mpn_subadd_n
   mpn_subadd_n(r3, r3, r7, r1, s4);
#else
   mpn_sub_n(r3, r3, r7, s4);
   mpn_sub_n(r3, r3, r1, s4);
#endif
   r7[s4-1] = saved2;
   r30 = r3[0];

   mpn_addmul_1(r2, r3, s4, 45);

#if HAVE_NATIVE_mpn_sublsh_n
   cy = mpn_sublsh_n(r5, r5, r3, s4 - 1, 3);
#else
   cy = mpn_submul_1(r5, r3, s4 - 1, 8);
#endif
   r3[0] = saved;
   r3[0] -= (cy + 8*r3[s4-1]);

   mpn_rshift(r5, r5, s4, 3);

   mpn_divexact_by3(r5, r5, s4);

   mpn_sub_n(r6, r6, r2, s4);

#if HAVE_NATIVE_mpn_sublsh_n
   mpn_sublsh_n(r2, r2, r4, s4, 4);
#else
   mpn_submul_1(r2, r4, s4, 16);
#endif

   mpn_rshift(r2, r2, s4, 1);

   mpn_divexact_by3(r2, r2, s4);

   mpn_divexact_by3(r2, r2, s4);

   saved = r3[0];
   r3[0] = r30;
   cy = mpn_sub_n(r3, r3, r5, s4 - 1);
   r30 = r3[0];
   r3[0] = saved;
   r3[s4-1] -= (cy + r5[s4-1]);

   mpn_sub_n(r4, r4, r2, s4);

   mpn_addmul_1(r6, r2, s4, 30);

   mpn_divexact_byfobm1(r6, r6, s4, CNST_LIMB(15), CNST_LIMB(~0/15));

   mpn_rshift(r6, r6, s4, 2);

   mpn_sub_n(r2, r2, r6, s4);

   TC4_NORM(r1, n1, s4);
   TC4_NORM(r2, n2, s4);

   (*rpn) = 6*sn+1;
   cy = mpn_add_1(r3, r3, *rpn - 4*sn, r30); /* don't forget to add r3[0] back in */
   if (cy)
   {
      rp[*rpn] = cy;
      (*rpn)++;
   }

   tc4_copy(rp, rpn, 5*sn, r2, n2);
   tc4_copy(rp, rpn, 6*sn, r1, n1);

   tc4_copy(rp, rpn, sn, r6, s4);
   tc4_copy(rp, rpn, 3*sn, r4, s4);
}