/* mpn_toom4_mul_n -- Internal routine to multiply two natural numbers
   of length n.

   THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
*/

/* Implementation of the Bodrato-Zanoni algorithm for Toom-Cook 4-way.

Copyright 2001, 2002, 2004, 2005, 2006 Free Software Foundation, Inc.
Copyright 2009 William Hart

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */

/*
   This implementation is based on that of Paul Zimmermann, which is available
   for mpz_t's at http://www.loria.fr/~zimmerma/software/toom4.c
*/
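
/*
   In outline: each n-limb input is split into four blocks of
   sn = ceil(n/4) limbs (the most significant block having
   h1 = n - 3*sn limbs) and viewed as the value of a cubic

      u(x) = a3*x^3 + a2*x^2 + a1*x + a0   at   x = B^sn,

   where B = 2^GMP_NUMB_BITS.  The product u(x)*v(x) is then a
   polynomial of degree 6, and its seven coefficients are recovered
   from the seven evaluations p(0), p(1), p(-1), p(2), 2^6*p(1/2),
   2^6*p(-1/2) and p(oo) by mpn_toom4_interpolate below.
*/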

#include "mpir.h"
#include "gmp-impl.h"
#include "longlong.h"

void
mpn_toom4_mul_n (mp_ptr rp, mp_srcptr up,
		 mp_srcptr vp, mp_size_t n);

/* Add {r1, |r1n|} and {r2, |r2n|}, where the sign of each mp_size_t count
   carries the sign of its operand; the signed result length is written to
   rn.  Assumes |r1n| >= |r2n| (tc4_add below ensures this). */
void _tc4_add(mp_ptr rp, mp_size_t * rn, mp_srcptr r1, mp_size_t r1n,
              mp_srcptr r2, mp_size_t r2n)
{
   mp_limb_t cy;
   mp_size_t s1 = ABS(r1n);
   mp_size_t s2 = ABS(r2n);

   if (!s1)
   {
      *rn = 0;
   } else if (!s2)
   {
      if (rp != r1) MPN_COPY(rp, r1, s1);
      *rn = r1n;
   } else if ((r1n ^ r2n) >= 0)
   {
      *rn = r1n;
      cy = mpn_add(rp, r1, s1, r2, s2);
      if (cy)
      {
         rp[s1] = cy;
         if ((*rn) < 0) (*rn)--;
         else (*rn)++;
      }
   } else
   {
      mp_size_t ct;
      if (s1 != s2) ct = 1;
      else MPN_CMP(ct, r1, r2, s1);

      if (!ct) *rn = 0;
      else if (ct > 0)
      {
         mpn_sub(rp, r1, s1, r2, s2);
         *rn = s1;
         MPN_NORMALIZE(rp, (*rn));
         if (r1n < 0) *rn = -(*rn);
      }
      else
      {
         mpn_sub_n(rp, r2, r1, s1);
         *rn = s1;
         MPN_NORMALIZE(rp, (*rn));
         if (r1n > 0) *rn = -(*rn);
      }
   }
}

void tc4_add(mp_ptr rp, mp_size_t * rn, mp_srcptr r1, mp_size_t r1n,
             mp_srcptr r2, mp_size_t r2n)
{
   mp_size_t s1 = ABS(r1n);
   mp_size_t s2 = ABS(r2n);

   if (s1 < s2) _tc4_add(rp, rn, r2, r2n, r1, r1n);
   else _tc4_add(rp, rn, r1, r1n, r2, r2n);
}

void _tc4_add_unsigned(mp_ptr rp, mp_size_t * rn, mp_srcptr r1,
                       mp_size_t r1n, mp_srcptr r2, mp_size_t r2n)
{
   mp_limb_t cy;
   mp_size_t s1 = r1n;
   mp_size_t s2 = r2n;

   if (!s2)
   {
      if (!s1) *rn = 0;
      else
      {
         if (rp != r1) MPN_COPY(rp, r1, s1);
         *rn = r1n;
      }
   } else
   {
      *rn = r1n;
      cy = mpn_add(rp, r1, s1, r2, s2);
      if (cy)
      {
         rp[s1] = cy;
         if ((*rn) < 0) (*rn)--;
         else (*rn)++;
      }
   }
}

void tc4_add_unsigned(mp_ptr rp, mp_size_t * rn, mp_srcptr r1,
                      mp_size_t r1n, mp_srcptr r2, mp_size_t r2n)
{
   if (r1n < r2n) _tc4_add_unsigned(rp, rn, r2, r2n, r1, r1n);
   else _tc4_add_unsigned(rp, rn, r1, r1n, r2, r2n);
}

void tc4_sub(mp_ptr rp, mp_size_t * rn, mp_srcptr r1, mp_size_t r1n,
             mp_srcptr r2, mp_size_t r2n)
{
   tc4_add(rp, rn, r1, r1n, r2, -r2n);
}

void tc4_lshift(mp_ptr rp, mp_size_t * rn, mp_srcptr xp,
                mp_size_t xn, mp_size_t bits)
{
   if (xn == 0) *rn = 0;
   else
   {
      mp_size_t xu = ABS(xn);
      mp_limb_t msl = mpn_lshift(rp, xp, xu, bits);
      if (msl)
      {
         rp[xu] = msl;
         *rn = (xn >= 0 ? xn + 1 : xn - 1);
      } else
         *rn = xn;
   }
}

void tc4_rshift_inplace(mp_ptr rp, mp_size_t * rn, mp_size_t bits)
{
   if (*rn)
   {
      if ((*rn) > 0)
      {
         mpn_rshift(rp, rp, *rn, bits);
         if (rp[(*rn) - 1] == CNST_LIMB(0)) (*rn)--;
      } else
      {
         mpn_rshift(rp, rp, -(*rn), bits);
         if (rp[-(*rn) - 1] == CNST_LIMB(0)) (*rn)++;
      }
   }
}

/* Set {rp, *rn} to {rp, *rn} + 2*{xp, xn}; both operands are unsigned. */
void tc4_addlsh1_unsigned(mp_ptr rp, mp_size_t * rn, mp_srcptr xp, mp_size_t xn)
{
   if (xn)
   {
      if (xn >= *rn)
      {
         mp_limb_t cy;
         if (xn > *rn) MPN_ZERO(rp + *rn, xn - *rn);
#if HAVE_NATIVE_mpn_addlsh1_n
         cy = mpn_addlsh1_n(rp, rp, xp, xn);
#else
         /* no native addlsh1: adding x twice is the same as adding 2*x */
         cy = mpn_add_n(rp, rp, xp, xn);
         cy += mpn_add_n(rp, rp, xp, xn);
#endif
         if (cy)
         {
            rp[xn] = cy;
            *rn = xn + 1;
         } else *rn = xn;
      } else
      {
         mp_limb_t cy;
#if HAVE_NATIVE_mpn_addlsh1_n
         cy = mpn_addlsh1_n(rp, rp, xp, xn);
#else
         cy = mpn_add_n(rp, rp, xp, xn);
         cy += mpn_add_n(rp, rp, xp, xn);
#endif
         if (cy) cy = mpn_add_1(rp + xn, rp + xn, *rn - xn, cy);
         if (cy)
         {
            rp[*rn] = cy;
            (*rn)++;
         }
      }
   }
}

void tc4_divexact_ui(mp_ptr rp, mp_size_t * rn, mp_ptr x, mp_size_t xn, mp_limb_t c)
{
   mp_size_t abs_size;
   if (xn == 0)
   {
      *rn = 0;
      return;
   }
   abs_size = ABS (xn);

   MPN_DIVREM_OR_DIVEXACT_1 (rp, x, abs_size, c);
   abs_size -= (rp[abs_size-1] == 0);
   *rn = (xn >= 0 ? abs_size : -abs_size);
}

void tc4_divexact_by3(mp_ptr rp, mp_size_t * rn, mp_ptr x, mp_size_t xn)
{
   if (xn)
   {
      mp_size_t xu = ABS(xn);
      mpn_divexact_by3(rp, x, xu);
      if (xn > 0)
      {
         if (rp[xu - 1] == CNST_LIMB(0)) *rn = xn - 1;
         else *rn = xn;
      } else
      {
         if (rp[xu - 1] == CNST_LIMB(0)) *rn = xn + 1;
         else *rn = xn;
      }
   } else *rn = 0;
}

void tc4_divexact_by15(mp_ptr rp, mp_size_t * rn, mp_ptr x, mp_size_t xn)
{
   if (xn)
   {
      mp_size_t xu = ABS(xn);
      mpn_divexact_byfobm1(rp, x, xu, CNST_LIMB(15), CNST_LIMB((~0)/15)); /* works for 32 and 64 bit limbs */
      if (xn > 0)
      {
         if (rp[xu - 1] == CNST_LIMB(0)) *rn = xn - 1;
         else *rn = xn;
      } else
      {
         if (rp[xu - 1] == CNST_LIMB(0)) *rn = xn + 1;
         else *rn = xn;
      }
   } else *rn = 0;
}

#if HAVE_NATIVE_mpn_mul_1c
#define MPN_MUL_1C(cout, dst, src, size, n, cin)     \
   do {                                              \
      (cout) = mpn_mul_1c (dst, src, size, n, cin);  \
   } while (0)
#else
#define MPN_MUL_1C(cout, dst, src, size, n, cin)     \
   do {                                              \
      mp_limb_t __cy;                                \
      __cy = mpn_mul_1 (dst, src, size, n);          \
      (cout) = __cy + mpn_add_1 (dst, dst, size, cin); \
   } while (0)
#endif

/* Set {wp, *wn} to {wp, *wn} + {xp, xn} * y, with the sign of the
   mp_size_t counts again carrying the sign of each operand. */
void tc4_addmul_1(mp_ptr wp, mp_size_t * wn, mp_srcptr xp, mp_size_t xn, mp_limb_t y)
{
   mp_size_t sign, wu, xu, ws, new_wn, min_size, dsize;
   mp_limb_t cy;

   /* w unaffected if x==0 or y==0 */
   if (xn == 0 || y == 0)
      return;

   sign = xn;
   xu = ABS (xn);

   ws = *wn;
   if (*wn == 0)
   {
      /* nothing to add to, just set x*y, "sign" gives the sign */
      cy = mpn_mul_1 (wp, xp, xu, y);
      if (cy)
      {
         wp[xu] = cy;
         xu = xu + 1;
      }
      *wn = (sign >= 0 ? xu : -xu);
      return;
   }


   sign ^= *wn;
   wu = ABS (*wn);

   new_wn = MAX (wu, xu);
   min_size = MIN (wu, xu);

   if (sign >= 0)
   {
      /* addmul of absolute values */

      cy = mpn_addmul_1 (wp, xp, min_size, y);

      dsize = xu - wu;
#if HAVE_NATIVE_mpn_mul_1c
      if (dsize > 0)
         cy = mpn_mul_1c (wp + min_size, xp + min_size, dsize, y, cy);
      else if (dsize < 0)
      {
         dsize = -dsize;
         cy = mpn_add_1 (wp + min_size, wp + min_size, dsize, cy);
      }
#else
      if (dsize != 0)
      {
         mp_limb_t cy2;
         if (dsize > 0)
            cy2 = mpn_mul_1 (wp + min_size, xp + min_size, dsize, y);
         else
         {
            dsize = -dsize;
            cy2 = 0;
         }
         cy = cy2 + mpn_add_1 (wp + min_size, wp + min_size, dsize, cy);
      }
#endif

      if (cy)
      {
         wp[dsize + min_size] = cy;
         new_wn++;
      }
   } else
   {
      /* submul of absolute values */

      cy = mpn_submul_1 (wp, xp, min_size, y);
      if (wu >= xu)
      {
         /* if w bigger than x, then propagate borrow through it */
         if (wu != xu)
            cy = mpn_sub_1 (wp + xu, wp + xu, wu - xu, cy);

         if (cy != 0)
         {
            /* Borrow out of w, take twos complement negative to get
               absolute value, flip sign of w. */
            wp[new_wn] = ~-cy;  /* extra limb is 0-cy */
            mpn_not (wp, new_wn);
            new_wn++;
            MPN_INCR_U (wp, new_wn, CNST_LIMB(1));
            ws = -*wn;
         }
      } else /* wu < xu */
      {
         /* x bigger than w, so want x*y-w.  Submul has given w-x*y, so
            take twos complement and use an mpn_mul_1 for the rest. */

         mp_limb_t cy2;

         /* -(-cy*b^n + w-x*y) = (cy-1)*b^n + ~(w-x*y) + 1 */
         mpn_not (wp, wu);
         cy += mpn_add_1 (wp, wp, wu, CNST_LIMB(1));
         cy -= 1;

         /* If cy-1 == -1 then hold that -1 for later.  mpn_submul_1 never
            returns cy==MP_LIMB_T_MAX so that value always indicates a -1. */
         cy2 = (cy == MP_LIMB_T_MAX);
         cy += cy2;
         MPN_MUL_1C (cy, wp + wu, xp + wu, xu - wu, y, cy);
         wp[new_wn] = cy;
         new_wn += (cy != 0);

         /* Apply any -1 from above.  The value at wp + wu is non-zero
            because y != 0 and the high limb of x will be non-zero. */
         if (cy2)
            MPN_DECR_U (wp+wu, new_wn - wu, CNST_LIMB(1));

         ws = -*wn;
      }

      /* submul can produce high zero limbs due to cancellation, both when w
         has more limbs or x has more */
      MPN_NORMALIZE (wp, new_wn);
   }

   *wn = (ws >= 0 ? new_wn : -new_wn);

   ASSERT (new_wn == 0 || wp[new_wn - 1] != 0);
}

void tc4_submul_1(mp_ptr wp, mp_size_t * wn, mp_srcptr x, mp_size_t xn, mp_limb_t y)
{
   tc4_addmul_1(wp, wn, x, -xn, y);
}

/* Despite the name, this adds {xp, xn} into {yp, *yn} at limb offset
   "offset", extending y as needed; it is a plain copy only when the
   two regions do not overlap. */
void tc4_copy (mp_ptr yp, mp_size_t * yn, mp_size_t offset, mp_srcptr xp, mp_size_t xn)
{
   mp_size_t yu = ABS(*yn);
   mp_size_t xu = ABS(xn);
   mp_limb_t cy = 0;

   if (xn == 0)
      return;

   if (offset < yu) /* low part of x overlaps with y */
   {
      if (offset + xu <= yu) /* x entirely inside y */
      {
         cy = mpn_add_n (yp + offset, yp + offset, xp, xu);
         if (offset + xu < yu)
            cy = mpn_add_1 (yp + offset + xu, yp + offset + xu,
                            yu - (offset + xu), cy);
      } else
         cy = mpn_add_n (yp + offset, yp + offset, xp, yu - offset);
      /* now cy is the carry at yp + yu */
      if (xu + offset > yu) /* high part of x exceeds y */
      {
         MPN_COPY (yp + yu, xp + yu - offset, xu + offset - yu);
         cy = mpn_add_1 (yp + yu, yp + yu, xu + offset - yu, cy);
         yu = xu + offset;
      }
      /* now cy is the carry at yp + yu */
      if (cy)
         yp[yu++] = cy;
      MPN_NORMALIZE(yp, yu);
      *yn = yu;
   } else /* x does not overlap */
   {
      if (offset > yu)
         MPN_ZERO (yp + yu, offset - yu);
      MPN_COPY (yp + offset, xp, xu);
      *yn = offset + xu;
   }
}

#define MUL_TC4_UNSIGNED(r3xx, n3xx, r1xx, n1xx, r2xx, n2xx) \
   do \
   { \
      if ((n1xx != 0) && (n2xx != 0)) \
      { mp_size_t len; \
         if (n1xx == n2xx) \
         { \
            if (n1xx > MUL_TOOM4_THRESHOLD) mpn_toom4_mul_n(r3xx, r1xx, r2xx, n1xx); \
            else mpn_mul_n(r3xx, r1xx, r2xx, n1xx); \
         } else if (n1xx > n2xx) \
            mpn_mul(r3xx, r1xx, n1xx, r2xx, n2xx); \
         else \
            mpn_mul(r3xx, r2xx, n2xx, r1xx, n1xx); \
         len = n1xx + n2xx; \
         MPN_NORMALIZE(r3xx, len); \
         n3xx = len; \
      } else \
         n3xx = 0; \
   } while (0)

#define MUL_TC4(r3xx, n3xx, r1xx, n1xx, r2xx, n2xx) \
   do \
   { \
      mp_size_t sign = n1xx ^ n2xx; \
      mp_size_t un1 = ABS(n1xx); \
      mp_size_t un2 = ABS(n2xx); \
      MUL_TC4_UNSIGNED(r3xx, n3xx, r1xx, un1, r2xx, un2); \
      if (sign < 0) n3xx = -n3xx; \
   } while (0)

#define SQR_TC4_UNSIGNED(r3xx, n3xx, r1xx, n1xx) \
   do \
   { \
      if (n1xx != 0) \
      { mp_size_t len; \
         if (n1xx > MUL_TOOM4_THRESHOLD) mpn_toom4_sqr_n(r3xx, r1xx, n1xx); \
         else mpn_sqr(r3xx, r1xx, n1xx); \
         len = 2*n1xx; \
         MPN_NORMALIZE(r3xx, len); \
         n3xx = len; \
      } else \
         n3xx = 0; \
   } while (0)

#define SQR_TC4(r3xx, n3xx, r1xx, n1xx) \
   do \
   { \
      mp_size_t un1 = ABS(n1xx); \
      SQR_TC4_UNSIGNED(r3xx, n3xx, r1xx, un1); \
   } while (0)

#define TC4_NORM(rxx, nxx, sxx) \
   do \
   { \
      nxx = sxx; \
      MPN_NORMALIZE(rxx, nxx); \
   } while(0)

/* Zero out limbs to end of integer */
#define TC4_DENORM(rxx, nxx, sxx) \
   do { \
      MPN_ZERO(rxx + ABS(nxx), sxx - ABS(nxx)); \
   } while (0)

/* Two's complement divexact by power of 2 */
#define TC4_DIVEXACT_2EXP(rxx, nxx, sxx) \
   do { \
      mp_limb_t sign = (LIMB_HIGHBIT_TO_MASK(rxx[nxx-1]) << (GMP_LIMB_BITS - sxx)); \
      mpn_rshift(rxx, rxx, nxx, sxx); \
      rxx[nxx-1] |= sign; \
   } while (0)
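
/* For example, with one limb and GMP_LIMB_BITS = 64, dividing the two's
   complement value {0xFFFFFFFFFFFFFFF8}, i.e. -8, by 2^3: mpn_rshift
   gives 0x1FFFFFFFFFFFFFFF, and or-ing the three sign bits back into the
   top of the limb restores 0xFFFFFFFFFFFFFFFF, i.e. -1, as required. */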

#if HAVE_NATIVE_mpn_rshift1
#define TC4_RSHIFT1(rxx, nxx) \
   do { \
      mp_limb_t sign = (LIMB_HIGHBIT_TO_MASK(rxx[nxx-1]) << (GMP_LIMB_BITS - 1)); \
      mpn_half(rxx, nxx); \
      rxx[nxx-1] |= sign; \
   } while (0)
#else
#define TC4_RSHIFT1(rxx, nxx) \
   do { \
      mp_limb_t sign = (LIMB_HIGHBIT_TO_MASK(rxx[nxx-1]) << (GMP_LIMB_BITS - 1)); \
      mpn_rshift(rxx, rxx, nxx, 1); \
      rxx[nxx-1] |= sign; \
   } while (0)
#endif

#define r1 (tp)
#define r2 (tp + t4)
#define r4 (tp + 2*t4)
#define r6 (tp + 3*t4)

#define r3 (rp + 4*sn)
#define r5 (rp + 2*sn)
#define r7 (rp)

/* Multiply {up, n} by {vp, n} and write the result to {rp, 2n}.

   Note that rp gets 2n limbs stored, even if the actual result
   only needs 2n - 1.
*/

#define mpn_clearit(rxx, nxx) \
   do { \
      mp_size_t ind = 0; \
      for ( ; ind < nxx; ind++) \
         (rxx)[ind] = CNST_LIMB(0); \
   } while (0)

void
mpn_toom4_mul_n (mp_ptr rp, mp_srcptr up,
		 mp_srcptr vp, mp_size_t n)
{
   mp_size_t ind;
   mp_limb_t cy, cy2, r30, r31;
   mp_ptr tp;
   mp_size_t sn, n1, n2, n3, n4, n5, n6, n7, n8, rpn, t4, h1;
   TMP_DECL;

   sn = (n + 3) / 4;

   h1 = n - 3*sn;

#define a0 (up)
#define a1 (up + sn)
#define a2 (up + 2*sn)
#define a3 (up + 3*sn)
#define b0 (vp)
#define b1 (vp + sn)
#define b2 (vp + 2*sn)
#define b3 (vp + 3*sn)

   t4 = 2*sn + 2; /* allows multiplication of two integers of sn + 1 limbs */

   TMP_MARK;

   tp = TMP_ALLOC_LIMBS(4*t4 + 5*(sn + 1));

#define u2 (tp + 4*t4)
#define u3 (tp + 4*t4 + (sn+1))
#define u4 (tp + 4*t4 + 2*(sn+1))
#define u5 (tp + 4*t4 + 3*(sn+1))
#define u6 (tp + 4*t4 + 4*(sn+1))

   u6[sn] = mpn_add(u6, a1, sn, a3, h1);
   u5[sn] = mpn_add_n(u5, a2, a0, sn);
   mpn_add_n(u3, u5, u6, sn + 1);
   n4 = sn + 1;
   if (mpn_cmp(u5, u6, sn + 1) >= 0)
      mpn_sub_n(u4, u5, u6, sn + 1);
   else
   {
      mpn_sub_n(u4, u6, u5, sn + 1);
      n4 = -n4;
   }

   u6[sn] = mpn_add(u6, b1, sn, b3, h1);
   u5[sn] = mpn_add_n(u5, b2, b0, sn);
   mpn_add_n(r2, u5, u6, sn + 1);
   n5 = sn + 1;
   if (mpn_cmp(u5, u6, sn + 1) >= 0)
      mpn_sub_n(u5, u5, u6, sn + 1);
   else
   {
      mpn_sub_n(u5, u6, u5, sn + 1);
      n5 = -n5;
   }

   MUL_TC4_UNSIGNED(r3, n3, u3, sn + 1, r2, sn + 1); /* 1 */
   MUL_TC4(r4, n4, u4, n4, u5, n5); /* -1 */

#if HAVE_NATIVE_mpn_addlsh_n
   r1[sn] = mpn_addlsh_n(r1, a2, a0, sn, 2);
   mpn_lshift(r1, r1, sn + 1, 1);
   cy = mpn_addlsh_n(r2, a3, a1, h1, 2);
#else
   r1[sn] = mpn_lshift(r1, a2, sn, 1);
   MPN_COPY(r2, a3, h1);
   r1[sn] += mpn_addmul_1(r1, a0, sn, 8);
   cy = mpn_addmul_1(r2, a1, h1, 4);
#endif
   if (sn > h1)
   {
      cy2 = mpn_lshift(r2 + h1, a1 + h1, sn - h1, 2);
      cy = cy2 + mpn_add_1(r2 + h1, r2 + h1, sn - h1, cy);
   }
   r2[sn] = cy;
   mpn_add_n(u5, r1, r2, sn + 1);
   n6 = sn + 1;
   if (mpn_cmp(r1, r2, sn + 1) >= 0)
      mpn_sub_n(u6, r1, r2, sn + 1);
   else
   {
      mpn_sub_n(u6, r2, r1, sn + 1);
      n6 = -n6;
   }

#if HAVE_NATIVE_mpn_addlsh_n
   r1[sn] = mpn_addlsh_n(r1, b2, b0, sn, 2);
   mpn_lshift(r1, r1, sn + 1, 1);
   cy = mpn_addlsh_n(r2, b3, b1, h1, 2);
#else
   r1[sn] = mpn_lshift(r1, b2, sn, 1);
   MPN_COPY(r2, b3, h1);
   r1[sn] += mpn_addmul_1(r1, b0, sn, 8);
   cy = mpn_addmul_1(r2, b1, h1, 4);
#endif
   if (sn > h1)
   {
      cy2 = mpn_lshift(r2 + h1, b1 + h1, sn - h1, 2);
      cy = cy2 + mpn_add_1(r2 + h1, r2 + h1, sn - h1, cy);
   }
   r2[sn] = cy;
   mpn_add_n(u2, r1, r2, sn + 1);
   n8 = sn + 1;
   if (mpn_cmp(r1, r2, sn + 1) >= 0)
      mpn_sub_n(r2, r1, r2, sn + 1);
   else
   {
      mpn_sub_n(r2, r2, r1, sn + 1);
      n8 = -n8;
   }

   r30 = r3[0];
   r31 = r3[1];
   MUL_TC4_UNSIGNED(r5, n5, u5, sn + 1, u2, sn + 1); /* 1/2 */
   MUL_TC4(r6, n6, u6, n6, r2, n8); /* -1/2 */
   r3[1] = r31;

#if HAVE_NATIVE_mpn_addlsh1_n
   cy = mpn_addlsh1_n(u2, a2, a3, h1);
   if (sn > h1)
      cy = mpn_add_1(u2 + h1, a2 + h1, sn - h1, cy);
   u2[sn] = cy;
   u2[sn] = 2*u2[sn] + mpn_addlsh1_n(u2, a1, u2, sn);
   u2[sn] = 2*u2[sn] + mpn_addlsh1_n(u2, a0, u2, sn);
#else
   MPN_COPY(u2, a0, sn);
   u2[sn] = mpn_addmul_1(u2, a1, sn, 2);
   u2[sn] += mpn_addmul_1(u2, a2, sn, 4);
   cy = mpn_addmul_1(u2, a3, h1, 8);
   if (sn > h1) cy = mpn_add_1(u2 + h1, u2 + h1, sn - h1, cy);
   u2[sn] += cy;
#endif

#if HAVE_NATIVE_mpn_addlsh1_n
   cy = mpn_addlsh1_n(r1, b2, b3, h1);
   if (sn > h1)
      cy = mpn_add_1(r1 + h1, b2 + h1, sn - h1, cy);
   r1[sn] = cy;
   r1[sn] = 2*r1[sn] + mpn_addlsh1_n(r1, b1, r1, sn);
   r1[sn] = 2*r1[sn] + mpn_addlsh1_n(r1, b0, r1, sn);
#else
   MPN_COPY(r1, b0, sn);
   r1[sn] = mpn_addmul_1(r1, b1, sn, 2);
   r1[sn] += mpn_addmul_1(r1, b2, sn, 4);
   cy = mpn_addmul_1(r1, b3, h1, 8);
   if (sn > h1) cy = mpn_add_1(r1 + h1, r1 + h1, sn - h1, cy);
   r1[sn] += cy;
#endif

   MUL_TC4_UNSIGNED(r2, n2, u2, sn + 1, r1, sn + 1); /* 2 */

   MUL_TC4_UNSIGNED(r1, n1, a3, h1, b3, h1); /* oo */
   MUL_TC4_UNSIGNED(r7, n7, a0, sn, b0, sn); /* 0 */

   TC4_DENORM(r1, n1, t4 - 1);

/*
   rp         rp1        rp2        rp3        rp4        rp5        rp6        rp7
              <----------- r7 -----------><------------ r5 ------------->
   <------------- r3 ------------->

              <------------- r6 -------------> <------------ r2 ------------>{ }
   <------------- r4 --------------> <-------------- r1 ---->
*/

   mpn_toom4_interpolate(rp, &rpn, sn, tp, t4 - 1, n4, n6, r30);

   if (rpn != 2*n)
   {
      MPN_ZERO((rp + rpn), 2*n - rpn);
   }

   TMP_FREE;
}
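
/* A minimal usage sketch, not part of the library: it assumes an MPIR
   build tree (mpn_random and the mpn_* calls come from there); the
   function name and the operand size 512 are illustrative only. */
#if 0
static void toom4_mul_example(void)
{
   mp_limb_t up[512], vp[512], rp[1024];
   mp_size_t n = 512;

   mpn_random(up, n);                /* random n-limb operands */
   mpn_random(vp, n);
   mpn_toom4_mul_n(rp, up, vp, n);   /* {rp, 2n} = {up, n} * {vp, n} */
}
#endif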

/* Square {up, n} and write the result to {rp, 2n}.

   Note that rp gets 2n limbs stored, even if the actual result
   only needs 2n - 1.
*/

void
mpn_toom4_sqr_n (mp_ptr rp, mp_srcptr up, mp_size_t n)
{
   mp_size_t len1, ind;
   mp_limb_t cy, r30, r31;
   mp_ptr tp;
   mp_size_t a0n, a1n, a2n, a3n, sn, n1, n2, n3, n4, n5, n6, n7, n8, n9, rpn, t4;

   len1 = n;
   ASSERT (n >= 1);

   MPN_NORMALIZE(up, len1);

   sn = (n - 1) / 4 + 1;

   /* a0 - a3 are defined in mpn_toom4_mul_n above */

   TC4_NORM(a0, a0n, sn);
   TC4_NORM(a1, a1n, sn);
   TC4_NORM(a2, a2n, sn);
   TC4_NORM(a3, a3n, n - 3*sn);

   t4 = 2*sn + 2; /* allows multiplication of two integers of sn + 1 limbs */

   tp = __GMP_ALLOCATE_FUNC_LIMBS(4*t4 + 4*(sn + 1));

   tc4_add_unsigned(u5, &n5, a3, a3n, a1, a1n);
   tc4_add_unsigned(u4, &n4, a2, a2n, a0, a0n);
   tc4_add_unsigned(u2, &n2, u4, n4, u5, n5);
   tc4_sub(u3, &n3, u4, n4, u5, n5);

   SQR_TC4(r4, n4, u3, n3);
   SQR_TC4_UNSIGNED(r3, n3, u2, n2);

   tc4_lshift(r1, &n1, a0, a0n, 3);
   tc4_addlsh1_unsigned(r1, &n1, a2, a2n);
   tc4_lshift(r2, &n8, a1, a1n, 2);
   tc4_add(r2, &n8, r2, n8, a3, a3n);
   tc4_add(u4, &n9, r1, n1, r2, n8);
   tc4_sub(u5, &n5, r1, n1, r2, n8);

   r30 = r3[0];
   if (!n3) r30 = CNST_LIMB(0);
   r31 = r3[1];
   SQR_TC4(r6, n6, u5, n5);
   SQR_TC4_UNSIGNED(r5, n5, u4, n9);
   r3[1] = r31;

   tc4_lshift(u2, &n8, a3, a3n, 3);
   tc4_addmul_1(u2, &n8, a2, a2n, 4);
   tc4_addlsh1_unsigned(u2, &n8, a1, a1n);
   tc4_add(u2, &n8, u2, n8, a0, a0n);

   SQR_TC4_UNSIGNED(r2, n2, u2, n8);
   SQR_TC4_UNSIGNED(r1, n1, a3, a3n);
   SQR_TC4_UNSIGNED(r7, n7, a0, a0n);

   TC4_DENORM(r1, n1, t4 - 1);
   TC4_DENORM(r2, n2, t4 - 1);
   if (n3)
      TC4_DENORM(r3, n3, t4 - 1);
   else {
      /* MPN_ZERO defeats gcc 4.1.2 here, hence the explicit for loop */
      for (ind = 1; ind < t4 - 1; ind++)
         (r3)[ind] = CNST_LIMB(0);
   }
   TC4_DENORM(r4, n4, t4 - 1);
   TC4_DENORM(r5, n5, t4 - 1);
   TC4_DENORM(r6, n6, t4 - 1);
   TC4_DENORM(r7, n7, t4 - 2); /* we treat r7 differently (it cannot exceed t4 - 2 in length) */

/*
   rp         rp1        rp2        rp3        rp4        rp5        rp6        rp7
              <----------- r7 -----------><------------ r5 ------------->
   <------------- r3 ------------->

              <------------- r6 -------------> <------------ r2 ------------>{ }
   <------------- r4 --------------> <-------------- r1 ---->
*/

   mpn_toom4_interpolate(rp, &rpn, sn, tp, t4 - 1, n4, n6, r30);

   if (rpn != 2*n)
   {
      MPN_ZERO((rp + rpn), 2*n - rpn);
   }

   __GMP_FREE_FUNC_LIMBS (tp, 4*t4 + 4*(sn+1));
}
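
/* A hedged sanity-check sketch, again not part of the library: squaring
   with mpn_toom4_sqr_n must agree with multiplying a number by itself
   with mpn_toom4_mul_n.  The helper name and size bound are illustrative
   only. */
#if 0
static void toom4_sqr_check(mp_srcptr xp, mp_size_t n)
{
   mp_limb_t sq[2*512], prod[2*512];  /* assumes n <= 512 here */

   ASSERT(n <= 512);
   mpn_toom4_sqr_n(sq, xp, n);
   mpn_toom4_mul_n(prod, xp, xp, n);
   ASSERT(mpn_cmp(sq, prod, 2*n) == 0);
}
#endif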

/*
   Toom 4 interpolation.  Interpolates the value at 2^(sn*B) of a
   polynomial p(x) with 7 coefficients given the values
   p(oo), p(2), p(1), p(-1), 2^6*p(1/2), 2^6*p(-1/2), p(0).
   The output is placed in rp and the final number of limbs of the
   output is given in rpn.
   The 4th and 6th values may be negative, and if so, n4 and n6
   respectively should be set to a negative value.
   To save space we pass r3, r5, r7 in place in the output rp.
   The other r's are stored separately in the space tp.
   The low limb of r3 is stored in r30, as it will be overwritten
   by the high limb of r5.

   rp         rp1        rp2        rp3        rp4        rp5        rp6        rp7
              <----------- r7 -----------><------------ r5 ------------->
   <------------- r3 ------------->

   We assume that r1 is stored at tp, r2 at (tp + t4), r4 at (tp + 2*t4)
   and r6 at (tp + 3*t4).  Each of these r's has t4 = s4 + 1 limbs allocated.
*/
void mpn_toom4_interpolate(mp_ptr rp, mp_size_t * rpn, mp_size_t sn,
                           mp_ptr tp, mp_size_t s4, mp_size_t n4, mp_size_t n6, mp_limb_t r30)
{
   mp_size_t n1, n2, n3, n5, n7, t4;
   mp_limb_t saved, saved2, cy;

   t4 = s4 + 1;

   mpn_add_n(r2, r2, r5, s4);

   if (n6 < 0)
      mpn_add_n(r6, r5, r6, s4);
   else
      mpn_sub_n(r6, r5, r6, s4);
   /* r6 is now in twos complement format */

   saved = r3[0];
   r3[0] = r30;
   if (n4 < 0)
      mpn_add_n(r4, r3, r4, s4);
   else
      mpn_sub_n(r4, r3, r4, s4);
   r3[0] = saved;
   /* r4 is now in twos complement format */

   mpn_sub_n(r5, r5, r1, s4);

#if HAVE_NATIVE_mpn_sublsh_n
   r5[s4-1] -= mpn_sublsh_n(r5, r5, r7, s4-1, 6);
#else
   r5[s4-1] -= mpn_submul_1(r5, r7, s4-1, 64);
#endif

   TC4_RSHIFT1(r4, s4);

   saved = r3[0];
   r3[0] = r30;
   mpn_sub_n(r3, r3, r4, s4);
   r30 = r3[0];
   r3[0] = saved;

   mpn_double(r5, s4);

   mpn_sub_n(r5, r5, r6, s4);

   saved = r3[0];
   r3[0] = r30;
   mpn_submul_1(r2, r3, s4, 65);
   r3[0] = saved;

   saved2 = r7[s4-1];
   r7[s4-1] = CNST_LIMB(0); /* r7 is always positive so no sign extend needed */
   saved = r3[0];
   r3[0] = r30;
#if HAVE_NATIVE_mpn_subadd_n
   mpn_subadd_n(r3, r3, r7, r1, s4);
#else
   mpn_sub_n(r3, r3, r7, s4);
   mpn_sub_n(r3, r3, r1, s4);
#endif
   r7[s4-1] = saved2;
   r30 = r3[0];

   mpn_addmul_1(r2, r3, s4, 45);

#if HAVE_NATIVE_mpn_sublsh_n
   cy = mpn_sublsh_n(r5, r5, r3, s4 - 1, 3);
#else
   cy = mpn_submul_1(r5, r3, s4 - 1, 8);
#endif
   r3[0] = saved;
   r3[0] -= (cy + 8*r3[s4-1]);

   mpn_rshift(r5, r5, s4, 3);

   mpn_divexact_by3(r5, r5, s4);

   mpn_sub_n(r6, r6, r2, s4);

#if HAVE_NATIVE_mpn_sublsh_n
   mpn_sublsh_n(r2, r2, r4, s4, 4);
#else
   mpn_submul_1(r2, r4, s4, 16);
#endif

   mpn_rshift(r2, r2, s4, 1);

   mpn_divexact_by3(r2, r2, s4);

   mpn_divexact_by3(r2, r2, s4);

   saved = r3[0];
   r3[0] = r30;
   cy = mpn_sub_n(r3, r3, r5, s4 - 1);
   r30 = r3[0];
   r3[0] = saved;
   r3[s4-1] -= (cy + r5[s4-1]);

   mpn_sub_n(r4, r4, r2, s4);

   mpn_addmul_1(r6, r2, s4, 30);

   mpn_divexact_byfobm1(r6, r6, s4, CNST_LIMB(15), CNST_LIMB(~0/15));

   mpn_rshift(r6, r6, s4, 2);

   mpn_sub_n(r2, r2, r6, s4);

   TC4_NORM(r1, n1, s4);
   TC4_NORM(r2, n2, s4);

   (*rpn) = 6*sn + 1;
   cy = mpn_add_1(r3, r3, *rpn - 4*sn, r30); /* don't forget to add r3[0] back in */
   if (cy)
   {
      rp[*rpn] = cy;
      (*rpn)++;
   }

   tc4_copy(rp, rpn, 5*sn, r2, n2);
   tc4_copy(rp, rpn, 6*sn, r1, n1);

   tc4_copy(rp, rpn, sn, r6, s4);
   tc4_copy(rp, rpn, 3*sn, r4, s4);
}