1 /* Polynomial multiplication using GMP's integer multiplication code
2
3 Copyright 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012 Dave Newman,
4 Paul Zimmermann, Alexander Kruppa.
5
6 This file is part of the ECM Library.
7
8 The ECM Library is free software; you can redistribute it and/or modify
9 it under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
12
13 The ECM Library is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with the ECM Library; see the file COPYING.LIB. If not, see
20 http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
21 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
22
23 #include <stdlib.h>
24 #include "ecm-gmp.h" /* for MPZ_REALLOC and MPN_COPY */
25 #include "ecm-impl.h"
26
27 #if defined(HAVE___GMPN_MULMOD_BNM1) && defined(HAVE___GMPN_MULMOD_BNM1_NEXT_SIZE)
28 #define FFT_WRAP /* use the wrap-around trick */
29 #endif
30
31 /* Copy at r+i*s the content of A[i*stride] for 0 <= i < l
32 Assume all A[i*stride] are non-negative, and their size is <= s.
33 */
34 static void
pack(mp_ptr r,mpz_t * A,mp_size_t l,mp_size_t stride,mp_size_t s)35 pack (mp_ptr r, mpz_t *A, mp_size_t l, mp_size_t stride, mp_size_t s)
36 {
37 mp_size_t i, j, m;
38
39 for (i = 0, j = 0; i < l; i++, j += stride, r += s)
40 {
41 m = SIZ(A[j]);
42 ASSERT((0 <= m) && (m <= s));
43 if (m)
44 MPN_COPY (r, PTR(A[j]), m);
45 if (m < s)
46 MPN_ZERO (r + m, s - m);
47 }
48 }
49
50 /* put in R[i*stride] for 0 <= i < l the content of {t+i*s, s} */
51 void
unpack(mpz_t * R,mp_size_t stride,mp_ptr t,mp_size_t l,mp_size_t s)52 unpack (mpz_t *R, mp_size_t stride, mp_ptr t, mp_size_t l, mp_size_t s)
53 {
54 mp_size_t i, j, size_tmp;
55 mp_ptr r_ptr;
56
57 for (i = 0, j = 0; i < l; i++, t += s, j += stride)
58 {
59 size_tmp = s;
60 MPN_NORMALIZE(t, size_tmp); /* compute the actual size */
61 r_ptr = MPZ_REALLOC (R[j], size_tmp);
62 if (size_tmp)
63 MPN_COPY (r_ptr, t, size_tmp);
64 SIZ(R[j]) = size_tmp;
65 }
66 }
67
68 /* R <- A * B where A = A[0] + A[1]*x + ... + A[n-1]*x^(n-1), idem for B */
69 void
list_mul_n_basecase(listz_t R,listz_t A,listz_t B,unsigned int n)70 list_mul_n_basecase (listz_t R, listz_t A, listz_t B, unsigned int n)
71 {
72 unsigned int i, j;
73
74 if (n == 1)
75 {
76 mpz_mul (R[0], A[0], B[0]);
77 return;
78 }
79
80 for (i = 0; i < n; i++)
81 for (j = 0; j < n; j++)
82 {
83 if (i == 0 || j == n - 1)
84 mpz_mul (R[i+j], A[i], B[j]);
85 else
86 mpz_addmul (R[i+j], A[i], B[j]);
87 }
88 }
89
90 static void
list_mul_n_kara2(listz_t R,listz_t A,listz_t B)91 list_mul_n_kara2 (listz_t R, listz_t A, listz_t B)
92 {
93 mpz_add (R[0], A[0], A[1]);
94 mpz_add (R[2], B[0], B[1]);
95 mpz_mul (R[1], R[0], R[2]);
96 mpz_mul (R[0], A[0], B[0]);
97 mpz_mul (R[2], A[1], B[1]);
98 mpz_sub (R[1], R[1], R[0]);
99 mpz_sub (R[1], R[1], R[2]);
100 }
101
102 /* R[0..4] <- A[0..2] * B[0..2] in 7 multiplies */
103 static void
list_mul_n_kara3(listz_t R,listz_t A,listz_t B,listz_t T)104 list_mul_n_kara3 (listz_t R, listz_t A, listz_t B, listz_t T)
105 {
106 mpz_add (T[0], A[0], A[2]);
107 mpz_add (R[0], B[0], B[2]);
108 mpz_mul (R[2], T[0], R[0]); /* (A0+A2)*(B0+B2) */
109 mpz_mul (R[3], T[0], B[1]); /* (A0+A2)*B1 */
110 mpz_mul (R[4], A[1], R[0]); /* A1*(B0+B2) */
111 mpz_add (R[3], R[3], R[4]); /* (A0+A2)*B1+A1*(B0+B2) */
112 list_mul_n_kara2 (T, A, B);
113 mpz_sub (R[2], R[2], T[0]); /* A0*A2+A2*B0+A2*B2 */
114 mpz_sub (R[3], R[3], T[1]); /* A2*B1+A1*B2 */
115 mpz_add (R[2], R[2], T[2]); /* A0*A2+A2*B0+A2*B2+A1*B1 */
116 mpz_swap (R[0], T[0]); /* A0*B0 */
117 mpz_swap (R[1], T[1]); /* A0*B1+A1*B0 */
118 mpz_mul (R[4], A[2], B[2]); /* A2*B2 */
119 mpz_sub (R[2], R[2], R[4]); /* A0*A2+A2*B0+A1*B1 */
120 }
121
122 /* Assume n >= 2. T is a scratch space of enough entries. */
123 static void
list_mul_n_karatsuba_aux(listz_t R,listz_t A,listz_t B,unsigned int n,listz_t T)124 list_mul_n_karatsuba_aux (listz_t R, listz_t A, listz_t B, unsigned int n,
125 listz_t T)
126 {
127 unsigned int h, l;
128
129 if (n == 1)
130 {
131 list_mul_n_basecase (R, A, B, n);
132 return;
133 }
134
135 if (n == 2)
136 {
137 list_mul_n_kara2 (R, A, B);
138 return;
139 }
140
141 if (n == 3)
142 {
143 list_mul_n_kara3 (R, A, B, T);
144 return;
145 }
146
147 h = n / 2;
148 l = n - h;
149 list_add (R, A, A + l, h);
150 list_add (R + l, B, B + l, h);
151 if (h < l)
152 {
153 mpz_set (R[h], A[h]);
154 mpz_set (R[l + h], B[h]);
155 }
156 list_mul_n_karatsuba_aux (T, R, R + l, l, T + 2 * l - 1);
157 list_mul_n_karatsuba_aux (R, A, B, l, T + 2 * l - 1);
158 /* {R,2l-1} = Al * Bl */
159 list_mul_n_karatsuba_aux (R + 2 * l, A + l, B + l, h, T + 2 * l - 1);
160 /* {R+2l,2h-1} = Ah * Bh */
161 /* T will contain Al*Bh+Ah*Bl, it thus suffices to compute its low n-1
162 coefficients */
163 list_sub (T, T, R, n - 1);
164 list_sub (T, T, R + 2 * l, 2 * h - 1);
165 mpz_set_ui (R[2 * l - 1], 0);
166 list_add (R + l, R + l, T, n - 1);
167 }
168
169 static unsigned int
list_mul_n_mem(unsigned int n)170 list_mul_n_mem (unsigned int n)
171 {
172 if (n == 1)
173 return 0;
174 else
175 {
176 unsigned int k = (n + 1) / 2;
177 return 2 * k - 1 + list_mul_n_mem (k);
178 }
179 }
180
181 void
list_mul_n_karatsuba(listz_t R,listz_t A,listz_t B,unsigned int n)182 list_mul_n_karatsuba (listz_t R, listz_t A, listz_t B, unsigned int n)
183 {
184 listz_t T;
185 unsigned int s;
186
187 s = list_mul_n_mem (n);
188 T = init_list (s);
189 list_mul_n_karatsuba_aux (R, A, B, n, T);
190 clear_list (T, s);
191 }
192
193 /* Classical one-point Kronecker-Schoenhage substitution.
194 Notes:
195 - this code aligns the coeffs at limb boundaries - if instead we aligned
196 at byte boundaries then we could save up to 3*n bytes,
197 but tests have shown this doesn't give any significant speed increase,
198 even for large degree polynomials.
199 - this code requires that all coefficients A[] and B[] are nonnegative. */
200 void
list_mul_n_KS1(listz_t R,listz_t A,listz_t B,unsigned int l)201 list_mul_n_KS1 (listz_t R, listz_t A, listz_t B, unsigned int l)
202 {
203 unsigned long i;
204 mp_size_t s, t = 0, size_t0;
205 mp_ptr t0_ptr, t1_ptr, t2_ptr;
206
207 /* compute the largest bit-size t of the A[i] and B[i] */
208 for (i = 0; i < l; i++)
209 {
210 if ((s = mpz_sizeinbase (A[i], 2)) > t)
211 t = s;
212 if ((s = mpz_sizeinbase (B[i], 2)) > t)
213 t = s;
214 }
215 /* For n > 0, s = sizeinbase (n, 2) ==> n < 2^s.
216 For n = 0, s = sizeinbase (n, 2) = 1 ==> n < 2^s.
217 Hence all A[i], B[i] < 2^t */
218
219 /* Each coeff of A(x)*B(x) < l * 2^(2*t), so max number of bits in a
220 coeff of the product will be 2 * t + ceil(log_2(l)) */
221 s = 2 * t;
222 for (i = l; i > 1; s++, i = (i + 1) >> 1);
223
224 /* work out the corresponding number of limbs */
225 s = 1 + (s - 1) / GMP_NUMB_BITS;
226
227 size_t0 = s * l;
228
229 /* allocate a single buffer to save malloc/MPN_ZERO/free calls */
230 t0_ptr = (mp_ptr) malloc (4 * size_t0 * sizeof (mp_limb_t));
231 if (t0_ptr == NULL)
232 {
233 outputf (OUTPUT_ERROR, "Out of memory in list_mult_n()\n");
234 exit (1);
235 }
236 t1_ptr = t0_ptr + size_t0;
237 t2_ptr = t1_ptr + size_t0;
238
239 pack (t0_ptr, A, l, 1, s);
240 pack (t1_ptr, B, l, 1, s);
241
242 mpn_mul_n (t2_ptr, t0_ptr, t1_ptr, size_t0);
243
244 unpack (R, 1, t2_ptr, 2 * l - 1, s);
245
246 free (t0_ptr);
247 }
248
249 /* Two-point Kronecker substitition.
250 Reference: Algorithm 2 from "Faster polynomial multiplication via multipoint
251 Kronecker substitution", David Harvey, Journal of Symbolic Computation,
252 number 44 (2009), pages 1502-1510.
253 Assume n >= 2.
254 Notes:
255 - this code aligns the coeffs at limb boundaries - if instead we aligned
256 at byte boundaries then we could save up to 3*n bytes,
257 but tests have shown this doesn't give any significant speed increase,
258 even for large degree polynomials.
259 - this code requires that all coefficients A[] and B[] are nonnegative.
260 */
261 void
list_mul_n_KS2(listz_t R,listz_t A,listz_t B,unsigned int n)262 list_mul_n_KS2 (listz_t R, listz_t A, listz_t B, unsigned int n)
263 {
264 unsigned long i;
265 mp_size_t s, s2, t = 0, l, h, ns2;
266 mp_ptr tmp, A0, A1, B0, B1, C0, C1;
267 int sA, sB;
268
269 ASSERT_ALWAYS (n >= 2);
270
271 /* compute the largest bit-size t of the A[i] and B[i] */
272 for (i = 0; i < n; i++)
273 {
274 if ((s = mpz_sizeinbase (A[i], 2)) > t)
275 t = s;
276 if ((s = mpz_sizeinbase (B[i], 2)) > t)
277 t = s;
278 }
279 /* For n > 0, s = sizeinbase (n, 2) ==> n < 2^s.
280 For n = 0, s = sizeinbase (n, 2) = 1 ==> n < 2^s.
281 Hence all A[i], B[i] < 2^t */
282
283 /* Each coeff of A(x)*B(x) < n * 2^(2*t), so max number of bits in a
284 coeff of the product will be 2 * t + ceil(log_2(n)) */
285 s = 2 * t;
286 for (i = n; i > 1; s++, i = (i + 1) >> 1);
287
288 /* work out the corresponding number of limbs */
289 s = 1 + (s - 1) / GMP_NUMB_BITS;
290
291 /* ensure s is even */
292 s = s + (s & 1);
293 s2 = s >> 1;
294 ns2 = n * s2;
295
296 l = n / 2;
297 h = n - l;
298
299 /* allocate a single buffer to save malloc/MPN_ZERO/free calls */
300 tmp = (mp_ptr) malloc (8 * ns2 * sizeof (mp_limb_t));
301 if (tmp == NULL)
302 {
303 outputf (OUTPUT_ERROR, "Out of memory in list_mult_n()\n");
304 exit (1);
305 }
306
307 A0 = tmp;
308 A1 = A0 + ns2;
309 B0 = A1 + ns2;
310 B1 = B0 + ns2;
311 C0 = B1 + ns2;
312 C1 = C0 + 2 * ns2;
313
314 pack (A0, A, h, 2, s); /* A0 = Aeven(S) where S = 2^(s*GMP_NUMB_BITS) */
315 /* A0 has in fact only n * s2 significant limbs:
316 if n=2h, h*s = n*s2
317 if n=2h-1, the last chunk from A0 has at most s2 limbs */
318 MPN_ZERO(B0, s2);
319 pack (B0 + s2, A + 1, l, 2, s);
320 /* for the same reason as above, we have at most l*s-s2 significant limbs
321 at B0+s2, thus at most l*s <= n*s2 at B0 */
322 if ((sA = mpn_cmp (A0, B0, ns2)) >= 0)
323 mpn_sub_n (A1, A0, B0, ns2);
324 else
325 mpn_sub_n (A1, B0, A0, ns2);
326 mpn_add_n (A0, A0, B0, ns2);
327 /* now A0 is X+ with the notations of Algorithm, A1 is sA*X- */
328
329 pack (B0, B, h, 2, s);
330 MPN_ZERO(C0, s2);
331 pack (C0 + s2, B + 1, l, 2, s);
332 if ((sB = mpn_cmp (B0, C0, ns2)) >= 0)
333 mpn_sub_n (B1, B0, C0, ns2);
334 else
335 mpn_sub_n (B1, C0, B0, ns2);
336 mpn_add_n (B0, B0, C0, ns2);
337 /* B0 is Y+, B1 is sB*Y- with the notations of Algorithm 2 */
338
339 mpn_mul_n (C0, A0, B0, ns2); /* C0 is Z+ = X+ * Y+ */
340 mpn_mul_n (C1, A1, B1, ns2); /* C1 is sA * sB * Z- */
341
342 if (sA * sB >= 0)
343 {
344 mpn_add_n (A0, C0, C1, 2 * ns2);
345 mpn_sub_n (B0, C0, C1, 2 * ns2);
346 }
347 else
348 {
349 mpn_sub_n (A0, C0, C1, 2 * ns2);
350 mpn_add_n (B0, C0, C1, 2 * ns2);
351 }
352 mpn_rshift (A0, A0, 4 * ns2, 1); /* global division by 2 */
353
354 /* If A[] and B[] have n coefficients, the product has 2n-1 coefficients.
355 The even part has n coefficients and the odd part n-1 coefficients */
356 unpack (R, 2, A0, n, s);
357 unpack (R + 1, 2, B0 + s2, n - 1, s);
358
359 free (tmp);
360 }
361
362 /* Puts in R[0..2n-2] the product of A[0..n-1] and B[0..n-1], seen as
363 polynomials.
364 */
365 void
list_mult_n(listz_t R,listz_t A,listz_t B,unsigned int n)366 list_mult_n (listz_t R, listz_t A, listz_t B, unsigned int n)
367 {
368 int T[TUNE_LIST_MUL_N_MAX_SIZE] = LIST_MUL_TABLE, best;
369
370 /* See tune_list_mul_n() in tune.c:
371 0 : list_mul_n_basecase
372 2 : list_mul_n_KS1
373 3 : list_mul_n_KS2 */
374 best = (n < TUNE_LIST_MUL_N_MAX_SIZE) ? T[n] : 3;
375
376 if (best == 0)
377 list_mul_n_basecase (R, A, B, n);
378 else if (best == 1)
379 list_mul_n_karatsuba (R, A, B, n);
380 else if (best == 2)
381 list_mul_n_KS1 (R, A, B, n);
382 else
383 list_mul_n_KS2 (R, A, B, n);
384 }
385
386 /* Given a[0..m] and c[0..l], puts in b[0..n] the coefficients
387 of degree m to n+m of rev(a)*c, i.e.
388 b[0] = a[0]*c[0] + ... + a[i]*c[i] with i = min(m, l)
389 ...
390 b[k] = a[0]*c[k] + ... + a[i]*c[i+k] with i = min(m, l-k)
391 ...
392 b[n] = a[0]*c[n] + ... + a[i]*c[i+n] with i = min(m, l-n) [=l-n].
393
394 If rev=0, consider a instead of rev(a).
395
396 Assumes n <= l.
397
398 Return non-zero if an error occurred.
399
400 low(b) is the coefficients of degree 0 to m-1 of a*c (or rev(a)*c)
401 mid(b) is the coefficients of degree m to m+n of a*c
402 high(b) is the coefficients of degree m+n+1 to m+l+1 of a*c
403 */
404
405 int
TMulKS(listz_t b,unsigned int n,listz_t a,unsigned int m,listz_t c,unsigned int l,mpz_t modulus,int rev)406 TMulKS (listz_t b, unsigned int n, listz_t a, unsigned int m,
407 listz_t c, unsigned int l, mpz_t modulus, int rev)
408 {
409 unsigned long i, s = 0, t;
410 mp_ptr ap, bp, cp;
411 mp_size_t an, bn, cn;
412 int ret = 0; /* default return value */
413 #ifdef DEBUG
414 long st = cputime ();
415 fprintf (ECM_STDOUT, "n=%u m=%u l=%u bits=%u n*bits=%u: ", n, m, l,
416 mpz_sizeinbase (modulus, 2), n * mpz_sizeinbase (modulus, 2));
417 #endif
418
419 ASSERT (n <= l); /* otherwise the upper coefficients of b are 0 */
420 if (l > n + m)
421 l = n + m; /* otherwise, c has too many coeffs */
422
423 /* make coefficients a[] and c[] non-negative and compute max #bits */
424 for (i = 0; i <= m; i++)
425 {
426 if (mpz_sgn (a[i]) < 0)
427 mpz_mod (a[i], a[i], modulus);
428 if ((t = mpz_sizeinbase (a[i], 2)) > s)
429 s = t;
430 }
431 for (i = 0; i <= l; i++)
432 {
433 if (mpz_sgn (c[i]) < 0)
434 mpz_mod (c[i], c[i], modulus);
435 if ((t = mpz_sizeinbase (c[i], 2)) > s)
436 s = t;
437 }
438
439 #ifdef FFT_WRAP
440 s ++; /* need one extra bit to prevent carry of low(b) + high(b) */
441 #endif
442
443 /* max coeff has 2*s+ceil(log2(min(m+1,l+1))) bits,
444 i.e. 2*s + 1 + floor(log2(min(m,l))) */
445 for (s = 2 * s, i = (m < l) ? m : l; i; s++, i >>= 1);
446
447 /* corresponding number of limbs */
448 s = 1 + (s - 1) / GMP_NUMB_BITS;
449
450 an = (m + 1) * s;
451 cn = (l + 1) * s;
452 bn = an + cn;
453
454 /* a[0..m] needs (m+1) * s limbs */
455 ap = (mp_ptr) malloc (an * sizeof (mp_limb_t));
456 if (ap == NULL)
457 {
458 ret = 1;
459 goto TMulKS_end;
460 }
461 cp = (mp_ptr) malloc (cn * sizeof (mp_limb_t));
462 if (cp == NULL)
463 {
464 ret = 1;
465 goto TMulKS_free_ap;
466 }
467
468 MPN_ZERO (ap, an);
469 MPN_ZERO (cp, cn);
470
471 /* a is reverted */
472 for (i = 0; i <= m; i++)
473 if (SIZ(a[i]))
474 MPN_COPY (ap + ((rev) ? (m - i) : i) * s, PTR(a[i]), SIZ(a[i]));
475 for (i = 0; i <= l; i++)
476 if (SIZ(c[i]))
477 MPN_COPY (cp + i * s, PTR(c[i]), SIZ(c[i]));
478
479 #ifdef FFT_WRAP
480 /* the product rev(a) * c has m+l+1 coefficients.
481 We throw away the first m and the last l-n <= m.
482 If we compute mod (m+n+1) * s limbs, we are ok */
483 bn = mpn_mulmod_bnm1_next_size ((m + n + 1) * s);
484
485 bp = (mp_ptr) malloc (bn * sizeof (mp_limb_t));
486 if (bp == NULL)
487 {
488 ret = 1;
489 goto TMulKS_free_cp;
490 }
491 {
492 mp_ptr tp;
493 tp = (mp_ptr) malloc ((2 * bn + 4) * sizeof (mp_limb_t));
494 if (tp == NULL)
495 {
496 ret = 1;
497 goto TMulKS_free_cp;
498 }
499 /* mpn_mulmod_bnm1 requires that the first operand is larger */
500 if (an >= cn)
501 mpn_mulmod_bnm1 (bp, bn, ap, an, cp, cn, tp);
502 else
503 mpn_mulmod_bnm1 (bp, bn, cp, cn, ap, an, tp);
504 free (tp);
505 }
506 #else /* FFT_WRAP is not defined */
507 bp = (mp_ptr) malloc (bn * sizeof (mp_limb_t));
508 if (bp == NULL)
509 {
510 ret = 1;
511 goto TMulKS_free_cp;
512 }
513 if (an >= cn)
514 mpn_mul (bp, ap, an, cp, cn);
515 else
516 mpn_mul (bp, cp, cn, ap, an);
517 #endif
518
519 /* recover coefficients of degree m to n+m of product in b[0..n] */
520 bp += m * s;
521 for (i = 0; i <= n; i++)
522 {
523 t = s;
524 MPN_NORMALIZE(bp, t);
525 MPZ_REALLOC (b[i], (mp_size_t) t);
526 if (t)
527 MPN_COPY (PTR(b[i]), bp, t);
528 SIZ(b[i]) = t;
529 bp += s;
530 }
531 bp -= (m + n + 1) * s;
532
533 free (bp);
534 TMulKS_free_cp:
535 free (cp);
536 TMulKS_free_ap:
537 free (ap);
538
539 #ifdef DEBUG
540 fprintf (ECM_STDOUT, "%ldms\n", elltime (st, cputime ()));
541 #endif
542
543 TMulKS_end:
544 return ret;
545 }
546
547 unsigned int
ks_wrapmul_m(unsigned int m0,unsigned int k,mpz_t n)548 ks_wrapmul_m (unsigned int m0, unsigned int k, mpz_t n)
549 {
550 mp_size_t t, s;
551 unsigned long i, m;
552
553 #ifdef FFT_WRAP
554 t = mpz_sizeinbase (n, 2);
555 s = t * 2 + 1;
556 for (i = k - 1; i; s++, i >>= 1);
557 s = 1 + (s - 1) / GMP_NUMB_BITS;
558 i = mpn_mulmod_bnm1_next_size (m0 * s);
559 while (i % s)
560 i = mpn_mulmod_bnm1_next_size (i + 1);
561 m = i / s;
562 return m;
563 #else
564 return ~ (unsigned int) 0;
565 #endif
566 }
567
568 /* multiply in R[] A[0]+A[1]*x+...+A[k-1]*x^(k-1)
569 by B[0]+B[1]*x+...+B[l-1]*x^(l-1) modulo n,
570 wrapping around coefficients of the product up from degree m >= m0.
571 Assumes k >= l.
572 R is assumed to have 2*m0-3+list_mul_mem(m0-1) allocated cells.
573 Return m (or 0 if an error occurred).
574 */
575 unsigned int
ks_wrapmul(listz_t R,unsigned int m0,listz_t A,unsigned int k,listz_t B,unsigned int l,mpz_t n)576 ks_wrapmul (listz_t R, unsigned int m0,
577 listz_t A, unsigned int k,
578 listz_t B, unsigned int l,
579 mpz_t n)
580 {
581 #ifndef FFT_WRAP
582 ASSERT_ALWAYS(0); /* ks_wrapmul should not be called in that case */
583 return 0;
584 #else
585 unsigned long i, m, t;
586 mp_size_t s, size_t0, size_t1, size_tmp;
587 mp_ptr t0_ptr, t1_ptr, t2_ptr, r_ptr, tp;
588
589 ASSERT(k >= l);
590
591 t = mpz_sizeinbase (n, 2);
592 for (i = 0; i < k; i++)
593 if (mpz_sgn (A[i]) < 0 || mpz_sizeinbase (A[i], 2) > t)
594 mpz_mod (A[i], A[i], n);
595 for (i = 0; i < l; i++)
596 if (mpz_sgn (B[i]) < 0 || mpz_sizeinbase (B[i], 2) > t)
597 mpz_mod (B[i], B[i], n);
598
599 s = t * 2 + 1; /* one extra sign bit */
600 for (i = k - 1; i; s++, i >>= 1);
601
602 s = 1 + (s - 1) / GMP_NUMB_BITS;
603
604 size_t0 = s * k;
605 size_t1 = s * l;
606
607 /* allocate one double-buffer to save malloc/MPN_ZERO/free calls */
608 t0_ptr = (mp_ptr) malloc (size_t0 * sizeof (mp_limb_t));
609 if (t0_ptr == NULL)
610 return 0;
611 t1_ptr = (mp_ptr) malloc (size_t1 * sizeof (mp_limb_t));
612 if (t1_ptr == NULL)
613 {
614 free (t0_ptr);
615 return 0;
616 }
617
618 MPN_ZERO (t0_ptr, size_t0);
619 MPN_ZERO (t1_ptr, size_t1);
620
621 for (i = 0; i < k; i++)
622 if (SIZ(A[i]))
623 MPN_COPY (t0_ptr + i * s, PTR(A[i]), SIZ(A[i]));
624 for (i = 0; i < l; i++)
625 if (SIZ(B[i]))
626 MPN_COPY (t1_ptr + i * s, PTR(B[i]), SIZ(B[i]));
627
628 i = mpn_mulmod_bnm1_next_size (m0 * s);
629 /* the following loop ensures we don't cut in the middle of a
630 coefficient */
631 while (i % s)
632 i = mpn_mulmod_bnm1_next_size (i + 1);
633 ASSERT(i % s == 0);
634 m = i / s;
635 ASSERT(m <= 2 * m0 - 3 + list_mul_mem (m0 - 1));
636
637 t2_ptr = (mp_ptr) malloc ((i + 1) * sizeof (mp_limb_t));
638 if (t2_ptr == NULL)
639 {
640 free (t0_ptr);
641 free (t1_ptr);
642 return 0;
643 }
644
645 {
646 mp_ptr tp = malloc ((2 * i + 4) * sizeof (mp_limb_t));
647 if (tp == NULL)
648 {
649 free (t0_ptr);
650 free (t1_ptr);
651 return 0;
652 }
653 mpn_mulmod_bnm1 (t2_ptr, i, t0_ptr, size_t0, t1_ptr, size_t1, tp);
654 if ((mp_size_t) i > size_t0 + size_t1)
655 MPN_ZERO(t2_ptr + size_t0 + size_t1, i - (size_t0 + size_t1));
656 free (tp);
657 }
658
659 for (t = 0, tp = t2_ptr; t < m; t++, tp += s)
660 {
661 size_tmp = s;
662 MPN_NORMALIZE(tp, size_tmp);
663 r_ptr = MPZ_REALLOC (R[t], size_tmp);
664 if (size_tmp)
665 MPN_COPY (r_ptr, tp, size_tmp);
666 SIZ(R[t]) = size_tmp;
667 }
668
669 free (t0_ptr);
670 free (t1_ptr);
671 free (t2_ptr);
672
673 return m;
674 #endif /* FFT_WRAP */
675 }
676