1 /* Polynomial multiplication using GMP's integer multiplication code
2 
3 Copyright 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012 Dave Newman,
4 Paul Zimmermann, Alexander Kruppa.
5 
6 This file is part of the ECM Library.
7 
8 The ECM Library is free software; you can redistribute it and/or modify
9 it under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
12 
13 The ECM Library is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16 License for more details.
17 
18 You should have received a copy of the GNU Lesser General Public License
19 along with the ECM Library; see the file COPYING.LIB.  If not, see
20 http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
21 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
22 
23 #include <stdlib.h>
24 #include "ecm-gmp.h" /* for MPZ_REALLOC and MPN_COPY */
25 #include "ecm-impl.h"
26 
27 #if defined(HAVE___GMPN_MULMOD_BNM1) && defined(HAVE___GMPN_MULMOD_BNM1_NEXT_SIZE)
28 #define FFT_WRAP /* use the wrap-around trick */
29 #endif
30 
31 /* Copy at r+i*s the content of A[i*stride] for 0 <= i < l
32    Assume all A[i*stride] are non-negative, and their size is <= s.
33  */
34 static void
pack(mp_ptr r,mpz_t * A,mp_size_t l,mp_size_t stride,mp_size_t s)35 pack (mp_ptr r, mpz_t *A, mp_size_t l, mp_size_t stride, mp_size_t s)
36 {
37   mp_size_t i, j, m;
38 
39   for (i = 0, j = 0; i < l; i++, j += stride, r += s)
40     {
41       m = SIZ(A[j]);
42       ASSERT((0 <= m) && (m <= s));
43       if (m)
44         MPN_COPY (r, PTR(A[j]), m);
45       if (m < s)
46         MPN_ZERO (r + m, s - m);
47     }
48 }
49 
50 /* put in R[i*stride] for 0 <= i < l the content of {t+i*s, s} */
51 void
unpack(mpz_t * R,mp_size_t stride,mp_ptr t,mp_size_t l,mp_size_t s)52 unpack (mpz_t *R, mp_size_t stride, mp_ptr t, mp_size_t l, mp_size_t s)
53 {
54   mp_size_t i, j, size_tmp;
55   mp_ptr r_ptr;
56 
57   for (i = 0, j = 0; i < l; i++, t += s, j += stride)
58     {
59       size_tmp = s;
60       MPN_NORMALIZE(t, size_tmp); /* compute the actual size */
61       r_ptr = MPZ_REALLOC (R[j], size_tmp);
62       if (size_tmp)
63         MPN_COPY (r_ptr, t, size_tmp);
64       SIZ(R[j]) = size_tmp;
65     }
66 }
67 
68 /* R <- A * B where A = A[0] + A[1]*x + ... + A[n-1]*x^(n-1), idem for B */
69 void
list_mul_n_basecase(listz_t R,listz_t A,listz_t B,unsigned int n)70 list_mul_n_basecase (listz_t R, listz_t A, listz_t B, unsigned int n)
71 {
72   unsigned int i, j;
73 
74   if (n == 1)
75     {
76       mpz_mul (R[0], A[0], B[0]);
77       return;
78     }
79 
80   for (i = 0; i < n; i++)
81     for (j = 0; j < n; j++)
82       {
83         if (i == 0 || j == n - 1)
84           mpz_mul (R[i+j], A[i], B[j]);
85         else
86           mpz_addmul (R[i+j], A[i], B[j]);
87       }
88 }
89 
90 static void
list_mul_n_kara2(listz_t R,listz_t A,listz_t B)91 list_mul_n_kara2 (listz_t R, listz_t A, listz_t B)
92 {
93   mpz_add (R[0], A[0], A[1]);
94   mpz_add (R[2], B[0], B[1]);
95   mpz_mul (R[1], R[0], R[2]);
96   mpz_mul (R[0], A[0], B[0]);
97   mpz_mul (R[2], A[1], B[1]);
98   mpz_sub (R[1], R[1], R[0]);
99   mpz_sub (R[1], R[1], R[2]);
100 }
101 
102 /* R[0..4] <- A[0..2] * B[0..2] in 7 multiplies */
103 static void
list_mul_n_kara3(listz_t R,listz_t A,listz_t B,listz_t T)104 list_mul_n_kara3 (listz_t R, listz_t A, listz_t B, listz_t T)
105 {
106   mpz_add (T[0], A[0], A[2]);
107   mpz_add (R[0], B[0], B[2]);
108   mpz_mul (R[2], T[0], R[0]); /* (A0+A2)*(B0+B2) */
109   mpz_mul (R[3], T[0], B[1]); /* (A0+A2)*B1 */
110   mpz_mul (R[4], A[1], R[0]); /* A1*(B0+B2) */
111   mpz_add (R[3], R[3], R[4]); /* (A0+A2)*B1+A1*(B0+B2) */
112   list_mul_n_kara2 (T, A, B);
113   mpz_sub (R[2], R[2], T[0]); /* A0*A2+A2*B0+A2*B2 */
114   mpz_sub (R[3], R[3], T[1]); /* A2*B1+A1*B2 */
115   mpz_add (R[2], R[2], T[2]); /* A0*A2+A2*B0+A2*B2+A1*B1 */
116   mpz_swap (R[0], T[0]);      /* A0*B0 */
117   mpz_swap (R[1], T[1]);      /* A0*B1+A1*B0 */
118   mpz_mul (R[4], A[2], B[2]); /* A2*B2 */
119   mpz_sub (R[2], R[2], R[4]); /* A0*A2+A2*B0+A1*B1 */
120 }
121 
122 /* Assume n >= 2. T is a scratch space of enough entries. */
123 static void
list_mul_n_karatsuba_aux(listz_t R,listz_t A,listz_t B,unsigned int n,listz_t T)124 list_mul_n_karatsuba_aux (listz_t R, listz_t A, listz_t B, unsigned int n,
125                           listz_t T)
126 {
127   unsigned int h, l;
128 
129   if (n == 1)
130     {
131       list_mul_n_basecase (R, A, B, n);
132       return;
133     }
134 
135   if (n == 2)
136     {
137       list_mul_n_kara2 (R, A, B);
138       return;
139     }
140 
141   if (n == 3)
142     {
143       list_mul_n_kara3 (R, A, B, T);
144       return;
145     }
146 
147   h = n / 2;
148   l = n - h;
149   list_add (R, A, A + l, h);
150   list_add (R + l, B, B + l, h);
151   if (h < l)
152     {
153       mpz_set (R[h], A[h]);
154       mpz_set (R[l + h], B[h]);
155     }
156   list_mul_n_karatsuba_aux (T, R, R + l, l, T + 2 * l - 1);
157   list_mul_n_karatsuba_aux (R, A, B, l, T + 2 * l - 1);
158   /* {R,2l-1} = Al * Bl */
159   list_mul_n_karatsuba_aux (R + 2 * l, A + l, B + l, h, T + 2 * l - 1);
160   /* {R+2l,2h-1} = Ah * Bh */
161   /* T will contain Al*Bh+Ah*Bl, it thus suffices to compute its low n-1
162      coefficients */
163   list_sub (T, T, R, n - 1);
164   list_sub (T, T, R + 2 * l, 2 * h - 1);
165   mpz_set_ui (R[2 * l - 1], 0);
166   list_add (R + l, R + l, T, n - 1);
167 }
168 
169 static unsigned int
list_mul_n_mem(unsigned int n)170 list_mul_n_mem (unsigned int n)
171 {
172   if (n == 1)
173     return 0;
174   else
175     {
176       unsigned int k = (n + 1) / 2;
177       return 2 * k - 1 + list_mul_n_mem (k);
178     }
179 }
180 
181 void
list_mul_n_karatsuba(listz_t R,listz_t A,listz_t B,unsigned int n)182 list_mul_n_karatsuba (listz_t R, listz_t A, listz_t B, unsigned int n)
183 {
184   listz_t T;
185   unsigned int s;
186 
187   s = list_mul_n_mem (n);
188   T = init_list (s);
189   list_mul_n_karatsuba_aux (R, A, B, n, T);
190   clear_list (T, s);
191 }
192 
193 /* Classical one-point Kronecker-Schoenhage substitution.
194    Notes:
195     - this code aligns the coeffs at limb boundaries - if instead we aligned
196       at byte boundaries then we could save up to 3*n bytes,
197       but tests have shown this doesn't give any significant speed increase,
198       even for large degree polynomials.
199      - this code requires that all coefficients A[] and B[] are nonnegative. */
200 void
list_mul_n_KS1(listz_t R,listz_t A,listz_t B,unsigned int l)201 list_mul_n_KS1 (listz_t R, listz_t A, listz_t B, unsigned int l)
202 {
203   unsigned long i;
204   mp_size_t s, t = 0, size_t0;
205   mp_ptr t0_ptr, t1_ptr, t2_ptr;
206 
207   /* compute the largest bit-size t of the A[i] and B[i] */
208   for (i = 0; i < l; i++)
209     {
210       if ((s = mpz_sizeinbase (A[i], 2)) > t)
211         t = s;
212       if ((s = mpz_sizeinbase (B[i], 2)) > t)
213         t = s;
214     }
215   /* For n > 0, s = sizeinbase (n, 2)     ==> n < 2^s.
216      For n = 0, s = sizeinbase (n, 2) = 1 ==> n < 2^s.
217      Hence all A[i], B[i] < 2^t */
218 
219   /* Each coeff of A(x)*B(x) < l * 2^(2*t), so max number of bits in a
220      coeff of the product will be 2 * t + ceil(log_2(l)) */
221   s = 2 * t;
222   for (i = l; i > 1; s++, i = (i + 1) >> 1);
223 
224   /* work out the corresponding number of limbs */
225   s = 1 + (s - 1) / GMP_NUMB_BITS;
226 
227   size_t0 = s * l;
228 
229   /* allocate a single buffer to save malloc/MPN_ZERO/free calls */
230   t0_ptr = (mp_ptr) malloc (4 * size_t0 * sizeof (mp_limb_t));
231   if (t0_ptr == NULL)
232     {
233       outputf (OUTPUT_ERROR, "Out of memory in list_mult_n()\n");
234       exit (1);
235     }
236   t1_ptr = t0_ptr + size_t0;
237   t2_ptr = t1_ptr + size_t0;
238 
239   pack (t0_ptr, A, l, 1, s);
240   pack (t1_ptr, B, l, 1, s);
241 
242   mpn_mul_n (t2_ptr, t0_ptr, t1_ptr, size_t0);
243 
244   unpack (R, 1, t2_ptr, 2 * l - 1, s);
245 
246   free (t0_ptr);
247 }
248 
249 /* Two-point Kronecker substitition.
250    Reference: Algorithm 2 from "Faster polynomial multiplication via multipoint
251    Kronecker substitution", David Harvey, Journal of Symbolic Computation,
252    number 44 (2009), pages 1502-1510.
253    Assume n >= 2.
254    Notes:
255     - this code aligns the coeffs at limb boundaries - if instead we aligned
256       at byte boundaries then we could save up to 3*n bytes,
257       but tests have shown this doesn't give any significant speed increase,
258       even for large degree polynomials.
259      - this code requires that all coefficients A[] and B[] are nonnegative.
260 */
261 void
list_mul_n_KS2(listz_t R,listz_t A,listz_t B,unsigned int n)262 list_mul_n_KS2 (listz_t R, listz_t A, listz_t B, unsigned int n)
263 {
264   unsigned long i;
265   mp_size_t s, s2, t = 0, l, h, ns2;
266   mp_ptr tmp, A0, A1, B0, B1, C0, C1;
267   int sA, sB;
268 
269   ASSERT_ALWAYS (n >= 2);
270 
271   /* compute the largest bit-size t of the A[i] and B[i] */
272   for (i = 0; i < n; i++)
273     {
274       if ((s = mpz_sizeinbase (A[i], 2)) > t)
275         t = s;
276       if ((s = mpz_sizeinbase (B[i], 2)) > t)
277         t = s;
278     }
279   /* For n > 0, s = sizeinbase (n, 2)     ==> n < 2^s.
280      For n = 0, s = sizeinbase (n, 2) = 1 ==> n < 2^s.
281      Hence all A[i], B[i] < 2^t */
282 
283   /* Each coeff of A(x)*B(x) < n * 2^(2*t), so max number of bits in a
284      coeff of the product will be 2 * t + ceil(log_2(n)) */
285   s = 2 * t;
286   for (i = n; i > 1; s++, i = (i + 1) >> 1);
287 
288   /* work out the corresponding number of limbs */
289   s = 1 + (s - 1) / GMP_NUMB_BITS;
290 
291   /* ensure s is even */
292   s = s + (s & 1);
293   s2 = s >> 1;
294   ns2 = n * s2;
295 
296   l = n / 2;
297   h = n - l;
298 
299   /* allocate a single buffer to save malloc/MPN_ZERO/free calls */
300   tmp = (mp_ptr) malloc (8 * ns2 * sizeof (mp_limb_t));
301   if (tmp == NULL)
302     {
303       outputf (OUTPUT_ERROR, "Out of memory in list_mult_n()\n");
304       exit (1);
305     }
306 
307   A0 = tmp;
308   A1 = A0 + ns2;
309   B0 = A1 + ns2;
310   B1 = B0 + ns2;
311   C0 = B1 + ns2;
312   C1 = C0 + 2 * ns2;
313 
314   pack (A0, A, h, 2, s); /* A0 = Aeven(S) where S = 2^(s*GMP_NUMB_BITS) */
315   /* A0 has in fact only n * s2 significant limbs:
316      if n=2h, h*s = n*s2
317      if n=2h-1, the last chunk from A0 has at most s2 limbs */
318   MPN_ZERO(B0, s2);
319   pack (B0 + s2, A + 1, l, 2, s);
320   /* for the same reason as above, we have at most l*s-s2 significant limbs
321      at B0+s2, thus at most l*s <= n*s2 at B0 */
322   if ((sA = mpn_cmp (A0, B0, ns2)) >= 0)
323     mpn_sub_n (A1, A0, B0, ns2);
324   else
325     mpn_sub_n (A1, B0, A0, ns2);
326   mpn_add_n (A0, A0, B0, ns2);
327   /* now A0 is X+ with the notations of Algorithm, A1 is sA*X- */
328 
329   pack (B0, B, h, 2, s);
330   MPN_ZERO(C0, s2);
331   pack (C0 + s2, B + 1, l, 2, s);
332   if ((sB = mpn_cmp (B0, C0, ns2)) >= 0)
333     mpn_sub_n (B1, B0, C0, ns2);
334   else
335     mpn_sub_n (B1, C0, B0, ns2);
336   mpn_add_n (B0, B0, C0, ns2);
337   /* B0 is Y+, B1 is sB*Y- with the notations of Algorithm 2 */
338 
339   mpn_mul_n (C0, A0, B0, ns2); /* C0 is Z+ = X+ * Y+ */
340   mpn_mul_n (C1, A1, B1, ns2); /* C1 is sA * sB * Z- */
341 
342   if (sA * sB >= 0)
343     {
344       mpn_add_n (A0, C0, C1, 2 * ns2);
345       mpn_sub_n (B0, C0, C1, 2 * ns2);
346     }
347   else
348     {
349       mpn_sub_n (A0, C0, C1, 2 * ns2);
350       mpn_add_n (B0, C0, C1, 2 * ns2);
351     }
352   mpn_rshift (A0, A0, 4 * ns2, 1); /* global division by 2 */
353 
354   /* If A[] and B[] have n coefficients, the product has 2n-1 coefficients.
355      The even part has n coefficients and the odd part n-1 coefficients */
356   unpack (R, 2, A0, n, s);
357   unpack (R + 1, 2, B0 + s2, n - 1, s);
358 
359   free (tmp);
360 }
361 
362 /* Puts in R[0..2n-2] the product of A[0..n-1] and B[0..n-1], seen as
363    polynomials.
364 */
365 void
list_mult_n(listz_t R,listz_t A,listz_t B,unsigned int n)366 list_mult_n (listz_t R, listz_t A, listz_t B, unsigned int n)
367 {
368   int T[TUNE_LIST_MUL_N_MAX_SIZE] = LIST_MUL_TABLE, best;
369 
370   /* See tune_list_mul_n() in tune.c:
371      0 : list_mul_n_basecase
372      2 : list_mul_n_KS1
373      3 : list_mul_n_KS2 */
374   best = (n < TUNE_LIST_MUL_N_MAX_SIZE) ? T[n] : 3;
375 
376   if (best == 0)
377     list_mul_n_basecase (R, A, B, n);
378   else if (best == 1)
379     list_mul_n_karatsuba (R, A, B, n);
380   else if (best == 2)
381     list_mul_n_KS1 (R, A, B, n);
382   else
383     list_mul_n_KS2 (R, A, B, n);
384 }
385 
386 /* Given a[0..m] and c[0..l], puts in b[0..n] the coefficients
387    of degree m to n+m of rev(a)*c, i.e.
388    b[0] = a[0]*c[0] + ... + a[i]*c[i] with i = min(m, l)
389    ...
390    b[k] = a[0]*c[k] + ... + a[i]*c[i+k] with i = min(m, l-k)
391    ...
392    b[n] = a[0]*c[n] + ... + a[i]*c[i+n] with i = min(m, l-n) [=l-n].
393 
394    If rev=0, consider a instead of rev(a).
395 
396    Assumes n <= l.
397 
398    Return non-zero if an error occurred.
399 
400    low(b) is the coefficients of degree 0 to m-1 of a*c (or rev(a)*c)
401    mid(b) is the coefficients of degree m to m+n of a*c
402    high(b) is the coefficients of degree m+n+1 to m+l+1 of a*c
403 */
404 
405 int
TMulKS(listz_t b,unsigned int n,listz_t a,unsigned int m,listz_t c,unsigned int l,mpz_t modulus,int rev)406 TMulKS (listz_t b, unsigned int n, listz_t a, unsigned int m,
407         listz_t c, unsigned int l, mpz_t modulus, int rev)
408 {
409   unsigned long i, s = 0, t;
410   mp_ptr ap, bp, cp;
411   mp_size_t an, bn, cn;
412   int ret = 0; /* default return value */
413 #ifdef DEBUG
414   long st = cputime ();
415   fprintf (ECM_STDOUT, "n=%u m=%u l=%u bits=%u n*bits=%u: ", n, m, l,
416 	   mpz_sizeinbase (modulus, 2), n * mpz_sizeinbase (modulus, 2));
417 #endif
418 
419   ASSERT (n <= l); /* otherwise the upper coefficients of b are 0 */
420   if (l > n + m)
421     l = n + m; /* otherwise, c has too many coeffs */
422 
423   /* make coefficients a[] and c[] non-negative and compute max #bits */
424   for (i = 0; i <= m; i++)
425     {
426       if (mpz_sgn (a[i]) < 0)
427         mpz_mod (a[i], a[i], modulus);
428       if ((t = mpz_sizeinbase (a[i], 2)) > s)
429         s = t;
430     }
431   for (i = 0; i <= l; i++)
432     {
433       if (mpz_sgn (c[i]) < 0)
434         mpz_mod (c[i], c[i], modulus);
435       if ((t = mpz_sizeinbase (c[i], 2)) > s)
436         s = t;
437     }
438 
439 #ifdef FFT_WRAP
440   s ++; /* need one extra bit to prevent carry of low(b) + high(b) */
441 #endif
442 
443   /* max coeff has 2*s+ceil(log2(min(m+1,l+1))) bits,
444    i.e. 2*s + 1 + floor(log2(min(m,l))) */
445   for (s = 2 * s, i = (m < l) ? m : l; i; s++, i >>= 1);
446 
447   /* corresponding number of limbs */
448   s = 1 + (s - 1) / GMP_NUMB_BITS;
449 
450   an = (m + 1) * s;
451   cn = (l + 1) * s;
452   bn = an + cn;
453 
454   /* a[0..m] needs (m+1) * s limbs */
455   ap = (mp_ptr) malloc (an * sizeof (mp_limb_t));
456   if (ap == NULL)
457     {
458       ret = 1;
459       goto TMulKS_end;
460     }
461   cp = (mp_ptr) malloc (cn * sizeof (mp_limb_t));
462   if (cp == NULL)
463     {
464       ret = 1;
465       goto TMulKS_free_ap;
466     }
467 
468   MPN_ZERO (ap, an);
469   MPN_ZERO (cp, cn);
470 
471   /* a is reverted */
472   for (i = 0; i <= m; i++)
473     if (SIZ(a[i]))
474       MPN_COPY (ap + ((rev) ? (m - i) : i) * s, PTR(a[i]), SIZ(a[i]));
475   for (i = 0; i <= l; i++)
476     if (SIZ(c[i]))
477       MPN_COPY (cp + i * s, PTR(c[i]), SIZ(c[i]));
478 
479 #ifdef FFT_WRAP
480   /* the product rev(a) * c has m+l+1 coefficients.
481      We throw away the first m and the last l-n <= m.
482      If we compute mod (m+n+1) * s limbs, we are ok */
483   bn = mpn_mulmod_bnm1_next_size ((m + n + 1) * s);
484 
485   bp = (mp_ptr) malloc (bn * sizeof (mp_limb_t));
486   if (bp == NULL)
487     {
488       ret = 1;
489       goto TMulKS_free_cp;
490     }
491   {
492     mp_ptr tp;
493     tp = (mp_ptr) malloc ((2 * bn + 4) * sizeof (mp_limb_t));
494     if (tp == NULL)
495       {
496         ret = 1;
497         goto TMulKS_free_cp;
498       }
499     /* mpn_mulmod_bnm1 requires that the first operand is larger */
500     if (an >= cn)
501       mpn_mulmod_bnm1 (bp, bn, ap, an, cp, cn, tp);
502     else
503       mpn_mulmod_bnm1 (bp, bn, cp, cn, ap, an, tp);
504     free (tp);
505   }
506 #else /* FFT_WRAP is not defined */
507   bp = (mp_ptr) malloc (bn * sizeof (mp_limb_t));
508   if (bp == NULL)
509     {
510       ret = 1;
511       goto TMulKS_free_cp;
512     }
513   if (an >= cn)
514     mpn_mul (bp, ap, an, cp, cn);
515   else
516     mpn_mul (bp, cp, cn, ap, an);
517 #endif
518 
519   /* recover coefficients of degree m to n+m of product in b[0..n] */
520   bp += m * s;
521   for (i = 0; i <= n; i++)
522     {
523       t = s;
524       MPN_NORMALIZE(bp, t);
525       MPZ_REALLOC (b[i], (mp_size_t) t);
526       if (t)
527         MPN_COPY (PTR(b[i]), bp, t);
528       SIZ(b[i]) = t;
529       bp += s;
530     }
531   bp -= (m + n + 1) * s;
532 
533   free (bp);
534  TMulKS_free_cp:
535   free (cp);
536  TMulKS_free_ap:
537   free (ap);
538 
539 #ifdef DEBUG
540   fprintf (ECM_STDOUT, "%ldms\n", elltime (st, cputime ()));
541 #endif
542 
543  TMulKS_end:
544   return ret;
545 }
546 
547 unsigned int
ks_wrapmul_m(unsigned int m0,unsigned int k,mpz_t n)548 ks_wrapmul_m (unsigned int m0, unsigned int k, mpz_t n)
549 {
550   mp_size_t t, s;
551   unsigned long i, m;
552 
553 #ifdef FFT_WRAP
554   t = mpz_sizeinbase (n, 2);
555   s = t * 2 + 1;
556   for (i = k - 1; i; s++, i >>= 1);
557   s = 1 + (s - 1) / GMP_NUMB_BITS;
558   i = mpn_mulmod_bnm1_next_size (m0 * s);
559   while (i % s)
560     i = mpn_mulmod_bnm1_next_size (i + 1);
561   m = i / s;
562   return m;
563 #else
564   return ~ (unsigned int) 0;
565 #endif
566 }
567 
568 /* multiply in R[] A[0]+A[1]*x+...+A[k-1]*x^(k-1)
569                 by B[0]+B[1]*x+...+B[l-1]*x^(l-1) modulo n,
570    wrapping around coefficients of the product up from degree m >= m0.
571    Assumes k >= l.
572    R is assumed to have 2*m0-3+list_mul_mem(m0-1) allocated cells.
573    Return m (or 0 if an error occurred).
574 */
575 unsigned int
ks_wrapmul(listz_t R,unsigned int m0,listz_t A,unsigned int k,listz_t B,unsigned int l,mpz_t n)576 ks_wrapmul (listz_t R, unsigned int m0,
577             listz_t A, unsigned int k,
578             listz_t B, unsigned int l,
579 	    mpz_t n)
580 {
581 #ifndef FFT_WRAP
582   ASSERT_ALWAYS(0); /* ks_wrapmul should not be called in that case */
583   return 0;
584 #else
585   unsigned long i, m, t;
586   mp_size_t s, size_t0, size_t1, size_tmp;
587   mp_ptr t0_ptr, t1_ptr, t2_ptr, r_ptr, tp;
588 
589   ASSERT(k >= l);
590 
591   t = mpz_sizeinbase (n, 2);
592   for (i = 0; i < k; i++)
593     if (mpz_sgn (A[i]) < 0 || mpz_sizeinbase (A[i], 2) > t)
594       mpz_mod (A[i], A[i], n);
595   for (i = 0; i < l; i++)
596     if (mpz_sgn (B[i]) < 0 || mpz_sizeinbase (B[i], 2) > t)
597       mpz_mod (B[i], B[i], n);
598 
599   s = t * 2 + 1; /* one extra sign bit */
600   for (i = k - 1; i; s++, i >>= 1);
601 
602   s = 1 + (s - 1) / GMP_NUMB_BITS;
603 
604   size_t0 = s * k;
605   size_t1 = s * l;
606 
607   /* allocate one double-buffer to save malloc/MPN_ZERO/free calls */
608   t0_ptr = (mp_ptr) malloc (size_t0 * sizeof (mp_limb_t));
609   if (t0_ptr == NULL)
610     return 0;
611   t1_ptr = (mp_ptr) malloc (size_t1 * sizeof (mp_limb_t));
612   if (t1_ptr == NULL)
613     {
614       free (t0_ptr);
615       return 0;
616     }
617 
618   MPN_ZERO (t0_ptr, size_t0);
619   MPN_ZERO (t1_ptr, size_t1);
620 
621   for (i = 0; i < k; i++)
622     if (SIZ(A[i]))
623       MPN_COPY (t0_ptr + i * s, PTR(A[i]), SIZ(A[i]));
624   for (i = 0; i < l; i++)
625     if (SIZ(B[i]))
626       MPN_COPY (t1_ptr + i * s, PTR(B[i]), SIZ(B[i]));
627 
628   i = mpn_mulmod_bnm1_next_size (m0 * s);
629   /* the following loop ensures we don't cut in the middle of a
630      coefficient */
631   while (i % s)
632     i = mpn_mulmod_bnm1_next_size (i + 1);
633   ASSERT(i % s == 0);
634   m = i / s;
635   ASSERT(m <= 2 * m0 - 3 + list_mul_mem (m0 - 1));
636 
637   t2_ptr = (mp_ptr) malloc ((i + 1) * sizeof (mp_limb_t));
638   if (t2_ptr == NULL)
639     {
640       free (t0_ptr);
641       free (t1_ptr);
642       return 0;
643     }
644 
645   {
646     mp_ptr tp = malloc ((2 * i + 4) * sizeof (mp_limb_t));
647     if (tp == NULL)
648       {
649         free (t0_ptr);
650         free (t1_ptr);
651         return 0;
652       }
653     mpn_mulmod_bnm1 (t2_ptr, i, t0_ptr, size_t0, t1_ptr, size_t1, tp);
654     if ((mp_size_t) i > size_t0 + size_t1)
655       MPN_ZERO(t2_ptr + size_t0 + size_t1, i - (size_t0 + size_t1));
656     free (tp);
657   }
658 
659   for (t = 0, tp = t2_ptr; t < m; t++, tp += s)
660     {
661       size_tmp = s;
662       MPN_NORMALIZE(tp, size_tmp);
663       r_ptr = MPZ_REALLOC (R[t], size_tmp);
664       if (size_tmp)
665         MPN_COPY (r_ptr, tp, size_tmp);
666       SIZ(R[t]) = size_tmp;
667     }
668 
669   free (t0_ptr);
670   free (t1_ptr);
671   free (t2_ptr);
672 
673   return m;
674 #endif /* FFT_WRAP */
675 }
676