1 /*
2     Copyright (C) 2017 Daniel Schultz
3 
4     This file is part of FLINT.
5 
6     FLINT is free software: you can redistribute it and/or modify it under
7     the terms of the GNU Lesser General Public License (LGPL) as published
8     by the Free Software Foundation; either version 2.1 of the License, or
9     (at your option) any later version.  See <http://www.gnu.org/licenses/>.
10 */
11 
12 #include <gmp.h>
13 #include <stdlib.h>
14 #include "flint.h"
15 #include "fmpz.h"
16 #include "fmpz_mpoly.h"
17 
18 /*
19    Set poly1 to poly2*poly3 using Johnson's heap method. The function
20    realocates its output and returns the length of the product. This
21    version of the function assumes the exponent vectors all fit in a
22    single word. Assumes input polys are nonzero.
23 */
_fmpz_mpoly_mul_johnson1(fmpz ** poly1,ulong ** exp1,slong * alloc,const fmpz * poly2,const ulong * exp2,slong len2,const fmpz * poly3,const ulong * exp3,slong len3,ulong maskhi)24 slong _fmpz_mpoly_mul_johnson1(fmpz ** poly1, ulong ** exp1, slong * alloc,
25               const fmpz * poly2, const ulong * exp2, slong len2,
26               const fmpz * poly3, const ulong * exp3, slong len3, ulong maskhi)
27 {
28    slong i, j, k;
29    slong next_loc;
30    slong Q_len = 0, heap_len = 2; /* heap zero index unused */
31    mpoly_heap1_s * heap;
32    mpoly_heap_t * chain;
33    slong * Q;
34    mpoly_heap_t * x;
35    fmpz * p1 = *poly1;
36    ulong * e1 = *exp1;
37    slong * hind;
38    ulong exp, cy;
39    ulong c[3], p[2]; /* for accumulating coefficients */
40    int first, small;
41    TMP_INIT;
42 
43    TMP_START;
44 
45    /* whether input coeffs are small, thus output coeffs fit in three words */
46    small = _fmpz_mpoly_fits_small(poly2, len2) &&
47                                            _fmpz_mpoly_fits_small(poly3, len3);
48 
49    next_loc = len2 + 4;   /* something bigger than heap can ever be */
50    heap = (mpoly_heap1_s *) TMP_ALLOC((len2 + 1)*sizeof(mpoly_heap1_s));
51    /* alloc array of heap nodes which can be chained together */
52    chain = (mpoly_heap_t *) TMP_ALLOC(len2*sizeof(mpoly_heap_t));
53    /* space for temporary storage of pointers to heap nodes */
54    Q = (slong *) TMP_ALLOC(2*len2*sizeof(slong));
55 
56     /* space for heap indices */
57     hind = (slong *) TMP_ALLOC(len2*sizeof(slong));
58     for (i = 0; i < len2; i++)
59         hind[i] = 1;
60 
61    /* put (0, 0, exp2[0] + exp3[0]) on heap */
62    x = chain + 0;
63    x->i = 0;
64    x->j = 0;
65    x->next = NULL;
66 
67    HEAP_ASSIGN(heap[1], exp2[0] + exp3[0], x);
68    hind[0] = 2*1 + 0;
69 
70    /* output poly index starts at -1, will be immediately updated to 0 */
71    k = -WORD(1);
72 
73    /* while heap is nonempty */
74    while (heap_len > 1)
75    {
76       /* get exponent field of heap top */
77       exp = heap[1].exp;
78 
79       /* realloc output poly ready for next product term */
80       k++;
81       _fmpz_mpoly_fit_length(&p1, &e1, alloc, k + 1, 1);
82 
83       /* whether we are on first coeff product for this output exponent */
84       first = 1;
85 
86       /* set temporary coeff to zero */
87       c[0] = c[1] = c[2] = 0;
88 
89       /* while heap nonempty and contains chain with current output exponent */
90       while (heap_len > 1 && heap[1].exp == exp)
91       {
92          /* pop chain from heap */
93          x = _mpoly_heap_pop1(heap, &heap_len, maskhi);
94 
95          /* take node out of heap and put into store */
96          hind[x->i] |= WORD(1);
97          Q[Q_len++] = x->i;
98          Q[Q_len++] = x->j;
99 
100          /* if output coeffs will fit in three words */
101          if (small)
102          {
103             /* compute product of input poly coeffs */
104             if (first)
105             {
106                smul_ppmm(c[1], c[0], poly2[x->i], poly3[x->j]);
107                c[2] = -(c[1] >> (FLINT_BITS - 1));
108 
109                /* set output monomial */
110                e1[k] = exp;
111                first = 0;
112             } else /* addmul product of input poly coeffs */
113             {
114                smul_ppmm(p[1], p[0], poly2[x->i], poly3[x->j]);
115                add_sssaaaaaa(cy, c[1], c[0], 0, c[1], c[0], 0, p[1], p[0]);
116                c[2] += (0 <= (slong) p[1]) ? cy : cy - 1;
117             }
118 
119             /* for every node in this chain */
120             while ((x = x->next) != NULL)
121             {
122                /* addmul product of input poly coeffs */
123                smul_ppmm(p[1], p[0], poly2[x->i], poly3[x->j]);
124                add_sssaaaaaa(cy, c[1], c[0], 0, c[1], c[0], 0, p[1], p[0]);
125                c[2] += (0 <= (slong) p[1]) ? cy : cy - 1;
126 
127                /* take node out of heap and put into store */
128                hind[x->i] |= WORD(1);
129                Q[Q_len++] = x->i;
130                Q[Q_len++] = x->j;
131             }
132          } else /* output coeffs require multiprecision */
133          {
134             if (first) /* compute product of input poly coeffs */
135             {
136                fmpz_mul(p1 + k, poly2 + x->i, poly3 + x->j);
137 
138                e1[k] = exp;
139                first = 0;
140             } else
141             {  /* addmul product of input poly coeffs */
142                fmpz_addmul(p1 + k, poly2 + x->i, poly3 + x->j);
143             }
144 
145             /* for each node in this chain */
146             while ((x = x->next) != NULL)
147             {
148                /* addmul product of input poly coeffs */
149                fmpz_addmul(p1 + k, poly2 + x->i, poly3 + x->j);
150 
151                /* take node out of heap and put into store */
152                hind[x->i] |= WORD(1);
153                Q[Q_len++] = x->i;
154                Q[Q_len++] = x->j;
155             }
156          }
157       }
158 
159       /* for each node temporarily stored */
160       while (Q_len > 0)
161       {
162          /* take node from store */
163          j = Q[--Q_len];
164          i = Q[--Q_len];
165 
166          /* should we go right? */
167          if (  (i + 1 < len2)
168             && (hind[i + 1] == 2*j + 1)
169             )
170          {
171             x = chain + i + 1;
172             x->i = i + 1;
173             x->j = j;
174             x->next = NULL;
175 
176             hind[x->i] = 2*(x->j+1) + 0;
177             _mpoly_heap_insert1(heap, exp2[x->i] + exp3[x->j], x,
178                                                  &next_loc, &heap_len, maskhi);
179          }
180 
181          /* should we go up? */
182          if (  (j + 1 < len3)
183             && ((hind[i] & 1) == 1)
184             && (  (i == 0)
185                || (hind[i - 1] >  2*(j + 2) + 1)
186                || (hind[i - 1] == 2*(j + 2) + 1) /* gcc should fuse */
187                )
188             )
189          {
190             x = chain + i;
191             x->i = i;
192             x->j = j + 1;
193             x->next = NULL;
194 
195             hind[x->i] = 2*(x->j+1) + 0;
196             _mpoly_heap_insert1(heap, exp2[x->i] + exp3[x->j], x,
197                                                  &next_loc, &heap_len, maskhi);
198          }
199       }
200 
201       /* set output poly coeff from temporary accumulation, if not multiprec */
202       if (small)
203          fmpz_set_signed_uiuiui(p1 + k, c[2], c[1], c[0]);
204 
205       if (fmpz_is_zero(p1 + k))
206          k--;
207    }
208 
209    k++;
210 
211    (*poly1) = p1;
212    (*exp1) = e1;
213 
214    TMP_END;
215 
216    return k;
217 }
218 
219 /*
220    Set poly1 to poly2*poly3 using Johnson's heap method. The function
221    realocates its output and returns the length of the product. This
222    version of the function assumes the exponent vectors take N words.
223 */
_fmpz_mpoly_mul_johnson(fmpz ** poly1,ulong ** exp1,slong * alloc,const fmpz * poly2,const ulong * exp2,slong len2,const fmpz * poly3,const ulong * exp3,slong len3,flint_bitcnt_t bits,slong N,const ulong * cmpmask)224 slong _fmpz_mpoly_mul_johnson(fmpz ** poly1, ulong ** exp1, slong * alloc,
225                  const fmpz * poly2, const ulong * exp2, slong len2,
226                  const fmpz * poly3, const ulong * exp3, slong len3,
227                               flint_bitcnt_t bits, slong N, const ulong * cmpmask)
228 {
229    slong i, j, k;
230    slong next_loc;
231    slong Q_len = 0, heap_len = 2; /* heap zero index unused */
232    mpoly_heap_s * heap;
233    mpoly_heap_t * chain;
234    slong * Q;
235    mpoly_heap_t * x;
236    fmpz * p1 = *poly1;
237    ulong * e1 = *exp1;
238    ulong cy;
239    ulong c[3], p[2]; /* for accumulating coefficients */
240    ulong * exp, * exps;
241    ulong ** exp_list;
242    slong exp_next;
243    slong * hind;
244    int first, small;
245    TMP_INIT;
246 
247    /* if exponent vectors fit in single word, call special version */
248    if (N == 1)
249       return _fmpz_mpoly_mul_johnson1(poly1, exp1, alloc,
250                              poly2, exp2, len2, poly3, exp3, len3, cmpmask[0]);
251 
252    TMP_START;
253 
254    /* whether input coeffs are small, thus output coeffs fit in three words */
255    small = _fmpz_mpoly_fits_small(poly2, len2) &&
256                                            _fmpz_mpoly_fits_small(poly3, len3);
257 
258    next_loc = len2 + 4;   /* something bigger than heap can ever be */
259    heap = (mpoly_heap_s *) TMP_ALLOC((len2 + 1)*sizeof(mpoly_heap_s));
260    /* alloc array of heap nodes which can be chained together */
261    chain = (mpoly_heap_t *) TMP_ALLOC(len2*sizeof(mpoly_heap_t));
262    /* space for temporary storage of pointers to heap nodes */
263    Q = (slong *) TMP_ALLOC(2*len2*sizeof(slong));
264    /* allocate space for exponent vectors of N words */
265    exps = (ulong *) TMP_ALLOC(len2*N*sizeof(ulong));
266    /* list of pointers to allocated exponent vectors */
267    exp_list = (ulong **) TMP_ALLOC(len2*sizeof(ulong *));
268    for (i = 0; i < len2; i++)
269       exp_list[i] = exps + i*N;
270 
271    /* space for heap indices */
272    hind = (slong *) TMP_ALLOC(len2*sizeof(slong));
273    for (i = 0; i < len2; i++)
274        hind[i] = 1;
275 
276    /* start with no heap nodes and no exponent vectors in use */
277    exp_next = 0;
278 
279    /* put (0, 0, exp2[0] + exp3[0]) on heap */
280    x = chain + 0;
281    x->i = 0;
282    x->j = 0;
283    x->next = NULL;
284 
285    heap[1].next = x;
286    heap[1].exp = exp_list[exp_next++];
287 
288     if (bits <= FLINT_BITS)
289         mpoly_monomial_add(heap[1].exp, exp2, exp3, N);
290     else
291         mpoly_monomial_add_mp(heap[1].exp, exp2, exp3, N);
292 
293     hind[0] = 2*1 + 0;
294 
295    /* output poly index starts at -1, will be immediately updated to 0 */
296    k = -WORD(1);
297 
298    /* while heap is nonempty */
299    while (heap_len > 1)
300    {
301       /* get pointer to exponent field of heap top */
302       exp = heap[1].exp;
303 
304       /* realloc output poly ready for next product term */
305       k++;
306       _fmpz_mpoly_fit_length(&p1, &e1, alloc, k + 1, N);
307 
308       /* whether we are on first coeff product for this output exponent */
309       first = 1;
310 
311       /* set temporary coeff to zero */
312       c[0] = c[1] = c[2] = 0;
313 
314       /* while heap nonempty and contains chain with current output exponent */
315       while (heap_len > 1 && mpoly_monomial_equal(heap[1].exp, exp, N))
316       {
317          /* pop chain from heap and set exponent field to be reused */
318          exp_list[--exp_next] = heap[1].exp;
319 
320          x = _mpoly_heap_pop(heap, &heap_len, N, cmpmask);
321 
322          /* take node out of heap and put into store */
323          hind[x->i] |= WORD(1);
324          Q[Q_len++] = x->i;
325          Q[Q_len++] = x->j;
326 
327          /* if output coeffs will fit in three words */
328          if (small)
329          {
330             /* compute product of input poly coeffs */
331             if (first)
332             {
333                smul_ppmm(c[1], c[0], poly2[x->i], poly3[x->j]);
334                c[2] = -(c[1] >> (FLINT_BITS - 1));
335 
336                /* set output monomial */
337                mpoly_monomial_set(e1 + k*N, exp, N);
338 
339                first = 0;
340             } else /* addmul product of input poly coeffs */
341             {
342                smul_ppmm(p[1], p[0], poly2[x->i], poly3[x->j]);
343                add_sssaaaaaa(cy, c[1], c[0], 0, c[1], c[0], 0, p[1], p[0]);
344                c[2] += (0 <= (slong) p[1]) ? cy : cy - 1;
345             }
346 
347             /* for every node in this chain */
348             while ((x = x->next) != NULL)
349             {
350                /* addmul product of input poly coeffs */
351                smul_ppmm(p[1], p[0], poly2[x->i], poly3[x->j]);
352                add_sssaaaaaa(cy, c[1], c[0], 0, c[1], c[0], 0, p[1], p[0]);
353                c[2] += (0 <= (slong) p[1]) ? cy : cy - 1;
354 
355                /* take node out of heap and put into store */
356                hind[x->i] |= WORD(1);
357                Q[Q_len++] = x->i;
358                Q[Q_len++] = x->j;
359             }
360          } else /* output coeffs require multiprecision */
361          {
362             if (first) /* compute product of input poly coeffs */
363             {
364                fmpz_mul(p1 + k, poly2 + x->i, poly3 + x->j);
365 
366                /* set output monomial */
367                mpoly_monomial_set(e1 + k*N, exp, N);
368 
369                first = 0;
370             } else
371             {  /* addmul product of input poly coeffs */
372                fmpz_addmul(p1 + k, poly2 + x->i, poly3 + x->j);
373             }
374 
375             /* for each node in this chain */
376             while ((x = x->next) != NULL)
377             {
378                /* addmul product of input poly coeffs */
379                fmpz_addmul(p1 + k, poly2 + x->i, poly3 + x->j);
380 
381                /* take node out of heap and put into store */
382                hind[x->i] |= WORD(1);
383                Q[Q_len++] = x->i;
384                Q[Q_len++] = x->j;
385             }
386          }
387       }
388 
389       /* for each node temporarily stored */
390       while (Q_len > 0)
391       {
392          /* take node from store */
393          j = Q[--Q_len];
394          i = Q[--Q_len];
395 
396          /* should we go right? */
397          if (  (i + 1 < len2)
398             && (hind[i + 1] == 2*j + 1)
399             )
400          {
401             x = chain + i + 1;
402             x->i = i + 1;
403             x->j = j;
404             x->next = NULL;
405 
406             hind[x->i] = 2*(x->j+1) + 0;
407 
408             if (bits <= FLINT_BITS)
409                 mpoly_monomial_add(exp_list[exp_next], exp2 + x->i*N, exp3 + x->j*N, N);
410             else
411                 mpoly_monomial_add_mp(exp_list[exp_next], exp2 + x->i*N, exp3 + x->j*N, N);
412 
413             if (!_mpoly_heap_insert(heap, exp_list[exp_next++], x,
414                                       &next_loc, &heap_len, N, cmpmask))
415                exp_next--;
416          }
417 
418          /* should we go up? */
419          if (  (j + 1 < len3)
420             && ((hind[i] & 1) == 1)
421             && (  (i == 0)
422                || (hind[i - 1] >  2*(j + 2) + 1)
423                || (hind[i - 1] == 2*(j + 2) + 1) /* gcc should fuse */
424                )
425             )
426          {
427             x = chain + i;
428             x->i = i;
429             x->j = j + 1;
430             x->next = NULL;
431 
432             hind[x->i] = 2*(x->j+1) + 0;
433 
434             if (bits <= FLINT_BITS)
435                 mpoly_monomial_add(exp_list[exp_next], exp2 + x->i*N, exp3 + x->j*N, N);
436             else
437                 mpoly_monomial_add_mp(exp_list[exp_next], exp2 + x->i*N, exp3 + x->j*N, N);
438 
439             if (!_mpoly_heap_insert(heap, exp_list[exp_next++], x,
440                                       &next_loc, &heap_len, N, cmpmask))
441                exp_next--;
442          }
443       }
444 
445       /* set output poly coeff from temporary accumulation, if not multiprec */
446       if (small)
447          fmpz_set_signed_uiuiui(p1 + k, c[2], c[1], c[0]);
448 
449       if (fmpz_is_zero(p1 + k))
450          k--;
451    }
452 
453    k++;
454 
455    (*poly1) = p1;
456    (*exp1) = e1;
457 
458    TMP_END;
459 
460    return k;
461 }
462 
463 /* maxBfields gets clobbered */
_fmpz_mpoly_mul_johnson_maxfields(fmpz_mpoly_t A,const fmpz_mpoly_t B,fmpz * maxBfields,const fmpz_mpoly_t C,fmpz * maxCfields,const fmpz_mpoly_ctx_t ctx)464 void _fmpz_mpoly_mul_johnson_maxfields(
465     fmpz_mpoly_t A,
466     const fmpz_mpoly_t B, fmpz * maxBfields,
467     const fmpz_mpoly_t C, fmpz * maxCfields,
468     const fmpz_mpoly_ctx_t ctx)
469 {
470     slong N, Alen;
471     flint_bitcnt_t Abits;
472     ulong * cmpmask;
473     ulong * Bexp, * Cexp;
474     int freeBexp, freeCexp;
475     TMP_INIT;
476 
477     TMP_START;
478 
479     _fmpz_vec_add(maxBfields, maxBfields, maxCfields, ctx->minfo->nfields);
480 
481     Abits = _fmpz_vec_max_bits(maxBfields, ctx->minfo->nfields);
482     Abits = FLINT_MAX(MPOLY_MIN_BITS, Abits + 1);
483     Abits = FLINT_MAX(Abits, B->bits);
484     Abits = FLINT_MAX(Abits, C->bits);
485     Abits = mpoly_fix_bits(Abits, ctx->minfo);
486 
487     N = mpoly_words_per_exp(Abits, ctx->minfo);
488     cmpmask = (ulong *) TMP_ALLOC(N*sizeof(ulong));
489     mpoly_get_cmpmask(cmpmask, N, Abits, ctx->minfo);
490 
491     /* ensure input exponents are packed into same sized fields as output */
492     freeBexp = 0;
493     Bexp = B->exps;
494     if (Abits > B->bits)
495     {
496         freeBexp = 1;
497         Bexp = (ulong *) flint_malloc(N*B->length*sizeof(ulong));
498         mpoly_repack_monomials(Bexp, Abits, B->exps, B->bits,
499                                                         B->length, ctx->minfo);
500     }
501 
502     freeCexp = 0;
503     Cexp = C->exps;
504     if (Abits > C->bits)
505     {
506         freeCexp = 1;
507         Cexp = (ulong *) flint_malloc(N*C->length*sizeof(ulong));
508         mpoly_repack_monomials(Cexp, Abits, C->exps, C->bits,
509                                                         C->length, ctx->minfo);
510     }
511 
512     /* deal with aliasing and do multiplication */
513     if (A == B || A == C)
514     {
515         fmpz_mpoly_t T;
516         fmpz_mpoly_init2(T, B->length + C->length - 1, ctx);
517         fmpz_mpoly_fit_bits(T, Abits, ctx);
518         T->bits = Abits;
519 
520         /* algorithm more efficient if smaller poly first */
521         if (B->length > C->length)
522         {
523             Alen = _fmpz_mpoly_mul_johnson(&T->coeffs, &T->exps, &T->alloc,
524                                                   C->coeffs, Cexp, C->length,
525                                                   B->coeffs, Bexp, B->length,
526                                                          Abits, N, cmpmask);
527         }
528         else
529         {
530             Alen = _fmpz_mpoly_mul_johnson(&T->coeffs, &T->exps, &T->alloc,
531                                                   B->coeffs, Bexp, B->length,
532                                                   C->coeffs, Cexp, C->length,
533                                                          Abits, N, cmpmask);
534         }
535 
536         fmpz_mpoly_swap(T, A, ctx);
537         fmpz_mpoly_clear(T, ctx);
538     }
539     else
540     {
541         fmpz_mpoly_fit_length(A, B->length + C->length - 1, ctx);
542         fmpz_mpoly_fit_bits(A, Abits, ctx);
543         A->bits = Abits;
544 
545         /* algorithm more efficient if smaller poly first */
546         if (B->length > C->length)
547         {
548             Alen = _fmpz_mpoly_mul_johnson(&A->coeffs, &A->exps, &A->alloc,
549                                                   C->coeffs, Cexp, C->length,
550                                                   B->coeffs, Bexp, B->length,
551                                                          Abits, N, cmpmask);
552         }
553         else
554         {
555             Alen = _fmpz_mpoly_mul_johnson(&A->coeffs, &A->exps, &A->alloc,
556                                                   B->coeffs, Bexp, B->length,
557                                                   C->coeffs, Cexp, C->length,
558                                                          Abits, N, cmpmask);
559         }
560     }
561 
562     if (freeBexp)
563         flint_free(Bexp);
564 
565     if (freeCexp)
566         flint_free(Cexp);
567 
568     _fmpz_mpoly_set_length(A, Alen, ctx);
569 
570     TMP_END;
571 }
572 
573 
574 
fmpz_mpoly_mul_johnson(fmpz_mpoly_t A,const fmpz_mpoly_t B,const fmpz_mpoly_t C,const fmpz_mpoly_ctx_t ctx)575 void fmpz_mpoly_mul_johnson(
576     fmpz_mpoly_t A,
577     const fmpz_mpoly_t B,
578     const fmpz_mpoly_t C,
579     const fmpz_mpoly_ctx_t ctx)
580 {
581     slong i;
582     fmpz * maxBfields, * maxCfields;
583     TMP_INIT;
584 
585     if (B->length == 0 || C->length == 0)
586     {
587         fmpz_mpoly_zero(A, ctx);
588         return;
589     }
590 
591     TMP_START;
592 
593     maxBfields = (fmpz *) TMP_ALLOC(ctx->minfo->nfields*sizeof(fmpz));
594     maxCfields = (fmpz *) TMP_ALLOC(ctx->minfo->nfields*sizeof(fmpz));
595     for (i = 0; i < ctx->minfo->nfields; i++)
596     {
597         fmpz_init(maxBfields + i);
598         fmpz_init(maxCfields + i);
599     }
600     mpoly_max_fields_fmpz(maxBfields, B->exps, B->length, B->bits, ctx->minfo);
601     mpoly_max_fields_fmpz(maxCfields, C->exps, C->length, C->bits, ctx->minfo);
602 
603     _fmpz_mpoly_mul_johnson_maxfields(A, B, maxBfields, C, maxCfields, ctx);
604 
605     for (i = 0; i < ctx->minfo->nfields; i++)
606     {
607         fmpz_clear(maxBfields + i);
608         fmpz_clear(maxCfields + i);
609     }
610 
611     TMP_END;
612 }
613