1 /*
2     Copyright (C) 2018 Daniel Schultz
3 
4     This file is part of FLINT.
5 
6     FLINT is free software: you can redistribute it and/or modify it under
7     the terms of the GNU Lesser General Public License (LGPL) as published
8     by the Free Software Foundation; either version 2.1 of the License, or
9     (at your option) any later version.  See <http://www.gnu.org/licenses/>.
10 */
11 
12 #include "nmod_mpoly.h"
13 
14 /* improve locality */
15 #define BLOCK 128
16 #define MAX_ARRAY_SIZE (WORD(300000))
17 #define MAX_LEX_SIZE (WORD(300))
18 
19 
20 
_nmod_mpoly_addmul_array1_ulong1(ulong * poly1,const ulong * poly2,const ulong * exp2,slong len2,const ulong * poly3,const ulong * exp3,slong len3)21 void _nmod_mpoly_addmul_array1_ulong1(ulong * poly1,
22                            const ulong * poly2, const ulong * exp2, slong len2,
23                            const ulong * poly3, const ulong * exp3, slong len3)
24 {
25    slong ii, i, jj, j;
26    ulong * c2;
27 
28    for (ii = 0; ii < len2 + BLOCK; ii += BLOCK)
29    {
30       for (jj = 0; jj < len3 + BLOCK; jj += BLOCK)
31       {
32          for (i = ii; i < FLINT_MIN(ii + BLOCK, len2); i++)
33          {
34             c2 = poly1 + (slong) exp2[i];
35 
36             if (poly2[i] != 0)
37             {
38                for (j = jj; j < FLINT_MIN(jj + BLOCK, len3); j++)
39                {
40                   c2[(slong) exp3[j]] += poly2[i]*poly3[j];
41                }
42             }
43          }
44       }
45    }
46 }
47 
_nmod_mpoly_addmul_array1_ulong2(ulong * poly1,const ulong * poly2,const ulong * exp2,slong len2,const ulong * poly3,const ulong * exp3,slong len3)48 void _nmod_mpoly_addmul_array1_ulong2(ulong * poly1,
49                            const ulong * poly2, const ulong * exp2, slong len2,
50                            const ulong * poly3, const ulong * exp3, slong len3)
51 {
52    slong ii, i, jj, j;
53    ulong p[2];
54    ulong * c2, * c;
55 
56    for (ii = 0; ii < len2 + BLOCK; ii += BLOCK)
57    {
58       for (jj = 0; jj < len3 + BLOCK; jj += BLOCK)
59       {
60          for (i = ii; i < FLINT_MIN(ii + BLOCK, len2); i++)
61          {
62             c2 = poly1 + 2*((slong) exp2[i]);
63 
64             if (poly2[i] != 0)
65             {
66                for (j = jj; j < FLINT_MIN(jj + BLOCK, len3); j++)
67                {
68                   c = c2 + 2*((slong) exp3[j]);
69                   umul_ppmm(p[1], p[0], poly2[i], poly3[j]);
70                   add_ssaaaa(c[1], c[0], c[1], c[0], p[1], p[0]);
71                }
72             }
73          }
74       }
75    }
76 }
77 
_nmod_mpoly_addmul_array1_ulong3(ulong * poly1,const ulong * poly2,const ulong * exp2,slong len2,const ulong * poly3,const ulong * exp3,slong len3)78 void _nmod_mpoly_addmul_array1_ulong3(ulong * poly1,
79                            const ulong * poly2, const ulong * exp2, slong len2,
80                            const ulong * poly3, const ulong * exp3, slong len3)
81 {
82    slong ii, i, jj, j;
83    ulong p[2];
84    ulong * c2, * c;
85 
86    for (ii = 0; ii < len2 + BLOCK; ii += BLOCK)
87    {
88       for (jj = 0; jj < len3 + BLOCK; jj += BLOCK)
89       {
90          for (i = ii; i < FLINT_MIN(ii + BLOCK, len2); i++)
91          {
92             c2 = poly1 + 3*((slong) exp2[i]);
93 
94             if (poly2[i] != 0)
95             {
96                for (j = jj; j < FLINT_MIN(jj + BLOCK, len3); j++)
97                {
98                   c = c2 + 3*((slong) exp3[j]);
99                   umul_ppmm(p[1], p[0], poly2[i], poly3[j]);
100                   add_sssaaaaaa(c[2], c[1], c[0], c[2], c[1], c[0], 0, p[1], p[0]);
101                }
102             }
103          }
104       }
105    }
106 }
107 
108 
109 /****************************************************
110     LEX
111 ****************************************************/
112 
113 
114 void mpoly_main_variable_split_LEX(slong * ind, ulong * pexp, const ulong * Aexp,
115              slong l1, slong Alen, const ulong * mults, slong num, slong Abits);
116 
117 
118 #define LEX_UNPACK_MACRO(fxn_name, coeff_decl, nonzero_test, reduce_coeff)     \
119 slong fxn_name(nmod_mpoly_t P, slong Plen, coeff_decl,                         \
120              const ulong * mults, slong num, slong array_size, slong top,      \
121                                             const nmod_mpoly_ctx_t ctx)        \
122 {                                                                              \
123     slong off, j;                                                              \
124     slong topmult = num == 0 ? 1 : mults[num - 1];                             \
125     slong lastd   = topmult - 1;                                               \
126     slong reset   = array_size/topmult;                                        \
127     slong counter = reset;                                                     \
128     ulong startexp = (top << (P->bits*num)) + (lastd << (P->bits*(num-1)));    \
129     ulong coeff;                                                               \
130     for (off = array_size - 1; off >= 0; off--)                                \
131     {                                                                          \
132         if (nonzero_test)                                                      \
133         {                                                                      \
134             reduce_coeff                                                       \
135             if (coeff != UWORD(0))                                             \
136             {                                                                  \
137                 slong d = off;                                                 \
138                 ulong exp = startexp;                                          \
139                 for (j = 0; j + 1 < num; j++) {                                \
140                     exp += (d % mults[j]) << (P->bits*j);                      \
141                     d = d / mults[j];                                          \
142                 }                                                              \
143                 _nmod_mpoly_fit_length(&P->coeffs, &P->exps, &P->alloc,        \
144                                                                  Plen + 1, 1); \
145                 P->exps[Plen] = exp;                                           \
146                 P->coeffs[Plen] = coeff;                                       \
147                 Plen++;                                                        \
148             }                                                                  \
149         }                                                                      \
150         counter--;                                                             \
151         if (counter <= 0) {                                                    \
152             counter = reset;                                                   \
153             lastd--;                                                           \
154             startexp -= UWORD(1) << (P->bits*(num-1));                         \
155         }                                                                      \
156     }                                                                          \
157     return Plen;                                                               \
158 }
159 
160 /*
161     These four functions will replace
162         _fmpz_mpoly_from_ulong_array, ..., _fmpz_mpoly_from_fmpz_array
163     defined above.
164 */
165 
166 LEX_UNPACK_MACRO(
167     nmod_mpoly_append_array_sm1_LEX, ulong * coeff_array
168 ,
169     coeff_array[off] != UWORD(0)
170 ,
171     NMOD_RED(coeff, coeff_array[off], ctx->ffinfo->mod);
172     coeff_array[off] = 0;
173 )
174 
175 LEX_UNPACK_MACRO(
176     nmod_mpoly_append_array_sm2_LEX, ulong * coeff_array
177 ,
178     (coeff_array[2*off + 0] || coeff_array[2*off + 1]) != UWORD(0)
179 ,
180     NMOD2_RED2(coeff, coeff_array[2*off + 1], coeff_array[2*off + 0], ctx->ffinfo->mod);
181     coeff_array[2*off + 0] = coeff_array[2*off + 1] = 0;
182 )
183 
184 LEX_UNPACK_MACRO(
185     nmod_mpoly_append_array_sm3_LEX, ulong * coeff_array
186 ,
187     (coeff_array[3*off + 0] || coeff_array[3*off + 1] || coeff_array[3*off + 2]) != UWORD(0)
188 ,
189     NMOD_RED3(coeff, coeff_array[3*off + 2], coeff_array[3*off + 1], coeff_array[3*off + 0], ctx->ffinfo->mod);
190     coeff_array[3*off + 0] = coeff_array[3*off + 1] = coeff_array[3*off + 2] = UWORD(0);
191 )
192 
193 
194 
_nmod_mpoly_mul_array_chunked_LEX(nmod_mpoly_t P,const nmod_mpoly_t A,const nmod_mpoly_t B,const ulong * mults,const nmod_mpoly_ctx_t ctx)195 void _nmod_mpoly_mul_array_chunked_LEX(
196     nmod_mpoly_t P,
197     const nmod_mpoly_t A,
198     const nmod_mpoly_t B,
199     const ulong * mults,
200     const nmod_mpoly_ctx_t ctx)
201 {
202     slong num = ctx->minfo->nfields - 1;
203     slong Pi, i, j, Plen, Pl, Al, Bl, array_size;
204     slong * Amain, * Bmain;
205     ulong * Apexp, * Bpexp;
206     TMP_INIT;
207 
208     array_size = 1;
209     for (i = 0; i < num; i++) {
210         array_size *= mults[i];
211     }
212 
213     /* compute lengths of poly2 and poly3 in chunks */
214     Al = 1 + (slong) (A->exps[0] >> (A->bits*num));
215     Bl = 1 + (slong) (B->exps[0] >> (B->bits*num));
216 
217     TMP_START;
218 
219     /* compute indices and lengths of coefficients of polys in main variable */
220     Amain = (slong *) TMP_ALLOC((Al + 1)*sizeof(slong));
221     Bmain = (slong *) TMP_ALLOC((Bl + 1)*sizeof(slong));
222     Apexp = (ulong *) flint_malloc(A->length*sizeof(ulong));
223     Bpexp = (ulong *) flint_malloc(B->length*sizeof(ulong));
224     mpoly_main_variable_split_LEX(Amain, Apexp, A->exps, Al, A->length, mults, num, A->bits);
225     mpoly_main_variable_split_LEX(Bmain, Bpexp, B->exps, Bl, B->length, mults, num, B->bits);
226 
227     Pl = Al + Bl - 1;
228     Plen = 0;
229 
230     {
231         ulong * coeff_array = (ulong *) TMP_ALLOC(3*array_size*sizeof(ulong));
232         for (j = 0; j < 3*array_size; j++)
233             coeff_array[j] = 0;
234 
235         /* for each output chunk */
236         for (Pi = 0; Pi < Pl; Pi++)
237         {
238             /* compute bound on coeffs of output chunk */
239             slong len = 0;
240             mp_limb_t t2, t1, t0, u1, u0;
241 
242             for (i = 0, j = Pi; i < Al && j >= 0; i++, j--)
243             {
244                 if (j < Bl)
245                 {
246                     len += FLINT_MIN(Amain[i + 1] - Amain[i],
247                                      Bmain[j + 1] - Bmain[j]);
248                 }
249             }
250 
251             umul_ppmm(t1, t0, ctx->ffinfo->mod.n - 1, ctx->ffinfo->mod.n - 1);
252             umul_ppmm(t2, t1, t1, len);
253             umul_ppmm(u1, u0, t0, len);
254             add_sssaaaaaa(t2, t1, t0,  t2, t1, UWORD(0),  UWORD(0), u1, u0);
255 
256             if (t2 != UWORD(0))
257             {
258                 /* need three words */
259                 for (i = 0, j = Pi; i < Al && j >= 0; i++, j--)
260                 {
261                     if (j < Bl)
262                     {
263                         _nmod_mpoly_addmul_array1_ulong3(coeff_array,
264                                 A->coeffs + Amain[i],
265                                 Apexp + Amain[i], Amain[i + 1] - Amain[i],
266                                 B->coeffs + Bmain[j],
267                                 Bpexp + Bmain[j], Bmain[j + 1] - Bmain[j]);
268                     }
269                 }
270                 Plen = nmod_mpoly_append_array_sm3_LEX(P, Plen, coeff_array,
271                                       mults, num, array_size, Pl - Pi - 1, ctx);
272 
273             } else if (t1 != UWORD(0))
274             {
275                 /* fits into two words */
276                 for (i = 0, j = Pi; i < Al && j >= 0; i++, j--)
277                 {
278                     if (j < Bl)
279                     {
280                         _nmod_mpoly_addmul_array1_ulong2(coeff_array,
281                                 A->coeffs + Amain[i],
282                                 Apexp + Amain[i], Amain[i + 1] - Amain[i],
283                                 B->coeffs + Bmain[j],
284                                 Bpexp + Bmain[j], Bmain[j + 1] - Bmain[j]);
285 
286                     }
287                 }
288                 Plen = nmod_mpoly_append_array_sm2_LEX(P, Plen, coeff_array,
289                                       mults, num, array_size, Pl - Pi - 1, ctx);
290 
291             } else
292             {
293                 /* fits into one word */
294                 for (i = 0, j = Pi; i < Al && j >= 0; i++, j--)
295                 {
296                     if (j < Bl)
297                     {
298                         _nmod_mpoly_addmul_array1_ulong1(coeff_array,
299                                 A->coeffs + Amain[i],
300                                 Apexp + Amain[i], Amain[i + 1] - Amain[i],
301                                 B->coeffs + Bmain[j],
302                                 Bpexp + Bmain[j], Bmain[j + 1] - Bmain[j]);
303                     }
304                 }
305                 Plen = nmod_mpoly_append_array_sm1_LEX(P, Plen, coeff_array,
306                                       mults, num, array_size, Pl - Pi - 1, ctx);
307             }
308         }
309     }
310 
311     _nmod_mpoly_set_length(P, Plen, ctx);
312 
313     flint_free(Apexp);
314     flint_free(Bpexp);
315     TMP_END;
316 }
317 
318 
319 
_nmod_mpoly_mul_array_LEX(nmod_mpoly_t A,const nmod_mpoly_t B,fmpz * maxBfields,const nmod_mpoly_t C,fmpz * maxCfields,const nmod_mpoly_ctx_t ctx)320 int _nmod_mpoly_mul_array_LEX(
321     nmod_mpoly_t A,
322     const nmod_mpoly_t B,
323     fmpz * maxBfields,
324     const nmod_mpoly_t C,
325     fmpz * maxCfields,
326     const nmod_mpoly_ctx_t ctx)
327 {
328     slong i, exp_bits, array_size;
329     ulong max, * mults;
330     int success;
331     TMP_INIT;
332 
333     FLINT_ASSERT(B->length != 0);
334     FLINT_ASSERT(C->length != 0);
335 
336     FLINT_ASSERT(ctx->minfo->ord == ORD_LEX);
337 
338     FLINT_ASSERT(1 == mpoly_words_per_exp(B->bits, ctx->minfo));
339     FLINT_ASSERT(1 == mpoly_words_per_exp(C->bits, ctx->minfo));
340 
341     TMP_START;
342 
343     /* compute maximum exponents for each variable */
344     mults = (ulong *) TMP_ALLOC(ctx->minfo->nfields*sizeof(ulong));
345 
346     /* the field of index n-1 is the one that wil be pulled out */
347     i = ctx->minfo->nfields - 1;
348     FLINT_ASSERT(fmpz_fits_si(maxBfields + i));
349     FLINT_ASSERT(fmpz_fits_si(maxCfields + i));
350     mults[i] = 1 + fmpz_get_ui(maxBfields + i) + fmpz_get_ui(maxCfields + i);
351     max = mults[i];
352     if (((slong) mults[i]) <= 0 || mults[i] > MAX_LEX_SIZE)
353     {
354         success = 0;
355         goto cleanup;
356     }
357 
358     /* the fields of index n-2...0, contribute to the array size */
359     array_size = WORD(1);
360     for (i--; i >= 0; i--)
361     {
362         ulong hi;
363         FLINT_ASSERT(fmpz_fits_si(maxBfields + i));
364         FLINT_ASSERT(fmpz_fits_si(maxCfields + i));
365         mults[i] = 1 + fmpz_get_ui(maxBfields + i) + fmpz_get_ui(maxCfields + i);
366         max |= mults[i];
367         umul_ppmm(hi, array_size, array_size, mults[i]);
368         if (hi != 0 || (slong) mults[i] <= 0
369                     || array_size <= 0
370                     || array_size > MAX_ARRAY_SIZE)
371         {
372             success = 0;
373             goto cleanup;
374         }
375     }
376 
377     exp_bits = FLINT_MAX(MPOLY_MIN_BITS, FLINT_BIT_COUNT(max) + 1);
378     exp_bits = mpoly_fix_bits(exp_bits, ctx->minfo);
379 
380     /* array multiplication assumes result fit into 1 word */
381     if (1 != mpoly_words_per_exp(exp_bits, ctx->minfo))
382     {
383         success = 0;
384         goto cleanup;
385     }
386 
387     /* handle aliasing and do array multiplication */
388     if (A == B || A == C)
389     {
390         nmod_mpoly_t T;
391         nmod_mpoly_init3(T, B->length + C->length - 1, exp_bits, ctx);
392         _nmod_mpoly_mul_array_chunked_LEX(T, C, B, mults, ctx);
393         nmod_mpoly_swap(T, A, ctx);
394         nmod_mpoly_clear(T, ctx);
395     }
396     else
397     {
398         nmod_mpoly_fit_length(A, B->length + C->length - 1, ctx);
399         nmod_mpoly_fit_bits(A, exp_bits, ctx);
400         A->bits = exp_bits;
401         _nmod_mpoly_mul_array_chunked_LEX(A, C, B, mults, ctx);
402     }
403     success = 1;
404 
405 cleanup:
406 
407     TMP_END;
408 
409     return success;
410 }
411 
412 
413 
414 
415 /****************************************************
416     DEGLEX and DEGREVLEX
417 ****************************************************/
418 
419 void mpoly_main_variable_split_DEG(slong * ind, ulong * pexp, const ulong * Aexp,
420              slong l1, slong Alen, ulong deg, slong num, slong Abits);
421 
422 
423 
424 #define DEGLEX_UNPACK_MACRO(fxn_name, coeff_decl, nonzero_test, reduce_coeff)  \
425 slong fxn_name(nmod_mpoly_t P, slong Plen, coeff_decl,                         \
426                          slong top, slong nvars, slong degb,                   \
427                                              const nmod_mpoly_ctx_t ctx)       \
428 {                                                                              \
429     slong i;                                                                   \
430     ulong exp, lomask = (UWORD(1) << (P->bits - 1)) - 1;                       \
431     slong off, array_size;                                                     \
432     slong * curexp, * degpow;                                                  \
433     ulong * oneexp;                                                            \
434     ulong coeff;                                                               \
435     int carry;                                                                 \
436     TMP_INIT;                                                                  \
437                                                                                \
438     TMP_START;                                                                 \
439     curexp = (slong *) TMP_ALLOC(nvars*sizeof(slong));                         \
440     degpow = (slong *) TMP_ALLOC(nvars*sizeof(slong));                         \
441     oneexp = (ulong *) TMP_ALLOC(nvars*sizeof(ulong));                         \
442     array_size = 1;                                                            \
443     curexp[0] = 0;                                                             \
444     oneexp[0] = 0;                                                             \
445     degpow[0] = 1;                                                             \
446     for (i = 0; i < nvars-1; i++)                                              \
447     {                                                                          \
448         curexp[i] = 0;                                                         \
449         degpow[i] = array_size;                                                \
450         oneexp[i] = (UWORD(1) << (P->bits*(i+1))) - UWORD(1);                  \
451         array_size *= degb;                                                    \
452     }                                                                          \
453     off = 0;                                                                   \
454     if (nvars > 1)                                                             \
455     {                                                                          \
456         curexp[nvars - 2] = top;                                               \
457         off = top * degpow[nvars - 2];                                         \
458     }                                                                          \
459     exp = (top << (P->bits*nvars)) + (top << (P->bits*(nvars-1)));             \
460                                                                                \
461     carry = 1;                                                                 \
462     do {                                                                       \
463         if (nonzero_test)                                                      \
464         {                                                                      \
465             reduce_coeff                                                       \
466             if (coeff != UWORD(0))                                             \
467             {                                                                  \
468                 _nmod_mpoly_fit_length(&P->coeffs, &P->exps, &P->alloc,        \
469                                                                  Plen + 1, 1); \
470                 P->exps[Plen] = exp;                                           \
471                 P->coeffs[Plen] = coeff;                                       \
472                 Plen++;                                                        \
473             }                                                                  \
474         }                                                                      \
475                                                                                \
476         exp -= oneexp[0];                                                      \
477         off -= 1;                                                              \
478         curexp[0] -= 1;                                                        \
479         if (curexp[0] >= 0)                                                    \
480         {                                                                      \
481             carry = 0;                                                         \
482         } else                                                                 \
483         {                                                                      \
484             exp -= curexp[0]*oneexp[0];                                        \
485             off -= curexp[0];                                                  \
486             curexp[0] = 0;                                                     \
487             carry = 1;                                                         \
488                                                                                \
489             for (i = 1; i < nvars - 1; i++)                                    \
490             {                                                                  \
491                 exp -= oneexp[i];                                              \
492                 off -= degpow[i];                                              \
493                 curexp[i] -= 1;                                                \
494                 if (curexp[i] < 0)                                             \
495                 {                                                              \
496                     exp -= curexp[i]*oneexp[i];                                \
497                     off -= curexp[i]*degpow[i];                                \
498                     curexp[i] = 0;                                             \
499                     carry = 1;                                                 \
500                 } else                                                         \
501                 {                                                              \
502                     ulong t = exp & lomask;                                    \
503                     off += t*degpow[i - 1];                                    \
504                     curexp[i - 1] = t;                                         \
505                     exp += t*oneexp[i - 1];                                    \
506                     carry = 0;                                                 \
507                     break;                                                     \
508                 }                                                              \
509             }                                                                  \
510         }                                                                      \
511     } while (!carry);                                                          \
512                                                                                \
513     TMP_END;                                                                   \
514                                                                                \
515     return Plen;                                                               \
516 }
517 
518 DEGLEX_UNPACK_MACRO(
519     nmod_mpoly_append_array_sm1_DEGLEX, ulong * coeff_array
520 ,
521     coeff_array[off] != UWORD(0)
522 ,
523     NMOD_RED(coeff, coeff_array[off], ctx->ffinfo->mod);
524     coeff_array[off] = 0;
525 )
526 
527 DEGLEX_UNPACK_MACRO(
528     nmod_mpoly_append_array_sm2_DEGLEX, ulong * coeff_array
529 ,
530     (coeff_array[2*off + 0] || coeff_array[2*off + 1]) != UWORD(0)
531 ,
532     NMOD2_RED2(coeff, coeff_array[2*off + 1], coeff_array[2*off + 0], ctx->ffinfo->mod);
533     coeff_array[2*off + 0] = coeff_array[2*off + 1] = 0;
534 )
535 
536 DEGLEX_UNPACK_MACRO(
537     nmod_mpoly_append_array_sm3_DEGLEX, ulong * coeff_array
538 ,
539     (coeff_array[3*off + 0] || coeff_array[3*off + 1] || coeff_array[3*off + 2]) != UWORD(0)
540 ,
541     NMOD_RED3(coeff, coeff_array[3*off + 2], coeff_array[3*off + 1], coeff_array[3*off + 0], ctx->ffinfo->mod);
542     coeff_array[3*off + 0] = coeff_array[3*off + 1] = coeff_array[3*off + 2] = 0;
543 )
544 
545 
546 
547 
548 
549 #define DEGREVLEX_UNPACK_MACRO(fxn_name, coeff_decl, nonzero_test, reduce_coeff) \
550 slong fxn_name(nmod_mpoly_t P, slong Plen, coeff_decl,                         \
551                            slong top, slong nvars, slong degb,                 \
552                                             const nmod_mpoly_ctx_t ctx)        \
553 {                                                                              \
554     slong i;                                                                   \
555     ulong exp, mask = UWORD(1) << (P->bits - 1);                               \
556     slong off, array_size;                                                     \
557     slong * curexp, * degpow;                                                  \
558     ulong * oneexp;                                                            \
559     ulong coeff;                                                               \
560     int carry;                                                                 \
561     TMP_INIT;                                                                  \
562                                                                                \
563     TMP_START;                                                                 \
564     curexp = (slong *) TMP_ALLOC(nvars*sizeof(slong));                         \
565     degpow = (slong *) TMP_ALLOC(nvars*sizeof(slong));                         \
566     oneexp = (ulong *) TMP_ALLOC(nvars*sizeof(ulong));                         \
567     array_size = 1;                                                            \
568     oneexp[0] = 0;                                                             \
569     for (i = 0; i < nvars-1; i++) {                                            \
570         curexp[i] = 0;                                                         \
571         degpow[i] = array_size;                                                \
572         oneexp[i] = (UWORD(1) << (P->bits*(i+1))) - UWORD(1);                  \
573         array_size *= degb;                                                    \
574     }                                                                          \
575                                                                                \
576     off = 0;                                                                   \
577     exp = (top << (P->bits*nvars)) + top;                                      \
578                                                                                \
579     do {                                                                       \
580         if (nonzero_test)                                                      \
581         {                                                                      \
582             reduce_coeff                                                       \
583             if (coeff != UWORD(0))                                             \
584             {                                                                  \
585                 _nmod_mpoly_fit_length(&P->coeffs, &P->exps, &P->alloc,        \
586                                                                  Plen + 1, 1); \
587                 P->exps[Plen] = exp;                                           \
588                 P->coeffs[Plen] = coeff;                                       \
589                 Plen++;                                                        \
590             }                                                                  \
591         }                                                                      \
592                                                                                \
593         exp += oneexp[0];                                                      \
594         off += 1;                                                              \
595         curexp[0] += 1;                                                        \
596         if ((exp & mask) == 0)                                                 \
597         {                                                                      \
598             carry = (nvars - 1 == 0);                                          \
599         } else                                                                 \
600         {                                                                      \
601             carry = 1;                                                         \
602             exp -= curexp[0]*oneexp[0];                                        \
603             off -= curexp[0];                                                  \
604             curexp[0] = 0;                                                     \
605             for (i = 1; i < nvars - 1; i++)                                    \
606             {                                                                  \
607                 exp += oneexp[i];                                              \
608                 off += degpow[i];                                              \
609                 curexp[i] += 1;                                                \
610                 if ((exp & mask) == 0)                                         \
611                 {                                                              \
612                     carry = 0;                                                 \
613                     break;                                                     \
614                 } else {                                                       \
615                     carry = 1;                                                 \
616                     exp -= curexp[i]*oneexp[i];                                \
617                     off -= curexp[i]*degpow[i];                                \
618                     curexp[i] = 0;                                             \
619                 }                                                              \
620             }                                                                  \
621         }                                                                      \
622     } while (!carry);                                                          \
623                                                                                \
624     TMP_END;                                                                   \
625                                                                                \
626     return Plen;                                                               \
627 }
628 
629 DEGREVLEX_UNPACK_MACRO(
630     nmod_mpoly_append_array_sm1_DEGREVLEX, ulong * coeff_array
631 ,
632     coeff_array[off] != WORD(0)
633 ,
634     NMOD_RED(coeff, coeff_array[off], ctx->ffinfo->mod);
635     coeff_array[off] = 0;
636 )
637 
638 DEGREVLEX_UNPACK_MACRO(
639     nmod_mpoly_append_array_sm2_DEGREVLEX, ulong * coeff_array
640 ,
641     (coeff_array[2*off + 0] || coeff_array[2*off + 1]) != WORD(0)
642 ,
643     NMOD2_RED2(coeff, coeff_array[2*off + 1], coeff_array[2*off + 0], ctx->ffinfo->mod);
644     coeff_array[2*off + 0] = coeff_array[2*off + 1] = 0;
645 )
646 
647 DEGREVLEX_UNPACK_MACRO(
648     nmod_mpoly_append_array_sm3_DEGREVLEX, ulong * coeff_array
649 ,
650     (coeff_array[3*off + 0] || coeff_array[3*off + 1] || coeff_array[3*off + 2]) != WORD(0)
651 ,
652     NMOD_RED3(coeff, coeff_array[3*off + 2], coeff_array[3*off + 1], coeff_array[3*off + 0], ctx->ffinfo->mod);
653     coeff_array[3*off + 0] = coeff_array[3*off + 1] = coeff_array[3*off + 2] = 0;
654 )
655 
656 
_nmod_mpoly_mul_array_chunked_DEG(nmod_mpoly_t P,const nmod_mpoly_t A,const nmod_mpoly_t B,ulong degb,const nmod_mpoly_ctx_t ctx)657 void _nmod_mpoly_mul_array_chunked_DEG(
658     nmod_mpoly_t P,
659     const nmod_mpoly_t A,
660     const nmod_mpoly_t B,
661     ulong degb,
662     const nmod_mpoly_ctx_t ctx)
663 {
664     slong nvars = ctx->minfo->nvars;
665     slong Pi, i, j, Plen, Pl, Al, Bl, array_size;
666     slong * Amain, * Bmain;
667     ulong * Apexp, * Bpexp;
668     slong (* upack_sm1)(nmod_mpoly_t, slong, ulong *, slong, slong, slong, const nmod_mpoly_ctx_t);
669     slong (* upack_sm2)(nmod_mpoly_t, slong, ulong *, slong, slong, slong, const nmod_mpoly_ctx_t);
670     slong (* upack_sm3)(nmod_mpoly_t, slong, ulong *, slong, slong, slong, const nmod_mpoly_ctx_t);
671     TMP_INIT;
672 
673     TMP_START;
674 
675     /* compute lengths of poly2 and poly3 in chunks */
676     Al = 1 + (slong) (A->exps[0] >> (A->bits*nvars));
677     Bl = 1 + (slong) (B->exps[0] >> (B->bits*nvars));
678 
679     array_size = 1;
680     for (i = 0; i < nvars-1; i++) {
681         array_size *= degb;
682     }
683 
684     upack_sm1  = &nmod_mpoly_append_array_sm1_DEGLEX;
685     upack_sm2  = &nmod_mpoly_append_array_sm2_DEGLEX;
686     upack_sm3  = &nmod_mpoly_append_array_sm3_DEGLEX;
687     if (ctx->minfo->ord == ORD_DEGREVLEX) {
688         upack_sm1  = &nmod_mpoly_append_array_sm1_DEGREVLEX;
689         upack_sm2  = &nmod_mpoly_append_array_sm2_DEGREVLEX;
690         upack_sm3  = &nmod_mpoly_append_array_sm3_DEGREVLEX;
691     }
692 
693     /* compute indices and lengths of coefficients of polys in main variable */
694     Amain = (slong *) TMP_ALLOC((Al + 1)*sizeof(slong));
695     Bmain = (slong *) TMP_ALLOC((Bl + 1)*sizeof(slong));
696     Apexp = (ulong *) flint_malloc(A->length*sizeof(ulong));
697     Bpexp = (ulong *) flint_malloc(B->length*sizeof(ulong));
698     mpoly_main_variable_split_DEG(Amain, Apexp, A->exps, Al, A->length,
699                                                          degb, nvars, A->bits);
700     mpoly_main_variable_split_DEG(Bmain, Bpexp, B->exps, Bl, B->length,
701                                                          degb, nvars, B->bits);
702 
703     Pl = Al + Bl - 1;
704     FLINT_ASSERT(Pl == degb);
705     Plen = 0;
706 
707     {
708         ulong * coeff_array = (ulong *) TMP_ALLOC(3*array_size*sizeof(ulong));
709         for (j = 0; j < 3*array_size; j++)
710             coeff_array[j] = 0;
711 
712         /* for each output chunk */
713         for (Pi = 0; Pi < Pl; Pi++)
714         {
715             /* compute bound on coeffs of output chunk */
716             slong len = 0;
717             mp_limb_t t2, t1, t0, u1, u0;
718 
719             for (i = 0, j = Pi; i < Al && j >= 0; i++, j--)
720             {
721                 if (j < Bl)
722                 {
723                     len += FLINT_MIN(Amain[i + 1] - Amain[i],
724                                      Bmain[j + 1] - Bmain[j]);
725                 }
726             }
727 
728             umul_ppmm(t1, t0, ctx->ffinfo->mod.n - 1, ctx->ffinfo->mod.n - 1);
729             umul_ppmm(t2, t1, t1, len);
730             umul_ppmm(u1, u0, t0, len);
731             add_sssaaaaaa(t2, t1, t0,  t2, t1, UWORD(0),  UWORD(0), u1, u0);
732 
733             if (t2 != UWORD(0))
734             {
735                 /* need three words */
736                 for (i = 0, j = Pi; i < Al && j >= 0; i++, j--)
737                 {
738                     if (j < Bl)
739                     {
740                         _nmod_mpoly_addmul_array1_ulong3(coeff_array,
741                                 A->coeffs + Amain[i],
742                                 Apexp + Amain[i], Amain[i + 1] - Amain[i],
743                                 B->coeffs + Bmain[j],
744                                 Bpexp + Bmain[j], Bmain[j + 1] - Bmain[j]);
745                     }
746                 }
747 
748                 Plen = upack_sm3(P, Plen, coeff_array, Pl - Pi - 1, nvars, degb, ctx);
749 
750             } else if (t1 != UWORD(0))
751             {
752                 /* fits into two words */
753                 for (i = 0, j = Pi; i < Al && j >= 0; i++, j--)
754                 {
755                     if (j < Bl)
756                     {
757                         _nmod_mpoly_addmul_array1_ulong2(coeff_array,
758                                 A->coeffs + Amain[i],
759                                 Apexp + Amain[i], Amain[i + 1] - Amain[i],
760                                 B->coeffs + Bmain[j],
761                                 Bpexp + Bmain[j], Bmain[j + 1] - Bmain[j]);
762 
763                     }
764                 }
765                 Plen = upack_sm2(P, Plen, coeff_array, Pl - Pi - 1, nvars, degb, ctx);
766 
767             } else if (t0 != UWORD(0))
768             {
769                 /* fits into one word */
770                 for (i = 0, j = Pi; i < Al && j >= 0; i++, j--)
771                 {
772                     if (j < Bl)
773                     {
774                         _nmod_mpoly_addmul_array1_ulong1(coeff_array,
775                                 A->coeffs + Amain[i],
776                                 Apexp + Amain[i], Amain[i + 1] - Amain[i],
777                                 B->coeffs + Bmain[j],
778                                 Bpexp + Bmain[j], Bmain[j + 1] - Bmain[j]);
779                     }
780                 }
781                 Plen = upack_sm1(P, Plen, coeff_array, Pl - Pi - 1, nvars, degb, ctx);
782             }
783         }
784     }
785 
786     _nmod_mpoly_set_length(P, Plen, ctx);
787 
788     flint_free(Apexp);
789     flint_free(Bpexp);
790     TMP_END;
791 }
792 
793 
794 
_nmod_mpoly_mul_array_DEG(nmod_mpoly_t A,const nmod_mpoly_t B,fmpz * maxBfields,const nmod_mpoly_t C,fmpz * maxCfields,const nmod_mpoly_ctx_t ctx)795 int _nmod_mpoly_mul_array_DEG(
796     nmod_mpoly_t A,
797     const nmod_mpoly_t B, fmpz * maxBfields,
798     const nmod_mpoly_t C, fmpz * maxCfields,
799     const nmod_mpoly_ctx_t ctx)
800 {
801     slong i, exp_bits, array_size;
802     ulong deg;
803     int success;
804 
805     FLINT_ASSERT(B->length != 0);
806     FLINT_ASSERT(C->length != 0);
807 
808     FLINT_ASSERT(  ctx->minfo->ord == ORD_DEGREVLEX
809                 || ctx->minfo->ord == ORD_DEGLEX);
810 
811     FLINT_ASSERT(1 == mpoly_words_per_exp(B->bits, ctx->minfo));
812     FLINT_ASSERT(1 == mpoly_words_per_exp(C->bits, ctx->minfo));
813 
814 
815     /* the field of index n-1 is the one that wil be pulled out */
816     i = ctx->minfo->nfields - 1;
817     FLINT_ASSERT(fmpz_fits_si(maxBfields + i));
818     FLINT_ASSERT(fmpz_fits_si(maxCfields + i));
819     deg = 1 + fmpz_get_ui(maxBfields + i) + fmpz_get_ui(maxCfields + i);
820     if (((slong) deg) <= 0 || deg > MAX_ARRAY_SIZE)
821     {
822         success = 0;
823         goto cleanup;
824     }
825 
826     /* the fields of index n-2...1, contribute to the array size */
827     array_size = WORD(1);
828     for (i--; i >= 1; i--)
829     {
830         ulong hi;
831         umul_ppmm(hi, array_size, array_size, deg);
832         if (hi != WORD(0) || array_size <= 0
833                           || array_size > MAX_ARRAY_SIZE)
834         {
835             success = 0;
836             goto cleanup;
837         }
838     }
839 
840     exp_bits = FLINT_MAX(MPOLY_MIN_BITS, FLINT_BIT_COUNT(deg) + 1);
841     exp_bits = mpoly_fix_bits(exp_bits, ctx->minfo);
842 
843     /* array multiplication assumes result fit into 1 word */
844     if (1 != mpoly_words_per_exp(exp_bits, ctx->minfo))
845     {
846         success = 0;
847         goto cleanup;
848     }
849 
850     /* handle aliasing and do array multiplication */
851     if (A == B || A == C)
852     {
853         nmod_mpoly_t T;
854         nmod_mpoly_init3(T, B->length + C->length - 1, exp_bits, ctx);
855         _nmod_mpoly_mul_array_chunked_DEG(T, C, B, deg, ctx);
856         nmod_mpoly_swap(T, A, ctx);
857         nmod_mpoly_clear(T, ctx);
858     }
859     else
860     {
861         nmod_mpoly_fit_length(A, B->length + C->length - 1, ctx);
862         nmod_mpoly_fit_bits(A, exp_bits, ctx);
863         A->bits = exp_bits;
864         _nmod_mpoly_mul_array_chunked_DEG(A, C, B, deg, ctx);
865     }
866     success = 1;
867 
868 cleanup:
869 
870     return success;
871 }
872 
873 
874 
nmod_mpoly_mul_array(nmod_mpoly_t A,const nmod_mpoly_t B,const nmod_mpoly_t C,const nmod_mpoly_ctx_t ctx)875 int nmod_mpoly_mul_array(nmod_mpoly_t A, const nmod_mpoly_t B,
876                               const nmod_mpoly_t C, const nmod_mpoly_ctx_t ctx)
877 {
878     slong i;
879     int success;
880     fmpz * maxBfields, * maxCfields;
881     TMP_INIT;
882 
883     if (B->length == 0 || C->length == 0)
884     {
885         nmod_mpoly_zero(A, ctx);
886         return 1;
887     }
888 
889     if (  1 != mpoly_words_per_exp(B->bits, ctx->minfo)
890        || 1 != mpoly_words_per_exp(C->bits, ctx->minfo)
891        )
892     {
893         return 0;
894     }
895 
896     TMP_START;
897 
898     maxBfields = (fmpz *) TMP_ALLOC(ctx->minfo->nfields*sizeof(fmpz));
899     maxCfields = (fmpz *) TMP_ALLOC(ctx->minfo->nfields*sizeof(fmpz));
900     for (i = 0; i < ctx->minfo->nfields; i++)
901     {
902         fmpz_init(maxBfields + i);
903         fmpz_init(maxCfields + i);
904     }
905     mpoly_max_fields_fmpz(maxBfields, B->exps, B->length, B->bits, ctx->minfo);
906     mpoly_max_fields_fmpz(maxCfields, C->exps, C->length, C->bits, ctx->minfo);
907 
908     switch (ctx->minfo->ord)
909     {
910         case ORD_LEX:
911         {
912             success = _nmod_mpoly_mul_array_LEX(A, B, maxBfields,
913                                                    C, maxCfields, ctx);
914             break;
915         }
916         case ORD_DEGLEX:
917         case ORD_DEGREVLEX:
918         {
919             success = _nmod_mpoly_mul_array_DEG(A, B, maxBfields,
920                                                    C, maxCfields, ctx);
921             break;
922         }
923         default:
924         {
925             success = 0;
926             break;
927         }
928     }
929 
930     for (i = 0; i < ctx->minfo->nfields; i++)
931     {
932         fmpz_clear(maxBfields + i);
933         fmpz_clear(maxCfields + i);
934     }
935 
936     TMP_END;
937     return success;
938 }
939