1 /*
2     Copyright (C) 2019 Daniel Schultz
3 
4     This file is part of FLINT.
5 
6     FLINT is free software: you can redistribute it and/or modify it under
7     the terms of the GNU Lesser General Public License (LGPL) as published
8     by the Free Software Foundation; either version 2.1 of the License, or
9     (at your option) any later version.  See <http://www.gnu.org/licenses/>.
10 */
11 
12 #include "thread_pool.h"
13 #include "fmpz_mpoly.h"
14 
15 /*
16     a thread safe mpoly supports three mutating operations
17     - init from an array of terms
18     - append an array of terms
19     - clear out contents to a normal mpoly
20 */
21 typedef struct _fmpz_mpoly_ts_struct
22 {
23     fmpz * volatile coeffs; /* this is coeff_array[idx] */
24     ulong * volatile exps;       /* this is exp_array[idx] */
25     volatile slong length;
26     slong alloc;
27     flint_bitcnt_t bits;
28     flint_bitcnt_t idx;
29     ulong * exp_array[FLINT_BITS];
30     fmpz * coeff_array[FLINT_BITS];
31 } fmpz_mpoly_ts_struct;
32 
33 typedef fmpz_mpoly_ts_struct fmpz_mpoly_ts_t[1];
34 
35 /* Bcoeff is changed */
fmpz_mpoly_ts_init(fmpz_mpoly_ts_t A,fmpz * Bcoeff,ulong * Bexp,slong Blen,flint_bitcnt_t bits,slong N)36 void fmpz_mpoly_ts_init(fmpz_mpoly_ts_t A,
37                               fmpz * Bcoeff, ulong * Bexp, slong Blen,
38                                                     flint_bitcnt_t bits, slong N)
39 {
40     slong i;
41     flint_bitcnt_t idx = FLINT_BIT_COUNT(Blen);
42     idx = (idx <= 8) ? 0 : idx - 8;
43     for (i = 0; i < FLINT_BITS; i++)
44     {
45         A->exp_array[i] = NULL;
46         A->coeff_array[i] = NULL;
47     }
48     A->bits = bits;
49     A->idx = idx;
50     A->alloc = WORD(256) << idx;
51     A->exps = A->exp_array[idx]
52             = (ulong *) flint_malloc(N*A->alloc*sizeof(ulong));
53     A->coeffs = A->coeff_array[idx]
54               = (fmpz *) flint_calloc(A->alloc, sizeof(fmpz));
55     A->length = Blen;
56     for (i = 0; i < Blen; i++)
57     {
58         fmpz_swap(A->coeffs + i, Bcoeff + i);
59         mpoly_monomial_set(A->exps + N*i, Bexp + N*i, N);
60     }
61 }
62 
fmpz_mpoly_ts_print(const fmpz_mpoly_ts_t B,const char ** x,const fmpz_mpoly_ctx_t ctx)63 void fmpz_mpoly_ts_print(const fmpz_mpoly_ts_t B, const char ** x,
64                                                     const fmpz_mpoly_ctx_t ctx)
65 {
66     fmpz_mpoly_t A;
67     A->length = B->length;
68     A->alloc = B->alloc;
69     A->coeffs = B->coeffs;
70     A->exps = B->exps;
71     A->bits = B->bits;
72     fmpz_mpoly_print_pretty(A, x, ctx);
73 
74     fmpz_mpoly_assert_canonical(A, ctx);
75 }
76 
fmpz_mpoly_ts_clear(fmpz_mpoly_ts_t A)77 void fmpz_mpoly_ts_clear(fmpz_mpoly_ts_t A)
78 {
79     slong i;
80 
81     for (i = 0; i < A->length; i++)
82     {
83         fmpz_clear(A->coeffs + i);
84     }
85 
86     for (i = 0; i < FLINT_BITS; i++)
87     {
88         if (A->exp_array[i] != NULL)
89         {
90             FLINT_ASSERT(A->coeff_array[i] != NULL);
91             flint_free(A->coeff_array[i]);
92             flint_free(A->exp_array[i]);
93         }
94     }
95 }
96 
fmpz_mpoly_ts_clear_poly(fmpz_mpoly_t Q,fmpz_mpoly_ts_t A)97 void fmpz_mpoly_ts_clear_poly(fmpz_mpoly_t Q, fmpz_mpoly_ts_t A)
98 {
99     if (Q->alloc != 0)
100     {
101         slong i;
102 
103         FLINT_ASSERT(Q->exps != NULL);
104         FLINT_ASSERT(Q->coeffs != NULL);
105 
106         for (i = 0; i < Q->alloc; i++)
107         {
108             _fmpz_demote(Q->coeffs + i);
109         }
110         flint_free(Q->exps);
111         flint_free(Q->coeffs);
112     }
113 
114     Q->exps = A->exps;
115     Q->coeffs = A->coeffs;
116     Q->bits = A->bits;
117     Q->alloc = A->alloc;
118     Q->length = A->length;
119 
120     A->length = 0;
121     A->coeff_array[A->idx] = NULL;
122     A->exp_array[A->idx] = NULL;
123     fmpz_mpoly_ts_clear(A);
124 }
125 
126 
127 /* put B on the end of A - Bcoeff is changed*/
fmpz_mpoly_ts_append(fmpz_mpoly_ts_t A,fmpz * Bcoeff,ulong * Bexps,slong Blen,slong N)128 void fmpz_mpoly_ts_append(fmpz_mpoly_ts_t A,
129                              fmpz * Bcoeff, ulong * Bexps, slong Blen, slong N)
130 {
131 /* TODO: this needs barriers on non-x86 */
132 
133     slong i;
134     ulong * oldexps = A->exps;
135     fmpz * oldcoeffs = A->coeffs;
136     slong oldlength = A->length;
137     slong newlength = A->length + Blen;
138 
139     if (newlength <= A->alloc)
140     {
141         /* write new terms first */
142         for (i = 0; i < Blen; i++)
143         {
144             fmpz_swap(oldcoeffs + oldlength + i, Bcoeff + i);
145             mpoly_monomial_set(oldexps + N*(oldlength + i), Bexps + N*i, N);
146         }
147     }
148     else
149     {
150         slong newalloc;
151         ulong * newexps;
152         fmpz * newcoeffs;
153         flint_bitcnt_t newidx;
154         newidx = FLINT_BIT_COUNT(newlength - 1);
155         newidx = (newidx > 8) ? newidx - 8 : 0;
156         FLINT_ASSERT(newidx > A->idx);
157 
158         newalloc = UWORD(256) << newidx;
159         FLINT_ASSERT(newlength <= newalloc);
160         newexps = A->exp_array[newidx]
161                 = (ulong *) flint_malloc(N*newalloc*sizeof(ulong));
162         newcoeffs = A->coeff_array[newidx]
163                   = (fmpz *) flint_calloc(newalloc, sizeof(fmpz));
164 
165         for (i = 0; i < oldlength; i++)
166         {
167             newcoeffs[i] = oldcoeffs[i]; /* just copy the bits */
168             mpoly_monomial_set(newexps + N*i, oldexps + N*i, N);
169         }
170         for (i = 0; i < Blen; i++)
171         {
172             fmpz_swap(newcoeffs + oldlength + i, Bcoeff + i);
173             mpoly_monomial_set(newexps + N*(oldlength + i), Bexps + N*i, N);
174         }
175 
176         A->alloc = newalloc;
177         A->exps = newexps;
178         A->coeffs = newcoeffs;
179         A->idx = newidx;
180 
181         /* do not free oldcoeff/exps as other threads may be using them */
182     }
183 
184     /* update length at the very end */
185     A->length = newlength;
186 }
187 
188 
189 /*
190     a chunk holds an exponent range on the dividend
191 */
192 typedef struct _divides_heap_chunk_struct
193 {
194     fmpz_mpoly_t polyC;
195     struct _divides_heap_chunk_struct * next;
196     ulong * emin;
197     ulong * emax;
198     slong startidx;
199     slong endidx;
200     int upperclosed;
201     volatile int lock;
202     volatile int producer;
203     volatile slong ma;
204     volatile slong mq;
205     int Cinited;
206 } divides_heap_chunk_struct;
207 
208 typedef divides_heap_chunk_struct divides_heap_chunk_t[1];
209 
210 /*
211     the base struct includes a linked list of chunks
212 */
213 typedef struct
214 {
215 #if HAVE_PTHREAD
216     pthread_mutex_t mutex;
217 #endif
218     divides_heap_chunk_struct * head;
219     divides_heap_chunk_struct * tail;
220     divides_heap_chunk_struct * volatile cur;
221     fmpz_mpoly_t polyA;
222     fmpz_mpoly_t polyB;
223     fmpz_mpoly_ts_t polyQ;
224     const fmpz_mpoly_ctx_struct * ctx;
225     slong length;
226     slong N;
227     flint_bitcnt_t bits;
228     slong polyBcoeff_bits;
229     ulong * cmpmask;
230     int failed;
231 } divides_heap_base_struct;
232 
233 typedef divides_heap_base_struct divides_heap_base_t[1];
234 
235 /*
236     the worker stuct has a big chunk of memory in the stripe_t
237     and two polys for work space
238 */
239 typedef struct _worker_arg_struct
240 {
241     divides_heap_base_struct * H;
242     fmpz_mpoly_stripe_t S;
243     fmpz_mpoly_t polyT1;
244     fmpz_mpoly_t polyT2;
245 } worker_arg_struct;
246 
247 typedef worker_arg_struct worker_arg_t[1];
248 
249 
divides_heap_base_init(divides_heap_base_t H)250 static void divides_heap_base_init(divides_heap_base_t H)
251 {
252     H->head = NULL;
253     H->tail = NULL;
254     H->cur = NULL;
255     H->ctx = NULL;
256     H->length = 0;
257     H->N = 0;
258     H->bits = 0;
259     H->cmpmask = NULL;
260 }
261 
divides_heap_chunk_clear(divides_heap_chunk_t L,divides_heap_base_t H)262 static void divides_heap_chunk_clear(divides_heap_chunk_t L, divides_heap_base_t H)
263 {
264     if (L->Cinited)
265     {
266         fmpz_mpoly_clear(L->polyC, H->ctx);
267     }
268 }
divides_heap_base_clear(fmpz_mpoly_t Q,divides_heap_base_t H)269 static int divides_heap_base_clear(fmpz_mpoly_t Q, divides_heap_base_t H)
270 {
271     divides_heap_chunk_struct * L = H->head;
272     while (L != NULL)
273     {
274         divides_heap_chunk_struct * nextL = L->next;
275         divides_heap_chunk_clear(L, H);
276         flint_free(L);
277         L = nextL;
278     }
279     H->head = NULL;
280     H->tail = NULL;
281     H->cur = NULL;
282     H->ctx = NULL;
283     H->length = 0;
284     H->N = 0;
285     H->bits = 0;
286     H->cmpmask = NULL;
287 
288     if (H->failed)
289     {
290         fmpz_mpoly_zero(Q, H->ctx);
291         fmpz_mpoly_ts_clear(H->polyQ);
292         return 0;
293     }
294     else
295     {
296         fmpz_mpoly_ts_clear_poly(Q, H->polyQ);
297         return 1;
298     }
299 }
300 
divides_heap_base_add_chunk(divides_heap_base_t H,divides_heap_chunk_t L)301 static void divides_heap_base_add_chunk(divides_heap_base_t H, divides_heap_chunk_t L)
302 {
303     L->next = NULL;
304 
305     if (H->tail == NULL)
306     {
307         FLINT_ASSERT(H->head == NULL);
308         H->tail = L;
309         H->head = L;
310     }
311     else
312     {
313         divides_heap_chunk_struct * tail = H->tail;
314         FLINT_ASSERT(tail->next == NULL);
315         tail->next = L;
316         H->tail = L;
317     }
318     H->length++;
319 }
320 
321 
322 /*
323     A = D - (a stripe of B * C)
324     S->startidx and S->endidx are assumed to be correct
325         that is, we expect and successive calls to keep
326             B decreasing
327             C the same
328 
329     saveD: 0 means we can modify coeffs of input D
330            1 means we must not modify coeffs of input D
331 */
_fmpz_mpoly_mulsub_stripe1(fmpz ** A_coeff,ulong ** A_exp,slong * A_alloc,const fmpz * Dcoeff,const ulong * Dexp,slong Dlen,int saveD,const fmpz * Bcoeff,const ulong * Bexp,slong Blen,const fmpz * Ccoeff,const ulong * Cexp,slong Clen,const fmpz_mpoly_stripe_t S)332 slong _fmpz_mpoly_mulsub_stripe1(fmpz ** A_coeff, ulong ** A_exp, slong * A_alloc,
333                          const fmpz * Dcoeff, const ulong * Dexp, slong Dlen, int saveD,
334                          const fmpz * Bcoeff, const ulong * Bexp, slong Blen,
335                          const fmpz * Ccoeff, const ulong * Cexp, slong Clen,
336                                                    const fmpz_mpoly_stripe_t S)
337 {
338     int upperclosed;
339     slong startidx, endidx;
340     ulong prev_startidx;
341     ulong maskhi = S->cmpmask[0];
342     ulong emax = S->emax[0];
343     ulong emin = S->emin[0];
344     slong i, j;
345     slong next_loc = Blen + 4;   /* something bigger than heap can ever be */
346     slong heap_len = 1; /* heap zero index unused */
347     mpoly_heap1_s * heap;
348     mpoly_heap_t * chain;
349     slong * store, * store_base;
350     mpoly_heap_t * x;
351     slong Di;
352     slong Alen;
353     slong Aalloc = *A_alloc;
354     fmpz * Acoeff = *A_coeff;
355     ulong * Aexp = *A_exp;
356     ulong exp;
357     slong * ends;
358     ulong texp;
359     slong * hind;
360     int small;
361     ulong acc_sm[3], pp0, pp1;
362 
363     FLINT_ASSERT(S->N == 1);
364 
365     i = 0;
366     hind = (slong *)(S->big_mem + i);
367     i += Blen*sizeof(slong);
368     ends = (slong *)(S->big_mem + i);
369     i += Blen*sizeof(slong);
370     store = store_base = (slong *) (S->big_mem + i);
371     i += 2*Blen*sizeof(slong);
372     heap = (mpoly_heap1_s *)(S->big_mem + i);
373     i += (Blen + 1)*sizeof(mpoly_heap1_s);
374     chain = (mpoly_heap_t *)(S->big_mem + i);
375     i += Blen*sizeof(mpoly_heap_t);
376     FLINT_ASSERT(i <= S->big_mem_alloc);
377 
378     startidx = *S->startidx;
379     endidx = *S->endidx;
380     upperclosed = S->upperclosed;
381     emax = S->emax[0];
382     emin = S->emin[0];
383 
384     /* put all the starting nodes on the heap */
385     prev_startidx = -UWORD(1);
386     for (i = 0; i < Blen; i++)
387     {
388         if (startidx < Clen)
389         {
390             texp = Bexp[i] + Cexp[startidx];
391             FLINT_ASSERT(mpoly_monomial_cmp1(emax, texp, maskhi)
392                                                                > -upperclosed);
393         }
394         while (startidx > 0)
395         {
396             texp = Bexp[i] + Cexp[startidx - 1];
397             if (mpoly_monomial_cmp1(emax, texp, maskhi) <= -upperclosed)
398             {
399                 break;
400             }
401             startidx--;
402         }
403 
404         if (endidx < Clen)
405         {
406             texp = Bexp[i] + Cexp[endidx];
407             FLINT_ASSERT(mpoly_monomial_cmp1(emin, texp, maskhi) > 0);
408         }
409         while (endidx > 0)
410         {
411             texp = Bexp[i] + Cexp[endidx - 1];
412             if (mpoly_monomial_cmp1(emin, texp, maskhi) <= 0)
413             {
414                 break;
415             }
416             endidx--;
417         }
418 
419         ends[i] = endidx;
420 
421         hind[i] = 2*startidx + 1;
422 
423         if (  (startidx < endidx)
424            && (((ulong)startidx) < prev_startidx)
425            )
426         {
427             x = chain + i;
428             x->i = i;
429             x->j = startidx;
430             x->next = NULL;
431             hind[x->i] = 2*(x->j + 1) + 0;
432             _mpoly_heap_insert1(heap, Bexp[x->i] + Cexp[x->j], x,
433                                       &next_loc, &heap_len, maskhi);
434         }
435 
436         prev_startidx = startidx;
437     }
438 
439     /* set the indices for the next time mul is called */
440     *S->startidx = startidx;
441     *S->endidx = endidx;
442 
443     /* whether input coeffs are small, thus output coeffs fit in three words */
444     FLINT_ASSERT(ends[0] >= startidx);
445     FLINT_ASSERT(S->coeff_bits == FLINT_ABS(_fmpz_vec_max_bits(Ccoeff, Clen)));
446     small = S->coeff_bits <= FLINT_BITS - 2
447             && _fmpz_mpoly_fits_small(Bcoeff, Blen)
448             && FLINT_ABS(_fmpz_vec_max_bits(Dcoeff, Dlen)) < 3*FLINT_BITS - 3;
449 
450     Alen = 0;
451     Di = 0;
452     while (heap_len > 1)
453     {
454         exp = heap[1].exp;
455 
456         while (Di < Dlen && mpoly_monomial_gt1(Dexp[Di], exp, maskhi))
457         {
458             _fmpz_mpoly_fit_length(&Acoeff, &Aexp, &Aalloc, Alen + 1, 1);
459             Aexp[Alen] = Dexp[Di];
460             if (saveD)
461                 fmpz_set(Acoeff + Alen, Dcoeff + Di);
462             else
463                 fmpz_swap(Acoeff + Alen, (fmpz *)(Dcoeff + Di));
464             Alen++;
465             Di++;
466         }
467 
468         _fmpz_mpoly_fit_length(&Acoeff, &Aexp, &Aalloc, Alen + 1, 1);
469 
470         Aexp[Alen] = exp;
471 
472         if (small)
473         {
474             acc_sm[0] = acc_sm[1] = acc_sm[2] = 0;
475             if (Di < Dlen && Dexp[Di] == exp)
476             {
477                 _fmpz_mpoly_add_uiuiui_fmpz(acc_sm, Dcoeff + Di);
478                 Di++;
479             }
480 
481             do
482             {
483                 x = _mpoly_heap_pop1(heap, &heap_len, maskhi);
484 
485                 hind[x->i] |= WORD(1);
486                 *store++ = x->i;
487                 *store++ = x->j;
488                 FLINT_ASSERT(startidx <= x->j);
489                 FLINT_ASSERT(x->j < ends[0]);
490                 FLINT_ASSERT(!COEFF_IS_MPZ(Bcoeff[x->i]));
491                 FLINT_ASSERT(!COEFF_IS_MPZ(Ccoeff[x->j]));
492                 smul_ppmm(pp1, pp0, Bcoeff[x->i], Ccoeff[x->j]);
493                 sub_dddmmmsss(acc_sm[2], acc_sm[1], acc_sm[0],
494                               acc_sm[2], acc_sm[1], acc_sm[0],
495                               FLINT_SIGN_EXT(pp1), pp1, pp0);
496 
497                 while ((x = x->next) != NULL)
498                 {
499                     hind[x->i] |= WORD(1);
500                     *store++ = x->i;
501                     *store++ = x->j;
502                     FLINT_ASSERT(startidx <= x->j);
503                     FLINT_ASSERT(x->j < ends[0]);
504                     FLINT_ASSERT(!COEFF_IS_MPZ(Bcoeff[x->i]));
505                     FLINT_ASSERT(!COEFF_IS_MPZ(Ccoeff[x->j]));
506                     smul_ppmm(pp1, pp0, Bcoeff[x->i], Ccoeff[x->j]);
507                     sub_dddmmmsss(acc_sm[2], acc_sm[1], acc_sm[0],
508                                   acc_sm[2], acc_sm[1], acc_sm[0],
509                                   FLINT_SIGN_EXT(pp1), pp1, pp0);
510                 }
511             } while (heap_len > 1 && heap[1].exp == exp);
512 
513             fmpz_set_signed_uiuiui(Acoeff + Alen,
514                                              acc_sm[2], acc_sm[1], acc_sm[0]);
515         }
516         else
517         {
518             if (Di < Dlen && Dexp[Di] == exp)
519             {
520                 fmpz_set(Acoeff + Alen, Dcoeff + Di);
521                 Di++;
522             }
523             else
524             {
525                 fmpz_zero(Acoeff + Alen);
526             }
527 
528             do
529             {
530                 x = _mpoly_heap_pop1(heap, &heap_len, maskhi);
531 
532                 hind[x->i] |= WORD(1);
533                 *store++ = x->i;
534                 *store++ = x->j;
535                 fmpz_submul(Acoeff + Alen, Bcoeff + x->i, Ccoeff + x->j);
536 
537                 while ((x = x->next) != NULL)
538                 {
539                     hind[x->i] |= WORD(1);
540                     *store++ = x->i;
541                     *store++ = x->j;
542                     fmpz_submul(Acoeff + Alen, Bcoeff + x->i, Ccoeff + x->j);
543                 }
544             } while (heap_len > 1 && heap[1].exp == exp);
545         }
546 
547         Alen += !fmpz_is_zero(Acoeff + Alen);
548 
549         /* process nodes taken from the heap */
550         while (store > store_base)
551         {
552             j = *--store;
553             i = *--store;
554 
555             /* should we go right? */
556             if (  (i + 1 < Blen)
557                && (j + 0 < ends[i + 1])
558                && (hind[i + 1] == 2*j + 1)
559                )
560             {
561                 x = chain + i + 1;
562                 x->i = i + 1;
563                 x->j = j;
564                 x->next = NULL;
565 
566                 hind[x->i] = 2*(x->j + 1) + 0;
567                 _mpoly_heap_insert1(heap, Bexp[x->i] + Cexp[x->j], x,
568                                                  &next_loc, &heap_len, maskhi);
569             }
570 
571             /* should we go up? */
572             if (  (j + 1 < ends[i + 0])
573                && ((hind[i] & 1) == 1)
574                && (  (i == 0)
575                   || (hind[i - 1] >= 2*(j + 2) + 1)
576                   )
577                )
578             {
579                 x = chain + i;
580                 x->i = i;
581                 x->j = j + 1;
582                 x->next = NULL;
583 
584                 hind[x->i] = 2*(x->j + 1) + 0;
585                 _mpoly_heap_insert1(heap, Bexp[x->i] + Cexp[x->j], x,
586                                                  &next_loc, &heap_len, maskhi);
587             }
588         }
589     }
590 
591     FLINT_ASSERT(Di <= Dlen);
592     _fmpz_mpoly_fit_length(&Acoeff, &Aexp, &Aalloc, Alen + Dlen - Di, 1);
593     if (saveD)
594         _fmpz_vec_set(Acoeff + Alen, Dcoeff + Di, Dlen - Di);
595     else
596         _fmpz_vec_swap(Acoeff + Alen, (fmpz *)(Dcoeff + Di), Dlen - Di);
597     mpoly_copy_monomials(Aexp + 1*Alen, Dexp + 1*Di, Dlen - Di, 1);
598     Alen += Dlen - Di;
599 
600     *A_coeff = Acoeff;
601     *A_exp = Aexp;
602     *A_alloc = Aalloc;
603 
604     return Alen;
605 }
606 
607 
_fmpz_mpoly_mulsub_stripe(fmpz ** A_coeff,ulong ** A_exp,slong * A_alloc,const fmpz * Dcoeff,const ulong * Dexp,slong Dlen,int saveD,const fmpz * Bcoeff,const ulong * Bexp,slong Blen,const fmpz * Ccoeff,const ulong * Cexp,slong Clen,const fmpz_mpoly_stripe_t S)608 slong _fmpz_mpoly_mulsub_stripe(fmpz ** A_coeff, ulong ** A_exp, slong * A_alloc,
609                          const fmpz * Dcoeff, const ulong * Dexp, slong Dlen, int saveD,
610                          const fmpz * Bcoeff, const ulong * Bexp, slong Blen,
611                          const fmpz * Ccoeff, const ulong * Cexp, slong Clen,
612                                                    const fmpz_mpoly_stripe_t S)
613 {
614     int upperclosed;
615     slong startidx, endidx;
616     ulong prev_startidx;
617     ulong * emax = S->emax;
618     ulong * emin = S->emin;
619     slong N = S->N;
620     slong i, j;
621     slong next_loc = Blen + 4;   /* something bigger than heap can ever be */
622     slong heap_len = 1; /* heap zero index unused */
623     mpoly_heap_s * heap;
624     mpoly_heap_t * chain;
625     slong * store, * store_base;
626     mpoly_heap_t * x;
627     slong Di;
628     slong Alen;
629     slong Aalloc = *A_alloc;
630     fmpz * Acoeff = *A_coeff;
631     ulong * Aexp = *A_exp;
632     ulong * exp, * exps;
633     ulong ** exp_list;
634     slong exp_next;
635     slong * ends;
636     ulong * texp;
637     slong * hind;
638     int small;
639     ulong acc_sm[3], pp0, pp1;
640 
641     i = 0;
642     hind = (slong *)(S->big_mem + i);
643     i += Blen*sizeof(slong);
644     ends = (slong *)(S->big_mem + i);
645     i += Blen*sizeof(slong);
646     store = store_base = (slong *) (S->big_mem + i);
647     i += 2*Blen*sizeof(slong);
648     heap = (mpoly_heap_s *)(S->big_mem + i);
649     i += (Blen + 1)*sizeof(mpoly_heap_s);
650     chain = (mpoly_heap_t *)(S->big_mem + i);
651     i += Blen*sizeof(mpoly_heap_t);
652     exps = (ulong *)(S->big_mem + i);
653     i +=  Blen*N*sizeof(ulong);
654     exp_list = (ulong **)(S->big_mem + i);
655     i +=  Blen*sizeof(ulong *);
656     texp = (ulong *)(S->big_mem + i);
657     i +=  N*sizeof(ulong);
658     FLINT_ASSERT(i <= S->big_mem_alloc);
659 
660     exp_next = 0;
661 
662     startidx = *S->startidx;
663     endidx = *S->endidx;
664     upperclosed = S->upperclosed;
665 
666     for (i = 0; i < Blen; i++)
667         exp_list[i] = exps + i*N;
668 
669     /* put all the starting nodes on the heap */
670     prev_startidx = -UWORD(1);
671     for (i = 0; i < Blen; i++)
672     {
673         if (startidx < Clen)
674         {
675             mpoly_monomial_add_mp(texp, Bexp + N*i, Cexp + N*startidx, N);
676             FLINT_ASSERT(mpoly_monomial_cmp(emax, texp, N, S->cmpmask)
677                                                                > -upperclosed);
678         }
679         while (startidx > 0)
680         {
681             mpoly_monomial_add_mp(texp, Bexp + N*i, Cexp + N*(startidx - 1), N);
682             if (mpoly_monomial_cmp(emax, texp, N, S->cmpmask) <= -upperclosed)
683             {
684                 break;
685             }
686             startidx--;
687         }
688 
689         if (endidx < Clen)
690         {
691             mpoly_monomial_add_mp(texp, Bexp + N*i, Cexp + N*endidx, N);
692             FLINT_ASSERT(mpoly_monomial_cmp(emin, texp, N, S->cmpmask) > 0);
693         }
694         while (endidx > 0)
695         {
696             mpoly_monomial_add_mp(texp, Bexp + N*i, Cexp + N*(endidx - 1), N);
697             if (mpoly_monomial_cmp(emin, texp, N, S->cmpmask) <= 0)
698             {
699                 break;
700             }
701             endidx--;
702         }
703 
704         ends[i] = endidx;
705 
706         hind[i] = 2*startidx + 1;
707 
708         if (  (startidx < endidx)
709            && (((ulong)startidx) < prev_startidx)
710            )
711         {
712             x = chain + i;
713             x->i = i;
714             x->j = startidx;
715             x->next = NULL;
716             hind[x->i] = 2*(x->j + 1) + 0;
717 
718             mpoly_monomial_add_mp(exp_list[exp_next], Bexp + N*x->i,
719                                                       Cexp + N*x->j, N);
720 
721             exp_next += _mpoly_heap_insert(heap, exp_list[exp_next], x,
722                                           &next_loc, &heap_len, N, S->cmpmask);
723         }
724 
725         prev_startidx = startidx;
726     }
727 
728     *S->startidx = startidx;
729     *S->endidx = endidx;
730 
731     /* whether input coeffs are small, thus output coeffs fit in three words */
732     FLINT_ASSERT(ends[0] >= startidx);
733     FLINT_ASSERT(S->coeff_bits == FLINT_ABS(_fmpz_vec_max_bits(Ccoeff, Clen)));
734     small = S->coeff_bits <= FLINT_BITS - 2
735             && _fmpz_mpoly_fits_small(Bcoeff, Blen)
736             && FLINT_ABS(_fmpz_vec_max_bits(Dcoeff, Dlen)) < 3*FLINT_BITS - 3;
737 
738     Alen = 0;
739     Di = 0;
740     while (heap_len > 1)
741     {
742         exp = heap[1].exp;
743 
744         while (Di < Dlen && mpoly_monomial_gt(Dexp + N*Di, exp, N, S->cmpmask))
745         {
746             _fmpz_mpoly_fit_length(&Acoeff, &Aexp, &Aalloc, Alen + 1, N);
747             mpoly_monomial_set(Aexp + N*Alen, Dexp + N*Di, N);
748             if (saveD)
749                 fmpz_set(Acoeff + Alen, Dcoeff + Di);
750             else
751                 fmpz_swap(Acoeff + Alen, (fmpz *)(Dcoeff + Di));
752             Alen++;
753             Di++;
754         }
755 
756         _fmpz_mpoly_fit_length(&Acoeff, &Aexp, &Aalloc, Alen + 1, N);
757 
758         mpoly_monomial_set(Aexp + N*Alen, exp, N);
759 
760         if (small)
761         {
762             acc_sm[0] = acc_sm[1] = acc_sm[2] = 0;
763             if (Di < Dlen && mpoly_monomial_equal(Dexp + N*Di, exp, N))
764             {
765                 _fmpz_mpoly_add_uiuiui_fmpz(acc_sm, Dcoeff + Di);
766                 Di++;
767             }
768 
769             do
770             {
771                 exp_list[--exp_next] = heap[1].exp;
772                 x = _mpoly_heap_pop(heap, &heap_len, N, S->cmpmask);
773 
774                 hind[x->i] |= WORD(1);
775                 *store++ = x->i;
776                 *store++ = x->j;
777                 FLINT_ASSERT(startidx <= x->j);
778                 FLINT_ASSERT(x->j < ends[0]);
779                 FLINT_ASSERT(!COEFF_IS_MPZ(Bcoeff[x->i]));
780                 FLINT_ASSERT(!COEFF_IS_MPZ(Ccoeff[x->j]));
781                 smul_ppmm(pp1, pp0, Bcoeff[x->i], Ccoeff[x->j]);
782                 sub_dddmmmsss(acc_sm[2], acc_sm[1], acc_sm[0],
783                               acc_sm[2], acc_sm[1], acc_sm[0],
784                               FLINT_SIGN_EXT(pp1), pp1, pp0);
785 
786                 while ((x = x->next) != NULL)
787                 {
788                     hind[x->i] |= WORD(1);
789                     *store++ = x->i;
790                     *store++ = x->j;
791                     FLINT_ASSERT(startidx <= x->j);
792                     FLINT_ASSERT(x->j < ends[0]);
793                     FLINT_ASSERT(!COEFF_IS_MPZ(Bcoeff[x->i]));
794                     FLINT_ASSERT(!COEFF_IS_MPZ(Ccoeff[x->j]));
795                     smul_ppmm(pp1, pp0, Bcoeff[x->i], Ccoeff[x->j]);
796                     sub_dddmmmsss(acc_sm[2], acc_sm[1], acc_sm[0],
797                                   acc_sm[2], acc_sm[1], acc_sm[0],
798                                   FLINT_SIGN_EXT(pp1), pp1, pp0);
799                 }
800             } while (heap_len > 1 && mpoly_monomial_equal(heap[1].exp, exp, N));
801 
802             fmpz_set_signed_uiuiui(Acoeff + Alen,
803                                              acc_sm[2], acc_sm[1], acc_sm[0]);
804         }
805         else
806         {
807             if (Di < Dlen && mpoly_monomial_equal(Dexp + N*Di, exp, N))
808             {
809                 fmpz_set(Acoeff + Alen, Dcoeff + Di);
810                 Di++;
811             }
812             else
813             {
814                 fmpz_zero(Acoeff + Alen);
815             }
816 
817             do
818             {
819                 exp_list[--exp_next] = heap[1].exp;
820                 x = _mpoly_heap_pop(heap, &heap_len, N, S->cmpmask);
821 
822                 hind[x->i] |= WORD(1);
823                 *store++ = x->i;
824                 *store++ = x->j;
825                 FLINT_ASSERT(startidx <= x->j);
826                 FLINT_ASSERT(x->j < ends[0]);
827                 fmpz_submul(Acoeff + Alen, Bcoeff + x->i, Ccoeff + x->j);
828 
829                 while ((x = x->next) != NULL)
830                 {
831                     hind[x->i] |= WORD(1);
832                     *store++ = x->i;
833                     *store++ = x->j;
834                     FLINT_ASSERT(startidx <= x->j);
835                     FLINT_ASSERT(x->j < ends[0]);
836                     fmpz_submul(Acoeff + Alen, Bcoeff + x->i, Ccoeff + x->j);
837                 }
838             } while (heap_len > 1 && mpoly_monomial_equal(heap[1].exp, exp, N));
839         }
840 
841         Alen += !fmpz_is_zero(Acoeff + Alen);
842 
843         /* process nodes taken from the heap */
844         while (store > store_base)
845         {
846             j = *--store;
847             i = *--store;
848 
849             /* should we go right? */
850             if (  (i + 1 < Blen)
851                && (j + 0 < ends[i + 1])
852                && (hind[i + 1] == 2*j + 1)
853                )
854             {
855                 x = chain + i + 1;
856                 x->i = i + 1;
857                 x->j = j;
858                 x->next = NULL;
859 
860                 hind[x->i] = 2*(x->j + 1) + 0;
861 
862                 mpoly_monomial_add_mp(exp_list[exp_next], Bexp + N*x->i,
863                                                           Cexp + N*x->j, N);
864 
865                 exp_next += _mpoly_heap_insert(heap, exp_list[exp_next], x,
866                                           &next_loc, &heap_len, N, S->cmpmask);
867             }
868 
869             /* should we go up? */
870             if (  (j + 1 < ends[i + 0])
871                && ((hind[i] & 1) == 1)
872                && (  (i == 0)
873                   || (hind[i - 1] >= 2*(j + 2) + 1)
874                   )
875                )
876             {
877                 x = chain + i;
878                 x->i = i;
879                 x->j = j + 1;
880                 x->next = NULL;
881 
882                 hind[x->i] = 2*(x->j + 1) + 0;
883 
884                 mpoly_monomial_add_mp(exp_list[exp_next], Bexp + N*x->i,
885                                                           Cexp + N*x->j, N);
886 
887                 exp_next += _mpoly_heap_insert(heap, exp_list[exp_next], x,
888                                           &next_loc, &heap_len, N, S->cmpmask);
889             }
890         }
891     }
892 
893     FLINT_ASSERT(Di <= Dlen);
894     _fmpz_mpoly_fit_length(&Acoeff, &Aexp, &Aalloc, Alen + Dlen - Di, N);
895     if (saveD)
896         _fmpz_vec_set(Acoeff + Alen, Dcoeff + Di, Dlen - Di);
897     else
898         _fmpz_vec_swap(Acoeff + Alen, (fmpz *)(Dcoeff + Di), Dlen - Di);
899     mpoly_copy_monomials(Aexp + N*Alen, Dexp + N*Di, Dlen - Di, N);
900     Alen += Dlen - Di;
901 
902     *A_coeff = Acoeff;
903     *A_exp = Aexp;
904     *A_alloc = Aalloc;
905 
906     return Alen;
907 }
908 
909 /*
910     Q = stripe of A/B (assume A != 0)
911     return Qlen = 0 if exact division is impossible
912 */
_fmpz_mpoly_divides_stripe1(fmpz ** Q_coeff,ulong ** Q_exp,slong * Q_alloc,const fmpz * Acoeff,const ulong * Aexp,slong Alen,const fmpz * Bcoeff,const ulong * Bexp,slong Blen,const fmpz_mpoly_stripe_t S)913 slong _fmpz_mpoly_divides_stripe1(
914                         fmpz ** Q_coeff,     ulong ** Q_exp, slong * Q_alloc,
915                     const fmpz * Acoeff, const ulong * Aexp, slong Alen,
916                     const fmpz * Bcoeff, const ulong * Bexp, slong Blen,
917                                                    const fmpz_mpoly_stripe_t S)
918 {
919     flint_bitcnt_t bits = S->bits;
920     ulong emin = S->emin[0];
921     ulong cmpmask = S->cmpmask[0];
922     ulong texp;
923     int lt_divides;
924     slong i, j, s;
925     slong next_loc, heap_len;
926     mpoly_heap1_s * heap;
927     mpoly_heap_t * chain;
928     slong * store, * store_base;
929     mpoly_heap_t * x;
930     slong Qlen;
931     slong Qalloc = * Q_alloc;
932     fmpz * Qcoeff = * Q_coeff;
933     ulong * Qexp = * Q_exp;
934     ulong exp;
935     ulong mask;
936     slong * hind;
937     fmpz_t acc_lg, r;
938     ulong acc_sm[3];
939     slong Acoeffbits;
940     ulong lc_norm = 0, lc_abs = 0, lc_sign = 0, lc_n = 0, lc_i = 0;
941     int small;
942 
943     FLINT_ASSERT(Alen > 0);
944     FLINT_ASSERT(Blen > 0);
945     FLINT_ASSERT(S->N == 1);
946 
947     Acoeffbits = _fmpz_vec_max_bits(Acoeff, Alen);
948     FLINT_ASSERT(S->coeff_bits == FLINT_ABS(_fmpz_vec_max_bits(Bcoeff, Blen)));
949 
950     /* whether intermediate computations A - Q*B will fit in three words */
951     /* allow one bit for sign, one bit for subtraction NOT QUITE */
952     small = S->coeff_bits <= FLINT_BITS - 2
953          && FLINT_ABS(Acoeffbits) <= (S->coeff_bits
954                                      + FLINT_BIT_COUNT(Blen) + FLINT_BITS - 2);
955 
956     next_loc = Blen + 4;   /* something bigger than heap can ever be */
957 
958     i = 0;
959     hind = (slong *)(S->big_mem + i);
960     i += Blen*sizeof(slong);
961     store = store_base = (slong *) (S->big_mem + i);
962     i += 2*Blen*sizeof(slong);
963     heap = (mpoly_heap1_s *)(S->big_mem + i);
964     i += (Blen + 1)*sizeof(mpoly_heap1_s);
965     chain = (mpoly_heap_t *)(S->big_mem + i);
966     i += Blen*sizeof(mpoly_heap_t);
967     FLINT_ASSERT(i <= S->big_mem_alloc);
968 
969     fmpz_init(acc_lg);
970     fmpz_init(r);
971 
972     for (i = 0; i < Blen; i++)
973         hind[i] = 1;
974 
975     /* mask with high bit set in each word of each field of exponent vector */
976     mask = 0;
977     for (i = 0; i < FLINT_BITS/bits; i++)
978         mask = (mask << bits) + (UWORD(1) << (bits - 1));
979 
980     Qlen = WORD(0);
981 
982     /* s is the number of terms * (latest quotient) we should put into heap */
983     s = Blen;
984 
985     /* insert (-1, 0, exp2[0]) into heap */
986     heap_len = 2;
987     x = chain + 0;
988     x->i = -WORD(1);
989     x->j = 0;
990     x->next = NULL;
991     heap[1].next = x;
992     HEAP_ASSIGN(heap[1], Aexp[0], x);
993 
994     FLINT_ASSERT(mpoly_monomial_cmp1(Aexp[0], emin, cmpmask) >= 0);
995 
996     if (small)
997     {
998         FLINT_ASSERT(!COEFF_IS_MPZ(Bcoeff[0]));
999         lc_abs = FLINT_ABS(Bcoeff[0]);
1000         lc_sign = FLINT_SIGN_EXT(Bcoeff[0]);
1001         count_leading_zeros(lc_norm, lc_abs);
1002         lc_n = lc_abs << lc_norm;
1003         invert_limb(lc_i, lc_n);
1004     }
1005 
1006     while (heap_len > 1)
1007     {
1008         exp = heap[1].exp;
1009 
1010         if (mpoly_monomial_overflows1(exp, mask))
1011             goto not_exact_division;
1012 
1013         FLINT_ASSERT(mpoly_monomial_cmp1(exp, emin, cmpmask) >= 0);
1014 
1015         _fmpz_mpoly_fit_length(&Qcoeff, &Qexp, &Qalloc, Qlen + 1, 1);
1016 
1017         lt_divides = mpoly_monomial_divides1(Qexp + Qlen, exp, Bexp[0], mask);
1018 
1019         if (small)
1020         {
1021             acc_sm[0] = acc_sm[1] = acc_sm[2] = 0;
1022             do
1023             {
1024                 x = _mpoly_heap_pop1(heap, &heap_len, cmpmask);
1025                 do
1026                 {
1027                     *store++ = x->i;
1028                     *store++ = x->j;
1029                     if (x->i != -WORD(1))
1030                         hind[x->i] |= WORD(1);
1031 
1032                     if (x->i == -WORD(1))
1033                     {
1034                         _fmpz_mpoly_add_uiuiui_fmpz(acc_sm, Acoeff + x->j);
1035                     }
1036                     else
1037                     {
1038                         FLINT_ASSERT(!COEFF_IS_MPZ(Bcoeff[x->i]));
1039                         FLINT_ASSERT(!COEFF_IS_MPZ(Qcoeff[x->j]));
1040                         _fmpz_mpoly_submul_uiuiui_fmpz(acc_sm,
1041                                                    Bcoeff[x->i], Qcoeff[x->j]);
1042                     }
1043                 } while ((x = x->next) != NULL);
1044             } while (heap_len > 1 && heap[1].exp == exp);
1045         }
1046         else
1047         {
1048             fmpz_zero(acc_lg);
1049             do
1050             {
1051                 x = _mpoly_heap_pop1(heap, &heap_len, cmpmask);
1052                 do
1053                 {
1054                     *store++ = x->i;
1055                     *store++ = x->j;
1056                     if (x->i != -WORD(1))
1057                         hind[x->i] |= WORD(1);
1058 
1059                     if (x->i == -WORD(1))
1060                     {
1061                         fmpz_add(acc_lg, acc_lg, Acoeff + x->j);
1062                     }
1063                     else
1064                     {
1065                         fmpz_submul(acc_lg, Bcoeff + x->i, Qcoeff + x->j);
1066                     }
1067                 } while ((x = x->next) != NULL);
1068             } while (heap_len > 1 && heap[1].exp == exp);
1069         }
1070 
1071         /* process nodes taken from the heap */
1072         while (store > store_base)
1073         {
1074             j = *--store;
1075             i = *--store;
1076 
1077             if (i == -WORD(1))
1078             {
1079                 /* take next dividend term */
1080                 if (j + 1 < Alen)
1081                 {
1082                     x = chain + 0;
1083                     x->i = i;
1084                     x->j = j + 1;
1085                     x->next = NULL;
1086 
1087                     FLINT_ASSERT(mpoly_monomial_cmp1(Aexp[x->j], emin, cmpmask)
1088                                                                          >= 0);
1089 
1090                     _mpoly_heap_insert1(heap, Aexp[x->j], x,
1091                                                 &next_loc, &heap_len, cmpmask);
1092                 }
1093             }
1094             else
1095             {
1096                 /* should we go up */
1097                 if (  (i + 1 < Blen)
1098                    && (hind[i + 1] == 2*j + 1)
1099                    )
1100                 {
1101                     x = chain + i + 1;
1102                     x->i = i + 1;
1103                     x->j = j;
1104                     x->next = NULL;
1105                     hind[x->i] = 2*(x->j + 1) + 0;
1106 
1107                     texp = Bexp[x->i] + Qexp[x->j];
1108                     if (mpoly_monomial_cmp1(texp, emin, cmpmask) >= 0)
1109                     {
1110                         _mpoly_heap_insert1(heap, texp, x,
1111                                                 &next_loc, &heap_len, cmpmask);
1112                     }
1113                     else
1114                     {
1115                         hind[x->i] |= 1;
1116                     }
1117                 }
1118                 /* should we go up? */
1119                 if (j + 1 == Qlen)
1120                 {
1121                     s++;
1122                 } else if (  ((hind[i] & 1) == 1)
1123                           && ((i == 1) || (hind[i - 1] >= 2*(j + 2) + 1))
1124                           )
1125                 {
1126                     x = chain + i;
1127                     x->i = i;
1128                     x->j = j + 1;
1129                     x->next = NULL;
1130                     hind[x->i] = 2*(x->j + 1) + 0;
1131 
1132                     texp = Bexp[x->i] + Qexp[x->j];
1133                     if (mpoly_monomial_cmp1(texp, emin, cmpmask) >= 0)
1134                     {
1135                         _mpoly_heap_insert1(heap, texp, x,
1136                                                 &next_loc, &heap_len, cmpmask);
1137                     }
1138                     else
1139                     {
1140                         hind[x->i] |= 1;
1141                     }
1142                 }
1143             }
1144         }
1145 
1146         if (small)
1147         {
1148             ulong d0, d1, ds = acc_sm[2];
1149 
1150             /* d1:d0 = abs(acc_sm[1:0]) assuming ds is sign extension of acc_sm[1] */
1151             sub_ddmmss(d1, d0, acc_sm[1]^ds, acc_sm[0]^ds, ds, ds);
1152 
1153             if ((acc_sm[0] | acc_sm[1] | acc_sm[2]) == 0)
1154             {
1155                 continue;
1156             }
1157 
1158             if (ds == FLINT_SIGN_EXT(acc_sm[1]) && d1 < lc_abs)
1159             {
1160                 ulong qq, rr, nhi, nlo;
1161                 FLINT_ASSERT(0 < lc_norm && lc_norm < FLINT_BITS);
1162                 nhi = (d1 << lc_norm) | (d0 >> (FLINT_BITS - lc_norm));
1163                 nlo = d0 << lc_norm;
1164                 udiv_qrnnd_preinv(qq, rr, nhi, nlo, lc_n, lc_i);
1165                 if (rr != UWORD(0))
1166                     goto not_exact_division;
1167 
1168                 if (qq <= COEFF_MAX)
1169                 {
1170                     _fmpz_demote(Qcoeff + Qlen);
1171                     Qcoeff[Qlen] = qq;
1172                     if (ds != lc_sign)
1173                         Qcoeff[Qlen] = -Qcoeff[Qlen];
1174                 }
1175                 else
1176                 {
1177                     small = 0;
1178                     fmpz_set_ui(Qcoeff + Qlen, qq);
1179                     if (ds != lc_sign)
1180                         fmpz_neg(Qcoeff + Qlen, Qcoeff + Qlen);
1181                 }
1182             }
1183             else
1184             {
1185                 small = 0;
1186                 fmpz_set_signed_uiuiui(acc_lg, acc_sm[2], acc_sm[1], acc_sm[0]);
1187                 fmpz_fdiv_qr(Qcoeff + Qlen, r, acc_lg, Bcoeff + 0);
1188                 if (!fmpz_is_zero(r))
1189                     goto not_exact_division;
1190             }
1191         }
1192         else
1193         {
1194             if (fmpz_is_zero(acc_lg))
1195             {
1196                 continue;
1197             }
1198 
1199             fmpz_fdiv_qr(Qcoeff + Qlen, r, acc_lg, Bcoeff + 0);
1200 
1201             if (!fmpz_is_zero(r))
1202             {
1203                 goto not_exact_division;
1204             }
1205         }
1206 
1207         if (!lt_divides)
1208         {
1209             goto not_exact_division;
1210         }
1211 
1212         if (s > 1)
1213         {
1214             i = 1;
1215             x = chain + i;
1216             x->i = i;
1217             x->j = Qlen;
1218             x->next = NULL;
1219             hind[x->i] = 2*(x->j + 1) + 0;
1220 
1221             texp = Bexp[x->i] + Qexp[x->j];
1222             if (mpoly_monomial_cmp1(texp, emin, cmpmask) >= 0)
1223             {
1224                 _mpoly_heap_insert1(heap, texp, x,
1225                                          &next_loc, &heap_len, cmpmask);
1226             }
1227             else
1228             {
1229                 hind[x->i] |= 1;
1230             }
1231         }
1232         s = 1;
1233         Qlen++;
1234     }
1235 
1236 
1237 cleanup:
1238 
1239     fmpz_clear(acc_lg);
1240     fmpz_clear(r);
1241 
1242     *Q_alloc = Qalloc;
1243     *Q_coeff = Qcoeff;
1244     *Q_exp = Qexp;
1245 
1246     return Qlen;
1247 
1248 not_exact_division:
1249     Qlen = 0;
1250     goto cleanup;
1251 }
1252 
_fmpz_mpoly_divides_stripe(fmpz ** Q_coeff,ulong ** Q_exp,slong * Q_alloc,const fmpz * Acoeff,const ulong * Aexp,slong Alen,const fmpz * Bcoeff,const ulong * Bexp,slong Blen,const fmpz_mpoly_stripe_t S)1253 slong _fmpz_mpoly_divides_stripe(
1254                         fmpz ** Q_coeff,     ulong ** Q_exp, slong * Q_alloc,
1255                     const fmpz * Acoeff, const ulong * Aexp, slong Alen,
1256                     const fmpz * Bcoeff, const ulong * Bexp, slong Blen,
1257                                                    const fmpz_mpoly_stripe_t S)
1258 {
1259     flint_bitcnt_t bits = S->bits;
1260     slong N = S->N;
1261     int lt_divides;
1262     slong i, j, s;
1263     slong next_loc, heap_len;
1264     mpoly_heap_s * heap;
1265     mpoly_heap_t * chain;
1266     slong * store, * store_base;
1267     mpoly_heap_t * x;
1268     slong Qlen;
1269     slong Qalloc = * Q_alloc;
1270     fmpz * Qcoeff = * Q_coeff;
1271     ulong * Qexp = * Q_exp;
1272     ulong * exp, * exps;
1273     ulong ** exp_list;
1274     slong exp_next;
1275     ulong mask;
1276     slong * hind;
1277     fmpz_t acc_lg, r;
1278     ulong acc_sm[3];
1279     slong Acoeffbits;
1280     ulong lc_norm = 0, lc_abs = 0, lc_sign = 0, lc_n = 0, lc_i = 0;
1281     int small;
1282 
1283     FLINT_ASSERT(Alen > 0);
1284     FLINT_ASSERT(Blen > 0);
1285 
1286     Acoeffbits = _fmpz_vec_max_bits(Acoeff, Alen);
1287     FLINT_ASSERT(S->coeff_bits == FLINT_ABS(_fmpz_vec_max_bits(Bcoeff, Blen)));
1288 
1289     /* whether intermediate computations A - Q*B will fit in three words */
1290     /* allow one bit for sign, one bit for subtraction NOT QUITE */
1291     small = S->coeff_bits <= FLINT_BITS - 2
1292          && FLINT_ABS(Acoeffbits) <= (S->coeff_bits
1293                                      + FLINT_BIT_COUNT(Blen) + FLINT_BITS - 2);
1294 
1295     FLINT_ASSERT(Alen > 0);
1296     FLINT_ASSERT(Blen > 0);
1297 
1298     next_loc = Blen + 4;   /* something bigger than heap can ever be */
1299 
1300     i = 0;
1301     hind = (slong *) (S->big_mem + i);
1302     i += Blen*sizeof(slong);
1303     store = store_base = (slong *) (S->big_mem + i);
1304     i += 2*Blen*sizeof(slong);
1305     heap = (mpoly_heap_s *)(S->big_mem + i);
1306     i += (Blen + 1)*sizeof(mpoly_heap_s);
1307     chain = (mpoly_heap_t *)(S->big_mem + i);
1308     i += Blen*sizeof(mpoly_heap_t);
1309     exps = (ulong *)(S->big_mem + i);
1310     i +=  Blen*N*sizeof(ulong);
1311     exp_list = (ulong **)(S->big_mem + i);
1312     i +=  Blen*sizeof(ulong *);
1313     FLINT_ASSERT(i <= S->big_mem_alloc);
1314 
1315     fmpz_init(acc_lg);
1316     fmpz_init(r);
1317 
1318     exp_next = 0;
1319     for (i = 0; i < Blen; i++)
1320         exp_list[i] = exps + i*N;
1321 
1322     for (i = 0; i < Blen; i++)
1323         hind[i] = 1;
1324 
1325     /* mask with high bit set in each word of each field of exponent vector */
1326     mask = 0;
1327     for (i = 0; i < FLINT_BITS/bits; i++)
1328         mask = (mask << bits) + (UWORD(1) << (bits - 1));
1329 
1330     Qlen = WORD(0);
1331 
1332     /* s is the number of terms * (latest quotient) we should put into heap */
1333     s = Blen;
1334 
1335     /* insert (-1, 0, exp2[0]) into heap */
1336     heap_len = 2;
1337     x = chain + 0;
1338     x->i = -WORD(1);
1339     x->j = 0;
1340     x->next = NULL;
1341     heap[1].next = x;
1342     heap[1].exp = exp_list[exp_next++];
1343 
1344     FLINT_ASSERT(mpoly_monomial_cmp(Aexp + N*0, S->emin, N, S->cmpmask) >= 0);
1345 
1346     mpoly_monomial_set(heap[1].exp, Aexp + N*0, N);
1347 
1348     if (small)
1349     {
1350         FLINT_ASSERT(!COEFF_IS_MPZ(Bcoeff[0]));
1351         lc_abs = FLINT_ABS(Bcoeff[0]);
1352         lc_sign = FLINT_SIGN_EXT(Bcoeff[0]);
1353         count_leading_zeros(lc_norm, lc_abs);
1354         lc_n = lc_abs << lc_norm;
1355         invert_limb(lc_i, lc_n);
1356     }
1357 
1358     while (heap_len > 1)
1359     {
1360         exp = heap[1].exp;
1361 
1362         if (bits <= FLINT_BITS)
1363         {
1364             if (mpoly_monomial_overflows(exp, N, mask))
1365             {
1366                 goto not_exact_division;
1367             }
1368         }
1369         else
1370         {
1371             if (mpoly_monomial_overflows_mp(exp, N, bits))
1372             {
1373                 goto not_exact_division;
1374             }
1375         }
1376 
1377         FLINT_ASSERT(mpoly_monomial_cmp(exp, S->emin, N, S->cmpmask) >= 0);
1378 
1379         _fmpz_mpoly_fit_length(&Qcoeff, &Qexp, &Qalloc, Qlen + 1, N);
1380 
1381         if (bits <= FLINT_BITS)
1382             lt_divides = mpoly_monomial_divides(Qexp + N*Qlen, exp,
1383                                                          Bexp + N*0, N, mask);
1384         else
1385             lt_divides = mpoly_monomial_divides_mp(Qexp + N*Qlen, exp,
1386                                                          Bexp + N*0, N, bits);
1387 
1388         if (small)
1389         {
1390             acc_sm[0] = acc_sm[1] = acc_sm[2] = 0;
1391             do
1392             {
1393                 exp_list[--exp_next] = heap[1].exp;
1394                 x = _mpoly_heap_pop(heap, &heap_len, N, S->cmpmask);
1395                 do
1396                 {
1397                     *store++ = x->i;
1398                     *store++ = x->j;
1399                     if (x->i != -WORD(1))
1400                         hind[x->i] |= WORD(1);
1401 
1402                     if (x->i == -WORD(1))
1403                     {
1404                         _fmpz_mpoly_add_uiuiui_fmpz(acc_sm, Acoeff + x->j);
1405                     }
1406                     else
1407                     {
1408                         FLINT_ASSERT(!COEFF_IS_MPZ(Bcoeff[x->i]));
1409                         FLINT_ASSERT(!COEFF_IS_MPZ(Qcoeff[x->j]));
1410                         _fmpz_mpoly_submul_uiuiui_fmpz(acc_sm,
1411                                                    Bcoeff[x->i], Qcoeff[x->j]);
1412                     }
1413                 } while ((x = x->next) != NULL);
1414             } while (heap_len > 1 && mpoly_monomial_equal(heap[1].exp, exp, N));
1415         }
1416         else
1417         {
1418             fmpz_zero(acc_lg);
1419             do
1420             {
1421                 exp_list[--exp_next] = heap[1].exp;
1422                 x = _mpoly_heap_pop(heap, &heap_len, N, S->cmpmask);
1423                 do
1424                 {
1425                     *store++ = x->i;
1426                     *store++ = x->j;
1427                     if (x->i != -WORD(1))
1428                         hind[x->i] |= WORD(1);
1429 
1430                     if (x->i == -WORD(1))
1431                     {
1432                         fmpz_add(acc_lg, acc_lg, Acoeff + x->j);
1433                     }
1434                     else
1435                     {
1436                         fmpz_submul(acc_lg, Bcoeff + x->i, Qcoeff + x->j);
1437                     }
1438                 } while ((x = x->next) != NULL);
1439             } while (heap_len > 1 && mpoly_monomial_equal(heap[1].exp, exp, N));
1440         }
1441 
1442         /* process nodes taken from the heap */
1443         while (store > store_base)
1444         {
1445             j = *--store;
1446             i = *--store;
1447 
1448             if (i == -WORD(1))
1449             {
1450                 /* take next dividend term */
1451                 if (j + 1 < Alen)
1452                 {
1453                     x = chain + 0;
1454                     x->i = i;
1455                     x->j = j + 1;
1456                     x->next = NULL;
1457                     mpoly_monomial_set(exp_list[exp_next], Aexp + x->j*N, N);
1458 
1459                     FLINT_ASSERT(mpoly_monomial_cmp(exp_list[exp_next],
1460                                                  S->emin, N, S->cmpmask) >= 0);
1461 
1462                     exp_next += _mpoly_heap_insert(heap, exp_list[exp_next], x,
1463                                           &next_loc, &heap_len, N, S->cmpmask);
1464                 }
1465             }
1466             else
1467             {
1468                 /* should we go up */
1469                 if (  (i + 1 < Blen)
1470                    && (hind[i + 1] == 2*j + 1)
1471                    )
1472                 {
1473                     x = chain + i + 1;
1474                     x->i = i + 1;
1475                     x->j = j;
1476                     x->next = NULL;
1477                     hind[x->i] = 2*(x->j + 1) + 0;
1478 
1479                     mpoly_monomial_add_mp(exp_list[exp_next], Bexp + N*x->i,
1480                                                               Qexp + N*x->j, N);
1481 
1482                     if (mpoly_monomial_cmp(exp_list[exp_next], S->emin, N,
1483                                                               S->cmpmask) >= 0)
1484                     {
1485                         exp_next += _mpoly_heap_insert(heap, exp_list[exp_next],
1486                                        x, &next_loc, &heap_len, N, S->cmpmask);
1487                     }
1488                     else
1489                     {
1490                         hind[x->i] |= 1;
1491                     }
1492                 }
1493                 /* should we go up? */
1494                 if (j + 1 == Qlen)
1495                 {
1496                     s++;
1497                 } else if (  ((hind[i] & 1) == 1)
1498                           && ((i == 1) || (hind[i - 1] >= 2*(j + 2) + 1))
1499                           )
1500                 {
1501                     x = chain + i;
1502                     x->i = i;
1503                     x->j = j + 1;
1504                     x->next = NULL;
1505                     hind[x->i] = 2*(x->j + 1) + 0;
1506 
1507                     mpoly_monomial_add_mp(exp_list[exp_next], Bexp + N*x->i,
1508                                                               Qexp + N*x->j, N);
1509 
1510                     if (mpoly_monomial_cmp(exp_list[exp_next], S->emin, N,
1511                                                               S->cmpmask) >= 0)
1512                     {
1513                         exp_next += _mpoly_heap_insert(heap, exp_list[exp_next],
1514                                        x, &next_loc, &heap_len, N, S->cmpmask);
1515                     }
1516                     else
1517                     {
1518                         hind[x->i] |= 1;
1519                     }
1520                 }
1521             }
1522         }
1523 
1524         if (small)
1525         {
1526             ulong d0, d1, ds = acc_sm[2];
1527 
1528             /* d1:d0 = abs(acc_sm[1:0]) assuming ds is sign extension of acc_sm[1] */
1529             sub_ddmmss(d1, d0, acc_sm[1]^ds, acc_sm[0]^ds, ds, ds);
1530 
1531             if ((acc_sm[0] | acc_sm[1] | acc_sm[2]) == 0)
1532             {
1533                 continue;
1534             }
1535 
1536             if (ds == FLINT_SIGN_EXT(acc_sm[1]) && d1 < lc_abs)
1537             {
1538                 ulong qq, rr, nhi, nlo;
1539                 FLINT_ASSERT(0 < lc_norm && lc_norm < FLINT_BITS);
1540                 nhi = (d1 << lc_norm) | (d0 >> (FLINT_BITS - lc_norm));
1541                 nlo = d0 << lc_norm;
1542                 udiv_qrnnd_preinv(qq, rr, nhi, nlo, lc_n, lc_i);
1543                 if (rr != UWORD(0))
1544                     goto not_exact_division;
1545 
1546                 if (qq <= COEFF_MAX)
1547                 {
1548                     _fmpz_demote(Qcoeff + Qlen);
1549                     Qcoeff[Qlen] = qq;
1550                     if (ds != lc_sign)
1551                         Qcoeff[Qlen] = -Qcoeff[Qlen];
1552                 }
1553                 else
1554                 {
1555                     small = 0;
1556                     fmpz_set_ui(Qcoeff + Qlen, qq);
1557                     if (ds != lc_sign)
1558                         fmpz_neg(Qcoeff + Qlen, Qcoeff + Qlen);
1559                 }
1560             }
1561             else
1562             {
1563                 small = 0;
1564                 fmpz_set_signed_uiuiui(acc_lg, acc_sm[2], acc_sm[1], acc_sm[0]);
1565                 fmpz_fdiv_qr(Qcoeff + Qlen, r, acc_lg, Bcoeff + 0);
1566                 if (!fmpz_is_zero(r))
1567                     goto not_exact_division;
1568             }
1569         }
1570         else
1571         {
1572             if (fmpz_is_zero(acc_lg))
1573             {
1574                 continue;
1575             }
1576 
1577             fmpz_fdiv_qr(Qcoeff + Qlen, r, acc_lg, Bcoeff + 0);
1578 
1579             if (!fmpz_is_zero(r))
1580             {
1581                 goto not_exact_division;
1582             }
1583         }
1584 
1585         if (!lt_divides)
1586         {
1587             goto not_exact_division;
1588         }
1589 
1590         if (s > 1)
1591         {
1592             i = 1;
1593             x = chain + i;
1594             x->i = i;
1595             x->j = Qlen;
1596             x->next = NULL;
1597             hind[x->i] = 2*(x->j + 1) + 0;
1598 
1599             mpoly_monomial_add_mp(exp_list[exp_next], Bexp + N*x->i,
1600                                                              Qexp + N*x->j, N);
1601 
1602             if (mpoly_monomial_cmp(exp_list[exp_next], S->emin, N, S->cmpmask)
1603                                                                           >= 0)
1604             {
1605                 exp_next += _mpoly_heap_insert(heap, exp_list[exp_next], x,
1606                                           &next_loc, &heap_len, N, S->cmpmask);
1607             }
1608             else
1609             {
1610                 hind[x->i] |= 1;
1611             }
1612         }
1613         s = 1;
1614         Qlen++;
1615     }
1616 
1617 
1618 cleanup:
1619 
1620     fmpz_clear(acc_lg);
1621     fmpz_clear(r);
1622 
1623     *Q_alloc = Qalloc;
1624     *Q_coeff = Qcoeff;
1625     *Q_exp = Qexp;
1626 
1627     return Qlen;
1628 
1629 not_exact_division:
1630     Qlen = 0;
1631     goto cleanup;
1632 }
1633 
1634 
chunk_find_exp(ulong * exp,slong a,const divides_heap_base_t H)1635 static slong chunk_find_exp(ulong * exp, slong a, const divides_heap_base_t H)
1636 {
1637     slong N = H->N;
1638     slong b = H->polyA->length;
1639     const ulong * Aexp = H->polyA->exps;
1640 
1641 try_again:
1642     FLINT_ASSERT(b >= a);
1643 
1644     FLINT_ASSERT(a > 0);
1645     FLINT_ASSERT(mpoly_monomial_cmp(Aexp + N*(a - 1), exp, N, H->cmpmask) >= 0);
1646     FLINT_ASSERT(b >= H->polyA->length
1647                   ||  mpoly_monomial_cmp(Aexp + N*b, exp, N, H->cmpmask) < 0);
1648 
1649     if (b - a < 5)
1650     {
1651         slong i = a;
1652         while (i < b
1653                 && mpoly_monomial_cmp(Aexp + N*i, exp, N, H->cmpmask) >= 0)
1654         {
1655             i++;
1656         }
1657         return i;
1658     }
1659     else
1660     {
1661         slong c = a + (b - a)/2;
1662         if (mpoly_monomial_cmp(Aexp + N*c, exp, N, H->cmpmask) < 0)
1663         {
1664             b = c;
1665         }
1666         else
1667         {
1668             a = c;
1669         }
1670         goto try_again;
1671     }
1672 }
1673 
stripe_fit_length(fmpz_mpoly_stripe_struct * S,slong new_len)1674 static void stripe_fit_length(fmpz_mpoly_stripe_struct * S, slong new_len)
1675 {
1676     slong N = S->N;
1677     slong new_alloc;
1678     new_alloc = 0;
1679     if (N == 1)
1680     {
1681         new_alloc += new_len*sizeof(slong);
1682         new_alloc += new_len*sizeof(slong);
1683         new_alloc += 2*new_len*sizeof(slong);
1684         new_alloc += (new_len + 1)*sizeof(mpoly_heap1_s);
1685         new_alloc += new_len*sizeof(mpoly_heap_t);
1686     }
1687     else
1688     {
1689         new_alloc += new_len*sizeof(slong);
1690         new_alloc += new_len*sizeof(slong);
1691         new_alloc += 2*new_len*sizeof(slong);
1692         new_alloc += (new_len + 1)*sizeof(mpoly_heap_s);
1693         new_alloc += new_len*sizeof(mpoly_heap_t);
1694         new_alloc += new_len*N*sizeof(ulong);
1695         new_alloc += new_len*sizeof(ulong *);
1696         new_alloc += N*sizeof(ulong);
1697     }
1698 
1699     if (S->big_mem_alloc >= new_alloc)
1700     {
1701         return;
1702     }
1703 
1704     new_alloc = FLINT_MAX(new_alloc, S->big_mem_alloc + S->big_mem_alloc/4);
1705     S->big_mem_alloc = new_alloc;
1706 
1707     if (S->big_mem != NULL)
1708     {
1709         S->big_mem = (char *) flint_realloc(S->big_mem, new_alloc);
1710     }
1711     else
1712     {
1713         S->big_mem = (char *) flint_malloc(new_alloc);
1714     }
1715 
1716 }
1717 
1718 
chunk_mulsub(worker_arg_t W,divides_heap_chunk_t L,slong q_prev_length)1719 static void chunk_mulsub(worker_arg_t W, divides_heap_chunk_t L, slong q_prev_length)
1720 {
1721     divides_heap_base_struct * H = W->H;
1722     slong N = H->N;
1723     fmpz_mpoly_struct * C = L->polyC;
1724     const fmpz_mpoly_struct * B = H->polyB;
1725     const fmpz_mpoly_struct * A = H->polyA;
1726     fmpz_mpoly_ts_struct * Q = H->polyQ;
1727     fmpz_mpoly_struct * T1 = W->polyT1;
1728     fmpz_mpoly_stripe_struct * S = W->S;
1729 
1730     S->startidx = &L->startidx;
1731     S->endidx = &L->endidx;
1732     S->emin = L->emin;
1733     S->emax = L->emax;
1734     S->upperclosed = L->upperclosed;
1735     FLINT_ASSERT(S->N == N);
1736     stripe_fit_length(S, q_prev_length - L->mq);
1737 
1738     if (L->Cinited)
1739     {
1740         if (N == 1)
1741         {
1742             T1->length = _fmpz_mpoly_mulsub_stripe1(
1743                     &T1->coeffs, &T1->exps, &T1->alloc,
1744                     C->coeffs, C->exps, C->length, 1,
1745                     Q->coeffs + L->mq, Q->exps + N*L->mq, q_prev_length - L->mq,
1746                     B->coeffs, B->exps, B->length, S);
1747         }
1748         else
1749         {
1750             T1->length = _fmpz_mpoly_mulsub_stripe(
1751                     &T1->coeffs, &T1->exps, &T1->alloc,
1752                     C->coeffs, C->exps, C->length, 1,
1753                     Q->coeffs + L->mq, Q->exps + N*L->mq, q_prev_length - L->mq,
1754                     B->coeffs, B->exps, B->length, S);
1755         }
1756         fmpz_mpoly_swap(C, T1, H->ctx);
1757     }
1758     else
1759     {
1760         slong startidx, stopidx;
1761         if (L->upperclosed)
1762         {
1763             startidx = 0;
1764             stopidx = chunk_find_exp(L->emin, 1, H);
1765         }
1766         else
1767         {
1768             startidx = chunk_find_exp(L->emax, 1, H);
1769             stopidx = chunk_find_exp(L->emin, startidx, H);
1770         }
1771 
1772         L->Cinited = 1;
1773         fmpz_mpoly_init2(C, 16 + stopidx - startidx, H->ctx); /*any is OK*/
1774         fmpz_mpoly_fit_bits(C, H->bits, H->ctx);
1775         C->bits = H->bits;
1776 
1777         if (N == 1)
1778         {
1779             C->length = _fmpz_mpoly_mulsub_stripe1(
1780                     &C->coeffs, &C->exps, &C->alloc,
1781                     A->coeffs + startidx, A->exps + N*startidx, stopidx - startidx, 1,
1782                     Q->coeffs + L->mq, Q->exps + N*L->mq, q_prev_length - L->mq,
1783                     B->coeffs, B->exps, B->length, S);
1784         }
1785         else
1786         {
1787             C->length = _fmpz_mpoly_mulsub_stripe(
1788                     &C->coeffs, &C->exps, &C->alloc,
1789                     A->coeffs + startidx, A->exps + N*startidx, stopidx - startidx, 1,
1790                     Q->coeffs + L->mq, Q->exps + N*L->mq, q_prev_length - L->mq,
1791                     B->coeffs, B->exps, B->length, S);
1792         }
1793     }
1794 
1795     L->mq = q_prev_length;
1796 }
1797 
trychunk(worker_arg_t W,divides_heap_chunk_t L)1798 static void trychunk(worker_arg_t W, divides_heap_chunk_t L)
1799 {
1800     divides_heap_base_struct * H = W->H;
1801     slong i;
1802     slong N = H->N;
1803     fmpz_mpoly_struct * C = L->polyC;
1804     slong q_prev_length;
1805     ulong mask;
1806     const fmpz_mpoly_struct * B = H->polyB;
1807     const fmpz_mpoly_struct * A = H->polyA;
1808     fmpz_mpoly_ts_struct * Q = H->polyQ;
1809     fmpz_mpoly_struct * T2 = W->polyT2;
1810 
1811     mask = 0;
1812     for (i = 0; i < FLINT_BITS/H->bits; i++)
1813         mask = (mask << H->bits) + (UWORD(1) << (H->bits - 1));
1814 
1815     /* return if this section has already finished processing */
1816     if (L->mq < 0)
1817     {
1818         return;
1819     }
1820 
1821     /* process more quotient terms if available */
1822     q_prev_length = Q->length;
1823     if (q_prev_length > L->mq)
1824     {
1825         if (L->producer == 0 && q_prev_length - L->mq < 20)
1826             return;
1827 
1828         chunk_mulsub(W, L, q_prev_length);
1829     }
1830 
1831     if (L->producer == 1)
1832     {
1833         divides_heap_chunk_struct * next;
1834         fmpz * Rcoeff;
1835         ulong * Rexp;
1836         slong Rlen;
1837 
1838         /* process the remaining quotient terms */
1839         q_prev_length = Q->length;
1840         if (q_prev_length > L->mq)
1841         {
1842             chunk_mulsub(W, L, q_prev_length);
1843         }
1844 
1845         /* find location of remaining terms */
1846         if (L->Cinited)
1847         {
1848             Rlen = C->length;
1849             Rexp = C->exps;
1850             Rcoeff = C->coeffs;
1851         }
1852         else
1853         {
1854             slong startidx, stopidx;
1855             if (L->upperclosed)
1856             {
1857                 startidx = 0;
1858                 stopidx = chunk_find_exp(L->emin, 1, H);
1859             }
1860             else
1861             {
1862                 startidx = chunk_find_exp(L->emax, 1, H);
1863                 stopidx = chunk_find_exp(L->emin, startidx, H);
1864             }
1865             Rlen = stopidx - startidx;
1866             Rcoeff = A->coeffs + startidx;
1867             Rexp = A->exps + N*startidx;
1868         }
1869 
1870         /* if we have remaining terms, add to quotient  */
1871         if (Rlen > 0)
1872         {
1873             fmpz_mpoly_stripe_struct * S = W->S;
1874             S->startidx = &L->startidx;
1875             S->endidx = &L->endidx;
1876             S->emin = L->emin;
1877             S->emax = L->emax;
1878             S->upperclosed = L->upperclosed;
1879             if (N == 1)
1880             {
1881                 T2->length = _fmpz_mpoly_divides_stripe1(
1882                                     &T2->coeffs, &T2->exps, &T2->alloc,
1883                                        Rcoeff, Rexp, Rlen,
1884                                        B->coeffs, B->exps, B->length,  S);
1885             }
1886             else
1887             {
1888                 T2->length = _fmpz_mpoly_divides_stripe(
1889                                     &T2->coeffs, &T2->exps, &T2->alloc,
1890                                        Rcoeff, Rexp, Rlen,
1891                                        B->coeffs, B->exps, B->length,  S);
1892             }
1893             if (T2->length == 0)
1894             {
1895                 H->failed = 1;
1896                 return;
1897             }
1898             else
1899             {
1900                 fmpz_mpoly_ts_append(H->polyQ, T2->coeffs, T2->exps,
1901                                                                 T2->length, N);
1902             }
1903         }
1904 
1905         next = L->next;
1906         H->length--;
1907         H->cur = next;
1908 
1909         if (next != NULL)
1910         {
1911             next->producer = 1;
1912         }
1913 
1914         L->producer = 0;
1915         L->mq = -1;
1916     }
1917 
1918     return;
1919 }
1920 
1921 
worker_loop(void * varg)1922 static void worker_loop(void * varg)
1923 {
1924     worker_arg_struct * W = (worker_arg_struct *) varg;
1925     divides_heap_base_struct * H = W->H;
1926     fmpz_mpoly_stripe_struct * S = W->S;
1927     const fmpz_mpoly_struct * B = H->polyB;
1928     fmpz_mpoly_struct * T1 = W->polyT1;
1929     fmpz_mpoly_struct * T2 = W->polyT2;
1930     slong N = H->N;
1931     slong Blen = B->length;
1932 
1933     /* initialize stripe working memory */
1934     S->N = N;
1935     S->bits = H->bits;
1936     S->coeff_bits = FLINT_ABS(H->polyBcoeff_bits);
1937     S->cmpmask = H->cmpmask;
1938     S->big_mem_alloc = 0;
1939     S->big_mem = NULL;
1940 
1941     stripe_fit_length(S, Blen);
1942 
1943     fmpz_mpoly_init2(T1, 16, H->ctx);
1944     fmpz_mpoly_fit_bits(T1, H->bits, H->ctx);
1945     T1->bits = H->bits;
1946     fmpz_mpoly_init2(T2, 16, H->ctx);
1947     fmpz_mpoly_fit_bits(T2, H->bits, H->ctx);
1948     T2->bits = H->bits;
1949 
1950     while (!H->failed)
1951     {
1952         divides_heap_chunk_struct * L;
1953         L = H->cur;
1954 
1955         if (L == NULL)
1956         {
1957             break;
1958         }
1959         while (L != NULL)
1960         {
1961 #if HAVE_PTHREAD
1962             pthread_mutex_lock(&H->mutex);
1963 #endif
1964 	    if (L->lock != -1)
1965             {
1966                 L->lock = -1;
1967 #if HAVE_PTHREAD
1968                 pthread_mutex_unlock(&H->mutex);
1969 #endif
1970 		trychunk(W, L);
1971 #if HAVE_PTHREAD
1972                 pthread_mutex_lock(&H->mutex);
1973 #endif
1974 		L->lock = 0;
1975 #if HAVE_PTHREAD
1976                 pthread_mutex_unlock(&H->mutex);
1977 #endif
1978 		break;
1979             }
1980             else
1981             {
1982 #if HAVE_PTHREAD
1983                 pthread_mutex_unlock(&H->mutex);
1984 #endif
1985 	    }
1986 
1987             L = L->next;
1988         }
1989     }
1990 
1991     fmpz_mpoly_clear(T1, H->ctx);
1992     fmpz_mpoly_clear(T2, H->ctx);
1993     flint_free(S->big_mem);
1994 
1995     return;
1996 }
1997 
1998 
1999 /* return 1 if quotient is exact */
_fmpz_mpoly_divides_heap_threaded_pool(fmpz_mpoly_t Q,const fmpz_mpoly_t A,const fmpz_mpoly_t B,const fmpz_mpoly_ctx_t ctx,const thread_pool_handle * handles,slong num_handles)2000 int _fmpz_mpoly_divides_heap_threaded_pool(
2001     fmpz_mpoly_t Q,
2002     const fmpz_mpoly_t A,
2003     const fmpz_mpoly_t B,
2004     const fmpz_mpoly_ctx_t ctx,
2005     const thread_pool_handle * handles,
2006     slong num_handles)
2007 {
2008     ulong mask;
2009     int divides;
2010     fmpz_mpoly_ctx_t zctx;
2011     fmpz_mpoly_t S;
2012     slong i, k, N;
2013     flint_bitcnt_t exp_bits;
2014     ulong * cmpmask;
2015     ulong * Aexp, * Bexp;
2016     int freeAexp, freeBexp;
2017     worker_arg_struct * worker_args;
2018     fmpz_t qcoeff, r;
2019     ulong * texps, * qexps;
2020     divides_heap_base_t H;
2021     TMP_INIT;
2022 
2023 #if !FLINT_KNOW_STRONG_ORDER
2024     return fmpz_mpoly_divides_monagan_pearce(Q, A, B, ctx);
2025 #endif
2026 
2027     if (B->length < 2 || A->length < 2)
2028     {
2029         return fmpz_mpoly_divides_monagan_pearce(Q, A, B, ctx);
2030     }
2031 
2032     TMP_START;
2033 
2034     fmpz_init(qcoeff);
2035     fmpz_init(r);
2036 
2037     exp_bits = MPOLY_MIN_BITS;
2038     exp_bits = FLINT_MAX(exp_bits, A->bits);
2039     exp_bits = FLINT_MAX(exp_bits, B->bits);
2040     exp_bits = mpoly_fix_bits(exp_bits, ctx->minfo);
2041 
2042     N = mpoly_words_per_exp(exp_bits, ctx->minfo);
2043     cmpmask = (ulong*) TMP_ALLOC(N*sizeof(ulong));
2044     mpoly_get_cmpmask(cmpmask, N, exp_bits, ctx->minfo);
2045 
2046     /* ensure input exponents packed to same size as output exponents */
2047     Aexp = A->exps;
2048     freeAexp = 0;
2049     if (exp_bits > A->bits)
2050     {
2051         freeAexp = 1;
2052         Aexp = (ulong *) flint_malloc(N*A->length*sizeof(ulong));
2053         mpoly_repack_monomials(Aexp, exp_bits, A->exps, A->bits,
2054                                                         A->length, ctx->minfo);
2055     }
2056 
2057     Bexp = B->exps;
2058     freeBexp = 0;
2059     if (exp_bits > B->bits)
2060     {
2061         freeBexp = 1;
2062         Bexp = (ulong *) flint_malloc(N*B->length*sizeof(ulong));
2063         mpoly_repack_monomials(Bexp, exp_bits, B->exps, B->bits,
2064                                                     B->length, ctx->minfo);
2065     }
2066 
2067     fmpz_mpoly_ctx_init(zctx, ctx->minfo->nvars, ctx->minfo->ord);
2068     fmpz_mpoly_init(S, zctx);
2069 
2070     fmpz_mod(r, A->coeffs + 0, B->coeffs + 0);
2071     if (!fmpz_is_zero(r))
2072     {
2073         divides = 0;
2074         fmpz_mpoly_zero(Q, ctx);
2075         goto cleanup1;
2076     }
2077 
2078     if (mpoly_divides_select_exps(S, zctx, num_handles,
2079                                    Aexp, A->length, Bexp, B->length, exp_bits))
2080     {
2081         divides = 0;
2082         fmpz_mpoly_zero(Q, ctx);
2083         goto cleanup1;
2084     }
2085 
2086     /*
2087         At this point A and B both have at least two terms
2088             and the leading coefficients and monomials divide
2089             and the exponent selection did not give an easy exit
2090     */
2091 
2092     divides_heap_base_init(H);
2093 
2094     H->polyA->coeffs = A->coeffs;
2095     H->polyA->exps = Aexp;
2096     H->polyA->bits = exp_bits;
2097     H->polyA->length = A->length;
2098     H->polyA->alloc = A->alloc;
2099 
2100     H->polyB->coeffs = B->coeffs;
2101     H->polyB->exps = Bexp;
2102     H->polyB->bits = exp_bits;
2103     H->polyB->length = B->length;
2104     H->polyB->alloc = B->alloc;
2105 
2106     H->polyBcoeff_bits = _fmpz_vec_max_bits(B->coeffs, B->length);
2107 
2108     H->ctx = ctx;
2109     H->bits = exp_bits;
2110     H->N = N;
2111     H->cmpmask = cmpmask;
2112     H->failed = 0;
2113 
2114     for (i = 0; i + 1 < S->length; i++)
2115     {
2116         divides_heap_chunk_struct * L;
2117         L = (divides_heap_chunk_struct *) flint_malloc(
2118                                             sizeof(divides_heap_chunk_struct));
2119         L->ma = 0;
2120         L->mq = 0;
2121         L->emax = S->exps + N*i;
2122         L->emin = S->exps + N*(i + 1);
2123         L->upperclosed = 0;
2124         L->startidx = B->length;
2125         L->endidx = B->length;
2126         L->producer = 0;
2127         L->Cinited = 0;
2128         L->lock = -2;
2129         divides_heap_base_add_chunk(H, L);
2130     }
2131 
2132     H->head->upperclosed = 1;
2133     H->head->producer = 1;
2134     H->cur = H->head;
2135 
2136     /* generate at least the first quotient term */
2137 
2138     texps = (ulong *) TMP_ALLOC(N*sizeof(ulong));
2139     qexps = (ulong *) TMP_ALLOC(N*sizeof(ulong));
2140 
2141     mpoly_monomial_sub_mp(qexps + N*0, Aexp + N*0, Bexp + N*0, N);
2142     fmpz_divexact(qcoeff, A->coeffs + 0, B->coeffs + 0); /* already checked */
2143 
2144     fmpz_mpoly_ts_init(H->polyQ, qcoeff, qexps, 1, H->bits, H->N);
2145 
2146     mpoly_monomial_add_mp(texps, qexps + N*0, Bexp + N*1, N);
2147 
2148     mask = 0;
2149     for (i = 0; i < FLINT_BITS/exp_bits; i++)
2150         mask = (mask << exp_bits) + (UWORD(1) << (exp_bits - 1));
2151 
2152     k = 1;
2153     while (k < A->length && mpoly_monomial_gt(Aexp + N*k, texps, N, cmpmask))
2154     {
2155         int lt_divides;
2156         if (exp_bits <= FLINT_BITS)
2157             lt_divides = mpoly_monomial_divides(qexps, Aexp + N*k,
2158                                                       Bexp + N*0, N, mask);
2159         else
2160             lt_divides = mpoly_monomial_divides_mp(qexps, Aexp + N*k,
2161                                                       Bexp + N*0, N, exp_bits);
2162         if (!lt_divides)
2163         {
2164             H->failed = 1;
2165             break;
2166         }
2167 
2168         fmpz_fdiv_qr(qcoeff, r, A->coeffs + k, B->coeffs + 0);
2169         if (!fmpz_is_zero(r))
2170         {
2171             H->failed = 1;
2172             break;
2173         }
2174 
2175         fmpz_mpoly_ts_append(H->polyQ, qcoeff, qexps, 1, H->N);
2176         k++;
2177     }
2178 
2179     /* start the workers */
2180 
2181 #if HAVE_PTHREAD
2182     pthread_mutex_init(&H->mutex, NULL);
2183 #endif
2184 
2185     worker_args = (worker_arg_struct *) flint_malloc((num_handles + 1)
2186                                                         *sizeof(worker_arg_t));
2187 
2188     for (i = 0; i < num_handles; i++)
2189     {
2190         (worker_args + i)->H = H;
2191         thread_pool_wake(global_thread_pool, handles[i], 0,
2192                                                  worker_loop, worker_args + i);
2193     }
2194     (worker_args + num_handles)->H = H;
2195     worker_loop(worker_args + num_handles);
2196     for (i = 0; i < num_handles; i++)
2197     {
2198         thread_pool_wait(global_thread_pool, handles[i]);
2199     }
2200 
2201     flint_free(worker_args);
2202 
2203 #if HAVE_PTHREAD
2204     pthread_mutex_destroy(&H->mutex);
2205 #endif
2206 
2207     divides = divides_heap_base_clear(Q, H);
2208 
2209 cleanup1:
2210     fmpz_mpoly_clear(S, zctx);
2211     fmpz_mpoly_ctx_clear(zctx);
2212 
2213     if (freeAexp)
2214         flint_free(Aexp);
2215 
2216     if (freeBexp)
2217         flint_free(Bexp);
2218 
2219     fmpz_clear(qcoeff);
2220     fmpz_clear(r);
2221 
2222     TMP_END;
2223 
2224     return divides;
2225 }
2226 
2227 
fmpz_mpoly_divides_heap_threaded(fmpz_mpoly_t Q,const fmpz_mpoly_t A,const fmpz_mpoly_t B,const fmpz_mpoly_ctx_t ctx)2228 int fmpz_mpoly_divides_heap_threaded(
2229     fmpz_mpoly_t Q,
2230     const fmpz_mpoly_t A,
2231     const fmpz_mpoly_t B,
2232     const fmpz_mpoly_ctx_t ctx)
2233 {
2234     thread_pool_handle * handles;
2235     slong num_handles;
2236     int divides;
2237     slong thread_limit = A->length/32;
2238 
2239     if (B->length < 2 || A->length < 2)
2240     {
2241         if (B->length == 0)
2242         {
2243             flint_throw(FLINT_DIVZERO,
2244                          "Divide by zero in fmpz_mpoly_divides_heap_threaded");
2245         }
2246 
2247         if (A->length == 0)
2248         {
2249             fmpz_mpoly_zero(Q, ctx);
2250             return 1;
2251         }
2252         return fmpz_mpoly_divides_monagan_pearce(Q, A, B, ctx);
2253     }
2254 
2255     num_handles = flint_request_threads(&handles, thread_limit);
2256 
2257     divides = _fmpz_mpoly_divides_heap_threaded_pool(Q, A, B, ctx,
2258                                                          handles, num_handles);
2259 
2260     flint_give_back_threads(handles, num_handles);
2261 
2262     return divides;
2263 }
2264