1 /*
2 Copyright (C) 2019 Daniel Schultz
3
4 This file is part of FLINT.
5
6 FLINT is free software: you can redistribute it and/or modify it under
7 the terms of the GNU Lesser General Public License (LGPL) as published
8 by the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version. See <http://www.gnu.org/licenses/>.
10 */
11
12 #include "thread_pool.h"
13 #include "fmpz_mpoly.h"
14
15 /*
16 a thread safe mpoly supports three mutating operations
17 - init from an array of terms
18 - append an array of terms
19 - clear out contents to a normal mpoly
20 */
21 typedef struct _fmpz_mpoly_ts_struct
22 {
23 fmpz * volatile coeffs; /* this is coeff_array[idx] */
24 ulong * volatile exps; /* this is exp_array[idx] */
25 volatile slong length;
26 slong alloc;
27 flint_bitcnt_t bits;
28 flint_bitcnt_t idx;
29 ulong * exp_array[FLINT_BITS];
30 fmpz * coeff_array[FLINT_BITS];
31 } fmpz_mpoly_ts_struct;
32
33 typedef fmpz_mpoly_ts_struct fmpz_mpoly_ts_t[1];
34
35 /* Bcoeff is changed */
fmpz_mpoly_ts_init(fmpz_mpoly_ts_t A,fmpz * Bcoeff,ulong * Bexp,slong Blen,flint_bitcnt_t bits,slong N)36 void fmpz_mpoly_ts_init(fmpz_mpoly_ts_t A,
37 fmpz * Bcoeff, ulong * Bexp, slong Blen,
38 flint_bitcnt_t bits, slong N)
39 {
40 slong i;
41 flint_bitcnt_t idx = FLINT_BIT_COUNT(Blen);
42 idx = (idx <= 8) ? 0 : idx - 8;
43 for (i = 0; i < FLINT_BITS; i++)
44 {
45 A->exp_array[i] = NULL;
46 A->coeff_array[i] = NULL;
47 }
48 A->bits = bits;
49 A->idx = idx;
50 A->alloc = WORD(256) << idx;
51 A->exps = A->exp_array[idx]
52 = (ulong *) flint_malloc(N*A->alloc*sizeof(ulong));
53 A->coeffs = A->coeff_array[idx]
54 = (fmpz *) flint_calloc(A->alloc, sizeof(fmpz));
55 A->length = Blen;
56 for (i = 0; i < Blen; i++)
57 {
58 fmpz_swap(A->coeffs + i, Bcoeff + i);
59 mpoly_monomial_set(A->exps + N*i, Bexp + N*i, N);
60 }
61 }
62
fmpz_mpoly_ts_print(const fmpz_mpoly_ts_t B,const char ** x,const fmpz_mpoly_ctx_t ctx)63 void fmpz_mpoly_ts_print(const fmpz_mpoly_ts_t B, const char ** x,
64 const fmpz_mpoly_ctx_t ctx)
65 {
66 fmpz_mpoly_t A;
67 A->length = B->length;
68 A->alloc = B->alloc;
69 A->coeffs = B->coeffs;
70 A->exps = B->exps;
71 A->bits = B->bits;
72 fmpz_mpoly_print_pretty(A, x, ctx);
73
74 fmpz_mpoly_assert_canonical(A, ctx);
75 }
76
fmpz_mpoly_ts_clear(fmpz_mpoly_ts_t A)77 void fmpz_mpoly_ts_clear(fmpz_mpoly_ts_t A)
78 {
79 slong i;
80
81 for (i = 0; i < A->length; i++)
82 {
83 fmpz_clear(A->coeffs + i);
84 }
85
86 for (i = 0; i < FLINT_BITS; i++)
87 {
88 if (A->exp_array[i] != NULL)
89 {
90 FLINT_ASSERT(A->coeff_array[i] != NULL);
91 flint_free(A->coeff_array[i]);
92 flint_free(A->exp_array[i]);
93 }
94 }
95 }
96
fmpz_mpoly_ts_clear_poly(fmpz_mpoly_t Q,fmpz_mpoly_ts_t A)97 void fmpz_mpoly_ts_clear_poly(fmpz_mpoly_t Q, fmpz_mpoly_ts_t A)
98 {
99 if (Q->alloc != 0)
100 {
101 slong i;
102
103 FLINT_ASSERT(Q->exps != NULL);
104 FLINT_ASSERT(Q->coeffs != NULL);
105
106 for (i = 0; i < Q->alloc; i++)
107 {
108 _fmpz_demote(Q->coeffs + i);
109 }
110 flint_free(Q->exps);
111 flint_free(Q->coeffs);
112 }
113
114 Q->exps = A->exps;
115 Q->coeffs = A->coeffs;
116 Q->bits = A->bits;
117 Q->alloc = A->alloc;
118 Q->length = A->length;
119
120 A->length = 0;
121 A->coeff_array[A->idx] = NULL;
122 A->exp_array[A->idx] = NULL;
123 fmpz_mpoly_ts_clear(A);
124 }
125
126
127 /* put B on the end of A - Bcoeff is changed*/
fmpz_mpoly_ts_append(fmpz_mpoly_ts_t A,fmpz * Bcoeff,ulong * Bexps,slong Blen,slong N)128 void fmpz_mpoly_ts_append(fmpz_mpoly_ts_t A,
129 fmpz * Bcoeff, ulong * Bexps, slong Blen, slong N)
130 {
131 /* TODO: this needs barriers on non-x86 */
132
133 slong i;
134 ulong * oldexps = A->exps;
135 fmpz * oldcoeffs = A->coeffs;
136 slong oldlength = A->length;
137 slong newlength = A->length + Blen;
138
139 if (newlength <= A->alloc)
140 {
141 /* write new terms first */
142 for (i = 0; i < Blen; i++)
143 {
144 fmpz_swap(oldcoeffs + oldlength + i, Bcoeff + i);
145 mpoly_monomial_set(oldexps + N*(oldlength + i), Bexps + N*i, N);
146 }
147 }
148 else
149 {
150 slong newalloc;
151 ulong * newexps;
152 fmpz * newcoeffs;
153 flint_bitcnt_t newidx;
154 newidx = FLINT_BIT_COUNT(newlength - 1);
155 newidx = (newidx > 8) ? newidx - 8 : 0;
156 FLINT_ASSERT(newidx > A->idx);
157
158 newalloc = UWORD(256) << newidx;
159 FLINT_ASSERT(newlength <= newalloc);
160 newexps = A->exp_array[newidx]
161 = (ulong *) flint_malloc(N*newalloc*sizeof(ulong));
162 newcoeffs = A->coeff_array[newidx]
163 = (fmpz *) flint_calloc(newalloc, sizeof(fmpz));
164
165 for (i = 0; i < oldlength; i++)
166 {
167 newcoeffs[i] = oldcoeffs[i]; /* just copy the bits */
168 mpoly_monomial_set(newexps + N*i, oldexps + N*i, N);
169 }
170 for (i = 0; i < Blen; i++)
171 {
172 fmpz_swap(newcoeffs + oldlength + i, Bcoeff + i);
173 mpoly_monomial_set(newexps + N*(oldlength + i), Bexps + N*i, N);
174 }
175
176 A->alloc = newalloc;
177 A->exps = newexps;
178 A->coeffs = newcoeffs;
179 A->idx = newidx;
180
181 /* do not free oldcoeff/exps as other threads may be using them */
182 }
183
184 /* update length at the very end */
185 A->length = newlength;
186 }
187
188
189 /*
190 a chunk holds an exponent range on the dividend
191 */
192 typedef struct _divides_heap_chunk_struct
193 {
194 fmpz_mpoly_t polyC;
195 struct _divides_heap_chunk_struct * next;
196 ulong * emin;
197 ulong * emax;
198 slong startidx;
199 slong endidx;
200 int upperclosed;
201 volatile int lock;
202 volatile int producer;
203 volatile slong ma;
204 volatile slong mq;
205 int Cinited;
206 } divides_heap_chunk_struct;
207
208 typedef divides_heap_chunk_struct divides_heap_chunk_t[1];
209
210 /*
211 the base struct includes a linked list of chunks
212 */
213 typedef struct
214 {
215 #if HAVE_PTHREAD
216 pthread_mutex_t mutex;
217 #endif
218 divides_heap_chunk_struct * head;
219 divides_heap_chunk_struct * tail;
220 divides_heap_chunk_struct * volatile cur;
221 fmpz_mpoly_t polyA;
222 fmpz_mpoly_t polyB;
223 fmpz_mpoly_ts_t polyQ;
224 const fmpz_mpoly_ctx_struct * ctx;
225 slong length;
226 slong N;
227 flint_bitcnt_t bits;
228 slong polyBcoeff_bits;
229 ulong * cmpmask;
230 int failed;
231 } divides_heap_base_struct;
232
233 typedef divides_heap_base_struct divides_heap_base_t[1];
234
235 /*
236 the worker stuct has a big chunk of memory in the stripe_t
237 and two polys for work space
238 */
239 typedef struct _worker_arg_struct
240 {
241 divides_heap_base_struct * H;
242 fmpz_mpoly_stripe_t S;
243 fmpz_mpoly_t polyT1;
244 fmpz_mpoly_t polyT2;
245 } worker_arg_struct;
246
247 typedef worker_arg_struct worker_arg_t[1];
248
249
divides_heap_base_init(divides_heap_base_t H)250 static void divides_heap_base_init(divides_heap_base_t H)
251 {
252 H->head = NULL;
253 H->tail = NULL;
254 H->cur = NULL;
255 H->ctx = NULL;
256 H->length = 0;
257 H->N = 0;
258 H->bits = 0;
259 H->cmpmask = NULL;
260 }
261
divides_heap_chunk_clear(divides_heap_chunk_t L,divides_heap_base_t H)262 static void divides_heap_chunk_clear(divides_heap_chunk_t L, divides_heap_base_t H)
263 {
264 if (L->Cinited)
265 {
266 fmpz_mpoly_clear(L->polyC, H->ctx);
267 }
268 }
divides_heap_base_clear(fmpz_mpoly_t Q,divides_heap_base_t H)269 static int divides_heap_base_clear(fmpz_mpoly_t Q, divides_heap_base_t H)
270 {
271 divides_heap_chunk_struct * L = H->head;
272 while (L != NULL)
273 {
274 divides_heap_chunk_struct * nextL = L->next;
275 divides_heap_chunk_clear(L, H);
276 flint_free(L);
277 L = nextL;
278 }
279 H->head = NULL;
280 H->tail = NULL;
281 H->cur = NULL;
282 H->ctx = NULL;
283 H->length = 0;
284 H->N = 0;
285 H->bits = 0;
286 H->cmpmask = NULL;
287
288 if (H->failed)
289 {
290 fmpz_mpoly_zero(Q, H->ctx);
291 fmpz_mpoly_ts_clear(H->polyQ);
292 return 0;
293 }
294 else
295 {
296 fmpz_mpoly_ts_clear_poly(Q, H->polyQ);
297 return 1;
298 }
299 }
300
divides_heap_base_add_chunk(divides_heap_base_t H,divides_heap_chunk_t L)301 static void divides_heap_base_add_chunk(divides_heap_base_t H, divides_heap_chunk_t L)
302 {
303 L->next = NULL;
304
305 if (H->tail == NULL)
306 {
307 FLINT_ASSERT(H->head == NULL);
308 H->tail = L;
309 H->head = L;
310 }
311 else
312 {
313 divides_heap_chunk_struct * tail = H->tail;
314 FLINT_ASSERT(tail->next == NULL);
315 tail->next = L;
316 H->tail = L;
317 }
318 H->length++;
319 }
320
321
322 /*
323 A = D - (a stripe of B * C)
324 S->startidx and S->endidx are assumed to be correct
325 that is, we expect and successive calls to keep
326 B decreasing
327 C the same
328
329 saveD: 0 means we can modify coeffs of input D
330 1 means we must not modify coeffs of input D
331 */
_fmpz_mpoly_mulsub_stripe1(fmpz ** A_coeff,ulong ** A_exp,slong * A_alloc,const fmpz * Dcoeff,const ulong * Dexp,slong Dlen,int saveD,const fmpz * Bcoeff,const ulong * Bexp,slong Blen,const fmpz * Ccoeff,const ulong * Cexp,slong Clen,const fmpz_mpoly_stripe_t S)332 slong _fmpz_mpoly_mulsub_stripe1(fmpz ** A_coeff, ulong ** A_exp, slong * A_alloc,
333 const fmpz * Dcoeff, const ulong * Dexp, slong Dlen, int saveD,
334 const fmpz * Bcoeff, const ulong * Bexp, slong Blen,
335 const fmpz * Ccoeff, const ulong * Cexp, slong Clen,
336 const fmpz_mpoly_stripe_t S)
337 {
338 int upperclosed;
339 slong startidx, endidx;
340 ulong prev_startidx;
341 ulong maskhi = S->cmpmask[0];
342 ulong emax = S->emax[0];
343 ulong emin = S->emin[0];
344 slong i, j;
345 slong next_loc = Blen + 4; /* something bigger than heap can ever be */
346 slong heap_len = 1; /* heap zero index unused */
347 mpoly_heap1_s * heap;
348 mpoly_heap_t * chain;
349 slong * store, * store_base;
350 mpoly_heap_t * x;
351 slong Di;
352 slong Alen;
353 slong Aalloc = *A_alloc;
354 fmpz * Acoeff = *A_coeff;
355 ulong * Aexp = *A_exp;
356 ulong exp;
357 slong * ends;
358 ulong texp;
359 slong * hind;
360 int small;
361 ulong acc_sm[3], pp0, pp1;
362
363 FLINT_ASSERT(S->N == 1);
364
365 i = 0;
366 hind = (slong *)(S->big_mem + i);
367 i += Blen*sizeof(slong);
368 ends = (slong *)(S->big_mem + i);
369 i += Blen*sizeof(slong);
370 store = store_base = (slong *) (S->big_mem + i);
371 i += 2*Blen*sizeof(slong);
372 heap = (mpoly_heap1_s *)(S->big_mem + i);
373 i += (Blen + 1)*sizeof(mpoly_heap1_s);
374 chain = (mpoly_heap_t *)(S->big_mem + i);
375 i += Blen*sizeof(mpoly_heap_t);
376 FLINT_ASSERT(i <= S->big_mem_alloc);
377
378 startidx = *S->startidx;
379 endidx = *S->endidx;
380 upperclosed = S->upperclosed;
381 emax = S->emax[0];
382 emin = S->emin[0];
383
384 /* put all the starting nodes on the heap */
385 prev_startidx = -UWORD(1);
386 for (i = 0; i < Blen; i++)
387 {
388 if (startidx < Clen)
389 {
390 texp = Bexp[i] + Cexp[startidx];
391 FLINT_ASSERT(mpoly_monomial_cmp1(emax, texp, maskhi)
392 > -upperclosed);
393 }
394 while (startidx > 0)
395 {
396 texp = Bexp[i] + Cexp[startidx - 1];
397 if (mpoly_monomial_cmp1(emax, texp, maskhi) <= -upperclosed)
398 {
399 break;
400 }
401 startidx--;
402 }
403
404 if (endidx < Clen)
405 {
406 texp = Bexp[i] + Cexp[endidx];
407 FLINT_ASSERT(mpoly_monomial_cmp1(emin, texp, maskhi) > 0);
408 }
409 while (endidx > 0)
410 {
411 texp = Bexp[i] + Cexp[endidx - 1];
412 if (mpoly_monomial_cmp1(emin, texp, maskhi) <= 0)
413 {
414 break;
415 }
416 endidx--;
417 }
418
419 ends[i] = endidx;
420
421 hind[i] = 2*startidx + 1;
422
423 if ( (startidx < endidx)
424 && (((ulong)startidx) < prev_startidx)
425 )
426 {
427 x = chain + i;
428 x->i = i;
429 x->j = startidx;
430 x->next = NULL;
431 hind[x->i] = 2*(x->j + 1) + 0;
432 _mpoly_heap_insert1(heap, Bexp[x->i] + Cexp[x->j], x,
433 &next_loc, &heap_len, maskhi);
434 }
435
436 prev_startidx = startidx;
437 }
438
439 /* set the indices for the next time mul is called */
440 *S->startidx = startidx;
441 *S->endidx = endidx;
442
443 /* whether input coeffs are small, thus output coeffs fit in three words */
444 FLINT_ASSERT(ends[0] >= startidx);
445 FLINT_ASSERT(S->coeff_bits == FLINT_ABS(_fmpz_vec_max_bits(Ccoeff, Clen)));
446 small = S->coeff_bits <= FLINT_BITS - 2
447 && _fmpz_mpoly_fits_small(Bcoeff, Blen)
448 && FLINT_ABS(_fmpz_vec_max_bits(Dcoeff, Dlen)) < 3*FLINT_BITS - 3;
449
450 Alen = 0;
451 Di = 0;
452 while (heap_len > 1)
453 {
454 exp = heap[1].exp;
455
456 while (Di < Dlen && mpoly_monomial_gt1(Dexp[Di], exp, maskhi))
457 {
458 _fmpz_mpoly_fit_length(&Acoeff, &Aexp, &Aalloc, Alen + 1, 1);
459 Aexp[Alen] = Dexp[Di];
460 if (saveD)
461 fmpz_set(Acoeff + Alen, Dcoeff + Di);
462 else
463 fmpz_swap(Acoeff + Alen, (fmpz *)(Dcoeff + Di));
464 Alen++;
465 Di++;
466 }
467
468 _fmpz_mpoly_fit_length(&Acoeff, &Aexp, &Aalloc, Alen + 1, 1);
469
470 Aexp[Alen] = exp;
471
472 if (small)
473 {
474 acc_sm[0] = acc_sm[1] = acc_sm[2] = 0;
475 if (Di < Dlen && Dexp[Di] == exp)
476 {
477 _fmpz_mpoly_add_uiuiui_fmpz(acc_sm, Dcoeff + Di);
478 Di++;
479 }
480
481 do
482 {
483 x = _mpoly_heap_pop1(heap, &heap_len, maskhi);
484
485 hind[x->i] |= WORD(1);
486 *store++ = x->i;
487 *store++ = x->j;
488 FLINT_ASSERT(startidx <= x->j);
489 FLINT_ASSERT(x->j < ends[0]);
490 FLINT_ASSERT(!COEFF_IS_MPZ(Bcoeff[x->i]));
491 FLINT_ASSERT(!COEFF_IS_MPZ(Ccoeff[x->j]));
492 smul_ppmm(pp1, pp0, Bcoeff[x->i], Ccoeff[x->j]);
493 sub_dddmmmsss(acc_sm[2], acc_sm[1], acc_sm[0],
494 acc_sm[2], acc_sm[1], acc_sm[0],
495 FLINT_SIGN_EXT(pp1), pp1, pp0);
496
497 while ((x = x->next) != NULL)
498 {
499 hind[x->i] |= WORD(1);
500 *store++ = x->i;
501 *store++ = x->j;
502 FLINT_ASSERT(startidx <= x->j);
503 FLINT_ASSERT(x->j < ends[0]);
504 FLINT_ASSERT(!COEFF_IS_MPZ(Bcoeff[x->i]));
505 FLINT_ASSERT(!COEFF_IS_MPZ(Ccoeff[x->j]));
506 smul_ppmm(pp1, pp0, Bcoeff[x->i], Ccoeff[x->j]);
507 sub_dddmmmsss(acc_sm[2], acc_sm[1], acc_sm[0],
508 acc_sm[2], acc_sm[1], acc_sm[0],
509 FLINT_SIGN_EXT(pp1), pp1, pp0);
510 }
511 } while (heap_len > 1 && heap[1].exp == exp);
512
513 fmpz_set_signed_uiuiui(Acoeff + Alen,
514 acc_sm[2], acc_sm[1], acc_sm[0]);
515 }
516 else
517 {
518 if (Di < Dlen && Dexp[Di] == exp)
519 {
520 fmpz_set(Acoeff + Alen, Dcoeff + Di);
521 Di++;
522 }
523 else
524 {
525 fmpz_zero(Acoeff + Alen);
526 }
527
528 do
529 {
530 x = _mpoly_heap_pop1(heap, &heap_len, maskhi);
531
532 hind[x->i] |= WORD(1);
533 *store++ = x->i;
534 *store++ = x->j;
535 fmpz_submul(Acoeff + Alen, Bcoeff + x->i, Ccoeff + x->j);
536
537 while ((x = x->next) != NULL)
538 {
539 hind[x->i] |= WORD(1);
540 *store++ = x->i;
541 *store++ = x->j;
542 fmpz_submul(Acoeff + Alen, Bcoeff + x->i, Ccoeff + x->j);
543 }
544 } while (heap_len > 1 && heap[1].exp == exp);
545 }
546
547 Alen += !fmpz_is_zero(Acoeff + Alen);
548
549 /* process nodes taken from the heap */
550 while (store > store_base)
551 {
552 j = *--store;
553 i = *--store;
554
555 /* should we go right? */
556 if ( (i + 1 < Blen)
557 && (j + 0 < ends[i + 1])
558 && (hind[i + 1] == 2*j + 1)
559 )
560 {
561 x = chain + i + 1;
562 x->i = i + 1;
563 x->j = j;
564 x->next = NULL;
565
566 hind[x->i] = 2*(x->j + 1) + 0;
567 _mpoly_heap_insert1(heap, Bexp[x->i] + Cexp[x->j], x,
568 &next_loc, &heap_len, maskhi);
569 }
570
571 /* should we go up? */
572 if ( (j + 1 < ends[i + 0])
573 && ((hind[i] & 1) == 1)
574 && ( (i == 0)
575 || (hind[i - 1] >= 2*(j + 2) + 1)
576 )
577 )
578 {
579 x = chain + i;
580 x->i = i;
581 x->j = j + 1;
582 x->next = NULL;
583
584 hind[x->i] = 2*(x->j + 1) + 0;
585 _mpoly_heap_insert1(heap, Bexp[x->i] + Cexp[x->j], x,
586 &next_loc, &heap_len, maskhi);
587 }
588 }
589 }
590
591 FLINT_ASSERT(Di <= Dlen);
592 _fmpz_mpoly_fit_length(&Acoeff, &Aexp, &Aalloc, Alen + Dlen - Di, 1);
593 if (saveD)
594 _fmpz_vec_set(Acoeff + Alen, Dcoeff + Di, Dlen - Di);
595 else
596 _fmpz_vec_swap(Acoeff + Alen, (fmpz *)(Dcoeff + Di), Dlen - Di);
597 mpoly_copy_monomials(Aexp + 1*Alen, Dexp + 1*Di, Dlen - Di, 1);
598 Alen += Dlen - Di;
599
600 *A_coeff = Acoeff;
601 *A_exp = Aexp;
602 *A_alloc = Aalloc;
603
604 return Alen;
605 }
606
607
_fmpz_mpoly_mulsub_stripe(fmpz ** A_coeff,ulong ** A_exp,slong * A_alloc,const fmpz * Dcoeff,const ulong * Dexp,slong Dlen,int saveD,const fmpz * Bcoeff,const ulong * Bexp,slong Blen,const fmpz * Ccoeff,const ulong * Cexp,slong Clen,const fmpz_mpoly_stripe_t S)608 slong _fmpz_mpoly_mulsub_stripe(fmpz ** A_coeff, ulong ** A_exp, slong * A_alloc,
609 const fmpz * Dcoeff, const ulong * Dexp, slong Dlen, int saveD,
610 const fmpz * Bcoeff, const ulong * Bexp, slong Blen,
611 const fmpz * Ccoeff, const ulong * Cexp, slong Clen,
612 const fmpz_mpoly_stripe_t S)
613 {
614 int upperclosed;
615 slong startidx, endidx;
616 ulong prev_startidx;
617 ulong * emax = S->emax;
618 ulong * emin = S->emin;
619 slong N = S->N;
620 slong i, j;
621 slong next_loc = Blen + 4; /* something bigger than heap can ever be */
622 slong heap_len = 1; /* heap zero index unused */
623 mpoly_heap_s * heap;
624 mpoly_heap_t * chain;
625 slong * store, * store_base;
626 mpoly_heap_t * x;
627 slong Di;
628 slong Alen;
629 slong Aalloc = *A_alloc;
630 fmpz * Acoeff = *A_coeff;
631 ulong * Aexp = *A_exp;
632 ulong * exp, * exps;
633 ulong ** exp_list;
634 slong exp_next;
635 slong * ends;
636 ulong * texp;
637 slong * hind;
638 int small;
639 ulong acc_sm[3], pp0, pp1;
640
641 i = 0;
642 hind = (slong *)(S->big_mem + i);
643 i += Blen*sizeof(slong);
644 ends = (slong *)(S->big_mem + i);
645 i += Blen*sizeof(slong);
646 store = store_base = (slong *) (S->big_mem + i);
647 i += 2*Blen*sizeof(slong);
648 heap = (mpoly_heap_s *)(S->big_mem + i);
649 i += (Blen + 1)*sizeof(mpoly_heap_s);
650 chain = (mpoly_heap_t *)(S->big_mem + i);
651 i += Blen*sizeof(mpoly_heap_t);
652 exps = (ulong *)(S->big_mem + i);
653 i += Blen*N*sizeof(ulong);
654 exp_list = (ulong **)(S->big_mem + i);
655 i += Blen*sizeof(ulong *);
656 texp = (ulong *)(S->big_mem + i);
657 i += N*sizeof(ulong);
658 FLINT_ASSERT(i <= S->big_mem_alloc);
659
660 exp_next = 0;
661
662 startidx = *S->startidx;
663 endidx = *S->endidx;
664 upperclosed = S->upperclosed;
665
666 for (i = 0; i < Blen; i++)
667 exp_list[i] = exps + i*N;
668
669 /* put all the starting nodes on the heap */
670 prev_startidx = -UWORD(1);
671 for (i = 0; i < Blen; i++)
672 {
673 if (startidx < Clen)
674 {
675 mpoly_monomial_add_mp(texp, Bexp + N*i, Cexp + N*startidx, N);
676 FLINT_ASSERT(mpoly_monomial_cmp(emax, texp, N, S->cmpmask)
677 > -upperclosed);
678 }
679 while (startidx > 0)
680 {
681 mpoly_monomial_add_mp(texp, Bexp + N*i, Cexp + N*(startidx - 1), N);
682 if (mpoly_monomial_cmp(emax, texp, N, S->cmpmask) <= -upperclosed)
683 {
684 break;
685 }
686 startidx--;
687 }
688
689 if (endidx < Clen)
690 {
691 mpoly_monomial_add_mp(texp, Bexp + N*i, Cexp + N*endidx, N);
692 FLINT_ASSERT(mpoly_monomial_cmp(emin, texp, N, S->cmpmask) > 0);
693 }
694 while (endidx > 0)
695 {
696 mpoly_monomial_add_mp(texp, Bexp + N*i, Cexp + N*(endidx - 1), N);
697 if (mpoly_monomial_cmp(emin, texp, N, S->cmpmask) <= 0)
698 {
699 break;
700 }
701 endidx--;
702 }
703
704 ends[i] = endidx;
705
706 hind[i] = 2*startidx + 1;
707
708 if ( (startidx < endidx)
709 && (((ulong)startidx) < prev_startidx)
710 )
711 {
712 x = chain + i;
713 x->i = i;
714 x->j = startidx;
715 x->next = NULL;
716 hind[x->i] = 2*(x->j + 1) + 0;
717
718 mpoly_monomial_add_mp(exp_list[exp_next], Bexp + N*x->i,
719 Cexp + N*x->j, N);
720
721 exp_next += _mpoly_heap_insert(heap, exp_list[exp_next], x,
722 &next_loc, &heap_len, N, S->cmpmask);
723 }
724
725 prev_startidx = startidx;
726 }
727
728 *S->startidx = startidx;
729 *S->endidx = endidx;
730
731 /* whether input coeffs are small, thus output coeffs fit in three words */
732 FLINT_ASSERT(ends[0] >= startidx);
733 FLINT_ASSERT(S->coeff_bits == FLINT_ABS(_fmpz_vec_max_bits(Ccoeff, Clen)));
734 small = S->coeff_bits <= FLINT_BITS - 2
735 && _fmpz_mpoly_fits_small(Bcoeff, Blen)
736 && FLINT_ABS(_fmpz_vec_max_bits(Dcoeff, Dlen)) < 3*FLINT_BITS - 3;
737
738 Alen = 0;
739 Di = 0;
740 while (heap_len > 1)
741 {
742 exp = heap[1].exp;
743
744 while (Di < Dlen && mpoly_monomial_gt(Dexp + N*Di, exp, N, S->cmpmask))
745 {
746 _fmpz_mpoly_fit_length(&Acoeff, &Aexp, &Aalloc, Alen + 1, N);
747 mpoly_monomial_set(Aexp + N*Alen, Dexp + N*Di, N);
748 if (saveD)
749 fmpz_set(Acoeff + Alen, Dcoeff + Di);
750 else
751 fmpz_swap(Acoeff + Alen, (fmpz *)(Dcoeff + Di));
752 Alen++;
753 Di++;
754 }
755
756 _fmpz_mpoly_fit_length(&Acoeff, &Aexp, &Aalloc, Alen + 1, N);
757
758 mpoly_monomial_set(Aexp + N*Alen, exp, N);
759
760 if (small)
761 {
762 acc_sm[0] = acc_sm[1] = acc_sm[2] = 0;
763 if (Di < Dlen && mpoly_monomial_equal(Dexp + N*Di, exp, N))
764 {
765 _fmpz_mpoly_add_uiuiui_fmpz(acc_sm, Dcoeff + Di);
766 Di++;
767 }
768
769 do
770 {
771 exp_list[--exp_next] = heap[1].exp;
772 x = _mpoly_heap_pop(heap, &heap_len, N, S->cmpmask);
773
774 hind[x->i] |= WORD(1);
775 *store++ = x->i;
776 *store++ = x->j;
777 FLINT_ASSERT(startidx <= x->j);
778 FLINT_ASSERT(x->j < ends[0]);
779 FLINT_ASSERT(!COEFF_IS_MPZ(Bcoeff[x->i]));
780 FLINT_ASSERT(!COEFF_IS_MPZ(Ccoeff[x->j]));
781 smul_ppmm(pp1, pp0, Bcoeff[x->i], Ccoeff[x->j]);
782 sub_dddmmmsss(acc_sm[2], acc_sm[1], acc_sm[0],
783 acc_sm[2], acc_sm[1], acc_sm[0],
784 FLINT_SIGN_EXT(pp1), pp1, pp0);
785
786 while ((x = x->next) != NULL)
787 {
788 hind[x->i] |= WORD(1);
789 *store++ = x->i;
790 *store++ = x->j;
791 FLINT_ASSERT(startidx <= x->j);
792 FLINT_ASSERT(x->j < ends[0]);
793 FLINT_ASSERT(!COEFF_IS_MPZ(Bcoeff[x->i]));
794 FLINT_ASSERT(!COEFF_IS_MPZ(Ccoeff[x->j]));
795 smul_ppmm(pp1, pp0, Bcoeff[x->i], Ccoeff[x->j]);
796 sub_dddmmmsss(acc_sm[2], acc_sm[1], acc_sm[0],
797 acc_sm[2], acc_sm[1], acc_sm[0],
798 FLINT_SIGN_EXT(pp1), pp1, pp0);
799 }
800 } while (heap_len > 1 && mpoly_monomial_equal(heap[1].exp, exp, N));
801
802 fmpz_set_signed_uiuiui(Acoeff + Alen,
803 acc_sm[2], acc_sm[1], acc_sm[0]);
804 }
805 else
806 {
807 if (Di < Dlen && mpoly_monomial_equal(Dexp + N*Di, exp, N))
808 {
809 fmpz_set(Acoeff + Alen, Dcoeff + Di);
810 Di++;
811 }
812 else
813 {
814 fmpz_zero(Acoeff + Alen);
815 }
816
817 do
818 {
819 exp_list[--exp_next] = heap[1].exp;
820 x = _mpoly_heap_pop(heap, &heap_len, N, S->cmpmask);
821
822 hind[x->i] |= WORD(1);
823 *store++ = x->i;
824 *store++ = x->j;
825 FLINT_ASSERT(startidx <= x->j);
826 FLINT_ASSERT(x->j < ends[0]);
827 fmpz_submul(Acoeff + Alen, Bcoeff + x->i, Ccoeff + x->j);
828
829 while ((x = x->next) != NULL)
830 {
831 hind[x->i] |= WORD(1);
832 *store++ = x->i;
833 *store++ = x->j;
834 FLINT_ASSERT(startidx <= x->j);
835 FLINT_ASSERT(x->j < ends[0]);
836 fmpz_submul(Acoeff + Alen, Bcoeff + x->i, Ccoeff + x->j);
837 }
838 } while (heap_len > 1 && mpoly_monomial_equal(heap[1].exp, exp, N));
839 }
840
841 Alen += !fmpz_is_zero(Acoeff + Alen);
842
843 /* process nodes taken from the heap */
844 while (store > store_base)
845 {
846 j = *--store;
847 i = *--store;
848
849 /* should we go right? */
850 if ( (i + 1 < Blen)
851 && (j + 0 < ends[i + 1])
852 && (hind[i + 1] == 2*j + 1)
853 )
854 {
855 x = chain + i + 1;
856 x->i = i + 1;
857 x->j = j;
858 x->next = NULL;
859
860 hind[x->i] = 2*(x->j + 1) + 0;
861
862 mpoly_monomial_add_mp(exp_list[exp_next], Bexp + N*x->i,
863 Cexp + N*x->j, N);
864
865 exp_next += _mpoly_heap_insert(heap, exp_list[exp_next], x,
866 &next_loc, &heap_len, N, S->cmpmask);
867 }
868
869 /* should we go up? */
870 if ( (j + 1 < ends[i + 0])
871 && ((hind[i] & 1) == 1)
872 && ( (i == 0)
873 || (hind[i - 1] >= 2*(j + 2) + 1)
874 )
875 )
876 {
877 x = chain + i;
878 x->i = i;
879 x->j = j + 1;
880 x->next = NULL;
881
882 hind[x->i] = 2*(x->j + 1) + 0;
883
884 mpoly_monomial_add_mp(exp_list[exp_next], Bexp + N*x->i,
885 Cexp + N*x->j, N);
886
887 exp_next += _mpoly_heap_insert(heap, exp_list[exp_next], x,
888 &next_loc, &heap_len, N, S->cmpmask);
889 }
890 }
891 }
892
893 FLINT_ASSERT(Di <= Dlen);
894 _fmpz_mpoly_fit_length(&Acoeff, &Aexp, &Aalloc, Alen + Dlen - Di, N);
895 if (saveD)
896 _fmpz_vec_set(Acoeff + Alen, Dcoeff + Di, Dlen - Di);
897 else
898 _fmpz_vec_swap(Acoeff + Alen, (fmpz *)(Dcoeff + Di), Dlen - Di);
899 mpoly_copy_monomials(Aexp + N*Alen, Dexp + N*Di, Dlen - Di, N);
900 Alen += Dlen - Di;
901
902 *A_coeff = Acoeff;
903 *A_exp = Aexp;
904 *A_alloc = Aalloc;
905
906 return Alen;
907 }
908
909 /*
910 Q = stripe of A/B (assume A != 0)
911 return Qlen = 0 if exact division is impossible
912 */
_fmpz_mpoly_divides_stripe1(fmpz ** Q_coeff,ulong ** Q_exp,slong * Q_alloc,const fmpz * Acoeff,const ulong * Aexp,slong Alen,const fmpz * Bcoeff,const ulong * Bexp,slong Blen,const fmpz_mpoly_stripe_t S)913 slong _fmpz_mpoly_divides_stripe1(
914 fmpz ** Q_coeff, ulong ** Q_exp, slong * Q_alloc,
915 const fmpz * Acoeff, const ulong * Aexp, slong Alen,
916 const fmpz * Bcoeff, const ulong * Bexp, slong Blen,
917 const fmpz_mpoly_stripe_t S)
918 {
919 flint_bitcnt_t bits = S->bits;
920 ulong emin = S->emin[0];
921 ulong cmpmask = S->cmpmask[0];
922 ulong texp;
923 int lt_divides;
924 slong i, j, s;
925 slong next_loc, heap_len;
926 mpoly_heap1_s * heap;
927 mpoly_heap_t * chain;
928 slong * store, * store_base;
929 mpoly_heap_t * x;
930 slong Qlen;
931 slong Qalloc = * Q_alloc;
932 fmpz * Qcoeff = * Q_coeff;
933 ulong * Qexp = * Q_exp;
934 ulong exp;
935 ulong mask;
936 slong * hind;
937 fmpz_t acc_lg, r;
938 ulong acc_sm[3];
939 slong Acoeffbits;
940 ulong lc_norm = 0, lc_abs = 0, lc_sign = 0, lc_n = 0, lc_i = 0;
941 int small;
942
943 FLINT_ASSERT(Alen > 0);
944 FLINT_ASSERT(Blen > 0);
945 FLINT_ASSERT(S->N == 1);
946
947 Acoeffbits = _fmpz_vec_max_bits(Acoeff, Alen);
948 FLINT_ASSERT(S->coeff_bits == FLINT_ABS(_fmpz_vec_max_bits(Bcoeff, Blen)));
949
950 /* whether intermediate computations A - Q*B will fit in three words */
951 /* allow one bit for sign, one bit for subtraction NOT QUITE */
952 small = S->coeff_bits <= FLINT_BITS - 2
953 && FLINT_ABS(Acoeffbits) <= (S->coeff_bits
954 + FLINT_BIT_COUNT(Blen) + FLINT_BITS - 2);
955
956 next_loc = Blen + 4; /* something bigger than heap can ever be */
957
958 i = 0;
959 hind = (slong *)(S->big_mem + i);
960 i += Blen*sizeof(slong);
961 store = store_base = (slong *) (S->big_mem + i);
962 i += 2*Blen*sizeof(slong);
963 heap = (mpoly_heap1_s *)(S->big_mem + i);
964 i += (Blen + 1)*sizeof(mpoly_heap1_s);
965 chain = (mpoly_heap_t *)(S->big_mem + i);
966 i += Blen*sizeof(mpoly_heap_t);
967 FLINT_ASSERT(i <= S->big_mem_alloc);
968
969 fmpz_init(acc_lg);
970 fmpz_init(r);
971
972 for (i = 0; i < Blen; i++)
973 hind[i] = 1;
974
975 /* mask with high bit set in each word of each field of exponent vector */
976 mask = 0;
977 for (i = 0; i < FLINT_BITS/bits; i++)
978 mask = (mask << bits) + (UWORD(1) << (bits - 1));
979
980 Qlen = WORD(0);
981
982 /* s is the number of terms * (latest quotient) we should put into heap */
983 s = Blen;
984
985 /* insert (-1, 0, exp2[0]) into heap */
986 heap_len = 2;
987 x = chain + 0;
988 x->i = -WORD(1);
989 x->j = 0;
990 x->next = NULL;
991 heap[1].next = x;
992 HEAP_ASSIGN(heap[1], Aexp[0], x);
993
994 FLINT_ASSERT(mpoly_monomial_cmp1(Aexp[0], emin, cmpmask) >= 0);
995
996 if (small)
997 {
998 FLINT_ASSERT(!COEFF_IS_MPZ(Bcoeff[0]));
999 lc_abs = FLINT_ABS(Bcoeff[0]);
1000 lc_sign = FLINT_SIGN_EXT(Bcoeff[0]);
1001 count_leading_zeros(lc_norm, lc_abs);
1002 lc_n = lc_abs << lc_norm;
1003 invert_limb(lc_i, lc_n);
1004 }
1005
1006 while (heap_len > 1)
1007 {
1008 exp = heap[1].exp;
1009
1010 if (mpoly_monomial_overflows1(exp, mask))
1011 goto not_exact_division;
1012
1013 FLINT_ASSERT(mpoly_monomial_cmp1(exp, emin, cmpmask) >= 0);
1014
1015 _fmpz_mpoly_fit_length(&Qcoeff, &Qexp, &Qalloc, Qlen + 1, 1);
1016
1017 lt_divides = mpoly_monomial_divides1(Qexp + Qlen, exp, Bexp[0], mask);
1018
1019 if (small)
1020 {
1021 acc_sm[0] = acc_sm[1] = acc_sm[2] = 0;
1022 do
1023 {
1024 x = _mpoly_heap_pop1(heap, &heap_len, cmpmask);
1025 do
1026 {
1027 *store++ = x->i;
1028 *store++ = x->j;
1029 if (x->i != -WORD(1))
1030 hind[x->i] |= WORD(1);
1031
1032 if (x->i == -WORD(1))
1033 {
1034 _fmpz_mpoly_add_uiuiui_fmpz(acc_sm, Acoeff + x->j);
1035 }
1036 else
1037 {
1038 FLINT_ASSERT(!COEFF_IS_MPZ(Bcoeff[x->i]));
1039 FLINT_ASSERT(!COEFF_IS_MPZ(Qcoeff[x->j]));
1040 _fmpz_mpoly_submul_uiuiui_fmpz(acc_sm,
1041 Bcoeff[x->i], Qcoeff[x->j]);
1042 }
1043 } while ((x = x->next) != NULL);
1044 } while (heap_len > 1 && heap[1].exp == exp);
1045 }
1046 else
1047 {
1048 fmpz_zero(acc_lg);
1049 do
1050 {
1051 x = _mpoly_heap_pop1(heap, &heap_len, cmpmask);
1052 do
1053 {
1054 *store++ = x->i;
1055 *store++ = x->j;
1056 if (x->i != -WORD(1))
1057 hind[x->i] |= WORD(1);
1058
1059 if (x->i == -WORD(1))
1060 {
1061 fmpz_add(acc_lg, acc_lg, Acoeff + x->j);
1062 }
1063 else
1064 {
1065 fmpz_submul(acc_lg, Bcoeff + x->i, Qcoeff + x->j);
1066 }
1067 } while ((x = x->next) != NULL);
1068 } while (heap_len > 1 && heap[1].exp == exp);
1069 }
1070
1071 /* process nodes taken from the heap */
1072 while (store > store_base)
1073 {
1074 j = *--store;
1075 i = *--store;
1076
1077 if (i == -WORD(1))
1078 {
1079 /* take next dividend term */
1080 if (j + 1 < Alen)
1081 {
1082 x = chain + 0;
1083 x->i = i;
1084 x->j = j + 1;
1085 x->next = NULL;
1086
1087 FLINT_ASSERT(mpoly_monomial_cmp1(Aexp[x->j], emin, cmpmask)
1088 >= 0);
1089
1090 _mpoly_heap_insert1(heap, Aexp[x->j], x,
1091 &next_loc, &heap_len, cmpmask);
1092 }
1093 }
1094 else
1095 {
1096 /* should we go up */
1097 if ( (i + 1 < Blen)
1098 && (hind[i + 1] == 2*j + 1)
1099 )
1100 {
1101 x = chain + i + 1;
1102 x->i = i + 1;
1103 x->j = j;
1104 x->next = NULL;
1105 hind[x->i] = 2*(x->j + 1) + 0;
1106
1107 texp = Bexp[x->i] + Qexp[x->j];
1108 if (mpoly_monomial_cmp1(texp, emin, cmpmask) >= 0)
1109 {
1110 _mpoly_heap_insert1(heap, texp, x,
1111 &next_loc, &heap_len, cmpmask);
1112 }
1113 else
1114 {
1115 hind[x->i] |= 1;
1116 }
1117 }
1118 /* should we go up? */
1119 if (j + 1 == Qlen)
1120 {
1121 s++;
1122 } else if ( ((hind[i] & 1) == 1)
1123 && ((i == 1) || (hind[i - 1] >= 2*(j + 2) + 1))
1124 )
1125 {
1126 x = chain + i;
1127 x->i = i;
1128 x->j = j + 1;
1129 x->next = NULL;
1130 hind[x->i] = 2*(x->j + 1) + 0;
1131
1132 texp = Bexp[x->i] + Qexp[x->j];
1133 if (mpoly_monomial_cmp1(texp, emin, cmpmask) >= 0)
1134 {
1135 _mpoly_heap_insert1(heap, texp, x,
1136 &next_loc, &heap_len, cmpmask);
1137 }
1138 else
1139 {
1140 hind[x->i] |= 1;
1141 }
1142 }
1143 }
1144 }
1145
1146 if (small)
1147 {
1148 ulong d0, d1, ds = acc_sm[2];
1149
1150 /* d1:d0 = abs(acc_sm[1:0]) assuming ds is sign extension of acc_sm[1] */
1151 sub_ddmmss(d1, d0, acc_sm[1]^ds, acc_sm[0]^ds, ds, ds);
1152
1153 if ((acc_sm[0] | acc_sm[1] | acc_sm[2]) == 0)
1154 {
1155 continue;
1156 }
1157
1158 if (ds == FLINT_SIGN_EXT(acc_sm[1]) && d1 < lc_abs)
1159 {
1160 ulong qq, rr, nhi, nlo;
1161 FLINT_ASSERT(0 < lc_norm && lc_norm < FLINT_BITS);
1162 nhi = (d1 << lc_norm) | (d0 >> (FLINT_BITS - lc_norm));
1163 nlo = d0 << lc_norm;
1164 udiv_qrnnd_preinv(qq, rr, nhi, nlo, lc_n, lc_i);
1165 if (rr != UWORD(0))
1166 goto not_exact_division;
1167
1168 if (qq <= COEFF_MAX)
1169 {
1170 _fmpz_demote(Qcoeff + Qlen);
1171 Qcoeff[Qlen] = qq;
1172 if (ds != lc_sign)
1173 Qcoeff[Qlen] = -Qcoeff[Qlen];
1174 }
1175 else
1176 {
1177 small = 0;
1178 fmpz_set_ui(Qcoeff + Qlen, qq);
1179 if (ds != lc_sign)
1180 fmpz_neg(Qcoeff + Qlen, Qcoeff + Qlen);
1181 }
1182 }
1183 else
1184 {
1185 small = 0;
1186 fmpz_set_signed_uiuiui(acc_lg, acc_sm[2], acc_sm[1], acc_sm[0]);
1187 fmpz_fdiv_qr(Qcoeff + Qlen, r, acc_lg, Bcoeff + 0);
1188 if (!fmpz_is_zero(r))
1189 goto not_exact_division;
1190 }
1191 }
1192 else
1193 {
1194 if (fmpz_is_zero(acc_lg))
1195 {
1196 continue;
1197 }
1198
1199 fmpz_fdiv_qr(Qcoeff + Qlen, r, acc_lg, Bcoeff + 0);
1200
1201 if (!fmpz_is_zero(r))
1202 {
1203 goto not_exact_division;
1204 }
1205 }
1206
1207 if (!lt_divides)
1208 {
1209 goto not_exact_division;
1210 }
1211
1212 if (s > 1)
1213 {
1214 i = 1;
1215 x = chain + i;
1216 x->i = i;
1217 x->j = Qlen;
1218 x->next = NULL;
1219 hind[x->i] = 2*(x->j + 1) + 0;
1220
1221 texp = Bexp[x->i] + Qexp[x->j];
1222 if (mpoly_monomial_cmp1(texp, emin, cmpmask) >= 0)
1223 {
1224 _mpoly_heap_insert1(heap, texp, x,
1225 &next_loc, &heap_len, cmpmask);
1226 }
1227 else
1228 {
1229 hind[x->i] |= 1;
1230 }
1231 }
1232 s = 1;
1233 Qlen++;
1234 }
1235
1236
1237 cleanup:
1238
1239 fmpz_clear(acc_lg);
1240 fmpz_clear(r);
1241
1242 *Q_alloc = Qalloc;
1243 *Q_coeff = Qcoeff;
1244 *Q_exp = Qexp;
1245
1246 return Qlen;
1247
1248 not_exact_division:
1249 Qlen = 0;
1250 goto cleanup;
1251 }
1252
_fmpz_mpoly_divides_stripe(fmpz ** Q_coeff,ulong ** Q_exp,slong * Q_alloc,const fmpz * Acoeff,const ulong * Aexp,slong Alen,const fmpz * Bcoeff,const ulong * Bexp,slong Blen,const fmpz_mpoly_stripe_t S)1253 slong _fmpz_mpoly_divides_stripe(
1254 fmpz ** Q_coeff, ulong ** Q_exp, slong * Q_alloc,
1255 const fmpz * Acoeff, const ulong * Aexp, slong Alen,
1256 const fmpz * Bcoeff, const ulong * Bexp, slong Blen,
1257 const fmpz_mpoly_stripe_t S)
1258 {
1259 flint_bitcnt_t bits = S->bits;
1260 slong N = S->N;
1261 int lt_divides;
1262 slong i, j, s;
1263 slong next_loc, heap_len;
1264 mpoly_heap_s * heap;
1265 mpoly_heap_t * chain;
1266 slong * store, * store_base;
1267 mpoly_heap_t * x;
1268 slong Qlen;
1269 slong Qalloc = * Q_alloc;
1270 fmpz * Qcoeff = * Q_coeff;
1271 ulong * Qexp = * Q_exp;
1272 ulong * exp, * exps;
1273 ulong ** exp_list;
1274 slong exp_next;
1275 ulong mask;
1276 slong * hind;
1277 fmpz_t acc_lg, r;
1278 ulong acc_sm[3];
1279 slong Acoeffbits;
1280 ulong lc_norm = 0, lc_abs = 0, lc_sign = 0, lc_n = 0, lc_i = 0;
1281 int small;
1282
1283 FLINT_ASSERT(Alen > 0);
1284 FLINT_ASSERT(Blen > 0);
1285
1286 Acoeffbits = _fmpz_vec_max_bits(Acoeff, Alen);
1287 FLINT_ASSERT(S->coeff_bits == FLINT_ABS(_fmpz_vec_max_bits(Bcoeff, Blen)));
1288
1289 /* whether intermediate computations A - Q*B will fit in three words */
1290 /* allow one bit for sign, one bit for subtraction NOT QUITE */
1291 small = S->coeff_bits <= FLINT_BITS - 2
1292 && FLINT_ABS(Acoeffbits) <= (S->coeff_bits
1293 + FLINT_BIT_COUNT(Blen) + FLINT_BITS - 2);
1294
1295 FLINT_ASSERT(Alen > 0);
1296 FLINT_ASSERT(Blen > 0);
1297
1298 next_loc = Blen + 4; /* something bigger than heap can ever be */
1299
1300 i = 0;
1301 hind = (slong *) (S->big_mem + i);
1302 i += Blen*sizeof(slong);
1303 store = store_base = (slong *) (S->big_mem + i);
1304 i += 2*Blen*sizeof(slong);
1305 heap = (mpoly_heap_s *)(S->big_mem + i);
1306 i += (Blen + 1)*sizeof(mpoly_heap_s);
1307 chain = (mpoly_heap_t *)(S->big_mem + i);
1308 i += Blen*sizeof(mpoly_heap_t);
1309 exps = (ulong *)(S->big_mem + i);
1310 i += Blen*N*sizeof(ulong);
1311 exp_list = (ulong **)(S->big_mem + i);
1312 i += Blen*sizeof(ulong *);
1313 FLINT_ASSERT(i <= S->big_mem_alloc);
1314
1315 fmpz_init(acc_lg);
1316 fmpz_init(r);
1317
1318 exp_next = 0;
1319 for (i = 0; i < Blen; i++)
1320 exp_list[i] = exps + i*N;
1321
1322 for (i = 0; i < Blen; i++)
1323 hind[i] = 1;
1324
1325 /* mask with high bit set in each word of each field of exponent vector */
1326 mask = 0;
1327 for (i = 0; i < FLINT_BITS/bits; i++)
1328 mask = (mask << bits) + (UWORD(1) << (bits - 1));
1329
1330 Qlen = WORD(0);
1331
1332 /* s is the number of terms * (latest quotient) we should put into heap */
1333 s = Blen;
1334
1335 /* insert (-1, 0, exp2[0]) into heap */
1336 heap_len = 2;
1337 x = chain + 0;
1338 x->i = -WORD(1);
1339 x->j = 0;
1340 x->next = NULL;
1341 heap[1].next = x;
1342 heap[1].exp = exp_list[exp_next++];
1343
1344 FLINT_ASSERT(mpoly_monomial_cmp(Aexp + N*0, S->emin, N, S->cmpmask) >= 0);
1345
1346 mpoly_monomial_set(heap[1].exp, Aexp + N*0, N);
1347
1348 if (small)
1349 {
1350 FLINT_ASSERT(!COEFF_IS_MPZ(Bcoeff[0]));
1351 lc_abs = FLINT_ABS(Bcoeff[0]);
1352 lc_sign = FLINT_SIGN_EXT(Bcoeff[0]);
1353 count_leading_zeros(lc_norm, lc_abs);
1354 lc_n = lc_abs << lc_norm;
1355 invert_limb(lc_i, lc_n);
1356 }
1357
1358 while (heap_len > 1)
1359 {
1360 exp = heap[1].exp;
1361
1362 if (bits <= FLINT_BITS)
1363 {
1364 if (mpoly_monomial_overflows(exp, N, mask))
1365 {
1366 goto not_exact_division;
1367 }
1368 }
1369 else
1370 {
1371 if (mpoly_monomial_overflows_mp(exp, N, bits))
1372 {
1373 goto not_exact_division;
1374 }
1375 }
1376
1377 FLINT_ASSERT(mpoly_monomial_cmp(exp, S->emin, N, S->cmpmask) >= 0);
1378
1379 _fmpz_mpoly_fit_length(&Qcoeff, &Qexp, &Qalloc, Qlen + 1, N);
1380
1381 if (bits <= FLINT_BITS)
1382 lt_divides = mpoly_monomial_divides(Qexp + N*Qlen, exp,
1383 Bexp + N*0, N, mask);
1384 else
1385 lt_divides = mpoly_monomial_divides_mp(Qexp + N*Qlen, exp,
1386 Bexp + N*0, N, bits);
1387
1388 if (small)
1389 {
1390 acc_sm[0] = acc_sm[1] = acc_sm[2] = 0;
1391 do
1392 {
1393 exp_list[--exp_next] = heap[1].exp;
1394 x = _mpoly_heap_pop(heap, &heap_len, N, S->cmpmask);
1395 do
1396 {
1397 *store++ = x->i;
1398 *store++ = x->j;
1399 if (x->i != -WORD(1))
1400 hind[x->i] |= WORD(1);
1401
1402 if (x->i == -WORD(1))
1403 {
1404 _fmpz_mpoly_add_uiuiui_fmpz(acc_sm, Acoeff + x->j);
1405 }
1406 else
1407 {
1408 FLINT_ASSERT(!COEFF_IS_MPZ(Bcoeff[x->i]));
1409 FLINT_ASSERT(!COEFF_IS_MPZ(Qcoeff[x->j]));
1410 _fmpz_mpoly_submul_uiuiui_fmpz(acc_sm,
1411 Bcoeff[x->i], Qcoeff[x->j]);
1412 }
1413 } while ((x = x->next) != NULL);
1414 } while (heap_len > 1 && mpoly_monomial_equal(heap[1].exp, exp, N));
1415 }
1416 else
1417 {
1418 fmpz_zero(acc_lg);
1419 do
1420 {
1421 exp_list[--exp_next] = heap[1].exp;
1422 x = _mpoly_heap_pop(heap, &heap_len, N, S->cmpmask);
1423 do
1424 {
1425 *store++ = x->i;
1426 *store++ = x->j;
1427 if (x->i != -WORD(1))
1428 hind[x->i] |= WORD(1);
1429
1430 if (x->i == -WORD(1))
1431 {
1432 fmpz_add(acc_lg, acc_lg, Acoeff + x->j);
1433 }
1434 else
1435 {
1436 fmpz_submul(acc_lg, Bcoeff + x->i, Qcoeff + x->j);
1437 }
1438 } while ((x = x->next) != NULL);
1439 } while (heap_len > 1 && mpoly_monomial_equal(heap[1].exp, exp, N));
1440 }
1441
1442 /* process nodes taken from the heap */
1443 while (store > store_base)
1444 {
1445 j = *--store;
1446 i = *--store;
1447
1448 if (i == -WORD(1))
1449 {
1450 /* take next dividend term */
1451 if (j + 1 < Alen)
1452 {
1453 x = chain + 0;
1454 x->i = i;
1455 x->j = j + 1;
1456 x->next = NULL;
1457 mpoly_monomial_set(exp_list[exp_next], Aexp + x->j*N, N);
1458
1459 FLINT_ASSERT(mpoly_monomial_cmp(exp_list[exp_next],
1460 S->emin, N, S->cmpmask) >= 0);
1461
1462 exp_next += _mpoly_heap_insert(heap, exp_list[exp_next], x,
1463 &next_loc, &heap_len, N, S->cmpmask);
1464 }
1465 }
1466 else
1467 {
1468 /* should we go up */
1469 if ( (i + 1 < Blen)
1470 && (hind[i + 1] == 2*j + 1)
1471 )
1472 {
1473 x = chain + i + 1;
1474 x->i = i + 1;
1475 x->j = j;
1476 x->next = NULL;
1477 hind[x->i] = 2*(x->j + 1) + 0;
1478
1479 mpoly_monomial_add_mp(exp_list[exp_next], Bexp + N*x->i,
1480 Qexp + N*x->j, N);
1481
1482 if (mpoly_monomial_cmp(exp_list[exp_next], S->emin, N,
1483 S->cmpmask) >= 0)
1484 {
1485 exp_next += _mpoly_heap_insert(heap, exp_list[exp_next],
1486 x, &next_loc, &heap_len, N, S->cmpmask);
1487 }
1488 else
1489 {
1490 hind[x->i] |= 1;
1491 }
1492 }
1493 /* should we go up? */
1494 if (j + 1 == Qlen)
1495 {
1496 s++;
1497 } else if ( ((hind[i] & 1) == 1)
1498 && ((i == 1) || (hind[i - 1] >= 2*(j + 2) + 1))
1499 )
1500 {
1501 x = chain + i;
1502 x->i = i;
1503 x->j = j + 1;
1504 x->next = NULL;
1505 hind[x->i] = 2*(x->j + 1) + 0;
1506
1507 mpoly_monomial_add_mp(exp_list[exp_next], Bexp + N*x->i,
1508 Qexp + N*x->j, N);
1509
1510 if (mpoly_monomial_cmp(exp_list[exp_next], S->emin, N,
1511 S->cmpmask) >= 0)
1512 {
1513 exp_next += _mpoly_heap_insert(heap, exp_list[exp_next],
1514 x, &next_loc, &heap_len, N, S->cmpmask);
1515 }
1516 else
1517 {
1518 hind[x->i] |= 1;
1519 }
1520 }
1521 }
1522 }
1523
1524 if (small)
1525 {
1526 ulong d0, d1, ds = acc_sm[2];
1527
1528 /* d1:d0 = abs(acc_sm[1:0]) assuming ds is sign extension of acc_sm[1] */
1529 sub_ddmmss(d1, d0, acc_sm[1]^ds, acc_sm[0]^ds, ds, ds);
1530
1531 if ((acc_sm[0] | acc_sm[1] | acc_sm[2]) == 0)
1532 {
1533 continue;
1534 }
1535
1536 if (ds == FLINT_SIGN_EXT(acc_sm[1]) && d1 < lc_abs)
1537 {
1538 ulong qq, rr, nhi, nlo;
1539 FLINT_ASSERT(0 < lc_norm && lc_norm < FLINT_BITS);
1540 nhi = (d1 << lc_norm) | (d0 >> (FLINT_BITS - lc_norm));
1541 nlo = d0 << lc_norm;
1542 udiv_qrnnd_preinv(qq, rr, nhi, nlo, lc_n, lc_i);
1543 if (rr != UWORD(0))
1544 goto not_exact_division;
1545
1546 if (qq <= COEFF_MAX)
1547 {
1548 _fmpz_demote(Qcoeff + Qlen);
1549 Qcoeff[Qlen] = qq;
1550 if (ds != lc_sign)
1551 Qcoeff[Qlen] = -Qcoeff[Qlen];
1552 }
1553 else
1554 {
1555 small = 0;
1556 fmpz_set_ui(Qcoeff + Qlen, qq);
1557 if (ds != lc_sign)
1558 fmpz_neg(Qcoeff + Qlen, Qcoeff + Qlen);
1559 }
1560 }
1561 else
1562 {
1563 small = 0;
1564 fmpz_set_signed_uiuiui(acc_lg, acc_sm[2], acc_sm[1], acc_sm[0]);
1565 fmpz_fdiv_qr(Qcoeff + Qlen, r, acc_lg, Bcoeff + 0);
1566 if (!fmpz_is_zero(r))
1567 goto not_exact_division;
1568 }
1569 }
1570 else
1571 {
1572 if (fmpz_is_zero(acc_lg))
1573 {
1574 continue;
1575 }
1576
1577 fmpz_fdiv_qr(Qcoeff + Qlen, r, acc_lg, Bcoeff + 0);
1578
1579 if (!fmpz_is_zero(r))
1580 {
1581 goto not_exact_division;
1582 }
1583 }
1584
1585 if (!lt_divides)
1586 {
1587 goto not_exact_division;
1588 }
1589
1590 if (s > 1)
1591 {
1592 i = 1;
1593 x = chain + i;
1594 x->i = i;
1595 x->j = Qlen;
1596 x->next = NULL;
1597 hind[x->i] = 2*(x->j + 1) + 0;
1598
1599 mpoly_monomial_add_mp(exp_list[exp_next], Bexp + N*x->i,
1600 Qexp + N*x->j, N);
1601
1602 if (mpoly_monomial_cmp(exp_list[exp_next], S->emin, N, S->cmpmask)
1603 >= 0)
1604 {
1605 exp_next += _mpoly_heap_insert(heap, exp_list[exp_next], x,
1606 &next_loc, &heap_len, N, S->cmpmask);
1607 }
1608 else
1609 {
1610 hind[x->i] |= 1;
1611 }
1612 }
1613 s = 1;
1614 Qlen++;
1615 }
1616
1617
1618 cleanup:
1619
1620 fmpz_clear(acc_lg);
1621 fmpz_clear(r);
1622
1623 *Q_alloc = Qalloc;
1624 *Q_coeff = Qcoeff;
1625 *Q_exp = Qexp;
1626
1627 return Qlen;
1628
1629 not_exact_division:
1630 Qlen = 0;
1631 goto cleanup;
1632 }
1633
1634
chunk_find_exp(ulong * exp,slong a,const divides_heap_base_t H)1635 static slong chunk_find_exp(ulong * exp, slong a, const divides_heap_base_t H)
1636 {
1637 slong N = H->N;
1638 slong b = H->polyA->length;
1639 const ulong * Aexp = H->polyA->exps;
1640
1641 try_again:
1642 FLINT_ASSERT(b >= a);
1643
1644 FLINT_ASSERT(a > 0);
1645 FLINT_ASSERT(mpoly_monomial_cmp(Aexp + N*(a - 1), exp, N, H->cmpmask) >= 0);
1646 FLINT_ASSERT(b >= H->polyA->length
1647 || mpoly_monomial_cmp(Aexp + N*b, exp, N, H->cmpmask) < 0);
1648
1649 if (b - a < 5)
1650 {
1651 slong i = a;
1652 while (i < b
1653 && mpoly_monomial_cmp(Aexp + N*i, exp, N, H->cmpmask) >= 0)
1654 {
1655 i++;
1656 }
1657 return i;
1658 }
1659 else
1660 {
1661 slong c = a + (b - a)/2;
1662 if (mpoly_monomial_cmp(Aexp + N*c, exp, N, H->cmpmask) < 0)
1663 {
1664 b = c;
1665 }
1666 else
1667 {
1668 a = c;
1669 }
1670 goto try_again;
1671 }
1672 }
1673
stripe_fit_length(fmpz_mpoly_stripe_struct * S,slong new_len)1674 static void stripe_fit_length(fmpz_mpoly_stripe_struct * S, slong new_len)
1675 {
1676 slong N = S->N;
1677 slong new_alloc;
1678 new_alloc = 0;
1679 if (N == 1)
1680 {
1681 new_alloc += new_len*sizeof(slong);
1682 new_alloc += new_len*sizeof(slong);
1683 new_alloc += 2*new_len*sizeof(slong);
1684 new_alloc += (new_len + 1)*sizeof(mpoly_heap1_s);
1685 new_alloc += new_len*sizeof(mpoly_heap_t);
1686 }
1687 else
1688 {
1689 new_alloc += new_len*sizeof(slong);
1690 new_alloc += new_len*sizeof(slong);
1691 new_alloc += 2*new_len*sizeof(slong);
1692 new_alloc += (new_len + 1)*sizeof(mpoly_heap_s);
1693 new_alloc += new_len*sizeof(mpoly_heap_t);
1694 new_alloc += new_len*N*sizeof(ulong);
1695 new_alloc += new_len*sizeof(ulong *);
1696 new_alloc += N*sizeof(ulong);
1697 }
1698
1699 if (S->big_mem_alloc >= new_alloc)
1700 {
1701 return;
1702 }
1703
1704 new_alloc = FLINT_MAX(new_alloc, S->big_mem_alloc + S->big_mem_alloc/4);
1705 S->big_mem_alloc = new_alloc;
1706
1707 if (S->big_mem != NULL)
1708 {
1709 S->big_mem = (char *) flint_realloc(S->big_mem, new_alloc);
1710 }
1711 else
1712 {
1713 S->big_mem = (char *) flint_malloc(new_alloc);
1714 }
1715
1716 }
1717
1718
chunk_mulsub(worker_arg_t W,divides_heap_chunk_t L,slong q_prev_length)1719 static void chunk_mulsub(worker_arg_t W, divides_heap_chunk_t L, slong q_prev_length)
1720 {
1721 divides_heap_base_struct * H = W->H;
1722 slong N = H->N;
1723 fmpz_mpoly_struct * C = L->polyC;
1724 const fmpz_mpoly_struct * B = H->polyB;
1725 const fmpz_mpoly_struct * A = H->polyA;
1726 fmpz_mpoly_ts_struct * Q = H->polyQ;
1727 fmpz_mpoly_struct * T1 = W->polyT1;
1728 fmpz_mpoly_stripe_struct * S = W->S;
1729
1730 S->startidx = &L->startidx;
1731 S->endidx = &L->endidx;
1732 S->emin = L->emin;
1733 S->emax = L->emax;
1734 S->upperclosed = L->upperclosed;
1735 FLINT_ASSERT(S->N == N);
1736 stripe_fit_length(S, q_prev_length - L->mq);
1737
1738 if (L->Cinited)
1739 {
1740 if (N == 1)
1741 {
1742 T1->length = _fmpz_mpoly_mulsub_stripe1(
1743 &T1->coeffs, &T1->exps, &T1->alloc,
1744 C->coeffs, C->exps, C->length, 1,
1745 Q->coeffs + L->mq, Q->exps + N*L->mq, q_prev_length - L->mq,
1746 B->coeffs, B->exps, B->length, S);
1747 }
1748 else
1749 {
1750 T1->length = _fmpz_mpoly_mulsub_stripe(
1751 &T1->coeffs, &T1->exps, &T1->alloc,
1752 C->coeffs, C->exps, C->length, 1,
1753 Q->coeffs + L->mq, Q->exps + N*L->mq, q_prev_length - L->mq,
1754 B->coeffs, B->exps, B->length, S);
1755 }
1756 fmpz_mpoly_swap(C, T1, H->ctx);
1757 }
1758 else
1759 {
1760 slong startidx, stopidx;
1761 if (L->upperclosed)
1762 {
1763 startidx = 0;
1764 stopidx = chunk_find_exp(L->emin, 1, H);
1765 }
1766 else
1767 {
1768 startidx = chunk_find_exp(L->emax, 1, H);
1769 stopidx = chunk_find_exp(L->emin, startidx, H);
1770 }
1771
1772 L->Cinited = 1;
1773 fmpz_mpoly_init2(C, 16 + stopidx - startidx, H->ctx); /*any is OK*/
1774 fmpz_mpoly_fit_bits(C, H->bits, H->ctx);
1775 C->bits = H->bits;
1776
1777 if (N == 1)
1778 {
1779 C->length = _fmpz_mpoly_mulsub_stripe1(
1780 &C->coeffs, &C->exps, &C->alloc,
1781 A->coeffs + startidx, A->exps + N*startidx, stopidx - startidx, 1,
1782 Q->coeffs + L->mq, Q->exps + N*L->mq, q_prev_length - L->mq,
1783 B->coeffs, B->exps, B->length, S);
1784 }
1785 else
1786 {
1787 C->length = _fmpz_mpoly_mulsub_stripe(
1788 &C->coeffs, &C->exps, &C->alloc,
1789 A->coeffs + startidx, A->exps + N*startidx, stopidx - startidx, 1,
1790 Q->coeffs + L->mq, Q->exps + N*L->mq, q_prev_length - L->mq,
1791 B->coeffs, B->exps, B->length, S);
1792 }
1793 }
1794
1795 L->mq = q_prev_length;
1796 }
1797
trychunk(worker_arg_t W,divides_heap_chunk_t L)1798 static void trychunk(worker_arg_t W, divides_heap_chunk_t L)
1799 {
1800 divides_heap_base_struct * H = W->H;
1801 slong i;
1802 slong N = H->N;
1803 fmpz_mpoly_struct * C = L->polyC;
1804 slong q_prev_length;
1805 ulong mask;
1806 const fmpz_mpoly_struct * B = H->polyB;
1807 const fmpz_mpoly_struct * A = H->polyA;
1808 fmpz_mpoly_ts_struct * Q = H->polyQ;
1809 fmpz_mpoly_struct * T2 = W->polyT2;
1810
1811 mask = 0;
1812 for (i = 0; i < FLINT_BITS/H->bits; i++)
1813 mask = (mask << H->bits) + (UWORD(1) << (H->bits - 1));
1814
1815 /* return if this section has already finished processing */
1816 if (L->mq < 0)
1817 {
1818 return;
1819 }
1820
1821 /* process more quotient terms if available */
1822 q_prev_length = Q->length;
1823 if (q_prev_length > L->mq)
1824 {
1825 if (L->producer == 0 && q_prev_length - L->mq < 20)
1826 return;
1827
1828 chunk_mulsub(W, L, q_prev_length);
1829 }
1830
1831 if (L->producer == 1)
1832 {
1833 divides_heap_chunk_struct * next;
1834 fmpz * Rcoeff;
1835 ulong * Rexp;
1836 slong Rlen;
1837
1838 /* process the remaining quotient terms */
1839 q_prev_length = Q->length;
1840 if (q_prev_length > L->mq)
1841 {
1842 chunk_mulsub(W, L, q_prev_length);
1843 }
1844
1845 /* find location of remaining terms */
1846 if (L->Cinited)
1847 {
1848 Rlen = C->length;
1849 Rexp = C->exps;
1850 Rcoeff = C->coeffs;
1851 }
1852 else
1853 {
1854 slong startidx, stopidx;
1855 if (L->upperclosed)
1856 {
1857 startidx = 0;
1858 stopidx = chunk_find_exp(L->emin, 1, H);
1859 }
1860 else
1861 {
1862 startidx = chunk_find_exp(L->emax, 1, H);
1863 stopidx = chunk_find_exp(L->emin, startidx, H);
1864 }
1865 Rlen = stopidx - startidx;
1866 Rcoeff = A->coeffs + startidx;
1867 Rexp = A->exps + N*startidx;
1868 }
1869
1870 /* if we have remaining terms, add to quotient */
1871 if (Rlen > 0)
1872 {
1873 fmpz_mpoly_stripe_struct * S = W->S;
1874 S->startidx = &L->startidx;
1875 S->endidx = &L->endidx;
1876 S->emin = L->emin;
1877 S->emax = L->emax;
1878 S->upperclosed = L->upperclosed;
1879 if (N == 1)
1880 {
1881 T2->length = _fmpz_mpoly_divides_stripe1(
1882 &T2->coeffs, &T2->exps, &T2->alloc,
1883 Rcoeff, Rexp, Rlen,
1884 B->coeffs, B->exps, B->length, S);
1885 }
1886 else
1887 {
1888 T2->length = _fmpz_mpoly_divides_stripe(
1889 &T2->coeffs, &T2->exps, &T2->alloc,
1890 Rcoeff, Rexp, Rlen,
1891 B->coeffs, B->exps, B->length, S);
1892 }
1893 if (T2->length == 0)
1894 {
1895 H->failed = 1;
1896 return;
1897 }
1898 else
1899 {
1900 fmpz_mpoly_ts_append(H->polyQ, T2->coeffs, T2->exps,
1901 T2->length, N);
1902 }
1903 }
1904
1905 next = L->next;
1906 H->length--;
1907 H->cur = next;
1908
1909 if (next != NULL)
1910 {
1911 next->producer = 1;
1912 }
1913
1914 L->producer = 0;
1915 L->mq = -1;
1916 }
1917
1918 return;
1919 }
1920
1921
worker_loop(void * varg)1922 static void worker_loop(void * varg)
1923 {
1924 worker_arg_struct * W = (worker_arg_struct *) varg;
1925 divides_heap_base_struct * H = W->H;
1926 fmpz_mpoly_stripe_struct * S = W->S;
1927 const fmpz_mpoly_struct * B = H->polyB;
1928 fmpz_mpoly_struct * T1 = W->polyT1;
1929 fmpz_mpoly_struct * T2 = W->polyT2;
1930 slong N = H->N;
1931 slong Blen = B->length;
1932
1933 /* initialize stripe working memory */
1934 S->N = N;
1935 S->bits = H->bits;
1936 S->coeff_bits = FLINT_ABS(H->polyBcoeff_bits);
1937 S->cmpmask = H->cmpmask;
1938 S->big_mem_alloc = 0;
1939 S->big_mem = NULL;
1940
1941 stripe_fit_length(S, Blen);
1942
1943 fmpz_mpoly_init2(T1, 16, H->ctx);
1944 fmpz_mpoly_fit_bits(T1, H->bits, H->ctx);
1945 T1->bits = H->bits;
1946 fmpz_mpoly_init2(T2, 16, H->ctx);
1947 fmpz_mpoly_fit_bits(T2, H->bits, H->ctx);
1948 T2->bits = H->bits;
1949
1950 while (!H->failed)
1951 {
1952 divides_heap_chunk_struct * L;
1953 L = H->cur;
1954
1955 if (L == NULL)
1956 {
1957 break;
1958 }
1959 while (L != NULL)
1960 {
1961 #if HAVE_PTHREAD
1962 pthread_mutex_lock(&H->mutex);
1963 #endif
1964 if (L->lock != -1)
1965 {
1966 L->lock = -1;
1967 #if HAVE_PTHREAD
1968 pthread_mutex_unlock(&H->mutex);
1969 #endif
1970 trychunk(W, L);
1971 #if HAVE_PTHREAD
1972 pthread_mutex_lock(&H->mutex);
1973 #endif
1974 L->lock = 0;
1975 #if HAVE_PTHREAD
1976 pthread_mutex_unlock(&H->mutex);
1977 #endif
1978 break;
1979 }
1980 else
1981 {
1982 #if HAVE_PTHREAD
1983 pthread_mutex_unlock(&H->mutex);
1984 #endif
1985 }
1986
1987 L = L->next;
1988 }
1989 }
1990
1991 fmpz_mpoly_clear(T1, H->ctx);
1992 fmpz_mpoly_clear(T2, H->ctx);
1993 flint_free(S->big_mem);
1994
1995 return;
1996 }
1997
1998
1999 /* return 1 if quotient is exact */
_fmpz_mpoly_divides_heap_threaded_pool(fmpz_mpoly_t Q,const fmpz_mpoly_t A,const fmpz_mpoly_t B,const fmpz_mpoly_ctx_t ctx,const thread_pool_handle * handles,slong num_handles)2000 int _fmpz_mpoly_divides_heap_threaded_pool(
2001 fmpz_mpoly_t Q,
2002 const fmpz_mpoly_t A,
2003 const fmpz_mpoly_t B,
2004 const fmpz_mpoly_ctx_t ctx,
2005 const thread_pool_handle * handles,
2006 slong num_handles)
2007 {
2008 ulong mask;
2009 int divides;
2010 fmpz_mpoly_ctx_t zctx;
2011 fmpz_mpoly_t S;
2012 slong i, k, N;
2013 flint_bitcnt_t exp_bits;
2014 ulong * cmpmask;
2015 ulong * Aexp, * Bexp;
2016 int freeAexp, freeBexp;
2017 worker_arg_struct * worker_args;
2018 fmpz_t qcoeff, r;
2019 ulong * texps, * qexps;
2020 divides_heap_base_t H;
2021 TMP_INIT;
2022
2023 #if !FLINT_KNOW_STRONG_ORDER
2024 return fmpz_mpoly_divides_monagan_pearce(Q, A, B, ctx);
2025 #endif
2026
2027 if (B->length < 2 || A->length < 2)
2028 {
2029 return fmpz_mpoly_divides_monagan_pearce(Q, A, B, ctx);
2030 }
2031
2032 TMP_START;
2033
2034 fmpz_init(qcoeff);
2035 fmpz_init(r);
2036
2037 exp_bits = MPOLY_MIN_BITS;
2038 exp_bits = FLINT_MAX(exp_bits, A->bits);
2039 exp_bits = FLINT_MAX(exp_bits, B->bits);
2040 exp_bits = mpoly_fix_bits(exp_bits, ctx->minfo);
2041
2042 N = mpoly_words_per_exp(exp_bits, ctx->minfo);
2043 cmpmask = (ulong*) TMP_ALLOC(N*sizeof(ulong));
2044 mpoly_get_cmpmask(cmpmask, N, exp_bits, ctx->minfo);
2045
2046 /* ensure input exponents packed to same size as output exponents */
2047 Aexp = A->exps;
2048 freeAexp = 0;
2049 if (exp_bits > A->bits)
2050 {
2051 freeAexp = 1;
2052 Aexp = (ulong *) flint_malloc(N*A->length*sizeof(ulong));
2053 mpoly_repack_monomials(Aexp, exp_bits, A->exps, A->bits,
2054 A->length, ctx->minfo);
2055 }
2056
2057 Bexp = B->exps;
2058 freeBexp = 0;
2059 if (exp_bits > B->bits)
2060 {
2061 freeBexp = 1;
2062 Bexp = (ulong *) flint_malloc(N*B->length*sizeof(ulong));
2063 mpoly_repack_monomials(Bexp, exp_bits, B->exps, B->bits,
2064 B->length, ctx->minfo);
2065 }
2066
2067 fmpz_mpoly_ctx_init(zctx, ctx->minfo->nvars, ctx->minfo->ord);
2068 fmpz_mpoly_init(S, zctx);
2069
2070 fmpz_mod(r, A->coeffs + 0, B->coeffs + 0);
2071 if (!fmpz_is_zero(r))
2072 {
2073 divides = 0;
2074 fmpz_mpoly_zero(Q, ctx);
2075 goto cleanup1;
2076 }
2077
2078 if (mpoly_divides_select_exps(S, zctx, num_handles,
2079 Aexp, A->length, Bexp, B->length, exp_bits))
2080 {
2081 divides = 0;
2082 fmpz_mpoly_zero(Q, ctx);
2083 goto cleanup1;
2084 }
2085
2086 /*
2087 At this point A and B both have at least two terms
2088 and the leading coefficients and monomials divide
2089 and the exponent selection did not give an easy exit
2090 */
2091
2092 divides_heap_base_init(H);
2093
2094 H->polyA->coeffs = A->coeffs;
2095 H->polyA->exps = Aexp;
2096 H->polyA->bits = exp_bits;
2097 H->polyA->length = A->length;
2098 H->polyA->alloc = A->alloc;
2099
2100 H->polyB->coeffs = B->coeffs;
2101 H->polyB->exps = Bexp;
2102 H->polyB->bits = exp_bits;
2103 H->polyB->length = B->length;
2104 H->polyB->alloc = B->alloc;
2105
2106 H->polyBcoeff_bits = _fmpz_vec_max_bits(B->coeffs, B->length);
2107
2108 H->ctx = ctx;
2109 H->bits = exp_bits;
2110 H->N = N;
2111 H->cmpmask = cmpmask;
2112 H->failed = 0;
2113
2114 for (i = 0; i + 1 < S->length; i++)
2115 {
2116 divides_heap_chunk_struct * L;
2117 L = (divides_heap_chunk_struct *) flint_malloc(
2118 sizeof(divides_heap_chunk_struct));
2119 L->ma = 0;
2120 L->mq = 0;
2121 L->emax = S->exps + N*i;
2122 L->emin = S->exps + N*(i + 1);
2123 L->upperclosed = 0;
2124 L->startidx = B->length;
2125 L->endidx = B->length;
2126 L->producer = 0;
2127 L->Cinited = 0;
2128 L->lock = -2;
2129 divides_heap_base_add_chunk(H, L);
2130 }
2131
2132 H->head->upperclosed = 1;
2133 H->head->producer = 1;
2134 H->cur = H->head;
2135
2136 /* generate at least the first quotient term */
2137
2138 texps = (ulong *) TMP_ALLOC(N*sizeof(ulong));
2139 qexps = (ulong *) TMP_ALLOC(N*sizeof(ulong));
2140
2141 mpoly_monomial_sub_mp(qexps + N*0, Aexp + N*0, Bexp + N*0, N);
2142 fmpz_divexact(qcoeff, A->coeffs + 0, B->coeffs + 0); /* already checked */
2143
2144 fmpz_mpoly_ts_init(H->polyQ, qcoeff, qexps, 1, H->bits, H->N);
2145
2146 mpoly_monomial_add_mp(texps, qexps + N*0, Bexp + N*1, N);
2147
2148 mask = 0;
2149 for (i = 0; i < FLINT_BITS/exp_bits; i++)
2150 mask = (mask << exp_bits) + (UWORD(1) << (exp_bits - 1));
2151
2152 k = 1;
2153 while (k < A->length && mpoly_monomial_gt(Aexp + N*k, texps, N, cmpmask))
2154 {
2155 int lt_divides;
2156 if (exp_bits <= FLINT_BITS)
2157 lt_divides = mpoly_monomial_divides(qexps, Aexp + N*k,
2158 Bexp + N*0, N, mask);
2159 else
2160 lt_divides = mpoly_monomial_divides_mp(qexps, Aexp + N*k,
2161 Bexp + N*0, N, exp_bits);
2162 if (!lt_divides)
2163 {
2164 H->failed = 1;
2165 break;
2166 }
2167
2168 fmpz_fdiv_qr(qcoeff, r, A->coeffs + k, B->coeffs + 0);
2169 if (!fmpz_is_zero(r))
2170 {
2171 H->failed = 1;
2172 break;
2173 }
2174
2175 fmpz_mpoly_ts_append(H->polyQ, qcoeff, qexps, 1, H->N);
2176 k++;
2177 }
2178
2179 /* start the workers */
2180
2181 #if HAVE_PTHREAD
2182 pthread_mutex_init(&H->mutex, NULL);
2183 #endif
2184
2185 worker_args = (worker_arg_struct *) flint_malloc((num_handles + 1)
2186 *sizeof(worker_arg_t));
2187
2188 for (i = 0; i < num_handles; i++)
2189 {
2190 (worker_args + i)->H = H;
2191 thread_pool_wake(global_thread_pool, handles[i], 0,
2192 worker_loop, worker_args + i);
2193 }
2194 (worker_args + num_handles)->H = H;
2195 worker_loop(worker_args + num_handles);
2196 for (i = 0; i < num_handles; i++)
2197 {
2198 thread_pool_wait(global_thread_pool, handles[i]);
2199 }
2200
2201 flint_free(worker_args);
2202
2203 #if HAVE_PTHREAD
2204 pthread_mutex_destroy(&H->mutex);
2205 #endif
2206
2207 divides = divides_heap_base_clear(Q, H);
2208
2209 cleanup1:
2210 fmpz_mpoly_clear(S, zctx);
2211 fmpz_mpoly_ctx_clear(zctx);
2212
2213 if (freeAexp)
2214 flint_free(Aexp);
2215
2216 if (freeBexp)
2217 flint_free(Bexp);
2218
2219 fmpz_clear(qcoeff);
2220 fmpz_clear(r);
2221
2222 TMP_END;
2223
2224 return divides;
2225 }
2226
2227
fmpz_mpoly_divides_heap_threaded(fmpz_mpoly_t Q,const fmpz_mpoly_t A,const fmpz_mpoly_t B,const fmpz_mpoly_ctx_t ctx)2228 int fmpz_mpoly_divides_heap_threaded(
2229 fmpz_mpoly_t Q,
2230 const fmpz_mpoly_t A,
2231 const fmpz_mpoly_t B,
2232 const fmpz_mpoly_ctx_t ctx)
2233 {
2234 thread_pool_handle * handles;
2235 slong num_handles;
2236 int divides;
2237 slong thread_limit = A->length/32;
2238
2239 if (B->length < 2 || A->length < 2)
2240 {
2241 if (B->length == 0)
2242 {
2243 flint_throw(FLINT_DIVZERO,
2244 "Divide by zero in fmpz_mpoly_divides_heap_threaded");
2245 }
2246
2247 if (A->length == 0)
2248 {
2249 fmpz_mpoly_zero(Q, ctx);
2250 return 1;
2251 }
2252 return fmpz_mpoly_divides_monagan_pearce(Q, A, B, ctx);
2253 }
2254
2255 num_handles = flint_request_threads(&handles, thread_limit);
2256
2257 divides = _fmpz_mpoly_divides_heap_threaded_pool(Q, A, B, ctx,
2258 handles, num_handles);
2259
2260 flint_give_back_threads(handles, num_handles);
2261
2262 return divides;
2263 }
2264