1 /*
2     Copyright (C) 2019 Daniel Schultz
3 
4     This file is part of FLINT.
5 
6     FLINT is free software: you can redistribute it and/or modify it under
7     the terms of the GNU Lesser General Public License (LGPL) as published
8     by the Free Software Foundation; either version 2.1 of the License, or
9     (at your option) any later version.  See <http://www.gnu.org/licenses/>.
10 */
11 
12 #include "fmpz_mpoly.h"
13 #include "nmod_mpoly.h"
14 #include "thread_pool.h"
15 #include "fmpq.h"
16 
17 typedef struct
18 {
19     volatile int gcd_is_one;
20     volatile mp_limb_t p;
21 #if HAVE_PTHREAD
22     pthread_mutex_t mutex;
23 #endif
24     fmpz_t gamma;
25     const fmpz_mpoly_ctx_struct * ctx;
26     fmpz_mpoly_struct * A, * B;
27     ulong num_threads;
28     slong var;
29     const mpoly_gcd_info_struct * I;
30 }
31 _splitbase_struct;
32 
33 typedef _splitbase_struct _splitbase_t[1];
34 
35 typedef struct
36 {
37     slong idx;
38     _splitbase_struct * base;
39     fmpz_mpoly_t G, Abar, Bbar;
40     fmpz_t modulus;
41     slong image_count;
42     slong required_images;
43     thread_pool_handle master_handle;
44     slong num_handles;
45     thread_pool_handle * worker_handles;
46 
47     nmod_mpoly_ctx_t pctx;
48     nmod_mpolyn_t Ap, Bp, Gp, Abarp, Bbarp;
49     fmpz_mpoly_t T, T1, T2;
50 }
51 _splitworker_arg_struct;
52 
53 
54 
55 /* worker for reducing polynomial over ZZ to polynomial over Fp */
_reduce_Bp_worker(void * varg)56 static void _reduce_Bp_worker(void * varg)
57 {
58     _splitworker_arg_struct * arg = (_splitworker_arg_struct *) varg;
59     fmpz_mpoly_interp_reduce_p_mpolyn(arg->Bp, arg->pctx, arg->base->B,
60                                                                arg->base->ctx);
61 }
62 
63 /* workers for crt'ing polynomials */
_join_Abar_worker(void * varg)64 static void _join_Abar_worker(void * varg)
65 {
66     _splitworker_arg_struct * arg = (_splitworker_arg_struct *) varg;
67     if (!fmpz_is_one(arg->modulus))
68         fmpz_mpoly_interp_crt_p_mpolyn(arg->Abar, arg->T1, arg->base->ctx,
69                                           arg->modulus, arg->Abarp, arg->pctx);
70     else
71         fmpz_mpoly_interp_lift_p_mpolyn(arg->Abar, arg->base->ctx,
72                                                         arg->Abarp, arg->pctx);
73 }
74 
_join_Bbar_worker(void * varg)75 static void _join_Bbar_worker(void * varg)
76 {
77     _splitworker_arg_struct * arg = (_splitworker_arg_struct *) varg;
78     if (!fmpz_is_one(arg->modulus))
79         fmpz_mpoly_interp_crt_p_mpolyn(arg->Bbar, arg->T2, arg->base->ctx,
80                                           arg->modulus, arg->Bbarp, arg->pctx);
81     else
82         fmpz_mpoly_interp_lift_p_mpolyn(arg->Bbar, arg->base->ctx,
83                                                         arg->Bbarp, arg->pctx);
84 }
85 
86 
_splitworker(void * varg)87 static void _splitworker(void * varg)
88 {
89     _splitworker_arg_struct * arg = (_splitworker_arg_struct *) varg;
90     _splitbase_struct * base = arg->base;
91     const fmpz_mpoly_ctx_struct * ctx = base->ctx;
92     flint_bitcnt_t bits = base->A->bits;
93     slong N = mpoly_words_per_exp_sp(bits, ctx->minfo);
94     slong offset, shift;
95     int success;
96     mp_limb_t p, gammared;
97     nmod_poly_stack_t Sp;
98 
99     mpoly_gen_offset_shift_sp(&offset, &shift,
100                                       ctx->minfo->nvars - 1, bits, ctx->minfo);
101 
102     fmpz_one(arg->modulus);
103     arg->image_count = 0;
104 
105     nmod_mpoly_ctx_init(arg->pctx, ctx->minfo->nvars, ORD_LEX, 2);
106     nmod_poly_stack_init(Sp, bits, arg->pctx);
107     nmod_mpolyn_init(arg->Ap, bits, arg->pctx);
108     nmod_mpolyn_init(arg->Bp, bits, arg->pctx);
109     nmod_mpolyn_init(arg->Gp, bits, arg->pctx);
110     nmod_mpolyn_init(arg->Abarp, bits, arg->pctx);
111     nmod_mpolyn_init(arg->Bbarp, bits, arg->pctx);
112     fmpz_mpoly_init3(arg->T, 0, bits, ctx);
113     fmpz_mpoly_init3(arg->T1, 0, bits, ctx);
114     fmpz_mpoly_init3(arg->T2, 0, bits, ctx);
115 
116     while (arg->image_count < arg->required_images)
117     {
118         /* get prime */
119 #if HAVE_PTHREAD
120         pthread_mutex_lock(&base->mutex);
121 #endif
122 	p = base->p;
123         if (p >= UWORD_MAX_PRIME)
124         {
125 #if HAVE_PTHREAD
126             pthread_mutex_unlock(&base->mutex);
127 #endif
128             break;
129         }
130         p = n_nextprime(base->p, 1);
131         base->p = p;
132 #if HAVE_PTHREAD
133         pthread_mutex_unlock(&base->mutex);
134 #endif
135 
136         /* make sure reduction does not kill both lc(A) and lc(B) */
137         gammared = fmpz_fdiv_ui(base->gamma, p);
138         if (gammared == 0)
139         {
140             continue;
141         }
142 
143         nmod_mpoly_ctx_set_modulus(arg->pctx, p);
144 
145         /* the unfortunate nmod poly's store their own context :( */
146         nmod_poly_stack_set_ctx(Sp, arg->pctx);
147         nmod_mpolyn_set_mod(arg->Ap, arg->pctx->ffinfo->mod);
148         nmod_mpolyn_set_mod(arg->Bp, arg->pctx->ffinfo->mod);
149         nmod_mpolyn_set_mod(arg->Gp, arg->pctx->ffinfo->mod);
150         nmod_mpolyn_set_mod(arg->Abarp, arg->pctx->ffinfo->mod);
151         nmod_mpolyn_set_mod(arg->Bbarp, arg->pctx->ffinfo->mod);
152 
153         /* reduce to Fp and calculate an image gcd */
154         if (arg->num_handles > 0)
155         {
156             thread_pool_wake(global_thread_pool, arg->worker_handles[0], 0,
157                                                        _reduce_Bp_worker, arg);
158 
159             fmpz_mpoly_interp_reduce_p_mpolyn(arg->Ap, arg->pctx, base->A, ctx);
160 
161             thread_pool_wait(global_thread_pool, arg->worker_handles[0]);
162 
163             success = nmod_mpolyn_gcd_brown_smprime_threaded_pool(
164                                   arg->Gp, arg->Abarp, arg->Bbarp,
165                    arg->Ap, arg->Bp, ctx->minfo->nvars - 1, arg->pctx, base->I,
166                                        arg->worker_handles, arg->num_handles);
167         }
168         else
169         {
170             /* reduction should kill neither A nor B */
171             fmpz_mpoly_interp_reduce_p_mpolyn(arg->Ap, arg->pctx, base->A, ctx);
172             fmpz_mpoly_interp_reduce_p_mpolyn(arg->Bp, arg->pctx, base->B, ctx);
173             FLINT_ASSERT(arg->Ap->length > 0);
174             FLINT_ASSERT(arg->Bp->length > 0);
175             success = nmod_mpolyn_gcd_brown_smprime(
176                             arg->Gp, arg->Abarp, arg->Bbarp, arg->Ap, arg->Bp,
177                                 ctx->minfo->nvars - 1, arg->pctx, base->I, Sp);
178         }
179 
180         if (!success)
181         {
182             continue;
183         }
184 
185         FLINT_ASSERT(arg->Gp->length > 0);
186         FLINT_ASSERT(arg->Abarp->length > 0);
187         FLINT_ASSERT(arg->Bbarp->length > 0);
188 
189         /* check up */
190         if (base->gcd_is_one)
191         {
192             break;
193         }
194 
195         if (nmod_mpolyn_is_nonzero_nmod(arg->Gp, arg->pctx))
196         {
197             base->gcd_is_one = 1;
198             break;
199         }
200 
201         if (!fmpz_is_one(arg->modulus))
202         {
203             int cmp = 0;
204             slong k;
205 
206             FLINT_ASSERT(arg->G->length > 0);
207 
208             k = nmod_poly_degree(arg->Gp->coeffs + 0);
209             cmp = mpoly_monomial_cmp_nomask_extra(arg->G->exps + N*0,
210                                    arg->Gp->exps + N*0, N, offset, k << shift);
211             if (cmp < 0)
212             {
213                 continue;
214             }
215             else if (cmp > 0)
216             {
217                 fmpz_one(arg->modulus);
218                 arg->image_count = 0;
219             }
220         }
221 
222         FLINT_ASSERT(1 == nmod_mpolyn_leadcoeff(arg->Gp, arg->pctx));
223         nmod_mpolyn_scalar_mul_nmod(arg->Gp, gammared, arg->pctx);
224 
225         /* crt image gcd */
226         if (arg->num_handles > 0)
227         {
228             thread_pool_wake(global_thread_pool, arg->worker_handles[0], 0,
229                                                        _join_Abar_worker, arg);
230             if (arg->num_handles > 1)
231             {
232                 thread_pool_wake(global_thread_pool, arg->worker_handles[1], 0,
233                                                        _join_Bbar_worker, arg);
234             }
235             else
236             {
237                 _join_Bbar_worker(arg);
238             }
239 
240             if (!fmpz_is_one(arg->modulus))
241                 fmpz_mpoly_interp_crt_p_mpolyn(arg->G, arg->T, ctx, arg->modulus,
242                                                            arg->Gp, arg->pctx);
243             else
244                 fmpz_mpoly_interp_lift_p_mpolyn(arg->G, ctx, arg->Gp, arg->pctx);
245 
246             thread_pool_wait(global_thread_pool, arg->worker_handles[0]);
247             if (arg->num_handles > 1)
248                 thread_pool_wait(global_thread_pool, arg->worker_handles[1]);
249         }
250         else
251         {
252             if (!fmpz_is_one(arg->modulus))
253             {
254                 fmpz_mpoly_interp_crt_p_mpolyn(arg->G, arg->T, ctx,
255                                              arg->modulus, arg->Gp, arg->pctx);
256                 fmpz_mpoly_interp_crt_p_mpolyn(arg->Abar, arg->T, ctx,
257                                           arg->modulus, arg->Abarp, arg->pctx);
258                 fmpz_mpoly_interp_crt_p_mpolyn(arg->Bbar, arg->T, ctx,
259                                           arg->modulus, arg->Bbarp, arg->pctx);
260             }
261             else
262             {
263                 fmpz_mpoly_interp_lift_p_mpolyn(arg->G, ctx,
264                                                            arg->Gp, arg->pctx);
265                 fmpz_mpoly_interp_lift_p_mpolyn(arg->Abar, ctx,
266                                                         arg->Abarp, arg->pctx);
267                 fmpz_mpoly_interp_lift_p_mpolyn(arg->Bbar, ctx,
268                                                         arg->Bbarp, arg->pctx);
269             }
270         }
271 
272         fmpz_mul_ui(arg->modulus, arg->modulus, p);
273         arg->image_count++;
274     }
275 
276     fmpz_mpoly_clear(arg->T, ctx);
277     fmpz_mpoly_clear(arg->T1, ctx);
278     fmpz_mpoly_clear(arg->T2, ctx);
279 
280     nmod_mpolyn_clear(arg->Ap, arg->pctx);
281     nmod_mpolyn_clear(arg->Bp, arg->pctx);
282     nmod_mpolyn_clear(arg->Gp, arg->pctx);
283     nmod_mpolyn_clear(arg->Abarp, arg->pctx);
284     nmod_mpolyn_clear(arg->Bbarp, arg->pctx);
285     nmod_poly_stack_clear(Sp);
286     nmod_mpoly_ctx_clear(arg->pctx);
287 }
288 
289 typedef struct
290 {
291     slong hint_start, hint_stop;
292     ulong * left_exp, * right_exp;
293     fmpz_mpoly_t poly;
294     fmpz_t maxcoeff, sumcoeff;
295     slong thread_idx;
296     slong final_idx;
297     int GAB; /* 0 -> G,  1 -> A,  2 -> B */
298 }
299 _joinworker_arg_struct;
300 
301 /*
302     A = crt(B[0], ...., B[count-1]) wrt to P
303     This function takes some preallocated temp space.
304 */
305 
_find_edge(slong * start,slong count,const ulong * exp_left,fmpz_mpoly_struct * const * B,slong N)306 static void _find_edge(
307     slong * start,
308     slong count,
309     const ulong * exp_left,
310     fmpz_mpoly_struct * const * B,
311     slong N)
312 {
313     slong k;
314 
315     for (k = 0; k < count; k++)
316     {
317         slong Blength = B[k]->length;
318         const ulong * Bexps = B[k]->exps;
319 
320         if (start[k] < Blength
321             && mpoly_monomial_gt_nomask(Bexps + N*start[k], exp_left, N))
322         {
323             /* go right */
324             do {
325                 start[k]++;
326             } while (start[k] < Blength
327                 && mpoly_monomial_gt_nomask(Bexps + N*start[k], exp_left, N));
328         }
329         else
330         {
331             /* go left */
332             while (start[k] > 0
333                 && !mpoly_monomial_gt_nomask(Bexps + N*(start[k] - 1), exp_left, N))
334             {
335                 start[k]--;
336             }
337         }
338     }
339 }
340 
_fmpz_mpoly_crt(const fmpz_multi_crt_t P,_joinworker_arg_struct * S,fmpz_mpoly_struct * const * B,slong count,fmpz * output,fmpz ** input,const fmpz_mpoly_ctx_t ctx)341 static slong _fmpz_mpoly_crt(
342     const fmpz_multi_crt_t P,
343     _joinworker_arg_struct * S,
344     fmpz_mpoly_struct * const * B,
345     slong count,
346     fmpz * output,
347     fmpz ** input,
348     const fmpz_mpoly_ctx_t ctx)
349 {
350     int cmp;
351     slong N = mpoly_words_per_exp_sp(S->poly->bits, ctx->minfo);
352     slong lastdegree;
353     slong Ai;
354     slong j, k;
355     slong * start, * stop;
356     fmpz_t zero, max, sum;
357     fmpz_mpoly_t A;
358     const ulong * exp_left = S->left_exp;
359     const ulong * exp_right = S->right_exp;
360     TMP_INIT;
361 
362     *A = *S->poly;
363 
364     TMP_START;
365 
366     start = (slong *) TMP_ALLOC(2*count*sizeof(slong));
367     stop = start + count;
368 
369     for (k = 0; k < count; k++)
370     {
371         start[k] = exp_left ? FLINT_MIN(S->hint_start, B[k]->length) : 0;
372         stop[k] = exp_right ? FLINT_MIN(S->hint_stop, B[k]->length) : B[k]->length;
373     }
374 
375     if (exp_left)
376         _find_edge(start, count, exp_left, B, N);
377 
378     if (exp_right)
379         _find_edge(stop, count, exp_right, B, N);
380 
381 #if WANT_ASSERT
382     for (k = 0; k < count; k++)
383     {
384         FLINT_ASSERT(0 <= start[k]);
385         FLINT_ASSERT(0 <= stop[k]);
386         FLINT_ASSERT(start[k] <= B[k]->length);
387         FLINT_ASSERT(stop[k] <= B[k]->length);
388         FLINT_ASSERT(start[k] <= stop[k]);
389 
390         /* check start */
391         if (!exp_left)
392         {
393             FLINT_ASSERT(start[k] == 0);
394         }
395         else
396         {
397             FLINT_ASSERT(start[k] == B[k]->length ||
398                     !mpoly_monomial_gt_nomask(B[k]->exps + N*start[k], exp_left, N));
399 
400             FLINT_ASSERT(start[k] == 0 ||
401                     mpoly_monomial_gt_nomask(B[k]->exps + N*(start[k] - 1), exp_left, N));
402         }
403 
404         /* check stop */
405         if (!exp_right)
406         {
407             FLINT_ASSERT(stop[k] == B[k]->length);
408         }
409         else
410         {
411             FLINT_ASSERT(stop[k] == B[k]->length ||
412                    !mpoly_monomial_gt_nomask(B[k]->exps + N*stop[k], exp_right, N));
413 
414             FLINT_ASSERT(stop[k] == 0 ||
415                     mpoly_monomial_gt_nomask(B[k]->exps + N*(stop[k] - 1), exp_right, N));
416         }
417     }
418 #endif
419 
420     fmpz_init(zero);
421     fmpz_init(max);
422     fmpz_init(sum);
423 
424     Ai = 0;
425     lastdegree = -WORD(1);
426     while (1)
427     {
428         fmpz_mpoly_fit_length(A, Ai + 1, ctx);
429 
430         k = 0;
431         do
432         {
433             input[k] = zero;
434             if (start[k] < stop[k])
435             {
436                 goto found_max;
437             }
438         } while (++k < count);
439 
440         break; /* all B[k] have been scanned completely */
441 
442     found_max:
443 
444         input[k] = B[k]->coeffs + start[k];
445         mpoly_monomial_set(A->exps + N*Ai, B[k]->exps + N*start[k], N);
446         start[k]++;
447 
448         for (k++; k < count; k++)
449         {
450             input[k] = zero;
451             if (start[k] >= stop[k])
452             {
453                 continue;
454             }
455 
456             cmp = mpoly_monomial_cmp_nomask(B[k]->exps + N*start[k], A->exps + N*Ai, N);
457             if (cmp == 0)
458             {
459                 input[k] = B[k]->coeffs + start[k];
460                 start[k]++;
461             }
462             else if (cmp > 0)
463             {
464                 /* undo previous max's */
465                 for (j = 0; j < k; j++)
466                 {
467                     start[j] -= (input[j] != zero);
468                     input[j] = zero;
469                 }
470                 goto found_max;
471             }
472         }
473 
474         _fmpz_multi_crt_run_p(output, P, (const fmpz * const *) input);
475         fmpz_swap(A->coeffs + Ai, output + 0);
476         cmp = fmpz_sgn(A->coeffs + Ai);
477         if (cmp != 0)
478         {
479             if (fmpz_cmpabs(max, A->coeffs + Ai) < 0)
480                 fmpz_abs(max, A->coeffs + Ai);
481             if (cmp > 0)
482                 fmpz_add(sum, sum, A->coeffs + Ai);
483             else
484                 fmpz_sub(sum, sum, A->coeffs + Ai);
485             Ai++;
486         }
487     }
488     A->length = Ai;
489 
490     fmpz_swap(S->maxcoeff, max);
491     fmpz_swap(S->sumcoeff, sum);
492 
493     fmpz_clear(zero);
494     fmpz_clear(max);
495     fmpz_clear(sum);
496 
497     TMP_END;
498 
499     *S->poly = *A;
500 
501     return lastdegree;
502 }
503 
504 
505 typedef struct
506 {
507     volatile int idx;
508 #if HAVE_PTHREAD
509     pthread_mutex_t mutex;
510 #endif
511     const fmpz_mpoly_ctx_struct * ctx;
512     fmpz_multi_crt_t CRT;
513     fmpz_mpoly_struct ** gptrs, ** abarptrs, ** bbarptrs;
514     fmpz_mpoly_struct * G, * Abar, * Bbar;
515     _joinworker_arg_struct * chunks;
516     slong chunks_length;
517     ulong num_images;
518 }
519 _joinbase_struct;
520 
521 typedef _joinbase_struct _joinbase_t[1];
522 
523 typedef struct
524 {
525     _joinbase_struct * base;
526     slong thread_idx;
527 }
528 _njoinworker_arg_struct;
529 
_joinworker(void * varg)530 static void _joinworker(void * varg)
531 {
532     _njoinworker_arg_struct * arg = (_njoinworker_arg_struct *) varg;
533     _joinbase_struct * base = arg->base;
534     fmpz ** input;
535     fmpz * output;
536     slong i, ls = _fmpz_multi_crt_local_size(base->CRT);
537     TMP_INIT;
538 
539     TMP_START;
540 
541     input = (fmpz **) TMP_ALLOC(base->num_images * sizeof(fmpz *));
542     output = (fmpz *) TMP_ALLOC(ls*sizeof(fmpz));
543     for (i = 0; i < ls; i++)
544         fmpz_init(output + i);
545 
546     while (1)
547     {
548         /* get exponent of either G, Abar, or Bbar to start working on */
549 #if HAVE_PTHREAD
550         pthread_mutex_lock(&base->mutex);
551 #endif
552 	i = base->idx;
553         base->idx = i + 1;
554 #if HAVE_PTHREAD
555         pthread_mutex_unlock(&base->mutex);
556 #endif
557 
558         if (i >= base->chunks_length)
559         {
560             goto cleanup;
561         }
562 
563         base->chunks[i].thread_idx = arg->thread_idx;
564 
565         if (base->chunks[i].GAB == 0)
566         {
567             _fmpz_mpoly_crt(base->CRT, base->chunks + i, base->gptrs,
568                                   base->num_images, output, input, base->ctx);
569         }
570         else if (base->chunks[i].GAB == 1)
571         {
572             _fmpz_mpoly_crt(base->CRT, base->chunks + i, base->abarptrs,
573                                   base->num_images, output, input, base->ctx);
574         }
575         else
576         {
577             FLINT_ASSERT(base->chunks[i].GAB == 2);
578 
579             _fmpz_mpoly_crt(base->CRT, base->chunks + i, base->bbarptrs,
580                                   base->num_images, output, input, base->ctx);
581         }
582     }
583 
584 cleanup:
585 
586     for (i = 0; i < ls; i++)
587         fmpz_clear(output + i);
588 
589     TMP_END;
590 
591     return;
592 }
593 
594 
_finaljoinworker(void * varg)595 static void _finaljoinworker(void * varg)
596 {
597     _njoinworker_arg_struct * arg = (_njoinworker_arg_struct *) varg;
598     _joinbase_struct * base = arg->base;
599     const fmpz_mpoly_ctx_struct * ctx = base->ctx;
600     flint_bitcnt_t bits = base->G->bits;
601     slong N = mpoly_words_per_exp_sp(bits, ctx->minfo);
602     slong i, j;
603     slong source_len;
604     ulong * source_exps;
605     fmpz * source_coeffs;
606     slong Ti;
607     ulong * Texps;
608     fmpz * Tcoeffs;
609 
610     for (i = base->chunks_length - 1; i >= 0; i--)
611     {
612         int type = base->chunks[i].GAB;
613 
614         FLINT_ASSERT(base->chunks[i].thread_idx >= 0);
615 
616         if (base->chunks[i].thread_idx != arg->thread_idx)
617             continue;
618 
619         if (type == 0)
620         {
621             Texps = base->G->exps;
622             Tcoeffs = base->G->coeffs;
623         }
624         else if (type == 1)
625         {
626             Texps = base->Abar->exps;
627             Tcoeffs = base->Abar->coeffs;
628         }
629         else
630         {
631             FLINT_ASSERT(type == 2);
632             Texps = base->Bbar->exps;
633             Tcoeffs = base->Bbar->coeffs;
634         }
635 
636         source_len = base->chunks[i].poly->length;
637         source_exps = base->chunks[i].poly->exps + N*0;
638         source_coeffs = base->chunks[i].poly->coeffs + 0;
639 
640         Ti = base->chunks[i].final_idx;
641         mpoly_copy_monomials(Texps + N*Ti, source_exps, source_len, N);
642         for (j = 0; j < source_len; j++)
643             fmpz_swap(Tcoeffs + Ti + j, source_coeffs + j);
644     }
645 }
646 
647 /*
648     Set 1 <= l <= min(n, m) and fractions v + 0, ..., v + l - 1
649     More comments in useage
650 
651     example operation on input n = 10, m = 16
652 
653     gcd is 2:
654         5/8, 5/8
655     split first 5/8:
656         2/3, 5/8, 3/5,
657     split first 5/8:
658         2/3, 2/3, 3/5, 3/5,
659     split first 3/5:
660         2/3, 2/3, 2/3, 3/5, 1/2
661     split first 3/5:
662         2/3, 2/3, 2/3, 2/3, 1/2, 1/2
663 
664     The maximum fraction is now 0.666 which is not much bigger than n/m
665 */
_divide_master_threads(fmpq * v,slong n,slong m)666 static slong _divide_master_threads(fmpq * v, slong n, slong m)
667 {
668     slong l, i;
669     double score_threashold;
670     fmpq_t left, right;
671 
672     FLINT_ASSERT(n > 0);
673     FLINT_ASSERT(m > 0);
674 
675     fmpq_init(left);
676     fmpq_init(right);
677 
678     score_threashold = (double)(n)/(double)(m);
679     score_threashold *= 1.1;
680 
681     /* initial choice for v */
682     l = n_gcd(n, m);
683     for (i = 0; i < l; i++)
684     {
685         fmpq_set_si(v + i, n, m);
686     }
687 
688     i = 0;
689     while (i < l)
690     {
691         if (fmpz_cmp_ui(fmpq_denref(v + i), 2) >= 0)
692         {
693             fmpq_farey_neighbors(left, right, v + i, fmpq_denref(v + i));
694 
695             FLINT_ASSERT(fmpz_get_ui(fmpq_denref(v + i))
696                             ==  fmpz_get_ui(fmpq_denref(left))
697                               + fmpz_get_ui(fmpq_denref(right)));
698 
699             if (fmpq_sgn(left) > 0 && fmpq_get_d(right) < score_threashold)
700             {
701                 /* delete v + i, add left and right */
702                 FLINT_ASSERT(l < m);
703                 fmpq_set(v + i, right);
704                 fmpq_set(v + l, left);
705                 l++;
706                 continue;
707             }
708         }
709         i++;
710     }
711 
712     fmpq_clear(left);
713     fmpq_clear(right);
714 
715     return l;
716 }
717 
718 /* inputs A and B are modified */
fmpz_mpolyl_gcd_brown_threaded_pool(fmpz_mpoly_t G,fmpz_mpoly_t Abar,fmpz_mpoly_t Bbar,fmpz_mpoly_t A,fmpz_mpoly_t B,const fmpz_mpoly_ctx_t ctx,const mpoly_gcd_info_t I,const thread_pool_handle * handles,slong num_handles)719 int fmpz_mpolyl_gcd_brown_threaded_pool(
720     fmpz_mpoly_t G,
721     fmpz_mpoly_t Abar,
722     fmpz_mpoly_t Bbar,
723     fmpz_mpoly_t A,
724     fmpz_mpoly_t B,
725     const fmpz_mpoly_ctx_t ctx,
726     const mpoly_gcd_info_t I,
727     const thread_pool_handle * handles,
728     slong num_handles)
729 {
730     slong i, j, k;
731     flint_bitcnt_t bits = A->bits;
732     slong N = mpoly_words_per_exp_sp(bits, ctx->minfo);
733     slong num_threads = num_handles + 1;
734     slong num_master_threads;
735     slong num_images;
736     int success;
737     fmpz_t bound, modulus, temp, temp2;
738     fmpz maxcoeff_Gs_Abars_Bbars[3];
739     fmpz sumcoeff_Gs_Abars_Bbars[3];
740     fmpz_t cA, cB, cG, cAbar, cBbar;
741     fmpz ** mptrs;
742     fmpz_mpoly_struct ** gptrs, ** abarptrs, ** bbarptrs;
743     fmpq * qvec;
744     _splitworker_arg_struct * splitargs;
745     _splitbase_t splitbase;
746     _njoinworker_arg_struct * joinargs;
747     _joinbase_t joinbase;
748 
749     fmpz_init(bound);
750     fmpz_init(modulus);
751     fmpz_init(temp);
752     fmpz_init(temp2);
753     for (i = 0; i < 3; i++)
754     {
755         fmpz_init(maxcoeff_Gs_Abars_Bbars + i);
756         fmpz_init(sumcoeff_Gs_Abars_Bbars + i);
757     }
758 
759     /* compute contents of G, Abar, Bbar, A, B */
760     fmpz_init(cA);
761     fmpz_init(cB);
762     fmpz_init(cG);
763     fmpz_init(cAbar);
764     fmpz_init(cBbar);
765     _fmpz_vec_content(cA, A->coeffs, A->length);
766     _fmpz_vec_content(cB, B->coeffs, B->length);
767     fmpz_gcd(cG, cA, cB);
768     fmpz_divexact(cAbar, cA, cG);
769     fmpz_divexact(cBbar, cB, cG);
770 
771     /* remove content from inputs */
772     fmpz_mpoly_scalar_divexact_fmpz(A, A, cA, ctx);
773     fmpz_mpoly_scalar_divexact_fmpz(B, B, cB, ctx);
774 
775     /* init split info */
776     qvec = _fmpq_vec_init(num_threads);
777 
778     fmpz_init(splitbase->gamma);
779     fmpz_gcd(splitbase->gamma, A->coeffs + 0, B->coeffs + 0);
780 
781     /*
782         if compute_split is jumped back to, there could be as many as
783         num_threads + 1 images that need to be joined.
784     */
785     gptrs = (fmpz_mpoly_struct **) flint_malloc(
786                                 (num_threads + 1)*sizeof(fmpz_mpoly_struct *));
787     abarptrs = (fmpz_mpoly_struct **) flint_malloc(
788                                 (num_threads + 1)*sizeof(fmpz_mpoly_struct *));
789     bbarptrs = (fmpz_mpoly_struct **) flint_malloc(
790                                 (num_threads + 1)*sizeof(fmpz_mpoly_struct *));
791     mptrs = (fmpz **) flint_malloc((num_threads + 1)*sizeof(fmpz *));
792 
793     splitargs = (_splitworker_arg_struct *) flint_malloc(
794                                   num_threads*sizeof(_splitworker_arg_struct));
795     for (i = 0; i < num_threads; i++)
796     {
797         fmpz_mpoly_init3(splitargs[i].G, 0, bits, ctx);
798         fmpz_mpoly_init3(splitargs[i].Abar, 0, bits, ctx);
799         fmpz_mpoly_init3(splitargs[i].Bbar, 0, bits, ctx);
800         fmpz_init(splitargs[i].modulus);
801         splitargs[i].worker_handles = (thread_pool_handle *) flint_malloc(
802                                        num_threads*sizeof(thread_pool_handle));
803     }
804 
805     splitbase->num_threads = num_threads;
806     splitbase->A = A;
807     splitbase->B = B;
808     splitbase->ctx = ctx;
809     splitbase->p = UWORD(1) << (FLINT_BITS - 2);
810     splitbase->I = I;
811 
812 #if HAVE_PTHREAD
813     pthread_mutex_init(&splitbase->mutex, NULL);
814 #endif
815 
816     /* initial bound on target modulus */
817     fmpz_mpoly_height(bound, A, ctx);
818     fmpz_mpoly_height(temp, B, ctx);
819 
820     if (fmpz_cmp(bound, temp) < 0)
821         fmpz_swap(bound, temp);
822     fmpz_mul(bound, bound, splitbase->gamma);
823     fmpz_add(bound, bound, bound);
824 
825     /* no images yet */
826     fmpz_one(modulus);
827 
828 compute_split:
829 
830     splitbase->gcd_is_one = 0;
831     fmpz_cdiv_q(temp, bound, modulus);
832     fmpz_add_ui(temp, temp, 2);
833 
834     /*
835         n := fmpz_clog_ui(temp, splitbase->p) is the number of images we need.
836         We have m := num_threads threads available for work.
837         We can calculate images in parallel (parallel on p), but the
838         calculation of each image can itself also be done in parallel.
839 
840         An integer l := num_master_threads with 1 <= l <= min(n, m) is selected
841         and fractions a_i/b_i, 0 <= i < l are also selected with sum_i(a_i) = n
842         and sum_i(b_i) = m. Then l jobs are spawned, each doing a_i images
843         where each image uses b_i threads.
844     */
845     num_master_threads = _divide_master_threads(qvec,
846                                 fmpz_clog_ui(temp, splitbase->p), num_threads);
847     FLINT_ASSERT(num_master_threads > 0);
848 
849     k = 0;
850     for (i = 0; i < num_master_threads; i++)
851     {
852         splitargs[i].idx = i;
853         splitargs[i].base = splitbase;
854         FLINT_ASSERT(fmpz_fits_si(fmpq_numref(qvec + i)));
855         FLINT_ASSERT(fmpz_fits_si(fmpq_denref(qvec + i)));
856         splitargs[i].required_images = fmpz_get_si(fmpq_numref(qvec + i));
857         splitargs[i].num_handles = fmpz_get_si(fmpq_denref(qvec + i)) - 1;
858         FLINT_ASSERT(splitargs[i].required_images > 0);
859         FLINT_ASSERT(splitargs[i].num_handles >= 0);
860 
861         if (i == 0)
862         {
863             splitargs[i].master_handle = -1;
864         }
865         else
866         {
867             splitargs[i].master_handle = handles[k++];
868         }
869         FLINT_ASSERT(splitargs[i].num_handles <= num_handles);
870         for (j = 0; j < splitargs[i].num_handles; j++)
871         {
872             splitargs[i].worker_handles[j] = handles[k++];
873         }
874     }
875     /* all handles should have been distributed */
876     FLINT_ASSERT(k == num_handles);
877 
878     for (i = 1; i < num_master_threads; i++)
879     {
880         thread_pool_wake(global_thread_pool, splitargs[i].master_handle, 0,
881                                                  _splitworker, &splitargs[i]);
882     }
883     _splitworker(&splitargs[0]);
884     for (i = 1; i < num_master_threads; i++)
885     {
886         thread_pool_wait(global_thread_pool, splitargs[i].master_handle);
887     }
888 
889     if (splitbase->gcd_is_one)
890     {
891         fmpz_mpoly_one(G, ctx);
892         fmpz_mpoly_swap(Abar, A, ctx);
893         fmpz_mpoly_swap(Bbar, B, ctx);
894         goto successful_put_content;
895     }
896 
897     /* check each thread reached its goal */
898     for (i = 0; i < num_master_threads; i++)
899     {
900         if (splitargs[i].image_count < splitargs[i].required_images)
901         {
902             /* ran out of rational primes - must fail */
903             success = 0;
904             goto cleanup_split;
905         }
906     }
907 
908     /* find images to join */
909     num_images = 0;
910     if (!fmpz_is_one(modulus))
911     {
912         gptrs[num_images] = G;
913         abarptrs[num_images] = Abar;
914         bbarptrs[num_images] = Bbar;
915         mptrs[num_images] = modulus;
916         num_images++;
917     }
918 
919     i = 0;
920     if (num_images == 0)
921     {
922         gptrs[num_images] = splitargs[i].G;
923         abarptrs[num_images] = splitargs[i].Abar;
924         bbarptrs[num_images] = splitargs[i].Bbar;
925         mptrs[num_images] = splitargs[i].modulus;
926         num_images++;
927         i++;
928     }
929 
930     FLINT_ASSERT(num_images <= num_master_threads + 1);
931 
932     for (; i < num_master_threads; i++)
933     {
934         int cmp = 0;
935         cmp = mpoly_monomial_cmp_nomask(gptrs[0]->exps + N*0,
936                                         splitargs[i].G->exps + N*0, N);
937 
938         if (cmp < 0)
939         {
940             /* splitarg[i] was unlucky - ignore it */
941         }
942         else
943         {
944             if (cmp > 0)
945             {
946                 /* splitarg[0], ..., splitarg[i - 1] were unlucky */
947                 num_images = 0;
948             }
949             gptrs[num_images] = splitargs[i].G;
950             abarptrs[num_images] = splitargs[i].Abar;
951             bbarptrs[num_images] = splitargs[i].Bbar;
952             mptrs[num_images] = splitargs[i].modulus;
953             num_images++;
954         }
955         FLINT_ASSERT(num_images <= num_master_threads + 1);
956     }
957 
958     /* now must join ptrs[0], ..., ptrs[num_images-1] where num_images > 0 */
959     FLINT_ASSERT(num_images > 0);
960     fmpz_multi_crt_init(joinbase->CRT);
961     success = fmpz_multi_crt_precompute_p(joinbase->CRT,
962                                      (const fmpz * const *) mptrs, num_images);
963     FLINT_ASSERT(success);
964 
965     joinbase->num_images = num_images;
966     joinbase->gptrs = gptrs;
967     joinbase->abarptrs = abarptrs;
968     joinbase->bbarptrs = bbarptrs;
969     joinbase->G = G;
970     joinbase->Abar = Abar;
971     joinbase->Bbar = Bbar;
972     joinbase->ctx = ctx;
973 #if HAVE_PTHREAD
974     pthread_mutex_init(&joinbase->mutex, NULL);
975 #endif
976 
977     joinargs = (_njoinworker_arg_struct *) flint_malloc(
978                                   num_threads*sizeof(_njoinworker_arg_struct));
979 
980     for (i = 0; i < num_threads; i++)
981     {
982         joinargs[i].base = joinbase;
983         joinargs[i].thread_idx = i;
984     }
985 
986     joinbase->chunks_length = 3*num_threads;
987     joinbase->chunks = (_joinworker_arg_struct *) flint_malloc(
988                    joinbase->chunks_length*sizeof(_joinworker_arg_struct));
989 
990     FLINT_ASSERT(joinbase->chunks_length >= 3);
991 
992     for (i = 0; i < 3; i++)
993     {
994         fmpz_mpoly_struct * poly = (i == 0) ? gptrs[0]
995                                  : (i == 1) ? abarptrs[0]
996                                  :            bbarptrs[0];
997 
998         for (j = 0; j < num_threads; j++)
999         {
1000             _joinworker_arg_struct * d = joinbase->chunks + i*num_threads + j;
1001             fmpz_mpoly_init3(d->poly, 0, bits, ctx);
1002             fmpz_init(d->maxcoeff);
1003             fmpz_init(d->sumcoeff);
1004             d->GAB = i;
1005             d->thread_idx = -WORD(1);
1006             d->final_idx = -WORD(1);
1007 
1008             d->hint_start = poly->length * (j + 0) / num_threads;
1009             d->hint_stop  = poly->length * (j + 1) / num_threads;
1010 
1011             d->left_exp = NULL;
1012             d->right_exp = NULL;
1013 
1014             FLINT_ASSERT(0 <= d->hint_start);
1015             FLINT_ASSERT(d->hint_start <= d->hint_stop);
1016             FLINT_ASSERT(d->hint_stop <= poly->length);
1017             FLINT_ASSERT(d->hint_start < poly->length);
1018 
1019             if (j > 0)
1020             {
1021                 FLINT_ASSERT(d->hint_start < poly->length);
1022                 d->left_exp = poly->exps + N*d->hint_start;
1023             }
1024 
1025             if (j + 1 < num_threads)
1026             {
1027                 FLINT_ASSERT(d->hint_stop < poly->length);
1028                 d->right_exp = poly->exps + N*d->hint_stop;
1029             }
1030         }
1031     }
1032 
1033     joinbase->idx = 0;
1034     for (i = 0; i + 1 < num_threads; i++)
1035     {
1036         thread_pool_wake(global_thread_pool,
1037                                  handles[i], 0, _joinworker, joinargs + i);
1038     }
1039     _joinworker(joinargs + num_threads - 1);
1040     for (i = 0; i + 1 < num_threads; i++)
1041     {
1042         thread_pool_wait(global_thread_pool, handles[i]);
1043     }
1044 
1045     /* final trivial join */
1046     for (i = 0; i < 3; i++)
1047     {
1048         fmpz_zero(maxcoeff_Gs_Abars_Bbars + i);
1049         fmpz_zero(sumcoeff_Gs_Abars_Bbars + i);
1050     }
1051 
1052     {
1053         slong idxs[3] = {0, 0, 0};
1054         for (i = 0; i < joinbase->chunks_length; i++)
1055         {
1056             int type = joinbase->chunks[i].GAB;
1057             FLINT_ASSERT(0 <= type && type < 3);
1058 
1059             joinbase->chunks[i].final_idx = idxs[type];
1060             idxs[type] += joinbase->chunks[i].poly->length;
1061 
1062             fmpz_add(sumcoeff_Gs_Abars_Bbars + type,
1063                      sumcoeff_Gs_Abars_Bbars + type,
1064                      joinbase->chunks[i].sumcoeff);
1065 
1066             if (fmpz_cmp(maxcoeff_Gs_Abars_Bbars + type,
1067                          joinbase->chunks[i].maxcoeff) < 0)
1068             {
1069                 fmpz_set(maxcoeff_Gs_Abars_Bbars + type,
1070                              joinbase->chunks[i].maxcoeff);
1071             }
1072         }
1073 
1074         fmpz_mpoly_fit_length(G, idxs[0], ctx);
1075         fmpz_mpoly_fit_length(Abar, idxs[1], ctx);
1076         fmpz_mpoly_fit_length(Bbar, idxs[2], ctx);
1077         G->length = idxs[0];
1078         Abar->length = idxs[1];
1079         Bbar->length = idxs[2];
1080     }
1081 
1082     joinbase->idx = 0;
1083     for (i = 0; i + 1 < num_threads; i++)
1084     {
1085         thread_pool_wake(global_thread_pool,
1086                             handles[i], 0, _finaljoinworker, joinargs + i);
1087     }
1088     _finaljoinworker(joinargs + num_threads - 1);
1089     for (i = 0; i + 1 < num_threads; i++)
1090     {
1091         thread_pool_wait(global_thread_pool, handles[i]);
1092     }
1093 
1094     FLINT_ASSERT(fmpz_mpoly_is_canonical(G, ctx));
1095     FLINT_ASSERT(fmpz_mpoly_is_canonical(Abar, ctx));
1096     FLINT_ASSERT(fmpz_mpoly_is_canonical(Bbar, ctx));
1097 
1098 #if HAVE_PTHREAD
1099     pthread_mutex_destroy(&joinbase->mutex);
1100 #endif
1101 
1102     /* free join data */
1103     fmpz_multi_crt_clear(joinbase->CRT);
1104     for (i = 0; i < joinbase->chunks_length; i++)
1105     {
1106         fmpz_mpoly_clear(joinbase->chunks[i].poly, ctx);
1107         fmpz_clear(joinbase->chunks[i].maxcoeff);
1108         fmpz_clear(joinbase->chunks[i].sumcoeff);
1109     }
1110 
1111     flint_free(joinbase->chunks);
1112     flint_free(joinargs);
1113 
1114     /* update modulus - modulus could be one of the mptrs */
1115     fmpz_one(temp);
1116     for (i = 0; i < num_images; i++)
1117     {
1118         fmpz_mul(temp, temp, mptrs[i]);
1119     }
1120     fmpz_swap(modulus, temp);
1121 
1122     for (i = 1; i < 3; i++)
1123     {
1124         fmpz_mul(temp, maxcoeff_Gs_Abars_Bbars + 0,
1125                        sumcoeff_Gs_Abars_Bbars + i);
1126         fmpz_mul(temp2, sumcoeff_Gs_Abars_Bbars + 0,
1127                         maxcoeff_Gs_Abars_Bbars + i);
1128         if (fmpz_cmp(temp, temp2) > 0)
1129             fmpz_swap(temp, temp2);
1130         fmpz_mul_2exp(temp, temp, 1);
1131         if (fmpz_cmp(temp, modulus) >= 0)
1132         {
1133             fmpz_mul_2exp(bound, modulus, 2*FLINT_BITS);
1134             goto compute_split;
1135         }
1136     }
1137 
1138     FLINT_ASSERT(fmpz_equal(splitbase->gamma, G->coeffs + 0));
1139 
1140     _fmpz_vec_content(temp, G->coeffs, G->length);
1141     fmpz_mpoly_scalar_divexact_fmpz(G, G, temp, ctx);
1142     fmpz_mpoly_scalar_divexact_fmpz(Abar, Abar, G->coeffs + 0, ctx);
1143     fmpz_mpoly_scalar_divexact_fmpz(Bbar, Bbar, G->coeffs + 0, ctx);
1144 
1145 successful_put_content:
1146 
1147     fmpz_mpoly_scalar_mul_fmpz(G, G, cG, ctx);
1148     fmpz_mpoly_scalar_mul_fmpz(Abar, Abar, cAbar, ctx);
1149     fmpz_mpoly_scalar_mul_fmpz(Bbar, Bbar, cBbar, ctx);
1150 
1151     success = 1;
1152 
1153 cleanup_split:
1154 
1155 #if HAVE_PTHREAD
1156     pthread_mutex_destroy(&splitbase->mutex);
1157 #endif
1158     fmpz_clear(splitbase->gamma);
1159 
1160     for (i = 0; i < num_threads; i++)
1161     {
1162         fmpz_mpoly_clear(splitargs[i].G, ctx);
1163         fmpz_mpoly_clear(splitargs[i].Abar, ctx);
1164         fmpz_mpoly_clear(splitargs[i].Bbar, ctx);
1165         fmpz_clear(splitargs[i].modulus);
1166         flint_free(splitargs[i].worker_handles);
1167     }
1168 
1169     flint_free(gptrs);
1170     flint_free(abarptrs);
1171     flint_free(bbarptrs);
1172     flint_free(mptrs);
1173     flint_free(splitargs);
1174 
1175     _fmpq_vec_clear(qvec, num_threads);
1176 
1177     fmpz_clear(cA);
1178     fmpz_clear(cB);
1179     fmpz_clear(cG);
1180     fmpz_clear(cAbar);
1181     fmpz_clear(cBbar);
1182 
1183     fmpz_clear(bound);
1184     fmpz_clear(modulus);
1185     fmpz_clear(temp);
1186     fmpz_clear(temp2);
1187     for (i = 0; i < 3; i++)
1188     {
1189         fmpz_clear(maxcoeff_Gs_Abars_Bbars + i);
1190         fmpz_clear(sumcoeff_Gs_Abars_Bbars + i);
1191     }
1192 
1193     return success;
1194 }
1195 
1196 
1197 typedef struct
1198 {
1199     fmpz_mpoly_struct * Pl;
1200     const fmpz_mpoly_ctx_struct * lctx;
1201     const fmpz_mpoly_struct * P;
1202     const fmpz_mpoly_ctx_struct * ctx;
1203     const slong * perm;
1204     const ulong * shift;
1205     const ulong * stride;
1206     const thread_pool_handle * handles;
1207     slong num_handles;
1208 }
1209 _convertl_arg_struct;
1210 
1211 typedef _convertl_arg_struct _convertl_arg_t[1];
1212 
_worker_convertu(void * varg)1213 static void _worker_convertu(void * varg)
1214 {
1215     _convertl_arg_struct * arg = (_convertl_arg_struct *) varg;
1216 
1217     fmpz_mpoly_to_mpoly_perm_deflate_threaded_pool(arg->Pl, arg->lctx, arg->P, arg->ctx,
1218                                          arg->perm, arg->shift, arg->stride,
1219                                                arg->handles, arg->num_handles);
1220 }
1221 
fmpz_mpoly_gcd_brown_threaded(fmpz_mpoly_t G,const fmpz_mpoly_t A,const fmpz_mpoly_t B,const fmpz_mpoly_ctx_t ctx)1222 int fmpz_mpoly_gcd_brown_threaded(
1223     fmpz_mpoly_t G,
1224     const fmpz_mpoly_t A,
1225     const fmpz_mpoly_t B,
1226     const fmpz_mpoly_ctx_t ctx)
1227 {
1228     int success;
1229     slong * perm;
1230     ulong * shift, * stride;
1231     slong i;
1232     flint_bitcnt_t ABbits;
1233     fmpz_mpoly_ctx_t lctx;
1234     fmpz_mpoly_t Al, Bl, Gl, Abarl, Bbarl;
1235     thread_pool_handle * handles;
1236     slong num_handles;
1237     slong thread_limit = FLINT_MIN(A->length, B->length)/16;
1238 
1239     if (fmpz_mpoly_is_zero(A, ctx))
1240     {
1241         if (fmpz_mpoly_is_zero(B, ctx))
1242         {
1243             fmpz_mpoly_zero(G, ctx);
1244             return 1;
1245         }
1246         if (fmpz_sgn(B->coeffs + 0) < 0)
1247         {
1248             fmpz_mpoly_neg(G, B, ctx);
1249             return 1;
1250         }
1251         else
1252         {
1253             fmpz_mpoly_set(G, B, ctx);
1254             return 1;
1255         }
1256     }
1257 
1258     if (fmpz_mpoly_is_zero(B, ctx))
1259     {
1260         if (fmpz_sgn(A->coeffs + 0) < 0)
1261         {
1262             fmpz_mpoly_neg(G, A, ctx);
1263             return 1;
1264         }
1265         else
1266         {
1267             fmpz_mpoly_set(G, A, ctx);
1268             return 1;
1269         }
1270     }
1271 
1272     if (A->bits > FLINT_BITS || B->bits > FLINT_BITS)
1273     {
1274         return 0;
1275     }
1276 
1277     perm = (slong *) flint_malloc(ctx->minfo->nvars*sizeof(slong));
1278     shift = (ulong *) flint_malloc(ctx->minfo->nvars*sizeof(ulong));
1279     stride = (ulong *) flint_malloc(ctx->minfo->nvars*sizeof(ulong));
1280     for (i = 0; i < ctx->minfo->nvars; i++)
1281     {
1282         perm[i] = i;
1283         shift[i] = 0;
1284         stride[i] = 1;
1285     }
1286 
1287     if (ctx->minfo->nvars == 1)
1288     {
1289         fmpz_poly_t a, b, g;
1290         fmpz_poly_init(a);
1291         fmpz_poly_init(b);
1292         fmpz_poly_init(g);
1293         _fmpz_mpoly_to_fmpz_poly_deflate(a, A, 0, shift, stride, ctx);
1294         _fmpz_mpoly_to_fmpz_poly_deflate(b, B, 0, shift, stride, ctx);
1295         fmpz_poly_gcd(g, a, b);
1296         _fmpz_mpoly_from_fmpz_poly_inflate(G, A->bits, g, 0, shift, stride, ctx);
1297         fmpz_poly_clear(a);
1298         fmpz_poly_clear(b);
1299         fmpz_poly_clear(g);
1300         success = 1;
1301         goto cleanup1;
1302     }
1303 
1304     ABbits = FLINT_MAX(A->bits, B->bits);
1305 
1306     fmpz_mpoly_ctx_init(lctx, ctx->minfo->nvars, ORD_LEX);
1307     fmpz_mpoly_init3(Al, 0, ABbits, lctx);
1308     fmpz_mpoly_init3(Bl, 0, ABbits, lctx);
1309     fmpz_mpoly_init3(Gl, 0, ABbits, lctx);
1310     fmpz_mpoly_init3(Abarl, 0, ABbits, lctx);
1311     fmpz_mpoly_init3(Bbarl, 0, ABbits, lctx);
1312 
1313     num_handles = flint_request_threads(&handles, thread_limit);
1314 
1315     /* convert inputs */
1316     if (num_handles > 0)
1317     {
1318         slong m = mpoly_divide_threads(num_handles, A->length, B->length);
1319         _convertl_arg_t arg;
1320 
1321         FLINT_ASSERT(m >= 0);
1322         FLINT_ASSERT(m < num_handles);
1323 
1324         arg->Pl = Bl;
1325         arg->lctx = lctx;
1326         arg->P = B;
1327         arg->ctx = ctx;
1328         arg->perm = perm;
1329         arg->shift = shift;
1330         arg->stride = stride;
1331         arg->handles = handles + (m + 1);
1332         arg->num_handles = num_handles - (m + 1);
1333 
1334         thread_pool_wake(global_thread_pool, handles[m], 0, _worker_convertu, arg);
1335 
1336         fmpz_mpoly_to_mpoly_perm_deflate_threaded_pool(Al, lctx, A, ctx,
1337                                          perm, shift, stride, handles + 0, m);
1338 
1339         thread_pool_wait(global_thread_pool, handles[m]);
1340     }
1341     else
1342     {
1343         fmpz_mpoly_to_mpoly_perm_deflate_threaded_pool(Al, lctx, A, ctx,
1344                                                  perm, shift, stride, NULL, 0);
1345         fmpz_mpoly_to_mpoly_perm_deflate_threaded_pool(Bl, lctx, B, ctx,
1346                                                  perm, shift, stride, NULL, 0);
1347     }
1348 
1349     /* calculate gcd */
1350     success = fmpz_mpolyl_gcd_brown_threaded_pool(Gl, Abarl, Bbarl, Al, Bl,
1351                                              lctx, NULL, handles, num_handles);
1352 
1353     flint_give_back_threads(handles, num_handles);
1354 
1355     if (success)
1356     {
1357         fmpz_mpoly_from_mpoly_perm_inflate(G, ABbits, ctx, Gl, lctx,
1358                                                           perm, shift, stride);
1359         if (fmpz_sgn(G->coeffs + 0) < 0)
1360             fmpz_mpoly_neg(G, G, ctx);
1361     }
1362 
1363     fmpz_mpoly_clear(Al, lctx);
1364     fmpz_mpoly_clear(Bl, lctx);
1365     fmpz_mpoly_clear(Gl, lctx);
1366     fmpz_mpoly_clear(Abarl, lctx);
1367     fmpz_mpoly_clear(Bbarl, lctx);
1368     fmpz_mpoly_ctx_clear(lctx);
1369 
1370 cleanup1:
1371 
1372     flint_free(perm);
1373     flint_free(shift);
1374     flint_free(stride);
1375 
1376     return success;
1377 }
1378 
1379 
1380