antic/fmpz_mpoly/gcd_brown_threaded.c

/*
    Copyright (C) 2019 Daniel Schultz

    This file is part of FLINT.

    FLINT is free software: you can redistribute it and/or modify it under
    the terms of the GNU Lesser General Public License (LGPL) as published
    by the Free Software Foundation; either version 2.1 of the License, or
    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
*/

#include "fmpz_mpoly.h"
#include "nmod_mpoly.h"
#include "thread_pool.h"
#include "fmpq.h"

typedef struct
{
    volatile int gcd_is_one;
    volatile mp_limb_t p;
#if HAVE_PTHREAD
    pthread_mutex_t mutex;
#endif
    fmpz_t gamma;
    const fmpz_mpoly_ctx_struct * ctx;
    fmpz_mpoly_struct * A, * B;
    ulong num_threads;
    slong var;
    const mpoly_gcd_info_struct * I;
}
_splitbase_struct;

typedef _splitbase_struct _splitbase_t[1];

typedef struct
{
    slong idx;
    _splitbase_struct * base;
    fmpz_mpoly_t G, Abar, Bbar;
    fmpz_t modulus;
    slong image_count;
    slong required_images;
    thread_pool_handle master_handle;
    slong num_handles;
    thread_pool_handle * worker_handles;

    nmod_mpoly_ctx_t pctx;
    nmod_mpolyn_t Ap, Bp, Gp, Abarp, Bbarp;
    fmpz_mpoly_t T, T1, T2;
}
_splitworker_arg_struct;


/* worker for reducing polynomial over ZZ to polynomial over Fp */
static void _reduce_Bp_worker(void * varg)
{
    _splitworker_arg_struct * arg = (_splitworker_arg_struct *) varg;
    fmpz_mpoly_interp_reduce_p_mpolyn(arg->Bp, arg->pctx, arg->base->B,
                                                               arg->base->ctx);
}

/* workers for crt'ing polynomials */
static void _join_Abar_worker(void * varg)
{
    _splitworker_arg_struct * arg = (_splitworker_arg_struct *) varg;
    if (!fmpz_is_one(arg->modulus))
        fmpz_mpoly_interp_crt_p_mpolyn(arg->Abar, arg->T1, arg->base->ctx,
                                          arg->modulus, arg->Abarp, arg->pctx);
    else
        fmpz_mpoly_interp_lift_p_mpolyn(arg->Abar, arg->base->ctx,
                                                        arg->Abarp, arg->pctx);
}

static void _join_Bbar_worker(void * varg)
{
    _splitworker_arg_struct * arg = (_splitworker_arg_struct *) varg;
    if (!fmpz_is_one(arg->modulus))
        fmpz_mpoly_interp_crt_p_mpolyn(arg->Bbar, arg->T2, arg->base->ctx,
                                          arg->modulus, arg->Bbarp, arg->pctx);
    else
        fmpz_mpoly_interp_lift_p_mpolyn(arg->Bbar, arg->base->ctx,
                                                        arg->Bbarp, arg->pctx);
}


static void _splitworker(void * varg)
{
    _splitworker_arg_struct * arg = (_splitworker_arg_struct *) varg;
    _splitbase_struct * base = arg->base;
    const fmpz_mpoly_ctx_struct * ctx = base->ctx;
    flint_bitcnt_t bits = base->A->bits;
    slong N = mpoly_words_per_exp_sp(bits, ctx->minfo);
    slong offset, shift;
    int success;
    mp_limb_t p, gammared;
    nmod_poly_stack_t Sp;

    mpoly_gen_offset_shift_sp(&offset, &shift,
                                      ctx->minfo->nvars - 1, bits, ctx->minfo);

    fmpz_one(arg->modulus);
    arg->image_count = 0;

    nmod_mpoly_ctx_init(arg->pctx, ctx->minfo->nvars, ORD_LEX, 2);
    nmod_poly_stack_init(Sp, bits, arg->pctx);
    nmod_mpolyn_init(arg->Ap, bits, arg->pctx);
    nmod_mpolyn_init(arg->Bp, bits, arg->pctx);
    nmod_mpolyn_init(arg->Gp, bits, arg->pctx);
    nmod_mpolyn_init(arg->Abarp, bits, arg->pctx);
    nmod_mpolyn_init(arg->Bbarp, bits, arg->pctx);
    fmpz_mpoly_init3(arg->T, 0, bits, ctx);
    fmpz_mpoly_init3(arg->T1, 0, bits, ctx);
    fmpz_mpoly_init3(arg->T2, 0, bits, ctx);

    while (arg->image_count < arg->required_images)
    {
        /* get prime */
#if HAVE_PTHREAD
        pthread_mutex_lock(&base->mutex);
#endif
	p = base->p;
        if (p >= UWORD_MAX_PRIME)
        {
#if HAVE_PTHREAD
            pthread_mutex_unlock(&base->mutex);
#endif
            break;
        }
        p = n_nextprime(base->p, 1);
        base->p = p;
#if HAVE_PTHREAD
        pthread_mutex_unlock(&base->mutex);
#endif

        /* make sure reduction does not kill both lc(A) and lc(B) */
        gammared = fmpz_fdiv_ui(base->gamma, p);
        if (gammared == 0)
        {
            continue;
        }

        nmod_mpoly_ctx_set_modulus(arg->pctx, p);

        /* the unfortunate nmod poly's store their own context :( */
        nmod_poly_stack_set_ctx(Sp, arg->pctx);
        nmod_mpolyn_set_mod(arg->Ap, arg->pctx->ffinfo->mod);
        nmod_mpolyn_set_mod(arg->Bp, arg->pctx->ffinfo->mod);
        nmod_mpolyn_set_mod(arg->Gp, arg->pctx->ffinfo->mod);
        nmod_mpolyn_set_mod(arg->Abarp, arg->pctx->ffinfo->mod);
        nmod_mpolyn_set_mod(arg->Bbarp, arg->pctx->ffinfo->mod);

        /* reduce to Fp and calculate an image gcd */
        if (arg->num_handles > 0)
        {
            thread_pool_wake(global_thread_pool, arg->worker_handles[0], 0,
                                                       _reduce_Bp_worker, arg);

            fmpz_mpoly_interp_reduce_p_mpolyn(arg->Ap, arg->pctx, base->A, ctx);

            thread_pool_wait(global_thread_pool, arg->worker_handles[0]);

            success = nmod_mpolyn_gcd_brown_smprime_threaded_pool(
                                  arg->Gp, arg->Abarp, arg->Bbarp,
                   arg->Ap, arg->Bp, ctx->minfo->nvars - 1, arg->pctx, base->I,
                                       arg->worker_handles, arg->num_handles);
        }
        else
        {
            /* reduction should kill neither A nor B */
            fmpz_mpoly_interp_reduce_p_mpolyn(arg->Ap, arg->pctx, base->A, ctx);
            fmpz_mpoly_interp_reduce_p_mpolyn(arg->Bp, arg->pctx, base->B, ctx);
            FLINT_ASSERT(arg->Ap->length > 0);
            FLINT_ASSERT(arg->Bp->length > 0);
            success = nmod_mpolyn_gcd_brown_smprime(
                            arg->Gp, arg->Abarp, arg->Bbarp, arg->Ap, arg->Bp,
                                ctx->minfo->nvars - 1, arg->pctx, base->I, Sp);
        }

        if (!success)
        {
            continue;
        }

        FLINT_ASSERT(arg->Gp->length > 0);
        FLINT_ASSERT(arg->Abarp->length > 0);
        FLINT_ASSERT(arg->Bbarp->length > 0);

        /* check up */
        if (base->gcd_is_one)
        {
            break;
        }

        if (nmod_mpolyn_is_nonzero_nmod(arg->Gp, arg->pctx))
        {
            base->gcd_is_one = 1;
            break;
        }

        if (!fmpz_is_one(arg->modulus))
        {
            int cmp = 0;
            slong k;

            FLINT_ASSERT(arg->G->length > 0);

            k = nmod_poly_degree(arg->Gp->coeffs + 0);
            cmp = mpoly_monomial_cmp_nomask_extra(arg->G->exps + N*0,
                                   arg->Gp->exps + N*0, N, offset, k << shift);
            if (cmp < 0)
            {
                continue;
            }
            else if (cmp > 0)
            {
                fmpz_one(arg->modulus);
                arg->image_count = 0;
            }
        }

        FLINT_ASSERT(1 == nmod_mpolyn_leadcoeff(arg->Gp, arg->pctx));
        nmod_mpolyn_scalar_mul_nmod(arg->Gp, gammared, arg->pctx);

        /* crt image gcd */
        if (arg->num_handles > 0)
        {
            thread_pool_wake(global_thread_pool, arg->worker_handles[0], 0,
                                                       _join_Abar_worker, arg);
            if (arg->num_handles > 1)
            {
                thread_pool_wake(global_thread_pool, arg->worker_handles[1], 0,
                                                       _join_Bbar_worker, arg);
            }
            else
            {
                _join_Bbar_worker(arg);
            }

            if (!fmpz_is_one(arg->modulus))
                fmpz_mpoly_interp_crt_p_mpolyn(arg->G, arg->T, ctx, arg->modulus,
                                                           arg->Gp, arg->pctx);
            else
                fmpz_mpoly_interp_lift_p_mpolyn(arg->G, ctx, arg->Gp, arg->pctx);

            thread_pool_wait(global_thread_pool, arg->worker_handles[0]);
            if (arg->num_handles > 1)
                thread_pool_wait(global_thread_pool, arg->worker_handles[1]);
        }
        else
        {
            if (!fmpz_is_one(arg->modulus))
            {
                fmpz_mpoly_interp_crt_p_mpolyn(arg->G, arg->T, ctx,
                                             arg->modulus, arg->Gp, arg->pctx);
                fmpz_mpoly_interp_crt_p_mpolyn(arg->Abar, arg->T, ctx,
                                          arg->modulus, arg->Abarp, arg->pctx);
                fmpz_mpoly_interp_crt_p_mpolyn(arg->Bbar, arg->T, ctx,
                                          arg->modulus, arg->Bbarp, arg->pctx);
            }
            else
            {
                fmpz_mpoly_interp_lift_p_mpolyn(arg->G, ctx,
                                                           arg->Gp, arg->pctx);
                fmpz_mpoly_interp_lift_p_mpolyn(arg->Abar, ctx,
                                                        arg->Abarp, arg->pctx);
                fmpz_mpoly_interp_lift_p_mpolyn(arg->Bbar, ctx,
                                                        arg->Bbarp, arg->pctx);
            }
        }

        fmpz_mul_ui(arg->modulus, arg->modulus, p);
        arg->image_count++;
    }

    fmpz_mpoly_clear(arg->T, ctx);
    fmpz_mpoly_clear(arg->T1, ctx);
    fmpz_mpoly_clear(arg->T2, ctx);

    nmod_mpolyn_clear(arg->Ap, arg->pctx);
    nmod_mpolyn_clear(arg->Bp, arg->pctx);
    nmod_mpolyn_clear(arg->Gp, arg->pctx);
    nmod_mpolyn_clear(arg->Abarp, arg->pctx);
    nmod_mpolyn_clear(arg->Bbarp, arg->pctx);
    nmod_poly_stack_clear(Sp);
    nmod_mpoly_ctx_clear(arg->pctx);
}

typedef struct
{
    slong hint_start, hint_stop;
    ulong * left_exp, * right_exp;
    fmpz_mpoly_t poly;
    fmpz_t maxcoeff, sumcoeff;
    slong thread_idx;
    slong final_idx;
    int GAB; /* 0 -> G,  1 -> A,  2 -> B */
}
_joinworker_arg_struct;

/*
    A = crt(B[0], ...., B[count-1]) wrt to P
    This function takes some preallocated temp space.
*/

static void _find_edge(
    slong * start,
    slong count,
    const ulong * exp_left,
    fmpz_mpoly_struct * const * B,
    slong N)
{
    slong k;

    for (k = 0; k < count; k++)
    {
        slong Blength = B[k]->length;
        const ulong * Bexps = B[k]->exps;

        if (start[k] < Blength
            && mpoly_monomial_gt_nomask(Bexps + N*start[k], exp_left, N))
        {
            /* go right */
            do {
                start[k]++;
            } while (start[k] < Blength
                && mpoly_monomial_gt_nomask(Bexps + N*start[k], exp_left, N));
        }
        else
        {
            /* go left */
            while (start[k] > 0
                && !mpoly_monomial_gt_nomask(Bexps + N*(start[k] - 1), exp_left, N))
            {
                start[k]--;
            }
        }
    }
}

static slong _fmpz_mpoly_crt(
    const fmpz_multi_crt_t P,
    _joinworker_arg_struct * S,
    fmpz_mpoly_struct * const * B,
    slong count,
    fmpz * output,
    fmpz ** input,
    const fmpz_mpoly_ctx_t ctx)
{
    int cmp;
    slong N = mpoly_words_per_exp_sp(S->poly->bits, ctx->minfo);
    slong lastdegree;
    slong Ai;
    slong j, k;
    slong * start, * stop;
    fmpz_t zero, max, sum;
    fmpz_mpoly_t A;
    const ulong * exp_left = S->left_exp;
    const ulong * exp_right = S->right_exp;
    TMP_INIT;

    *A = *S->poly;

    TMP_START;

    start = (slong *) TMP_ALLOC(2*count*sizeof(slong));
    stop = start + count;

    for (k = 0; k < count; k++)
    {
        start[k] = exp_left ? FLINT_MIN(S->hint_start, B[k]->length) : 0;
        stop[k] = exp_right ? FLINT_MIN(S->hint_stop, B[k]->length) : B[k]->length;
    }

    if (exp_left)
        _find_edge(start, count, exp_left, B, N);

    if (exp_right)
        _find_edge(stop, count, exp_right, B, N);

#if WANT_ASSERT
    for (k = 0; k < count; k++)
    {
        FLINT_ASSERT(0 <= start[k]);
        FLINT_ASSERT(0 <= stop[k]);
        FLINT_ASSERT(start[k] <= B[k]->length);
        FLINT_ASSERT(stop[k] <= B[k]->length);
        FLINT_ASSERT(start[k] <= stop[k]);

        /* check start */
        if (!exp_left)
        {
            FLINT_ASSERT(start[k] == 0);
        }
        else
        {
            FLINT_ASSERT(start[k] == B[k]->length ||
                    !mpoly_monomial_gt_nomask(B[k]->exps + N*start[k], exp_left, N));

            FLINT_ASSERT(start[k] == 0 ||
                    mpoly_monomial_gt_nomask(B[k]->exps + N*(start[k] - 1), exp_left, N));
        }

        /* check stop */
        if (!exp_right)
        {
            FLINT_ASSERT(stop[k] == B[k]->length);
        }
        else
        {
            FLINT_ASSERT(stop[k] == B[k]->length ||
                   !mpoly_monomial_gt_nomask(B[k]->exps + N*stop[k], exp_right, N));

            FLINT_ASSERT(stop[k] == 0 ||
                    mpoly_monomial_gt_nomask(B[k]->exps + N*(stop[k] - 1), exp_right, N));
        }
    }
#endif

    fmpz_init(zero);
    fmpz_init(max);
    fmpz_init(sum);

    Ai = 0;
    lastdegree = -WORD(1);
    while (1)
    {
        fmpz_mpoly_fit_length(A, Ai + 1, ctx);

        k = 0;
        do
        {
            input[k] = zero;
            if (start[k] < stop[k])
            {
                goto found_max;
            }
        } while (++k < count);

        break; /* all B[k] have been scanned completely */

    found_max:

        input[k] = B[k]->coeffs + start[k];
        mpoly_monomial_set(A->exps + N*Ai, B[k]->exps + N*start[k], N);
        start[k]++;

        for (k++; k < count; k++)
        {
            input[k] = zero;
            if (start[k] >= stop[k])
            {
                continue;
            }

            cmp = mpoly_monomial_cmp_nomask(B[k]->exps + N*start[k], A->exps + N*Ai, N);
            if (cmp == 0)
            {
                input[k] = B[k]->coeffs + start[k];
                start[k]++;
            }
            else if (cmp > 0)
            {
                /* undo previous max's */
                for (j = 0; j < k; j++)
                {
                    start[j] -= (input[j] != zero);
                    input[j] = zero;
                }
                goto found_max;
            }
        }

        _fmpz_multi_crt_run_p(output, P, (const fmpz * const *) input);
        fmpz_swap(A->coeffs + Ai, output + 0);
        cmp = fmpz_sgn(A->coeffs + Ai);
        if (cmp != 0)
        {
            if (fmpz_cmpabs(max, A->coeffs + Ai) < 0)
                fmpz_abs(max, A->coeffs + Ai);
            if (cmp > 0)
                fmpz_add(sum, sum, A->coeffs + Ai);
            else
                fmpz_sub(sum, sum, A->coeffs + Ai);
            Ai++;
        }
    }
    A->length = Ai;

    fmpz_swap(S->maxcoeff, max);
    fmpz_swap(S->sumcoeff, sum);

    fmpz_clear(zero);
    fmpz_clear(max);
    fmpz_clear(sum);

    TMP_END;

    *S->poly = *A;

    return lastdegree;
}


typedef struct
{
    volatile int idx;
#if HAVE_PTHREAD
    pthread_mutex_t mutex;
#endif
    const fmpz_mpoly_ctx_struct * ctx;
    fmpz_multi_crt_t CRT;
    fmpz_mpoly_struct ** gptrs, ** abarptrs, ** bbarptrs;
    fmpz_mpoly_struct * G, * Abar, * Bbar;
    _joinworker_arg_struct * chunks;
    slong chunks_length;
    ulong num_images;
}
_joinbase_struct;

typedef _joinbase_struct _joinbase_t[1];

typedef struct
{
    _joinbase_struct * base;
    slong thread_idx;
}
_njoinworker_arg_struct;

static void _joinworker(void * varg)
{
    _njoinworker_arg_struct * arg = (_njoinworker_arg_struct *) varg;
    _joinbase_struct * base = arg->base;
    fmpz ** input;
    fmpz * output;
    slong i, ls = _fmpz_multi_crt_local_size(base->CRT);
    TMP_INIT;

    TMP_START;

    input = (fmpz **) TMP_ALLOC(base->num_images * sizeof(fmpz *));
    output = (fmpz *) TMP_ALLOC(ls*sizeof(fmpz));
    for (i = 0; i < ls; i++)
        fmpz_init(output + i);

    while (1)
    {
        /* get exponent of either G, Abar, or Bbar to start working on */
#if HAVE_PTHREAD
        pthread_mutex_lock(&base->mutex);
#endif
	i = base->idx;
        base->idx = i + 1;
#if HAVE_PTHREAD
        pthread_mutex_unlock(&base->mutex);
#endif

        if (i >= base->chunks_length)
        {
            goto cleanup;
        }

        base->chunks[i].thread_idx = arg->thread_idx;

        if (base->chunks[i].GAB == 0)
        {
            _fmpz_mpoly_crt(base->CRT, base->chunks + i, base->gptrs,
                                  base->num_images, output, input, base->ctx);
        }
        else if (base->chunks[i].GAB == 1)
        {
            _fmpz_mpoly_crt(base->CRT, base->chunks + i, base->abarptrs,
                                  base->num_images, output, input, base->ctx);
        }
        else
        {
            FLINT_ASSERT(base->chunks[i].GAB == 2);

            _fmpz_mpoly_crt(base->CRT, base->chunks + i, base->bbarptrs,
                                  base->num_images, output, input, base->ctx);
        }
    }

cleanup:

    for (i = 0; i < ls; i++)
        fmpz_clear(output + i);

    TMP_END;

    return;
}


static void _finaljoinworker(void * varg)
{
    _njoinworker_arg_struct * arg = (_njoinworker_arg_struct *) varg;
    _joinbase_struct * base = arg->base;
    const fmpz_mpoly_ctx_struct * ctx = base->ctx;
    flint_bitcnt_t bits = base->G->bits;
    slong N = mpoly_words_per_exp_sp(bits, ctx->minfo);
    slong i, j;
    slong source_len;
    ulong * source_exps;
    fmpz * source_coeffs;
    slong Ti;
    ulong * Texps;
    fmpz * Tcoeffs;

    for (i = base->chunks_length - 1; i >= 0; i--)
    {
        int type = base->chunks[i].GAB;

        FLINT_ASSERT(base->chunks[i].thread_idx >= 0);

        if (base->chunks[i].thread_idx != arg->thread_idx)
            continue;

        if (type == 0)
        {
            Texps = base->G->exps;
            Tcoeffs = base->G->coeffs;
        }
        else if (type == 1)
        {
            Texps = base->Abar->exps;
            Tcoeffs = base->Abar->coeffs;
        }
        else
        {
            FLINT_ASSERT(type == 2);
            Texps = base->Bbar->exps;
            Tcoeffs = base->Bbar->coeffs;
        }

        source_len = base->chunks[i].poly->length;
        source_exps = base->chunks[i].poly->exps + N*0;
        source_coeffs = base->chunks[i].poly->coeffs + 0;

        Ti = base->chunks[i].final_idx;
        mpoly_copy_monomials(Texps + N*Ti, source_exps, source_len, N);
        for (j = 0; j < source_len; j++)
            fmpz_swap(Tcoeffs + Ti + j, source_coeffs + j);
    }
}

/*
    Set 1 <= l <= min(n, m) and fractions v + 0, ..., v + l - 1
    More comments in useage

    example operation on input n = 10, m = 16

    gcd is 2:
        5/8, 5/8
    split first 5/8:
        2/3, 5/8, 3/5,
    split first 5/8:
        2/3, 2/3, 3/5, 3/5,
    split first 3/5:
        2/3, 2/3, 2/3, 3/5, 1/2
    split first 3/5:
        2/3, 2/3, 2/3, 2/3, 1/2, 1/2

    The maximum fraction is now 0.666 which is not much bigger than n/m
*/
static slong _divide_master_threads(fmpq * v, slong n, slong m)
{
    slong l, i;
    double score_threashold;
    fmpq_t left, right;

    FLINT_ASSERT(n > 0);
    FLINT_ASSERT(m > 0);

    fmpq_init(left);
    fmpq_init(right);

    score_threashold = (double)(n)/(double)(m);
    score_threashold *= 1.1;

    /* initial choice for v */
    l = n_gcd(n, m);
    for (i = 0; i < l; i++)
    {
        fmpq_set_si(v + i, n, m);
    }

    i = 0;
    while (i < l)
    {
        if (fmpz_cmp_ui(fmpq_denref(v + i), 2) >= 0)
        {
            fmpq_farey_neighbors(left, right, v + i, fmpq_denref(v + i));

            FLINT_ASSERT(fmpz_get_ui(fmpq_denref(v + i))
                            ==  fmpz_get_ui(fmpq_denref(left))
                              + fmpz_get_ui(fmpq_denref(right)));

            if (fmpq_sgn(left) > 0 && fmpq_get_d(right) < score_threashold)
            {
                /* delete v + i, add left and right */
                FLINT_ASSERT(l < m);
                fmpq_set(v + i, right);
                fmpq_set(v + l, left);
                l++;
                continue;
            }
        }
        i++;
    }

    fmpq_clear(left);
    fmpq_clear(right);

    return l;
}

/* inputs A and B are modified */
int fmpz_mpolyl_gcd_brown_threaded_pool(
    fmpz_mpoly_t G,
    fmpz_mpoly_t Abar,
    fmpz_mpoly_t Bbar,
    fmpz_mpoly_t A,
    fmpz_mpoly_t B,
    const fmpz_mpoly_ctx_t ctx,
    const mpoly_gcd_info_t I,
    const thread_pool_handle * handles,
    slong num_handles)
{
    slong i, j, k;
    flint_bitcnt_t bits = A->bits;
    slong N = mpoly_words_per_exp_sp(bits, ctx->minfo);
    slong num_threads = num_handles + 1;
    slong num_master_threads;
    slong num_images;
    int success;
    fmpz_t bound, modulus, temp, temp2;
    fmpz maxcoeff_Gs_Abars_Bbars[3];
    fmpz sumcoeff_Gs_Abars_Bbars[3];
    fmpz_t cA, cB, cG, cAbar, cBbar;
    fmpz ** mptrs;
    fmpz_mpoly_struct ** gptrs, ** abarptrs, ** bbarptrs;
    fmpq * qvec;
    _splitworker_arg_struct * splitargs;
    _splitbase_t splitbase;
    _njoinworker_arg_struct * joinargs;
    _joinbase_t joinbase;

    fmpz_init(bound);
    fmpz_init(modulus);
    fmpz_init(temp);
    fmpz_init(temp2);
    for (i = 0; i < 3; i++)
    {
        fmpz_init(maxcoeff_Gs_Abars_Bbars + i);
        fmpz_init(sumcoeff_Gs_Abars_Bbars + i);
    }

    /* compute contents of G, Abar, Bbar, A, B */
    fmpz_init(cA);
    fmpz_init(cB);
    fmpz_init(cG);
    fmpz_init(cAbar);
    fmpz_init(cBbar);
    _fmpz_vec_content(cA, A->coeffs, A->length);
    _fmpz_vec_content(cB, B->coeffs, B->length);
    fmpz_gcd(cG, cA, cB);
    fmpz_divexact(cAbar, cA, cG);
    fmpz_divexact(cBbar, cB, cG);

    /* remove content from inputs */
    fmpz_mpoly_scalar_divexact_fmpz(A, A, cA, ctx);
    fmpz_mpoly_scalar_divexact_fmpz(B, B, cB, ctx);

    /* init split info */
    qvec = _fmpq_vec_init(num_threads);

    fmpz_init(splitbase->gamma);
    fmpz_gcd(splitbase->gamma, A->coeffs + 0, B->coeffs + 0);

    /*
        if compute_split is jumped back to, there could be as many as
        num_threads + 1 images that need to be joined.
    */
    gptrs = (fmpz_mpoly_struct **) flint_malloc(
                                (num_threads + 1)*sizeof(fmpz_mpoly_struct *));
    abarptrs = (fmpz_mpoly_struct **) flint_malloc(
                                (num_threads + 1)*sizeof(fmpz_mpoly_struct *));
    bbarptrs = (fmpz_mpoly_struct **) flint_malloc(
                                (num_threads + 1)*sizeof(fmpz_mpoly_struct *));
    mptrs = (fmpz **) flint_malloc((num_threads + 1)*sizeof(fmpz *));

    splitargs = (_splitworker_arg_struct *) flint_malloc(
                                  num_threads*sizeof(_splitworker_arg_struct));
    for (i = 0; i < num_threads; i++)
    {
        fmpz_mpoly_init3(splitargs[i].G, 0, bits, ctx);
        fmpz_mpoly_init3(splitargs[i].Abar, 0, bits, ctx);
        fmpz_mpoly_init3(splitargs[i].Bbar, 0, bits, ctx);
        fmpz_init(splitargs[i].modulus);
        splitargs[i].worker_handles = (thread_pool_handle *) flint_malloc(
                                       num_threads*sizeof(thread_pool_handle));
    }

    splitbase->num_threads = num_threads;
    splitbase->A = A;
    splitbase->B = B;
    splitbase->ctx = ctx;
    splitbase->p = UWORD(1) << (FLINT_BITS - 2);
    splitbase->I = I;

#if HAVE_PTHREAD
    pthread_mutex_init(&splitbase->mutex, NULL);
#endif

    /* initial bound on target modulus */
    fmpz_mpoly_height(bound, A, ctx);
    fmpz_mpoly_height(temp, B, ctx);

    if (fmpz_cmp(bound, temp) < 0)
        fmpz_swap(bound, temp);
    fmpz_mul(bound, bound, splitbase->gamma);
    fmpz_add(bound, bound, bound);

    /* no images yet */
    fmpz_one(modulus);

compute_split:

    splitbase->gcd_is_one = 0;
    fmpz_cdiv_q(temp, bound, modulus);
    fmpz_add_ui(temp, temp, 2);

    /*
        n := fmpz_clog_ui(temp, splitbase->p) is the number of images we need.
        We have m := num_threads threads available for work.
        We can calculate images in parallel (parallel on p), but the
        calculation of each image can itself also be done in parallel.

        An integer l := num_master_threads with 1 <= l <= min(n, m) is selected
        and fractions a_i/b_i, 0 <= i < l are also selected with sum_i(a_i) = n
        and sum_i(b_i) = m. Then l jobs are spawned, each doing a_i images
        where each image uses b_i threads.
    */
    num_master_threads = _divide_master_threads(qvec,
                                fmpz_clog_ui(temp, splitbase->p), num_threads);
    FLINT_ASSERT(num_master_threads > 0);

    k = 0;
    for (i = 0; i < num_master_threads; i++)
    {
        splitargs[i].idx = i;
        splitargs[i].base = splitbase;
        FLINT_ASSERT(fmpz_fits_si(fmpq_numref(qvec + i)));
        FLINT_ASSERT(fmpz_fits_si(fmpq_denref(qvec + i)));
        splitargs[i].required_images = fmpz_get_si(fmpq_numref(qvec + i));
        splitargs[i].num_handles = fmpz_get_si(fmpq_denref(qvec + i)) - 1;
        FLINT_ASSERT(splitargs[i].required_images > 0);
        FLINT_ASSERT(splitargs[i].num_handles >= 0);

        if (i == 0)
        {
            splitargs[i].master_handle = -1;
        }
        else
        {
            splitargs[i].master_handle = handles[k++];
        }
        FLINT_ASSERT(splitargs[i].num_handles <= num_handles);
        for (j = 0; j < splitargs[i].num_handles; j++)
        {
            splitargs[i].worker_handles[j] = handles[k++];
        }
    }
    /* all handles should have been distributed */
    FLINT_ASSERT(k == num_handles);

    for (i = 1; i < num_master_threads; i++)
    {
        thread_pool_wake(global_thread_pool, splitargs[i].master_handle, 0,
                                                 _splitworker, &splitargs[i]);
    }
    _splitworker(&splitargs[0]);
    for (i = 1; i < num_master_threads; i++)
    {
        thread_pool_wait(global_thread_pool, splitargs[i].master_handle);
    }

    if (splitbase->gcd_is_one)
    {
        fmpz_mpoly_one(G, ctx);
        fmpz_mpoly_swap(Abar, A, ctx);
        fmpz_mpoly_swap(Bbar, B, ctx);
        goto successful_put_content;
    }

    /* check each thread reached its goal */
    for (i = 0; i < num_master_threads; i++)
    {
        if (splitargs[i].image_count < splitargs[i].required_images)
        {
            /* ran out of rational primes - must fail */
            success = 0;
            goto cleanup_split;
        }
    }

    /* find images to join */
    num_images = 0;
    if (!fmpz_is_one(modulus))
    {
        gptrs[num_images] = G;
        abarptrs[num_images] = Abar;
        bbarptrs[num_images] = Bbar;
        mptrs[num_images] = modulus;
        num_images++;
    }

    i = 0;
    if (num_images == 0)
    {
        gptrs[num_images] = splitargs[i].G;
        abarptrs[num_images] = splitargs[i].Abar;
        bbarptrs[num_images] = splitargs[i].Bbar;
        mptrs[num_images] = splitargs[i].modulus;
        num_images++;
        i++;
    }

    FLINT_ASSERT(num_images <= num_master_threads + 1);

    for (; i < num_master_threads; i++)
    {
        int cmp = 0;
        cmp = mpoly_monomial_cmp_nomask(gptrs[0]->exps + N*0,
                                        splitargs[i].G->exps + N*0, N);

        if (cmp < 0)
        {
            /* splitarg[i] was unlucky - ignore it */
        }
        else
        {
            if (cmp > 0)
            {
                /* splitarg[0], ..., splitarg[i - 1] were unlucky */
                num_images = 0;
            }
            gptrs[num_images] = splitargs[i].G;
            abarptrs[num_images] = splitargs[i].Abar;
            bbarptrs[num_images] = splitargs[i].Bbar;
            mptrs[num_images] = splitargs[i].modulus;
            num_images++;
        }
        FLINT_ASSERT(num_images <= num_master_threads + 1);
    }

    /* now must join ptrs[0], ..., ptrs[num_images-1] where num_images > 0 */
    FLINT_ASSERT(num_images > 0);
    fmpz_multi_crt_init(joinbase->CRT);
    success = fmpz_multi_crt_precompute_p(joinbase->CRT,
                                     (const fmpz * const *) mptrs, num_images);
    FLINT_ASSERT(success);

    joinbase->num_images = num_images;
    joinbase->gptrs = gptrs;
    joinbase->abarptrs = abarptrs;
    joinbase->bbarptrs = bbarptrs;
    joinbase->G = G;
    joinbase->Abar = Abar;
    joinbase->Bbar = Bbar;
    joinbase->ctx = ctx;
#if HAVE_PTHREAD
    pthread_mutex_init(&joinbase->mutex, NULL);
#endif

    joinargs = (_njoinworker_arg_struct *) flint_malloc(
                                  num_threads*sizeof(_njoinworker_arg_struct));

    for (i = 0; i < num_threads; i++)
    {
        joinargs[i].base = joinbase;
        joinargs[i].thread_idx = i;
    }

    joinbase->chunks_length = 3*num_threads;
    joinbase->chunks = (_joinworker_arg_struct *) flint_malloc(
                   joinbase->chunks_length*sizeof(_joinworker_arg_struct));

    FLINT_ASSERT(joinbase->chunks_length >= 3);

    for (i = 0; i < 3; i++)
    {
        fmpz_mpoly_struct * poly = (i == 0) ? gptrs[0]
                                 : (i == 1) ? abarptrs[0]
                                 :            bbarptrs[0];

        for (j = 0; j < num_threads; j++)
        {
            _joinworker_arg_struct * d = joinbase->chunks + i*num_threads + j;
            fmpz_mpoly_init3(d->poly, 0, bits, ctx);
            fmpz_init(d->maxcoeff);
            fmpz_init(d->sumcoeff);
            d->GAB = i;
            d->thread_idx = -WORD(1);
            d->final_idx = -WORD(1);

            d->hint_start = poly->length * (j + 0) / num_threads;
            d->hint_stop  = poly->length * (j + 1) / num_threads;

            d->left_exp = NULL;
            d->right_exp = NULL;

            FLINT_ASSERT(0 <= d->hint_start);
            FLINT_ASSERT(d->hint_start <= d->hint_stop);
            FLINT_ASSERT(d->hint_stop <= poly->length);
            FLINT_ASSERT(d->hint_start < poly->length);

            if (j > 0)
            {
                FLINT_ASSERT(d->hint_start < poly->length);
                d->left_exp = poly->exps + N*d->hint_start;
            }

            if (j + 1 < num_threads)
            {
                FLINT_ASSERT(d->hint_stop < poly->length);
                d->right_exp = poly->exps + N*d->hint_stop;
            }
        }
    }

    joinbase->idx = 0;
    for (i = 0; i + 1 < num_threads; i++)
    {
        thread_pool_wake(global_thread_pool,
                                 handles[i], 0, _joinworker, joinargs + i);
    }
    _joinworker(joinargs + num_threads - 1);
    for (i = 0; i + 1 < num_threads; i++)
    {
        thread_pool_wait(global_thread_pool, handles[i]);
    }

    /* final trivial join */
    for (i = 0; i < 3; i++)
    {
        fmpz_zero(maxcoeff_Gs_Abars_Bbars + i);
        fmpz_zero(sumcoeff_Gs_Abars_Bbars + i);
    }

    {
        slong idxs[3] = {0, 0, 0};
        for (i = 0; i < joinbase->chunks_length; i++)
        {
            int type = joinbase->chunks[i].GAB;
            FLINT_ASSERT(0 <= type && type < 3);

            joinbase->chunks[i].final_idx = idxs[type];
            idxs[type] += joinbase->chunks[i].poly->length;

            fmpz_add(sumcoeff_Gs_Abars_Bbars + type,
                     sumcoeff_Gs_Abars_Bbars + type,
                     joinbase->chunks[i].sumcoeff);

            if (fmpz_cmp(maxcoeff_Gs_Abars_Bbars + type,
                         joinbase->chunks[i].maxcoeff) < 0)
            {
                fmpz_set(maxcoeff_Gs_Abars_Bbars + type,
                             joinbase->chunks[i].maxcoeff);
            }
        }

        fmpz_mpoly_fit_length(G, idxs[0], ctx);
        fmpz_mpoly_fit_length(Abar, idxs[1], ctx);
        fmpz_mpoly_fit_length(Bbar, idxs[2], ctx);
        G->length = idxs[0];
        Abar->length = idxs[1];
        Bbar->length = idxs[2];
    }

    joinbase->idx = 0;
    for (i = 0; i + 1 < num_threads; i++)
    {
        thread_pool_wake(global_thread_pool,
                            handles[i], 0, _finaljoinworker, joinargs + i);
    }
    _finaljoinworker(joinargs + num_threads - 1);
    for (i = 0; i + 1 < num_threads; i++)
    {
        thread_pool_wait(global_thread_pool, handles[i]);
    }

    FLINT_ASSERT(fmpz_mpoly_is_canonical(G, ctx));
    FLINT_ASSERT(fmpz_mpoly_is_canonical(Abar, ctx));
    FLINT_ASSERT(fmpz_mpoly_is_canonical(Bbar, ctx));

#if HAVE_PTHREAD
    pthread_mutex_destroy(&joinbase->mutex);
#endif

    /* free join data */
    fmpz_multi_crt_clear(joinbase->CRT);
    for (i = 0; i < joinbase->chunks_length; i++)
    {
        fmpz_mpoly_clear(joinbase->chunks[i].poly, ctx);
        fmpz_clear(joinbase->chunks[i].maxcoeff);
        fmpz_clear(joinbase->chunks[i].sumcoeff);
    }

    flint_free(joinbase->chunks);
    flint_free(joinargs);

    /* update modulus - modulus could be one of the mptrs */
    fmpz_one(temp);
    for (i = 0; i < num_images; i++)
    {
        fmpz_mul(temp, temp, mptrs[i]);
    }
    fmpz_swap(modulus, temp);

    for (i = 1; i < 3; i++)
    {
        fmpz_mul(temp, maxcoeff_Gs_Abars_Bbars + 0,
                       sumcoeff_Gs_Abars_Bbars + i);
        fmpz_mul(temp2, sumcoeff_Gs_Abars_Bbars + 0,
                        maxcoeff_Gs_Abars_Bbars + i);
        if (fmpz_cmp(temp, temp2) > 0)
            fmpz_swap(temp, temp2);
        fmpz_mul_2exp(temp, temp, 1);
        if (fmpz_cmp(temp, modulus) >= 0)
        {
            fmpz_mul_2exp(bound, modulus, 2*FLINT_BITS);
            goto compute_split;
        }
    }

    FLINT_ASSERT(fmpz_equal(splitbase->gamma, G->coeffs + 0));

    _fmpz_vec_content(temp, G->coeffs, G->length);
    fmpz_mpoly_scalar_divexact_fmpz(G, G, temp, ctx);
    fmpz_mpoly_scalar_divexact_fmpz(Abar, Abar, G->coeffs + 0, ctx);
    fmpz_mpoly_scalar_divexact_fmpz(Bbar, Bbar, G->coeffs + 0, ctx);

successful_put_content:

    fmpz_mpoly_scalar_mul_fmpz(G, G, cG, ctx);
    fmpz_mpoly_scalar_mul_fmpz(Abar, Abar, cAbar, ctx);
    fmpz_mpoly_scalar_mul_fmpz(Bbar, Bbar, cBbar, ctx);

    success = 1;

cleanup_split:

#if HAVE_PTHREAD
    pthread_mutex_destroy(&splitbase->mutex);
#endif
    fmpz_clear(splitbase->gamma);

    for (i = 0; i < num_threads; i++)
    {
        fmpz_mpoly_clear(splitargs[i].G, ctx);
        fmpz_mpoly_clear(splitargs[i].Abar, ctx);
        fmpz_mpoly_clear(splitargs[i].Bbar, ctx);
        fmpz_clear(splitargs[i].modulus);
        flint_free(splitargs[i].worker_handles);
    }

    flint_free(gptrs);
    flint_free(abarptrs);
    flint_free(bbarptrs);
    flint_free(mptrs);
    flint_free(splitargs);

    _fmpq_vec_clear(qvec, num_threads);

    fmpz_clear(cA);
    fmpz_clear(cB);
    fmpz_clear(cG);
    fmpz_clear(cAbar);
    fmpz_clear(cBbar);

    fmpz_clear(bound);
    fmpz_clear(modulus);
    fmpz_clear(temp);
    fmpz_clear(temp2);
    for (i = 0; i < 3; i++)
    {
        fmpz_clear(maxcoeff_Gs_Abars_Bbars + i);
        fmpz_clear(sumcoeff_Gs_Abars_Bbars + i);
    }

    return success;
}


typedef struct
{
    fmpz_mpoly_struct * Pl;
    const fmpz_mpoly_ctx_struct * lctx;
    const fmpz_mpoly_struct * P;
    const fmpz_mpoly_ctx_struct * ctx;
    const slong * perm;
    const ulong * shift;
    const ulong * stride;
    const thread_pool_handle * handles;
    slong num_handles;
}
_convertl_arg_struct;

typedef _convertl_arg_struct _convertl_arg_t[1];

static void _worker_convertu(void * varg)
{
    _convertl_arg_struct * arg = (_convertl_arg_struct *) varg;

    fmpz_mpoly_to_mpoly_perm_deflate_threaded_pool(arg->Pl, arg->lctx, arg->P, arg->ctx,
                                         arg->perm, arg->shift, arg->stride,
                                               arg->handles, arg->num_handles);
}

int fmpz_mpoly_gcd_brown_threaded(
    fmpz_mpoly_t G,
    const fmpz_mpoly_t A,
    const fmpz_mpoly_t B,
    const fmpz_mpoly_ctx_t ctx)
{
    int success;
    slong * perm;
    ulong * shift, * stride;
    slong i;
    flint_bitcnt_t ABbits;
    fmpz_mpoly_ctx_t lctx;
    fmpz_mpoly_t Al, Bl, Gl, Abarl, Bbarl;
    thread_pool_handle * handles;
    slong num_handles;
    slong thread_limit = FLINT_MIN(A->length, B->length)/16;

    if (fmpz_mpoly_is_zero(A, ctx))
    {
        if (fmpz_mpoly_is_zero(B, ctx))
        {
            fmpz_mpoly_zero(G, ctx);
            return 1;
        }
        if (fmpz_sgn(B->coeffs + 0) < 0)
        {
            fmpz_mpoly_neg(G, B, ctx);
            return 1;
        }
        else
        {
            fmpz_mpoly_set(G, B, ctx);
            return 1;
        }
    }

    if (fmpz_mpoly_is_zero(B, ctx))
    {
        if (fmpz_sgn(A->coeffs + 0) < 0)
        {
            fmpz_mpoly_neg(G, A, ctx);
            return 1;
        }
        else
        {
            fmpz_mpoly_set(G, A, ctx);
            return 1;
        }
    }

    if (A->bits > FLINT_BITS || B->bits > FLINT_BITS)
    {
        return 0;
    }

    perm = (slong *) flint_malloc(ctx->minfo->nvars*sizeof(slong));
    shift = (ulong *) flint_malloc(ctx->minfo->nvars*sizeof(ulong));
    stride = (ulong *) flint_malloc(ctx->minfo->nvars*sizeof(ulong));
    for (i = 0; i < ctx->minfo->nvars; i++)
    {
        perm[i] = i;
        shift[i] = 0;
        stride[i] = 1;
    }

    if (ctx->minfo->nvars == 1)
    {
        fmpz_poly_t a, b, g;
        fmpz_poly_init(a);
        fmpz_poly_init(b);
        fmpz_poly_init(g);
        _fmpz_mpoly_to_fmpz_poly_deflate(a, A, 0, shift, stride, ctx);
        _fmpz_mpoly_to_fmpz_poly_deflate(b, B, 0, shift, stride, ctx);
        fmpz_poly_gcd(g, a, b);
        _fmpz_mpoly_from_fmpz_poly_inflate(G, A->bits, g, 0, shift, stride, ctx);
        fmpz_poly_clear(a);
        fmpz_poly_clear(b);
        fmpz_poly_clear(g);
        success = 1;
        goto cleanup1;
    }

    ABbits = FLINT_MAX(A->bits, B->bits);

    fmpz_mpoly_ctx_init(lctx, ctx->minfo->nvars, ORD_LEX);
    fmpz_mpoly_init3(Al, 0, ABbits, lctx);
    fmpz_mpoly_init3(Bl, 0, ABbits, lctx);
    fmpz_mpoly_init3(Gl, 0, ABbits, lctx);
    fmpz_mpoly_init3(Abarl, 0, ABbits, lctx);
    fmpz_mpoly_init3(Bbarl, 0, ABbits, lctx);

    num_handles = flint_request_threads(&handles, thread_limit);

    /* convert inputs */
    if (num_handles > 0)
    {
        slong m = mpoly_divide_threads(num_handles, A->length, B->length);
        _convertl_arg_t arg;

        FLINT_ASSERT(m >= 0);
        FLINT_ASSERT(m < num_handles);

        arg->Pl = Bl;
        arg->lctx = lctx;
        arg->P = B;
        arg->ctx = ctx;
        arg->perm = perm;
        arg->shift = shift;
        arg->stride = stride;
        arg->handles = handles + (m + 1);
        arg->num_handles = num_handles - (m + 1);

        thread_pool_wake(global_thread_pool, handles[m], 0, _worker_convertu, arg);

        fmpz_mpoly_to_mpoly_perm_deflate_threaded_pool(Al, lctx, A, ctx,
                                         perm, shift, stride, handles + 0, m);

        thread_pool_wait(global_thread_pool, handles[m]);
    }
    else
    {
        fmpz_mpoly_to_mpoly_perm_deflate_threaded_pool(Al, lctx, A, ctx,
                                                 perm, shift, stride, NULL, 0);
        fmpz_mpoly_to_mpoly_perm_deflate_threaded_pool(Bl, lctx, B, ctx,
                                                 perm, shift, stride, NULL, 0);
    }

    /* calculate gcd */
    success = fmpz_mpolyl_gcd_brown_threaded_pool(Gl, Abarl, Bbarl, Al, Bl,
                                             lctx, NULL, handles, num_handles);

    flint_give_back_threads(handles, num_handles);

    if (success)
    {
        fmpz_mpoly_from_mpoly_perm_inflate(G, ABbits, ctx, Gl, lctx,
                                                          perm, shift, stride);
        if (fmpz_sgn(G->coeffs + 0) < 0)
            fmpz_mpoly_neg(G, G, ctx);
    }

    fmpz_mpoly_clear(Al, lctx);
    fmpz_mpoly_clear(Bl, lctx);
    fmpz_mpoly_clear(Gl, lctx);
    fmpz_mpoly_clear(Abarl, lctx);
    fmpz_mpoly_clear(Bbarl, lctx);
    fmpz_mpoly_ctx_clear(lctx);

cleanup1:

    flint_free(perm);
    flint_free(shift);
    flint_free(stride);

    return success;
}