/* mpn_div_q -- division for arbitrary size operands.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 2009, 2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"


/* Compute Q = N/D with truncation.
     N = {np,nn}
     D = {dp,dn}
     Q = {qp,nn-dn+1}
     T = {scratch,nn+1} is scratch space
   N and D are both untouched by the computation.
   N and T may overlap; pass the same space if N is irrelevant after the call,
   but note that tp needs an extra limb.

   Operand requirements:
     N >= D > 0
     dp[dn-1] != 0
     No overlap between the N, D, and Q areas.

   This division function does not clobber its input operands, since it is
   intended to support average-O(qn) division, and for that to be effective, it
   cannot put requirements on callers to copy a O(nn) operand.

   If a caller does not care about the value of {np,nn+1} after calling this
   function, it should pass np also for the scratch argument.  This function
   will then save some time and space by avoiding allocation and copying.
   (FIXME: Is this a good design?  We only really save any copying for
   already-normalised divisors, which should be rare.  It also prevents us from
   reasonably asking for all scratch space we need.)

   We write nn-dn+1 limbs for the quotient, but return void.  Why not return
   the most significant quotient limb?  Look at the 4 main code blocks below
   (consisting of an outer if-else where each arm contains an if-else). It is
   tricky for the first code block, since the mpn_*_div_q calls will typically
   generate all nn-dn+1 and return 0 or 1.  I don't see how to fix that unless
   we generate the most significant quotient limb here, before calling
   mpn_*_div_q, or put the quotient in a temporary area.  Since this is a
   critical division case (the SB sub-case in particular) copying is not a good
   idea.

   It might make sense to split the if-else parts of the (qn + FUDGE
   >= dn) blocks into separate functions, since we could promise quite
   different things to callers in these two cases.  The 'then' case
   benefits from np=scratch, and it could perhaps even tolerate qp=np,
   saving some headache for many callers.

   FIXME: Scratch allocation leaves a lot to be desired.  E.g., for the MU size
   operands, we do not reuse the huge scratch for adjustments.  This can be a
   serious waste of memory for the largest operands.
*/

/* FUDGE determines when to try getting an approximate quotient from the upper
   parts of the dividend and divisor, then adjust.  N.B. FUDGE must be >= 2
   for the code to be correct.  */
#define FUDGE 5			/* FIXME: tune this */

#define DC_DIV_Q_THRESHOLD      DC_DIVAPPR_Q_THRESHOLD
#define MU_DIV_Q_THRESHOLD      MU_DIVAPPR_Q_THRESHOLD
#define MUPI_DIV_Q_THRESHOLD  MUPI_DIVAPPR_Q_THRESHOLD
#ifndef MUPI_DIVAPPR_Q_THRESHOLD
#define MUPI_DIVAPPR_Q_THRESHOLD  MUPI_DIV_QR_THRESHOLD
#endif

void
mpn_div_q (mp_ptr qp,
	   mp_srcptr np, mp_size_t nn,
	   mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
{
  mp_ptr new_dp, new_np, tp, rp;
  mp_limb_t cy, dh, qh;
  mp_size_t new_nn, qn;
  gmp_pi1_t dinv;
  int cnt;
  TMP_DECL;
  TMP_MARK;

  ASSERT (nn >= dn);
  ASSERT (dn > 0);
  ASSERT (dp[dn - 1] != 0);
  ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1, np, nn));
  ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1, dp, dn));
  ASSERT (MPN_SAME_OR_SEPARATE_P (np, scratch, nn));

  ASSERT_ALWAYS (FUDGE >= 2);

  if (dn == 1)
    {
      mpn_divrem_1 (qp, 0L, np, nn, dp[dn - 1]);
      return;
    }

  qn = nn - dn + 1;		/* Quotient size, high limb might be zero */

  if (qn + FUDGE >= dn)
    {
      /* |________________________|
                          |_______|  */
      new_np = scratch;

      dh = dp[dn - 1];
      if (LIKELY ((dh & GMP_NUMB_HIGHBIT) == 0))
	{
	  count_leading_zeros (cnt, dh);

	  cy = mpn_lshift (new_np, np, nn, cnt);
	  new_np[nn] = cy;
	  new_nn = nn + (cy != 0);

	  new_dp = TMP_ALLOC_LIMBS (dn);
	  mpn_lshift (new_dp, dp, dn, cnt);

	  if (dn == 2)
	    {
	      qh = mpn_divrem_2 (qp, 0L, new_np, new_nn, new_dp);
	    }
	  else if (BELOW_THRESHOLD (dn, DC_DIV_Q_THRESHOLD) ||
		   BELOW_THRESHOLD (new_nn - dn, DC_DIV_Q_THRESHOLD))
	    {
	      invert_pi1 (dinv, new_dp[dn - 1], new_dp[dn - 2]);
	      qh = mpn_sbpi1_div_q (qp, new_np, new_nn, new_dp, dn, dinv.inv32);
	    }
	  else if (BELOW_THRESHOLD (dn, MUPI_DIV_Q_THRESHOLD) ||   /* fast condition */
		   BELOW_THRESHOLD (nn, 2 * MU_DIV_Q_THRESHOLD) || /* fast condition */
		   (double) (2 * (MU_DIV_Q_THRESHOLD - MUPI_DIV_Q_THRESHOLD)) * dn /* slow... */
		   + (double) MUPI_DIV_Q_THRESHOLD * nn > (double) dn * nn)   /* ...condition */
	    {
	      invert_pi1 (dinv, new_dp[dn - 1], new_dp[dn - 2]);
	      qh = mpn_dcpi1_div_q (qp, new_np, new_nn, new_dp, dn, &dinv);
	    }
	  else
	    {
	      mp_size_t itch = mpn_mu_div_q_itch (new_nn, dn, 0);
	      mp_ptr scratch = TMP_ALLOC_LIMBS (itch);
	      qh = mpn_mu_div_q (qp, new_np, new_nn, new_dp, dn, scratch);
	    }
	  if (cy == 0)
	    qp[qn - 1] = qh;
	  else if (UNLIKELY (qh != 0))
	    {
	      /* This happens only when the quotient is close to B^n and
		 mpn_*_divappr_q returned B^n.  */
	      mp_size_t i, n;
	      n = new_nn - dn;
	      for (i = 0; i < n; i++)
		qp[i] = GMP_NUMB_MAX;
	      qh = 0;		/* currently ignored */
	    }
	}
      else  /* divisor is already normalised */
	{
	  if (new_np != np)
	    MPN_COPY (new_np, np, nn);

	  if (dn == 2)
	    {
	      qh = mpn_divrem_2 (qp, 0L, new_np, nn, dp);
	    }
	  else if (BELOW_THRESHOLD (dn, DC_DIV_Q_THRESHOLD) ||
		   BELOW_THRESHOLD (nn - dn, DC_DIV_Q_THRESHOLD))
	    {
	      invert_pi1 (dinv, dh, dp[dn - 2]);
	      qh = mpn_sbpi1_div_q (qp, new_np, nn, dp, dn, dinv.inv32);
	    }
	  else if (BELOW_THRESHOLD (dn, MUPI_DIV_Q_THRESHOLD) ||   /* fast condition */
		   BELOW_THRESHOLD (nn, 2 * MU_DIV_Q_THRESHOLD) || /* fast condition */
		   (double) (2 * (MU_DIV_Q_THRESHOLD - MUPI_DIV_Q_THRESHOLD)) * dn /* slow... */
		   + (double) MUPI_DIV_Q_THRESHOLD * nn > (double) dn * nn)   /* ...condition */
	    {
	      invert_pi1 (dinv, dh, dp[dn - 2]);
	      qh = mpn_dcpi1_div_q (qp, new_np, nn, dp, dn, &dinv);
	    }
	  else
	    {
	      mp_size_t itch = mpn_mu_div_q_itch (nn, dn, 0);
	      mp_ptr scratch = TMP_ALLOC_LIMBS (itch);
	      qh = mpn_mu_div_q (qp, np, nn, dp, dn, scratch);
	    }
	  qp[nn - dn] = qh;
	}
    }
  else
    {
      /* |________________________|
                |_________________|  */
      tp = TMP_ALLOC_LIMBS (qn + 1);

      new_np = scratch;
      new_nn = 2 * qn + 1;
      if (new_np == np)
	/* We need {np,nn} to remain untouched until the final adjustment, so
	   we need to allocate separate space for new_np.  */
	new_np = TMP_ALLOC_LIMBS (new_nn + 1);


      dh = dp[dn - 1];
      if (LIKELY ((dh & GMP_NUMB_HIGHBIT) == 0))
	{
	  count_leading_zeros (cnt, dh);

	  cy = mpn_lshift (new_np, np + nn - new_nn, new_nn, cnt);
	  new_np[new_nn] = cy;

	  new_nn += (cy != 0);

	  new_dp = TMP_ALLOC_LIMBS (qn + 1);
	  mpn_lshift (new_dp, dp + dn - (qn + 1), qn + 1, cnt);
	  new_dp[0] |= dp[dn - (qn + 1) - 1] >> (GMP_NUMB_BITS - cnt);

	  if (qn + 1 == 2)
	    {
	      qh = mpn_divrem_2 (tp, 0L, new_np, new_nn, new_dp);
	    }
	  else if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD - 1))
	    {
	      invert_pi1 (dinv, new_dp[qn], new_dp[qn - 1]);
	      qh = mpn_sbpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, dinv.inv32);
	    }
	  else if (BELOW_THRESHOLD (qn, MU_DIVAPPR_Q_THRESHOLD - 1))
	    {
	      invert_pi1 (dinv, new_dp[qn], new_dp[qn - 1]);
	      qh = mpn_dcpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, &dinv);
	    }
	  else
	    {
	      mp_size_t itch = mpn_mu_divappr_q_itch (new_nn, qn + 1, 0);
	      mp_ptr scratch = TMP_ALLOC_LIMBS (itch);
	      qh = mpn_mu_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, scratch);
	    }
	  if (cy == 0)
	    tp[qn] = qh;
	  else if (UNLIKELY (qh != 0))
	    {
	      /* This happens only when the quotient is close to B^n and
		 mpn_*_divappr_q returned B^n.  */
	      mp_size_t i, n;
	      n = new_nn - (qn + 1);
	      for (i = 0; i < n; i++)
		tp[i] = GMP_NUMB_MAX;
	      qh = 0;		/* currently ignored */
	    }
	}
      else  /* divisor is already normalised */
	{
	  MPN_COPY (new_np, np + nn - new_nn, new_nn); /* pointless of MU will be used */

	  new_dp = (mp_ptr) dp + dn - (qn + 1);

	  if (qn == 2 - 1)
	    {
	      qh = mpn_divrem_2 (tp, 0L, new_np, new_nn, new_dp);
	    }
	  else if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD - 1))
	    {
	      invert_pi1 (dinv, dh, new_dp[qn - 1]);
	      qh = mpn_sbpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, dinv.inv32);
	    }
	  else if (BELOW_THRESHOLD (qn, MU_DIVAPPR_Q_THRESHOLD - 1))
	    {
	      invert_pi1 (dinv, dh, new_dp[qn - 1]);
	      qh = mpn_dcpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, &dinv);
	    }
	  else
	    {
	      mp_size_t itch = mpn_mu_divappr_q_itch (new_nn, qn + 1, 0);
	      mp_ptr scratch = TMP_ALLOC_LIMBS (itch);
	      qh = mpn_mu_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, scratch);
	    }
	  tp[qn] = qh;
	}

      MPN_COPY (qp, tp + 1, qn);
      if (tp[0] <= 4)
        {
	  mp_size_t rn;

          rp = TMP_ALLOC_LIMBS (dn + qn);
          mpn_mul (rp, dp, dn, tp + 1, qn);
	  rn = dn + qn;
	  rn -= rp[rn - 1] == 0;

          if (rn > nn || mpn_cmp (np, rp, nn) < 0)
            mpn_decr_u (qp, 1);
        }
    }

  TMP_FREE;
}