/* mpn_mu_bdiv_q(qp,np,nn,dp,dn,tp) -- Compute -{np,nn} / {dp,dn} mod B^nn,
   storing the result in {qp,nn}.  Overlap allowed between Q and N; all other
   overlap disallowed.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 2005-2007, 2009, 2010, 2017 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


/*
   The idea of the algorithm used herein is to compute a smaller inverted value
   than used in the standard Barrett algorithm, and thus save time in the
   Newton iterations, and pay just a small price when using the inverted value
   for developing quotient bits.  This algorithm was presented at ICMS 2006.
*/
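
/* Informal sketch of the block-wise recurrence the code below implements
   (the names R, I, q here are illustrative, not taken from the code): with
   I = D^{-1} mod B^in and R the current partial remainder, each pass does

     q <- R * I mod B^in		(mpn_mullo_n)
     R <- (R - q*D) / B^in		(mulhigh plus the subtractions below)

   The division by B^in is exact since q*D = R mod B^in, so each pass
   delivers `in' further quotient limbs.  */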

#include "gmp-impl.h"


/* N = {np,nn}
   D = {dp,dn}

   Requirements: N >= D
		 D >= 1
		 D odd
		 dn >= 2
		 nn >= 2
		 scratch space as determined by mpn_mu_bdiv_q_itch(nn,dn).

   Write quotient to Q = {qp,nn}.

   FIXME: When iterating, perhaps do the small step before the loop, not after.
   FIXME: Try to avoid the scalar divisions when computing inverse size.
   FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible.  In
	  particular, when dn == in, tp and rp could use the same space.
   FIXME: Trim final quotient calculation to qn limbs of precision.
*/
static void
mpn_mu_bdiv_q_old (mp_ptr qp,
	       mp_srcptr np, mp_size_t nn,
	       mp_srcptr dp, mp_size_t dn,
	       mp_ptr scratch)
{
  mp_size_t qn;
  mp_size_t in;
  int cy, c0;
  mp_size_t tn, wn;

  qn = nn;

  ASSERT (dn >= 2);
  ASSERT (qn >= 2);

  if (qn > dn)
    {
      mp_size_t b;

      /* |_______________________|   dividend
			|________|   divisor  */

#define ip           scratch			/* in */
#define rp           (scratch + in)		/* dn or rest >= binvert_itch(in) */
#define tp           (scratch + in + dn)	/* dn+in or next_size(dn) */
#define scratch_out  (scratch + in + dn + tn)	/* mulmod_bnm1_itch(next_size(dn)) */

      /* Compute an inverse size that is a nice partition of the quotient.  */
      b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
      in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
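      /* For example (illustrative numbers): qn = 10, dn = 4 gives b = 3 and
	 in = 4; qn = 7, dn = 3 gives b = 3 and in = 3.  By construction
	 in <= dn, and the blocks cover qn limbs as evenly as possible.  */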

      /* Some notes on allocation:

	 When in = dn, R dies when mpn_mullo returns; if in < dn, the low in
	 limbs of R die at that point.  We could save memory by letting T live
	 just under R, and letting the upper part of T expand into R.  These
	 changes should reduce itch to perhaps 3dn.
       */

      mpn_binvert (ip, dp, in, rp);

      cy = 0;

      MPN_COPY (rp, np, dn);
      np += dn;
      mpn_mullo_n (qp, rp, ip, in);
      qn -= in;

      while (qn > in)
	{
	  if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	    mpn_mul (tp, dp, dn, qp, in);	/* mulhi, need tp[dn+in-1...in] */
	  else
	    {
	      tn = mpn_mulmod_bnm1_next_size (dn);
	      mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	      wn = dn + in - tn;		/* number of wrapped limbs */
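	      /* tp now holds D*q mod (B^tn-1): the top wn limbs of the true
		 (dn+in)-limb product have wrapped around into the low limbs.
		 The low limbs of the true product equal R (q was chosen so
		 that q*D = R mod B^in, and wn <= in since tn >= dn), so the
		 wrapped part can be recovered by subtraction.  */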
	      if (wn > 0)
		{
		  c0 = mpn_sub_n (tp + tn, tp, rp, wn);
		  mpn_decr_u (tp + wn, c0);
		}
	    }

	  qp += in;
	  if (dn != in)
	    {
	      /* Subtract tp[dn-1...in] from partial remainder.  */
	      cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
	      if (cy == 2)
		{
		  mpn_incr_u (tp + dn, 1);
		  cy = 1;
		}
	    }
	  /* Subtract tp[dn+in-1...dn] from dividend.  */
	  cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
	  np += in;
	  mpn_mullo_n (qp, rp, ip, in);
	  qn -= in;
	}

      /* Generate last qn limbs.
	 FIXME: It should be possible to limit precision here, since qn is
	 typically somewhat smaller than dn.  No big gains expected.  */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, in);		/* mulhi, need tp[qn+in-1...in] */
      else
	{
	  tn = mpn_mulmod_bnm1_next_size (dn);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	  wn = dn + in - tn;			/* number of wrapped limbs */
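	  /* Same wrap-around correction as in the main loop above.  */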
	  if (wn > 0)
	    {
	      c0 = mpn_sub_n (tp + tn, tp, rp, wn);
	      mpn_decr_u (tp + wn, c0);
	    }
	}

      qp += in;
      if (dn != in)
	{
	  cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
	  if (cy == 2)
	    {
	      mpn_incr_u (tp + dn, 1);
	      cy = 1;
	    }
	}

      mpn_sub_nc (rp + dn - in, np, tp + dn, qn - (dn - in), cy);
      mpn_mullo_n (qp, rp, ip, qn);

#undef ip
#undef rp
#undef tp
#undef scratch_out
    }
  else
    {
      /* |_______________________|   dividend
		|________________|   divisor  */

#define ip           scratch		/* in */
#define tp           (scratch + in)	/* qn+in or next_size(qn) or rest >= binvert_itch(in) */
#define scratch_out  (scratch + in + tn)/* mulmod_bnm1_itch(next_size(qn)) */

      /* Compute half-sized inverse.  */
      in = qn - (qn >> 1);
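      /* in = ceil(qn/2); e.g. qn = 9 (an illustrative value) gives in = 5.  */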

      mpn_binvert (ip, dp, in, tp);

      mpn_mullo_n (qp, np, ip, in);		/* low `in' quotient limbs */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, qn, qp, in);		/* mulhigh */
      else
	{
	  tn = mpn_mulmod_bnm1_next_size (qn);
	  mpn_mulmod_bnm1 (tp, tn, dp, qn, qp, in, scratch_out);
	  wn = qn + in - tn;			/* number of wrapped limbs */
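	  /* Unwrap as above; here the low limbs of the true product equal
	     the dividend itself, so comparing against np suffices to detect
	     the wrap-around carry.  */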
	  if (wn > 0)
	    {
	      c0 = mpn_cmp (tp, np, wn) < 0;
	      mpn_decr_u (tp + wn, c0);
	    }
	}

      mpn_sub_n (tp, np + in, tp + in, qn - in);
      mpn_mullo_n (qp + in, tp, ip, qn - in);	/* high qn-in quotient limbs */

#undef ip
#undef tp
#undef scratch_out
    }
}

void
mpn_mu_bdiv_q (mp_ptr qp,
	       mp_srcptr np, mp_size_t nn,
	       mp_srcptr dp, mp_size_t dn,
	       mp_ptr scratch)
{
  mpn_mu_bdiv_q_old (qp, np, nn, dp, dn, scratch);
  mpn_neg (qp, qp, nn);
}
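
/* Minimal illustrative call sequence, assuming GMP's TMP allocation macros
   from gmp-impl.h; a sketch only, since these functions are internal and
   normally reached through documented interfaces such as mpn_bdiv_q.
   Preconditions: N >= D, D odd, dn >= 2, nn >= 2.

     TMP_DECL;
     TMP_MARK;
     mp_ptr scratch = TMP_ALLOC_LIMBS (mpn_mu_bdiv_q_itch (nn, dn));
     mpn_mu_bdiv_q (qp, np, nn, dp, dn, scratch);   (qp <- -N/D mod B^nn)
     TMP_FREE;
*/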

mp_size_t
mpn_mu_bdiv_q_itch (mp_size_t nn, mp_size_t dn)
{
  mp_size_t qn, in, tn, itch_binvert, itch_out, itches;
  mp_size_t b;

  ASSERT_ALWAYS (DC_BDIV_Q_THRESHOLD < MU_BDIV_Q_THRESHOLD);

  qn = nn;

  if (qn > dn)
    {
      b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
      in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
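      /* The size computations here must mirror the partition chosen in
	 mpn_mu_bdiv_q_old above, so that the estimate matches actual use.  */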
      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	{
	  tn = dn + in;
	  itch_out = 0;
	}
      else
	{
	  tn = mpn_mulmod_bnm1_next_size (dn);
	  itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
	}
      itches = dn + tn + itch_out;
    }
  else
    {
      in = qn - (qn >> 1);
      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	{
	  tn = qn + in;
	  itch_out = 0;
	}
      else
	{
	  tn = mpn_mulmod_bnm1_next_size (qn);
	  itch_out = mpn_mulmod_bnm1_itch (tn, qn, in);
	}
      itches = tn + itch_out;
    }

  itch_binvert = mpn_binvert_itch (in);
  return in + MAX (itches, itch_binvert);
}