14a1767b4Smrg /* mpn_mu_bdiv_q(qp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^nn.
24a1767b4Smrg storing the result in {qp,nn}. Overlap allowed between Q and N; all other
34a1767b4Smrg overlap disallowed.
44a1767b4Smrg
54a1767b4Smrg Contributed to the GNU project by Torbjorn Granlund.
64a1767b4Smrg
74a1767b4Smrg THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
84a1767b4Smrg SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
94a1767b4Smrg GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
104a1767b4Smrg
11*671ea119Smrg Copyright 2005-2007, 2009, 2010, 2017 Free Software Foundation, Inc.
124a1767b4Smrg
134a1767b4Smrg This file is part of the GNU MP Library.
144a1767b4Smrg
154a1767b4Smrg The GNU MP Library is free software; you can redistribute it and/or modify
16f81b1c5bSmrg it under the terms of either:
17f81b1c5bSmrg
18f81b1c5bSmrg * the GNU Lesser General Public License as published by the Free
19f81b1c5bSmrg Software Foundation; either version 3 of the License, or (at your
204a1767b4Smrg option) any later version.
214a1767b4Smrg
22f81b1c5bSmrg or
23f81b1c5bSmrg
24f81b1c5bSmrg * the GNU General Public License as published by the Free Software
25f81b1c5bSmrg Foundation; either version 2 of the License, or (at your option) any
26f81b1c5bSmrg later version.
27f81b1c5bSmrg
28f81b1c5bSmrg or both in parallel, as here.
29f81b1c5bSmrg
304a1767b4Smrg The GNU MP Library is distributed in the hope that it will be useful, but
314a1767b4Smrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
32f81b1c5bSmrg or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
33f81b1c5bSmrg for more details.
344a1767b4Smrg
35f81b1c5bSmrg You should have received copies of the GNU General Public License and the
36f81b1c5bSmrg GNU Lesser General Public License along with the GNU MP Library. If not,
37f81b1c5bSmrg see https://www.gnu.org/licenses/. */
384a1767b4Smrg
394a1767b4Smrg
404a1767b4Smrg /*
414a1767b4Smrg The idea of the algorithm used herein is to compute a smaller inverted value
424a1767b4Smrg than used in the standard Barrett algorithm, and thus save time in the
434a1767b4Smrg Newton iterations, and pay just a small price when using the inverted value
444a1767b4Smrg for developing quotient bits. This algorithm was presented at ICMS 2006.
454a1767b4Smrg */
464a1767b4Smrg
474a1767b4Smrg #include "gmp-impl.h"
484a1767b4Smrg
494a1767b4Smrg
504a1767b4Smrg /* N = {np,nn}
514a1767b4Smrg D = {dp,dn}
524a1767b4Smrg
534a1767b4Smrg Requirements: N >= D
544a1767b4Smrg D >= 1
554a1767b4Smrg D odd
564a1767b4Smrg dn >= 2
574a1767b4Smrg nn >= 2
584a1767b4Smrg scratch space as determined by mpn_mu_bdiv_q_itch(nn,dn).
594a1767b4Smrg
604a1767b4Smrg Write quotient to Q = {qp,nn}.
614a1767b4Smrg
624a1767b4Smrg FIXME: When iterating, perhaps do the small step before loop, not after.
634a1767b4Smrg FIXME: Try to avoid the scalar divisions when computing inverse size.
644a1767b4Smrg FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible. In
654a1767b4Smrg particular, when dn==in, tp and rp could use the same space.
664a1767b4Smrg FIXME: Trim final quotient calculation to qn limbs of precision.
674a1767b4Smrg */
/* Core of the Hensel (2-adic) division: compute an nn-limb quotient of
   {np,nn} by the odd divisor {dp,dn} mod B^nn, writing it to {qp,nn}.
   NOTE(review): the public wrapper mpn_mu_bdiv_q negates this routine's
   output, so this appears to produce the quotient with the opposite sign
   convention -- confirm against the wrapper and callers.

   Two strategies, chosen by the relative sizes:
   - qn > dn:  block-wise scheme.  An `in'-limb inverse of D mod B^in is
     used to peel off `in' quotient limbs per iteration, updating a dn-limb
     partial remainder after each step.
   - qn <= dn: a single half-sized inverse (in = ceil(qn/2)) yields the low
     `in' quotient limbs directly, then one wrap-corrected product and a
     second mullo produce the high qn-in limbs.

   Scratch layout is documented next to the #defines below; its total size
   is computed by mpn_mu_bdiv_q_itch.  */
static void
mpn_mu_bdiv_q_old (mp_ptr qp,
		   mp_srcptr np, mp_size_t nn,
		   mp_srcptr dp, mp_size_t dn,
		   mp_ptr scratch)
{
  mp_size_t qn;
  mp_size_t in;
  int cy, c0;
  mp_size_t tn, wn;

  qn = nn;			/* quotient has as many limbs as dividend */

  ASSERT (dn >= 2);
  ASSERT (qn >= 2);

  if (qn > dn)
    {
      mp_size_t b;

      /* |_______________________|   dividend
			|________|   divisor  */

#define ip           scratch		/* in */
#define rp           (scratch + in)	/* dn or rest >= binvert_itch(in) */
#define tp           (scratch + in + dn)	/* dn+in or next_size(dn) */
#define scratch_out  (scratch + in + dn + tn)	/* mulmod_bnm1_itch(next_size(dn)) */

      /* Compute an inverse size that is a nice partition of the quotient.  */
      b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
      in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */

      /* Some notes on allocation:

	 When in = dn, R dies when mpn_mullo returns, if in < dn the low in
	 limbs of R dies at that point.  We could save memory by letting T live
	 just under R, and let the upper part of T expand into R.  These changes
	 should reduce itch to perhaps 3dn.
       */

      /* Inverse of D mod B^in; rp doubles as binvert scratch here.  */
      mpn_binvert (ip, dp, in, rp);

      cy = 0;

      /* Seed the partial remainder with the low dn dividend limbs and
	 produce the first `in' quotient limbs.  */
      MPN_COPY (rp, np, dn);
      np += dn;
      mpn_mullo_n (qp, rp, ip, in);
      qn -= in;

      while (qn > in)
	{
	  /* T = D * (latest quotient block); only the high part
	     tp[dn+in-1...in] is actually needed.  */
	  if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	    mpn_mul (tp, dp, dn, qp, in);	/* mulhi, need tp[dn+in-1...in] */
	  else
	    {
	      /* Product computed mod B^tn - 1; undo the wrap-around on the
		 wn low limbs so the needed high part of T is exact.  */
	      tn = mpn_mulmod_bnm1_next_size (dn);
	      mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	      wn = dn + in - tn;		/* number of wrapped limbs */
	      if (wn > 0)
		{
		  c0 = mpn_sub_n (tp + tn, tp, rp, wn);
		  mpn_decr_u (tp + wn, c0);
		}
	    }

	  qp += in;
	  if (dn != in)
	    {
	      /* Subtract tp[dn-1...in] from partial remainder.  */
	      cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
	      if (cy == 2)		/* cannot propagate two borrows; fold
					   one into T's high part instead */
		{
		  mpn_incr_u (tp + dn, 1);
		  cy = 1;
		}
	    }
	  /* Subtract tp[dn+in-1...dn] from dividend.  */
	  cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
	  np += in;
	  /* Next `in' quotient limbs from the updated partial remainder.  */
	  mpn_mullo_n (qp, rp, ip, in);
	  qn -= in;
	}

      /* Generate last qn limbs.
	 FIXME: It should be possible to limit precision here, since qn is
	 typically somewhat smaller than dn.  No big gains expected.  */

      /* Same step as the loop body, but the final remainder update and
	 mullo are trimmed to the qn limbs that remain.  */
      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, in);	/* mulhi, need tp[qn+in-1...in] */
      else
	{
	  tn = mpn_mulmod_bnm1_next_size (dn);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	  wn = dn + in - tn;		/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      c0 = mpn_sub_n (tp + tn, tp, rp, wn);
	      mpn_decr_u (tp + wn, c0);
	    }
	}

      qp += in;
      if (dn != in)
	{
	  cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
	  if (cy == 2)
	    {
	      mpn_incr_u (tp + dn, 1);
	      cy = 1;
	    }
	}

      mpn_sub_nc (rp + dn - in, np, tp + dn, qn - (dn - in), cy);
      mpn_mullo_n (qp, rp, ip, qn);

#undef ip
#undef rp
#undef tp
#undef scratch_out
    }
  else
    {
      /* |_______________________|   dividend
		 |________________|   divisor  */

#define ip           scratch		/* in */
#define tp           (scratch + in)	/* qn+in or next_size(qn) or rest >= binvert_itch(in) */
#define scratch_out  (scratch + in + tn)	/* mulmod_bnm1_itch(next_size(qn)) */

      /* Compute half-sized inverse.  */
      in = qn - (qn >> 1);

      mpn_binvert (ip, dp, in, tp);	/* tp doubles as binvert scratch */

      mpn_mullo_n (qp, np, ip, in);	/* low `in' quotient limbs */

      /* T = D * (low quotient limbs); high part feeds the correction that
	 yields the remaining quotient limbs.  */
      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, qn, qp, in);	/* mulhigh */
      else
	{
	  tn = mpn_mulmod_bnm1_next_size (qn);
	  mpn_mulmod_bnm1 (tp, tn, dp, qn, qp, in, scratch_out);
	  wn = qn + in - tn;		/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      /* Wrap-around correction: a borrow occurred iff the wrapped
		 low part is smaller than the corresponding N limbs.  */
	      c0 = mpn_cmp (tp, np, wn) < 0;
	      mpn_decr_u (tp + wn, c0);
	    }
	}

      mpn_sub_n (tp, np + in, tp + in, qn - in);
      mpn_mullo_n (qp + in, tp, ip, qn - in);	/* high qn-in quotient limbs */

#undef ip
#undef tp
#undef scratch_out
    }
}
2264a1767b4Smrg
/* Public entry point: compute Q = {np,nn} / {dp,dn} mod B^nn, storing the
   result in {qp,nn} (see the contract at the top of this file).  The work
   is done by mpn_mu_bdiv_q_old, whose result is then negated mod B^nn --
   NOTE(review): this implies the old routine produces the quotient with the
   opposite sign convention; confirm against its callers/history.  Scratch
   must hold mpn_mu_bdiv_q_itch (nn, dn) limbs.  */
void
mpn_mu_bdiv_q (mp_ptr qp,
	       mp_srcptr np, mp_size_t nn,
	       mp_srcptr dp, mp_size_t dn,
	       mp_ptr scratch)
{
  mpn_mu_bdiv_q_old (qp, np, nn, dp, dn, scratch);
  mpn_neg (qp, qp, nn);
}
236*671ea119Smrg
2374a1767b4Smrg mp_size_t
mpn_mu_bdiv_q_itch(mp_size_t nn,mp_size_t dn)2384a1767b4Smrg mpn_mu_bdiv_q_itch (mp_size_t nn, mp_size_t dn)
2394a1767b4Smrg {
2404a1767b4Smrg mp_size_t qn, in, tn, itch_binvert, itch_out, itches;
2414a1767b4Smrg mp_size_t b;
2424a1767b4Smrg
243f81b1c5bSmrg ASSERT_ALWAYS (DC_BDIV_Q_THRESHOLD < MU_BDIV_Q_THRESHOLD);
244f81b1c5bSmrg
2454a1767b4Smrg qn = nn;
2464a1767b4Smrg
2474a1767b4Smrg if (qn > dn)
2484a1767b4Smrg {
2494a1767b4Smrg b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */
2504a1767b4Smrg in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
2514a1767b4Smrg if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
2524a1767b4Smrg {
2534a1767b4Smrg tn = dn + in;
2544a1767b4Smrg itch_out = 0;
2554a1767b4Smrg }
2564a1767b4Smrg else
2574a1767b4Smrg {
2584a1767b4Smrg tn = mpn_mulmod_bnm1_next_size (dn);
2594a1767b4Smrg itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
2604a1767b4Smrg }
2614a1767b4Smrg itches = dn + tn + itch_out;
2624a1767b4Smrg }
2634a1767b4Smrg else
2644a1767b4Smrg {
2654a1767b4Smrg in = qn - (qn >> 1);
2664a1767b4Smrg if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
2674a1767b4Smrg {
2684a1767b4Smrg tn = qn + in;
2694a1767b4Smrg itch_out = 0;
2704a1767b4Smrg }
2714a1767b4Smrg else
2724a1767b4Smrg {
2734a1767b4Smrg tn = mpn_mulmod_bnm1_next_size (qn);
2744a1767b4Smrg itch_out = mpn_mulmod_bnm1_itch (tn, qn, in);
2754a1767b4Smrg }
2764a1767b4Smrg itches = tn + itch_out;
277f81b1c5bSmrg }
278f81b1c5bSmrg
279f81b1c5bSmrg itch_binvert = mpn_binvert_itch (in);
2804a1767b4Smrg return in + MAX (itches, itch_binvert);
2814a1767b4Smrg }
282