/* mpn_mu_div_q.

   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 2005, 2006, 2007, 2009, 2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


/*
   The idea of the algorithm used herein is to compute a smaller inverted value
   than used in the standard Barrett algorithm, and thus save time in the
   Newton iterations, and pay just a small price when using the inverted value
   for developing quotient bits.  This algorithm was presented at ICMS 2006.
*/

/*
  Things to work on:

  1. This is a rudimentary implementation of mpn_mu_div_q.  The algorithm is
     probably close to optimal, except when mpn_mu_divappr_q fails.

     An alternative which could be considered for much simpler code for the
     complex qn>=dn arm would be to allocate a temporary nn+1 limb buffer, then
     simply call mpn_mu_divappr_q.  Such a temporary allocation is
     unfortunately very large.

  2. We used to fall back to mpn_mu_div_qr when we detect a possible
     mpn_mu_divappr_q rounding problem, now we multiply and compare.
     Unfortunately, since mpn_mu_divappr_q does not return the partial
     remainder, this also doesn't become optimal.  A mpn_mu_divappr_qr could
     solve that.

  3. The allocations done here should be made from the scratch area, which
     then would need to be amended.
*/

#include <stdlib.h>		/* for NULL */
#include "gmp.h"
#include "gmp-impl.h"


/* Compute the truncated quotient {qp, nn-dn} = floor ({np, nn} / {dp, dn}),
   returning the high quotient limb (0 or 1).

   qp       result area, nn-dn limbs (written)
   np, nn   dividend and its size in limbs
   dp, dn   divisor and its size in limbs; the code requires nn > dn, and,
            being an internal mu_* routine, presumably a normalized divisor
            (high limb's most significant bit set) -- not checkable from this
            file alone
   scratch  caller-supplied work area of mpn_mu_div_q_itch (nn, dn, k) limbs

   Strategy: develop an approximate quotient with a truncated (smaller than
   full Barrett) inverse, then, when the low quotient limb is within the
   approximation's error bound, verify by multiplying back and comparing
   against the dividend, adjusting the quotient down by one if needed.  */
mp_limb_t
mpn_mu_div_q (mp_ptr qp,
	      mp_srcptr np, mp_size_t nn,
	      mp_srcptr dp, mp_size_t dn,
	      mp_ptr scratch)
{
  mp_ptr tp, rp, ip, this_ip;
  mp_size_t qn, in, this_in;
  mp_limb_t cy, qh;
  TMP_DECL;

  TMP_MARK;

  qn = nn - dn;			/* quotient size in limbs */

  /* tp holds the qn+1 limb approximate quotient; its low limb tp[0] is the
     extra "guard" limb used to decide whether the quotient can be trusted.  */
  tp = TMP_BALLOC_LIMBS (qn + 1);

  if (qn >= dn)			/* nn >= 2*dn + 1 */
    {
      /* Find max inverse size needed by the two preinv calls.  FIXME: This is
	 not optimal, it underestimates the invariance.  */
      if (dn != qn)
	{
	  mp_size_t in1, in2;

	  in1 = mpn_mu_div_qr_choose_in (qn - dn, dn, 0);
	  in2 = mpn_mu_divappr_q_choose_in (dn + 1, dn, 0);
	  in = MAX (in1, in2);
	}
      else
	{
	  /* qn == dn: only the divappr step below is performed, since
	     mpn_preinv_mu_div_qr would get a zero-size quotient.  */
	  in = mpn_mu_divappr_q_choose_in (dn + 1, dn, 0);
	}

      /* Compute a (in+1)-limb approximate inverse of the divisor's high
	 limbs, then drop its low limb, keeping in limbs at ip.  */
      ip = TMP_BALLOC_LIMBS (in + 1);

      if (dn == in)
	{
	  /* Whole divisor used: prepend a literal 1 limb so the inverted
	     operand is in+1 limbs as mpn_invertappr expects.  */
	  MPN_COPY (scratch + 1, dp, in);
	  scratch[0] = 1;
	  mpn_invertappr (ip, scratch, in + 1, NULL);
	  MPN_COPY_INCR (ip, ip + 1, in);
	}
      else
	{
	  /* Truncated divisor: round the in+1 high limbs up by one so the
	     inverse is a lower bound (keeps the quotient approximation from
	     overshooting by more than the documented error).  */
	  cy = mpn_add_1 (scratch, dp + dn - (in + 1), in + 1, 1);
	  if (UNLIKELY (cy != 0))
	    /* The rounded-up operand overflowed, i.e. the high limbs were
	       all ones; the correct approximate inverse is then zero.  */
	    MPN_ZERO (ip, in);
	  else
	    {
	      mpn_invertappr (ip, scratch, in + 1, NULL);
	      MPN_COPY_INCR (ip, ip + 1, in);
	    }
	}

      /* First step: exact division of the dividend's high part, producing
	 the high qn-dn quotient limbs and a partial remainder.

	 |_______________________|   dividend
			|________|   divisor  */
      rp = TMP_BALLOC_LIMBS (2 * dn + 1);

      this_in = mpn_mu_div_qr_choose_in (qn - dn, dn, 0);
      this_ip = ip + in - this_in;	/* use the high this_in limbs of ip */
      qh = mpn_preinv_mu_div_qr (tp + dn + 1, rp + dn + 1, np + dn, qn, dp, dn,
				 this_ip, this_in, scratch);

      /* Second step: approximate division of (partial remainder, low dividend
	 limbs) to develop the remaining dn low quotient limbs plus the guard
	 limb tp[0].  rp[0] = 0 supplies an extra low zero limb so the
	 numerator is 2*dn+1 limbs.  */
      MPN_COPY (rp + 1, np, dn);
      rp[0] = 0;
      this_in = mpn_mu_divappr_q_choose_in (dn + 1, dn, 0);
      this_ip = ip + in - this_in;
      cy = mpn_preinv_mu_divappr_q (tp, rp, 2 * dn + 1, dp, dn,
				    this_ip, this_in, scratch);

      if (UNLIKELY (cy != 0))
	{
	  /* Since the partial remainder fed to mpn_preinv_mu_divappr_q was
	     canonically reduced, replace the returned value of B^(qn-dn)+eps
	     by the largest possible value.  */
	  mp_size_t i;
	  for (i = 0; i < dn + 1; i++)
	    tp[i] = GMP_NUMB_MAX;
	}

      /* The max error of mpn_mu_divappr_q is +4.  If the low quotient limb is
	 greater than the max error, we cannot trust the quotient.  */
      if (tp[0] > 4)
	{
	  MPN_COPY (qp, tp + 1, qn);
	}
      else
	{
	  mp_limb_t cy;
	  mp_ptr pp;

	  /* Guard limb within the error bound: verify by computing
	     Q * D (+ qh * D * B^qn) and comparing with N.  */
	  /* FIXME: can we use already allocated space? */
	  pp = TMP_BALLOC_LIMBS (nn);
	  mpn_mul (pp, tp + 1, qn, dp, dn);

	  cy = (qh != 0) ? mpn_add_n (pp + qn, pp + qn, dp, dn) : 0;

	  if (cy || mpn_cmp (pp, np, nn) > 0) /* At most is wrong by one, no cycle. */
	    qh -= mpn_sub_1 (qp, tp + 1, qn, 1);
	  else /* Same as above */
	    MPN_COPY (qp, tp + 1, qn);
	}
    }
  else
    {
      /* Short quotient: only the high qn+1 divisor limbs can influence the
	 quotient, so divide truncated high parts of both operands.

	 |_______________________|   dividend
		 |________________|   divisor  */

      /* FIXME: When nn = 2dn-1, qn becomes dn-1, and the numerator size passed
	 here becomes 2dn, i.e., more than nn.  This shouldn't hurt, since only
	 the most significant dn-1 limbs will actually be read, but it is not
	 pretty.  */

      qh = mpn_mu_divappr_q (tp, np + nn - (2 * qn + 2), 2 * qn + 2,
			     dp + dn - (qn + 1), qn + 1, scratch);

      /* The max error of mpn_mu_divappr_q is +4, but we get an additional
	 error from the divisor truncation.  */
      if (tp[0] > 6)
	{
	  MPN_COPY (qp, tp + 1, qn);
	}
      else
	{
	  mp_limb_t cy;

	  /* Guard limb within the error bound: multiply back with the full
	     divisor and compare against the full dividend.  */
	  /* FIXME: a shorter product should be enough; we may use already
	     allocated space...  */
	  rp = TMP_BALLOC_LIMBS (nn);
	  mpn_mul (rp, dp, dn, tp + 1, qn);

	  cy = (qh != 0) ? mpn_add_n (rp + qn, rp + qn, dp, dn) : 0;

	  if (cy || mpn_cmp (rp, np, nn) > 0) /* At most is wrong by one, no cycle. */
	    qh -= mpn_sub_1 (qp, tp + 1, qn, 1);
	  else /* Same as above */
	    MPN_COPY (qp, tp + 1, qn);
	}
    }

  TMP_FREE;
  return qh;
}

/* Return the number of scratch limbs mpn_mu_div_q needs for an nn-limb
   dividend and a dn-limb divisor; mua_k is forwarded to the underlying
   itch functions.  Mirrors the two arms of mpn_mu_div_q above.  */
mp_size_t
mpn_mu_div_q_itch (mp_size_t nn, mp_size_t dn, int mua_k)
{
  mp_size_t qn, itch1, itch2;

  qn = nn - dn;
  if (qn >= dn)
    {
      itch1 = mpn_mu_div_qr_itch (qn, dn, mua_k);
      itch2 = mpn_mu_divappr_q_itch (2 * dn + 1, dn, mua_k);
      return MAX (itch1, itch2);
    }
  else
    {
      itch1 = mpn_mu_divappr_q_itch (2 * qn + 2, qn + 1, mua_k);
      return itch1;
    }
}