1 /* mpn_mu_bdiv_q(qp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^nn. 2 storing the result in {qp,nn}. Overlap allowed between Q and N; all other 3 overlap disallowed. 4 5 Contributed to the GNU project by Torbjorn Granlund. 6 7 THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE INTERFACE. IT IS 8 ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS 9 ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP 10 RELEASE. 11 12 Copyright 2005, 2006, 2007 Free Software Foundation, Inc. 13 14 This file is part of the GNU MP Library. 15 16 The GNU MP Library is free software; you can redistribute it and/or modify 17 it under the terms of the GNU Lesser General Public License as published by 18 the Free Software Foundation; either version 3 of the License, or (at your 19 option) any later version. 20 21 The GNU MP Library is distributed in the hope that it will be useful, but 22 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 23 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 24 License for more details. 25 26 You should have received a copy of the GNU Lesser General Public License 27 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ 28 29 30 /* We use the "misunderstanding algorithm" (MU), discovered by Paul Zimmermann 31 and Torbjorn Granlund when Torbjorn misunderstood Paul's explanation of 32 Jebelean's bidirectional exact division algorithm. 33 34 The idea of this algorithm is to compute a smaller inverted value than used 35 in the standard Barrett algorithm, and thus save time in the Newton 36 iterations, and pay just a small price when using the inverted value for 37 developing quotient bits. 38 39 Written by Torbjorn Granlund. Paul Zimmermann suggested the use of the 40 "wrap around" trick. 41 */ 42 43 #include "gmp.h" 44 #include "gmp-impl.h" 45 46 47 /* N = {np,nn} 48 D = {dp,dn} 49 50 Requirements: N >= D 51 D >= 1 52 N mod D = 0 53 D odd 54 dn >= 2 55 nn >= 2 56 scratch space as determined by mpn_divexact_itch(nn,dn). 57 58 Write quotient to Q = {qp,nn}. 59 60 FIXME: When iterating, perhaps do the small step before loop, not after. 61 FIXME: Try to avoid the scalar divisions when computing inverse size. 62 FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible. In 63 particular, when dn==in, tp and rp could use the same space. 64 FIXME: Trim final quotient calculation to qn limbs of precision. 65 */ 66 void 67 mpn_mu_bdiv_q (mp_ptr qp, 68 mp_srcptr np, mp_size_t nn, 69 mp_srcptr dp, mp_size_t dn, 70 mp_ptr scratch) 71 { 72 mp_ptr ip; 73 mp_ptr rp; 74 mp_size_t qn; 75 mp_size_t in; 76 77 qn = nn; 78 79 ASSERT (dn >= 2); 80 ASSERT (qn >= 2); 81 82 if (qn > dn) 83 { 84 mp_size_t b; 85 mp_ptr tp; 86 mp_limb_t cy; 87 int k; 88 mp_size_t m, wn; 89 mp_size_t i; 90 91 /* |_______________________| dividend 92 |________| divisor */ 93 94 /* Compute an inverse size that is a nice partition of the quotient. */ 95 b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */ 96 in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */ 97 98 /* Some notes on allocation: 99 100 When in = dn, R dies when mpn_mullow returns, if in < dn the low in 101 limbs of R dies at that point. We could save memory by letting T live 102 just under R, and let the upper part of T expand into R. These changes 103 should reduce itch to perhaps 3dn. 104 */ 105 106 ip = scratch; /* in limbs */ 107 rp = scratch + in; /* dn limbs */ 108 tp = scratch + in + dn; /* dn + in limbs FIXME: mpn_fft_next_size */ 109 scratch += in; /* Roughly 2in+1 limbs */ 110 111 mpn_binvert (ip, dp, in, scratch); 112 113 cy = 0; 114 115 MPN_COPY (rp, np, dn); 116 np += dn; 117 mpn_mullow_n (qp, rp, ip, in); 118 qn -= in; 119 120 if (ABOVE_THRESHOLD (dn, MUL_FFT_MODF_THRESHOLD)) 121 { 122 k = mpn_fft_best_k (dn, 0); 123 m = mpn_fft_next_size (dn, k); 124 wn = dn + in - m; /* number of wrapped limbs */ 125 ASSERT_ALWAYS (wn >= 0); /* could handle this below */ 126 } 127 128 while (qn > in) 129 { 130 #if WANT_FFT 131 if (ABOVE_THRESHOLD (dn, MUL_FFT_MODF_THRESHOLD)) 132 { 133 /* The two multiplicands are dn and 'in' limbs, with dn >= in. 134 The relevant part of the result will typically partially wrap, 135 and that part will come out as subtracted to the right. The 136 unwrapped part, m-in limbs at the high end of tp, is the lower 137 part of the sought product. The wrapped part, at the low end 138 of tp, will be subtracted from the low part of the partial 139 remainder; we undo that operation with another subtraction. */ 140 int c0; 141 142 mpn_mul_fft (tp, m, dp, dn, qp, in, k); 143 144 c0 = mpn_sub_n (tp + m, rp, tp, wn); 145 146 for (i = wn; c0 != 0 && i < in; i++) 147 c0 = tp[i] == GMP_NUMB_MASK; 148 mpn_incr_u (tp + in, c0); 149 } 150 else 151 #endif 152 mpn_mul (tp, dp, dn, qp, in); /* mulhi, need tp[dn+in-1...in] */ 153 qp += in; 154 if (dn != in) 155 { 156 /* Subtract tp[dn-1...in] from partial remainder. */ 157 cy += mpn_sub_n (rp, rp + in, tp + in, dn - in); 158 if (cy == 2) 159 { 160 mpn_incr_u (tp + dn, 1); 161 cy = 1; 162 } 163 } 164 /* Subtract tp[dn+in-1...dn] from dividend. */ 165 cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy); 166 np += in; 167 mpn_mullow_n (qp, rp, ip, in); 168 qn -= in; 169 } 170 171 /* Generate last qn limbs. 172 FIXME: It should be possible to limit precision here, since qn is 173 typically somewhat smaller than dn. No big gains expected. */ 174 #if WANT_FFT 175 if (ABOVE_THRESHOLD (dn, MUL_FFT_MODF_THRESHOLD)) 176 { 177 int c0; 178 179 mpn_mul_fft (tp, m, dp, dn, qp, in, k); 180 181 c0 = mpn_sub_n (tp + m, rp, tp, wn); 182 183 for (i = wn; c0 != 0 && i < in; i++) 184 c0 = tp[i] == GMP_NUMB_MASK; 185 mpn_incr_u (tp + in, c0); 186 } 187 else 188 #endif 189 mpn_mul (tp, dp, dn, qp, in); /* mulhi, need tp[qn+in-1...in] */ 190 qp += in; 191 if (dn != in) 192 { 193 cy += mpn_sub_n (rp, rp + in, tp + in, dn - in); 194 if (cy == 2) 195 { 196 mpn_incr_u (tp + dn, 1); 197 cy = 1; 198 } 199 } 200 201 mpn_sub_nc (rp + dn - in, np, tp + dn, qn - (dn - in), cy); 202 mpn_mullow_n (qp, rp, ip, qn); 203 } 204 else 205 { 206 /* |_______________________| dividend 207 |________________| divisor */ 208 209 /* Compute half-sized inverse. */ 210 in = qn - (qn >> 1); 211 212 ip = scratch; /* ceil(qn/2) limbs */ 213 rp = scratch + in; /* ceil(qn/2)+qn limbs */ 214 scratch += in; /* 2*ceil(qn/2)+2 */ 215 216 mpn_binvert (ip, dp, in, scratch); 217 218 mpn_mullow_n (qp, np, ip, in); /* low `in' quotient limbs */ 219 #if WANT_FFT 220 if (ABOVE_THRESHOLD (qn, MUL_FFT_MODF_THRESHOLD)) 221 { 222 int k; 223 mp_size_t m; 224 225 k = mpn_fft_best_k (qn, 0); 226 m = mpn_fft_next_size (qn, k); 227 mpn_mul_fft (rp, m, dp, qn, qp, in, k); 228 if (mpn_cmp (np, rp, in) < 0) 229 mpn_incr_u (rp + in, 1); 230 } 231 else 232 #endif 233 mpn_mul (rp, dp, qn, qp, in); /* mulhigh */ 234 235 mpn_sub_n (rp, np + in, rp + in, qn - in); 236 mpn_mullow_n (qp + in, rp, ip, qn - in); /* high qn-in quotient limbs */ 237 } 238 } 239 240 mp_size_t 241 mpn_mu_bdiv_q_itch (mp_size_t nn, mp_size_t dn) 242 { 243 mp_size_t qn; 244 245 qn = nn; 246 247 if (qn > dn) 248 { 249 return 4 * dn; /* FIXME FIXME FIXME need mpn_fft_next_size */ 250 } 251 else 252 { 253 return 2 * qn + 1 + 2; /* FIXME FIXME FIXME need mpn_fft_next_size */ 254 } 255 } 256