/* mpn_mu_bdiv_q(qp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^nn,
   storing the result in {qp,nn}.  Overlap allowed between Q and N; all other
   overlap disallowed.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

   Copyright 2005, 2006, 2007, 2009, 2010 Free Software Foundation, Inc.

   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as published by
   the Free Software Foundation; either version 3 of the License, or (at your
   option) any later version.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
   License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


/*
   The idea of the algorithm used herein is to compute a smaller inverted value
   than used in the standard Barrett algorithm, and thus save time in the
   Newton iterations, and pay just a small price when using the inverted value
   for developing quotient bits.  This algorithm was presented at ICMS 2006.
*/

#include "gmp.h"
#include "gmp-impl.h"


/* N = {np,nn}
   D = {dp,dn}

   Requirements: N >= D
                 D >= 1
                 D odd
                 dn >= 2
                 nn >= 2
                 scratch space as determined by mpn_mu_bdiv_q_itch(nn,dn).

   Write quotient to Q = {qp,nn}.

   FIXME: When iterating, perhaps do the small step before loop, not after.
   FIXME: Try to avoid the scalar divisions when computing inverse size.
   FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible.  In
          particular, when dn==in, tp and rp could use the same space.
   FIXME: Trim final quotient calculation to qn limbs of precision.
*/
void
mpn_mu_bdiv_q (mp_ptr qp,
               mp_srcptr np, mp_size_t nn,
               mp_srcptr dp, mp_size_t dn,
               mp_ptr scratch)
{
  mp_size_t qn;
  mp_size_t in;
  int cy, c0;                   /* borrow flags from the mpn_sub_* calls.
                                   NOTE(review): those primitives return
                                   mp_limb_t (0 or 1), so int holds them,
                                   but mp_limb_t would match mpn conventions
                                   -- confirm before changing.  */
  mp_size_t tn, wn;

  qn = nn;                      /* quotient {qp,nn} is as long as N */

  ASSERT (dn >= 2);
  ASSERT (qn >= 2);

  if (qn > dn)
    {
      mp_size_t b;

      /* |_______________________|   dividend
                        |________|   divisor  */

      /* Scratch overlay for this branch; sizes must agree with
         mpn_mu_bdiv_q_itch below.  */
#define ip           scratch                    /* in */
#define rp           (scratch + in)             /* dn or rest >= binvert_itch(in) */
#define tp           (scratch + in + dn)        /* dn+in or next_size(dn) */
#define scratch_out  (scratch + in + dn + tn)   /* mulmod_bnm1_itch(next_size(dn)) */

      /* Compute an inverse size that is a nice partition of the quotient.  */
      b = (qn - 1) / dn + 1;    /* ceil(qn/dn), number of blocks */
      in = (qn - 1) / b + 1;    /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */

      /* Some notes on allocation:

         When in = dn, R dies when mpn_mullo returns, if in < dn the low in
         limbs of R dies at that point.  We could save memory by letting T live
         just under R, and let the upper part of T expand into R.  These changes
         should reduce itch to perhaps 3dn.
       */

      /* I = inverse of D mod B^in; R also serves as binvert scratch here,
         before it holds remainder limbs.  */
      mpn_binvert (ip, dp, in, rp);

      cy = 0;

      /* Partial remainder R starts out as the low dn limbs of N.  */
      MPN_COPY (rp, np, dn);
      np += dn;
      mpn_mullo_n (qp, rp, ip, in);     /* first `in' quotient limbs */
      qn -= in;

      /* Develop the quotient `in' limbs per iteration: form the high part of
         D times the newest quotient block, subtract it from the running
         remainder together with fresh dividend limbs, then mullo again.  */
      while (qn > in)
        {
          if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
            mpn_mul (tp, dp, dn, qp, in);       /* mulhi, need tp[dn+in-1...in] */
          else
            {
              /* Multiply mod B^tn-1.  Product limbs past tn wrapped around
                 and were added into the low end; since the true low limbs of
                 D*Q agree with R (Q was computed as R*I mod B^in), subtract
                 R's low wn limbs to recover the wrapped high limbs.  */
              tn = mpn_mulmod_bnm1_next_size (dn);
              mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
              wn = dn + in - tn;                /* number of wrapped limbs */
              if (wn > 0)
                {
                  c0 = mpn_sub_n (tp + tn, tp, rp, wn);
                  mpn_decr_u (tp + wn, c0);
                }
            }

          qp += in;
          if (dn != in)
            {
              /* Subtract tp[dn-1...in] from partial remainder.  */
              cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
              if (cy == 2)
                {
                  /* Propagated and new borrow both set: fold one borrow
                     into T's high part so a single borrow remains.  */
                  mpn_incr_u (tp + dn, 1);
                  cy = 1;
                }
            }
          /* Subtract tp[dn+in-1...dn] from dividend.  */
          cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
          np += in;
          mpn_mullo_n (qp, rp, ip, in);         /* next `in' quotient limbs */
          qn -= in;
        }

      /* Generate last qn limbs.
         FIXME: It should be possible to limit precision here, since qn is
         typically somewhat smaller than dn.  No big gains expected.  */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
        mpn_mul (tp, dp, dn, qp, in);           /* mulhi, need tp[qn+in-1...in] */
      else
        {
          tn = mpn_mulmod_bnm1_next_size (dn);
          mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
          wn = dn + in - tn;                    /* number of wrapped limbs */
          if (wn > 0)
            {
              c0 = mpn_sub_n (tp + tn, tp, rp, wn);
              mpn_decr_u (tp + wn, c0);
            }
        }

      qp += in;
      if (dn != in)
        {
          /* Same remainder update and borrow folding as inside the loop.  */
          cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
          if (cy == 2)
            {
              mpn_incr_u (tp + dn, 1);
              cy = 1;
            }
        }

      /* Only qn - (dn - in) dividend limbs are left to consume.  */
      mpn_sub_nc (rp + dn - in, np, tp + dn, qn - (dn - in), cy);
      mpn_mullo_n (qp, rp, ip, qn);             /* final qn quotient limbs */

#undef ip
#undef rp
#undef tp
#undef scratch_out
    }
  else
    {
      /* |_______________________|   dividend
         |________________|          divisor  */

      /* Scratch overlay for this branch; sizes must agree with
         mpn_mu_bdiv_q_itch below.  */
#define ip           scratch            /* in */
#define tp           (scratch + in)     /* qn+in or next_size(qn) or rest >= binvert_itch(in) */
#define scratch_out  (scratch + in + tn)/* mulmod_bnm1_itch(next_size(qn)) */

      /* Compute half-sized inverse.  */
      in = qn - (qn >> 1);              /* in = ceil(qn/2) */

      mpn_binvert (ip, dp, in, tp);     /* T doubles as binvert scratch here */

      mpn_mullo_n (qp, np, ip, in);     /* low `in' quotient limbs */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
        mpn_mul (tp, dp, qn, qp, in);   /* mulhigh */
      else
        {
          /* Multiply mod B^tn-1; detect a wrap-around by comparing against
             N's low limbs, which the true low product must equal since
             D*Q = N mod B^in by construction of Q.  */
          tn = mpn_mulmod_bnm1_next_size (qn);
          mpn_mulmod_bnm1 (tp, tn, dp, qn, qp, in, scratch_out);
          wn = qn + in - tn;            /* number of wrapped limbs */
          if (wn > 0)
            {
              c0 = mpn_cmp (tp, np, wn) < 0;
              mpn_decr_u (tp + wn, c0);
            }
        }

      /* One small step finishes the job.  */
      mpn_sub_n (tp, np + in, tp + in, qn - in);
      mpn_mullo_n (qp + in, tp, ip, qn - in);   /* high qn-in quotient limbs */

#undef ip
#undef tp
#undef scratch_out
    }
}

/* Return the number of scratch limbs mpn_mu_bdiv_q needs for an nn-limb
   dividend and a dn-limb divisor.  Mirrors the scratch overlay and the
   inverse-size computation in mpn_mu_bdiv_q above; keep the two in sync.  */
mp_size_t
mpn_mu_bdiv_q_itch (mp_size_t nn, mp_size_t dn)
{
  mp_size_t qn, in, tn, itch_binvert, itch_out, itches;
  mp_size_t b;

  qn = nn;

  if (qn > dn)
    {
      b = (qn - 1) / dn + 1;    /* ceil(qn/dn), number of blocks */
      in = (qn - 1) / b + 1;    /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
        {
          tn = dn + in;         /* plain mpn_mul product area */
          itch_out = 0;
        }
      else
        {
          tn = mpn_mulmod_bnm1_next_size (dn);
          itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
        }
      itch_binvert = mpn_binvert_itch (in);
      itches = dn + tn + itch_out;      /* rp + tp + scratch_out */
      /* binvert runs before rp/tp become live, so both needs share space.  */
      return in + MAX (itches, itch_binvert);
    }
  else
    {
      in = qn - (qn >> 1);      /* in = ceil(qn/2), as in the code above */
      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
        {
          tn = qn + in;         /* plain mpn_mul product area */
          itch_out = 0;
        }
      else
        {
          tn = mpn_mulmod_bnm1_next_size (qn);
          itch_out = mpn_mulmod_bnm1_itch (tn, qn, in);
        }
      itch_binvert = mpn_binvert_itch (in);
      itches = tn + itch_out;           /* tp + scratch_out */
      return in + MAX (itches, itch_binvert);
    }
}