/* mpn_mu_divappr_q, mpn_preinv_mu_divappr_q.

   Compute Q = floor(N / D) + e.  N is nn limbs, D is dn limbs and must be
   normalized, and Q must be nn-dn limbs, 0 <= e <= 4.  The requirement that Q
   is nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us
   to let N be unmodified during the operation.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 2005, 2006, 2007, 2009, 2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


/*
   The idea of the algorithm used herein is to compute a smaller inverted value
   than used in the standard Barrett algorithm, and thus save time in the
   Newton iterations, and pay just a small price when using the inverted value
   for developing quotient bits.  This algorithm was presented at ICMS 2006.
*/

/* CAUTION: This code and the code in mu_div_qr.c should be edited in sync.
40 41 Things to work on: 42 43 * The itch/scratch scheme isn't perhaps such a good idea as it once seemed, 44 demonstrated by the fact that the mpn_invertappr function's scratch needs 45 mean that we need to keep a large allocation long after it is needed. 46 Things are worse as mpn_mul_fft does not accept any scratch parameter, 47 which means we'll have a large memory hole while in mpn_mul_fft. In 48 general, a peak scratch need in the beginning of a function isn't 49 well-handled by the itch/scratch scheme. 50 */ 51 52 #ifdef STAT 53 #undef STAT 54 #define STAT(x) x 55 #else 56 #define STAT(x) 57 #endif 58 59 #include <stdlib.h> /* for NULL */ 60 #include "gmp.h" 61 #include "gmp-impl.h" 62 63 64 mp_limb_t 65 mpn_mu_divappr_q (mp_ptr qp, 66 mp_srcptr np, 67 mp_size_t nn, 68 mp_srcptr dp, 69 mp_size_t dn, 70 mp_ptr scratch) 71 { 72 mp_size_t qn, in; 73 mp_limb_t cy, qh; 74 mp_ptr ip, tp; 75 76 ASSERT (dn > 1); 77 78 qn = nn - dn; 79 80 /* If Q is smaller than D, truncate operands. */ 81 if (qn + 1 < dn) 82 { 83 np += dn - (qn + 1); 84 nn -= dn - (qn + 1); 85 dp += dn - (qn + 1); 86 dn = qn + 1; 87 } 88 89 /* Compute the inverse size. */ 90 in = mpn_mu_divappr_q_choose_in (qn, dn, 0); 91 ASSERT (in <= dn); 92 93 #if 1 94 /* This alternative inverse computation method gets slightly more accurate 95 results. FIXMEs: (1) Temp allocation needs not analysed (2) itch function 96 not adapted (3) mpn_invertappr scratch needs not met. 
*/ 97 ip = scratch; 98 tp = scratch + in + 1; 99 100 /* compute an approximate inverse on (in+1) limbs */ 101 if (dn == in) 102 { 103 MPN_COPY (tp + 1, dp, in); 104 tp[0] = 1; 105 mpn_invertappr (ip, tp, in + 1, NULL); 106 MPN_COPY_INCR (ip, ip + 1, in); 107 } 108 else 109 { 110 cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1); 111 if (UNLIKELY (cy != 0)) 112 MPN_ZERO (ip, in); 113 else 114 { 115 mpn_invertappr (ip, tp, in + 1, NULL); 116 MPN_COPY_INCR (ip, ip + 1, in); 117 } 118 } 119 #else 120 /* This older inverse computation method gets slightly worse results than the 121 one above. */ 122 ip = scratch; 123 tp = scratch + in; 124 125 /* Compute inverse of D to in+1 limbs, then round to 'in' limbs. Ideally the 126 inversion function should do this automatically. */ 127 if (dn == in) 128 { 129 tp[in + 1] = 0; 130 MPN_COPY (tp + in + 2, dp, in); 131 mpn_invertappr (tp, tp + in + 1, in + 1, NULL); 132 } 133 else 134 { 135 mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL); 136 } 137 cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT); 138 if (UNLIKELY (cy != 0)) 139 MPN_ZERO (tp + 1, in); 140 MPN_COPY (ip, tp + 1, in); 141 #endif 142 143 qh = mpn_preinv_mu_divappr_q (qp, np, nn, dp, dn, ip, in, scratch + in); 144 145 return qh; 146 } 147 148 mp_limb_t 149 mpn_preinv_mu_divappr_q (mp_ptr qp, 150 mp_srcptr np, 151 mp_size_t nn, 152 mp_srcptr dp, 153 mp_size_t dn, 154 mp_srcptr ip, 155 mp_size_t in, 156 mp_ptr scratch) 157 { 158 mp_size_t qn; 159 mp_limb_t cy, cx, qh; 160 mp_limb_t r; 161 mp_size_t tn, wn; 162 163 #define rp scratch 164 #define tp (scratch + dn) 165 #define scratch_out (scratch + dn + tn) 166 167 qn = nn - dn; 168 169 np += qn; 170 qp += qn; 171 172 qh = mpn_cmp (np, dp, dn) >= 0; 173 if (qh != 0) 174 mpn_sub_n (rp, np, dp, dn); 175 else 176 MPN_COPY (rp, np, dn); 177 178 if (qn == 0) 179 return qh; /* Degenerate use. Should we allow this? 
*/ 180 181 while (qn > 0) 182 { 183 if (qn < in) 184 { 185 ip += in - qn; 186 in = qn; 187 } 188 np -= in; 189 qp -= in; 190 191 /* Compute the next block of quotient limbs by multiplying the inverse I 192 by the upper part of the partial remainder R. */ 193 mpn_mul_n (tp, rp + dn - in, ip, in); /* mulhi */ 194 cy = mpn_add_n (qp, tp + in, rp + dn - in, in); /* I's msb implicit */ 195 ASSERT_ALWAYS (cy == 0); 196 197 qn -= in; 198 if (qn == 0) 199 break; 200 201 /* Compute the product of the quotient block and the divisor D, to be 202 subtracted from the partial remainder combined with new limbs from the 203 dividend N. We only really need the low dn limbs. */ 204 205 if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) 206 mpn_mul (tp, dp, dn, qp, in); /* dn+in limbs, high 'in' cancels */ 207 else 208 { 209 tn = mpn_mulmod_bnm1_next_size (dn + 1); 210 mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out); 211 wn = dn + in - tn; /* number of wrapped limbs */ 212 if (wn > 0) 213 { 214 cy = mpn_sub_n (tp, tp, rp + dn - wn, wn); 215 cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy); 216 cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0; 217 ASSERT_ALWAYS (cx >= cy); 218 mpn_incr_u (tp, cx - cy); 219 } 220 } 221 222 r = rp[dn - in] - tp[dn]; 223 224 /* Subtract the product from the partial remainder combined with new 225 limbs from the dividend N, generating a new partial remainder R. */ 226 if (dn != in) 227 { 228 cy = mpn_sub_n (tp, np, tp, in); /* get next 'in' limbs from N */ 229 cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy); 230 MPN_COPY (rp, tp, dn); /* FIXME: try to avoid this */ 231 } 232 else 233 { 234 cy = mpn_sub_n (rp, np, tp, in); /* get next 'in' limbs from N */ 235 } 236 237 STAT (int i; int err = 0; 238 static int errarr[5]; static int err_rec; static int tot); 239 240 /* Check the remainder R and adjust the quotient as needed. 
*/ 241 r -= cy; 242 while (r != 0) 243 { 244 /* We loop 0 times with about 69% probability, 1 time with about 31% 245 probability, 2 times with about 0.6% probability, if inverse is 246 computed as recommended. */ 247 mpn_incr_u (qp, 1); 248 cy = mpn_sub_n (rp, rp, dp, dn); 249 r -= cy; 250 STAT (err++); 251 } 252 if (mpn_cmp (rp, dp, dn) >= 0) 253 { 254 /* This is executed with about 76% probability. */ 255 mpn_incr_u (qp, 1); 256 cy = mpn_sub_n (rp, rp, dp, dn); 257 STAT (err++); 258 } 259 260 STAT ( 261 tot++; 262 errarr[err]++; 263 if (err > err_rec) 264 err_rec = err; 265 if (tot % 0x10000 == 0) 266 { 267 for (i = 0; i <= err_rec; i++) 268 printf (" %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot); 269 printf ("\n"); 270 } 271 ); 272 } 273 274 /* FIXME: We should perhaps be somewhat more elegant in our rounding of the 275 quotient. For now, just make sure the returned quotient is >= the real 276 quotient; add 3 with saturating arithmetic. */ 277 qn = nn - dn; 278 cy += mpn_add_1 (qp, qp, qn, 3); 279 if (cy != 0) 280 { 281 if (qh != 0) 282 { 283 /* Return a quotient of just 1-bits, with qh set. */ 284 mp_size_t i; 285 for (i = 0; i < qn; i++) 286 qp[i] = GMP_NUMB_MAX; 287 } 288 else 289 { 290 /* Propagate carry into qh. */ 291 qh = 1; 292 } 293 } 294 295 return qh; 296 } 297 298 /* In case k=0 (automatic choice), we distinguish 3 cases: 299 (a) dn < qn: in = ceil(qn / ceil(qn/dn)) 300 (b) dn/3 < qn <= dn: in = ceil(qn / 2) 301 (c) qn < dn/3: in = qn 302 In all cases we have in <= dn. 303 */ 304 mp_size_t 305 mpn_mu_divappr_q_choose_in (mp_size_t qn, mp_size_t dn, int k) 306 { 307 mp_size_t in; 308 309 if (k == 0) 310 { 311 mp_size_t b; 312 if (qn > dn) 313 { 314 /* Compute an inverse size that is a nice partition of the quotient. 
*/ 315 b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */ 316 in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */ 317 } 318 else if (3 * qn > dn) 319 { 320 in = (qn - 1) / 2 + 1; /* b = 2 */ 321 } 322 else 323 { 324 in = (qn - 1) / 1 + 1; /* b = 1 */ 325 } 326 } 327 else 328 { 329 mp_size_t xn; 330 xn = MIN (dn, qn); 331 in = (xn - 1) / k + 1; 332 } 333 334 return in; 335 } 336 337 mp_size_t 338 mpn_mu_divappr_q_itch (mp_size_t nn, mp_size_t dn, int mua_k) 339 { 340 mp_size_t qn, in, itch_local, itch_out; 341 342 qn = nn - dn; 343 if (qn + 1 < dn) 344 { 345 dn = qn + 1; 346 } 347 in = mpn_mu_divappr_q_choose_in (qn, dn, mua_k); 348 349 itch_local = mpn_mulmod_bnm1_next_size (dn + 1); 350 itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in); 351 return in + dn + itch_local + itch_out; 352 } 353